From 4be1b49cb8873db5dfefbcd045d00ae98a30a7d2 Mon Sep 17 00:00:00 2001 From: Meng Date: Mon, 13 Apr 2026 00:18:00 +0000 Subject: [PATCH 01/45] Add DreamZero OpenPI serving and core wiring Implement the DreamZero omni serving path as a single clean feature commit without test- or doc-only files. This keeps the model registry/stage detection, root-config-driven pipeline initialization, root-checkpoint weight loading, DreamZero model/state wiring, and OpenPI serving / transform integration required for the feature branch. Signed-off-by: Meng Co-authored-by: Yangshen Deng --- vllm_omni/diffusion/data.py | 41 +- vllm_omni/diffusion/diffusion_engine.py | 16 + .../dreamzero/modeling/action_encoder.py | 121 ++ .../dreamzero/modeling/causal_wan_model.py | 1053 ++++++++++++ .../dreamzero/modeling/image_encoder.py | 245 +++ .../models/dreamzero/pipeline_dreamzero.py | 1480 +++++++++++++++++ .../models/dreamzero/state_dreamzero.py | 185 +++ vllm_omni/diffusion/registry.py | 5 + vllm_omni/diffusion/utils/hf_utils.py | 36 +- vllm_omni/entrypoints/openai/api_server.py | 29 + .../openai/realtime/robot/__init__.py | 1 + .../realtime/robot/openpi_connection.py | 96 ++ .../openai/realtime/robot/openpi_serving.py | 132 ++ .../realtime/robot/transform/__init__.py | 2 + .../openai/realtime/robot/transform/base.py | 141 ++ .../openai/realtime/robot/transform/droid.py | 195 +++ .../realtime/robot/transform/roboarena.py | 43 + 17 files changed, 3813 insertions(+), 8 deletions(-) create mode 100644 vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py create mode 100644 vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py create mode 100644 vllm_omni/diffusion/models/dreamzero/modeling/image_encoder.py create mode 100644 vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py create mode 100644 vllm_omni/diffusion/models/dreamzero/state_dreamzero.py create mode 100644 vllm_omni/entrypoints/openai/realtime/robot/__init__.py create mode 100644 
vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py create mode 100644 vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py create mode 100644 vllm_omni/entrypoints/openai/realtime/robot/transform/__init__.py create mode 100644 vllm_omni/entrypoints/openai/realtime/robot/transform/base.py create mode 100644 vllm_omni/entrypoints/openai/realtime/robot/transform/droid.py create mode 100644 vllm_omni/entrypoints/openai/realtime/robot/transform/roboarena.py diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py index a6fe1e4e9c7..c964ca13479 100644 --- a/vllm_omni/diffusion/data.py +++ b/vllm_omni/diffusion/data.py @@ -901,8 +901,39 @@ def enrich_config(self) -> None: self.model_class_name = "WanS2VPipeline" self.tf_model_config = TransformerConfig() self.update_multimodal_support() + elif model_type == "vla": + action_head_cfg = cfg.get("action_head_cfg") or {} + looks_like_dreamzero = False + if isinstance(action_head_cfg, Mapping): + action_head_cfg_config = action_head_cfg.get("config") or {} + diffusion_model_cfg = {} + if isinstance(action_head_cfg_config, Mapping): + diffusion_model_cfg = action_head_cfg_config.get("diffusion_model_cfg") or {} + if isinstance(diffusion_model_cfg, Mapping): + looks_like_dreamzero = ( + action_head_cfg.get("_target_") + == "groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHead" + and diffusion_model_cfg.get("_target_") + == ( + "groot.vla.model.dreamzero.modules." 
+ "wan_video_dit_action_casual_chunk.CausalWanModel" + ) + ) + if looks_like_dreamzero or self.model_class_name == "DreamZeroPipeline": + self.model_class_name = "DreamZeroPipeline" + self.set_tf_model_config(TransformerConfig()) + self.update_multimodal_support() + else: + raise elif architectures and len(architectures) == 1: - self.model_class_name = architectures[0] + architecture = architectures[0] + from vllm_omni.diffusion.registry import DiffusionModelRegistry + + if ( + self.model_class_name is None + or DiffusionModelRegistry._try_load_model_cls(architecture) is not None + ): + self.model_class_name = architecture else: raise @@ -972,10 +1003,10 @@ class DiffusionOutput: """ # Fields may be replaced with SHM handle dicts by ipc.pack_diffusion_output_shm - output: torch.Tensor | dict | None = None - trajectory_timesteps: torch.Tensor | dict | None = None - trajectory_latents: torch.Tensor | dict | None = None - trajectory_log_probs: torch.Tensor | dict | None = None + output: torch.Tensor | tuple[Any, ...] 
| dict[str, Any] | None = None + trajectory_timesteps: torch.Tensor | dict[str, Any] | None = None + trajectory_latents: torch.Tensor | dict[str, Any] | None = None + trajectory_log_probs: torch.Tensor | dict[str, Any] | None = None trajectory_decoded: list[Image.Image] | None = None error: str | None = None aborted: bool = False diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index c13bd3c0c37..d2624c65144 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -251,8 +251,10 @@ async def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: custom_output = output.custom_output or {} model_audio_sample_rate = None model_fps = None + action_payload = None if isinstance(outputs, dict): audio_payload = outputs.get("audio") + action_payload = outputs.get("actions") custom_output.update(outputs.get("custom_output") or {}) model_audio_sample_rate = outputs.get("audio_sample_rate") model_fps = outputs.get("fps") @@ -346,6 +348,8 @@ def _audio_mm(payload: Any) -> dict[str, Any]: mm_output["audio_sample_rate"] = model_audio_sample_rate if model_fps is not None: mm_output["fps"] = model_fps + if action_payload is not None: + mm_output["actions"] = action_payload return [ OmniRequestOutput.from_diffusion( request_id=request_id, @@ -416,6 +420,18 @@ def _audio_mm(payload: Any) -> dict[str, Any]: mm_output["audio_sample_rate"] = model_audio_sample_rate if model_fps is not None: mm_output["fps"] = model_fps + if action_payload is not None: + sliced_actions = action_payload + if isinstance(action_payload, (list, tuple)): + sliced_actions = action_payload[start_idx:end_idx] + if len(sliced_actions) == 1: + sliced_actions = sliced_actions[0] + elif hasattr(action_payload, "shape") and getattr(action_payload, "shape", None) is not None: + if len(action_payload.shape) > 0 and action_payload.shape[0] >= end_idx: + sliced_actions = action_payload[start_idx:end_idx] + if 
num_outputs == 1: + sliced_actions = sliced_actions[0] + mm_output["actions"] = sliced_actions results.append( OmniRequestOutput.from_diffusion( request_id=request_id, diff --git a/vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py b/vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py new file mode 100644 index 00000000000..e9e55da2e46 --- /dev/null +++ b/vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py @@ -0,0 +1,121 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Action encoder/decoder for DreamZero. + +Adapted from: +- CategorySpecificLinear/MLP/MultiEmbodimentActionEncoder: + dreamzero/groot/vla/model/dreamzero/modules/wan_video_dit_action_casual_chunk.py L31-90 +- SinusoidalPositionalEncoding/swish: + dreamzero/groot/vla/model/n1_5/modules/action_encoder.py L1-41 +""" + +from __future__ import annotations + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def swish(x: torch.Tensor) -> torch.Tensor: + """swish activation: x * sigmoid(x) + Source: action_encoder.py L6-7 + """ + return x * torch.sigmoid(x) + + +class SinusoidalPositionalEncoding(nn.Module): + """Sinusoidal encoding: (B, T) timesteps → (B, T, dim) + + Source: action_encoder.py L10-40 + """ + + def __init__(self, embedding_dim: int) -> None: + super().__init__() + self.embedding_dim = embedding_dim + + def forward(self, timesteps: torch.Tensor) -> torch.Tensor: + # Source: action_encoder.py L20-40 + timesteps = timesteps.float() # L23: ensure float + half_dim = self.embedding_dim // 2 # L28 + exponent = -torch.arange( # L30-32 + half_dim, dtype=torch.float, device=timesteps.device + ) * (torch.log(torch.tensor(10000.0)) / half_dim) + freqs = timesteps.unsqueeze(-1) * exponent.exp() # L34: (B, T, half_dim) + return torch.cat([torch.sin(freqs), torch.cos(freqs)], dim=-1) # L36-38: (B, T, dim) + + +class CategorySpecificLinear(nn.Module): + """Per-category linear: 
W[cat_id] @ x + b[cat_id] + + Source: wan_video_dit_action_casual_chunk.py L31-42 + Params: + W: (num_categories, input_dim, hidden_dim) — note: 0.02 * randn init + b: (num_categories, hidden_dim) — zero init + """ + + def __init__(self, num_categories: int, input_dim: int, hidden_dim: int) -> None: + super().__init__() + self.W = nn.Parameter(0.02 * torch.randn(num_categories, input_dim, hidden_dim)) + self.b = nn.Parameter(torch.zeros(num_categories, hidden_dim)) + + def forward(self, x: torch.Tensor, cat_ids: torch.Tensor) -> torch.Tensor: + # Source: wan_video_dit_action_casual_chunk.py L39-42 + selected_W = self.W[cat_ids] # L40: (B, input_dim, hidden_dim) + selected_b = self.b[cat_ids] # L41: (B, hidden_dim) + return torch.bmm(x, selected_W) + selected_b.unsqueeze(1) # L42: (B, T, hidden_dim) + + +class CategorySpecificMLP(nn.Module): + """Two-layer MLP: layer1 (relu) → layer2 + + Source: wan_video_dit_action_casual_chunk.py L45-54 + """ + + def __init__(self, num_categories: int, input_dim: int, hidden_dim: int, output_dim: int) -> None: + super().__init__() + self.layer1 = CategorySpecificLinear(num_categories, input_dim, hidden_dim) + self.layer2 = CategorySpecificLinear(num_categories, hidden_dim, output_dim) + + def forward(self, x: torch.Tensor, cat_ids: torch.Tensor) -> torch.Tensor: + # Source: wan_video_dit_action_casual_chunk.py L52-54 + hidden = F.relu(self.layer1(x, cat_ids)) # L53 + return self.layer2(hidden, cat_ids) # L54 + + +class MultiEmbodimentActionEncoder(nn.Module): + """Encode actions with embodiment-specific weights + sinusoidal timestep. + + Source: wan_video_dit_action_casual_chunk.py L57-90 + Flow: actions → W1 → concat(a_emb, pos_enc(timesteps)) → W2 (swish) → W3 + + Args: + action_dim: action vector dimension (e.g. 32) + hidden_size: output/hidden dimension (e.g. 5120 = model dim) + num_embodiments: number of robot types (e.g. 
32) + """ + + def __init__(self, action_dim: int, hidden_size: int, num_embodiments: int) -> None: + super().__init__() + self.hidden_size = hidden_size + self.W1 = CategorySpecificLinear(num_embodiments, action_dim, hidden_size) + self.W2 = CategorySpecificLinear(num_embodiments, 2 * hidden_size, hidden_size) + self.W3 = CategorySpecificLinear(num_embodiments, hidden_size, hidden_size) + self.pos_encoding = SinusoidalPositionalEncoding(hidden_size) + + def forward(self, actions: torch.Tensor, timesteps: torch.Tensor, cat_ids: torch.Tensor) -> torch.Tensor: + """ + Args: + actions: (B, T, action_dim) + timesteps: (B, T) — per-token timestep + cat_ids: (B,) — embodiment id per sample + Returns: + (B, T, hidden_size) + """ + # Source: wan_video_dit_action_casual_chunk.py L69-90 + a_emb = self.W1(actions, cat_ids) # L79: (B, T, hidden_size) + tau_emb = self.pos_encoding(timesteps).to(dtype=a_emb.dtype) # L82: (B, T, hidden_size) + x = torch.cat([a_emb, tau_emb], dim=-1) # L85: (B, T, 2*hidden_size) + x = swish(self.W2(x, cat_ids)) # L86: (B, T, hidden_size) + x = self.W3(x, cat_ids) # L89: (B, T, hidden_size) + return x diff --git a/vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py b/vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py new file mode 100644 index 00000000000..ec8a632d250 --- /dev/null +++ b/vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py @@ -0,0 +1,1053 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""CausalWanModel — 40-layer DiT with causal attention and KV cache. 
+ +Adapted from: dreamzero/groot/vla/model/dreamzero/modules/ + wan_video_dit_action_casual_chunk.py L1218-2200 + +Key differences from WanTransformer3DModel (wan2_2_transformer.py): +- Causal self-attention (new frames only see history) +- KV cache for streaming inference +- Action/state token support (appended after video tokens) +- Extended RoPE with action/state-specific frequencies +- Inference-only forward with KV cache +""" + +from __future__ import annotations + +import math +from typing import Any + +import torch +import torch.nn as nn +from vllm.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + get_tp_group, +) +from vllm.model_executor.layers.conv import Conv3dLayer +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.utils import set_weight_attrs + +from vllm_omni.diffusion.attention.layer import Attention +from vllm_omni.diffusion.models.dreamzero.modeling.action_encoder import ( + CategorySpecificMLP, + MultiEmbodimentActionEncoder, +) + +# ── RoPE utilities ────────────────────────────────────────────────── +# Source: wan2_1_submodule.py rope_params / rope_action_apply +# wan_video_dit_action_casual_chunk.py L93-185 causal_rope_action_apply + + +def sinusoidal_embedding_1d(dim: int, position: torch.Tensor) -> torch.Tensor: + """Sinusoidal positional embedding for timesteps. + Source: wan2_1_submodule.py L16-26 + """ + assert dim % 2 == 0 # L18 + half = dim // 2 # L19 + position = position.type(torch.float64) # L20 + sinusoid = torch.outer( # L23-24 + position, + torch.pow(10000, -torch.arange(half, dtype=position.dtype, device=position.device).div(half)), + ) + x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1) # L25 + return x + + +def rope_params(max_seq_len: int, dim: int) -> torch.Tensor: + """Precompute complex-valued RoPE frequencies (polar form). 
+ Source: wan2_1_submodule.py L37-44 (rope_params_polar) + Returns: complex tensor [max_seq_len, dim // 2] + """ + assert dim % 2 == 0 # L38 + freqs = torch.outer( # L39-42 + torch.arange(max_seq_len), + 1.0 / torch.pow(10000, torch.arange(0, dim, 2).to(torch.float64).div(dim)), + ) + freqs = torch.polar(torch.ones_like(freqs), freqs) # L43 + return freqs + + +def rope_apply(x: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: + """Apply RoPE to x using precomputed complex freqs. + Source: wan2_1_submodule.py L64-75 (rope_apply_polar) + """ + B, seq_len, n, _ = x.shape # L65 + x = torch.view_as_complex( # L68-70 + x.to(torch.float64).reshape(B, seq_len, n, -1, 2) + ) + freqs = freqs.unsqueeze(0) # L73 + x = torch.view_as_real(x * freqs).flatten(3) # L74 + return x + + +def rope_action_apply( + x: torch.Tensor, + freqs: torch.Tensor, + freqs_action: torch.Tensor, + freqs_state: torch.Tensor, + action_register_length: int | None, + num_action_per_block: int = 32, + num_state_per_block: int = 1, +) -> torch.Tensor: + """RoPE with action/state frequency tables for multi-step sequences. 
+ Source: wan2_1_submodule.py L130-159 (rope_action_apply_polar) + """ + B, seq_len, n, _ = x.shape # L139 + x = torch.view_as_complex( # L142-144 + x.to(torch.float64).reshape(B, seq_len, n, -1, 2) + ) + if action_register_length is not None: # L146 + assert num_action_per_block is not None # L147 + assert num_state_per_block is not None # L148 + chunk_size = action_register_length // (num_action_per_block + num_state_per_block) # L150 + freqs_1d_action = freqs_action[: chunk_size * num_action_per_block].view( # L152 + chunk_size * num_action_per_block, 1, -1 + ) + freqs_1d_state = freqs_state[: chunk_size * num_state_per_block].view( # L153 + chunk_size * num_state_per_block, 1, -1 + ) + freqs = torch.cat([freqs, freqs_1d_action, freqs_1d_state], dim=0) # L154 + freqs = freqs.unsqueeze(0) # L157 + x = torch.view_as_real(x * freqs).flatten(3) # L158 + return x + + +def causal_rope_action_apply( + x: torch.Tensor, + freqs: torch.Tensor, + freqs_action: torch.Tensor, + freqs_state: torch.Tensor, + action_register_length: int | None, + num_action_per_block: int, + num_state_per_block: int, + action_state_index: int, +) -> torch.Tensor: + """RoPE for single inference step (causal / KV-cache mode). 
+ Source: wan_video_dit_action_casual_chunk.py L153-185 (causal_rope_action_apply_polar) + """ + B, seq_len, n, _ = x.shape # L163 + x = torch.view_as_complex( # L166-168 + x.to(torch.float64).reshape(B, seq_len, n, -1, 2) + ) + if action_register_length is not None: # L170 + assert action_register_length == (num_action_per_block + num_state_per_block) # L171 + freqs_action = freqs_action[ # L172-174 + action_state_index * num_action_per_block : (action_state_index + 1) * num_action_per_block + ] + freqs_state = freqs_state[ # L175-177 + action_state_index * num_state_per_block : (action_state_index + 1) * num_state_per_block + ] + freqs_1d = torch.cat([freqs_action, freqs_state], dim=0).view( # L178 + action_register_length, 1, -1 + ) + freqs = torch.cat([freqs, freqs_1d], dim=0) # L179 + freqs = freqs.unsqueeze(0) # L182 + x = torch.view_as_real(x * freqs).flatten(3) # L183 + return x + + +# ── Normalization ─────────────────────────────────────────────────── +# Source: wan2_1_submodule.py L162-178 (WanRMSNorm) +# wan2_2_transformer.py L65-95 (DistributedRMSNorm — TP-aware version) + + +class WanLayerNorm(nn.LayerNorm): + """Source: wan2_1_submodule.py L181-184""" + + def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine: bool = False) -> None: + super().__init__(dim, eps=eps, elementwise_affine=elementwise_affine) + + +class DistributedRMSNorm(nn.Module): + """RMSNorm that computes global RMS across tensor parallel ranks.""" + + def __init__(self, hidden_size: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(hidden_size)) + set_weight_attrs(self.weight, {"weight_loader": self.weight_loader}) + + def weight_loader(self, param: torch.Tensor, loaded_weight: torch.Tensor) -> None: + if param.shape == loaded_weight.shape: + param.data.copy_(loaded_weight) + return + + tp_size = get_tensor_model_parallel_world_size() + if loaded_weight.shape[0] % tp_size != 0: + raise ValueError( + f"Cannot shard 
RMSNorm weight of shape {tuple(loaded_weight.shape)} across tp_size={tp_size}." + ) + + shard_size = loaded_weight.shape[0] // tp_size + start_idx = get_tensor_model_parallel_rank() * shard_size + shard = loaded_weight.narrow(0, start_idx, shard_size) + if param.shape != shard.shape: + raise ValueError(f"RMSNorm shard shape mismatch: param={tuple(param.shape)}, shard={tuple(shard.shape)}.") + param.data.copy_(shard) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + tp_size = get_tensor_model_parallel_world_size() + x_float = x.float() + local_sum_sq = x_float.pow(2).sum(dim=-1, keepdim=True) + local_count = x.shape[-1] + + if tp_size > 1: + global_sum_sq = local_sum_sq.clone() + torch.distributed.all_reduce(global_sum_sq, group=get_tp_group().device_group) + global_count = local_count * tp_size + else: + global_sum_sq = local_sum_sq + global_count = local_count + + mean_sq = global_sum_sq / global_count + # Keep the same numerical form as upstream `WanRMSNorm._norm()`: + # x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps) + # For TP>1, extend the mean to the global hidden dimension first, then + # apply the same `rsqrt` formulation on each local shard. + return (x_float * torch.rsqrt(mean_sq + self.eps)).type_as(x) * self.weight + + +# ── Projections ───────────────────────────────────────────────────── + + +class MLPProj(nn.Module): + """CLIP feature projection for i2v. + Source: wan2_1_submodule.py L565-577 + Uses ColumnParallelLinear + RowParallelLinear (Qwen3_VisionMLP pattern). 
+ """ + + def __init__(self, in_dim: int, out_dim: int) -> None: + super().__init__() + self.norm1 = nn.LayerNorm(in_dim) # L571 + self.fc1 = ColumnParallelLinear( # L571 nn.Linear(in_dim, in_dim) + in_dim, + in_dim, + bias=True, + return_bias=False, + ) + self.act = nn.GELU() # L572 + self.fc2 = RowParallelLinear( # L572 nn.Linear(in_dim, out_dim) + in_dim, + out_dim, + bias=True, + return_bias=False, + ) + self.norm2 = nn.LayerNorm(out_dim) # L573 + + def forward(self, image_embeds: torch.Tensor) -> torch.Tensor: + x = self.norm1(image_embeds) # L576 + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + x = self.norm2(x) + return x + + +# ── Cross-Attention ───────────────────────────────────────────────── +# Source: wan_video_dit_action_casual_chunk.py L1087-1190 (referenced) +# T2V and I2V cross-attention variants + + +class WanT2VCrossAttention(nn.Module): + """Text-to-video cross-attention. + Source: wan2_1_submodule.py L243-278 + Uses vllm-omni Attention for FlashAttn backend. + """ + + def __init__(self, dim: int, num_heads: int, window_size=(-1, -1), qk_norm: bool = True, eps: float = 1e-6) -> None: + super().__init__() + assert dim % num_heads == 0 # L195 + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + tp_size = get_tensor_model_parallel_world_size() + if num_heads % tp_size != 0: + raise ValueError(f"num_heads={num_heads} must be divisible by tp_size={tp_size}.") + self.tp_num_heads = num_heads // tp_size + self.tp_inner_dim = self.tp_num_heads * self.head_dim + self.q = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L205 + self.k = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L206 + self.v = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L207 + self.o = RowParallelLinear(dim, dim, bias=True, input_is_parallel=True, return_bias=False) # L208 + self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if 
qk_norm else nn.Identity() # L209 + self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() # L210 + self.attn = Attention( + self.tp_num_heads, + self.head_dim, + causal=False, + softmax_scale=self.head_dim**-0.5, + skip_sequence_parallel=True, + ) + + def forward( + self, + x: torch.Tensor, + context: torch.Tensor, + context_lens: torch.Tensor | None = None, + crossattn_cache: dict | None = None, + ) -> torch.Tensor: + """Source: wan2_1_submodule.py L245-278""" + del context_lens + n, d = self.tp_num_heads, self.head_dim # L253 + q = self.norm_q(self.q(x)).unflatten(2, (n, d)) # L256 + if crossattn_cache is not None: # L258 + if not crossattn_cache["is_init"]: # L259 + crossattn_cache["is_init"] = True # L260 + k = self.norm_k(self.k(context)).unflatten(2, (n, d)) # L261 + v = self.v(context).unflatten(2, (n, d)) # L262 + crossattn_cache["k"] = k # L263 + crossattn_cache["v"] = v # L264 + else: + k = crossattn_cache["k"] # L266 + v = crossattn_cache["v"] # L267 + else: + k = self.norm_k(self.k(context)).unflatten(2, (n, d)) # L269 + v = self.v(context).unflatten(2, (n, d)) # L270 + x = self.attn(q, k, v) # L273 + x = x.flatten(2) # L276 + x = self.o(x) # L277 + return x + + +class WanI2VCrossAttention(nn.Module): + """Image-to-video cross-attention (splits first 257 image tokens). + Source: wan2_1_submodule.py L308-362 + Uses vllm-omni Attention for FlashAttn backend. 
+ """ + + def __init__(self, dim: int, num_heads: int, window_size=(-1, -1), qk_norm: bool = True, eps: float = 1e-6) -> None: + super().__init__() + assert dim % num_heads == 0 + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + tp_size = get_tensor_model_parallel_world_size() + if num_heads % tp_size != 0: + raise ValueError(f"num_heads={num_heads} must be divisible by tp_size={tp_size}.") + self.tp_num_heads = num_heads // tp_size + self.tp_inner_dim = self.tp_num_heads * self.head_dim + self.q = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L205 + self.k = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L206 + self.v = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L207 + self.o = RowParallelLinear(dim, dim, bias=True, input_is_parallel=True, return_bias=False) # L208 + self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() + self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() + self.k_img = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L318 + self.v_img = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L319 + self.norm_k_img = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() # L321 + self.attn = Attention( + self.tp_num_heads, + self.head_dim, + causal=False, + softmax_scale=self.head_dim**-0.5, + skip_sequence_parallel=True, + ) + + def forward( + self, + x: torch.Tensor, + context: torch.Tensor, + context_lens: torch.Tensor | None = None, + crossattn_cache: dict | None = None, + ) -> torch.Tensor: + """Source: wan2_1_submodule.py L324-361""" + del context_lens + context_img = context[:, :257] # L330 + context = context[:, 257:] # L331 + n, d = self.tp_num_heads, self.head_dim # L332 + q = self.norm_q(self.q(x)).unflatten(2, (n, d)) # L334 + if 
crossattn_cache is not None: # L336 + if not crossattn_cache["is_init"]: + crossattn_cache["is_init"] = True + k = self.norm_k(self.k(context)).unflatten(2, (n, d)) + v = self.v(context).unflatten(2, (n, d)) + crossattn_cache["k"] = k + crossattn_cache["v"] = v + else: + k = crossattn_cache["k"] + v = crossattn_cache["v"] + else: + k = self.norm_k(self.k(context)).unflatten(2, (n, d)) # L348 + v = self.v(context).unflatten(2, (n, d)) # L349 + x = self.attn(q, k, v) # L350 + k_img = self.norm_k_img(self.k_img(context_img)).unflatten(2, (n, d)) # L352 + v_img = self.v_img(context_img).unflatten(2, (n, d)) # L353 + img_x = self.attn(q, k_img, v_img) # L354 + x = x.flatten(2) # L357 + img_x = img_x.flatten(2) # L358 + x = x + img_x # L359 + x = self.o(x) # L360 + return x + + +WAN_CROSSATTENTION_CLASSES = { # L364-366 + "t2v_cross_attn": WanT2VCrossAttention, + "i2v_cross_attn": WanI2VCrossAttention, +} + + +# ── Self-Attention with causal masking + KV cache ─────────────────── +# Source: wan_video_dit_action_casual_chunk.py L188-1085 + + +class CausalWanSelfAttention(nn.Module): + """Causal self-attention with KV cache + action/state tokens. + Source: wan_video_dit_action_casual_chunk.py L188-1085 + Inference-only implementation (KV cache path, L1008-1084). 
+ """ + + def __init__( + self, + dim: int, + num_heads: int, + frame_seqlen: int, + local_attn_size: int = -1, + sink_size: int = 0, + num_frame_per_block: int = 1, + qk_norm: bool = True, + eps: float = 1e-6, + num_action_per_block: int = 32, + num_state_per_block: int = 1, + ) -> None: + assert dim % num_heads == 0 # L201 + super().__init__() + self.dim = dim # L203 + self.num_heads = num_heads # L204 + self.head_dim = dim // num_heads # L205 + tp_size = get_tensor_model_parallel_world_size() + if num_heads % tp_size != 0: + raise ValueError(f"num_heads={num_heads} must be divisible by tp_size={tp_size}.") + self.tp_num_heads = num_heads // tp_size + self.tp_inner_dim = self.tp_num_heads * self.head_dim + self.local_attn_size = local_attn_size # L206 + self.num_frame_per_block = num_frame_per_block # L208 + self.frame_seqlen = frame_seqlen # L212 + self.num_action_per_block = num_action_per_block # L213 + self.num_state_per_block = num_state_per_block # L214 + self.max_attention_size = ( # L211 + 21 * frame_seqlen if local_attn_size == -1 else local_attn_size * frame_seqlen + ) + # layers # L216-223 + self.q = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) + self.k = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) + self.v = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) + self.o = RowParallelLinear(dim, dim, bias=True, input_is_parallel=True, return_bias=False) + self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() + self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() + self.attn = Attention( + self.tp_num_heads, + self.head_dim, + causal=False, + softmax_scale=self.head_dim**-0.5, + skip_sequence_parallel=True, + ) + + def forward( + self, + x: torch.Tensor, + freqs: torch.Tensor, + freqs_action: torch.Tensor, + freqs_state: torch.Tensor, + action_register_length: int | None, + 
kv_cache: torch.Tensor | None = None, + current_start_frame: int = 0, + is_tf: bool = True, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + """Inference-only forward (KV cache path). + Source: wan_video_dit_action_casual_chunk.py L786-1084 (kv_cache branch L1008-1084) + """ + n, d = self.tp_num_heads, self.head_dim # L803 + + # QKV # L806-812 + q = self.norm_q(self.q(x)).unflatten(2, (n, d)) + k = self.norm_k(self.k(x)).unflatten(2, (n, d)) + v = self.v(x).unflatten(2, (n, d)) + + updated_kv_cache: torch.Tensor | None = None + + assert kv_cache is not None, "Inference only — kv_cache required." + if True: + # ── Inference path with KV cache ── L1008-1084 + action_state_index = max(0, (current_start_frame - 1) // self.num_frame_per_block) # L1009 + + roped_query = causal_rope_action_apply( # L1011-1020 + q, + freqs, + freqs_action, + freqs_state, + action_register_length, + self.num_action_per_block, + self.num_state_per_block, + action_state_index, + ).type_as(v) + roped_key = causal_rope_action_apply( # L1021-1030 + k, + freqs, + freqs_action, + freqs_state, + action_register_length, + self.num_action_per_block, + self.num_state_per_block, + action_state_index, + ).type_as(v) + + # Split action/state tokens from video tokens # L1032-1046 + roped_action_query = None + roped_action_key = None + action_v = None + + if action_register_length is not None: # L1037 + roped_action_query = roped_query[:, -action_register_length:] # L1038 + roped_query = roped_query[:, :-action_register_length] # L1039 + roped_action_key = roped_key[:, -action_register_length:] # L1040 + roped_key = roped_key[:, :-action_register_length] # L1041 + action_v = v[:, -action_register_length:] # L1042 + v = v[:, :-action_register_length] # L1043 + + # KV cache update # L1055-1064 + updated_k = kv_cache[0] + updated_v = kv_cache[1] + new_k = torch.cat([updated_k, roped_key], dim=1) # L1059 + new_v = torch.cat([updated_v, v], dim=1) # L1060 + new_k = new_k[:, -self.max_attention_size :] # L1063 
class CausalWanAttentionBlock(nn.Module):
    """One DiT layer: modulated self-attention, cross-attention, modulated FFN.

    Source: wan_video_dit_action_casual_chunk.py L1087-1190

    The block owns a learned ``modulation`` parameter of shape ``(1, 6, dim)``
    that is added to the incoming time embedding and split into six
    shift/scale/gate tensors (three for self-attention, three for the FFN).
    """

    def __init__(
        self,
        cross_attn_type: str,
        dim: int,
        ffn_dim: int,
        num_heads: int,
        frame_seqlen: int,
        local_attn_size: int = -1,
        sink_size: int = 0,
        num_frame_per_block: int = 1,
        qk_norm: bool = True,
        cross_attn_norm: bool = False,
        eps: float = 1e-6,
        num_action_per_block: int = 32,
        num_state_per_block: int = 1,
    ) -> None:
        """Build the sub-modules.

        ``cross_attn_type`` selects the implementation from
        ``WAN_CROSSATTENTION_CLASSES``; the remaining arguments are forwarded
        to the self-attention / norm / FFN constructors below.
        """
        super().__init__()

        # Self-attention branch.  # L1113-1124
        self.norm1 = WanLayerNorm(dim, eps)
        self.self_attn = CausalWanSelfAttention(
            dim=dim,
            num_heads=num_heads,
            frame_seqlen=frame_seqlen,
            local_attn_size=local_attn_size,
            sink_size=sink_size,
            num_frame_per_block=num_frame_per_block,
            qk_norm=qk_norm,
            eps=eps,
            num_action_per_block=num_action_per_block,
            num_state_per_block=num_state_per_block,
        )

        # Cross-attention branch; the pre-norm is optional.  # L1126-1133
        if cross_attn_norm:
            self.norm3 = WanLayerNorm(dim, eps, elementwise_affine=True)
        else:
            self.norm3 = nn.Identity()
        cross_attn_cls = WAN_CROSSATTENTION_CLASSES[cross_attn_type]
        self.cross_attn = cross_attn_cls(dim, num_heads, (-1, -1), qk_norm, eps)

        # Feed-forward branch, built from the parallel linear layers.  # L1134-1137
        self.norm2 = WanLayerNorm(dim, eps)
        ffn_up = ColumnParallelLinear(dim, ffn_dim, bias=True, gather_output=False, return_bias=False)
        ffn_act = nn.GELU(approximate="tanh")
        ffn_down = RowParallelLinear(ffn_dim, dim, bias=True, input_is_parallel=True, return_bias=False)
        self.ffn = nn.Sequential(ffn_up, ffn_act, ffn_down)

        # Learned offset added to the six modulation vectors.  # L1140
        self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)

    def forward(
        self,
        x: torch.Tensor,
        e: torch.Tensor,
        freqs: torch.Tensor,
        freqs_action: torch.Tensor,
        freqs_state: torch.Tensor,
        context: torch.Tensor,
        action_register_length: int | None = None,
        kv_cache: torch.Tensor | None = None,
        crossattn_cache: dict | None = None,
        current_start_frame: int = 0,
        is_tf: bool = True,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Run the layer and return ``(hidden_states, updated_kv_cache)``.

        Source: wan_video_dit_action_casual_chunk.py L1142-1187
        """
        # Split the modulated time embedding into its six components.  # L1162
        modulated = (self.modulation.unsqueeze(1) + e).chunk(6, dim=2)
        sa_shift, sa_scale, sa_gate, ffn_shift, ffn_scale, ffn_gate = (part.squeeze(2) for part in modulated)

        # Self-attention with shift/scale on the input and a gate on the output.  # L1164-1175
        attn_input = self.norm1(x) * (1 + sa_scale) + sa_shift
        attn_out, updated_kv_cache = self.self_attn(
            x=attn_input,
            freqs=freqs,
            freqs_action=freqs_action,
            freqs_state=freqs_state,
            action_register_length=action_register_length,
            kv_cache=kv_cache,
            is_tf=is_tf,
            current_start_frame=current_start_frame,
        )
        x = x + (attn_out * sa_gate)

        # Cross-attention against the context (no modulation).  # L1179
        x = x + self.cross_attn(self.norm3(x), context, crossattn_cache=crossattn_cache)

        # Feed-forward with its own shift/scale/gate.  # L1180-1183
        ffn_out = self.ffn(self.norm2(x) * (1 + ffn_scale) + ffn_shift)
        x = x + (ffn_out * ffn_gate)
        return x, updated_kv_cache
class CausalWanModel(nn.Module):
    """Causal video diffusion transformer for DreamZero (inference-only port).

    Source: wan_video_dit_action_casual_chunk.py L1218-2200
    Architecture (14B): 40 layers, dim=5120, heads=40, ffn=13824

    The model patch-embeds the input, optionally appends encoded action and
    state tokens to the token sequence, runs ``num_layers``
    ``CausalWanAttentionBlock`` layers with one KV cache entry per layer, and
    returns a video noise prediction plus (when ``action`` is given) an action
    noise prediction.

    __init__ params match original L1230-1256:
      model_type, patch_size, frame_seqlen, text_len, in_dim, dim,
      ffn_dim, freq_dim, text_dim, out_dim, num_heads, num_layers,
      max_chunk_size, sink_size, qk_norm, cross_attn_norm, eps,
      num_frame_per_block, action_dim, num_registers, max_state_dim,
      max_num_embodiments, hidden_size, diffusion_model_pretrained_path,
      num_action_per_block, num_state_per_block
    """

    def __init__(
        self,
        model_type: str = "t2v",
        patch_size: tuple[int, int, int] = (1, 2, 2),
        frame_seqlen: int = 220,
        text_len: int = 512,
        in_dim: int = 16,
        dim: int = 2048,
        ffn_dim: int = 8192,
        freq_dim: int = 256,
        text_dim: int = 4096,
        out_dim: int = 16,
        num_heads: int = 16,
        num_layers: int = 32,
        max_chunk_size: int = -1,
        sink_size: int = 0,
        qk_norm: bool = True,
        cross_attn_norm: bool = True,
        eps: float = 1e-6,
        num_frame_per_block: int = 1,
        action_dim: int = 32,
        num_registers: int = 8,
        max_state_dim: int = 64,
        max_num_embodiments: int = 32,
        hidden_size: int = 1024,
        diffusion_model_pretrained_path: str | None = None,
        num_action_per_block: int = 32,
        num_state_per_block: int = 1,
    ) -> None:
        """Build all sub-modules, RoPE tables, and run ``init_weights()``.

        ``num_registers``, ``max_num_embodiments`` and
        ``diffusion_model_pretrained_path`` are accepted but not read by this
        constructor (presumably kept so an upstream config dict can be passed
        as keyword arguments unchanged -- confirm against the caller).
        """
        super().__init__()
        assert model_type in ["t2v", "i2v", "ti2v"]  # L1297
        self.model_type = model_type  # L1298
        self.patch_size = patch_size  # L1300
        self.frame_seqlen = frame_seqlen  # L1301
        self.text_len = text_len  # L1302
        self.dim = dim  # L1304
        self.freq_dim = freq_dim  # L1306
        self.out_dim = out_dim  # L1308
        self.num_heads = num_heads  # L1309
        self.num_layers = num_layers  # L1310
        # -1 disables the local attention window.  # L1311
        self.local_attn_size = max_chunk_size * num_frame_per_block + 1 if max_chunk_size != -1 else -1
        self.num_frame_per_block = num_frame_per_block  # L1315
        self.action_dim = action_dim  # L1317
        self.num_action_per_block = num_action_per_block  # L1322
        self.num_state_per_block = num_state_per_block  # L1323

        # Action encoder/decoder  # L1327-1343
        # The embodiment count is pinned to 1 here regardless of the
        # `max_num_embodiments` argument.  # L1325
        max_num_embodiments_local = 1
        self.state_encoder = CategorySpecificMLP(
            num_categories=max_num_embodiments_local,
            input_dim=max_state_dim,
            hidden_dim=hidden_size,
            output_dim=dim,
        )
        self.action_encoder = MultiEmbodimentActionEncoder(
            action_dim=action_dim,
            hidden_size=dim,
            num_embodiments=max_num_embodiments_local,
        )
        self.action_decoder = CategorySpecificMLP(
            num_categories=max_num_embodiments_local,
            input_dim=dim,
            hidden_dim=hidden_size,
            output_dim=action_dim,
        )

        # Embeddings  # L1346-1355
        # Upstream DreamZero uses a plain nn.Conv3d here
        # (wan_video_dit_action_casual_chunk.py L1386-L1391).
        #
        # vLLM's Conv3dLayer can rewrite non-overlapping strided convs
        # (kernel_size == stride, no padding) into a GEMM fast path. For
        # DreamZero's bf16 i2v prefill, that changes accumulation order and
        # causes the first-frame KV cache to drift from upstream even though
        # the final outputs may still match. Force the native conv path so
        # patch embedding stays numerically identical to upstream.
        self.patch_embedding = Conv3dLayer(
            in_dim,
            dim,
            kernel_size=patch_size,
            stride=patch_size,
        )  # L1346
        self.patch_embedding.enable_linear = False
        self.text_embedding = nn.Sequential(  # L1348-1350
            nn.Linear(text_dim, dim),
            nn.GELU(approximate="tanh"),
            nn.Linear(dim, dim),
        )
        self.time_embedding = nn.Sequential(  # L1352-1353
            nn.Linear(freq_dim, dim),
            nn.SiLU(),
            nn.Linear(dim, dim),
        )
        self.time_projection = nn.Sequential(  # L1354-1355
            nn.SiLU(),
            nn.Linear(dim, dim * 6),
        )

        # Transformer blocks  # L1358-1364
        cross_attn_type = "t2v_cross_attn" if model_type == "t2v" else "i2v_cross_attn"
        self.blocks = nn.ModuleList(
            [
                CausalWanAttentionBlock(
                    cross_attn_type,
                    dim,
                    ffn_dim,
                    num_heads,
                    frame_seqlen,
                    self.local_attn_size,
                    sink_size,
                    num_frame_per_block,
                    qk_norm,
                    cross_attn_norm,
                    eps,
                    num_action_per_block,
                    num_state_per_block,
                )
                for _ in range(num_layers)
            ]
        )

        # Head  # L1367
        self.head = CausalHead(dim, out_dim, patch_size, eps)

        # RoPE tables  # L1370-1379
        # Stored as plain attributes (not registered buffers); `_create_freqs`
        # moves them to the patch-embedding device on first use.
        assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
        d = dim // num_heads
        self.freqs_action = rope_params(1024 * 10, d)  # L1373
        self.freqs_state = rope_params(1024, d)  # L1374
        self.freqs = [  # L1375-1379
            rope_params(1024, d - 4 * (d // 6)),
            rope_params(1024, 2 * (d // 6)),
            rope_params(1024, 2 * (d // 6)),
        ]

        # Image embedding for i2v only  # L1380-1381
        # NOTE(review): "ti2v" selects the i2v cross-attention above but gets
        # no `img_emb`; passing `clip_feature` with that model type would fail
        # in `_forward_blocks`. Confirm whether that combination is expected.
        if model_type == "i2v":
            self.img_emb = MLPProj(1280, dim)

        # Initialize weights  # L1383-1384
        self.init_weights()

    def init_weights(self) -> None:
        """Initialize parameters with DreamZero's bare-model scheme.

        Source: wan_video_dit_action_casual_chunk.py L2176-2194

        Upstream initializes every `nn.Linear` with Xavier uniform and
        zero bias. This port applies the same rule to vLLM
        `ColumnParallelLinear` / `RowParallelLinear`, using the full
        unsharded fan-in/fan-out so the TP shards follow the same Xavier
        distribution as the original dense weight.

        After the generic pass, the patch embedding, text embedding and time
        embedding weights are re-initialized and the output head weight is
        zeroed.
        """

        def _init_linear_like(module: nn.Module) -> None:
            """Xavier-uniform weight + zero bias for dense and parallel linears."""
            if isinstance(module, nn.Linear):  # L2182-2185
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
                return

            if isinstance(module, (ColumnParallelLinear, RowParallelLinear)):
                # Bound computed from the module's declared input/output sizes
                # rather than from the (possibly sharded) weight shape.
                fan_in = module.input_size
                fan_out = module.output_size
                bound = math.sqrt(6.0 / float(fan_in + fan_out))
                nn.init.uniform_(module.weight, -bound, bound)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

        # Basic init  # L2181-2185
        for module in self.modules():
            _init_linear_like(module)

        # Patch embedding follows upstream Conv3d handling: Xavier
        # for weight, Conv3d-style uniform bias.  # L2187-2191
        nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
        if self.patch_embedding.bias is not None:
            fan_in = self.patch_embedding.in_channels * math.prod(self.patch_embedding.kernel_size)
            bound = 1 / math.sqrt(fan_in)
            nn.init.uniform_(self.patch_embedding.bias, -bound, bound)

        for module in self.text_embedding.modules():  # L2188-2190
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, std=0.02)

        for module in self.time_embedding.modules():  # L2190-2191
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, std=0.02)

        nn.init.zeros_(self.head.head.weight)  # L2193

    def _create_freqs(self, grid_size: torch.Tensor, start_frame: int) -> torch.Tensor:
        """Create the 3D RoPE frequency tensor for a ``(f, h, w)`` token grid.

        Source: wan_video_dit_action_casual_chunk.py L2151-2174

        Args:
            grid_size: 1-D tensor holding ``(f, h, w)``.
            start_frame: offset into the temporal frequency table.

        Returns:
            Tensor of shape ``(f * h * w, 1, C)`` where ``C`` is the sum of
            the last dimensions of the three tables in ``self.freqs``.
        """
        # Lazily move the RoPE tables to the device of the patch embedding.
        device = self.patch_embedding.weight.device  # L2156
        if any(freq.device != device for freq in self.freqs):  # L2157-2158
            self.freqs = [freq.to(device) for freq in self.freqs]
        if self.freqs_action.device != device:  # L2159-2160
            self.freqs_action = self.freqs_action.to(device)
        if self.freqs_state.device != device:  # L2161-2162
            self.freqs_state = self.freqs_state.to(device)

        f, h, w = grid_size.tolist()  # L2164
        freqs = torch.cat(
            [  # L2165-2172
                self.freqs[0][start_frame : start_frame + f].view(f, 1, 1, -1).expand(f, h, w, -1),
                self.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
                self.freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
            ],
            dim=-1,
        ).reshape(f * h * w, 1, -1)
        return freqs

    def unpatchify(self, x: torch.Tensor, grid_size: torch.Tensor) -> torch.Tensor:
        """Reconstruct a video tensor from per-patch features.

        Source: wan_video_dit_action_casual_chunk.py L2127-2149

        Args:
            x: ``(B, f * h * w, prod(patch_size) * out_dim)`` patch features.
            grid_size: 1-D tensor holding ``(f, h, w)``.

        Returns:
            Tensor of shape ``(B, out_dim, f * pf, h * ph, w * pw)`` where
            ``(pf, ph, pw)`` is ``self.patch_size``.
        """
        B = x.shape[0]  # L2142
        c = self.out_dim  # L2143
        grid_size = grid_size.tolist()  # L2144
        assert x.shape[1] == math.prod(grid_size)  # L2145
        x = x.view(B, *grid_size, *self.patch_size, c)  # L2146
        # Interleave each grid axis with its patch axis and move channels first.
        x = torch.einsum("bfhwpqrc->bcfphqwr", x)  # L2147
        x = x.reshape(B, c, *[i * j for i, j in zip(grid_size, self.patch_size)])  # L2148
        return x

    def _forward_blocks(
        self,
        x: torch.Tensor,
        seq_len: int,
        freqs: torch.Tensor,
        timestep: torch.Tensor,
        context: torch.Tensor,
        clip_feature: torch.Tensor | None,
        embodiment_id: torch.Tensor | None,
        action: torch.Tensor | None,
        timestep_action: torch.Tensor | None,
        state: torch.Tensor | None,
        kv_cache: list[torch.Tensor],
        current_start_frame: int,
    ) -> tuple[torch.Tensor, torch.Tensor | None, list[torch.Tensor]]:
        """Run embeddings, all transformer blocks, and the output heads.

        Source: wan_video_dit_action_casual_chunk.py L1691-1779

        Args:
            x: patch-embedded input; flattened here to ``(B, tokens, C)``.
            seq_len: number of video tokens; slices ``x`` into video/action parts.
            freqs: video RoPE tensor from ``_create_freqs``.
            timestep: per-frame timesteps; ``timestep.shape[1]`` must divide
                ``seq_len``.
            context: text features passed through ``text_embedding``.
            clip_feature: optional image features passed through ``img_emb``.
            embodiment_id: overwritten with zeros when ``action`` is given.
            action, timestep_action, state: action-branch inputs; the latter
                two are required whenever ``action`` is not ``None``.
            kv_cache: one cache entry per block.
            current_start_frame: forwarded to every block.

        Returns:
            ``(x_video, action_noise_pred, updated_kv_caches)``;
            ``action_noise_pred`` is ``None`` when ``action`` is ``None``.

        Raises:
            ValueError: if ``action`` is given without ``timestep_action`` or
                ``state``.
        """
        x = x.flatten(start_dim=2).transpose(1, 2)  # L1709
        B = x.shape[0]  # L1711
        F_t = timestep.shape[1]  # L1712

        # Action/state encoding  # L1714-1726
        if action is not None:
            # Validate before the encoders touch these inputs, so a missing
            # argument fails with a clear message instead of deep inside a
            # sub-module.
            if timestep_action is None or state is None:
                raise ValueError("`timestep_action` and `state` are required when `action` is provided.")
            embodiment_id = torch.tensor([0], device=x.device).repeat(B)  # L1715
            action_features = self.action_encoder(action, timestep_action, embodiment_id)  # L1716
            state_features = self.state_encoder(state, embodiment_id)  # L1717
            action_register = torch.cat([action_features, state_features], dim=1)  # L1718
            action_length = action_features.shape[1]  # L1719
            action_register_length = action_register.shape[1]  # L1720
            x = torch.cat([x, action_register], dim=1)  # L1721
        else:
            state_features = None
            action_length = 0  # L1725
            action_register_length = None  # L1726

        # Time embeddings  # L1728-1742
        # Broadcast each frame's timestep to all tokens of that frame.
        timestep = timestep.unsqueeze(-1).expand(B, F_t, seq_len // F_t).reshape(B, -1)  # L1729
        if action is not None:
            # Only the token count of the state features is needed here, so
            # reuse the tensor computed above rather than running the state
            # encoder a second time.
            stride = timestep_action.shape[1] // state_features.shape[1]  # L1734
            timestep_state = timestep_action[:, ::stride]  # L1735
            timestep = torch.cat([timestep, timestep_action, timestep_state], dim=1)  # L1736

        e = self.time_embedding(  # L1738-1739
            sinusoidal_embedding_1d(self.freq_dim, timestep.flatten()).type_as(x)
        )
        e = e.unflatten(dim=0, sizes=(B, -1))  # L1740
        e0 = self.time_projection(e)  # L1741
        e0 = e0.unflatten(dim=2, sizes=(6, self.dim))  # L1742

        # Context embedding  # L1744-1749
        context = self.text_embedding(context)  # L1745
        if clip_feature is not None:  # L1747
            clip_embedding = self.img_emb(clip_feature)  # L1748
            context = torch.cat([clip_embedding, context], dim=1)  # L1749

        # Transformer blocks  # L1751-1764
        updated_kv_caches: list[torch.Tensor] = []
        for block_index, block in enumerate(self.blocks):
            x, updated_kv_cache = block(
                x=x,
                e=e0,
                freqs=freqs,
                freqs_action=self.freqs_action,
                freqs_state=self.freqs_state,
                context=context,
                action_register_length=action_register_length,
                kv_cache=kv_cache[block_index] if kv_cache else None,
                current_start_frame=current_start_frame,
            )
            updated_kv_caches.append(updated_kv_cache)

        # Action decoding  # L1766-1770
        if action is not None:
            action_noise_pred = x[:, seq_len : seq_len + action_length]  # L1767
            action_noise_pred = self.action_decoder(action_noise_pred, embodiment_id)  # L1768
        else:
            action_noise_pred = None  # L1770

        # The video head only sees the video tokens and their time embedding.
        x_video = x[:, :seq_len]  # L1773
        e_video = e[:, :seq_len]  # L1774
        x_video = self.head(x_video, e_video.unsqueeze(2))  # L1777

        return x_video, action_noise_pred, updated_kv_caches

    def _forward_inference(
        self,
        x: torch.Tensor,
        timestep: torch.Tensor,
        context: torch.Tensor,
        seq_len: int,
        kv_cache: list[torch.Tensor],
        crossattn_cache: list[torch.Tensor],
        current_start_frame: int,
        y: torch.Tensor | None = None,
        clip_feature: torch.Tensor | None = None,
        action: torch.Tensor | None = None,
        timestep_action: torch.Tensor | None = None,
        state: torch.Tensor | None = None,
        embodiment_id: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None, list[torch.Tensor]]:
        """Patch-embed the input, run the blocks, and unpatchify the video output.

        Source: wan_video_dit_action_casual_chunk.py L1863-1950

        Returns:
            ``(video_noise_pred, action_noise_pred, updated_kv_caches)``.

        NOTE(review): ``crossattn_cache`` is accepted but never forwarded to
        ``_forward_blocks`` in this port, so it currently has no effect --
        confirm whether that is intentional.
        """
        if self.model_type == "i2v":  # L1910
            assert clip_feature is not None and y is not None
        assert context.shape[1] == self.text_len  # L1912

        # Conditioning latents are concatenated along the channel axis.
        if y is not None:  # L1914
            x = torch.cat([x, y.to(dtype=x.dtype)], dim=1)  # L1915

        x = self.patch_embedding(x)  # L1918
        grid_size = torch.tensor(x.shape[2:], dtype=torch.long)  # L1919
        freqs = self._create_freqs(grid_size, current_start_frame)  # L1921-1924

        x_video, action_noise_pred, updated_kv_caches = self._forward_blocks(  # L1926-1939
            x=x,
            seq_len=seq_len,
            freqs=freqs,
            timestep=timestep,
            context=context,
            clip_feature=clip_feature,
            embodiment_id=embodiment_id,
            action=action,
            timestep_action=timestep_action,
            state=state,
            kv_cache=kv_cache,
            current_start_frame=current_start_frame,
        )

        x_video = x_video.clone()  # L1942
        if action_noise_pred is not None:
            action_noise_pred = action_noise_pred.clone()  # L1944

        video_noise_pred = self.unpatchify(x_video, grid_size)  # L1948
        return video_noise_pred, action_noise_pred, updated_kv_caches

    def forward(self, *args: Any, **kwargs: Any) -> tuple[torch.Tensor, torch.Tensor | None, list[torch.Tensor]]:
        """Inference only. Requires kv_cache. Delegates to ``_forward_inference``."""
        return self._forward_inference(*args, **kwargs)
class DreamZeroVisionSelfAttention(nn.Module):
    """Multi-head self-attention with a fused QKV projection.

    Source: `wan_video_image_encoder.py` `SelfAttention` (vision branch).
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        proj_dropout: float = 0.0,
    ) -> None:
        """Create the fused ``to_qkv`` and output ``proj`` linears.

        ``dim`` must be divisible by ``num_heads``.
        """
        super().__init__()
        assert dim % num_heads == 0
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.proj_dropout = proj_dropout

        self.to_qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend over the token axis of ``x`` (``[batch, tokens, dim]``)."""
        bsz, n_tokens, _ = x.shape
        per_head_shape = (bsz, n_tokens, self.num_heads, self.head_dim)

        # One projection, split three ways, each reshaped to [B, heads, T, head_dim].
        query, key, value = (
            part.view(per_head_shape).transpose(1, 2) for part in self.to_qkv(x).chunk(3, dim=-1)
        )

        attended = F.scaled_dot_product_attention(query, key, value)

        # Merge heads back into the channel axis, then project.
        merged = attended.transpose(1, 2).reshape(bsz, n_tokens, self.dim)
        projected = self.proj(merged)
        return F.dropout(projected, self.proj_dropout, self.training)
class DreamZeroVisionTransformer(nn.Module):
    """ViT visual tower: patch embedding, class token, and a stack of blocks.

    Source: `wan_video_image_encoder.py` `VisionTransformer`.

    ``forward`` returns the token sequence from the block stack; the
    ``post_norm`` module and ``head`` parameter are created but not applied
    in ``forward``.
    """

    def __init__(
        self,
        image_size: int = 224,
        patch_size: int = 14,
        dim: int = 1280,
        mlp_ratio: float = 4.0,
        out_dim: int = 1024,
        num_heads: int = 16,
        num_layers: int = 32,
        pool_type: str = "token",
        pre_norm: bool = True,
        post_norm: bool = False,
        activation: str = "gelu",
        proj_dropout: float = 0.0,
        embedding_dropout: float = 0.0,
        norm_eps: float = 1e-5,
    ) -> None:
        """Build the tower; only ``pool_type == "token"`` is supported."""
        super().__init__()
        assert pool_type == "token"
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = (image_size // patch_size) ** 2
        self.dim = dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.pool_type = pool_type

        # Random embeddings and the head are scaled by 1 / sqrt(dim).
        init_scale = 1.0 / math.sqrt(dim)

        # The patch conv drops its bias when a pre-norm follows it.
        self.patch_embedding = nn.Conv2d(
            3,
            dim,
            kernel_size=patch_size,
            stride=patch_size,
            bias=not pre_norm,
        )
        self.cls_embedding = nn.Parameter(init_scale * torch.randn(1, 1, dim))
        # One position per patch plus one for the class token.
        self.pos_embedding = nn.Parameter(init_scale * torch.randn(1, self.num_patches + 1, dim))
        self.dropout = nn.Dropout(embedding_dropout)
        self.pre_norm = DreamZeroLayerNorm(dim, eps=norm_eps) if pre_norm else None

        block_kwargs = {
            "dim": dim,
            "mlp_ratio": mlp_ratio,
            "num_heads": num_heads,
            "post_norm": post_norm,
            "activation": activation,
            "proj_dropout": proj_dropout,
            "norm_eps": norm_eps,
        }
        layers = [DreamZeroVisionAttentionBlock(**block_kwargs) for _ in range(num_layers)]
        self.transformer = nn.Sequential(*layers)

        self.post_norm = DreamZeroLayerNorm(dim, eps=norm_eps)
        self.head = nn.Parameter(init_scale * torch.randn(dim, out_dim))

    def forward(self, x: torch.Tensor, use_31_block: bool = False) -> torch.Tensor:
        """Encode images into a token sequence.

        Args:
            x: image batch fed to the 3-channel patch convolution.
            use_31_block: when true, skip the final transformer block.
        """
        n_images = x.shape[0]

        # [B, dim, H', W'] -> [B, H' * W', dim]
        patches = self.patch_embedding(x).flatten(2).transpose(1, 2)

        cls_tokens = self.cls_embedding.expand(n_images, -1, -1).to(dtype=patches.dtype, device=patches.device)
        tokens = torch.cat([cls_tokens, patches], dim=1)

        positions = self.pos_embedding.to(dtype=tokens.dtype, device=tokens.device)
        tokens = self.dropout(tokens + positions)
        if self.pre_norm is not None:
            tokens = self.pre_norm(tokens)

        encoder = self.transformer[:-1] if use_31_block else self.transformer
        return encoder(tokens)
def encode_image(self, videos: torch.Tensor) -> torch.Tensor:
    """Resize, normalize, and encode frames with the visual tower.

    Source: `wan_video_image_encoder.py` `WanImageEncoder.encode_image()`.

    Each element of ``videos`` is bicubically resized to the tower's square
    ``image_size``; the results are concatenated along dim 0, mapped with
    ``x * 0.5 + 0.5``, passed through the last stage of ``self.transforms``,
    cast to the tower's parameter dtype, and encoded with
    ``use_31_block=True``. A clone of the tower output is returned.
    """
    visual = self.model.visual
    target_hw = (visual.image_size, visual.image_size)

    resized = [
        F.interpolate(
            clip,
            size=target_hw,
            mode="bicubic",
            align_corners=False,
        )
        for clip in videos
    ]
    frames = torch.cat(resized)

    # `frames` is freshly allocated by `torch.cat`, so the in-place rescale
    # does not modify the caller's tensor.
    normalize = self.transforms.transforms[-1]
    frames = normalize(frames.mul_(0.5).add_(0.5))

    weight_dtype = next(iter(visual.parameters())).dtype
    features = visual(frames.to(dtype=weight_dtype), use_31_block=True)
    return features.clone()
class VideoActionScheduler:
    """Composite scheduler exposing one ``.step()`` for video and action.

    Source pattern: LTX2 VideoAudioScheduler (PR #2160)

    Index 0 of every paired argument belongs to the video scheduler and
    index 1 to the action scheduler.
    """

    def __init__(self, video_scheduler, action_scheduler):
        """Store the two wrapped schedulers."""
        self.video_scheduler = video_scheduler
        self.action_scheduler = action_scheduler

    def step(self, noise_pred, t, latents, return_dict=False, generator=None):
        """Step both schedulers and return ``((video_out, action_out),)``.

        ``noise_pred``, ``t`` and ``latents`` are indexable pairs. The wrapped
        schedulers are always called with ``return_dict=False`` (the
        ``return_dict`` argument here is not forwarded), and the same
        ``generator`` is handed to both.
        """
        stepped = []
        # Video first, then action; inputs are indexed just before each call.
        for idx, scheduler in enumerate((self.video_scheduler, self.action_scheduler)):
            result = scheduler.step(
                noise_pred[idx],
                t[idx],
                latents[idx],
                return_dict=False,
                generator=generator,
            )
            stepped.append(result[0])
        return (tuple(stepped),)
+ It can be bootstrapped either from an explicit diffusers source + (`od_config.model_paths["vae"]`) or directly from constructor defaults + that match Wan2.1 VAE, after which DreamZero root + `action_head.vae.*` weights are remapped onto that module in + `load_weights()` + """ + super().__init__() + + model_path = od_config.model # last_steps.md P0-3 + model_config = od_config.model_config + local_files_only = os.path.exists(model_path) + self.od_config = od_config + + # ---- Parse root config.json ---- (last_steps.md P0-4) + root_cfg = self._load_repo_json(model_path, "config.json", local_files_only) + if root_cfg is None: + raise ValueError(f"DreamZero requires root config.json in {model_path}.") + action_head_cfg = root_cfg["action_head_cfg"] + ah_config = action_head_cfg["config"] + diffusion_model_cfg = ah_config["diffusion_model_cfg"] + + # ---- Tokenizer ---- (follows wan2_2 convention: pipeline owns tokenizer) + # DreamZero root has no tokenizer/ subfolder; uses google/umt5-xxl + # Source: last_steps.md §2.1.1 B.1 + tokenizer_source = od_config.model_paths.get("tokenizer", "google/umt5-xxl") + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_source) + + # ---- Text encoder ---- (L169) + # Instantiate from config; weights loaded by load_weights() from root checkpoint + # Source key structure: action_head.text_encoder.blocks.{N}.attn.{q,k,v,o}.weight + # UMT5-XXL: d_model=4096, d_ff=10240, num_heads=64, num_layers=24, vocab=256384 + umt5_config = UMT5Config( + d_model=4096, + d_ff=10240, + num_heads=64, + num_layers=24, + vocab_size=256384, + relative_attention_num_buckets=32, + relative_attention_max_distance=128, + dense_act_fn="gelu_new", + feed_forward_proj="gated-gelu", + is_encoder_decoder=False, + ) + self.text_encoder = UMT5EncoderModel(umt5_config) + + # ---- Image encoder ---- (L170) + # Source module: `wan_video_image_encoder.py` `WanImageEncoder` + # + # The strict service-path parity check shows that HF `CLIPVisionModel` + # drifts from 
upstream `WanImageEncoder.encode_image()` on real bf16 + # inference input, even when weights are remapped correctly and + # preprocessing is source-equivalent. We therefore use the local + # source-shaped port `DreamZeroImageEncoder`, whose parameter names stay + # aligned with DreamZero root keys: + # action_head.image_encoder.model.* -> image_encoder.model.* + self.image_encoder = DreamZeroImageEncoder() + + # ---- VAE ---- (L171) + # DreamZero root checkpoints already carry `action_head.vae.*`, so the + # only thing we need at init time is a compatible module skeleton. + # + # Upstream source path: + # self.vae = instantiate(config.vae_cfg) # L171 + # vae_path = ensure_file(self.vae.vae_pretrained_path, "Wan2.1_VAE.pth") # L249-252 + # self.vae.model.load_state_dict(torch.load(vae_path, ...)) # L253 + # + # In vLLM we run the diffusers-compatible execution module + # `DistributedAutoencoderKLWan`, but the final learned weights still + # come from DreamZero root `action_head.vae.model.*` through + # `load_weights()`. To let users pass only the official DreamZero HF + # repo name, we no longer require a local `vae/` subfolder. + # + # Bootstrapping policy: + # 1. If `od_config.model_paths["vae"]` is explicitly provided, honor + # it and instantiate from that diffusers source. + # 2. Else if a local prepared layout exposes `model_path/vae`, use it. + # 3. Else instantiate `DistributedAutoencoderKLWan()` directly from + # constructor defaults, which match Wan2.1 VAE geometry / latent + # normalization constants. + # + # After instantiation, `load_weights()` remaps DreamZero root + # `action_head.vae.model.*` keys onto this module. 
+ vae_source = od_config.model_paths.get("vae") + if vae_source: + self.vae = DistributedAutoencoderKLWan.from_pretrained( + vae_source, + torch_dtype=torch.float32, + ) + elif local_files_only and os.path.isdir(os.path.join(model_path, "vae")): + self.vae = DistributedAutoencoderKLWan.from_pretrained( + model_path, + subfolder="vae", + torch_dtype=torch.float32, + ) + else: + self.vae = DistributedAutoencoderKLWan() + self.vae.init_distributed() + if not ( + getattr(od_config, "enable_cpu_offload", False) or getattr(od_config, "enable_layerwise_offload", False) + ): + self.vae = self.vae.to(device=get_local_device(), dtype=od_config.dtype) + # DreamZero upstream WanVideoVAE.encode() returns normalized mu: + # mu = (mu - mean) / std + # Source: wan_video_vae.py VideoVAE_.encode() + self.register_buffer( + "vae_latents_mean", + torch.tensor(self.vae.config.latents_mean, dtype=torch.float32).view(1, -1, 1, 1, 1), + persistent=False, + ) + self.register_buffer( + "vae_latents_inv_std", + (1.0 / torch.tensor(self.vae.config.latents_std, dtype=torch.float32)).view(1, -1, 1, 1, 1), + persistent=False, + ) + + # ---- Transformer (DiT backbone) ---- (L232) + # Config parsed from root config.json -> action_head_cfg.config.diffusion_model_cfg + # Filter out keys not accepted by CausalWanModel.__init__ + transformer_kwargs = {k: v for k, v in diffusion_model_cfg.items() if k not in ("_convert_", "_target_")} + transformer_kwargs["action_dim"] = ah_config["action_dim"] + transformer_kwargs["max_state_dim"] = ah_config["max_state_dim"] + transformer_kwargs["num_frame_per_block"] = ah_config["num_frame_per_block"] + # Upstream WANPolicyHead instantiates the DiT strictly from + # `config.diffusion_model_cfg`: + # self.model = instantiate(config.diffusion_model_cfg) + # Source: `third_party/dreamzero/.../wan_flow_matching_action_tf.py:211` + # + # The action-head-level `hidden_size=64` belongs to WANPolicyHead state + # processing, not to `CausalWanModel`. 
The DiT keeps its own constructor + # default `hidden_size=1024`, which is what the root checkpoint weights + # expect (for example `action_decoder.layer1.W` has shape + # `(1, 5120, 1024)`). Passing `ah_config["hidden_size"]` here shrinks the + # local action/state MLPs to 64 and breaks root checkpoint loading. + self.transformer = CausalWanModel(**transformer_kwargs) + + # ---- Scheduler ---- (L172) + self.scheduler = FlowUniPCMultistepScheduler( + num_train_timesteps=1000, + shift=1, + use_dynamic_shifting=False, + ) + + # ---- Pipeline state ---- (L180-195) + self.state = DreamZeroState() + + # ---- Inference hyperparams ---- (L175-179) + # Root-config-backed inference geometry must come directly from the + # released DreamZero HF config. Do not fall back to runtime overrides + # or hard-coded defaults for fields that already exist in + # `action_head_cfg.config`. + # Source eager path uses the hard-coded `WANPolicyHead.num_inference_steps = 16` + # (`wan_flow_matching_action_tf.py` L175), while + # `config.num_inference_timesteps` is stored separately but is not what the + # real-world inference loop consumes. Reading the config value here would + # incorrectly shorten the denoising loop to 4 steps for the released + # DreamZero checkpoint. + self.num_inference_steps: int = model_config.get("num_inference_steps", 16) + self.cfg_scale: float = model_config.get("cfg_scale", 5.0) + self.sigma_shift: float = model_config.get("sigma_shift", 5.0) + # Source: `WANPolicyHead.__init__` reads `config.num_frames` + # from `action_head_cfg.config.num_frames` (33 for DreamZero DROID), + # not from the root HF config. This value feeds `encode_image()` + # mask/conditioning construction, so falling back to 81 changes the + # inference trajectory on real checkpoints. 
+ self.num_frames: int = ah_config["num_frames"] + self.num_frame_per_block: int = ah_config["num_frame_per_block"] + self.action_horizon: int = ah_config["action_horizon"] + + # Decoupled inference noise config # L112-118 + self.decouple_inference_noise: bool = ah_config["decouple_inference_noise"] + self.video_inference_final_noise: float = ah_config["video_inference_final_noise"] + + # Fixed seed for deterministic noise generation # L176 + self.seed: int = model_config.get("seed", 1140) + + # Model-level constants for state/action padding # dreamzero_cotrain.yaml + self.max_state_dim: int = ah_config["max_state_dim"] + self.max_action_dim: int = ah_config["max_action_dim"] + + # Fixed negative prompt for CFG uncond branch # dreamzero_cotrain.py L532 + self.negative_prompt: str = ( + "Vibrant colors, overexposed, static, blurry details, text, subtitles, " + "style, artwork, painting, image, still, grayscale, dull, worst quality, " + "low quality, JPEG artifacts, ugly, mutilated, extra fingers, bad hands, " + "bad face, deformed, disfigured, mutated limbs, fused fingers, stagnant " + "image, cluttered background, three legs, many people in the background, " + "walking backwards." 
+ ) + + # Embodiment name → numeric ID mapping (model knowledge) + # Source: dreamzero transform/base.yaml embodiment_tag_to_projector_index + self.embodiment_name_to_id: dict[str, int] = model_config.get( + "embodiment_name_to_id", + { + "oxe_droid": 17, + "agibot": 26, + "gr1_unified": 24, + "xdof": 22, + "yam": 32, + "mecka_hands": 27, + "lapa": 27, + "dream": 31, + }, + ) + + # Action normalization stats (per-embodiment, from checkpoint metadata) + # Prefer root experiment_cfg/metadata.json, fall back to model_config path + stats_path = model_config.get("action_norm_stats_path") + metadata = self._load_repo_json(model_path, "experiment_cfg/metadata.json", local_files_only) + if metadata is not None: + self.action_norm_stats = self._parse_action_norm_stats(metadata) + self.state_norm_stats = self._parse_state_norm_stats(metadata) + elif stats_path: + self.action_norm_stats = self._load_action_norm_stats(stats_path) + self.state_norm_stats = {} + else: + self.action_norm_stats: dict[str, dict[str, torch.Tensor]] = {} + self.state_norm_stats: dict[str, dict[str, torch.Tensor]] = {} + + # Whether model uses relative actions (need to add back last state) + self.relative_action: bool = model_config.get("relative_action", True) + # Number of action dims that are relative (DROID: 7 = joint only, gripper is absolute) + # Source: droid_relative.yaml L11 — relative_action_keys: [joint_position] + self.relative_action_dim: int = model_config.get("relative_action_dim", 7) + + # ---- Weights sources ---- (last_steps.md P0-5) + # Single source pointing to DreamZero root; load_weights() handles remapping + self._weights_sources = [ + DiffusersPipelineLoader.ComponentSource( + model_or_path=model_path, + subfolder=None, + revision=None, + prefix="", + fall_back_to_pt=False, + allow_patterns_overrides=[ + "model-*.safetensors", + "model.safetensors", + ], + ), + ] + + def to(self, *args, **kwargs): + """Defer dtype/device moves to the default module semantics. 
+ + Source: `WANPolicyHead.post_initialize()` + - upstream moves `model / text_encoder / image_encoder / vae` + to `dtype=torch.bfloat16` on the target CUDA device. + - the HF CLIP vision backbone must therefore follow the same dtype + move instead of being pinned to fp32. + """ + return super().to(*args, **kwargs) + + # ----------------------------------------------------------------------- + # Root config loading + # ----------------------------------------------------------------------- + + @staticmethod + def _load_repo_json(model_path: str, relative_path: str, local_files_only: bool) -> dict | None: + """Load a JSON file from a local checkpoint directory or HF repo.""" + if local_files_only and os.path.isdir(model_path): + json_path = os.path.join(model_path, relative_path) + if not os.path.exists(json_path): + return None + with open(json_path) as f: + return json.load(f) + + try: + json_path = hf_hub_download(model_path, relative_path) + with open(json_path) as f: + return json.load(f) + except Exception: + logger.warning("Failed to load %s from %s", relative_path, model_path) + return None + + # ----------------------------------------------------------------------- + # CFGParallelMixin overrides + # ----------------------------------------------------------------------- + + def predict_noise(self, **kwargs) -> tuple[torch.Tensor, torch.Tensor]: + """Call CausalWanModel, return (video_pred, action_pred). 
+ Source: _run_diffusion_steps (L852-865) single model call + """ + video_pred, action_pred, updated_kv_caches = self.transformer( # L885-899 + x=kwargs["hidden_states"], + timestep=kwargs["timestep_video"], + context=kwargs["encoder_hidden_states"], + seq_len=kwargs["seq_len"], + kv_cache=kwargs["kv_cache"], + crossattn_cache=kwargs["crossattn_cache"], + current_start_frame=kwargs["current_start_frame"], + y=kwargs.get("y"), + clip_feature=kwargs.get("clip_feature"), + action=kwargs.get("action"), + timestep_action=kwargs.get("timestep_action"), + state=kwargs.get("state_features"), + embodiment_id=kwargs.get("embodiment_id"), + ) + # KV cache update: side effect, write back to state # L856-858 + if kwargs.get("update_kv_cache", False) and updated_kv_caches: + is_neg = kwargs.get("is_negative", False) + for i, kv in enumerate(updated_kv_caches): + self.state.update_kv_cache(i, kv, is_negative=is_neg) + + video_pred = video_pred.clone() # L859 + if action_pred is not None: + action_pred = action_pred.clone() # L861 + else: + batch_size = kwargs["hidden_states"].shape[0] + action_pred = torch.empty( + batch_size, + 0, + self.transformer.action_dim, + device=video_pred.device, + dtype=video_pred.dtype, + ) # CFG-parallel-safe dummy action pred + return (video_pred, action_pred) + + def combine_cfg_noise( + self, + positive_noise_pred: torch.Tensor | tuple[torch.Tensor, ...], + negative_noise_pred: torch.Tensor | tuple[torch.Tensor, ...], + true_cfg_scale: float, + cfg_normalize: bool = False, + ) -> torch.Tensor | tuple[torch.Tensor, ...]: + """Video: standard CFG. Action: positive only (no CFG). 
+ Source: L1212 — flow_pred = uncond + cfg_scale * (cond - uncond) + action = cond only (no uncond blending) + """ + (video_pos, action_pos) = positive_noise_pred + (video_neg, _) = negative_noise_pred + video_combined = super().combine_cfg_noise(video_pos, video_neg, true_cfg_scale, cfg_normalize) + return (video_combined, action_pos) + + # ----------------------------------------------------------------------- + # CFG parallel sync (PR #2160 pattern) + # ----------------------------------------------------------------------- + + def _synchronize_cfg_parallel_step_output( + self, + latents: tuple[torch.Tensor, torch.Tensor], + do_true_cfg: bool, + ) -> tuple[torch.Tensor, torch.Tensor]: + """Post-step sync: .contiguous() + cuda.synchronize() + Source: PR #2160 LTX2 _synchronize_cfg_parallel_step_output + """ + latents = tuple(t.contiguous() for t in latents) + if do_true_cfg and get_classifier_free_guidance_world_size() > 1: + device = next((t.device for t in latents if t.is_cuda), None) + if device is not None: + torch.cuda.current_stream(device).synchronize() + return latents + + # ----------------------------------------------------------------------- + # Video preprocessing + # ----------------------------------------------------------------------- + + def _preprocess_video(self, videos: torch.Tensor) -> torch.Tensor: + """uint8 [B,T,H,W,C] → bfloat16 [B,C,T,H,W] normalized to [-1,1]. + Source: lazy_joint_video_action L952-966 + """ + videos = videos.permute(0, 4, 1, 2, 3) # L952: b t h w c → b c t h w + if videos.dtype == torch.uint8: # L954 + videos = videos.float() / 255.0 # L955 + # Source eager path casts to bf16 *before* `normalize_video` + # (`wan_flow_matching_action_tf.py:956`). Doing the `* 2 - 1` + # normalization in fp32 and only then casting to bf16 changes the + # rounded input latents on real observations. 
+ videos = videos.to(dtype=torch.bfloat16) # L956 + b, c, t, h, w = videos.shape # L957 + videos = videos.permute(0, 2, 1, 3, 4) # L958: b c t h w → b t c h w + videos = videos.reshape(b * t, c, h, w) # L959 + # normalize: (x - 0.5) / 0.5 = x * 2 - 1 # L960 (self.normalize_video) + videos = videos * 2.0 - 1.0 + videos = videos.reshape(b, t, c, h, w).permute(0, 2, 1, 3, 4) # L961: back to b c t h w + return videos.to(dtype=torch.bfloat16) # L966 + + # ----------------------------------------------------------------------- + # Text encoding + # ----------------------------------------------------------------------- + + def _encode_text(self, text_tokens: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: + """Encode text prompt via UMT5. + Source: encode_prompt (L525-531) + """ + seq_lens = attention_mask.gt(0).sum(dim=1).long() # L526 + prompt_emb = self.text_encoder( # L527 + text_tokens, + attention_mask, + ).last_hidden_state + prompt_emb = prompt_emb.clone().to(dtype=torch.bfloat16) # L528 + for i, v in enumerate(seq_lens): # L529-530 + prompt_emb[:, v:] = 0 + return prompt_emb + + # ----------------------------------------------------------------------- + # Image encoding + # ----------------------------------------------------------------------- + + def _encode_image( + self, + image: torch.Tensor, + num_frames: int, + height: int, + width: int, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Encode first frame via CLIP + VAE. 
+ Source: wan_flow_matching_action_tf.py encode_image (L547-564) + CLIP source: wan_video_image_encoder.py L869-887 (WanImageEncoder.encode_image) + Returns: (clip_feas, ys, image_latent) + """ + device = image.device + batch_size = image.shape[0] # L548 + + with torch.amp.autocast(dtype=torch.bfloat16, device_type=device.type): + # CLIP encode # L549 + # Upstream `WanImageEncoder.encode_image()`: + # L872-877: bicubic resize each frame batch to 224x224 + # L879: `self.transforms.transforms[-1](x * 0.5 + 0.5)` + # L882-883: run visual tower + # L886: return `use_31_block=True` output + clip_context = self.image_encoder.encode_image(image) + + # Build mask # L550-554 + msk = torch.ones(batch_size, num_frames, height // 8, width // 8, device=device) + msk[:, 1:] = 0 + msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1) + msk = msk.view(batch_size, msk.shape[1] // 4, 4, height // 8, width // 8) + msk = msk.transpose(1, 2) + + # VAE encode: first frame + zeros # L556-560 + latent_dtype = image.dtype + image_input = image.transpose(1, 2) # L556: B,T,C,H,W → B,C,T,H,W + image_zeros = torch.zeros( + batch_size, + 3, + num_frames - 1, + height, + width, + dtype=latent_dtype, + device=device, + ) # L557 + vae_input = torch.concat([image_input, image_zeros], dim=2) + y = self._encode_vae_latents(vae_input) # L560 + y = y.to(dtype=latent_dtype) + + new_image = y[:, :, 0:1] # L561 + y = torch.concat([msk, y], dim=1) # L563: [B, 4+C_latent, T, H, W] + + return clip_context, y, new_image + + def _encode_vae_latents(self, videos: torch.Tensor) -> torch.Tensor: + """Encode videos with DreamZero upstream WanVideoVAE semantics. + + Upstream `WanVideoVAE.encode()` does not return the raw posterior mean from + `quant_conv`; it first takes `mu` from `quant_conv(out).chunk(2, dim=1)` and + then applies channel-wise normalization `(mu - mean) * (1 / std)`. + + The multiplication form matters for bf16 parity. 
Source `WanVideoVAE` + stores `scale = [mean, 1.0 / std]` in fp32 and then casts that + precomputed reciprocal into the runtime dtype before the multiply. + Using bf16 division here introduces a measurable drift versus the + upstream DreamZero server. + + Source: `wan_video_vae.py` `VideoVAE_.encode()` + """ + input_dtype = videos.dtype + hidden = self.vae._encode(videos.to(dtype=self.vae.dtype)) + mu, _ = hidden.chunk(2, dim=1) + mean = self.vae_latents_mean.to(device=mu.device, dtype=mu.dtype) + inv_std = self.vae_latents_inv_std.to(device=mu.device, dtype=mu.dtype) + mu = (mu - mean) * inv_std + return mu.to(dtype=input_dtype) + + # ----------------------------------------------------------------------- + # KV cache prefill + # ----------------------------------------------------------------------- + + def _prefill_kv_cache( + self, + image_latents: torch.Tensor, + prompt_embeds: torch.Tensor, + negative_prompt_embeds: torch.Tensor | None, + frame_seqlen: int, + seq_len: int, + do_true_cfg: bool, + ) -> None: + """Prefill KV cache with first frame and/or current observation. + Source: lazy_joint_video_action L1078-1125 + + Uses predict_noise_maybe_with_cfg() for CFG parallel — same path as + the denoise loop. The mixin handles rank dispatch automatically. + KV cache update happens as a side effect inside predict_noise(). 
+ """ + batch_size = image_latents.shape[0] + device = image_latents.device + dtype = image_latents.dtype + num_heads = getattr(self.transformer.blocks[0].self_attn, "tp_num_heads", self.transformer.num_heads) + head_dim = self.transformer.dim // self.transformer.num_heads + + if self.state.current_start_frame == 0: + # First call: create caches + encode first frame # L1051-1063 + self.state.create_kv_caches( + batch_size, + dtype, + device, + self.transformer.num_layers, + num_heads, + head_dim, + ) + + zero_t = torch.zeros([batch_size, 1], device=device, dtype=torch.long) + y_first = self.state.ys[:, :, 0:1] if self.state.ys is not None else None + + # Prefill via predict_noise_maybe_with_cfg # L1080-1097 + # KV cache update is a side effect in predict_noise() + common = dict( + hidden_states=image_latents.transpose(1, 2), + timestep_video=zero_t, + seq_len=frame_seqlen, + current_start_frame=0, + y=y_first, + clip_feature=self.state.clip_feas, + update_kv_cache=True, + ) + positive_kwargs = dict( + encoder_hidden_states=prompt_embeds, + kv_cache=self.state.get_kv_caches(False), + crossattn_cache=self.state.get_crossattn_caches(False), + is_negative=False, + **common, + ) + negative_kwargs = ( + dict( + encoder_hidden_states=negative_prompt_embeds, + kv_cache=self.state.get_kv_caches(True), + crossattn_cache=self.state.get_crossattn_caches(True), + is_negative=True, + **common, + ) + if negative_prompt_embeds is not None + else None + ) + + self.predict_noise_maybe_with_cfg( + positive_kwargs=positive_kwargs, + negative_kwargs=negative_kwargs, + do_true_cfg=do_true_cfg, + true_cfg_scale=self.cfg_scale, + cfg_normalize=False, + ) + self.state.current_start_frame = 1 # L1098 + + # Subsequent: encode current observation # L1102-1125 + if self.state.current_start_frame != 1: + csf = self.state.current_start_frame + nfpb = self.num_frame_per_block + current_ref = image_latents[:, -nfpb:] + if self.state.ys is not None and csf <= self.state.ys.shape[2]: + y = 
def diffuse(
    self,
    video_latents: torch.Tensor,
    action_latents: torch.Tensor,
    timesteps_video: torch.Tensor,
    timesteps_action: torch.Tensor,
    prompt_embeds: torch.Tensor,
    negative_prompt_embeds: torch.Tensor | None,
    video_action_scheduler: VideoActionScheduler,
    do_true_cfg: bool,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Run the joint video/action denoising loop with CFG parallel support.
    Source: lazy_joint_video_action L1164-1241

    For each index of ``timesteps_video``:
      1. Build positive_kwargs and, when guidance is active, negative_kwargs
      2. predict_noise_maybe_with_cfg() → (video_pred, action_pred)
      3. video_action_scheduler.step() advances both latents together
      4. _synchronize_cfg_parallel_step_output()

    Args:
        video_latents: Initial noisy video latents. Dims 1 and 2 are swapped
            before the model call and the video prediction is swapped back
            before the scheduler step.
        action_latents: Initial noisy action latents.
        timesteps_video: Video timesteps; its length sets the number of steps.
        timesteps_action: Action timesteps, indexed in lockstep with
            ``timesteps_video``.
        prompt_embeds: Conditioning for the positive branch.
        negative_prompt_embeds: Conditioning for the negative branch; the
            negative branch is skipped when this is ``None``.
        video_action_scheduler: Scheduler wrapper whose ``step`` takes the
            (video, action) prediction, timestep and latent pairs.
        do_true_cfg: Whether classifier-free guidance is applied.
        **kwargs: ``seq_len`` is required (``KeyError`` otherwise);
            ``state_features``, ``embodiment_id`` and ``generator`` are optional.

    Returns:
        The (video, action) latents after the last step.

    Note:
        ``self.state.ys`` is read without a ``None`` check here, so it must
        already be populated when this method is called.
    """
    seq_len = kwargs["seq_len"]  # L1046
    state_features = kwargs.get("state_features")  # L950
    embodiment_id = kwargs.get("embodiment_id")  # L949

    # Shared kwargs for predict_noise (both cond & uncond branches).
    # current_start_frame is captured once here; it is not modified inside
    # this loop.
    common_kwargs = dict(
        seq_len=seq_len,
        current_start_frame=self.state.current_start_frame,
        state_features=state_features,
        embodiment_id=embodiment_id,
        update_kv_cache=False,  # L1206: denoising steps don't update KV
    )

    noisy_input = video_latents  # L1129
    noisy_input_action = action_latents  # L1130
    for index in range(len(timesteps_video)):  # L1164
        video_timestep = timesteps_video[index]  # L1169
        action_timestep = timesteps_action[index]  # L1168
        batch_size = noisy_input.shape[0]

        # Build per-frame timestep tensors  # L1172-1181
        # Video: one entry per frame of the block; action: one per horizon step.
        timestep = (
            torch.ones(
                [batch_size, self.num_frame_per_block],
                device=noisy_input.device,
                dtype=torch.int64,
            )
            * video_timestep
        )
        timestep_action = (
            torch.ones(
                [batch_size, self.action_horizon],
                device=noisy_input.device,
                dtype=torch.int64,
            )
            * action_timestep
        )

        # Compute y (image conditioning) slice  # L1187-1190
        # Take the window starting at the current frame; once that window
        # would run past the end of ys, reuse the last block instead.
        csf = self.state.current_start_frame
        if csf + self.num_frame_per_block <= self.state.ys.shape[2]:
            y = self.state.ys[:, :, csf : csf + self.num_frame_per_block]  # L1188
        else:
            y = self.state.ys[:, :, -self.num_frame_per_block :]  # L1190

        # Positive (cond) kwargs  # L1191-1208
        positive_kwargs = dict(
            hidden_states=noisy_input.transpose(1, 2),  # L1192
            timestep_video=timestep,
            encoder_hidden_states=prompt_embeds,
            kv_cache=self.state.get_kv_caches(False),
            crossattn_cache=self.state.get_crossattn_caches(False),
            y=y,
            clip_feature=self.state.clip_feas,
            action=noisy_input_action,  # L1194
            timestep_action=timestep_action,  # L1195
            is_negative=False,
            **common_kwargs,
        )

        # Negative (uncond) kwargs: same inputs, but the negative prompt and
        # the negative-branch caches.
        if do_true_cfg and negative_prompt_embeds is not None:
            negative_kwargs = dict(
                hidden_states=noisy_input.transpose(1, 2),
                timestep_video=timestep,
                encoder_hidden_states=negative_prompt_embeds,
                kv_cache=self.state.get_kv_caches(True),
                crossattn_cache=self.state.get_crossattn_caches(True),
                y=y,
                clip_feature=self.state.clip_feas,
                action=noisy_input_action,
                timestep_action=timestep_action,
                is_negative=True,
                **common_kwargs,
            )
        else:
            negative_kwargs = None

        noise_pred = self.predict_noise_maybe_with_cfg(
            positive_kwargs=positive_kwargs,
            negative_kwargs=negative_kwargs,
            do_true_cfg=do_true_cfg,
            true_cfg_scale=self.cfg_scale,
            cfg_normalize=False,
        )
        flow_pred, flow_pred_action = noise_pred

        # Scheduler step: video + action  # L1225-1240
        latents = (noisy_input, noisy_input_action)
        t = (video_timestep, action_timestep)
        # Swap the video prediction back to the latents' layout before stepping.
        noise_pred_tuple = (flow_pred.transpose(1, 2), flow_pred_action)  # L1226
        step_output = video_action_scheduler.step(
            noise_pred_tuple,
            t,
            latents,
            generator=kwargs.get("generator"),
        )
        # Element 0 of the step output holds the updated (video, action) pair.
        noisy_input, noisy_input_action = step_output[0]

        # Post-step sync  # PR #2160
        noisy_input, noisy_input_action = self._synchronize_cfg_parallel_step_output(
            (noisy_input, noisy_input_action),
            do_true_cfg,
        )

    return noisy_input, noisy_input_action  # L1242-1243
+ Source: WANPolicyHead.lazy_joint_video_action (L929-1270) + """ + extra_args = req.sampling_params.extra_args or {} + unified_obs = extra_args.get("unified_obs") + if unified_obs is None: + first_prompt = req.prompts[0] if req.prompts else "" + prompt = first_prompt if isinstance(first_prompt, str) else (first_prompt.get("prompt") or "") + is_dummy_warmup = prompt == "dummy run" and req.sampling_params.num_inference_steps == 1 + if is_dummy_warmup: + logger.info("Skipping DreamZero dummy warmup request without unified_obs.") + return DiffusionOutput( + output={ + "actions": np.zeros( + (self.action_horizon, self.max_action_dim), + dtype=np.float32, + ), + }, + ) + raise KeyError("unified_obs") + device = get_local_device() + + # ---- Step 1: Extract inputs from unified observation ---- + prompt_str = unified_obs["prompt"] # str (templated) + stitched = unified_obs["images"] # ndarray (T,H,W,C) from transform + if not isinstance(stitched, np.ndarray): + stitched = np.asarray(stitched) + embodiment_name = unified_obs["embodiment_name"] + embodiment_id = torch.tensor( # (B,) tensor for CategorySpecificMLP + [self.embodiment_name_to_id[embodiment_name]], + dtype=torch.long, + device=device, + ) + + # State: raw from transform → pad to (B, state_horizon=1, max_state_dim) + raw_state = unified_obs["state"] + state_for_postprocess = None + if raw_state is not None: + if not isinstance(raw_state, np.ndarray): + raw_state = np.asarray(raw_state, dtype=np.float64) + raw_state = raw_state.flatten() + padded = np.zeros(self.max_state_dim, dtype=np.float64) + n = min(len(raw_state), self.max_state_dim) + padded[:n] = raw_state[:n] + state_for_postprocess = ( + torch.from_numpy(padded) + .reshape(1, 1, self.max_state_dim) + .to( + device=device, + dtype=torch.float32, + ) + ) + state_features = self._normalize_state( + state_for_postprocess, + embodiment_name, + ).to(dtype=torch.bfloat16) + else: + state_features = None + + # ---- Step 1b: Tokenize ---- (wan2_2 convention: 
pipeline owns tokenizer) + text_inputs = self.tokenizer( + prompt_str, + max_length=512, + padding="max_length", + truncation=True, + return_tensors="pt", + add_special_tokens=True, + ) + text_tokens = text_inputs["input_ids"].to(device) + attention_mask = text_inputs["attention_mask"].to(device) + + # ---- Step 2: Check reset + accumulate frames ---- (L968-981) + # Explicit reset from serving layer (session switch / client request) + if extra_args.get("reset", False): + self.state.reset() + # Auto-reset based on model state (before accumulation) + if self.state.should_reset(text_tokens, 0, self.transformer.local_attn_size): + self.state.reset() + self.state.language = text_tokens # L970/975 + + # Frame accumulation: stitched single frame → multi-frame video + video_frames = self.state.accumulate_frames(stitched) # (T, H, W, C) + videos = torch.from_numpy(video_frames).unsqueeze(0).to(device) # (B=1, T, H, W, C) + + # ---- Step 3: Preprocess video ---- (L952-966) + videos = self._preprocess_video(videos) # → [B,C,T,H,W] bf16 + _, _, num_frames_raw, height, width = videos.shape + + # ---- Step 4: Encode text ---- (L986-991) + prompt_embeds = self._encode_text(text_tokens, attention_mask) + # Negative prompt for CFG uncond branch (model constant) + negative_prompt_embeds = None + if self.cfg_scale > 1.0: + neg_inputs = self.tokenizer( + self.negative_prompt, + max_length=512, + padding="max_length", + truncation=True, + return_tensors="pt", + add_special_tokens=True, + ) + negative_prompt_embeds = self._encode_text( + neg_inputs["input_ids"].to(device), + neg_inputs["attention_mask"].to(device), + ) + + # ---- Step 5: Encode image (first call only) ---- (L1002-1005) + # Extract first/last frame for CLIP + VAE encoding + if num_frames_raw == 4 or num_frames_raw == 9: # L996-999 + image = videos[:, :, -1:].transpose(1, 2) # L998: real-world eval + else: + image = videos[:, :, :1].transpose(1, 2) # L1000 + + if self.state.current_start_frame == 0: # L1002 + clip_feas, 
ys, image = self._encode_image( + image, + self.num_frames, + height, + width, + ) + self.state.clip_feas = clip_feas.to(dtype=image.dtype) # L1004 + self.state.ys = ys.to(dtype=image.dtype) # L1005 + + # ---- Step 6: VAE encode observation frames ---- (L1013-1038) + if self.state.current_start_frame != 0: # L1013-1038 + # Subsequent calls: encode current observation via VAE + if (num_frames_raw - 1) // 4 == self.num_frame_per_block: + pass # L1020: no further action + elif num_frames_raw // 4 != self.num_frame_per_block: + # Repeat to match num_frame_per_block # L1023-1027 + repeat_factor = self.num_frame_per_block // (num_frames_raw // 4) + videos = torch.repeat_interleave(videos, repeat_factor, dim=2) + first_frame = videos[:, :, 0:1] + videos = torch.cat([first_frame, videos], dim=2) + else: + first_frame = videos[:, :, 0:1] # L1029-1030 + videos = torch.cat([first_frame, videos], dim=2) + + latent_dtype = videos.dtype + with torch.no_grad(): + image = self._encode_vae_latents(videos) # L1032-1038 + image = image.to(dtype=latent_dtype) + + # ---- Step 7: Generate noise (deterministic) ---- (L1041-1042, L176, L771) + # Source: wan_flow_matching_action_tf.py L1041 + batch_size = image.shape[0] + generator = torch.Generator(device=device).manual_seed(self.seed) # L771 + noise_obs = torch.randn( + batch_size, + 16, + self.num_frame_per_block, + height // 8, + width // 8, + device=device, + dtype=torch.bfloat16, + generator=generator, + ) # L1041 + generator = torch.Generator(device=device).manual_seed(self.seed) # L771 + noise_action = torch.randn( + batch_size, + self.action_horizon, + self.transformer.action_dim, + device=device, + dtype=torch.bfloat16, + generator=generator, + ) # L1042 + + _, num_channels, num_frames, h_latent, w_latent = noise_obs.shape + frame_seqlen = int(h_latent * w_latent / 4) # L1045 + seq_len = frame_seqlen * num_frames # L1046 + + image = image.transpose(1, 2) # L1048: [B,C,T,H,W]→[B,T,C,H,W] + noise_obs = noise_obs.transpose(1, 2) # 
L1049 + + # ---- Step 8: Prefill KV cache, ---- (L1078-1125) + do_true_cfg = self.cfg_scale > 1.0 and negative_prompt_embeds is not None + self._prefill_kv_cache( + image, + prompt_embeds, + negative_prompt_embeds, + frame_seqlen, + seq_len, + do_true_cfg, + ) + + # ---- Step 9: Create schedulers ---- (L1134-1155) + sample_scheduler = copy.deepcopy(self.scheduler) # L1134-1137 + sample_scheduler_action = copy.deepcopy(self.scheduler) # L1138-1141 + sample_scheduler.set_timesteps( + self.num_inference_steps, + device=device, + shift=self.sigma_shift, + ) # L1142-1143 + sample_scheduler_action.set_timesteps( + self.num_inference_steps, + device=device, + shift=self.sigma_shift, + ) # L1144-1145 + + # Decoupled inference: video sigmas end early # L1150-1157 + if self.decouple_inference_noise: + video_final_noise = self.video_inference_final_noise + sigma_max = sample_scheduler.sigmas[0].item() + sample_scheduler.sigmas = ( + sample_scheduler.sigmas * (sigma_max - video_final_noise) / sigma_max + video_final_noise + ) + sample_scheduler.timesteps = (sample_scheduler.sigmas[:-1] * 1000).to(torch.int64) + + video_action_scheduler = VideoActionScheduler( + sample_scheduler, + sample_scheduler_action, + ) + + # ---- Step 10: Denoising loop ---- (L1164-1241) + video_out, action_out = self.diffuse( + video_latents=noise_obs, # L1129 + action_latents=noise_action, # L1130 + timesteps_video=sample_scheduler.timesteps, + timesteps_action=sample_scheduler_action.timesteps, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + video_action_scheduler=video_action_scheduler, + do_true_cfg=do_true_cfg, + seq_len=seq_len, + state_features=state_features, + embodiment_id=embodiment_id, + ) + + # ---- Step 11: Post-process ---- (L1242-1273) + if self.state.current_start_frame == 1: # L1246-1247 + video_out = torch.cat([image, video_out], dim=1) + self.state.current_start_frame += self.num_frame_per_block # L1248 + + # ---- Step 12: Action denormalization 
---- (sim_policy.py L500-569) + # q99 denorm: [-1,1] → real values + action_out = self._denormalize_action(action_out.float(), embodiment_name) + + # Relative → absolute: only for relative_action_keys (joint_position only) + # Source: droid_relative.yaml L11 — relative_action_keys: [joint_position] + # gripper_position is NOT relative, so don't add state back to it + if self.relative_action and state_for_postprocess is not None: + n_relative = self.relative_action_dim # 7 for DROID (joint only) + # Use original state precision for post-denorm absolute recovery. + # Upstream adds obs state after `eval_transform.unapply()` + # (`sim_policy.py` L511-566), i.e. after the action tensor has left + # the bf16 denoising path. + last_state = state_for_postprocess[:, 0, :n_relative] # (B, n_relative) + action_out[..., :n_relative] = ( + action_out[..., :n_relative] + last_state.unsqueeze(1) # broadcast over horizon + ) + + # Squeeze batch dim for output: (B, horizon, dim) → (horizon, dim) + actions_np = action_out.squeeze(0).float().cpu().numpy() # (horizon, max_action_dim) + + return DiffusionOutput( + output={ + "actions": actions_np, # L1273 + "video": video_out.transpose(1, 2).cpu(), + }, + ) + + # ----------------------------------------------------------------------- + # Action denormalization + # ----------------------------------------------------------------------- + + def _load_action_norm_stats(self, stats_path: str) -> dict[str, dict[str, torch.Tensor]]: + """Load per-embodiment action normalization stats from metadata.json. 
+ Source: metadata.json → statistics.action.{joint_position,gripper_position}.{q01,q99} + + Returns: {embodiment_name: {"q01": Tensor(action_dim,), "q99": Tensor(action_dim,)}} + """ + with open(stats_path) as f: + metadata = json.load(f) + return self._parse_action_norm_stats(metadata) + + @staticmethod + def _parse_action_norm_stats(metadata: dict) -> dict[str, dict[str, torch.Tensor]]: + result = {} + for emb_name, emb_data in metadata.items(): + action_stats = emb_data.get("statistics", {}).get("action", {}) + q01_parts, q99_parts = [], [] + # Concatenate joint_position + gripper_position stats + for key in ["joint_position", "gripper_position"]: + if key in action_stats: + q01_parts.extend(action_stats[key]["q01"]) + q99_parts.extend(action_stats[key]["q99"]) + if q01_parts: + result[emb_name] = { + "q01": torch.tensor(q01_parts, dtype=torch.float32), + "q99": torch.tensor(q99_parts, dtype=torch.float32), + } + return result + + @staticmethod + def _parse_state_norm_stats(metadata: dict) -> dict[str, dict[str, torch.Tensor]]: + """Load per-embodiment state normalization stats from metadata.json. + Source: `StateActionTransform(normalization_modes=q99)` in eval transform. + """ + result = {} + for emb_name, emb_data in metadata.items(): + state_stats = emb_data.get("statistics", {}).get("state", {}) + q01_parts, q99_parts = [], [] + for key in ["joint_position", "gripper_position"]: + if key in state_stats: + q01_parts.extend(state_stats[key]["q01"]) + q99_parts.extend(state_stats[key]["q99"]) + if q01_parts: + result[emb_name] = { + "q01": torch.tensor(q01_parts, dtype=torch.float32), + "q99": torch.tensor(q99_parts, dtype=torch.float32), + } + return result + + def _normalize_state( + self, + state: torch.Tensor, + embodiment_name: str, + ) -> torch.Tensor: + """Normalize state with q99 stats before feeding the model. + Source: `StateActionTransform.apply()` → `Normalizer.forward(mode='q99')`. 
+ """ + state_norm_stats = getattr(self, "state_norm_stats", {}) + if embodiment_name not in state_norm_stats: + return state + stats = state_norm_stats[embodiment_name] + q01 = stats["q01"].to(device=state.device, dtype=state.dtype) + q99 = stats["q99"].to(device=state.device, dtype=state.dtype) + actual_dim = q01.shape[0] + normalized = state.clone() + range_vals = q99 - q01 + mask = range_vals != 0 + normalized_slice = normalized[..., :actual_dim] + normalized_slice[..., mask] = 2 * (normalized_slice[..., mask] - q01[mask]) / range_vals[mask] - 1 + normalized_slice = torch.clamp(normalized_slice, -1, 1) + normalized[..., :actual_dim] = normalized_slice + return normalized + + def _denormalize_action( + self, + action: torch.Tensor, + embodiment_name: str, + ) -> torch.Tensor: + """Denormalize action from [-1,1] to real values using q99 mode. + Source: state_action.py Normalizer.inverse() L188-207 + + Formula: real = (normalized + 1) / 2 * (q99 - q01) + q01 + """ + if embodiment_name not in self.action_norm_stats: + return action + stats = self.action_norm_stats[embodiment_name] + q01 = stats["q01"].to(device=action.device, dtype=action.dtype) + q99 = stats["q99"].to(device=action.device, dtype=action.dtype) + # action shape: (B, horizon, action_dim) or (B, horizon, max_action_dim) + # q01/q99 shape: (actual_action_dim,) — only denorm actual dims + actual_dim = q01.shape[0] + action_real = action.clone() + action_real[..., :actual_dim] = (action[..., :actual_dim] + 1) / 2 * (q99 - q01) + q01 + return action_real + + # ----------------------------------------------------------------------- + # Weight loading + # ----------------------------------------------------------------------- + + @property + def weights_sources(self): + """ComponentSource list for DiffusersPipelineLoader. 
+ Source: last_steps.md P0-5 + """ + return self._weights_sources + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load weights from DreamZero root checkpoint with key remapping. + Source: last_steps.md P0-6 + + DreamZero root keys have prefix ``action_head.{component}.*``. + This method dispatches each key to the appropriate component converter: + action_head.model.* → transformer.* (6a: prefix strip) + action_head.text_encoder.* → text_encoder.* (6b: UMT5 remapping) + action_head.image_encoder.* → image_encoder.* (6c: CLIP remapping + QKV split) + action_head.vae.* → vae.* (6d: WanVideoVAE -> diffusers remap) + Other keys (e.g. backbone.*) are silently skipped. + """ + loaded: set[str] = set() + params = dict(self.named_parameters()) + buffers = dict(self.named_buffers()) + + for name, tensor in weights: + if name.startswith("action_head.model."): + # 6a. Transformer: prefix replacement + img_emb remap + new_name = "transformer." + name[len("action_head.model.") :] + # DreamZero img_emb uses nn.Sequential (proj.0/1/3/4), + # CausalWanModel uses named layers (norm1/fc1/norm2/fc2) + # Source: wan_video_dit_action_casual_chunk.py L1380 + # DreamZero MLPProj: + # Sequential([0:LN(1280), 1:Linear(1280,1280), 2:GELU, 3:Linear(1280,5120), 4:LN(5120)]) + # CausalWanModel MLPProj: norm1=LN, fc1=ColParallel, act=GELU, fc2=RowParallel, norm2=LN + # Source: wan2_1_submodule.py L570-573 + new_name = ( + new_name.replace("img_emb.proj.0.", "img_emb.norm1.") + .replace("img_emb.proj.1.", "img_emb.fc1.") + .replace("img_emb.proj.3.", "img_emb.fc2.") + .replace("img_emb.proj.4.", "img_emb.norm2.") + ) + if new_name in params: + # Use default_weight_loader for ColumnParallelLinear/RowParallelLinear + default_weight_loader(params[new_name], tensor) + loaded.add(new_name) + elif new_name in buffers: + buffers[new_name].data.copy_(tensor) + loaded.add(new_name) + + elif name.startswith("action_head.text_encoder."): + # 6b. 
Text encoder: DreamZero custom naming -> HF UMT5EncoderModel + mapped = self._remap_text_encoder_key(name) + if mapped is None: + continue + for new_name in mapped if isinstance(mapped, list) else [mapped]: + full_name = "text_encoder." + new_name + if full_name in params: + params[full_name].data.copy_(tensor) + loaded.add(full_name) + + elif name.startswith("action_head.image_encoder."): + # 6c. Image encoder: source-shaped local port. + # Root checkpoint keys already match the local module layout: + # action_head.image_encoder.model.* -> image_encoder.model.* + self._remap_image_encoder_key(name, tensor, params, loaded) + + elif name.startswith("action_head.vae."): + # 6d. VAE: DreamZero WanVideoVAE -> diffusers AutoencoderKLWan + mapped = self._remap_vae_key(name) + if mapped is None: + continue + full_name = "vae." + mapped + if full_name in params: + params[full_name].data.copy_(tensor) + loaded.add(full_name) + + # All other keys (backbone.*, etc.) are silently skipped + + logger.info( + "DreamZero load_weights: loaded %d parameters from root checkpoint", + len(loaded), + ) + return loaded + + # ----------------------------------------------------------------------- + # 6b. Text encoder key remapping (242 keys) + # ----------------------------------------------------------------------- + + @staticmethod + def _remap_text_encoder_key(name: str) -> str | list[str] | None: + """Remap a single DreamZero text encoder key to HF UMT5EncoderModel name(s). + + DreamZero text encoder is a custom reimplementation of UMT5. + Source key structure: action_head.text_encoder.{subkey} + Target: UMT5EncoderModel state_dict keys (without 'text_encoder.' prefix) + + Returns target name(s) relative to text_encoder, or None to skip. 
+ """ + # Strip the source prefix + subkey = name[len("action_head.text_encoder.") :] + + # --- Global keys --- + if subkey == "token_embedding.weight": + # shared.weight and encoder.embed_tokens.weight are the same tensor (tied); + # only shared.weight appears in named_parameters() + return "shared.weight" + if subkey == "norm.weight": + return "encoder.final_layer_norm.weight" + + # --- Per-block keys --- + # Pattern: blocks.{N}.{rest} + m = re_module.match(r"blocks\.(\d+)\.(.*)", subkey) + if not m: + return None + block_idx = m.group(1) + rest = m.group(2) + + prefix = f"encoder.block.{block_idx}" + + # Attention layer (layer.0) + if rest == "attn.q.weight": + return f"{prefix}.layer.0.SelfAttention.q.weight" + if rest == "attn.k.weight": + return f"{prefix}.layer.0.SelfAttention.k.weight" + if rest == "attn.v.weight": + return f"{prefix}.layer.0.SelfAttention.v.weight" + if rest == "attn.o.weight": + return f"{prefix}.layer.0.SelfAttention.o.weight" + if rest == "pos_embedding.embedding.weight": + return f"{prefix}.layer.0.SelfAttention.relative_attention_bias.weight" + if rest == "norm1.weight": + return f"{prefix}.layer.0.layer_norm.weight" + + # FFN layer (layer.1) + if rest == "ffn.gate.0.weight": + return f"{prefix}.layer.1.DenseReluDense.wi_0.weight" + if rest == "ffn.fc1.weight": + return f"{prefix}.layer.1.DenseReluDense.wi_1.weight" + if rest == "ffn.fc2.weight": + return f"{prefix}.layer.1.DenseReluDense.wo.weight" + if rest == "norm2.weight": + return f"{prefix}.layer.1.layer_norm.weight" + + return None + + # ----------------------------------------------------------------------- + # 6d. VAE key remapping (194 keys) + # ----------------------------------------------------------------------- + + @staticmethod + def _remap_vae_key(name: str) -> str | None: + """Remap DreamZero WanVideoVAE keys to diffusers AutoencoderKLWan. 
+ + Source key structure: `action_head.vae.model.*` + Upstream source: `wan_video_vae.py` `WanVideoVAE` / `VideoVAE_` + Target: diffusers `AutoencoderKLWan` state_dict keys (without `vae.` prefix) + """ + if not name.startswith("action_head.vae.model."): + return None + + rest = name[len("action_head.vae.model.") :] + + direct_prefix_map = { + "encoder.conv1.": "encoder.conv_in.", + "encoder.head.0.": "encoder.norm_out.", + "encoder.head.2.": "encoder.conv_out.", + "decoder.conv1.": "decoder.conv_in.", + "decoder.head.0.": "decoder.norm_out.", + "decoder.head.2.": "decoder.conv_out.", + "conv1.": "quant_conv.", + "conv2.": "post_quant_conv.", + } + for src_prefix, dst_prefix in direct_prefix_map.items(): + if rest.startswith(src_prefix): + return dst_prefix + rest[len(src_prefix) :] + + resnet_leaf_map = { + "residual.0.gamma": "norm1.gamma", + "residual.2.weight": "conv1.weight", + "residual.2.bias": "conv1.bias", + "residual.3.gamma": "norm2.gamma", + "residual.6.weight": "conv2.weight", + "residual.6.bias": "conv2.bias", + } + block_leaf_map = { + **resnet_leaf_map, + "shortcut.weight": "conv_shortcut.weight", + "shortcut.bias": "conv_shortcut.bias", + "resample.1.weight": "resample.1.weight", + "resample.1.bias": "resample.1.bias", + "time_conv.weight": "time_conv.weight", + "time_conv.bias": "time_conv.bias", + } + + m = re_module.match(r"encoder\.middle\.(\d+)\.(.*)", rest) + if m: + idx = int(m.group(1)) + tail = m.group(2) + if idx in (0, 2) and tail in resnet_leaf_map: + res_idx = 0 if idx == 0 else 1 + return f"encoder.mid_block.resnets.{res_idx}.{resnet_leaf_map[tail]}" + if idx == 1: + return f"encoder.mid_block.attentions.0.{tail}" + return None + + m = re_module.match(r"decoder\.middle\.(\d+)\.(.*)", rest) + if m: + idx = int(m.group(1)) + tail = m.group(2) + if idx in (0, 2) and tail in resnet_leaf_map: + res_idx = 0 if idx == 0 else 1 + return f"decoder.mid_block.resnets.{res_idx}.{resnet_leaf_map[tail]}" + if idx == 1: + return 
f"decoder.mid_block.attentions.0.{tail}" + return None + + m = re_module.match(r"encoder\.downsamples\.(\d+)\.(.*)", rest) + if m: + idx = int(m.group(1)) + tail = m.group(2) + if tail in block_leaf_map: + return f"encoder.down_blocks.{idx}.{block_leaf_map[tail]}" + return None + + m = re_module.match(r"decoder\.upsamples\.(\d+)\.(.*)", rest) + if m: + idx = int(m.group(1)) + tail = m.group(2) + if tail not in block_leaf_map: + return None + + if idx <= 2: + prefix = f"decoder.up_blocks.0.resnets.{idx}." + elif idx == 3: + prefix = "decoder.up_blocks.0.upsamplers.0." + elif 4 <= idx <= 6: + prefix = f"decoder.up_blocks.1.resnets.{idx - 4}." + elif idx == 7: + prefix = "decoder.up_blocks.1.upsamplers.0." + elif 8 <= idx <= 10: + prefix = f"decoder.up_blocks.2.resnets.{idx - 8}." + elif idx == 11: + prefix = "decoder.up_blocks.2.upsamplers.0." + elif 12 <= idx <= 14: + prefix = f"decoder.up_blocks.3.resnets.{idx - 12}." + else: + return None + return prefix + block_leaf_map[tail] + + return None + + # ----------------------------------------------------------------------- + # 6c. Image encoder key remapping + # ----------------------------------------------------------------------- + + def _remap_image_encoder_key( + self, + name: str, + tensor: torch.Tensor, + params: dict[str, torch.nn.Parameter], + loaded: set[str], + ) -> None: + """Map a DreamZero image encoder key onto the local source-shaped port. + + Source key structure: + action_head.image_encoder.model.* + + Target key structure: + image_encoder.model.* + + Because `DreamZeroImageEncoder` keeps DreamZero's original parameter + layout, this mapping is now a direct prefix strip instead of the older + HF `CLIPVisionModel` remap. + """ + if not name.startswith("action_head.image_encoder."): + return + + full_name = "image_encoder." 
+ name[len("action_head.image_encoder.") :] + if full_name in params: + params[full_name].data.copy_(tensor) + loaded.add(full_name) diff --git a/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py b/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py new file mode 100644 index 00000000000..aca54db5059 --- /dev/null +++ b/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py @@ -0,0 +1,185 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""DreamZero pipeline persistent state. + +Consolidates all cross-forward() state that was originally scattered across: +- ARDroidRoboarenaPolicy._frame_buffers (socket_test_optimized_AR.py) +- WANPolicyHead.kv_cache1/kv_cache_neg (wan_flow_matching_action_tf.py) +- WANPolicyHead.clip_feas/ys (wan_flow_matching_action_tf.py) +""" + +from __future__ import annotations + +import logging + +import numpy as np +import torch + +logger = logging.getLogger(__name__) + +# Number of frames per chunk for subsequent calls (first call uses 1) +# Corresponds to: ARDroidRoboarenaPolicy.FRAMES_PER_CHUNK = 4 +FRAMES_PER_CHUNK = 4 + + +class DreamZeroState: + """Pipeline persistent state across forward() calls. + + Lifecycle: + - Created once in DreamZeroPipeline.__init__() + - Mutated every forward() call (frame append, KV cache grow) + - reset() on new session / language change / local_attn_size exceeded + """ + + def __init__(self) -> None: + self.reset() + + # ------------------------------------------------------------------ + # Frame accumulation (single stitched buffer) + # Transform outputs stitched single frame per call. + # We accumulate here to build multi-frame video for AR inference. 
+ # Source: socket_test_optimized_AR.py L110-144 (adapted from per-camera to stitched) + # ------------------------------------------------------------------ + + def accumulate_frames(self, stitched: np.ndarray) -> np.ndarray: + """Accumulate stitched frames and return multi-frame video. + + Args: + stitched: (H, W, C) single frame or (T, H, W, C) multi-frame, + already stitched by transform. + + Returns: + (T, H, W, C) ndarray. T=1 for first call, T=FRAMES_PER_CHUNK(4) after. + """ + if stitched.ndim == 3: + self.stitched_buffer.append(stitched) + elif stitched.ndim == 4: + self.stitched_buffer.extend(list(stitched)) + else: + raise ValueError(f"Expected 3D or 4D stitched, got {stitched.ndim}D") + + num_frames = 1 if self.call_count == 0 else FRAMES_PER_CHUNK + + if len(self.stitched_buffer) >= num_frames: + frames = self.stitched_buffer[-num_frames:] + else: + # Pad by repeating first frame + frames = list(self.stitched_buffer) + while len(frames) < num_frames: + frames.insert(0, self.stitched_buffer[0]) + + self.call_count += 1 + return np.stack(frames, axis=0) # (T, H, W, C) + + # ------------------------------------------------------------------ + # Reset / should_reset + # Source: wan_flow_matching_action_tf.py L968-981 + # ------------------------------------------------------------------ + + def reset(self) -> None: + """Clear all state. 
+ + Source: + - socket_test_optimized_AR.py L302-330: ARDroidRoboarenaPolicy._reset_state + - wan_flow_matching_action_tf.py L185-199: WANPolicyHead.__init__ state fields + """ + # Frame buffer — single stitched buffer + self.stitched_buffer: list[np.ndarray] = [] + self.call_count: int = 0 + + # KV cache — from WANPolicyHead.__init__ L185-188 + self.kv_cache: list[torch.Tensor] | None = None + self.kv_cache_neg: list[torch.Tensor] | None = None + self.crossattn_cache: list[dict[str, bool | torch.Tensor | None]] | None = None + self.crossattn_cache_neg: list[dict[str, bool | torch.Tensor | None]] | None = None + self.current_start_frame: int = 0 # WANPolicyHead L199 + + # Encoding cache — from WANPolicyHead.__init__ L197-200 + self.clip_feas: torch.Tensor | None = None + self.ys: torch.Tensor | None = None + self.language: torch.Tensor | None = None # WANPolicyHead L200 + + def should_reset(self, text_tokens: torch.Tensor | None, num_video_frames: int, local_attn_size: int) -> bool: + """Determine if state should be reset before this forward(). + + Source: wan_flow_matching_action_tf.py L968-981 + """ + # L968-971: first call (language not set yet) + if self.language is None: + logger.info("language is None, resetting") + return True + + # L972-975: language changed + if text_tokens is not None and not torch.equal(self.language, text_tokens): + logger.info("language changed, resetting") + return True + + # L976-978: single-frame input (signals new episode in real-world eval) + # NOTE: after accumulate_frames, num_video_frames is the accumulated T + # (1 for first call, 4 for subsequent). Only reset on true single-frame + # which happens when the stitched_buffer was cleared externally. 
+ if num_video_frames == 1 and self.call_count > 1: + logger.info("single frame input after first call, resetting") + return True + + # L979-981: KV cache exceeded local attention window + if local_attn_size != -1 and self.current_start_frame >= local_attn_size: + logger.info( + "current_start_frame %d >= local_attn_size %d, resetting", self.current_start_frame, local_attn_size + ) + return True + + return False + + # ------------------------------------------------------------------ + # KV cache management + # Source: wan_flow_matching_action_tf.py L480-512 + # ------------------------------------------------------------------ + + def create_kv_caches( + self, + batch_size: int, + dtype: torch.dtype, + device: torch.device, + num_layers: int, + num_heads: int, + head_dim: int, + ) -> None: + """Initialize empty KV caches and cross-attention caches. + Source: wan_flow_matching_action_tf.py L480-512 + """ + self.kv_cache = [ + torch.zeros(2, batch_size, 0, num_heads, head_dim, dtype=dtype, device=device) for _ in range(num_layers) + ] + self.kv_cache_neg = [ + torch.zeros(2, batch_size, 0, num_heads, head_dim, dtype=dtype, device=device) for _ in range(num_layers) + ] + + self.crossattn_cache = [{"is_init": False, "k": None, "v": None} for _ in range(num_layers)] + self.crossattn_cache_neg = [{"is_init": False, "k": None, "v": None} for _ in range(num_layers)] + + def update_kv_cache( + self, + layer_index: int, + updated_kv: torch.Tensor, + is_negative: bool = False, + ) -> None: + """Update a single layer's KV cache after prefill. 
+ Source: wan_flow_matching_action_tf.py L856-858 + """ + cache = self.kv_cache_neg if is_negative else self.kv_cache + assert cache is not None, "KV caches not initialized, call create_kv_caches first" + cache[layer_index] = updated_kv.clone() + + def get_kv_caches(self, is_negative: bool = False) -> list[torch.Tensor]: + """Get KV caches for the specified branch.""" + cache = self.kv_cache_neg if is_negative else self.kv_cache + assert cache is not None, "KV caches not initialized" + return cache + + def get_crossattn_caches(self, is_negative: bool = False) -> list[dict[str, bool | torch.Tensor | None]]: + """Get cross-attention caches for the specified branch.""" + cache = self.crossattn_cache_neg if is_negative else self.crossattn_cache + assert cache is not None, "Cross-attn caches not initialized" + return cache diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index d8302c11501..3ceb720f1da 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -266,6 +266,11 @@ "pipeline_hidream_image", "HiDreamImagePipeline", ), + "DreamZeroPipeline": ( + "dreamzero", + "pipeline_dreamzero", + "DreamZeroPipeline", + ), } diff --git a/vllm_omni/diffusion/utils/hf_utils.py b/vllm_omni/diffusion/utils/hf_utils.py index 6beb1823ce0..0f765f50288 100644 --- a/vllm_omni/diffusion/utils/hf_utils.py +++ b/vllm_omni/diffusion/utils/hf_utils.py @@ -1,4 +1,5 @@ import os +from collections.abc import Mapping from functools import lru_cache from vllm.logger import init_logger @@ -27,6 +28,34 @@ def _looks_like_bagel(model_name: str) -> bool: return False +def _looks_like_dreamzero(model_name: str) -> bool: + """Best-effort detection for DreamZero-style VLA diffusion checkpoints.""" + try: + cfg = get_hf_file_to_dict("config.json", model_name) + if cfg.get("model_type") != "vla": + return False + action_head_cfg = cfg.get("action_head_cfg") or {} + if not isinstance(action_head_cfg, Mapping): + return False + action_head_config 
= action_head_cfg.get("config") or {} + if not isinstance(action_head_config, Mapping): + return False + diffusion_model_cfg = action_head_config.get("diffusion_model_cfg") or {} + if not isinstance(diffusion_model_cfg, Mapping): + return False + return ( + action_head_cfg.get("_target_") + == "groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHead" + and diffusion_model_cfg.get("_target_") + == ( + "groot.vla.model.dreamzero.modules." + "wan_video_dit_action_casual_chunk.CausalWanModel" + ) + ) + except Exception: + return False + + @lru_cache def is_diffusion_model(model_name: str) -> bool: """Check if a model is a diffusion model. @@ -72,6 +101,7 @@ def is_diffusion_model(model_name: str) -> bool: except Exception as e: logger.debug("Failed to load diffusers config via DiffusionPipeline: %s", e) - # Bagel is not a diffusers pipeline (no model_index.json), but is still a - # diffusion-style model in vllm-omni. Detect it via config.json. - return _looks_like_bagel(model_name) + # Bagel and DreamZero are not diffusers pipelines (no model_index.json), + # but are still diffusion-style models in vllm-omni. Detect them via + # config.json. 
+ return _looks_like_bagel(model_name) or _looks_like_dreamzero(model_name) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 29788d95868..59fedb71339 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -115,6 +115,7 @@ VideoListResponse, VideoResponse, ) +from vllm_omni.entrypoints.openai.realtime.robot.openpi_serving import ServingRealtimeRobotOpenPI from vllm_omni.entrypoints.openai.realtime_connection import RealtimeConnection from vllm_omni.entrypoints.openai.serving_audio_generate import OmniOpenAIServingAudioGenerate from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat @@ -629,6 +630,10 @@ async def omni_init_app_state( ) state.openai_streaming_speech = None state.openai_streaming_video = None + state.openai_serving_realtime_robot = ServingRealtimeRobotOpenPI( + engine_client=engine_client, + model_name=model_name, + ) state.enable_server_load_tracking = getattr(args, "enable_server_load_tracking", False) state.server_load_metrics = 0 @@ -947,6 +952,8 @@ async def omni_init_app_state( stage_configs=state.stage_configs, ) + state.openai_serving_realtime_robot = None + state.enable_server_load_tracking = args.enable_server_load_tracking state.server_load_metrics = 0 state.sleeping_stages = set() @@ -1406,6 +1413,28 @@ async def realtime_websocket(websocket: WebSocket): await connection.handle_connection() +@router.websocket("/v1/realtime/robot/openpi") +async def realtime_robot_openpi(websocket: WebSocket): + """WebSocket endpoint for robot policy inference (OpenPI protocol). + + Binary frames: msgpack observation/action (DreamZero/OpenPI compatible). + Text frames: JSON control events (session.update, etc.). + See realtime.robot.openpi_connection.py for protocol details. 
+ """ + from vllm_omni.entrypoints.openai.realtime.robot.openpi_connection import ( + RobotRealtimeConnection, + ) + + serving = getattr(websocket.app.state, "openai_serving_realtime_robot", None) + if serving is None: + await websocket.accept() + await websocket.send_json({"type": "error", "error": "Robot policy not available", "code": "unsupported"}) + await websocket.close() + return + connection = RobotRealtimeConnection(websocket, serving) + await connection.handle_connection() + + # Health and Model endpoints for diffusion mode diff --git a/vllm_omni/entrypoints/openai/realtime/robot/__init__.py b/vllm_omni/entrypoints/openai/realtime/robot/__init__.py new file mode 100644 index 00000000000..9881313609a --- /dev/null +++ b/vllm_omni/entrypoints/openai/realtime/robot/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: Apache-2.0 diff --git a/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py new file mode 100644 index 00000000000..ec77f50d8b1 --- /dev/null +++ b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""WebSocket connection for robot policy inference (OpenPI protocol). 
+ +Protocol (compatible with DreamZero test_client_AR.py): + Connect -> server sends msgpack(PolicyServerConfig fields) + Infer -> client sends msgpack(obs), server sends msgpack(ndarray) + Reset -> client sends msgpack({endpoint:reset}), server sends "reset successful" +""" + +from __future__ import annotations + +import traceback +from typing import Any + +from fastapi import WebSocket +from starlette.websockets import WebSocketDisconnect +from vllm.logger import init_logger + +from vllm_omni.entrypoints.openai.realtime.robot.openpi_serving import ( + ServingRealtimeRobotOpenPI, +) + +logger = init_logger(__name__) + + +def _pack(obj: Any) -> bytes: + from openpi_client import msgpack_numpy + + return msgpack_numpy.packb(obj) + + +def _unpack(data: bytes) -> Any: + from openpi_client import msgpack_numpy + + return msgpack_numpy.unpackb(data) + + +class RobotRealtimeConnection: + """WebSocket connection for robot policy inference.""" + + def __init__( + self, + websocket: WebSocket, + serving: ServingRealtimeRobotOpenPI, + ) -> None: + self.websocket = websocket + self.serving = serving + + async def handle_connection(self) -> None: + """Main loop. 
Matches DreamZero policy_server.py._handler.""" + await self.websocket.accept() + + try: + # Send metadata (PolicyServerConfig fields) + metadata = { + "image_resolution": (180, 320), + "n_external_cameras": 2, + "needs_wrist_camera": True, + "needs_stereo_camera": False, + "needs_session_id": True, + "action_space": "joint_position", + } + await self.websocket.send_bytes(_pack(metadata)) + + while True: + msg = await self.websocket.receive() + + if msg.get("type") == "websocket.disconnect": + break + + if "bytes" not in msg or not msg["bytes"]: + continue + + try: + obs = _unpack(msg["bytes"]) + endpoint = obs.pop("endpoint", "infer") + + if endpoint == "reset": + self.serving.reset(obs) + await self.websocket.send_text("reset successful") + else: + actions = await self.serving.infer(obs) + await self.websocket.send_bytes(_pack(actions)) + except Exception: + logger.exception("Error handling request") + try: + await self.websocket.send_text(traceback.format_exc()) + except Exception: + break + + except WebSocketDisconnect: + pass + except Exception: + logger.exception("Connection error") diff --git a/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py b/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py new file mode 100644 index 00000000000..5b92c79bb7d --- /dev/null +++ b/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py @@ -0,0 +1,132 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Serving layer for robot policy inference via `/v1/realtime/robot/openpi`. + +Flow: raw obs → transform (dataset key mapping) → unified obs → +`DiffusionEngine.step()` → actions. +Transform is stateless and selected per-request via `obs["embodiment"]`. 
+""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import torch +from vllm.logger import init_logger + +from vllm_omni.entrypoints.openai.realtime.robot.transform.base import ( + RobotPolicyTransform, + get_transform, +) + +logger = init_logger(__name__) + +# Default embodiment when not specified in obs +DEFAULT_EMBODIMENT = "roboarena" + + +class ServingRealtimeRobotOpenPI: + """Robot policy serving layer for OpenPI protocol. + + Stateless transform routes by obs["embodiment"]. + Model-specific state (frame buffer, KV cache) lives in pipeline. + """ + + def __init__( + self, + engine_client: Any, + model_name: str | None = None, + default_embodiment: str = DEFAULT_EMBODIMENT, + ) -> None: + self.engine_client = engine_client + self.model_name = model_name + self.default_embodiment = default_embodiment + self._current_session_id: str | None = None + self._call_count = 0 + + # Ensure default transforms are registered + self._ensure_transforms_loaded() + + @staticmethod + def _ensure_transforms_loaded() -> None: + """Import transform modules to trigger register_transform calls.""" + import vllm_omni.entrypoints.openai.realtime.robot.transform.droid # noqa: F401 + import vllm_omni.entrypoints.openai.realtime.robot.transform.roboarena # noqa: F401 + + def reset(self, obs: dict) -> None: + """Reset session state.""" + self._call_count = 0 + self._current_session_id = None + + async def infer(self, obs: dict) -> np.ndarray: + """raw obs → transform → engine → actions.""" + # Session tracking + session_id = obs.get("session_id") + if session_id is not None and session_id != self._current_session_id: + if self._current_session_id is not None: + logger.info("Session changed %s → %s", self._current_session_id, session_id) + self.reset({}) + self._current_session_id = session_id + + self._call_count += 1 + + # Transform: dataset format → unified format + transform = self._get_transform(obs) + unified_obs = 
transform.transform_input(obs) + + # Build request, run inference through AsyncOmni + request = self._build_request(unified_obs) + result = None + async for output in self.engine_client.generate( + prompt=request.prompts[0], + request_id=request.request_ids[0], + sampling_params_list=[request.sampling_params], + ): + result = output + if result is None: + raise RuntimeError("Robot OpenPI request produced no output.") + + # Extract actions (via transform or default) + return self._extract_actions(result, transform) + + def _get_transform(self, obs: dict) -> RobotPolicyTransform: + """Select transform by obs['embodiment'] or default.""" + embodiment = obs.get("embodiment", self.default_embodiment) + return get_transform(embodiment) + + def _build_request(self, unified_obs: dict) -> Any: + """Build engine request from unified obs. + + Returns an `OmniDiffusionRequest` payload consumed by + `AsyncOmni.generate()` and routed to the diffusion stage. + """ + from vllm_omni.diffusion.request import OmniDiffusionRequest + from vllm_omni.inputs.data import OmniDiffusionSamplingParams + + extra_args = { + "reset": self._call_count <= 1, + "session_id": self._current_session_id or "default", + "unified_obs": unified_obs, + } + + prompt = unified_obs["prompt"] + sampling_params = OmniDiffusionSamplingParams(extra_args=extra_args) + return OmniDiffusionRequest( + prompts=[prompt], + sampling_params=sampling_params, + request_ids=[f"robot-{self._current_session_id or 'default'}"], + ) + + def _extract_actions(self, result: Any, transform: RobotPolicyTransform) -> np.ndarray: + """Extract actions from engine result.""" + if hasattr(result, "__iter__"): + result = list(result) + if result: + result = result[0] + + actions = transform.transform_output(result) + if isinstance(actions, torch.Tensor): + return actions.cpu().float().numpy() + return np.asarray(actions, dtype=np.float32) diff --git a/vllm_omni/entrypoints/openai/realtime/robot/transform/__init__.py 
b/vllm_omni/entrypoints/openai/realtime/robot/transform/__init__.py new file mode 100644 index 00000000000..208f01a7cb5 --- /dev/null +++ b/vllm_omni/entrypoints/openai/realtime/robot/transform/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm_omni/entrypoints/openai/realtime/robot/transform/base.py b/vllm_omni/entrypoints/openai/realtime/robot/transform/base.py new file mode 100644 index 00000000000..1a0ce0e3002 --- /dev/null +++ b/vllm_omni/entrypoints/openai/realtime/robot/transform/base.py @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Base transform interface for robot policy serving. + +Transforms handle dataset-specific concerns ONLY: + - Observation key mapping + - Multi-view stitching (embodiment-specific layout) + - Language template wrapping (embodiment-specific) + - Raw state extraction (dataset-specific keys) + - Output action slicing (to actual action_dim) + +Model-specific concerns belong in the pipeline: + - Tokenization (pipeline owns tokenizer) + - State padding (pipeline knows MAX_STATE_DIM) + - Negative prompt (pipeline owns the string) + - Noise generation, encoding, decoding + +Flow: + raw obs (dataset format) + → Transform.transform_input() + → unified dict (stitched video, templated prompt str, raw state) + → Pipeline.forward() (tokenize, pad, encode, denoise) + → DiffusionOutput + → Transform.transform_output() + → ndarray (N, action_dim) +""" + +from __future__ import annotations + +from typing import Any + +import numpy as np + + +class RobotPolicyTransform: + """Base class for dataset-specific observation transforms. 
+ + Subclasses MUST define: + IMAGE_KEY_MAP: dict — dataset obs keys → unified keys + EMBODIMENT_NAME: str — embodiment identity (pipeline maps to numeric ID) + ACTION_DIM: int — actual action dimensions (for output slicing) + + Subclasses MUST override: + _stitch_views() — multi-view → single stitched image + _language_template() — prompt → embodiment-aware template + _extract_raw_state() — obs → raw state ndarray + """ + + IMAGE_KEY_MAP: dict[str, str] + EMBODIMENT_NAME: str + ACTION_DIM: int + + def transform_input(self, obs: dict) -> dict: + """Dataset-specific transform: key map → stitch → template → state. + Source: dreamzero_cotrain.py apply_single() L498-596 + """ + # 1. Map image keys → unified keys + images: dict[str, np.ndarray] = {} + for src_key, dst_key in self.IMAGE_KEY_MAP.items(): + if src_key in obs: + images[dst_key] = np.asarray(obs[src_key]) + + # 2. Multi-view stitching + stitched = self._stitch_views(images) + + # 3. Language template (string only, pipeline tokenizes) + prompt = obs.get("prompt", "") + templated_prompt = self._language_template(prompt) + + # 4. Raw state extraction (pipeline pads) + raw_state = self._extract_raw_state(obs) + + # 5. Build unified output + unified: dict[str, Any] = { + "images": stitched, # ndarray (T, H_out, W_out, 3) + "prompt": templated_prompt, # str (templated, not tokenized) + "state": raw_state, # ndarray (state_dim,) — pipeline pads + "embodiment_name": self.EMBODIMENT_NAME, + } + if "session_id" in obs: + unified["session_id"] = obs["session_id"] + return unified + + def transform_output(self, result: Any) -> np.ndarray: + """Extract action ndarray (N, ACTION_DIM) from model output. + + Engine outputs actions through ``multimodal_output["actions"]``. + Pipeline outputs (horizon, max_action_dim) after batch squeeze. + We slice to actual ACTION_DIM. 
+ """ + if not hasattr(result, "multimodal_output") or result.multimodal_output is None: + raise RuntimeError("Missing multimodal_output in robot policy result") + + actions = result.multimodal_output.get("actions") + if actions is None: + raise RuntimeError("Missing multimodal_output['actions'] in robot policy result") + + actions = np.asarray(actions, dtype=np.float32) + # Handle any remaining batch dims: squeeze to 2D (horizon, dim) + while actions.ndim > 2: + actions = actions[0] + # Slice padded dim to actual ACTION_DIM + if actions.ndim == 2 and actions.shape[-1] > self.ACTION_DIM: + actions = actions[:, : self.ACTION_DIM] + return actions + + # ------------------------------------------------------------------ + # Subclass MUST override + # ------------------------------------------------------------------ + + def _stitch_views(self, images: dict[str, np.ndarray]) -> np.ndarray: + """Stitch camera views into single image. + Input: unified key → ndarray (H,W,3) or (T,H,W,3). + Output: ndarray (T, H_out, W_out, 3). + """ + raise NotImplementedError + + def _language_template(self, prompt: str) -> str: + """Wrap prompt in embodiment-specific template string.""" + raise NotImplementedError + + def _extract_raw_state(self, obs: dict) -> np.ndarray: + """Extract raw state vector from obs. + Returns: ndarray (state_dim,) float64. Pipeline handles padding. + """ + raise NotImplementedError + + +# Transform registry — keyed by embodiment/dataset name +TRANSFORMS: dict[str, RobotPolicyTransform] = {} + + +def register_transform(name: str, transform: RobotPolicyTransform) -> None: + TRANSFORMS[name] = transform + + +def get_transform(name: str) -> RobotPolicyTransform: + if name not in TRANSFORMS: + raise KeyError(f"Unknown transform '{name}'. 
Available: {list(TRANSFORMS.keys())}") + return TRANSFORMS[name] diff --git a/vllm_omni/entrypoints/openai/realtime/robot/transform/droid.py b/vllm_omni/entrypoints/openai/realtime/robot/transform/droid.py new file mode 100644 index 00000000000..c6053c54bd0 --- /dev/null +++ b/vllm_omni/entrypoints/openai/realtime/robot/transform/droid.py @@ -0,0 +1,195 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""DROID dataset transform. + +DROID uses 1-indexed exterior cameras, 3 views total (OXE_DROID embodiment). +Stitching layout (same as RoboArena — both are OXE_DROID): + ┌─────────────────────────┐ + │ wrist (2x width) │ ← pixel-repeat along width + ├────────────┬────────────┤ + │ left ext │ right ext │ + └────────────┴────────────┘ + +Direct stitching source: + `third_party/dreamzero/groot/vla/model/dreamzero/transform/dreamzero_cotrain.py:337` + to + `third_party/dreamzero/groot/vla/model/dreamzero/transform/dreamzero_cotrain.py:355` + +Size assumptions for the current DreamZero path: + `third_party/dreamzero/groot/vla/configs/model/dreamzero/action_head/wan_flow_matching_action_tf.yaml:17` + `third_party/dreamzero/scripts/train/droid_training_full_finetune.sh:82` + `third_party/dreamzero/scripts/train/droid_training_full_finetune.sh:83` + `third_party/dreamzero/scripts/train/droid_training_full_finetune.sh:86` +""" + +from __future__ import annotations + +import numpy as np +import torch +import torchvision.transforms.v2 as T + +from vllm_omni.entrypoints.openai.realtime.robot.transform.base import ( + RobotPolicyTransform, + register_transform, +) + + +class DroidTransform(RobotPolicyTransform): + """Transform for DROID dataset (OXE_DROID embodiment). 
+ + DROID observation keys (1-indexed exterior cameras): + observation/exterior_image_1_left → left exterior + observation/exterior_image_2_left → right exterior + observation/wrist_image_left → wrist + """ + + IMAGE_KEY_MAP = { + "observation/exterior_image_1_left": "images/exterior_0", + "observation/exterior_image_2_left": "images/exterior_1", + "observation/wrist_image_left": "images/wrist", + } + EMBODIMENT_NAME = "oxe_droid" + ACTION_DIM = 8 # 7 joint + 1 gripper + _VIDEO_CROP_SCALE = 0.95 + _VIDEO_RESIZE_HW = (176, 320) + + @classmethod + def _preprocess_view(cls, arr: np.ndarray) -> np.ndarray: + """Match source eval transform for OXE_DROID camera views. + + Source transform chain from `experiment_cfg/conf.yaml`: + `VideoToTensor -> VideoCrop(scale=0.95, eval=center crop) -> + VideoResize(height=176, width=320, interpolation=linear, antialias=True) -> + VideoToNumpy` + """ + frames = torch.from_numpy(arr).to(torch.float32).permute(0, 3, 1, 2) / 255.0 + crop_h = int(arr.shape[1] * cls._VIDEO_CROP_SCALE) + crop_w = int(arr.shape[2] * cls._VIDEO_CROP_SCALE) + frames = T.CenterCrop((crop_h, crop_w))(frames) + frames = T.Resize( + cls._VIDEO_RESIZE_HW, + interpolation=T.InterpolationMode.BILINEAR, + antialias=True, + )(frames) + return (frames.permute(0, 2, 3, 1) * 255.0).to(torch.uint8).cpu().numpy() + + def _stitch_views(self, images: dict[str, np.ndarray]) -> np.ndarray: + """OXE_DROID 2x2 stitching: wrist top (2x wide), exteriors bottom. + Direct layout correspondence: + - output canvas `(t, 2H, 2W)` ↔ `dreamzero_cotrain.py:337` + - wrist repeat-along-width ↔ `dreamzero_cotrain.py:339`-`342` + - bottom left/right placement ↔ `dreamzero_cotrain.py:344`-`353` + + The resize-to-176x320 step is not done inside upstream + `_prepare_video()`. 
Upstream expects the video path to already satisfy + the model's spatial assumptions; for the current DreamZero config that + assumption comes from: + - `wan_flow_matching_action_tf.yaml:17` (`frame_seqlen: 880`) + - `droid_training_full_finetune.sh:82`-`86` + so we materialize that precondition here for online serving. + """ + left_ext = images.get("images/exterior_0") + right_ext = images.get("images/exterior_1") + wrist = images.get("images/wrist") + + # Ensure 4D: (T, H, W, C) + def ensure_4d(arr: np.ndarray | None) -> np.ndarray | None: + if arr is None: + return None + return arr if arr.ndim == 4 else arr[np.newaxis] + + left_ext = ensure_4d(left_ext) + right_ext = ensure_4d(right_ext) + wrist = ensure_4d(wrist) + + # Determine shape from first available view. + # Upstream `_prepare_video()` assumes views already share the same H/W + # before it allocates `concat_images`; see `dreamzero_cotrain.py:337`. + ref = next((v for v in [wrist, left_ext, right_ext] if v is not None), None) + if ref is None: + # No direct upstream line: this is a serving-side empty placeholder. + # We choose 352x640 so the empty sample matches the active DreamZero + # DROID path (per-view 176x320 -> stitched 352x640), consistent with + # `droid_training_full_finetune.sh:82`-`86` and + # `wan_flow_matching_action_tf.yaml:17`. + return np.zeros((1, 352, 640, 3), dtype=np.uint8) + + # Match the source eval transform chain before `ConcatTransform` / + # `DreamTransform`: center crop by 0.95, then resize each view to + # 176x320. This is the actual preprocessing path used by + # `GrootSimPolicy.eval_transform`, not just a serving-side heuristic. 
+ def maybe_preprocess(arr: np.ndarray | None) -> np.ndarray | None: + if arr is None: + return None + return self._preprocess_view(arr) + + left_ext = maybe_preprocess(left_ext) + right_ext = maybe_preprocess(right_ext) + wrist = maybe_preprocess(wrist) + ref = next((v for v in [wrist, left_ext, right_ext] if v is not None), None) + assert ref is not None + t, h, w, c = ref.shape + + # Match upstream canvas dtype exactly: + # `concat_images = np.zeros(..., dtype=images.dtype)` at + # `dreamzero_cotrain.py:337`. + out = np.zeros((t, 2 * h, 2 * w, c), dtype=ref.dtype) # (T, 2H, 2W, C) + + # Top row: wrist repeated 2x along width. + # Corresponds to `dreamzero_cotrain.py:339`-`342`. + if wrist is not None: + wrist_wide = np.repeat(wrist, 2, axis=2) # (T, H, 2W, C) + out[:, :h, :] = wrist_wide + + # Bottom row: left exterior | right exterior. + # Corresponds to `dreamzero_cotrain.py:344`-`353`. + if left_ext is not None: + out[:, h:, :w] = left_ext + if right_ext is not None: + out[:, h:, w:] = right_ext + + return out + + def _language_template(self, prompt: str) -> str: + """Match the source OXE_DROID language prompt expansion exactly. + + Source correspondence: + - `dreamzero_cotrain.py:collate()` OXE_DROID branch + - `dreamzero_cotrain.py:HuggingfaceTokenizer(clean='whitespace')` + + Upstream online eval does *not* tokenize the raw instruction directly. + After `DreamTransform.apply_single()` emits the raw language string, + `collate()` rewrites it into the multi-view description below and only + then tokenizes it. Using the raw prompt here changes the token ids and + measurably changes the denoising trajectory. 
+ """ + prompt = (prompt or "Perform the default behavior.").strip() + prompt_lower = prompt.lower() + return ( + "A multi-view video shows that a robot " + + prompt_lower + + " The video is split into three views: The top view shows the " + + "camera view from the robot's wrist, the bottom-left view shows " + + "the camera view from the left exterior camera, and the " + + "bottom-right view shows the camera view from the right exterior " + + "camera. During training, one of the two bottom exterior views " + + "may be a black screen (dropped view). The robot " + + prompt_lower + ) + + def _extract_raw_state(self, obs: dict) -> np.ndarray: + """OXE_DROID state: 7 joint + 1 gripper = 8 dims. + Source: dreamzero_cotrain.py _prepare_state() L436-467 + """ + parts = [] + if "observation/joint_position" in obs: + parts.append(np.asarray(obs["observation/joint_position"], dtype=np.float64).flatten()) + if "observation/gripper_position" in obs: + parts.append(np.asarray(obs["observation/gripper_position"], dtype=np.float64).flatten()) + if parts: + return np.concatenate(parts) + return np.zeros(8, dtype=np.float64) + + +register_transform("droid", DroidTransform()) diff --git a/vllm_omni/entrypoints/openai/realtime/robot/transform/roboarena.py b/vllm_omni/entrypoints/openai/realtime/robot/transform/roboarena.py new file mode 100644 index 00000000000..b75b94b7a64 --- /dev/null +++ b/vllm_omni/entrypoints/openai/realtime/robot/transform/roboarena.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""RoboArena dataset transform. + +RoboArena uses 0-indexed exterior cameras, 3 views total (OXE_DROID embodiment). +Same stitching layout as DROID — both map to OXE_DROID in DreamZero. 
+ +Source: socket_test_optimized_AR.py L104-108 (key mapping) +""" + +from __future__ import annotations + +from vllm_omni.entrypoints.openai.realtime.robot.transform.base import ( + register_transform, +) +from vllm_omni.entrypoints.openai.realtime.robot.transform.droid import ( + DroidTransform, +) + + +class RoboArenaTransform(DroidTransform): + """Transform for RoboArena dataset. + + Same embodiment as DROID (OXE_DROID), same stitching and template. + Only difference: 0-indexed exterior camera keys. + + RoboArena observation keys (0-indexed): + observation/exterior_image_0_left → left exterior + observation/exterior_image_1_left → right exterior + observation/wrist_image_left → wrist + + Source: socket_test_optimized_AR.py L104-108 + """ + + IMAGE_KEY_MAP = { + "observation/exterior_image_0_left": "images/exterior_0", + "observation/exterior_image_1_left": "images/exterior_1", + "observation/wrist_image_left": "images/wrist", + } + + +register_transform("roboarena", RoboArenaTransform()) From 106a3d5702c25b4031c5fc3c78602acef5faee20 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Mon, 13 Apr 2026 01:14:02 +0000 Subject: [PATCH 02/45] Fix DreamZero debug video decode parity Add a dedicated DreamZero video-latent decode helper that matches upstream WanVideoVAE decode semantics. The fix keeps forward() output as normalized video latents for serving, but documents the contract clearly and restores exact debug-video parity by inverting latent normalization in bf16 the same way as the upstream source path. 
Signed-off-by: Yangshen Deng Co-authored-by: Meng --- .../models/dreamzero/pipeline_dreamzero.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py b/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py index f6e387519cd..f7cd4a7e714 100644 --- a/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py +++ b/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py @@ -573,6 +573,36 @@ def _encode_vae_latents(self, videos: torch.Tensor) -> torch.Tensor: mu = (mu - mean) * inv_std return mu.to(dtype=input_dtype) + def decode_video_latents(self, video_latents: torch.Tensor) -> torch.Tensor: + """Decode DreamZero normalized VAE latents into RGB video tensors. + + `forward()` returns `video` in the same form as upstream + `WANPolicyHead.lazy_joint_video_action()` / `GrootSimPolicy`: normalized + VAE latents shaped `[B, C, T, H, W]`, not decoded RGB frames. Upstream + only decodes those latents when saving the debug video on reset. + + Source correspondence: + - `socket_test_optimized_AR.py` `_reset_state()` calls + `action_head.vae.decode(video_across_time_cat, ...)`. + - `wan_video_vae.py` `WanVideoVAE.decode()` delegates to + `VideoVAE_.decode(z, scale)`. + - `wan_video_vae.py` `VideoVAE_.decode()` first inverts latent + normalization as `z = z / scale[1] + scale[0]`, where `scale[1]` is + the precomputed fp32 reciprocal std cast to the runtime dtype. + + The cast-before-division detail is required for bf16 video parity; doing + the inverse in fp32 and then casting changes RGB frames even though the + action path is unaffected. 
+ """ + vae_dtype = self.vae.dtype + vae_device = next(self.vae.parameters()).device + latents = video_latents.to(device=vae_device, dtype=vae_dtype) + mean = self.vae_latents_mean.to(device=vae_device, dtype=vae_dtype) + inv_std = self.vae_latents_inv_std.to(device=vae_device, dtype=vae_dtype) + latents = latents / inv_std + mean + with torch.no_grad(): + return self.vae.decode(latents, return_dict=False)[0] + # ----------------------------------------------------------------------- # KV cache prefill # ----------------------------------------------------------------------- @@ -1092,6 +1122,9 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: return DiffusionOutput( output={ "actions": actions_np, # L1273 + # Source `video_pred` is normalized VAE latent output, not RGB. + # Use `decode_video_latents()` for DreamZero-equivalent debug + # video decoding. "video": video_out.transpose(1, 2).cpu(), }, ) From 4e82bb92da0f5c2fcd194986b74f549a4ac08529 Mon Sep 17 00:00:00 2001 From: Meng Date: Sat, 18 Apr 2026 21:15:27 +0000 Subject: [PATCH 03/45] Address DreamZero PR review feedback Signed-off-by: Meng Co-authored-by: Yangshen Deng --- docs/models/dreamzero/README.md | 4 + docs/models/dreamzero/quick_start.md | 143 +++++++ examples/online_serving/dreamzero/README.md | 64 +++ .../assets/exterior_image_1_left.mp4 | Bin 0 -> 13104 bytes .../assets/exterior_image_2_left.mp4 | Bin 0 -> 11847 bytes .../dreamzero/assets/wrist_image_left.mp4 | Bin 0 -> 27370 bytes .../online_serving/dreamzero/openpi_client.py | 343 ++++++++++++++++ .../online_serving/dreamzero/run_server.sh | 27 ++ tests/diffusion/test_diffusion_scheduler.py | 44 +++ tests/diffusion/test_stage_diffusion_proc.py | 40 ++ tests/dreamzero/test_utils.py | 24 ++ .../upstream/openpi_test_client_ar.py | 366 ++++++++++++++++++ .../upstream/test_client_ar_path_parity.py | 171 ++++++++ .../upstream/test_openpi_e2e_source_parity.py | 345 +++++++++++++++++ 
.../test_roboarena_transform_source_parity.py | 121 ++++++ .../test_video_preprocess_source_parity.py | 68 ++++ .../upstream_socket_server_no_compile.py | 177 +++++++++ tests/e2e/online_serving/test_dreamzero.py | 111 ++++++ .../openai_api/test_openpi_connection.py | 168 ++++++++ .../openai_api/test_openpi_serving.py | 147 +++++++ tests/entrypoints/test_omni_entrypoints.py | 13 + .../test_resolve_dreamzero_config.py | 49 +++ .../examples/online_serving/test_dreamzero.py | 101 +++++ .../models/dreamzero/pipeline_dreamzero.py | 79 ++-- .../models/dreamzero/state_dreamzero.py | 4 +- .../models/dreamzero/transform/__init__.py | 35 ++ .../models/dreamzero}/transform/base.py | 27 +- .../models/dreamzero}/transform/droid.py | 2 +- .../models/dreamzero}/transform/roboarena.py | 4 +- vllm_omni/diffusion/models/dreamzero/utils.py | 29 ++ vllm_omni/entrypoints/openai/api_server.py | 2 +- .../realtime/robot/openpi_connection.py | 71 +++- .../openai/realtime/robot/openpi_serving.py | 147 +++++-- .../realtime/robot/transform/__init__.py | 2 - vllm_omni/entrypoints/utils.py | 4 + .../stage_configs/dreamzero.yaml | 21 + 36 files changed, 2826 insertions(+), 127 deletions(-) create mode 100644 docs/models/dreamzero/README.md create mode 100644 docs/models/dreamzero/quick_start.md create mode 100644 examples/online_serving/dreamzero/README.md create mode 100644 examples/online_serving/dreamzero/assets/exterior_image_1_left.mp4 create mode 100644 examples/online_serving/dreamzero/assets/exterior_image_2_left.mp4 create mode 100644 examples/online_serving/dreamzero/assets/wrist_image_left.mp4 create mode 100755 examples/online_serving/dreamzero/openpi_client.py create mode 100755 examples/online_serving/dreamzero/run_server.sh create mode 100644 tests/dreamzero/test_utils.py create mode 100644 tests/dreamzero/upstream/openpi_test_client_ar.py create mode 100644 tests/dreamzero/upstream/test_client_ar_path_parity.py create mode 100644 
tests/dreamzero/upstream/test_openpi_e2e_source_parity.py create mode 100644 tests/dreamzero/upstream/test_roboarena_transform_source_parity.py create mode 100644 tests/dreamzero/upstream/test_video_preprocess_source_parity.py create mode 100644 tests/dreamzero/upstream/upstream_socket_server_no_compile.py create mode 100644 tests/e2e/online_serving/test_dreamzero.py create mode 100644 tests/entrypoints/openai_api/test_openpi_connection.py create mode 100644 tests/entrypoints/openai_api/test_openpi_serving.py create mode 100644 tests/entrypoints/test_resolve_dreamzero_config.py create mode 100644 tests/examples/online_serving/test_dreamzero.py create mode 100644 vllm_omni/diffusion/models/dreamzero/transform/__init__.py rename vllm_omni/{entrypoints/openai/realtime/robot => diffusion/models/dreamzero}/transform/base.py (82%) rename vllm_omni/{entrypoints/openai/realtime/robot => diffusion/models/dreamzero}/transform/droid.py (99%) rename vllm_omni/{entrypoints/openai/realtime/robot => diffusion/models/dreamzero}/transform/roboarena.py (89%) create mode 100644 vllm_omni/diffusion/models/dreamzero/utils.py delete mode 100644 vllm_omni/entrypoints/openai/realtime/robot/transform/__init__.py create mode 100644 vllm_omni/model_executor/stage_configs/dreamzero.yaml diff --git a/docs/models/dreamzero/README.md b/docs/models/dreamzero/README.md new file mode 100644 index 00000000000..0b6cb624af0 --- /dev/null +++ b/docs/models/dreamzero/README.md @@ -0,0 +1,4 @@ +# DreamZero + +- `docs/models/dreamzero/quick_start.md`: quick start, standard e2e/example entry points, and optional upstream parity checks +- `examples/online_serving/dreamzero/README.md`: self-contained OpenPI server/client example with bundled real videos diff --git a/docs/models/dreamzero/quick_start.md b/docs/models/dreamzero/quick_start.md new file mode 100644 index 00000000000..9b97d54b65f --- /dev/null +++ b/docs/models/dreamzero/quick_start.md @@ -0,0 +1,143 @@ +# DreamZero Quick Start + +This document 
is the shortest path to launching the DreamZero service and connecting the compatible client. + +The commands below assume you run them from the repository root. + +For the self-contained example, use the bundled client and videos under +`examples/online_serving/dreamzero/`. + +Upstream DreamZero-dependent parity checks are optional and live under +`tests/dreamzero/upstream/`. + +--- + +## 1. Start the vLLM DreamZero server + +Default example: official HF model + `CF_P=2`. + +```bash +ATTENTION_BACKEND=torch \ +DIFFUSION_ATTENTION_BACKEND=TORCH_SDPA \ +CUDA_VISIBLE_DEVICES=0,1 \ +MASTER_PORT=29628 \ +vllm serve \ + GEAR-Dreams/DreamZero-DROID \ + --omni \ + --host 127.0.0.1 \ + --port 8000 \ + --served-model-name dreamzero-droid \ + --cfg-parallel-size 2 \ + --enforce-eager +``` + +If you only have 1 GPU: + +- change `CUDA_VISIBLE_DEVICES=0,1` to `CUDA_VISIBLE_DEVICES=0` +- remove `--cfg-parallel-size 2` + +OpenPI WebSocket endpoint: + +- `ws://127.0.0.1:8000/v1/realtime/robot/openpi` + +--- + +## 2. Connect the client to the vLLM server + +Use the self-contained DreamZero example client: + +- `examples/online_serving/dreamzero/openpi_client.py` + +When connecting to vLLM, the default websocket path already targets OpenPI: + +```bash +python examples/online_serving/dreamzero/openpi_client.py \ + --host 127.0.0.1 \ + --port 8000 +``` + +--- + +## 3. Standard online e2e test + +The standard self-contained online serving e2e test is: + +```bash +PYTHONPATH=. .venv/bin/python -m pytest tests/e2e/online_serving/test_dreamzero.py -q +``` + +This test starts a real DreamZero server, sends bundled real camera videos, and +checks metadata, action output shape, finite values, and reset behavior. + +--- + +## 4. Shared example test + +The example test executes the same client script from `examples/`: + +```bash +PYTHONPATH=. .venv/bin/python -m pytest tests/examples/online_serving/test_dreamzero.py -q +``` + +--- + +## 5. 
Optional upstream parity baseline + +The currently validated strict-parity baseline is: + +- upstream DreamZero in eager mode +- no `torch.compile` +- no DiT cache / skip schedule +- `TP=1` +- `CF_P=1` or `CF_P=2` + +Current status: + +- `TP=1, CF_P=1`: strict parity +- `TP=1, CF_P=2`: strict parity +- `TP=2, CF_P=1/2`: runs, but strict numerical parity is not guaranteed + +--- + +## 6. Recommended first run + +If you want the least surprising setup, start with: + +- `GEAR-Dreams/DreamZero-DROID` +- `--enforce-eager` +- `TP=1` +- `CF_P=1` + +Then move to `CF_P=2` if you want CFG parallel. + +--- + +## 7. Formal upstream end-to-end parity test + +The formal server-vs-server parity test is: + +```bash +PYTHONPATH=. .venv/bin/python -m pytest tests/dreamzero/upstream/test_openpi_e2e_source_parity.py -q +``` + +Run the same parity test on GPUs `0,1` with `CF_P=2`: + +```bash +OPENPI_E2E_GPUS=0,1 \ +OPENPI_E2E_CFG_PARALLEL_SIZE=2 \ +PYTHONPATH=. .venv/bin/python -m pytest tests/dreamzero/upstream/test_openpi_e2e_source_parity.py -q +``` + +This test checks: + +- upstream DreamZero server +- vLLM DreamZero server +- the same DreamZero-compatible client logic +- strict action-output parity under the non-TP, non-compile baseline + +--- + +## 8. Related docs + +- `docs/models/dreamzero/README.md`: DreamZero documentation index +- `examples/online_serving/dreamzero/README.md`: bundled OpenPI example diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md new file mode 100644 index 00000000000..cc5b48da1f6 --- /dev/null +++ b/examples/online_serving/dreamzero/README.md @@ -0,0 +1,64 @@ +# DreamZero OpenPI Example + +This example shows how to serve DreamZero with `vllm serve --omni` and connect a +compatible OpenPI websocket client using bundled real camera videos. 
+ +## Files + +- `run_server.sh`: launch DreamZero OpenPI serving +- `openpi_client.py`: websocket client that sends real observations +- `assets/`: minimal real camera videos used by the example + +## Start the server + +From the repository root: + +```bash +CUDA_VISIBLE_DEVICES=0,1 \ +examples/online_serving/dreamzero/run_server.sh +``` + +If you only want 1 GPU: + +```bash +CUDA_VISIBLE_DEVICES=0 \ +CFG_PARALLEL_SIZE=1 \ +examples/online_serving/dreamzero/run_server.sh +``` + +The websocket endpoint is: + +- `ws://127.0.0.1:8000/v1/realtime/robot/openpi` + +## Run the client + +From the repository root: + +```bash +python examples/online_serving/dreamzero/openpi_client.py \ + --host 127.0.0.1 \ + --port 8000 +``` + +The client sends: + +- one initial single-frame observation +- one four-frame observation +- one websocket reset +- one post-reset single-frame observation + +It validates: + +- DreamZero metadata contract +- action tensor shape `(24, 8)` +- finite action values +- reset response + +## Optional upstream parity checks + +The upstream DreamZero-dependent parity tests are kept under: + +- `tests/dreamzero/upstream/` + +Those tests require a local upstream DreamZero checkout and are not needed for +the standard vLLM example above. 
diff --git a/examples/online_serving/dreamzero/assets/exterior_image_1_left.mp4 b/examples/online_serving/dreamzero/assets/exterior_image_1_left.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..baecf8fa3760c090e7f918975e3fb6f56c5a5444 GIT binary patch literal 13104 zcmaL81yo$kvM@ThI|K)m?kfo;^Jv5D0AT`OekO!^H^%f(5-`0JtFDmYhzmJe(j92-(^l z3I&1KemPk|JOP~wM1<$((iNHgww*=!##ELC@FMl`i4PYa5118f>EaFra{{D0Hz$Vx zn41&IDQGPS2;|uS1$Gs6897!Su!OD@plJoQ1O!qpuJ0V7)}CN44i0WsE)FhkfNAUL z=_bom zW$6j9EZv-dp%oNj_1?u9aL>uY$qBZGczBw*dN|m*zG(c*ft#zDi?y`})Duwmv~>pz zJVg1xjxH_^5L;ki_8%T6*u&Az5(wo#3=Xif`#)1y+BrcyU&63+_Jq1SLI5RzZ{g_W z4tZy0>Eh%HdGT!ttdOTW#LgLT0VujdUUaP8Ax=;aQEsq>nd>`%wzC2j!omz<1#x{z z!otkL4)T(gm7P2EpS0dW?QCp4EdZH|E7aM{#>Ewo{)_pNlLPb};9Hc7m*d~0nUkHf zC?_x2!xHKYwe<27<>7b<(;f1%qV7-+Tfp4i((M1HE6VvY7PWM@20K{*%leWQFc9Su zVB-M0y(A;b!Nv!Wt}l-N!4Mx&K0$!t;R$sW+h`UHi6b{(TJ`at~+ZeU{wk)f4Hw44k0-# z?+l>ZvG_dMv~0-}q`TxUOOMT^I;7g~asOHYQ}s+@6v+$EbvS*|_`C^pxU1N#R9Xnz z+53@8inAU9%lQswLvQ`Kkb0D^nw92`)l3I`uZfWr!)SK{2Sg zKjKHdapkP)?hDa>z&FfM+J9$@$djpT_l7l8s~I7MEP?5{Ja;2vDB50Q{J8vA4hUi% zq^0gQ#gllR(#p;;nVj>ot3hSJ40 z%dx_t3h0SG+|lfbYjPi?x9DX*2`XyNwk_Z_AuX03PK@s!li(S$y|*|(p4*82(PBju zqlwHe(r49SwaPact_2qcVrS^3`JUZ(^H%jS`&eeHW_o=A&Qzh+3xC-25s!xmLxr;E zo+5U*s;=J`A?5p3;U_t5Yx~|{T;6P-i#9aWDchdvIlO~t{1TZCdf##VV?GC25fG4J@0Vq>!)(P1q^Lchf!oF{Bzt{~%%0B2w@ z33)2vfO*S(Sa*=?f5dWKnn{A5MljsfoYsEBz#7lR7FwCS9T7t)PrI zoI|a1f7Go>S6hXANz@1fEzbi_YhNi61zo}83lQcS{GbIp<^9re*n2;hU%*9l_nJ-A z0$i{D3${X)+3r-Ve#U|8P2NkLw^uVswpZmX!67L%rR^B_3 zFWwuh82R%Zmh@t~`9qJd%H{+9Mn*i5m$t}qfk}bfYG++Umw>~}LiA@!3xmuh)vYYz zA*JC2uJrsFbz*hiMmatX{2AD}=|E+D2#GO`*AGNy?qO5gHHe z!drf4ws8_0AVd4Ha-?z>8ltCgFyJ5JuVFg=%@&l{>~!paokJFcEQl?F|KZA#-Xyhb zji5~?c~j%TEMD-p`fc&m_j$MRhvTmEpbLj-=kr7{$K zrPrptRv=AGqhDm#+o(}4ev4PJY;yp&!~S!))mW|1RPT zy6mkLi7KMR*HBYZfv*N4Z8G5rsNXq9c+;a=Yz4LWRLE6iW(Z4cpFmW<9A`~!{*mVy z3dtwQfcczZe`-MeNzbjpnogr|IJ2C8>0E7T)j-UDcF^icp+1|!qyh<*&(HMdekj%c z-KJ1#eT(`o+M=8;yh1I?t$PO+rM)1(#9BjimSNw|AoRuXpip?(i(Q6ZBYEY39*FQp 
z0;!eb?w$Ud=>1cvnnI&glZeV(2r&}TsM$91k9xtXmAWsB=3fIIC2uK*`RF3`@djZ2 z;vI&xMnAAsptx7tN@>>C`6_nyNXa1nO7cx?WO{H(QHpzaQC>m-d3FxCGa*(Il6lm4 zDrwy=D#geo=zR)WjJ>qY!(3*2h7ZJ0>wxG?-m2RWtpB!{`bZ!u^77mg>*&VS?0>j@)MQ!sbhwp9PIP@wwvnY z+0YVku|!XWtfz?nXf1mY)&G3Gj(;ebb|I>boFlVh$oy>~98r~NfPj(vUCvya> z(Rc`TBOx(|-_(ps=G3dm2U<_EVH?T9(w8M%=!q|E(2K84>&@C-{gFj@{mq|(kmw0@ zzw%tv5APlo)O*BGYHqUTYBvqO#@1hPII+i*ns26X%IOtCE0G=d>hI*Ere$ru_$_mx zVcCubheMi2azWl5F-8`C^6O6@me(h;W7F~F>g`yA$+4rxnn2mCtWIZD4lPs|XVkk4 zmqxk;P7KN>cjd$Z&^9uKqzh2@$DZ}$Bc@x+nU89F=NpLZZ7 zznOWP`X`UrP`^X>7BOq~X6ik#=&K`cs&!3v_Qhd^z{usqHN=z`? zgt{ueC-g48F=QBEmebuvh5zE-vMY!3^HYIQCTY|5@}PZh>jsRGo$X{&^_cZ^_((Ry zAlQCBc|hnWs+sL{F-UkRHg&bO<77a<{K$_w7vggcp3$k!(l>vY^mct+8KaPW@;U$b z#yZFh47IfbB_V-{@@-v5_cP+K`ST3fBDneVH9NF)3_=jvjlQ! z;E8{9T*@1AU6(7y$qgrDkMwqhLzYCsdsVrDMaXE+5>q(m91~62x$V-<{_i!V&o&}i zO>X)3FM}(x>vW$zS>HdneY0fWpLI6lv{O}>4`Z#9EGJC%{G6BVJUZ3cb56~!EqUs2 zJsp}}X6|s1-08_FDNo2im3SJJx5>wZ-xww6;ZiFlRG#*uf^q>Hfvb%tNtZIAxhdg=%=R2rXZv?B*S;$U5MVOvgJ)?S}|KxEo6@3vA zCO;;!k3>e1btBAB$@DQFF9Xrm^gMdNi>Y|L_q{s@Uk|~(5W#dO;r_RN|3jWTJ@Kvd z1oK(@-bMPszH!4_T~N|8V*O8rqt6#S@-o*;N*{P9_uSoRz-KN4uvm!UZ{sN2)cUQt z2R5y$Ptd0ivW|WimmDnW{d`grdvlc#gU?wo)(l_9Z9q7IN!3|A2>K|ONjCh31Fk|c z)K5_ObDSgb3_8oD06Na|3^?qmEWDxWG#^prlzN&2H=v#w+pgir9KAva8fp3NNI>vo z1)N8Tn`Zh%_+ZKVx3A$11e3z}?Hf!l`M5PJw&J@LsnhC{gHP7Ft zJ>cUTnfh{#8Dp~&qUQUnTO&CGod%sSRGDfExT4Kkh)(d)$)(S$pHhiGqZLsiD+7Yx zc?Y^DMq*G5`e~&qf(#u9;Wk$-yCHHpxhu-`3+y7gC04YNIMfuBIVoLks*L@pSCp1Y zoW36Fcsp<7-vr{+#z37LXM|&1EH56#;MB%4kIw}tLeo^oq6tt7JJS2RaqnFnN|PV6 zWc;4r?AA_cc_A+-i>XMZS}Cpxv2`Iuyu~D$+_1=isEn?Cr$8fewQ2e->DoitM6Fgm zCTKV^@+MeBax0}BhUeqEGjcx03}pr7Hnt#DHh7r2_CZRvMcvKR@HO?Cg(Mc#uS4Ck zb3cj|#N;=(vug~Us{T5eJMTa$^ahlsP#(s0`8Hk-L)3W5t+{#M>-=Brg zliF;_+;(PU%j*S+a1K>W&`a67pRuic8KpYI<1zL^%C&y|Kt8#pNK7PVlnruKglBR< zKT6WAFz;~8)D44!{kuFEX^#D;$z-8K53E6Sef_Sv(d<6exv5`JE|F8r?bW5{VTw_z zH6u&Z>0hg_)V2z!?HEuOnCKR(ig_$UwH4{Rhm87b7|zyR<>(;O{1Zh4WplYsu*_2T z4t|03oIIr#N7Ni*eA{p0EaCT#PF9)fcfE!-pb~WycV#tv1HGzx(fEO@?@dWQsef#j 
zGloMMyW}F<1eBG@bzC6k%w_^9%SJjE1awbQ@g3a`<|!iUuY}L`5G^qXWoNw4kMeg? zVQ22Ky-8&&65{P|tBb^oO|Q*qL0&3zFsSZ#qso~H5~qLXx?@-x6b*}oCxoPzh1iY! zNh%PCuugu4OVY>S-uqpls6D%GU49$2Ubp@2@bThHy*oO{hEN|n7@M|W1Mbt z+(os@k53eowO5!R>Hg;^~ohB2RAK2yxqn#U7K9HQIDGp}?!?-qC%Kn5u&fD$u9>Ew&#_^3w z)p zh+AhtT9pZwCg;wF4Q#`w+wrjaov2`5zQCFh#@L&L-E%d;%7H(28tW}vuQ%KrNo2`P zy(4k`SxNR_S1~q5#Knt{8dfip(S-qZzVCds_u&I3k%h5~dh#aWx)X7NJ3jkEvc2%s zI8U976H1{<12m0xw&@DKPMqNCpy9R0?ln#xyi-A~IHtBpg31TeteczF@{*m-LIO$G zTlIcCk)nW!ggGd9X5_g!8zYLdAIaTz%+fKq^G|sqi`0s5R!Iiz2UH4EWBgbbuoqJR zUx%AaJl1RSLJW)pyZPtX_UOmdXfO&AL(R&9hJY_6PEeNPs&I}{M(k@go@3tm%510& z(?UOZO*&MD_$W=9V(!92oFFeBJMO&z-i9gE8dUb>b-23V^j@EG&! zK8s`usQ%zN6bv!8>_*cShEcfG?t(O}n~Gs4+Cur{Zv`oJk+c zJ(3I}wV;uriiDhq^O3sKEcu1=Fd7aViYA@IWR9Mn8D{%ip_8sL$Pd$r!3tOU%}6^A z#O)rSDOS~WEk~qncMzle=`;xr6PNt>`-|I0vv@mB+6`lJv7j^Zu8!HS#$alv*-ePu zXy?Ze0iB&nl)*5n%9J0>wRE>~db|{;J%kwqis#7M1QzHDpQtK&TM4Ga8>kGd^A4;$ zq>w_GFGwl__$jjG3Vxj?MWRSL#c|fjpc_$}`QH=Tf{Z_&zs)4d=;LxUJox)#dG0JR z7(ZaX{NxkOlqEARCWFD~cA}VG=RAtZM(^&Y(&FPR@fM$5&JN$ZJ2l)beb2HpV>nR9 z;)Yz>RjaGCjGRHe8Z`LZ68W3=r_ZH=xTu#M_!}e_pQhXz)2TEPHK~|zYSmg|Do0C2 zQj}#`oCd`abf#tb)>0N&CpH^8e#bj`;H$60Glo#B_cva@m(Y9Q4-oyWqjGc`gQQ2H z{=@dqO~M6=PapUEmp#b$-q^o87Vo^K;F){WW@+o#$V>U9#M)ri7l_1%2ZziC%rTKw z^x;yJj@8rUz*SUpHx$^+awgSSyNoRar4}}7K^(j3nlg8httLX+W7W{JkZMEzIprkKm>-YuPbzSL>ggyi>cIZ zA$niJ@{Gpa#6y&D-vcRTI3_;2iwKEmgO1~Lllm#PR$8ZH%*J4Z#Q%u_F>;Cq@7aK0 z)amatz1c_x@~AjzRt_&X;Y1)|s`(f*JK3!nN&HQvczD>?ir1NuDRT#khUw#3T}#<2 zUyKKIGNUI~L6E=RHr=&2^gH9J*2Lu0K&1FiAJSml8u+*MnxVy{x&fxbgC@@JW3eug zHbz>^PU(3KrVNxSpYyYo@{ZRLI~m zJC|Xb=~rhkoVLpk7e!9swf|*Jk1sO6X2HW}EU%I98X>O`XEqaCt5Vc8-}o>vsT*tANb_0HGA;zHV zp(Xa&*eO=od|O%0HuvT*EcnN|^< zVrrxk>PBIFLJv?_`ipWZR3|}(C=+myH7l$a_4Lzc+!oa!5p%I7xAWpFR)X-b-M!z+ zSeyVoSSU~M?GE#E)4WL+p+&D+y8JYfKMHp8W-vImqGi0MAVl1kO8%8mDU%ALJN{dv zJX9s!P_KF2kCxgF6+<8CZUtVw0xwz`HK@M9*955scmCZ^5&WfJwz*0firiPnWx-Ic zODyW{sG|4l%ib-Her9#-laefzi_5~O#~?2=Vi)ksRis1)6;b3fD0+q1yppP1zeJDY 
zo_2z}pse$L9r9ac+;@T#S98lWFp@Fr^bh_06a0I%_c%Y0a^zf-oK?aIk3?wPGTxAg z*cO%X)$?%rjqO3Vka0wZ@@bF>FE=OCZXEwShhDC${btkCI2WoiY^+W<-Fw}C8;KRzIWoVO>pLY zh_P|nyZw_A-6{+rK=@k=_dqB{C`a z0DAfcCtLQ=SirDNC>xEjsYIQ8HhN1=u0j@{l_7;)0i>a{R}P;D z+zOmvN}0xI?L@cu<-9;3m|}?xD6($SC(H?6C`GD-Q`2Bi?S;-@JT+9UrX zi4xQ!yKS^f^Sn>qoT*j_zEf-BBk)s;e9Y%XA}ALVC6?>IVFiA&b)8JC*?*!#%hi=B;B8SO+tcP~03C9cd2q58--pRc>_Ki1jonYY~4^ z%SYVkyl7J=`XV=T)lf8kLDz6ZqVks~2M@O0-T7GBHsc#!r=jlYWo0xo^G}Q`F1%$PbdIh?~M|y5j zHJ4f?%_Nn!FS+_!U3&W2*#_$8@n4sMfy#uvkjRJtE}f?I%eX4~K?s_o-(vbghnK_+ zhxJ)bH1?s@Kr?@X*u8JrGMr)g+r-0m>1R{yZ0WdY#{?#6vn_x4dzm&3nflX*t0-uL z3bU7@87bAGYxl2(hQU_<@>suM7F1tvpS*x~M6Kq{-X}RikkY|+oz!Dw5^9iz=f(c& zZm(;^S>V;c%xG76;rC_3H5IET#A%26LepWg=Ao=Y#jQWu4lo=^hguF8{!ky&w)X3_ z@XSr{ydZTelR(i4$3gzr^*3oAZ$7V+&sPhI{`rDxBf|M(FSuN-XbST*_!yj_&Vhc7 zo10pH$+EJ4mbs4VjK)POuuUC>M<%%Z_y*~lVRdoD^#YRJh8?~*`SkZ}O#`HMJH6PG zb8%2hXu^1E=2d^tb0?U<O zycP%X(@0sfNEp10;4f;JpT^Z#N7#!c5&a4yTYtY!NE^M@CHx$lLDv)|%dE6BypNmj zB;=DmTKC$GK)_Dt?-`U%aGv$T7={-%XEAdr{;b>7*;l|kK5ZBr;9O?|d%dCkUW62P zGx$dOGq}w{{)PdoN5SvL;Ngzq)Ajb%s!;k5^C$g(jXwTcK8ezet4`BnX zJpl)M1e@51!DXh99I>=%n;D9kBu~CtBAijDlH}y_4`x3DLbLU?SjMZ6WQ~&&#y;a) z%W!O-^zbO z;Rp#~NmcjA!xMbuzO$QMxJO^U$qY`jow!l`gw--%_Zshy)Rd+@Jj?v^Ax3SLAAthF zUZnA`G`yyzL8r`HclH4CLVRI*cFUSmD4k9qCUFjfm?4C2Qwm7vF!(5e~j4;C@%Ib?Hj33Y^=xlF=#-%Fxh&)l0y18+7M+_dr@DI=vpYH z;0cX1{J~@&k@z-2KzWPTji~$*;Q;;|(@yeQY5zC^tLnbax;f9)tcMI|Xwp4XcbdQ{ z+LF9jvfEtp5!cN#;-UveJ5LGM;8Baft?PA%enx9L7Z`%&-H~{ryHU9uAy2!6Ml#z zKMmZm@`siY!tO8yA$4mqhY9FT!;at24aC%4k&}W_Z=}a$UgyfN!69Jr$FmTi3CHkxC<3s*4L!2$tcB(0j)TJ%B?fn@O{#3B$B;3 z$KSZ`2C~&SQzey%($&PCt3wE1Ti6)vJmKsmnW0z<_|YVHl&Ro)HxB(dXOQq%5=tTn z1cxn=G0LMX{fxwk`<6YFtT1ZnVve~C!Y5JDz~JYlL|<4v+XrV-V_7K`CUHIxx{C5LbRs@{TFfCb8uDnG+RM>VRKVp>v>$Ibgw)0&*%m^hhe zUdQiuj$I@hN>BT@fr(;?rG#{rFYBe@S#k3yT|Drt)8HNhN}bpDnsnS%-VF1kfBm+T z;gj;%g&8_OJbz~Y{)$tA8n$Usmx!OJX~Z$BiQSc-V@o8~q*!*l@FLxd*4fB&7y17;gi_B!4RQvTtJlOQxYfs6b;Lny)vv-ox?;>lz1h)C- zKBrk!g?@#`BKdhdYlh+mjl$<#F1r~kvaRt2q;fQMi28M2WcAJXLKX&0ZO6Z_OMK>w 
zkQef0QM2+Y$?m_rnHjfamU#Y*?kKrjvb$H&l$9Y)#{5;r0<=cQL@++pyvuP=F9B=s zw(ndXPKmIax9UxoICW%wgU(4qB4H~s*-zFM!wNO8u;)9RS}8Iza~FQj$zB6v=D0H zaCx{(TFy9dxX%8D5=X{*iyI^1hhc3;Z!EqZ2OV zfPf)pOs7Iq*0XZilV-A@o3-jFNh6JzRGk~K3WVCH!(0qFO>i<;e_%yY<9ngII8_BfYMMs9v-h3<0W4KaplQ4+I@UE-y(<{b*a z(sF1tF2hw%JYn%m=C$gpT=xjL3kz6SGS$`7&r6#=$Mz->4PlzRjU$IAZTWfoUXI~o zBr(h%R9p>q-AU~g8Vu#>S9CLcg9t+-1|~w$gM#6Ea_FjynDCpkm(~kGUNY z-p@jzG@3P4x3l;9!?5{xl)ujCEdIa^3ILJ%!sMXI1eW2r;(p*QOMwx+{~+i)_-O-8 zp1)5D|NG#Nw)EQKsro7dHC4qc&J>KH6`Hhi9ISL_v{m8VMkLW9O1NAc8~U&gZhw*h z@>gxk0r0{LVbGJ0#Z&?|DD-td2M}9E27{BVG?n%*JA?SuG&Iyc8xDr$-Bd{y0w3-lC<2FbBz)%cqc`K$<-C`mtZESA<`&2`e3I*m2F}Q5}SE+D>@l zB)|;Z&pS=4nSMtoI+V)vRH|+YX2^fze8fkwmGab{<9Zj)eRsnU8OqfMZ|Zuck|%w( zwQD9_)$#i!lDj4D!-9xi7lH-7P7+hvY{$P%x8lu^O4eV#Ti)75VV$N26X`UO^4M72 zi>3!D!i9yd4#gcUt*REAf4>imydt;9#MsclN)QmGdY|Jd(he&8_(yvRCI^+dcpsLu z6Zy3cG~rD&?^~?1n3C`xzVli4yfjuW0^HbaT7874${w{v-IzpiGAhgv8Iu(=7>vO2 z5~dM~2%Dd`VKeSJLbE9&BehxfTAesmcW<(fkm5T&5zKW};p30A+AS@$Bt=Tjm-OB8 zITj*V|5;0>LK!Wj3c@=lj;YspMvTQnzn)V@PM5Fv=Kt*tA$DoPWCYKzHu>34BrtK5 zzdpv2YLmI#i0EYLA56ZJx!qZ-l}ma15zMMuPlC=VZdc75zVkJ}Nk(!Srzp&D;MR3( z(V0nDQr}@X)R1pPe|T5*;ZgXs&g9D+=<^n_X^*io#$%@TqJI3!60yJcBTxdi5}9m* zoOsW~1TfO?7e!EbCp7rhd)ZFpbSOy``~%=nqUM~J&A`?$QX-W&^`r5QD#YhElrRMh zzMHMp5gg=q^hEG6VtIA(XKPc|(6k|rnZ$6XJgOfhXRJSDIJ7);puW@Ao$Hivf;$%Jjd~NB z28SFX0dp$r675Mx+ zdcN4)i06zmF!7FcS$y36-9_p~D50E##pghG-}d-z$H*rVDOfS}e5XN?PUGrntMXpp z`Ls7rYz8fTM~*#0h_$in%|uQTL(rFDsaOeyFY-Bj!hdg*a?KbHe(J%%N zEhtp;Dlf6xH(*@A<;P4%aILf}D5p{)v*U3Yk^woRz~s2d1gd>Yfd>YB5}9rw`_HU}Y{bN&rhriOfWz zoM#XnSULnnAC}Pw>t32U&?N0^0sUM_x0~#*CP0Z)BD3~IiFTFlfu52kZ-empiKMh# zLs6VX&lQksN@UIwORELKPWdXvT9fU}ARIxcoJkhR%M&B<$H_wOA_qO-jgTD3sh9Hx zT0%zxWaLpNa?iBd;n3p_eg@2E7;cG2-6iTWs>^?;QsyglUJL@fBE#fdO6P!tfNx8n zB#F!$Vl{f??>$}cs^r8~K_FIwea}2Sv+7ELd(mL?=0i`jC-P!Xj?Aj{Ps`jBiQ3(o z%uHG*5F6>=-VKKj9Vz$v5BA18S_vV6*z$`q+Jc@RDbwIMa>PF8uuN{?8MMR7C0exo zaW`_F8@EYkSQ1X%*!;lzwDJfkuxrhgcJ&2g2fl$$llB 
zqtcOgU|Nt*Jpby84BpggK*-Ja-2)Sh~gj^;0E$v-2XWM1ph+_0*U{(?*E&H1)k`&fFsckfXLX> z;h#C-{!8uO-hg@kcl%%S{C~|0^I}!^k0A}z%F_d&NTH4%FYh5=8sOo@FY>=*h8=OT zvV#D8u#?sQ&Rqmxn}c+E0R#fE+FCif|3d-Y%fbBr&@Wi`<)FSi#M#Ob`l18t4V=}s z1_*p_r+=LOci1#m|B{hf0SCfgjDhwdNB44f1pkYM<@4~ga0KW|4^Iy(0Kc>so$QzY zCj2w{cliIN|3N_Fbs%|we1Vq+L!b=k^Kr0obFy)AaDg4|EMERZ@Gs#%J<;DL!yk{ohM^zzqVu z0Eo!L^PlB@k-tPs1eDebjSpn;vJEfoh5rK8mj90~{=ay@mi!|_|2O`}7ATef@L$wk zZ2vQU8NQ7F(F5@Rul&FK{15*?+5N}g|KJ1u{>PXU$iecT8UGbOAb-j4pIkfv+53OX zUJ$@1`3B$^0A7MR0o2NXs<;90%PWK~0P2Bt1!OS5{Gb=zKtK-AFpdCy2>=EFLIBVL z00ux8fR}*M0N?`vSW`WAb1070D*%>V!Z literal 0 HcmV?d00001 diff --git a/examples/online_serving/dreamzero/assets/exterior_image_2_left.mp4 b/examples/online_serving/dreamzero/assets/exterior_image_2_left.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..10b9cb03cabf4f3a3b0b59128b7829492c8f9795 GIT binary patch literal 11847 zcmaKS1yogCx99<+L%NZcjzc#bNkO^-Oa%Ej(lJc2!v+g z41<9{tOEAtP**^w90mE|p?F^QPwVE4LPHY!G|deC{(&ct03QtpjTzh-Mgs;&XI?P3 z5DhOF1{Ssu1_TP6fC87Ynyfqr9}PrD8qhR{nE?W6xTBXH%)*t1hnt(1gNK`k7hqbs zx;lz*ad~)naJpNY!{ByM2Tr)NCD-FAoK~*(c7P7t(bXF6;37g}1~rA6iGyjJVHUvC z9A;_=7! zZsrQG%$)3jtvL*8{ub^4xCgU?!88_77gu9P7h7w`M~#0RI5`@_Ei7DMu7J9$l`~-A zA}&B<2Z!52t$>a3zj`m{~lpxZ4Y&Q48z*N73ORQ1(X23shyiM z)XUfmZtnymIut*(#q8okii{c4#t*nM?m@y^D!q|m>1w%oQI$LU!<|UwSzdA zpT@-u<^VHua~0>~ehkwY`dCqCn2Qx)?rdiKU+Id2ANS&B&K5NGra)OA(*ic)JVKn@ zG)|Anh;wra0Houi@xIWtVmOT&ozqw3bDr6DkuTZvYXtSM+v>8d~rKdvTY1Haq8FiGnm7#M^qT3 zhC}tv;erG{8$tN z+pLLn-)tnsN=BI4UCLDegdg$e!c3DX%fdS$5Jg^skPXRKbXl{CJLWL3{_hDT{pE%a z4hU()XWpuTd|8Xvs8a_o_Svu*l|Y9SCkUf#8`$n2f9%AncBUpT zaZ=e7jaT{iiHT$EHad)|efQIk~!G6ra94tRKmD<9tI&*aQYkG^h(aEs`(pQLyV# z(0}*W0LO`_K*=A~H&!d>s>bcPOE? 
zoF%UMUdVN3msGBR;j^wF^*pxFvrr|`Ji<@2*X{x-G*0zs-^9yDdwrL>F0IZSlw#i6 zkl3QVs$cD}h#ukX-4GKtwz0jUuByJtrgpPDFMZZ@X2!wMUCfD$k!Oq;?x zM|2<|{T@rm!+|#mx`)JVw3c?p79;t|I-Aj*D@LL(@=?4Z#fxaQN)V|n zeHr0--IXG@psVrv=p6E*eM0oJN>zeqx4g`kdIZHtV?_-$4AR6LR(8DN?Z6@WCSwm8af=wS_LKIZx~xPJaNQ>1*hM?Sc&yMDbG|OkFubni?$TcQj{cCU<}Sp+dHp{<=n@4| zO(3;At)OaF{qb_;j^lwhRL6A+A9Weo8VocWI!XrC#jnTP6jF19m?*pVZ3S4=ie3_E zu6jJ_Zk1Ju$ItcMbIUf|QtJ|2`GR3%01qRK=<*mC5U;vHrDmT|OJSrI$&T%z*^Ver zPsJ0VsGIe>ffci_iUj-3k2G>zAaTh~5AF+g)2&Qe{rSx8E#1o`A z>)$~vGU+R2J`K0xM-sRlXmn#X=It@wlrtxk4vJ>?@PsG$_jo}eO;x2z5UuX z?wFvJHWqZKIT6?X)9B#B2=V zwsQ5=j%K{Ja90$P=(ztZmL$O!CT^8_Fn>Vm z@|skufUj&Lg_2^JZZmiP5BDs;CQZVXZ=-`8M+46Bpq?`&IB#1jJJT?$CC7{Im#jui zmqRAXc-FM_n8DDw#7ns|3@Oy_TY+Nhd7R?Acq-L6Ui};aR6Syu=n9=4UtRDHh`w!l z@2@3H57|f%H*r*jJ$Gmv*V4=Psp_BUpbKSZotl*QUHB-xrAN{mU)>{_M{`qt=p3n| zpp0|gwx5mF>l{%WA0hOjaz8MV`{>HxhWw*X75HQ;IZADu_(vk4J?j>mIu*FaO1D30 z9O}Sgwn*4Nq@y;_`I&mW$*}i5iPPQfEW)d&^|gClzFre;MV%)p-{qRiRCbq#=sMDb zufEgAm&L^Hl#1Y3`XyRWkP>Nb?f9!=gQg5v61voAC@RFE6MD!*l}R_W2#EM(cbF^W z(QgaY!emifhm=|_Vl8xuG|lDwKIC6c~VnUC?(wREz+&#~j}dlMhb9FQ_zQhjHpp zM|>zmV5|y(nI#;md9$*N+Zyn!)R0o24}VuyN+Ul;P^OfywF}jB{VdV$C*CSi;@ce) z8+Xd%p7x$&m~vj}_fQx^L90onFS9!lJ##+H);bjdpF?SS=i6%r6Y+ONAwM>G>89ls94AG8?cCL8r@ zPZ;WDo@lWTyqoRr|E8x$F>5_Bmg3py{*fz&i>1*8hvb)ff9&>#%#*u7bRw1bZ-YrU zPYWb7ieLK>PoiVCy)ujp{dCIm%<~rQ&rFeJV@Orih1a0vLr8e27&&Lu{Q(np)fHnV zPF{v7%E7|3wO=@s9?qUz7d3%}GSM+vMc7tlG3NvB(a*Fs(}yKfGUEL`U-GBq>Hais z%0qxBRCbM=P~VR+s-@$kgySmH3aGVQdDG9zyY%=FLT5tGMb zYWBwZxHfaJut*WP&|LcYQ1lc2LH%?YZ>$ppHPK4Px#*K?fVnAACl+s~#wjZyfie<1 zVN4;pNWrnhp|83d8z_C1=?U{E-(GI_MncTfCF9$Tmv+6!YKNUIqWXS3L&+|gZ%JB^ z_KJ$;+iKJbq1=PGF|*8CDx*}-nRTpWy*wM{}|kf$BfKcRXUIS*$^&5FsGP2Rdx~GAj$WW`s6iY(h}SX`zjx+ zV-MFMXg|;vR&*NuB&xZ^ewf*)iu9W=#!Zg29|0*}+VBKdq~)l+yUR5zi-@y7D6{#c zEgeCC%hJyUKVn$gPqIoc_e|Yo865&zP^=h88!U1bi1M-A$=CPZTlr5xGE06}`)L{0uE_<97~D>T>r)XE(AT-! 
zVZ|h@6aC;6=f#cGTv84WWQLb|=)C(7KS@!nyzEcjo_4Q<&k?0M(uaP*Ahp`nEFUyC ztupzvmtw_6f(&HPM{iFQ*FKvo$VejSDR)r}Rt^+iC$|P6fN0tWd zs`I;^eW9Akp%=>O$LBpE7*Dz$;?ngh_>_UkdM`TMad7Z`*dfiSw^*>XPwfp};;mBU z2w73R`O!&HIM?r**-@C;gr2e#jxW0b&EO842*2y=jYc!_(eoif*S`4cQXhkUr}S_O zk|C;bv7r`awjXQ5sRc>;#*gxMTW^&o)j=C9pbBR8O6hH3qtpPcwNe2FE7w=-Rp?s6ComR zb%gu2@UHL`PeIBTnZci`CJP&P2WaZ9%=lSn=(((guxM?Y2@HV}@Ve5s)C$B%brVmN zy_>{svH{``B-QhbBrCGKS2{mjW(XR3J_e6}KLtsTr)+_BcKj~Kp0P>_Sm6)(%rOjL z*(6pfZ+mxp4wk{#4#nG>YWJ2^^UbrMC2pG3QbyudilQ_!%>+Z$D%#lSFUs zLzyIg*-@LT)kp13oM|dnJ?WAO_dB1oeEHIO;jQFFR&{^9RLGl-e1{qo0d4mpQPs*o zMZe!*OCBDYScO}J+IS-8j~>CHCls$Ht8>+Yxb1WD?oN1`kBw}gmI~q+*12`*f39RV zy0P30>MBoEr;uwXLSFxBXhqqoR|?zwOqEH~8Y^0K zhl$F+6xGwALbR{UkQqpXDZ`k#n}c5y)KO2wN&8;&X34#oI2G4s6hrN<=o6J;D?uhz z@*3yd2YQ#Q^wn(my|nIVh4rt20e6RNIoMY2sU83$ z^^=@oBw<7ywhdn3ODB2;Z(~bRI!44jDXHv-nwZRHa#qSUkDW=oXS1Y~DM6pL>_ZDV z#Xmi?2E6AzYLBHx7SZjGMG2e_AjVj(5~GaM&P(f&uCKPybL#6u9#khjKWY)I9|WCO z2w(6rt4r5Pgn=PWhnkZZOa`nZDjg>k#;VI${-$6jMm#UY{{3 z4&*ojrU}Lp+!w*U*QOGv*NckTNk31+nPQ3%@#p;1j`L4X>MKWr$V7q$GK!eZQF7~> z`B@-g!)*bn*ydzMfopTf5}D|%?6f{I;XFH!riZF+@@?wm2W|SBd$f$% zKGb<9Lq=h*O*Pj8@mllb6*}cnF!EGwxw$4`7zg*)2wGTMZ=ro8FBkzfrhTPp5u_DD7)%e=59~;tZ*X&L8Z@ zy-zbvcJ}*?M5TT(n72F1nb>ew3x^u>`QYp#`zfMv@(=Nfwl`-P`El^SIknlJ?AGj9 zHQ=Duzf<`k_9x-g=L?4R33Ks+a(84Yu9ht_)a$4(!rx5WhTHZLqkLymS_q>Ftd34EQn8-+_SHtCK?D9j34 z#1Mi#Eh^*T1#2eRh5M)dYXpS6H_3^YGivf0eDuh@2s#o|c|tn0`2O=Nj^0+Rv0|~c zf^SA&J6#aVB+_Y~cuQbPI;EBhj&;AUaE@ZG6_AEPGyI$@NRv_MMM{Szn%PKlOzY*s} zX0nM+m3$~0;4>#rK*CkSXcwVeDRP_gY4fCaI(z zH8a!!QMala##!}-UFtKAQ~|akKSEyCk&Xolyvpfmbvq^fp8oemL2062%b6+HF2ZMB z?O-utcG4+=I6JRPb<5|zv3$e<{*(iYwE| zkbM%;Fo(P5<&a+39pz^5;w$g1MS)kn??A+V;^c`{WRr2QJuVtluEcxP=5_K|LJc=* z8Lmg!XxbcgcU}g$MUUw`%#Xg}f%bh%qk}~p`m1UeGx43wHuDyO}3%8Cai5tGQamcS`p8|4qCK- zvPHF?yQt1dyUzwweMGk!&-&fkVvmLFq=#ZK+nr2N)_yEFu^H6<%4hH62k~b0fTqEK zAPDgHzXc?fjx_$EcoHlW#vUZ zm`Eg(BH1sID+fL@JSY2uu$HRF&uaW+YlM3dntAUbw?O+FVQJ*+?qQgSoO~=V$^PQPxw=p zQljyli%B4uNKB$a^ocNAoCb=Lhn9`M=MMErND7=5&tut89Y=P}X$H^DNcH*0CSGPe 
z11m?v9uqI(##=1^ZN&%&X5NgFy^V4jXrx*);keXoDmXj+AeO6=4q-rsapg3~-=KtO z)w_z_dTxy#eG#__Uh${3{0Es3=d#<>DT6#j_t)`RddqR$ceAbU8CUJXd7YH*F`D`v z!~(}vVbv?N4)YH=+!PlnJvH;HX$f?iiXmMu`a&0u*dlxu%!_{GJ&=%`e2Hg?Qyouk z^dHD#oHxED`E{$D-1a=wmobw1fFRZWc1ybAq&9}1n&>ked7DG87nhQ(mqw-?DDn&> z)tt04=h}A7vX}Au1X)Iz`m(#sir0qHiM0J|p~85J<1H0<&w9xil{(ppDje!=7f`me z@`TL=eW%TG>rKHEwu-A+Om(8==l3|jj;1>oLJJrUW1p#Lp_HY$3gjGhOwCk98Fc0o*`N>@G0q0fw_ztma@?`dO0lr zjjN`10I$@VP0NUdL?7K(Ucu0LZugPog&EPcaW+0IVvIz5+p;`NPQQl9V)B|=$M9Pe z8_aS{zPQR{>&wgO-P#eiqv#6Pd6cY=^LBy_+Fp`dy>0N_W|gu6B%y6=SK$JZ3r99| zcVhhFT{0%Q-%M9wJd1zMv7-eN6_9rCU8q}fQP~sbX1fqP7gwnDeQBlKLbVzQoO9@u z3}$JUbpp6aYo>zoo_%&?Lu}2@p=69#qvU1lY(sPn*zEXjFNF?N$P{Vx3 z_O4E0n^M{tvORsK#qTlMeSa!p?G%4+Db(3lDTHBoZpaa$038h`&sX%W%N69zC0ZH9-1{H$Yg{oDQ zwI9hYf1Czaqo^#S-o}Z9r9Tlf-VvFXE~%HCBJ_v(icynY&@sJjK0Aul{c7doU9SKK~|@_U#v%e^73xcO?NYD~y6qp#Q| zZs(8lSbJ(nY9DE3cc24i-ho{RNqXe&%Quaz_21a%Rv`RAj8C*2TpKM?>So8%E!R@7 zk`4>2X?Hf*ZjV8uf5`1P$YtPqoTx%j_T{5w^rs>gvSmHioGCoN!LJwR){oFC`a}9g z?qqHYrL(wDwdF49V~;hd7CwhiWqH}ea&I)JPkiEReNANeNqFXqr0KHjBm*f=&-J)W zT<+9V1wZQNcfAmLluw^t9P+TIDQz|MAo=pZREH7~e&TIkDHjx(-$&ryvQnYb7I*Q@ zO})aKOyfTr_Zh#6y?R&ECB$`pNAbLAxvt>Gbr@`N-B}}riJ;6ZZ*7u0>F9OGS_FPY zoBee)clg?>-SkT~>d`~e&oqa4B7?QIT5ab7`;Dz9sAJ21K#4v;QnyGeb5VwM{j7bC z2F5i?_J(#M;V%r^J}iLc4(KSCUsh=Tx;|H(ezuQHd4NCC7v;a)P@nA>WcEwy8N$a~ zhX>_x)K}v3W^66G^>B=o%#?E3TXux^?xVs#sK`Hn$JDLc#u%CqB0W`z>V&^ln%mZT z2iJV+b!N8--b_30(U>~Lz^Y74U@jW!KbGn6J6$!R$tX0%P4QuzmSr6)&M0mAsbRTv zD-MeC@zXwCoI9B`4cHNVk)^<67829te{ZA4MtHy2PStuA#0if6s+0TG+R?8}Y}YG8 zHqIjAYoHAqqcxrDWdzrZF!IfYCPn=egET~QCf-9`GW^n4z5+SmGzWr;-U(CuhnowvKtGFWF3*^OoZOMlFfe}HuOn$OV`+=A-c4l^smJd`HjK4{JC7agOBFv zF*Q~vo}lY(*}fXm*G*0kdIqOI(2aUX8aioZOcJr7?&Q}IPQ1jfFB7Iul;aYgyq9-2 zLo|>X_}z=!5FtaZEg>a~Zjc=sIbBhO4f&G}YBW_om`A4QQraFk+?1UU#E`y00cF@C zYO8I`nWC1uw1l*4aY!?ds9H*Gr?E*Duj^3ARf@sJaq9hfZ9rpE(dJX>f$ruSZXZI2 zsQ3mJiUidF$5j&TDV^7C|;4PK2jQ21Bk5>gL#Jc<$q!{Z*lkdi7j+!mNH- zLknRxvz`3;$D7`dXBJwCfru`N9yS76zkWuJ8i66-W=dI&suj)+Rp=5Th2BpvGOv*t 
z%Cv?PbpRhtBMy@6rRq{mKiE7KD)x4#$q7y}>mNu+uhIPcO{N$epS^sMivx^k;6c*Z z$YfNF&BTZq^o5jVGRqeqB(M})$xU7`Fbg*NMqvvPLG_>H=&(D=i6pDooUU@BnWW3N z5WRG|cSuS-Zhj%RqH*Ua3_c-Bb-l4b|ETpw>w0}bfV)s3Q++8`XEiqSv^QBgImXkj z=ey2g9`UnftM9i9L$b<)rx}>MsBT{RLu@~-XrmmFkPl?1!ckJbA+wYB?~}-Cl@3i4 zm3Tk>k?c(IISsnazmaBxDJ2zqwnOCZn`WO8_lN#0;P#1}LZ(bRVhyR zoI&58hho*3AAJg&r72u?q)sm0`Oy3~%Y%OfqE=wfQ+qsi2e z-Sa!yCs@fM0baTVH|crMu;_KFl{k@sN!Ga`mp_?gYy z_fyybOy3I?6v*Jx9CR$9RKq0Kgt7ayDP93$YE0bA%V%UE)p6Z)Qpja^J$uC6uhKgqxryzH z_jlejKgY1tQz<;u>du_TJf@SD4iiDINL9W3W$HAseR{WA#?34s5g?f-8eh!WB-W`t>%_@T#1>(>f?OP!bG<7n9(p0!NAN%Mu&;kKSnmuVH zbPk!(R46ZxN|n~@2X%gteS`;lunnE}wpgAYU!fe*5iuhS~NWUse>T_<7qddQV{}_8~{o;P$ zsO?y)I9)!G=cZ|u&Et1;tx)Rt@S$1V7ic3R31Royj=o)U2Wwq5j!o7~%9|_T>i>eI zMFYQKox{=&DP^)Y;>5O+QWiR8kDS!U8CK#ZncQ0)xk!|CufpjzDp;V`ccc@DuZjW_ zw}U2KD9+z6^K=?!Zap7)&JWhL~*fZ~hByt>0?wK1ks2xagpZOShL`E2`<#6CaFUV-JVB11H$o zyIYw9_&*jB&`lI65CZUO`w#Aa-vEODCIo?`{!{mV(}=*C1i+-BEg*X1YWw$0r2kO+ zmp9oaLi2upZ@DZq9Zz|Ii5MF0Q6_0A1nY>S7My$MvX_`8e|XclWv05>Num=ny+Lt|%c`Z$~WkMQr4^x*-(e1MiTf;kB5VFg5H3E;9B z2qY^LxSmL$#|Ogl(Th(e2%slc2jPQvmzR&Ggun>sze+l`2?9L=h}6aPZ@C}kkI|BX zKu;cNLLiIBHaxCJ{v$Y={a;mP=k2-&CQUA8;|H~)*m;S%)AGiNy5BT}Z zrvzd!`z!e?`x^rg-(Tzs$lm@-_J{yJ$pL_E0Kf)RUITamz;pn-1ArcYXaJgkP7?t# z34jg)Km?$t04N3E@g*b;fH(k=08jw{eD(ju`Tz!MVg?8B<9e(IHvpz?=B`ix>)XTr zVt@qE{^zv=uBtQC(ed$S{MX{J{^xEp7Q4b99Z~@U;Ezrooq~Y=TLAqB3jqD+{yz}e B{igr` literal 0 HcmV?d00001 diff --git a/examples/online_serving/dreamzero/assets/wrist_image_left.mp4 b/examples/online_serving/dreamzero/assets/wrist_image_left.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..0d7677816ed635f06a574dc4aa8073a3ee7a8a36 GIT binary patch literal 27370 zcmX_nb6{pY&~I%UTibT~)VA$DwQYNAZtZT3t!;a2+s4+oeZTMC`^U+dOnx)TBqt}C z$vGe(AjIabUXE5S4t5|QU?Bh9zb{r}cT*NSM|Ktv5D*x1XCM#+#C*)o%-HoCQw;_A z^;Nzhe%f`kD%Fxkze2o9c6IH^%E?a5Ky2#Z3?ydxCY{+>n7N7BSb!`%<~-j8QjFgK zCV3@sNd|UeVGYr5q#4lkyFk>z(aRQS?n=zc%*@8X%FN34&9roNb>wAY^6>Crbhk1C 
zI@lW9Gdeh1F#T5xqou2z?Kj53(bdYq-i4Rg)Y!z>RDgxp8EF2UngLC09ZYQmSa_Lv znTd_AEtek=WW0nWd%EHpscDm2?aOJ8Of8D55B-O#Ztq*H8V=l2%__YH$T7WBt%xI{#55}pV|{w zad(GCdO&zcdrbq|44^V;9=mx?ITs=nR?${d{E*0=McklB2P+E+)?KA;HA~qN9(XEY znpItit4qJqt?a+B$KlEExyi_uMx0mC@kK%EEBxEqW4Wx<9Vd!&CDOkmJN*P>1slb; zj)H?p)I|lmlFYR~eI0>3$MWCloqK<1T1VFLCTW)?@Zdgd8M8`tM1`b8Au>eQOle(< z{GZVO2I^dAGHy@pCf6i;coq99uW zI=U$Fd$d)NAn((q>D-v`zFa|nB{c1&j8wIZAxLPHb41nrQUcpKc4H_67z2z`YA#z0 z?06l<0-O=q^lrx>!`~=|=g_tP zHg-2d*KwRi?9hUe`5KX2l=VHClnUQb);Z+ZIdo^D%feYp>Ce2LHE7j@nao1nz`b_= zMP4Ip*&{0MXv&mO+hwn@toyBEmSd_dae!3C2(>yzlrZ>L%~uS}O!AVmBHH}+sM_{G zdm4d-`p@3ZqK1*1uT$(HS2;NvL~tKJ+)d-NysC7HXXM}hyXa*|zCD)itVF$nKKhs; zJx$?p+TgfR72j2UMkxdmQN`Z)nYu5fsi``g-N)Y$V}}eQ&#h&sjGewuX5tcJXNk*F zZDR3%#Gq^q&9eSc>bD6iT6&A(rAa?^Dx-#YJEMsg8p$_C-dW$1g1e&?Mf^DGSQtS$ z=C17hb*e^-6Tp@!jExl>Dbs4DN1KITbV`c|F7mV^-{feMr{1rmug*R?<&6kwEnB|d zx1H$jiG-t`h+LM(SUyXk7#YegHNL)cE5hAuR;3Y3B&;`~%x&SIO&X@9#caA&e_*7Y zCBR+X&(Bhn$(?mZ!#F7ZqHOh5aL7JEs~ZK|8Mz@H_vf^#m^R&?Vpy6 zi&!3}NMUo1BKV6O*Z#u&ykx~W1!lO(Y>R_UGwW%%R}3PaKc_u%Ftk?1i#0<&jvy2t zFA23yto6Epo8hr5=L*A_3DM3jkLuk`2R4LKbd4U<)lpqSMiO4Pbqs~jF`{I{KoDn) zIaIf!^m5@xa|W@xzw3^INgD?ZnfKM&ObFJKRNk*<7`d#nU$K}sir0)q-5A{^lYhKK< ze~xlX7Q4!ESCb9h4cCL0)~j?6-4@Q&2EiSo^aUN#Y}tXB681k<^5?Crn2Xr zv~;zetC3^Xi$gk z%9kTBJctN8v}8f9U@-m&?F>dmz|gn&3%0DOi#*RcotCR4?e~Qm7lQZaT;Eq;*{|Cz zmX)solIe@2Adoab;i=}-PExS--|HSahXL81J*r5-7%x^54RqNFvCRbb;W2N_UeB%f zeDKuI4Ntg?!L=xc)<}O}K6PE8HGe?b+w(xIaQyL)*d51)Q}$Ki{7U%5=i+@P*^@x= z=8_^xuhkItizk&?&g6gNGj|n*IVoYrQd;n!M)O@1e<#P3(E@skLo1%*Gx!uo*UBwZ zXU|N-wvehm$jkl4&)YbZg@&9oDhU>jkDBZ9-N|}pW43DHrB<7j5G2xJ&Blf01MrT;;-Z~Hd=vL5D( z&`LE$oJ;*gRDM49J|h&dTGd*$gi4$8s8CBOQ{+nY$teVJ$CmVO{H&A`1s6d`^0uMl z_dv(MXtCac)a;aKDS5^innd5dKaqx>4)?JCTU4ngg37_9&Vaw(1@ecQovH>h zDeBdp;KE0n=3nsXYr_&6t=g^MaC_RPbr+Ew=VEU}^W4Ke^p74mOR^lHgydwr3Dkrb zOf4FNquxD6A7ji0C=e#x>9ICt-}y~LFnyN8q0uZBr^jnCthCVEABo;Nc$qP~o9@*U zqwsgFX>)(nw|Ni-*klO(#FUkA49Q7JepHU2mC4R|_>UQ8CHV04o&}L1@%R5=>+B9( 
zb-$xF(2jQ2C7jT$JpGcqO)q1Ct?9ndhukeVu{s+D$5_SCGgQ?0a`(xgS)V6W9hv2^G%1hvpjug0xAEVb3q0DYG~ zA~IrB-E0fiCv3q+0tBzl9;q9gxd|R~I`ioX&)Gd%RH}6;nSVmyh}fzl50kjh%__mT%L+;j^s;bIK!Rk;(y-i31UQ`GLt`~TU<%Y;-#H#1t9!OE$I~X;x%`AlDVtS zio|^s@7Ba{_2p*Z=?;b(F>SBhtg&@?4TRBpPAd{>a`gz6v-T%wJA z^O`hiEnG0b`rDmZf6hQG+Aq3iOm`cTj)%85p2aJ$Bv2cz(wsA5fvcqgD}0`lCE<64 zSyUo5v^n%u(nMm8Lb<%eQE`Z0W9Q4!sDzvYHEq>M9(?ek!=#yE1nwQ}XAm~dQ)(uB2zstI6bzuFpjK24JhSBrel&Vnwj=Wtpl%#8oVz3=&5w+ZxL@)d2}K?=clAZ4MCC?EJb-wM28Lgi!1)1R_t z<-%33CA&Q`(ZuQ~;Bunczk+U|>*uv8uC=?!LWuKl{us|DxU0qgCk)9nV7qsXaRf;> zl!~qSuVs#Dm*G2}QYV~Ah7l6OXdb;0Bk7pXa?qC{3HJiPWadOI@-Ro$I#W`7!dr7x8Zuv{v~% zd(Tz9fr}%iv)2|peHa4^a!B7Y*DJ2Ck%K6WT}!9`Q*F$X9KX0jl>;BXxi`5t1?&UtR!&B!K6U@sm-EpS%iK(t!hgiMDQHNWNa?UgwLpBvGn(^&=J!@ zT8fGrZR4>yTtV37e(}YSwIN?Ji*B94H+rMeGU{ZGEf3VvOY--gZn}lYUu0`4epRR| zSF`bHww?KIDVV?#V3a=ls+bj=klRPz@6tPFY=)xsipIjQnK%}ax5eazdvIW)%G z?F_|LR!+e^Ox^qopb~gDT6__(T7W_%kI5G&xaYA)E5S-OGb(TlH<0~ItjE5qvFcf@ zY3EAbuU?3dy<+byd9p<^;vsi&{9!$&3qgDfPSX z$}0G;(Vq$-!dnY4iz^DxLyH8=sNhZq#t_RQ!3^@VeJ?JP?ZI5?wREyknP1rX!TOqY`_##u`CQ-BPEMw zC{(vCgy{x%h={3Bd`E@A(S$S2*yTjNj-cpS$>ep~4Z~&yb!6t3zP~ zJD+Tx9^*U6Dbkz;@NZ_u&iL<+&!4z`inW`YE`SWm4lUBsWjoO26)VuvgQRwAh%j?) 
zq3Igj2u(tBwFTcA&4&?tnU1cY5Qg5B=}M1x(*py)_&lWq4%a5JnL~Hfn;3g`+4C>|s&5{sMQ!_0h+=}; zoX(9d(B(!v-+`o~%qNl0wn$!07x+XFYechWnLgHzZ{D}I z6Ra-((3~W&reO)ig^E}Rta6_Hdle-LGO-e7$Ut2lCg4ZOqxI)hlrL^UktoGumz|#A zOturErDuNC!=sspcFuvKT#q?LRZ17NzhW<1*4K|P$FJXzpw%-wK6l}^NJ0LpwaN_SAd;e zdu|-K(Dh=Mn9EoI;d^;V$^qSJV1+cKMh?^mtDBaJaIOfk3>Gn+eH8_=)zfT{Gx;ru zpdOlhvrm^Lo%B){rGu5$t&V(Gtg}pfmi-@wdJ5nn*=^7eZs5GUC@`~%%D4Luwd??3 zYrDZ~6)y4#3Nm71+Wc*L>0@Y;7$jC>RhSu;`$omOB~=z_E@hhG+LYTiuYg%$i@CWKDNXSx}yvyni=gThS$Rprg9kCSg z$(nQEK4Iw4r|9JiW$xkJpN-2;=H}IkpBOJs(q9r9+egyOt0q0cfqMo!mmBkN=^rtt3Vx%15 z-m9Q~)5EI*I8AXIELatl-c;~y{*Co_upc~2I5Uig1>wt^(bse?2Ba-;`(KyY3!#4p zDOtFkPvcPe*S=~qt>QyP;VrXTTda1!iI_vu?^2o!tK(}_`uG@@*E7`la;FqqqjwuK zL$t5<*J3f(&(Xo>CZPla>?{U##nlI{+dWp15W_d=>aR;_s9%-vWPF>B)LKMnlgo5z zQkO?<#$3TIFAp_<*BNt>r7$T*9h?aJr|N{|l_^G7fL4{OEf{bYzc zPn2H6@Acvfk02)pUmnzzR*JvJBKwHW;}_~xfX)>`022?BCPYHf1xe`X87}q0^t4bp zE;{PX2I_>cc7trl`r--`;!c8=owQl-5- zDTyE#zQ4FdI6!eMEsXq^Q!Vqw9dBSQffRV3l$N>o0uqXmc8c+y6&9vI2#(Gjx1SR3 zJ&w5A>uOBM&?y?fSZepu)~d9GQ3U>P`yF-{9WWB1ekD8#u*3eexvW9`xy!na&_MD; z^;1WXnc_MCEoMtWv!-*Q9{W8$K18PRhT#vslCeU>u%;mtn=0s>cJ1tQ11el0MORDN z=FfNxj?ACbb_9$H7<#8M%WFsk@VOA73158SjD)J3{vLDhw3-lZ;Fjm!OWrA*v0&zu zZ*G(ZGHlh!$7YJ>;Iy{2#V4;t< zP{tPTQKq5yUy?qDD#POc(fR~+9I;JU=&2v{-&J3g{^|y-bN6Jh-lB2}(RR}SxW(B1 zxzFw0LeX?7_M06~V151Izx55{^eig@Pd-$!uxo5`HtAl)G?`t-eug8l`z$(}*9U_) z0?Wj6yMLQ3pGfg29-Xu;G^0C(3T0ydeQwK80xc@WaEmI8VF7=0f_s0?-zH_iln2h< zs&q(RS%o%)?CmbYua}r4`8_?og)>Y4@AO<97654@H$y}LA>!jY-N1`cLZw zupN7-Xh}*2iea>+5^N&8VjTRIsGK6~jrM_5^I&#`LNZC$>obEC22X{=6 zxX;_WkBNnRi=87wyJ!+E`V-;(MD5Ar{1-qijAWJJ3f?pC>rd}fkW5$od?~2=XZ0!n zaQ(LOvH=ptgVU%tZQuvoY#jP0rm9==oxta#`JT2C1D=k*q1L`6Q%>%~`gvx?`?=!? 
zKJwek-*{PV#@|15`E87d?iuP!Y+0vXZrAI~aF=ddLtRU<0i`0Ip`1AFfQ%dcc80d4 zJXY#r7Jt>~w&mM-MEE|-gB;d9U^6ZOzA({UmCaGi%@X9ggpD?Y9IzN(29TXS(!-~W z(d*LlU`jRY+X15q(&BT+gbH=mW4VOv=nSi%t6pO75F~jtoG?gmRPg@ykEtI4I9S$O}bM=baPTtEobWTi-sQdsIqf%;#O zg@Q+zb?+x0&*nn7w z*ezb3fAphU^3;s<0ADF~5(!=k079Fd9yB=ey62L~eT0v`LrJZlOq=NswHsBOBM&}w zW@~z03`Yj=?XQt*{gA3YFYmH+R5{ZS<{H$wkKx%Sh})`7Ke_G9S2(K1dS5mZF@eM+ z;`VO8q|F3yX~w)6jeT>gWRW)F$KVtiJX|VjklvpIze&%eZQB{E>c2p}gE6rk*EF>f z(GKd?!ZFbv?^gL@&<`~rsz0jpm{khKwEH5?%lEuWFS!oB=XgVJvT=_fRMo0sV4fpU6g#p}mF_T;-5nvF-lW*KCxcRRNs=V(VNW z2zR|;Y2%pf5kU))&oVC3Vj_zVHqNYXf*R9vkT}iMO~Xw4OcYeC?rM&%hWM&;Wzg zU_M%6CVvkJ?#Vm!PJ@ioZ~3pBoX&9YGfamFvW6mDQ;(7_zGJK6Q&O6B^Z0DeWB=~O z#3{Q?yPCFcZ6-&41|#opfyizI7TL=*w1&$c$VpyptYHsJX~@|zXs`{vCYFH^QqqkA zr~*2|@YyL9NXT0F?Jkx*bO=(I3D@8_E0tr$eER5&fJ#t&o)ZlBG!!y-JO5(o^4Vfhplgl})p z+*Eg2jPe}SU&Y^%V_n%~4?1L^Kf)$?XpImWN+11yurS{N#-FQu54TnK*fKo%FMB@L z7fQjsy^*M>l{-zfQ7s6KGttTXoPzc%W&Pl9a5fJQpP`zq$CF!_e=Y+BND@h<^73_0 zl5$3N>Z8LzS^u%~XdEePx$Yn2L3C zpgk{XiHMcr+#vKGH*wWeBu-Q3$|Gk)o9D!J2{Qi@phyiOy(D}{NSF2a`>*Y2)KCmW z@u@~vG5soci|uFaIj3;37h|j?I@u_W`~}#?&=7bei%F6?lKIL*^Eua5dX<$=jF(WI zGuf~CcEK43Y__2our(z|p2$`&+*ARkaFc^g%_KZekx$}@aTKCJzA5Zkjrr7Qt9}hU zUr4ddshss;eX;&TMI1F*cLtbk*}j;(qM67rMh)xIT>DKE-fTtkSx9`+H|3K=Bp;z< z8CI0PFB?^vV)bOPxi|N>dQOwb)sgWmns=7Z?vV8+Jvf5X;N`V{#I^yD>aq((sdpTB zjB(gOyElw=%fy4s|F(&~c>4y5m`l2y$W!IrdGjkTo#u$<-w6F2910kVcI6fjA{7jD zt(TpUuFBPSBBY+hMeVpIB_tRwm7l#50z9tL@=DojF)b~B#UwuS8;92?mqpqWH4e#f8W{_rmiS^J0NRh zTIA&ehEQ4SdOJc^goip4Fuojt@6`+LrK8w;@%MGm09QY$J%ckcHr1EqlZ>fbspd!1o{(zOfgJq1BY@=z{>OL*4!5L5kTBu>87@l^@xHG7q_76Nx1>RK)l*qaY zmavnFxy6>;u~2drE;n|JRK!QJ(Aw7)Q3zc2Wob8NA1m_FqrtlX8j`mUk>1}?BdC^} zg?`(Q1Y4sZNg5DdV?Ed2hEr;E8=i+u7HGOmKWj>55 zp!L0+O6mt*l^$10%on~OLp0T=oQ2t~@R+{icDJ*r^OQ?UTwVN>;N+2w3e1WL#HVsy z;)g}}YfcnM69%)2u%)g=kO12>UG*C$%(i&oE*wecSn_seAJrt1Lap`Smtkz*;$Rn~ zQME*<%;I|f$GojZ>`z73*A?ev(ttI#^ZT-gVk}J+B;Y{qsb>@hua%~VV%7#u7le1N zyZYbOD*735N!t2xNj5v{Pn&2`9=P$U!9VEEQUZ3IXG&shKjqt_U*oeai{No%mzQI7 
zBdRRFlu01+?g`l>7D2_&i=|hM(gY~DXqrny(IM*sd=Mi*Z*m_jK_B~X-kB-dy!Td_ zx@L4vR8zTud+jHi>~^b>;R+2qpAOk)MiKhB7Aac}^sg#10H>rFVFI#F(m%dF+$#Nu zawyvz=l0&dW1yQ3=u)kPP^3FyLp2~o-T_~xVqqAqPkFS1V^fPcWLJj>pHa2;jFz|q zupGtN#Esd{PzA)?2I4qB_7EM3l!V%DPU_VFt)CJ4zfk?;9-7&6CkHmiTdH$9ss8Nt zk0F&;^B`7LE{us)h`2s#^-RY+M6v?nBHO7UOFQKMRH6^x=?3Cz-jb?u&rb^~!n1M@ zwV{76gCxS4C*RB9NlJZYnB%y=3hOKgR2uRo8;1#3Z`2}a$H4r9r*#9Er^gL+AEF1} z7+X&lsBu>mNX-=8y-FNS(&027G%X}}Z15}9UB7ZH3c}_vl#{n`2%Xc<i>*V@j|j~@^LCBt=U*3(o~#j%_{GzWfjitZVo*2yzZ5;T^SenR(k|`Gl7~IN*KxK zYFPx6G@Cz>dLVC!&S+^*e$?i1z{vq8>Rb~yQ|*kRwi1_;ky(oW>9J_G)T*()q zVZBJW-SYyLIKPX@s-*bLt%!hcA6pagk4{!@!zXI7{? z$Z3IE@UK*=K<{sAYobDu;mJVKs}Wf#dVtH*4WJRfTj9Ix+U-k`Po@iz@=}zlYMG>e;9i`&I1=-3$JpJ!JZD;2C275trN%)rk^JZri@j4KT*Tu)rh zfUWSkG|HL|^`o9PAMnKql-=a3XoeVSsm;2%-SoIYYdxQt2{s?jDxgK;CMws#kA5sk z(+{(=3_Mdgl>Sxc4*Pd6sb2e0R;3~a_H_L2V^+eI@AHPyGmtTphi<$6`||}@0jn-7beIsxqBFPkN^znWB8&-Ki9*i*Q7~fOFkP>ZB_V5|}hD^Q}8boaJJe z7U-nRJ}(N>f)_ZQnuTiTkd7>w=UAW>%luJ-crc`fNp%-Xje#y+Wf=`M9BRB$o1Ycf zyG+2o?myERi1UX{bC2530UQJ#KIo#?W4L8=M??mThf>Z=KXa}yxR9fWg&F=ZLEn7F z0rb*L@b21^rO4z!)2)DH5G5i^R|PZ2$0>ru zYNQmwc))y)npg*FQt{UKhc}fvI@6+h5-DNg_J8do)b60ma z_V~7K5l%5c+5MuL?QT6Rgrk`+W|4K?tgor$8XiC%(TebAqg3=z$J(b409}? 
zBFjsSgx+F6g&@Mqs|#=xPd{*329u4_0NUZyy6!7VTZiK7U8Efbs5)5ceO4I+Zn0Yx zv0bHat%%0c{QS9rZ>(T00e<;b1K4W8joaArSLuzhVU_f%GCG1K=$}X?vsLL(!pU5& z4pRfazuTO-g_Bv;qHv85HJ*DMYn3u|KY(W7O{D6^lpn=ENg?@yhkdewg;y6R@gCe@ zMNk#Pjm13-wj@>_N@D{*^~2=z)f5)(2!d<%E2q~j?{W-s{e*8+K9lMj>>zC4i4#5} zw8hs$WhB7ZCE7>NX*_qnMdBd{^Zr4xjbCO|-?z6&N~RXUMYh1iojn-y*8m6Fvqs(a z*RY^iVJ~hOp6Bhc`jmhk(y-<^-1fHE<#Z;S3b{Y(_48?Zt7ffHA!4Mg_FWJ2rtL6<0_PH_uQdT8LZ=ADj7^xpGjC)7p1)RBEH>)W!fy6GFD1?S%74aR)L|qNGECibyypE$0Th1;{>eng zw$T11QrpYcP-V4kd=2E4I1tdZyAH9ZO)SmX!Rm;?W;VNBV_|}w!3JbX+UtxSsOyl_ z*d6oYgZ4AWxl@qb%6EyRFD$xA4=ZQs+{+ly$;MjC_JY zQTxPIuYVO%!Bi{CC^s9;cuwDPvap6EUk5S|j=fcFFcx8Vl=#U+oB);z+#RSYa_0vH z#iRIt#J1p*{FH1T7w`T|lor8t8QtUSHYJE0l?pj|%LmbFAWL(I)K+ZI1Sc&>l*8bpoJ#Gkuj(!sX>ER`C$?j% z#cx5{0)heZU}c*FfMe^c<%}aXmO;u^VIOsh&Y_Rq*h>DY@Tb^YveyMDv&(EiDf2Gy z_H$Z@C4x5x;vsU#EqGyGn`e^@#O~IVnY9STgiI-)f&|;oc>wY1L<^Dzj~RIAITi!D z7rQYuPoHjk8z=trT1|EEPu|)l-xH$tt*S**a^pj$*dfBLi^>sIlP=8SbC1~i!>ed8 zT2voduwWd*K6E5KSbn|zY@4aGO7~%i6V>eF6!?hob9Bri^7E%hQ2p~9@ehLMq{pVh-O7h!nrUus<29S>m!AXkJo;R-y53 zV(kXofe^No>!`kJ%~}KV4431>Y8kjEAMH6|V?%M$q7+iZ2Bu-PEq0@lsxP}I2ZRVD zwtkxkh$t~tX%drzwn#s8$h|F?Y3Q2SUH3Wsmi&}i6dC`F~apuRahv!)|F$`plj2O^y9oNUUp>9M!9^|9K@kMuDN98lug zi0V&7@M!3gY=jDphcnlV4n~%M*5lh7R#3yJ2MyC-7b!yDE}aE-uUbSNDk1u=RA95R&=*ME@quA3X-|U& z6|ZrNY>Sp141!fFy(c6p0{#UwuG_f8d_s92GrYS9YSV_-pH;=kAvZ{j29@ z#>|zaWnt4?ik8KIlg~(O)dUu3_`^PxN;8SMJZ8FM2d4gepbR$56Vs1|oEM^<8Ch{e zbT||c%ly+MF7LgcCWRWO@(SbmM>JXKXr6Y15~o>VnRZ&C;k4xdBbAC)4s!%488Z_> z+?ak3U^v;(jlW}5Y*NiMR>usPpsP0x0EJ{h^7Ehz%cm`4c7g`A%nl^ebaEnKMFH?K zJ;lhYiIx9vp+#2hA*mjvYA1<^l}hjxHw^P%h)LX6OZr@O+IAyL&?TXY# z0_TO z!hb_7h&WlZ1yE`o*tb}3WHWy1IL9JC*Ix(B>?g0#ySrQ-@S~J7>=kV_5sf~k*u-P9 z$bCxwdlIkvLUAzo!|({2(VD`y$-W&o!^EAGtgR77DGwK}niI&>5dwPSWZ0N-S7d{*6UZ^UTwJE=^efrip#= zYv$lYBP7w|Bi!r*HLZXRelkqg2-)U_;sA@W|9<$6c>yyc{({?aazNa3exMHl`TFNmVErj0oS2Q(D0d z{>Hp2Ri~s}XNiJ6d8beRip^UF8t5o1>WaYtn>3^{+w1UOs=bVF6ih;zwCV&ESk9CY 
z-W_TNItk7EfpQ~hHp}}iJ|LXKejwN@f!KiW)PL{3&mquRap2(F`L0|3ox?P3fh;i0|Gt@OR&ptMw?ScNPo4BnoGB+Dbr?ssVSQ2o#Xm_9|#}2aO?6)o(lpqzcz_R9eGsHAI}PZebH*#XVQ7M1H>D%2IwNn z#}W<1Q;dE2@&Eptwz7q~Y?jgB!AKfFAGmn{*;dslq>=*M`cZ)J@=MmFfJDK5hr)Nx z&|6#OX;CYAK(Uk}HXiun!~nBnmEwa5jU3U?FelU1zXnhnJ1K6hf;GZ=?N60w;OlU=Vn9iB2=*N$jk1E9Bt8LW`C}D_gG= zi_;8J;8p=0#(1buHHO@*^$|9epz zYoG3^a)BUrVALq!CssTB_TDn;Kf>Z{uz};?gVe#r>nuQEW^m5u2(2TsNWi@t+_VC% zid3u9SGx6y+g2PDrLQ9tjnrMiE=ZN<5h}`v*s=kZg(6aPO2k1}o*j4DnnTgc0#s-e z-bRoc%v-o=GhVeMzA&qLJuoGc4kBO8x%k%n_UG{?sJ`ch3ci(U5YF zFZjo8KbUs@EDZ&sU6*Fy~2ep^Pis&)Dv(8g!5*JNrsp}ZSlfudKpo|eA%MO~# zhINAcw()}3!eM3_DV^(nAY4g}V44g80!(&Zzlt{+?2fC}&po(B%pzD;&$-fa!J>Ko znFWO_Ax-QQIdyer7rlBpm&8J<)=6H2%7WHg$7J3voMN(p1b%H)RN>btKWhKA*7ImO zlk@c}-Ps@W%#Zra^8JE0^`l(v+wvYa*|_PrkdM~|t*$5cjg4Mf&3fAoftX{gwNy_M z&D?XRI22?djr_&3(45u4u*($iPKw`Y* z5M?hxzVYtuxCo1K^Bji|KU9i;VIojoPLJ;rM>Z0!8676XT&^VQS0GK;f}p%%ztTAE zh>`-G4zcRM2cVwK ztUl~xIM1}xZBji2M;^q4r}wovcAA)@#xvkPbQfnwGS%WBPm*kp0lB^6lLvL9v|yke zlOOLce}boBgNR$YdSKV)rj)zfjS0-yu6@Lijlh4V`RAA!<~CT0f9^(I`H@1u&4h8I z%$JrZ5-rw0{^gV+vb195bbQMuuYJpnv+QbzeN_6S6oy(3yPJM}kD8I_gb%R5<6>+a zj4nWJhxC%cbfm?so;brjVz4+a8L4=iSTY0U=TsS57dk&9vpr??lSD*Gm7Ykx5KWQs z?hEP~r)6n!KaU|ELk_s4tZLQ6iD-iXXm?}vw>d)EsqbNX#sxh@{F5Ti&xE#+S~AuT zgh2g+s{um-dR#0n{=R`~w%I5pC%!kn4LB%* zSok4Lol%2297E@KGHd?Y_DL zt72XlkCd|^v#VKvRU?;@{qySgOSnn37l4VW+rydG1?tfB@-=Kir>rc@GsfH|~I zvn=|w+0exIA0ROAj6#1zrKM$AGEPZDNU{bI(wX7FJ6DQ>Dna{>g8(o&T&X!=$RrMimR`qAcARx$k+pP9=&o$D&&*3z{}K3Avj|8iavhmKiFh^AAD5^C}1nB0LJ zalQ$6Wn+DG6u2vnCdjSGYe;$)h0`Izls8+Zo>61nybQmX;FJ>iq_5okb$?zL?DM2c zucYxHQOXBat-q}681|8lj!2A6nq10|i4>`1N~G`24Pm00$(<9QvOsx&MRvQmlzIL1 zNJwl91NGtv0&!z8p&;D}--$JH0LR2-50HK!0(a?!tDPWOOfM0VfP2r=EvKWWjej<> zzjfAFn>@o6qflePNa|(@7`xcD95%ZB+6~V=iruZ7*`lZ2$|vjL0w~xv1?q{6K2Z>l zu)bNtkh}P-OPB|NEcN`%$oDFXPGgvrgHle+ewGxDsJ&8f2S zwb9Gq1Pg}ZyD9206w5%Jy(@_4c-F22C~^LIiw!0RgUMYFXIs*zhXCWrU%}FMvVWAP)%E**>nx!$ z4U;!0nXl|D&7_is+!ZN=GzHPTTG&NqMnL<3$PIVPQ!vrxmR!q*3(v;?$%v;Hg6xRap)pz|#1q 
z0)5>jGW~~xK5bv{uj|{z2Wzu#mP#kZkc{k`cs}$>($wi(bJ;aIG4HVP2#KAK4RtM7 zAy|zmMc=2;Ld3@HWUI|q`_^vhgsOFcq%P8m>1cK<)+rZOa#lFLrVXfvk z!1EVj^ou>rrKdWEwtnySVu}X(N+W?rJ>~mt7xJ;!vVr073$1lq5erufX}I%KQyMiP zoZxeXYLu-=-g-8S5PJ$pl(zO^zwD-kWM6P*I2WfaA34jeec|J@1Iv|^;At+*K{|F z>0Z00WE=J=&i20=nou_>f8>6yYX&C@vh082aL+VO2o_K?THT=})64jNy%YJ&<|poV z>=sdK$K}{xDKIkWVywVm_J(1wrP#o1SEvwK{4130zz~k2DJ9xuQZ|dx6QY4E-%zbO zo7a&*6XAA1#uT*?@4?_145t>cv|Nis=&8cUW%<+S?;frp;-jBJX8LO!1jYQ;T@#;E zUNB?wN%oY@liQ+nHGREVNdkLDQd)j0SqMQm3Q(G278IBr38C9iXfiM6+c=+Z?b<;U zi_U1uE`{$d87i}K>YTZe-{V$cH$lT~nR`d?Y~Oe#sPd5`nh(lq>mDi0Rc zYC))$vmDinlwpWat;*Ee{AHWD@6yF^#BWka>2Fo0&kX20b-xcq(-sWdRK1>0Ms+WR z_%)c9#O;*ZYR>JUGcI*s+oUXW&0Pz=pA`0N_Zekupd_}kmCrpq)30&PO0#gB6bZM& zdYS7FwZzsp17^Vjre1-lWx|m>eushbe4rc{G$iyjvF`8fzH76YzPAi0hZua$!ohm< zKvp1Mw4&Gd`az*t{?7i#Q4_O!mW`3>M}ac{k}%EB0a#WpO9sE^quN7W_gyA9l~5R| zKmkYSV#7aL(R>X%GGy>SzJ4n?53s8Ul=uapZ7b87<-268j2dx?#q1vX!vu&GBB=H-K_u5_zEI8fCwEGA;GVN`7B z7NY(p*Q;%j&`k7|82N(=a9lBgn>dg~#JrAEVkgKGdn0~8Hr?v?qWp}W!}*{VqfHDX zDmQ`mxR(jiVG^L22Oth3bIXH)fyWDHM~b2IbG-ei_HLbjRc4R+HeYr8?2eHmf$4OE zhq!!rlE4z#;FWVL{pXV!3T72=Mzmt1$o;n)0V^oJK9!|BDWLI*YuIV_$iEgrBp1Te&C7Uqs3qd0!|qrur6pSpUOoYBF^ zj(Tgzt?&4ha?PuJsX7r#PapU51-I)}Az##m-_iS~gRk zz3CL}II=2|+$(0R=3s~QFFouDFMf15odTJ>>7gtQCJ2y@nlfkAwLG(!5*bsl0e7^< zQ!~WwkHfXfHaD_6Q=Wvs*hac?M{BD*Dvxg+jtvud7VDup*BSIJRu&f!QGYB#wN+oO z5uB%gBw;eciFgEU&|qD`v&%sV4x(61$nrBKKZ1W2hC|OC(Ip1}4SD0A?%KDsu#a3k z!I;W&@KrmiFz8r&=_)oaMLIF&5_spVw@}pPuC! zp5~ju?ev9{A!e2N6PdFb26VwdUqbc4du`bOKJv4Zagx>Dv3d(Rq)PJeF9+X@WyP@Q zjHSrn2nX0xi?YD(agi{M+P;TCuA}hI$Kias#tp|q)ru;d-Bpm4u%<74Q`@Vv+5K`B z%vGmel+gwzc5Nouw!1}QPtanz9jwa9TBBbQ*TGCtN+x6-@!+v;AX`upJbYBLU1=C! 
zm)uvjV?qkN7{f-GcP!q^^WRd=6l?MS@tprPtH-V^r^dYfe59oFLWF%TmCk4C$fa6N zoyubr0VX=3Pt^Il7*5NX68)Vov zq}`37iqk89@0(^u$jC?Wo=Tmwlq+;$3PQF)1XR;`Z|d^++NDwA5?P^euRElxU;c@f z`laQVRU*beU-T$>CCfy#KS;x`=+rSN>+0saWLV=p09nUAbG1?Jd$X0$sI01H8T#Ix zv$LUwtC)Y%t+^5h9b>j?5%694n(LNHt@cfQ08goaf*v(jk`0T`K82&AXWrUUEK4W} z`dH1XA}n>{>Roxo$4a?LF|vsBDh}MIwH(iYAUtM`A>sA^XiCqpZ5nPPfHgi-SyzH; zb5x8u?R_*gmarrBDc?rOb;Ic%sW9<)yV~YNq>gwUcoLVHmHNw9CBTfZWy1Vg+#&Sf zBQO0-3AmOnwW9-ab#-W>tj#S9!aOW8L7cqA*wom}tW8^c5_`w}jL2Xg3~YqElQHSY zL!~2^MGTP|HZ+^PFY3iuCJoxnc0f4yPwKHCv2iKkta z#YADQq+m{NQoFh&);Hl)x(i1bv1L<1(%vrx{jpR~_Zr3+10|KuM@AW=OHiaE*wc;! za*zvr-6t_m%t*k^O5!H1uFrb#rvr!PXCA#D!l$2#t>II1Q<#BrmBG~8KwJc zcZ5T@XGxDq3m&-{4olAveWIf`-j$n(+rRU5aOwNJ@^noce^-H755O{4DUIk* z-?lD7S<&iU&naP2cd}m|mx5hQC1q*PsHfehZfQKY+TDk-dmZi>D{W!&)4L*7u3jrH{ z5aFAG9&4vKym*h?+|At{I?zbf>Xu8x&vKg}1IHE_Bl#GsWmZ+U{g%Lms!>qeLzTJM z%6T=upKReZH~2P?EHql5)_dOK%*>uWefm(nJydFqw924ax71JN%1?-(=>X|QRNA)y zc}41ZfZe@!;&(sVGG-KFJUtN}%aQm(Vf)hU#ta})*ycyD)$Z*r+A{**a`n|hF`ha7 zSmI+_H!fo&m$XHXc05LQ9;8es^CMKZth-ha7Yrd`qmCeyWY}J3m`defiYE99&g!&9 z$er|P>t<8-K?{2_RC~!J z^aRV34Fh@4>u3m&hBjW*HXmZ489vIHdhY z(s8rSA<=zs9Po=v$r^}vQmjnDm%M~bxpJ+@t0pIz`q)p=_OT?y?(}hudX|jT!4n5L zBj46YYHm)<_@bH^bZW(DXbgeQ)e$CR_H=P9Qy&(VBcu&8DkZ(uGwMi-P{!xWJJjZ6Ox7f~{=jAWd_kEadRHpc&ZHf@^SYparm z7{69|+!9_7_arE1R#G^<>Z5E?@Y{|`hd5mhKG6WZ5vQ-1S%h}F#;Y56k=7q`-i6Ts z7=WZcX%hn?ED;LW_|eTS6npysFjA3P+CalUUU$kbbAXjBHR(E5UDmBacg>^r$M=>w zdfl540mSB@7|0Y>a-GgBV^*!)4kUwchF7af@2RyH=Un}fzpYZTu zBC+otdB;igE;qIn%Wenevr-N&E0dp0YOOar(oD0qe3!p0#biP_v+`Vr-KFabRLyNS z^X*he>Jvj6>&gGrQ-b$xogCXl`4Me$)3ffo@DO1_)T&Ekf{{Qu|0ofs#qw$lw-JfK zl3k1blr=eK9^U0OyJHFT+FD+6BCxHIA}+@9N7pRC<8DZoZi@cR+$k|u%W;kFvxF%* zZu9L6dH8z%SLmtl)T_KzdwY3Vn?ZYYc!i+~>t}8lZ+#H=4yC>OcOEhrL(c9^e(q^4 z6JsVY(U{FJ5<#wev{f=iN9i8z2n(D(Jml)l2-ZHWs-{+H#6}fUIT5MUffIFJPqN}6 zrO!8XL{eUYfpZi9_O;*JPkI$|Ys4`??FgBw;Ph^BElmW=8{BR9Poi>M7Y#v6X_`%P zPWX5u_5wOHlvEiDL~9#(&hfMbP;Pu>koIrO9i|ckF3)Viv)Fn5>*Mnv&TX2ag)+h+-y9tF&rHs)X|nO z$m^F 
zm6h=4`Lyg!SJwa~;$&s^gmm>0GP5E(ufjLV2AhLe*1gtX?jjK94<#<34Yj$VA*lj* zgHXO%)YO$dRYu_rjR1!G2clWq`hk$Zd%`RyAK>OGieOOZWyLvv z5Cc}f@lTTvJ~71y0kq(jAxd{0yWf$n^S=(^VH;bvYjRseZ3iOA5i_~&LQW{PT$n3o z4!H+zHjQx6Xw#9ms~1jp>cj`^xSAdZ?&eIF(+E^ZVw-y|=5XG0aqU}{Jehjfpnrr= z=SWj>u^Y6;3ZJZhw&ZLLyX3}HtxC@2xj$h`vwy@C5rL89aEXu@S8_2u*RVRu3+j=)~2a!bXO96_tNaj0-B7_^Om@eDxGDWq4PvPt9yRk(!Jpyo;C{W@yY*`H!UHxKvZ#Y4pDwJ@N$ zpshO28dHeVlQbHv>Qy5%63i2J|C@X8&~XtRGvYK)(rwI_bRQ6TftVw#bPD{$pwZqY zWlpk?p&vRJ^pRXlPXe_hbVn&ZRy{@tM%=8@0gIYdta4A;cUggDC%5^CaV?oT<0W-T zO}J&kr~n3{odj~l?W@RgQo+bjRt)5HOqMC7$NIvuO<8kB9O<+OKS5M(z zjPwFr`k1v~k0A!voZiS1u?@&g5>-sl;*-zZ!8(F^}Ed$K#eWqmJMimmpS^yY8~_ zX*C7W(24ijb&3<^f`+w_v%;2fH}sLz|^DO&bAsCD{K zu)EXBlAT{5(EK;ijy!dp`^F8S3#AotlDBGy(J1Y0ws$X-CbVeU;BTP{eLNe5xbHHw z1k6fb@+gWsbLX318Z)xFSxvN;n|tf0@-DwP<`r-_$JF4(?o8Sew4$kBT+Fx7_DlG& z@?=KW`8dUMHN>`ZX35R$v0LYV2ilQ+?u;66&q=O8MbSC;Iurk#Gx%+yZn3rtTH<8z z7)L)g|3XE$PD~l;nmlTYqQPWfIZ(g#-VK4 z)D@W!o}V%9Xi4n4l{?HxSj#Y z&*{t3QnulTv_jsXP=w9e%9JG14}=JRYI_G$3efDpsZg5YGq4-5okKSco>ADVYO?*g zwBr~>z|&oXe%ciqML5u9h0haBH;^$EpF@YB2U@rdTojLDu?x55Ko7=!e9 z6Ht8)ClUijITT^qdVKvo>e=N*W7|Xh#S{Ie+67l*q}*TFds9Nb_nVMeevZs}p76}4 ztbBvhK%#x!K`aHrw)E72Ciiw;#|` zbA(_XvEEcg3lz-SDI!z2i*hyidGy3zL}9jMiUkAnv{@69XMcu%8>A7gld9}ayXoa` zQcwH{nho~i=^1Vuhgl-`0u+;Y+O97dInu3}74mM3yRqy6_>{$kEl2z|ES;G3T*L!W zVeSsxrp;XpY++-4t4AiUrol&9I1Kag2n+Sldmh4u_!I`$y*Nqx+-<7eZ!_a{GKcp2 z-%+NnQTRKlhcr-E7SYZ}zCbn3bcn!Ekd02GX|6d@*)O zAK9LWyWxdGk#375)A`>unJG+%r zz{_u4K9W-#*3fG@9wUf0(Y2ORvj>)=c&}(@{RY!0@F%jgwa`QC#B04uxRHx8Y>qKt zTzE{(tU#z5P#fxv^K+Ml@Is1J-O_*(Z7PyW_rI)I?mgs40t6FpIf)iGd2W(XZN@}?%7e$&x9%*rxn^lJ$-`Ne|6 zweDx;u@e=>i>{2gJVu2s$jQQZINxVg+qIuz*vi`tZL`rsv}Ich3IG~WOO2L2zWIak zE8-kdhI|%V3#`E|C)b_SAD#xmMwx1`F*DU?8cqhR-sCVS_?vA~W><1Flp%g%(=`1* zQk%NG%C6Jm>6n|qjW@)QV7AFR#VcL}BOs0>mR(`Nz%<7Rx>~(X_=c9Q##ueLNrQ`@ zIn~;k@ZXB31i-aXd-oD))NoL{R&b_u_Iyha2U=~Wk(i5L=~mFRfHf0NyI%Z&c9T=(Xe!-bvNIpqyeF@GK6{ z`wy@tAxSf_9{DrOVl+iyBb}B;k6FyxX=p4XYA>f2>?e}n>GlKR93PH&X@x5Iq;wlH 
zhV{#_dN8vIpmNMxhIayHucoX@CX&@jJF!C+pFdr`Uf1y@da+{z7UP4B;9<0LN$0GE zip)Gx*J!e%RqrQz_dikIj0!OtN05ZpkgKEX_N5c&l@Em zCo=#qNNby>lAz+ZD#jgVK$whi>D4CS*I*W_CIslml$V%iH0hWy)C`0AWiuU!*}NWq z-hcn7VXqH8zx|vfF1Sa`zo!i-qNp|=Ht>Thil!2;0rS3Ow6yK6On`@qd~v7nfJVmN z?!2-Kmf-;)AvVq1i7e*8vCId&t`=>?iv~K_AtpM&i?augV5z9F`-kKBaI8n$Pga+v zCP>U(c(QtXom&kw5TmS4+sc>WPoolr7Q>bro1R4BdVJKH9wDFI%HPmMLUFwzUf_`u zU}p1f@8P|Jmq*aZAPYN&+(LmZhSW;&2!JjdD>*!q{&C$b`Orb}TgY+S?hdtzbjjy4=G>7s9*R012txt5%Gs2c)?$iLK zc&O+k>#iLK<>gCU-+4NI&M;>-nUAa^{)Gwx^5$ZGoBsNl$s*3T(1MBs?2Gs^zRs>4 zl^Py}X?5s~y(oe9K9ur1Wfmy5G-#$2(4Wd@;WRGLwlnDM*xGBY30%bo^-IFGlL@AF zr(ym6qz8gU1=cQ+Z&QAK9SkTdofl9lJf+X;U~PU=ZH3)nqk!L@XgMUWP65#aC}q07 z@QRP*tW&$PPxMElT_kN&1LGO7GPyyo)0&r(Hq)upv&KC6U{!BO&bG@c2v+Rl`BQd{ z3@58G;*}KY1Ia=~#{#Z7I;xS0A zwU;)U5X_p!3ZOOh)(gE&BlTI2nNCDj}2J-0lL?%h!Mn6hoZq#zBT} zqgB}uDaL}s2{B}qZai_!4|M%z6)Dn|I=CvKK44gfo$_3Lq};UYlqL1~fMmi*J@YZc zaaNX3JUJVd?QlYR%+FnK`Z&@Wn`Ad4IPQC6WMJ^sctRa#koJ#ATh8w`CJzeUW{n8@ zR%G07pZn3K6+@&N8!0g1E%AQl#79qwIs#ICaN#L_<*11x>EsU{owj(iZLD1&VQ^Ix zT!t(@xG&{~UrgBz)a6x-jBh7P<#WgFx45+S$6M$uE2pfPA&=G;>=lDgh5!ymvQ+%# zbb`JO#j(N|8W^w=nV4aw)3u7SmZ=K~`F^s^rd}N!-lX^ZRY)&}C6ZCY=e+!wb+JPz zN5vX%9^|OtOl!ZgIz}%u2CZqs`#~Lr@9yx!N3||0I3zps-@&>^HdCov;^5rmgv#-k zfK(`gfrz-S7n~^mLg{&}wL^`O*oU~WpK zZRWR$$)(4UazNHH9;w3{*L8iT=a8W>J={wSdo(|Hy30iRL7NlYhI;ZQ{KoR{6bQPe zU6n1roQS`@_3h=*#ZfQ;@k)lDEq|Z2fc}j|R67Q zajMX5*AdRLt1_RdFso8}KIlIRXS+bH$HxFEu2iH^TfP;r0E@(1qsE>W@568ctYHxK z;>I8~6fYSwoNt|OYpm4#o6xJh_EU^LfNhD^=i8B#Sg<`$#CmyUphXBVE(5$H44jh;Wil`KYwyW6H+kdp? z!$}v6rg`q}F9QiMM6lKT^vpVRgXG2sguc zqq{)s7%1xkKW(sMmCizce2BGohd0vYHi2H$w+iE$mj0zW-9{Z(sB{F8mL%aXmf}Q! 
z0ow_8FFJbb=|(!gdQYl7%%!Nmtc+p=qb6`7T`1KaY?+?TiA3X`Ux|l6MDb>QLnyvz z(X7Vq%M7Sp1Nv5{Sm3$|;H%=btT_Q`tfH=nVL$2?nEmIrC{po;B z(^umA5@!SGNGrHQDwsI4lC?=#A)F|K0p`93^h?u6qA^m7oM|hK2uwt((2vUer7=ph z;PWMF?_1?q_5fm--_2n8f-%3(3_?Usm_@ zk95g{>VD=5(X@`qReGXI{6fVvGC{`!VcnRG=3pwRFeWDC6>qml1gi}GJ7ju^ zOEB3<>dTvOcY$Dp&2*i+p99}XYc<;Y-MCWrqN2v@Jha7Dc2Mmk5gGUnh)Y z{*D#LoG{ecCzFcy$ji_=xmuNbF=?b*gPvF5`9<4=mM`vYX%8p(<&ce1XG#pJBzlUS!=2(dt&t6vop;--lkz}EYFH*n zHb8#P9K}!fla0jblkM>B=fdwSa5oYWRvHN5flGh*Lt=s}@&Q{Zx;m~&h_$*g5@D%X zO3@(`T%C0zI6z~pB=T`I(ZrzxccG$IJ--MFJr=ri7={c}KwhUSoQ(dsv(nK+p;=J0 zWY6{bMfFM>i??5i?&s8S=24rDlOJ*?fVbhFsHMr`xO~5sj1wKU zoVnIh0yyL^&IPaxshT7gaLM7-D;Cn&dq2F4ZhXr4RDWbv#WO?0Y@oLM4bFUY1;Y`O zD88$$&#a9fAwfrWeo?->*l%t5Ors!#hus#J&(aa8egtvPge?Z!@3lPiqVx7@2ty zM*P`34Rrxc`J(&r(T=)LlBlRLI%T2ut#pCWT!#n?sM+%bOS_ZBz@|VJpt9#|x8dnm z%rRy_etcgO%pa;Dm;sGA@4SUYax?jmO>H^wqTaO(5oMoV>n3mOY7>#5Dh-)3Q0ON` zxS+3!LWN+R-+#IIY__c;F1XphSRvEntST)xeFU}`db5TQN=s~+{u<}wLp-3Q#y}gs z;>MfK!S9}#6+v?;eXjmd0Cajt$8eo;g5(Q zl99_E$>oIH-YUo{mDsAVik!-hRzOwYsRSAHLMaP%l)Ev1(CpcZBauq!hut@dlFm!;ol z==p?irSUC>8hWAvzT08=6ccBX*YHD|`bK!noPTHSp?vV~5GD2p4MmD&b7o?`c`qqI|t$5)7`B*%R2oNazOG1<&Qi zQL;qJc>_C0Dr~`V$BzOVQQXW+wR1ymZN?7cg_2ybYx+FSFaw~2)$hV-8oX?XZ+q+# zP1>VfQsrO-X8`RE>kID%(M~6WF3qD_S*B)Wv~rl_Zuqd8eG1#?mfjQt7Mmq|&hNdm znk~8_UnBkdywf~W=^*SwSw^ndG(gs9<~7(GSPoB6TaalE?-N{R=vFcoYoL$^-+z^eOOBcI0=`{?hl^MCwy zvvpeBys1W}MRbQr`1ynAF`p{fj8Yq}+=bAHE#J;1sYh`5=0M-rX!g>Z#C#LZ>sl4k zZ4ar>K+mWwAg?6QPmZN&Ktn>R^dAs*)*zJDdFDXn9LY}?T@Bz(dONE4v@W;S-Kw!M=U5xi?M0%clv9_Kz*pCeb+_@ng=()*#FfWymKy12<+d2qL5Ba6~JNNup zhe^(9p_9!kK;Cx3YFN2Y~Ru0pvdD z4dpW!IB4$vjr-qkK!X1!1Oxm0@4Ek!h5+?R0V;m-4J6WW{Pwp`$bVD&mp4$q{|W!m z&;Qmh_`6lzUqeb0fFlq@5t>*5L4|t$0?5O=U$}q843TaPurLHkNUZ_?`Rt(VFfao! 
zwVuCrOy&S9hrbl)@?L`Hzvy?Y{$7$t($EHAW%8~AAr7j_VhSQYI9va9`tM;=0{)Q^ z0zjo--i<-)Jj-`+s@BS=dG8~}#&wgHA`2Et{F!ot~E zXrNgFd%u8#N)DtNfP+DqpM?#-+~2>OegIXa`DaKWzAK==P&dW4{u zjNcuA;J;!5#rv0b1j*e0O8&Wk_~?qDklq&+1*qkK#snMGvO&!o^t&g};T9|vDBcuM zHeoRczkeC)2gp@22z!F;WI;A0ptc3#d4ZOnAIOdY)I>q_F^HE0l7Ys^*cOD};XNK! zP&0A@I2wYmwzbJ$3?jkE{(TvOmac=Lo!$FN`M "DreamZeroServerMetadata": + required_keys = ( + "image_resolution", + "n_external_cameras", + "needs_wrist_camera", + "needs_stereo_camera", + "needs_session_id", + "action_space", + ) + missing_keys = [key for key in required_keys if key not in payload] + if missing_keys: + raise ValueError(f"Missing DreamZero metadata keys: {missing_keys}") + + image_resolution = payload["image_resolution"] + if not isinstance(image_resolution, (list, tuple)) or len(image_resolution) != 2: + raise ValueError(f"Invalid image_resolution: {image_resolution!r}") + + return cls( + image_resolution=(int(image_resolution[0]), int(image_resolution[1])), + n_external_cameras=int(payload["n_external_cameras"]), + needs_wrist_camera=bool(payload["needs_wrist_camera"]), + needs_stereo_camera=bool(payload["needs_stereo_camera"]), + needs_session_id=bool(payload["needs_session_id"]), + action_space=str(payload["action_space"]), + ) + + +class OpenPIWebsocketClient: + def __init__( + self, + *, + host: str = DEFAULT_HOST, + port: int = DEFAULT_PORT, + path: str = DEFAULT_PATH, + ) -> None: + self._uri = f"ws://{host}:{port}{path}" + self._packer = msgpack_numpy.Packer() + self._ws, self._server_metadata = self._connect() + + def _connect(self): + logging.info("Connecting to %s", self._uri) + conn = websockets.sync.client.connect( + self._uri, + compression=None, + max_size=None, + ping_interval=PING_INTERVAL_SECS, + ping_timeout=PING_TIMEOUT_SECS, + ) + metadata = msgpack_numpy.unpackb(conn.recv()) + if not isinstance(metadata, dict): + raise TypeError(f"Expected dict metadata from server, got {type(metadata)!r}") + return conn, metadata + + def 
get_server_metadata(self) -> dict[str, Any]: + return dict(self._server_metadata) + + def infer(self, obs: dict[str, Any]) -> np.ndarray: + payload = dict(obs) + payload["endpoint"] = "infer" + self._ws.send(self._packer.pack(payload)) + response = self._ws.recv() + if isinstance(response, str): + raise RuntimeError(f"Inference failed: {response}") + return np.asarray(msgpack_numpy.unpackb(response), dtype=np.float32) + + def reset(self, reset_info: dict[str, Any] | None = None) -> str: + payload = dict(reset_info or {}) + payload["endpoint"] = "reset" + self._ws.send(self._packer.pack(payload)) + response = self._ws.recv() + if not isinstance(response, str): + raise RuntimeError(f"Unexpected reset response: {type(response)!r}") + return response + + def close(self) -> None: + self._ws.close() + + +def load_all_frames(video_path: Path) -> np.ndarray: + cap = cv2.VideoCapture(str(video_path)) + frames = [] + while True: + ok, frame = cap.read() + if not ok: + break + frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + cap.release() + if not frames: + raise RuntimeError(f"No frames loaded from {video_path}") + return np.stack(frames, axis=0) + + +def load_camera_frames(video_dir: Path) -> dict[str, np.ndarray]: + camera_frames: dict[str, np.ndarray] = {} + for camera_key, file_name in CAMERA_FILES.items(): + video_path = video_dir / file_name + if not video_path.exists(): + raise FileNotFoundError(f"Missing DreamZero example asset: {video_path}") + camera_frames[camera_key] = load_all_frames(video_path) + return camera_frames + + +def build_frame_schedule(total_frames: int, num_chunks: int) -> list[list[int]]: + chunks: list[list[int]] = [] + current_frame = 23 + for _ in range(num_chunks): + indices = [max(current_frame + offset, 0) for offset in RELATIVE_OFFSETS] + if indices[-1] >= total_frames: + break + chunks.append(indices) + current_frame += ACTION_HORIZON + return chunks + + +def make_obs_from_video( + camera_frames: dict[str, np.ndarray], + 
frame_indices: list[int], + *, + prompt: str, + session_id: str, +) -> dict[str, Any]: + obs: dict[str, Any] = {} + for camera_key, all_frames in camera_frames.items(): + selected = all_frames[frame_indices] + obs[camera_key] = selected[0] if len(frame_indices) == 1 else selected + + obs["observation/joint_position"] = np.zeros(7, dtype=np.float32) + obs["observation/cartesian_position"] = np.zeros(6, dtype=np.float32) + obs["observation/gripper_position"] = np.zeros(1, dtype=np.float32) + obs["prompt"] = prompt + obs["session_id"] = session_id + return obs + + +def build_demo_observations( + camera_frames: dict[str, np.ndarray], + *, + prompt: str, + session_id: str, + num_chunks: int = 2, +) -> list[dict[str, Any]]: + if num_chunks < 1: + raise ValueError("num_chunks must be at least 1") + + total_frames = min(frames.shape[0] for frames in camera_frames.values()) + observations = [ + make_obs_from_video( + camera_frames, + [0], + prompt=prompt, + session_id=session_id, + ) + ] + for indices in build_frame_schedule(total_frames, num_chunks - 1): + observations.append( + make_obs_from_video( + camera_frames, + indices, + prompt=prompt, + session_id=session_id, + ) + ) + return observations + + +def validate_session_result( + result: dict[str, Any], + *, + expected_action_horizon: int = ACTION_HORIZON, + expected_action_dim: int = DEFAULT_ACTION_DIM, +) -> None: + metadata = DreamZeroServerMetadata.from_dict(result["metadata"]) + if metadata.image_resolution != (180, 320): + raise AssertionError(f"Unexpected image_resolution: {metadata.image_resolution}") + if metadata.n_external_cameras != 2: + raise AssertionError(f"Unexpected n_external_cameras: {metadata.n_external_cameras}") + if not metadata.needs_wrist_camera: + raise AssertionError("DreamZero example expects wrist camera metadata") + if metadata.action_space != "joint_position": + raise AssertionError(f"Unexpected action_space: {metadata.action_space}") + + actions = result["actions"] + if len(actions) != 3: 
+ raise AssertionError(f"Expected 3 action tensors, got {len(actions)}") + for index, action in enumerate(actions): + if action.shape != (expected_action_horizon, expected_action_dim): + raise AssertionError( + f"Action {index} shape mismatch: expected " + f"{(expected_action_horizon, expected_action_dim)}, got {action.shape}" + ) + if not np.isfinite(action).all(): + raise AssertionError(f"Action {index} contains non-finite values") + + if result["reset_status"] != "reset successful": + raise AssertionError(f"Unexpected reset status: {result['reset_status']!r}") + + +def run_policy_session( + *, + host: str = DEFAULT_HOST, + port: int = DEFAULT_PORT, + path: str = DEFAULT_PATH, + video_dir: Path = ASSETS_DIR, + prompt: str = DEFAULT_PROMPT, + session_id: str | None = None, + num_chunks: int = 2, +) -> dict[str, Any]: + session_id = session_id or str(uuid.uuid4()) + camera_frames = load_camera_frames(video_dir) + observations = build_demo_observations( + camera_frames, + prompt=prompt, + session_id=session_id, + num_chunks=num_chunks, + ) + + client = OpenPIWebsocketClient(host=host, port=port, path=path) + try: + metadata = client.get_server_metadata() + actions = [client.infer(obs) for obs in observations] + reset_status = client.reset({}) + actions.append(client.infer(observations[0])) + return { + "metadata": metadata, + "actions": actions, + "reset_status": reset_status, + "session_id": session_id, + } + finally: + client.close() + + +def format_action_summary(index: int, action: np.ndarray) -> str: + return ( + f"Action {index}: shape={tuple(action.shape)} dtype={action.dtype} " + f"min={action.min():.6f} max={action.max():.6f}" + ) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="DreamZero OpenPI client example with bundled real videos.") + parser.add_argument("--host", default=DEFAULT_HOST) + parser.add_argument("--port", type=int, default=DEFAULT_PORT) + parser.add_argument("--path", default=DEFAULT_PATH) + 
parser.add_argument("--video-dir", type=Path, default=ASSETS_DIR) + parser.add_argument("--prompt", default=DEFAULT_PROMPT) + parser.add_argument("--session-id", default=None) + parser.add_argument("--num-chunks", type=int, default=2) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + logging.basicConfig(level=logging.INFO) + + result = run_policy_session( + host=args.host, + port=args.port, + path=args.path, + video_dir=args.video_dir, + prompt=args.prompt, + session_id=args.session_id, + num_chunks=args.num_chunks, + ) + validate_session_result(result) + + print("Server metadata:", json.dumps(result["metadata"], sort_keys=True)) + for index, action in enumerate(result["actions"]): + print(format_action_summary(index, action)) + print("Reset status:", result["reset_status"]) + print("Session ID:", result["session_id"]) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/online_serving/dreamzero/run_server.sh b/examples/online_serving/dreamzero/run_server.sh new file mode 100755 index 00000000000..49785463db5 --- /dev/null +++ b/examples/online_serving/dreamzero/run_server.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +set -euo pipefail + +MODEL="${MODEL:-GEAR-Dreams/DreamZero-DROID}" +HOST="${HOST:-127.0.0.1}" +PORT="${PORT:-8000}" +CFG_PARALLEL_SIZE="${CFG_PARALLEL_SIZE:-2}" +SERVED_MODEL_NAME="${SERVED_MODEL_NAME:-dreamzero-droid}" + +args=( + serve + "$MODEL" + --omni + --host "$HOST" + --port "$PORT" + --served-model-name "$SERVED_MODEL_NAME" + --enforce-eager + --disable-log-stats +) + +if [[ -n "$CFG_PARALLEL_SIZE" ]]; then + args+=(--cfg-parallel-size "$CFG_PARALLEL_SIZE") +fi + +ATTENTION_BACKEND="${ATTENTION_BACKEND:-torch}" \ +DIFFUSION_ATTENTION_BACKEND="${DIFFUSION_ATTENTION_BACKEND:-TORCH_SDPA}" \ +vllm "${args[@]}" diff --git a/tests/diffusion/test_diffusion_scheduler.py b/tests/diffusion/test_diffusion_scheduler.py index ca28b26294e..9409fb2c993 100644 --- 
a/tests/diffusion/test_diffusion_scheduler.py +++ b/tests/diffusion/test_diffusion_scheduler.py @@ -563,6 +563,50 @@ def test_dummy_run_raises_on_output_error(self, mocker: MockerFixture) -> None: with pytest.raises(RuntimeError, match="Dummy run failed: boom"): engine._dummy_run() + def test_step_multi_request_reuses_multimodal_slice_logic(self) -> None: + engine = DiffusionEngine.__new__(DiffusionEngine) + engine.od_config = SimpleNamespace( + model_class_name="mock_model", + enable_cpu_offload=False, + ) + engine.pre_process_func = None + engine.post_process_func = None + engine.add_req_and_wait_for_response = Mock( + return_value=DiffusionOutput( + output={ + "video": ["frame-0", "frame-1"], + "audio": ["audio-0", "audio-1"], + "actions": torch.tensor([[1.0, 2.0], [3.0, 4.0]]), + } + ) + ) + + request = OmniDiffusionRequest( + prompts=["prompt-0", "prompt-1"], + sampling_params=OmniDiffusionSamplingParams( + num_inference_steps=1, + num_outputs_per_prompt=1, + ), + request_ids=["req-0", "req-1"], + ) + + with patch("vllm_omni.diffusion.diffusion_engine.supports_audio_output", return_value=False): + outputs = engine.step(request) + + assert len(outputs) == 2 + assert outputs[0].images == ["frame-0"] + assert outputs[1].images == ["frame-1"] + assert outputs[0].multimodal_output["audio"] == "audio-0" + assert outputs[1].multimodal_output["audio"] == "audio-1" + torch.testing.assert_close( + outputs[0].multimodal_output["actions"], + torch.tensor([1.0, 2.0]), + ) + torch.testing.assert_close( + outputs[1].multimodal_output["actions"], + torch.tensor([3.0, 4.0]), + ) + class TestStepScheduler: def setup_method(self) -> None: diff --git a/tests/diffusion/test_stage_diffusion_proc.py b/tests/diffusion/test_stage_diffusion_proc.py index 36ceb6cfc49..e2cb8087d59 100644 --- a/tests/diffusion/test_stage_diffusion_proc.py +++ b/tests/diffusion/test_stage_diffusion_proc.py @@ -9,6 +9,7 @@ import pytest +from vllm_omni.diffusion.data import OmniDiffusionConfig from 
vllm_omni.diffusion.stage_diffusion_proc import StageDiffusionProc from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -146,3 +147,42 @@ async def run_task(req_data): time_gap = elapsed_time - base_time assert time_gap > time_gap_std - eps and time_gap < time_gap_std + eps base_time = elapsed_time + + +def test_enrich_config_preserves_explicit_model_class_name(monkeypatch): + monkeypatch.setattr( + "vllm.transformers_utils.config.get_hf_file_to_dict", + lambda path, _model: None if path == "model_index.json" else {"model_type": "vla", "architectures": ["VLA"]}, + ) + + od_config = OmniDiffusionConfig( + model="GEAR-Dreams/DreamZero-DROID", + model_class_name="DreamZeroPipeline", + ) + proc = StageDiffusionProc(od_config.model, od_config) + + proc._enrich_config() + + assert od_config.model_class_name == "DreamZeroPipeline" + + +def test_enrich_config_keeps_omnivoice_architecture_behavior(monkeypatch): + monkeypatch.setattr( + "vllm.transformers_utils.config.get_hf_file_to_dict", + lambda path, _model: None + if path == "model_index.json" + else { + "model_type": "omnivoice", + "architectures": ["OmniVoice"], + }, + ) + + od_config = OmniDiffusionConfig( + model="k2-fsa/OmniVoice", + model_class_name="OmniVoicePipeline", + ) + proc = StageDiffusionProc(od_config.model, od_config) + + proc._enrich_config() + + assert od_config.model_class_name == "OmniVoice" diff --git a/tests/dreamzero/test_utils.py b/tests/dreamzero/test_utils.py new file mode 100644 index 00000000000..c399bd70b1b --- /dev/null +++ b/tests/dreamzero/test_utils.py @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm_omni.diffusion.models.dreamzero.utils import ( + DEFAULT_CFG_SCALE, + DEFAULT_EMBODIMENT_NAME_TO_ID, + DEFAULT_NEGATIVE_PROMPT, + DEFAULT_NUM_INFERENCE_STEPS, + DEFAULT_SEED, + DEFAULT_SIGMA_SHIFT, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def 
test_dreamzero_default_constants_match_source_baseline(): + assert DEFAULT_NUM_INFERENCE_STEPS == 16 + assert DEFAULT_CFG_SCALE == 5.0 + assert DEFAULT_SIGMA_SHIFT == 5.0 + assert DEFAULT_SEED == 1140 + assert "worst quality" in DEFAULT_NEGATIVE_PROMPT + assert DEFAULT_EMBODIMENT_NAME_TO_ID["oxe_droid"] == 17 diff --git a/tests/dreamzero/upstream/openpi_test_client_ar.py b/tests/dreamzero/upstream/openpi_test_client_ar.py new file mode 100644 index 00000000000..d68af43dd54 --- /dev/null +++ b/tests/dreamzero/upstream/openpi_test_client_ar.py @@ -0,0 +1,366 @@ +#!/usr/bin/env python3 +""" +Copied from the DreamZero repository's `test_client_AR.py`: +https://github.com/dreamzero0/dreamzero/blob/main/test_client_AR.py + +Kept here for end-to-end compatibility / parity testing against the vLLM +OpenPI server. + +Test client for AR_droid policy server using roboarena interface. + +Sends real video frames from debug_image/ directory instead of zero dummy images. + +Frame schedule (matching debug_inference.py): + - Step 0 (initial): send frame [0] (1 frame, H W 3) + - Step 1: send frames [0, 7, 15, 23] (4 frames, 4 H W 3) + - Step 2: send frames [24, 31, 39, 47] (4 frames) + - Step 3: send frames [48, 55, 63, 71] (4 frames) + - ... 
+ +Expected server configuration: + - image_resolution: (180, 320) + - n_external_cameras: 2 + - needs_wrist_camera: True + - action_space: "joint_position" + +Usage: + # Start server with roboarena interface: + torchrun --nproc_per_node=8 socket_test_optimized_AR.py --port 8000 + + # Run against the original DreamZero websocket server (default path): + python test_client_AR.py --host --port 8000 + + # Run against the vLLM OpenPI server: + python test_client_AR.py --host --port 8000 --path /v1/realtime/robot/openpi + + # Use zero images instead of real video (old behavior): + python test_client_AR.py --host --port 8000 --use-zero-images +""" + +import argparse +import logging +import os +import sys +import time +import uuid +from pathlib import Path + +DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() +if DREAMZERO_REPO.exists() and str(DREAMZERO_REPO) not in sys.path: + sys.path.insert(0, str(DREAMZERO_REPO)) + +import cv2 +import eval_utils.policy_server as policy_server +import numpy as np +from eval_utils.policy_client import WebsocketClientPolicy +from openpi_client import msgpack_numpy + +VIDEO_DIR = os.environ.get("DREAMZERO_VIDEO_DIR", str(DREAMZERO_REPO / "debug_image")) + +# roboarena key -> video filename +CAMERA_FILES = { + "observation/exterior_image_0_left": "exterior_image_1_left.mp4", + "observation/exterior_image_1_left": "exterior_image_2_left.mp4", + "observation/wrist_image_left": "wrist_image_left.mp4", +} + +# Frame schedule constants (matching debug_inference.py) +RELATIVE_OFFSETS = [-23, -16, -8, 0] +ACTION_HORIZON = 24 +DEFAULT_WEBSOCKET_PATH = "" + + +class OpenPIWebsocketClientPolicy(WebsocketClientPolicy): + """DreamZero websocket client with a configurable path suffix. + + The original DreamZero client connects to ``ws://host:port``. + vLLM serves the compatible robot policy endpoint at + ``/v1/realtime/robot/openpi`` when ``path`` is set accordingly. 
+ """ + + def __init__( + self, + host: str = "0.0.0.0", + port: int = 8000, + path: str = DEFAULT_WEBSOCKET_PATH, + ) -> None: + self._uri = f"ws://{host}:{port}{path}" + self._packer = msgpack_numpy.Packer() + self._ws, self._server_metadata = self._wait_for_server() + + +def load_all_frames(video_path: str) -> np.ndarray: + """Load all frames from a video file. Returns (N, H, W, 3) uint8 array (RGB).""" + cap = cv2.VideoCapture(video_path) + frames = [] + while True: + ret, frame = cap.read() + if not ret: + break + frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + cap.release() + if not frames: + raise RuntimeError(f"No frames loaded from {video_path}") + return np.stack(frames, axis=0) + + +def load_camera_frames() -> dict[str, np.ndarray]: + """Load all video frames for each camera from the debug_image/ directory. + + Returns: + Dict mapping roboarena camera keys to (N, H, W, 3) uint8 arrays. + """ + camera_frames: dict[str, np.ndarray] = {} + for cam_key, fname in CAMERA_FILES.items(): + path = os.path.join(VIDEO_DIR, fname) + camera_frames[cam_key] = load_all_frames(path) + logging.info(f"Loaded {cam_key}: {camera_frames[cam_key].shape}") + return camera_frames + + +def build_frame_schedule(total_frames: int, num_chunks: int) -> list[list[int]]: + """Build the frame index schedule for multi-frame chunks. + + Returns a list of frame-index lists. Each inner list has 4 indices. + """ + chunks: list[list[int]] = [] + current_frame = 23 # first anchor frame + for _ in range(num_chunks): + indices = [max(current_frame + off, 0) for off in RELATIVE_OFFSETS] + if indices[-1] >= total_frames: + logging.info(f"Frame {indices[-1]} >= {total_frames}, stopping at {len(chunks)} chunks") + break + chunks.append(indices) + current_frame += ACTION_HORIZON + return chunks + + +def _make_obs_from_video( + camera_frames: dict[str, np.ndarray], + frame_indices: list[int], + prompt: str, + session_id: str, +) -> dict: + """Build an observation dict from real video frames. 
+ + For 1 frame: each image key is (H, W, 3). + For 4 frames: each image key is (4, H, W, 3). + """ + obs: dict = {} + for cam_key, all_frames in camera_frames.items(): + selected = all_frames[frame_indices] # (T, H, W, 3) + if len(frame_indices) == 1: + selected = selected[0] # (H, W, 3) + obs[cam_key] = selected + + obs["observation/joint_position"] = np.zeros(7, dtype=np.float32) + obs["observation/cartesian_position"] = np.zeros(6, dtype=np.float32) + obs["observation/gripper_position"] = np.zeros(1, dtype=np.float32) + obs["prompt"] = prompt + obs["session_id"] = session_id + return obs + + +def _make_zero_observation( + server_config: policy_server.PolicyServerConfig, + prompt: str = "pick up the object", + session_id: str | None = None, +) -> dict: + """Create a dummy observation matching AR_droid expectations. + + AR_droid expects: + - 2 external cameras (exterior_image_0_left, exterior_image_1_left) + - 1 wrist camera (wrist_image_left) + - Image resolution: 180x320 (H x W) + - joint_position: 7 DoF + - gripper_position: 1 DoF + """ + obs = {} + + # Determine image resolution + if server_config.image_resolution is not None: + h, w = server_config.image_resolution + else: + # Default for AR_droid + h, w = 180, 320 + + # External cameras (0-indexed in roboarena) + for i in range(server_config.n_external_cameras): + obs[f"observation/exterior_image_{i}_left"] = np.zeros((h, w, 3), dtype=np.uint8) + if server_config.needs_stereo_camera: + obs[f"observation/exterior_image_{i}_right"] = np.zeros((h, w, 3), dtype=np.uint8) + + # Wrist camera + if server_config.needs_wrist_camera: + obs["observation/wrist_image_left"] = np.zeros((h, w, 3), dtype=np.uint8) + if server_config.needs_stereo_camera: + obs["observation/wrist_image_right"] = np.zeros((h, w, 3), dtype=np.uint8) + + # Session ID - should be passed in to ensure consistency within a session + if server_config.needs_session_id: + import uuid + + # Generate unique session ID if not provided + obs["session_id"] 
= session_id if session_id else str(uuid.uuid4()) + + # State observations (AR_droid: 7 DoF arm + 1 gripper) + obs["observation/joint_position"] = np.zeros(7, dtype=np.float32) + obs["observation/cartesian_position"] = np.zeros(6, dtype=np.float32) + obs["observation/gripper_position"] = np.zeros(1, dtype=np.float32) + + # Language prompt + obs["prompt"] = prompt + + return obs + + +def test_ar_droid_policy_server( + host: str = "localhost", + port: int = 8000, + path: str = DEFAULT_WEBSOCKET_PATH, + num_chunks: int = 15, + prompt: str = "Move the pan forward and use the brush in the middle of the plates to brush the inside of the pan", + use_zero_images: bool = False, +): + """Test the AR_droid policy server with roboarena interface. + + When use_zero_images is False (default), loads real video frames from + debug_image/ and follows the frame schedule from debug_inference.py. + """ + logging.info(f"Connecting to AR_droid server at ws://{host}:{port}{path} ...") + + client = OpenPIWebsocketClientPolicy(host=host, port=port, path=path) + + # Validate server metadata + metadata = client.get_server_metadata() + logging.info(f"Server metadata: {metadata}") + assert isinstance(metadata, dict), "Metadata should be a dict" + + try: + server_config = policy_server.PolicyServerConfig(**metadata) + except Exception as e: + logging.error(f"Error parsing metadata: {e}") + raise e + + # Validate expected AR_droid configuration + logging.info(f"Server config: {server_config}") + assert server_config.n_external_cameras == 2, f"Expected 2 external cameras, got {server_config.n_external_cameras}" + assert server_config.needs_wrist_camera, "Expected wrist camera to be enabled" + assert server_config.action_space == "joint_position", ( + f"Expected joint_position action space, got {server_config.action_space}" + ) + + logging.info("Server configuration validated for AR_droid") + + # Generate unique session ID for this test run + session_id = str(uuid.uuid4()) + logging.info(f"Session 
ID: {session_id}") + + # ── Zero-image fallback mode ────────────────────────────────────── + if use_zero_images: + logging.info("Using ZERO dummy images (legacy mode)") + for i in range(num_chunks): + obs = _make_zero_observation(server_config, prompt=prompt, session_id=session_id) + logging.info(f"Inference {i + 1}/{num_chunks}: prompt='{prompt}'") + t0 = time.time() + actions = client.infer(obs) + dt = time.time() - t0 + _log_action(actions, dt) + + logging.info("Sending reset...") + client.reset({}) + logging.info("Done (zero-image mode).") + return + + # ── Real video frame mode ───────────────────────────────────────── + logging.info("Loading real video frames from debug_image/ directory") + camera_frames = load_camera_frames() + + total_frames = min(v.shape[0] for v in camera_frames.values()) + logging.info(f"Total frames available: {total_frames}") + + # Build frame schedule + chunks = build_frame_schedule(total_frames, num_chunks) + + logging.info("Frame schedule:") + logging.info(" Initial: [0]") + for i, indices in enumerate(chunks): + logging.info(f" Chunk {i}: {indices}") + + # Step 0: initial single frame + logging.info("=== Initial: frame [0] ===") + obs = _make_obs_from_video(camera_frames, [0], prompt, session_id) + t0 = time.time() + actions = client.infer(obs) + dt = time.time() - t0 + _log_action(actions, dt) + + # Subsequent chunks: send 4 frames at a time + for chunk_idx, frame_indices in enumerate(chunks): + logging.info(f"=== Chunk {chunk_idx}: frames {frame_indices} ===") + obs = _make_obs_from_video(camera_frames, frame_indices, prompt, session_id) + t0 = time.time() + actions = client.infer(obs) + dt = time.time() - t0 + _log_action(actions, dt) + + # Reset triggers video save on the server + logging.info("Sending reset to save video...") + client.reset({}) + + logging.info("Done.") + + +def _log_action(actions: np.ndarray, dt: float) -> None: + """Pretty-print action shape, range, and timing.""" + assert isinstance(actions, np.ndarray), 
f"Expected numpy array, got {type(actions)}" + assert actions.ndim == 2, f"Expected 2D array, got shape {actions.shape}" + assert actions.shape[-1] == 8, f"Expected 8 action dims (7 joints + 1 gripper), got {actions.shape[-1]}" + logging.info(f" Action shape: {actions.shape}, range: [{actions.min():.4f}, {actions.max():.4f}], time: {dt:.2f}s") + + +def main(): + parser = argparse.ArgumentParser(description="Test AR_droid policy server with real video frames from debug_image/") + parser.add_argument("--host", default="localhost", help="Server hostname") + parser.add_argument("--port", type=int, default=8000, help="Server port") + parser.add_argument( + "--path", + default=DEFAULT_WEBSOCKET_PATH, + help="WebSocket path suffix (default: empty string for the original DreamZero server)", + ) + parser.add_argument( + "--num-chunks", + type=int, + default=15, + help="Number of 4-frame chunks to send after the initial frame (default: 15)", + ) + parser.add_argument( + "--prompt", + default="Move the pan forward and use the brush in the middle of the plates to brush the inside of the pan", + help="Language prompt for the policy", + ) + parser.add_argument( + "--use-zero-images", + action="store_true", + help="Use zero dummy images instead of real video frames (legacy mode)", + ) + + args = parser.parse_args() + + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + ) + + test_ar_droid_policy_server( + host=args.host, + port=args.port, + path=args.path, + num_chunks=args.num_chunks, + prompt=args.prompt, + use_zero_images=args.use_zero_images, + ) + + +if __name__ == "__main__": + main() diff --git a/tests/dreamzero/upstream/test_client_ar_path_parity.py b/tests/dreamzero/upstream/test_client_ar_path_parity.py new file mode 100644 index 00000000000..0705b9e88ed --- /dev/null +++ b/tests/dreamzero/upstream/test_client_ar_path_parity.py @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project + +"""Checks that the copied DreamZero client keeps identical logic across paths. + +The only difference between talking to the upstream DreamZero websocket server +and the vLLM OpenPI websocket server should be the websocket URI suffix: + +- upstream DreamZero: ``ws://host:port`` +- vLLM OpenPI: ``ws://host:port/v1/realtime/robot/openpi`` + +This file verifies that `tests/dreamzero/upstream/openpi_test_client_ar.py` preserves the same +observation / infer / reset flow for both cases. +""" + +from __future__ import annotations + +import importlib.util +import os +import sys +import uuid +from pathlib import Path + +import numpy as np +import pytest + +DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() +CLIENT_SCRIPT = Path(__file__).resolve().with_name("openpi_test_client_ar.py") + +pytestmark = pytest.mark.skipif( + not DREAMZERO_REPO.exists(), + reason="DreamZero source repo is required", +) + +if str(DREAMZERO_REPO) not in sys.path: + sys.path.insert(0, str(DREAMZERO_REPO)) + + +def _load_client_module(): + spec = importlib.util.spec_from_file_location( + "dreamzero_test_client_ar_module", + CLIENT_SCRIPT, + ) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + try: + spec.loader.exec_module(module) + except ModuleNotFoundError as exc: + pytest.skip(f"DreamZero client dependency is missing: {exc.name}") + return module + + +def _snapshot_obs(obs: dict) -> dict: + snapshot = {} + for key, value in obs.items(): + if isinstance(value, np.ndarray): + snapshot[key] = value.copy() + else: + snapshot[key] = value + return snapshot + + +def _assert_obs_sequence_equal(actual: list[dict], expected: list[dict]) -> None: + assert len(actual) == len(expected) + for actual_obs, expected_obs in zip(actual, expected, strict=True): + assert set(actual_obs) == set(expected_obs) + for key in actual_obs: + actual_value = actual_obs[key] + expected_value = 
expected_obs[key] + if isinstance(actual_value, np.ndarray): + assert isinstance(expected_value, np.ndarray) + assert actual_value.dtype == expected_value.dtype + assert actual_value.shape == expected_value.shape + assert np.array_equal(actual_value, expected_value) + else: + assert actual_value == expected_value + + +def test_websocket_uri_differs_only_by_path(monkeypatch) -> None: + client_mod = _load_client_module() + monkeypatch.setattr( + client_mod.OpenPIWebsocketClientPolicy, + "_wait_for_server", + lambda self: (object(), {}), + raising=False, + ) + + upstream = client_mod.OpenPIWebsocketClientPolicy( + host="127.0.0.1", + port=8000, + path="", + ) + vllm = client_mod.OpenPIWebsocketClientPolicy( + host="127.0.0.1", + port=8000, + path="/v1/realtime/robot/openpi", + ) + + assert upstream._uri == "ws://127.0.0.1:8000" + assert vllm._uri == "ws://127.0.0.1:8000/v1/realtime/robot/openpi" + + +def test_zero_image_client_flow_is_identical_across_server_paths(monkeypatch) -> None: + client_mod = _load_client_module() + fixed_session_id = uuid.UUID("12345678-1234-5678-1234-567812345678") + monkeypatch.setattr(uuid, "uuid4", lambda: fixed_session_id) + monkeypatch.setattr(client_mod, "_log_action", lambda actions, dt: None) + + instances = [] + + class FakeClient: + def __init__(self, host: str, port: int, path: str) -> None: + self.host = host + self.port = port + self.path = path + self.metadata_calls = 0 + self.infer_obs = [] + self.reset_payloads = [] + instances.append(self) + + def get_server_metadata(self) -> dict: + self.metadata_calls += 1 + return { + "image_resolution": [180, 320], + "n_external_cameras": 2, + "needs_wrist_camera": True, + "needs_stereo_camera": False, + "needs_session_id": True, + "action_space": "joint_position", + } + + def infer(self, obs: dict) -> np.ndarray: + self.infer_obs.append(_snapshot_obs(obs)) + return np.zeros((24, 8), dtype=np.float32) + + def reset(self, payload: dict) -> str: + self.reset_payloads.append(dict(payload)) 
+ return "reset successful" + + monkeypatch.setattr(client_mod, "OpenPIWebsocketClientPolicy", FakeClient) + + client_mod.test_ar_droid_policy_server( + host="127.0.0.1", + port=8000, + path="", + num_chunks=2, + prompt="pick up the object", + use_zero_images=True, + ) + client_mod.test_ar_droid_policy_server( + host="127.0.0.1", + port=8000, + path="/v1/realtime/robot/openpi", + num_chunks=2, + prompt="pick up the object", + use_zero_images=True, + ) + + assert len(instances) == 2 + upstream, vllm = instances + + assert upstream.path == "" + assert vllm.path == "/v1/realtime/robot/openpi" + assert upstream.metadata_calls == 1 + assert vllm.metadata_calls == 1 + assert len(upstream.infer_obs) == 2 + assert len(vllm.infer_obs) == 2 + _assert_obs_sequence_equal(upstream.infer_obs, vllm.infer_obs) + assert upstream.reset_payloads == [{}] + assert vllm.reset_payloads == [{}] diff --git a/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py b/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py new file mode 100644 index 00000000000..065ff144d00 --- /dev/null +++ b/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py @@ -0,0 +1,345 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Formal OpenPI end-to-end parity: upstream DreamZero server vs `vllm serve`. + +This test uses DreamZero's own client-side observation builders from +`~/code/dreamzero/test_client_AR.py`, and client-side websocket protocol from +`~/code/dreamzero/eval_utils/policy_client.py`. + +The only client-side adaptation for vLLM is the websocket path: +DreamZero's upstream server serves at `/`, while vLLM serves OpenPI at +`/v1/realtime/robot/openpi`. 
+ +Current scope for this test: +- non-TP (`nproc_per_node=1` on upstream, single-GPU or cfg-parallel `vllm serve`) +- non-`torch.compile` (upstream launched through + `upstream_socket_server_no_compile.py`, vLLM with `--enforce-eager`) +- non-DiT-cache / non-skip-schedule (`NUM_DIT_STEPS=16`) + +Serving contract locked by this test: +- upstream DreamZero still boots from the local checkpoint directory +- vLLM boots from the official DreamZero HF repo name (`GEAR-Dreams/DreamZero-DROID`) + rather than a prepared local bundle path +""" + +from __future__ import annotations + +import os +import shutil +import socket +import subprocess +import sys +import time +from pathlib import Path + +import numpy as np +import pytest +import torch + +msgpack_numpy = pytest.importorskip("openpi_client.msgpack_numpy") + +DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() +if str(DREAMZERO_REPO) not in sys.path: + sys.path.insert(0, str(DREAMZERO_REPO)) + +try: + import test_client_AR as dreamzero_client + from eval_utils.policy_client import WebsocketClientPolicy +except Exception: # pragma: no cover - guarded by pytest skip below + dreamzero_client = None + WebsocketClientPolicy = None + +CHECKPOINT_DIR = DREAMZERO_REPO / "checkpoints" / "dreamzero" +VLLM_MODEL = os.environ.get("VLLM_DREAMZERO_MODEL", "GEAR-Dreams/DreamZero-DROID") +SERVICE_READY_TIMEOUT_S = int(os.environ.get("OPENPI_SERVICE_READY_TIMEOUT_S", "900")) +PROMPT = "Move the pan forward and use the brush in the middle of the plates to brush the inside of the pan" +SESSION_ID = "openpi-e2e-parity-session" + +pytestmark = [ + pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU required"), + pytest.mark.skipif( + dreamzero_client is None or WebsocketClientPolicy is None, + reason="DreamZero client modules are required on PYTHONPATH", + ), + pytest.mark.skipif(not DREAMZERO_REPO.exists(), reason="DreamZero source repo is required at ~/code/dreamzero"), + 
pytest.mark.skipif(not CHECKPOINT_DIR.exists(), reason="DreamZero local checkpoint is required"), +] + + +class OpenPIWebsocketClientPolicy(WebsocketClientPolicy): + """DreamZero client protocol with an OpenPI websocket path suffix.""" + + def __init__( + self, + host: str = "127.0.0.1", + port: int = 8000, + path: str = "/v1/realtime/robot/openpi", + ) -> None: + self._uri = f"ws://{host}:{port}{path}" + self._packer = msgpack_numpy.Packer() + self._ws, self._server_metadata = self._wait_for_server() + + +def _find_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("127.0.0.1", 0)) + return int(sock.getsockname()[1]) + + +def _vllm_executable() -> str: + fallback = Path(sys.executable).with_name("vllm") + if fallback.exists(): + return str(fallback) + exe = shutil.which("vllm") + if exe: + return exe + raise FileNotFoundError("Unable to locate `vllm` executable in current environment.") + + +def _cfg_parallel_size() -> int: + return int(os.environ.get("OPENPI_E2E_CFG_PARALLEL_SIZE", "1")) + + +def _pick_test_gpus() -> list[str]: + override = os.environ.get("OPENPI_E2E_GPUS") or os.environ.get("OPENPI_E2E_GPU") + if override is not None: + gpus = [part.strip() for part in override.split(",") if part.strip()] + if not gpus: + raise ValueError("OPENPI_E2E_GPUS is set but empty.") + return gpus + + cfg_parallel_size = _cfg_parallel_size() + + query = subprocess.check_output( + [ + "nvidia-smi", + "--query-gpu=index,memory.used", + "--format=csv,noheader,nounits", + ], + text=True, + ) + gpu_rows = [] + for line in query.strip().splitlines(): + gpu_index, used_mb = [part.strip() for part in line.split(",", maxsplit=1)] + gpu_rows.append((int(used_mb), gpu_index)) + gpu_rows.sort() + gpus = [gpu_index for _, gpu_index in gpu_rows[: max(cfg_parallel_size, 1)]] + if len(gpus) < cfg_parallel_size: + raise RuntimeError( + f"Need {cfg_parallel_size} GPUs for cfg_parallel_size={cfg_parallel_size}, " + f"but found only 
{len(gpus)} candidates." + ) + return gpus + + +def _torchrun_argv(script: str, port: int) -> list[str]: + return [ + sys.executable, + "-m", + "torch.distributed.run", + "--standalone", + "--nproc_per_node=1", + script, + "--port", + str(port), + "--model_path", + str(CHECKPOINT_DIR), + ] + + +def _run_upstream_service(port: int, log_path: Path) -> subprocess.Popen[str]: + env = os.environ.copy() + env.setdefault("PYTHONPATH", "") + env["PYTHONPATH"] = f"{Path.cwd()}:{DREAMZERO_REPO}:{env['PYTHONPATH']}".rstrip(":") + env["CUDA_VISIBLE_DEVICES"] = _pick_test_gpus()[0] + env.setdefault("NO_ALBUMENTATIONS_UPDATE", "1") + env.setdefault("ENABLE_TENSORRT", "false") + env["ENABLE_DIT_CACHE"] = "false" + env["NUM_DIT_STEPS"] = "16" + env["DYNAMIC_CACHE_SCHEDULE"] = "false" + argv = _torchrun_argv( + str(Path("tests/dreamzero/upstream/upstream_socket_server_no_compile.py")), + port, + ) + log_file = log_path.open("w") + proc = subprocess.Popen( + argv, + stdout=log_file, + stderr=subprocess.STDOUT, + text=True, + cwd=str(Path.cwd()), + env=env, + ) + proc._codex_log_file = log_file # type: ignore[attr-defined] + return proc + + +def _run_vllm_service(port: int, log_path: Path) -> subprocess.Popen[str]: + env = os.environ.copy() + gpus = _pick_test_gpus() + cfg_parallel_size = _cfg_parallel_size() + if cfg_parallel_size > len(gpus): + raise RuntimeError( + f"cfg_parallel_size={cfg_parallel_size} requires at least {cfg_parallel_size} GPUs, " + f"but only got {gpus}." 
+ ) + env["CUDA_VISIBLE_DEVICES"] = ",".join(gpus[:cfg_parallel_size]) + env.setdefault("ATTENTION_BACKEND", "torch") + env.setdefault("DIFFUSION_ATTENTION_BACKEND", "TORCH_SDPA") + env.setdefault("MASTER_PORT", str(_find_free_port())) + argv = [ + _vllm_executable(), + "serve", + VLLM_MODEL, + "--omni", + "--host", + "127.0.0.1", + "--port", + str(port), + "--served-model-name", + "dreamzero-droid", + "--enforce-eager", + ] + if cfg_parallel_size > 1: + argv.extend(["--cfg-parallel-size", str(cfg_parallel_size)]) + log_file = log_path.open("w") + proc = subprocess.Popen( + argv, + stdout=log_file, + stderr=subprocess.STDOUT, + text=True, + env=env, + cwd=str(Path.cwd()), + ) + proc._codex_log_file = log_file # type: ignore[attr-defined] + return proc + + +def _stop_process(proc: subprocess.Popen[str]) -> None: + log_file = getattr(proc, "_codex_log_file", None) + if proc.poll() is None: + proc.terminate() + try: + proc.wait(timeout=30) + except subprocess.TimeoutExpired: # pragma: no cover - cleanup path + proc.kill() + proc.wait(timeout=10) + if log_file is not None: + log_file.close() + + +def _build_obs_sequence() -> tuple[dict, dict]: + camera_frames = dreamzero_client.load_camera_frames() + chunks = dreamzero_client.build_frame_schedule( + min(v.shape[0] for v in camera_frames.values()), + 1, + ) + obs0 = dreamzero_client._make_obs_from_video(camera_frames, [0], PROMPT, SESSION_ID) + obs1 = dreamzero_client._make_obs_from_video(camera_frames, chunks[0], PROMPT, SESSION_ID) + return obs0, obs1 + + +def _wait_for_client_ready(client_factory, timeout_s: float, proc=None, log_path: Path | None = None): + deadline = time.time() + timeout_s + last_err: Exception | None = None + while time.time() < deadline: + if proc is not None and proc.poll() is not None: + details = "" + if log_path is not None and log_path.exists(): + details = log_path.read_text(errors="replace")[-8000:] + raise RuntimeError(f"Service exited before becoming ready with code 
{proc.returncode}.\n{details}") + try: + return client_factory() + except Exception as exc: # pragma: no cover - retry path + last_err = exc + time.sleep(1) + raise TimeoutError(f"Timed out waiting for websocket service: {last_err}") + + +def _collect_outputs_with_client(client) -> tuple[dict, list[np.ndarray]]: + metadata = client.get_server_metadata() + obs0, obs1 = _build_obs_sequence() + outputs = [ + client.infer(dict(obs0)), + client.infer(dict(obs1)), + ] + assert client.reset({}) == "reset successful" + outputs.append(client.infer(dict(obs0))) + client._ws.close() + return metadata, outputs + + +def _normalize_metadata(metadata: dict) -> dict: + normalized = dict(metadata) + if isinstance(normalized.get("image_resolution"), tuple): + normalized["image_resolution"] = list(normalized["image_resolution"]) + return normalized + + +def _assert_logs_clean(log_path: Path) -> None: + text = log_path.read_text(errors="replace") + if "SignalException: Process" in text and "got signal: 15" in text: + text = text.split("Traceback (most recent call last):", 1)[0] + assert "Traceback" not in text, text + assert "RuntimeError:" not in text, text + + +def _assert_upstream_log_matches_vllm_baseline(log_path: Path) -> None: + text = log_path.read_text(errors="replace") + assert "DIT Compute Steps 8 steps" not in text, text + assert "DIT Compute Steps 16 steps" in text, text + + +def test_openpi_service_matches_upstream_server_noncompile(tmp_path: Path) -> None: + expected_metadata = { + "image_resolution": [180, 320], + "n_external_cameras": 2, + "needs_wrist_camera": True, + "needs_stereo_camera": False, + "needs_session_id": True, + "action_space": "joint_position", + } + + upstream_port = _find_free_port() + upstream_log = tmp_path / "dreamzero_upstream.log" + upstream_proc = _run_upstream_service(upstream_port, upstream_log) + try: + upstream_client = _wait_for_client_ready( + lambda: WebsocketClientPolicy(host="127.0.0.1", port=upstream_port), + 
timeout_s=SERVICE_READY_TIMEOUT_S, + proc=upstream_proc, + log_path=upstream_log, + ) + upstream_metadata, upstream_outputs = _collect_outputs_with_client(upstream_client) + finally: + _stop_process(upstream_proc) + _assert_logs_clean(upstream_log) + _assert_upstream_log_matches_vllm_baseline(upstream_log) + + vllm_port = _find_free_port() + vllm_log = tmp_path / "vllm_openpi.log" + vllm_proc = _run_vllm_service(vllm_port, vllm_log) + try: + vllm_client = _wait_for_client_ready( + lambda: OpenPIWebsocketClientPolicy(host="127.0.0.1", port=vllm_port), + timeout_s=SERVICE_READY_TIMEOUT_S, + proc=vllm_proc, + log_path=vllm_log, + ) + vllm_metadata, vllm_outputs = _collect_outputs_with_client(vllm_client) + finally: + _stop_process(vllm_proc) + _assert_logs_clean(vllm_log) + + assert _normalize_metadata(upstream_metadata) == expected_metadata + assert _normalize_metadata(vllm_metadata) == expected_metadata + + for idx, (actual, expected) in enumerate(zip(vllm_outputs, upstream_outputs, strict=True)): + np.testing.assert_allclose( + actual, + expected, + rtol=0.0, + atol=0.0, + err_msg=f"OpenPI step {idx} output mismatch", + ) diff --git a/tests/dreamzero/upstream/test_roboarena_transform_source_parity.py b/tests/dreamzero/upstream/test_roboarena_transform_source_parity.py new file mode 100644 index 00000000000..6a97a882643 --- /dev/null +++ b/tests/dreamzero/upstream/test_roboarena_transform_source_parity.py @@ -0,0 +1,121 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Source parity checks for the RoboArena → DreamZero input path. 
+ +This test targets the non-model part of the OpenPI DreamZero chain: + +- `socket_test_optimized_AR.py:ARDroidRoboarenaPolicy._convert_observation()` +- upstream `eval_transform.apply()` +- local `RoboArenaTransform.transform_input()` +- local prompt tokenization + state normalization path used by + `DreamZeroPipeline.forward()` + +The goal is to make sure the local serving pre-processing feeds the same +stitched video, prompt tokens, and normalized state into the model as the +upstream DreamZero source server. +""" + +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path + +import numpy as np +import pytest +import torch + +from vllm_omni.diffusion.models.dreamzero.pipeline_dreamzero import DreamZeroPipeline +from vllm_omni.diffusion.models.dreamzero.transform.roboarena import ( + RoboArenaTransform, +) + +instantiate = pytest.importorskip("hydra.utils").instantiate +OmegaConf = pytest.importorskip("omegaconf").OmegaConf +AutoTokenizer = pytest.importorskip("transformers").AutoTokenizer + +DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() +CHECKPOINT_DIR = DREAMZERO_REPO / "checkpoints" / "dreamzero" +PROMPT = "Move the pan forward and use the brush in the middle of the plates to brush the inside of the pan" +SESSION_ID = "roboarena-transform-source-parity" + +pytestmark = [ + pytest.mark.skipif(not DREAMZERO_REPO.exists(), reason="DreamZero source repo is required at ~/code/dreamzero"), + pytest.mark.skipif(not CHECKPOINT_DIR.exists(), reason="DreamZero local checkpoint is required"), +] + + +def _load_source_normalized_input(): + if str(DREAMZERO_REPO) not in sys.path: + sys.path.insert(0, str(DREAMZERO_REPO)) + + import test_client_AR as dreamzero_client + from groot.vla.data.schema import DatasetMetadata, EmbodimentTag + from groot.vla.model.n1_5.sim_policy import unsqueeze_dict_values + from socket_test_optimized_AR import ARDroidRoboarenaPolicy + + class DummyPolicy: + pass 
+ + camera_frames = dreamzero_client.load_camera_frames() + obs0 = dreamzero_client._make_obs_from_video(camera_frames, [0], PROMPT, SESSION_ID) + + adapter = ARDroidRoboarenaPolicy(groot_policy=DummyPolicy(), signal_group=None) + converted = unsqueeze_dict_values(adapter._convert_observation(dict(obs0))) + + train_cfg = OmegaConf.load(CHECKPOINT_DIR / "experiment_cfg" / "conf.yaml") + with open(CHECKPOINT_DIR / "experiment_cfg" / "metadata.json") as f: + metadatas = json.load(f) + + metadata = DatasetMetadata.model_validate(metadatas[EmbodimentTag.OXE_DROID.value]) + eval_transform = instantiate(train_cfg.transforms[EmbodimentTag.OXE_DROID.value]) + eval_transform.set_metadata(metadata) + eval_transform.eval() + normalized = eval_transform.apply(dict(converted)) + + return obs0, metadatas, normalized + + +def test_roboarena_transform_matches_source_video_prompt_and_state(): + obs0, metadatas, source_normalized = _load_source_normalized_input() + + local_transform = RoboArenaTransform() + local_unified = local_transform.transform_input(dict(obs0)) + + source_images = source_normalized["images"].cpu().numpy() + if source_images.ndim == 5 and source_images.shape[0] == 1: + source_images = source_images[0] + assert np.array_equal(local_unified["images"], source_images) + + tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl") + local_text = tokenizer( + local_unified["prompt"], + max_length=512, + padding="max_length", + truncation=True, + return_tensors="pt", + add_special_tokens=True, + ) + assert torch.equal(local_text["input_ids"], source_normalized["text"].cpu()) + assert torch.equal( + local_text["attention_mask"], + source_normalized["text_attention_mask"].cpu(), + ) + + pipe = DreamZeroPipeline.__new__(DreamZeroPipeline) + pipe.state_norm_stats = DreamZeroPipeline._parse_state_norm_stats(metadatas) + + raw_state = np.asarray(local_unified["state"], dtype=np.float64) + padded = np.zeros(64, dtype=np.float64) + padded[: len(raw_state)] = raw_state + 
local_state = torch.from_numpy(padded).reshape(1, 1, 64).to(dtype=torch.float32) + local_state = DreamZeroPipeline._normalize_state(pipe, local_state, "oxe_droid") + + torch.testing.assert_close( + local_state, + source_normalized["state"].float(), + atol=1e-7, + rtol=0.0, + ) diff --git a/tests/dreamzero/upstream/test_video_preprocess_source_parity.py b/tests/dreamzero/upstream/test_video_preprocess_source_parity.py new file mode 100644 index 00000000000..69d76d1e240 --- /dev/null +++ b/tests/dreamzero/upstream/test_video_preprocess_source_parity.py @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Parity test for DreamZero video preprocessing order. + +The upstream eager path in +`groot/vla/model/dreamzero/action_head/wan_flow_matching_action_tf.py:952-966` +casts the input video to `bfloat16` *before* applying `normalize_video` +(`x * 2 - 1`). That cast order matters on CUDA and must be preserved for +end-to-end parity. 
+""" + +from __future__ import annotations + +import os +import sys +from pathlib import Path + +import pytest +import torch + +from vllm_omni.diffusion.models.dreamzero.pipeline_dreamzero import ( + DreamZeroPipeline, +) +from vllm_omni.diffusion.models.dreamzero.transform.roboarena import ( + RoboArenaTransform, +) + +DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() +PROMPT = "Move the pan forward and use the brush in the middle of the plates to brush the inside of the pan" +SESSION_ID = "video-preprocess-source-parity" + +pytestmark = [ + pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU required"), + pytest.mark.skipif(not DREAMZERO_REPO.exists(), reason="DreamZero source repo is required at ~/code/dreamzero"), +] + + +def _load_real_video() -> torch.Tensor: + if str(DREAMZERO_REPO) not in sys.path: + sys.path.insert(0, str(DREAMZERO_REPO)) + + import test_client_AR as dreamzero_client + + camera_frames = dreamzero_client.load_camera_frames() + obs = dreamzero_client._make_obs_from_video(camera_frames, [0], PROMPT, SESSION_ID) + stitched = RoboArenaTransform().transform_input(obs)["images"] + return torch.from_numpy(stitched).unsqueeze(0).to(device="cuda:0") + + +def test_preprocess_video_matches_source_bf16_cast_order() -> None: + videos = _load_real_video() # uint8 [B, T, H, W, C] + + actual = DreamZeroPipeline._preprocess_video(None, videos).float() + + expected = videos.permute(0, 4, 1, 2, 3) + expected = expected.float() / 255.0 + expected = expected.to(dtype=torch.bfloat16) + batch_size, channels, num_frames, height, width = expected.shape + expected = expected.permute(0, 2, 1, 3, 4) + expected = expected.reshape(batch_size * num_frames, channels, height, width) + expected = expected * 2.0 - 1.0 + expected = expected.reshape(batch_size, num_frames, channels, height, width) + expected = expected.permute(0, 2, 1, 3, 4).to(dtype=torch.bfloat16).float() + + diff = (actual - expected).abs() + assert 
diff.max().item() == 0.0 + assert diff.mean().item() == 0.0 diff --git a/tests/dreamzero/upstream/upstream_socket_server_no_compile.py b/tests/dreamzero/upstream/upstream_socket_server_no_compile.py new file mode 100644 index 00000000000..8ea38237b67 --- /dev/null +++ b/tests/dreamzero/upstream/upstream_socket_server_no_compile.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Launch the upstream DreamZero websocket server with `torch.compile` disabled. + +This wrapper is meant for formal parity tests against `vllm serve --omni`. +It monkeypatches `torch.compile` before importing DreamZero modules so that +all import-time decorators and `post_initialize()` compile calls become eager. + +For the current DreamZero port baseline we also disable all upstream DiT cache +and step-skipping behavior so the reference server matches vLLM's current +eager/no-skip implementation: +- `ENABLE_DIT_CACHE=false` +- `NUM_DIT_STEPS=16` +- `DYNAMIC_CACHE_SCHEDULE=false` + +In our CI/dev environment we also do not install Transformer Engine or +FlashAttention, so upstream's default `socket_test_optimized_AR.main()` path +(`ATTENTION_BACKEND=TE` and then fallback to `FA2`) cannot execute. To keep the +formal server-vs-server test runnable without pulling in those heavyweight +optional deps, this wrapper reproduces upstream `main()` but pins attention to +PyTorch SDPA (`ATTENTION_BACKEND=torch`) for this subprocess only. 
+ +Usage: + PYTHONPATH=/home/yangshen/code/dreamzero \\ + .venv/bin/python -m torch.distributed.run --standalone --nproc_per_node=1 \\ + tests/dreamzero/upstream/upstream_socket_server_no_compile.py --port 18081 \\ + --model_path /home/yangshen/code/dreamzero/checkpoints/dreamzero +""" + +from __future__ import annotations + +import os +import sys +from pathlib import Path + +import torch +import torch.nn.functional as F + + +def _identity_compile(*args, **kwargs): + if args and callable(args[0]) and len(args) == 1 and not kwargs: + return args[0] + + def deco(fn): + return fn + + return deco + + +torch.compile = _identity_compile + +os.environ.setdefault("NO_ALBUMENTATIONS_UPDATE", "1") + +DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() +if str(DREAMZERO_REPO) not in sys.path: + sys.path.insert(0, str(DREAMZERO_REPO)) + +import socket_test_optimized_AR as upstream # noqa: E402 +import tyro # noqa: E402 +from groot.vla.model.dreamzero.modules import attention as upstream_attention # noqa: E402 +from groot.vla.model.dreamzero.modules import wan2_1_submodule as upstream_submodule # noqa: E402 + + +def _torch_varlen_flash_attention( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + q_lens: torch.Tensor | None = None, + k_lens: torch.Tensor | None = None, + dropout_p: float = 0.0, + causal: bool = False, + dtype: torch.dtype = torch.bfloat16, + **_: object, +) -> torch.Tensor: + if q_lens is not None or k_lens is not None: + upstream_attention.warnings.warn( + "Padding mask is disabled in the test-only SDPA fallback.", + ) + out_dtype = q.dtype + q = q.transpose(1, 2).to(dtype) + k = k.transpose(1, 2).to(dtype) + v = v.transpose(1, 2).to(dtype) + out = F.scaled_dot_product_attention( + q, + k, + v, + attn_mask=None, + is_causal=causal, + dropout_p=dropout_p, + ) + return out.transpose(1, 2).contiguous().to(out_dtype) + + +upstream_attention.flash_attention = _torch_varlen_flash_attention 
+upstream_submodule.flash_attention = _torch_varlen_flash_attention + + +def main(args: upstream.Args) -> None: + os.environ["ENABLE_DIT_CACHE"] = "false" + os.environ["ATTENTION_BACKEND"] = "torch" + os.environ["NUM_DIT_STEPS"] = "16" + os.environ["DYNAMIC_CACHE_SCHEDULE"] = "false" + torch._dynamo.config.recompile_limit = 800 + + embodiment_tag = "oxe_droid" + model_path = args.model_path + + device_mesh = upstream.init_mesh() + rank = upstream.dist.get_rank() + + timeout_delta = upstream.datetime.timedelta(seconds=args.timeout_seconds) + signal_group = upstream.dist.new_group(backend="gloo", timeout=timeout_delta) + upstream.logger.info("Rank %s initialized signal_group (gloo)", rank) + + policy = upstream.GrootSimPolicy( + embodiment_tag=upstream.EmbodimentTag(embodiment_tag), + model_path=model_path, + device="cuda" if torch.cuda.is_available() else "cpu", + device_mesh=device_mesh, + ) + + hostname = upstream.socket.gethostname() + local_ip = upstream.socket.gethostbyname(hostname) + + if rank == 0: + upstream.logging.info("Creating server (host: %s, ip: %s)", hostname, local_ip) + parent_dir = os.path.dirname(model_path) + date_suffix = upstream.datetime.datetime.now().strftime("%Y%m%d") + checkpoint_name = os.path.basename(model_path) + output_dir = os.path.join( + parent_dir, + f"real_world_eval_gen_{date_suffix}_{args.index}", + checkpoint_name, + ) + os.makedirs(output_dir, exist_ok=True) + upstream.logging.info("Videos will be saved to: %s", output_dir) + else: + output_dir = None + upstream.logging.info("Rank %s starting as worker for distributed inference...", rank) + + wrapper_policy = upstream.ARDroidRoboarenaPolicy( + groot_policy=policy, + signal_group=signal_group, + output_dir=output_dir, + ) + + server_config = upstream.PolicyServerConfig( + image_resolution=(180, 320), + needs_wrist_camera=True, + n_external_cameras=2, + needs_stereo_camera=False, + needs_session_id=True, + action_space="joint_position", + ) + + if rank == 0: + 
upstream.logging.info("Using roboarena policy server interface") + upstream.logging.info("Server config: %s", server_config) + roboarena_server = upstream.RoboarenaServer( + policy=wrapper_policy, + server_config=server_config, + host="0.0.0.0", + port=args.port, + ) + roboarena_server.serve_forever() + else: + worker = upstream.DistributedWorker( + policy=policy, + signal_group=signal_group, + ) + upstream.asyncio.run(worker.run()) + + +if __name__ == "__main__": + upstream.logging.basicConfig(level=upstream.logging.INFO, force=True) + main(tyro.cli(upstream.Args)) diff --git a/tests/e2e/online_serving/test_dreamzero.py b/tests/e2e/online_serving/test_dreamzero.py new file mode 100644 index 00000000000..4ed2a4e9b62 --- /dev/null +++ b/tests/e2e/online_serving/test_dreamzero.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""E2E online serving test for DreamZero OpenPI websocket serving.""" + +from __future__ import annotations + +import importlib.util +import os +import socket +import subprocess +import sys +from pathlib import Path + +import pytest + +from tests.conftest import OmniServerParams +from tests.utils import hardware_test + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" + +MODEL = "GEAR-Dreams/DreamZero-DROID" +EXAMPLE_DIR = Path(__file__).resolve().parents[3] / "examples" / "online_serving" / "dreamzero" +CLIENT_SCRIPT = EXAMPLE_DIR / "openpi_client.py" +ASSETS_DIR = EXAMPLE_DIR / "assets" + + +def _find_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("127.0.0.1", 0)) + return int(sock.getsockname()[1]) + + +def _pick_test_gpus() -> str: + override = os.environ.get("DREAMZERO_TEST_GPUS") or os.environ.get("OPENPI_E2E_GPUS") + if override: + return override + + try: + query = subprocess.check_output( + [ + "nvidia-smi", + "--query-gpu=index,memory.used", + 
"--format=csv,noheader,nounits", + ], + text=True, + ) + except Exception: + return "0,1" + + gpu_rows = [] + for line in query.strip().splitlines(): + gpu_index, used_mb = [part.strip() for part in line.split(",", maxsplit=1)] + gpu_rows.append((int(used_mb), gpu_index)) + gpu_rows.sort() + return ",".join(gpu_index for _, gpu_index in gpu_rows[:2]) or "0,1" + +test_params = [ + OmniServerParams( + model=MODEL, + port=8091, + server_args=[ + "--cfg-parallel-size", + "2", + "--enforce-eager", + "--disable-log-stats", + ], + env_dict={ + "ATTENTION_BACKEND": "torch", + "DIFFUSION_ATTENTION_BACKEND": "TORCH_SDPA", + "VLLM_DISABLE_COMPILE_CACHE": "1", + "CUDA_VISIBLE_DEVICES": _pick_test_gpus(), + "MASTER_PORT": str(_find_free_port()), + }, + ) +] + + +def _load_client_module(): + spec = importlib.util.spec_from_file_location("dreamzero_openpi_example_client", CLIENT_SCRIPT) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + try: + spec.loader.exec_module(module) + except ModuleNotFoundError as exc: + pytest.skip(f"DreamZero OpenPI example dependency is missing: {exc.name}") + return module + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}, num_cards=2) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_dreamzero_openpi_online(omni_server) -> None: + client_mod = _load_client_module() + result = client_mod.run_policy_session( + host=omni_server.host, + port=omni_server.port, + video_dir=ASSETS_DIR, + session_id="dreamzero-online-e2e", + ) + + client_mod.validate_session_result(result) + + metadata = result["metadata"] + assert metadata["needs_session_id"] is True + assert metadata["needs_stereo_camera"] is False + assert tuple(metadata["image_resolution"]) == (180, 320) diff --git a/tests/entrypoints/openai_api/test_openpi_connection.py b/tests/entrypoints/openai_api/test_openpi_connection.py new file mode 
100644 index 00000000000..18391dc66f5 --- /dev/null +++ b/tests/entrypoints/openai_api/test_openpi_connection.py @@ -0,0 +1,168 @@ +import asyncio +import builtins +import sys +import types +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from vllm_omni.entrypoints.openai.realtime.robot import openpi_connection +from vllm_omni.entrypoints.openai.realtime.robot.openpi_serving import PolicyServerConfig + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +class FakeWebSocket: + def __init__(self, messages): + self._messages = list(messages) + self.sent_bytes = [] + self.sent_texts = [] + self.accepted = False + self.closed = False + + async def accept(self): + self.accepted = True + + async def send_bytes(self, data): + self.sent_bytes.append(data) + + async def send_text(self, data): + self.sent_texts.append(data) + + async def receive(self): + return self._messages.pop(0) + + async def close(self): + self.closed = True + + +def test_pack_reports_clear_error_when_openpi_client_is_missing(monkeypatch): + real_import = builtins.__import__ + + def import_without_openpi_client(name, globals=None, locals=None, fromlist=(), level=0): + if name == "openpi_client": + raise ModuleNotFoundError("No module named 'openpi_client'", name="openpi_client") + return real_import(name, globals, locals, fromlist, level) + + monkeypatch.setattr(builtins, "__import__", import_without_openpi_client) + + with pytest.raises(ImportError) as exc_info: + openpi_connection._pack({"prompt": "pick up the object"}) + + message = str(exc_info.value) + assert "/v1/realtime/robot/openpi" in message + assert "pip install openpi-client" in message + + +def test_pack_and_unpack_delegate_to_openpi_msgpack_numpy(monkeypatch): + calls = [] + + class FakeMsgpackNumpy: + @staticmethod + def packb(obj): + calls.append(("packb", obj)) + return b"packed" + + @staticmethod + def unpackb(data): + calls.append(("unpackb", data)) + return {"unpacked": data} + + fake_openpi_client = 
types.ModuleType("openpi_client") + fake_openpi_client.msgpack_numpy = FakeMsgpackNumpy + monkeypatch.setitem(sys.modules, "openpi_client", fake_openpi_client) + + assert openpi_connection._pack({"x": 1}) == b"packed" + assert openpi_connection._unpack(b"payload") == {"unpacked": b"payload"} + assert calls == [ + ("packb", {"x": 1}), + ("unpackb", b"payload"), + ] + + +def test_handle_connection_returns_structured_error_for_invalid_payload(monkeypatch): + monkeypatch.setattr(openpi_connection, "_pack", lambda obj: obj) + monkeypatch.setattr( + openpi_connection, + "_unpack", + lambda _data: (_ for _ in ()).throw(ValueError("bad payload traceback")), + ) + + websocket = FakeWebSocket( + [ + {"type": "websocket.receive", "bytes": b"bad"}, + {"type": "websocket.disconnect"}, + ] + ) + serving = MagicMock() + connection = openpi_connection.RobotRealtimeConnection(websocket, serving) + + asyncio.run(connection.handle_connection()) + + assert websocket.accepted is True + assert websocket.sent_bytes[1] == {"type": "error", "message": "Invalid request payload"} + assert "traceback" not in str(websocket.sent_bytes[1]).lower() + assert websocket.sent_texts == [] + serving.infer.assert_not_called() + serving.reset.assert_not_called() + + +def test_handle_connection_returns_structured_error_for_infer_exception(monkeypatch): + monkeypatch.setattr(openpi_connection, "_pack", lambda obj: obj) + monkeypatch.setattr( + openpi_connection, + "_unpack", + lambda _data: {"prompt": "pick up the object"}, + ) + + websocket = FakeWebSocket( + [ + {"type": "websocket.receive", "bytes": b"request"}, + {"type": "websocket.disconnect"}, + ] + ) + serving = MagicMock() + serving.infer = AsyncMock(side_effect=RuntimeError("secret traceback text")) + connection = openpi_connection.RobotRealtimeConnection(websocket, serving) + + asyncio.run(connection.handle_connection()) + + assert websocket.sent_bytes[1] == {"type": "error", "message": "Internal inference error"} + assert "secret traceback 
text" not in str(websocket.sent_bytes[1]) + assert websocket.sent_texts == [] + serving.infer.assert_awaited_once_with({"prompt": "pick up the object"}) + + +def test_handle_connection_closes_websocket_on_idle_timeout(monkeypatch): + monkeypatch.setattr(openpi_connection, "_pack", lambda obj: obj) + + websocket = FakeWebSocket([]) + + async def never_receives(): + await asyncio.sleep(1) + + websocket.receive = never_receives + serving = MagicMock() + serving.policy_server_config = PolicyServerConfig( + { + "image_resolution": (180, 320), + "n_external_cameras": 2, + "needs_wrist_camera": True, + "needs_stereo_camera": False, + "needs_session_id": True, + "action_space": "joint_position", + } + ) + connection = openpi_connection.RobotRealtimeConnection( + websocket, + serving, + idle_timeout=0.01, + ) + + asyncio.run(connection.handle_connection()) + + assert websocket.accepted is True + assert websocket.sent_bytes[0]["action_space"] == "joint_position" + assert websocket.closed is True + assert websocket.sent_texts == [] + serving.infer.assert_not_called() diff --git a/tests/entrypoints/openai_api/test_openpi_serving.py b/tests/entrypoints/openai_api/test_openpi_serving.py new file mode 100644 index 00000000000..53e9683de89 --- /dev/null +++ b/tests/entrypoints/openai_api/test_openpi_serving.py @@ -0,0 +1,147 @@ +from types import SimpleNamespace + +import pytest +from omegaconf import OmegaConf + +from vllm_omni.diffusion.models.dreamzero import transform as dreamzero_transform +from vllm_omni.entrypoints.openai.realtime.robot import openpi_serving + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + +TEST_POLICY_SERVER_CONFIG = { + "image_resolution": (180, 320), + "n_external_cameras": 2, + "needs_wrist_camera": True, + "needs_stereo_camera": False, + "needs_session_id": True, + "action_space": "joint_position", +} + + +def _engine_with_policy_config(policy_config=None): + od_config = SimpleNamespace( + model_config={"policy_server_config": policy_config 
or TEST_POLICY_SERVER_CONFIG} + ) + return SimpleNamespace(get_diffusion_od_config=lambda: od_config) + + +def test_ensure_transforms_loaded_fails_fast_on_import_error(monkeypatch): + def fail_import(module_name): + raise ModuleNotFoundError(f"missing module: {module_name}") + + monkeypatch.setattr(dreamzero_transform.importlib, "import_module", fail_import) + + with pytest.raises(RuntimeError) as exc_info: + dreamzero_transform.ensure_transforms_loaded() + + assert "Failed to import DreamZero transform module" in str(exc_info.value) + + +def test_ensure_transforms_loaded_fails_when_default_transform_missing(monkeypatch): + monkeypatch.setattr(dreamzero_transform.importlib, "import_module", lambda _module_name: None) + monkeypatch.setattr(dreamzero_transform, "TRANSFORMS", {}) + + with pytest.raises(RuntimeError) as exc_info: + dreamzero_transform.ensure_transforms_loaded() + + assert "roboarena" in str(exc_info.value) + assert "not registered" in str(exc_info.value) + + +def test_policy_server_config_reads_diffusion_model_config(): + policy_config = { + "image_resolution": [64, 64], + "n_external_cameras": 1, + "custom_model_key": {"nested": True}, + } + od_config = SimpleNamespace( + model_config={"policy_server_config": policy_config} + ) + engine_client = SimpleNamespace(get_diffusion_od_config=lambda: od_config) + + serving = openpi_serving.ServingRealtimeRobotOpenPI(engine_client=engine_client) + + assert serving.policy_server_config.to_dict() == policy_config + + +def test_policy_server_config_reads_stage_config_model_config(): + policy_config = {"custom_model_key": "from-stage-config"} + engine_client = SimpleNamespace( + get_diffusion_od_config=lambda: None, + stage_configs=[ + SimpleNamespace(stage_type="diffusion", engine_args=SimpleNamespace(model_config={ + "policy_server_config": policy_config + })) + ], + ) + + serving = openpi_serving.ServingRealtimeRobotOpenPI(engine_client=engine_client) + + assert serving.policy_server_config.to_dict() == 
policy_config + + +def test_policy_server_config_reads_omegaconf_stage_config(): + engine_client = SimpleNamespace( + get_diffusion_od_config=lambda: None, + stage_configs=[ + SimpleNamespace( + stage_type="diffusion", + engine_args=SimpleNamespace(model_config=OmegaConf.create({ + "policy_server_config": {"custom_model_key": "from-omegaconf"} + })), + ) + ], + ) + + serving = openpi_serving.ServingRealtimeRobotOpenPI(engine_client=engine_client) + + assert serving.policy_server_config.to_dict() == { + "custom_model_key": "from-omegaconf" + } + + +def test_policy_server_config_is_required(): + od_config = SimpleNamespace(model_config={}) + engine_client = SimpleNamespace(get_diffusion_od_config=lambda: od_config) + + with pytest.raises(ValueError) as exc_info: + openpi_serving.ServingRealtimeRobotOpenPI(engine_client=engine_client) + + assert "policy_server_config" in str(exc_info.value) + + +def test_create_policy_server_returns_none_without_policy_config(): + od_config = SimpleNamespace(model_config={}) + engine_client = SimpleNamespace(get_diffusion_od_config=lambda: od_config) + + serving = openpi_serving.ServingRealtimeRobotOpenPI.create_policy_server( + engine_client=engine_client, + model_name="k2-fsa/OmniVoice", + ) + + assert serving is None + + +def test_policy_server_config_reads_engine_model_config(): + policy_config = {"custom_model_key": "custom-value"} + engine_client = SimpleNamespace( + model_config=SimpleNamespace(policy_server_config=policy_config) + ) + + serving = openpi_serving.ServingRealtimeRobotOpenPI(engine_client=engine_client) + + assert serving.policy_server_config.to_dict() == policy_config + + +def test_reset_marks_next_request_for_engine_state_reset(): + serving = openpi_serving.ServingRealtimeRobotOpenPI( + engine_client=_engine_with_policy_config() + ) + serving._call_count = 3 + + serving.reset({}) + serving._call_count += 1 + request = serving._build_request({"prompt": "pick up the object"}) + + assert 
request.sampling_params.extra_args["reset"] is True + assert request.sampling_params.extra_args["robot_obs"]["prompt"] == "pick up the object" diff --git a/tests/entrypoints/test_omni_entrypoints.py b/tests/entrypoints/test_omni_entrypoints.py index adcdc3e9780..7cf1f696a05 100644 --- a/tests/entrypoints/test_omni_entrypoints.py +++ b/tests/entrypoints/test_omni_entrypoints.py @@ -482,6 +482,19 @@ def test_openai_serving_models_can_consume_async_omni_compat_attrs(): def test_get_diffusion_od_config_returns_diffusion_stage_config(): + diffusion_od_config = object() + omni = object.__new__(AsyncOmni) + omni.engine = SimpleNamespace( + stage_clients=[ + SimpleNamespace(stage_type="llm"), + SimpleNamespace(stage_type="diffusion", od_config=diffusion_od_config), + ] + ) + + assert omni.get_diffusion_od_config() is diffusion_od_config + + +def test_get_diffusion_od_config_falls_back_to_inner_engine(): diffusion_od_config = object() omni = object.__new__(AsyncOmni) omni.engine = SimpleNamespace( diff --git a/tests/entrypoints/test_resolve_dreamzero_config.py b/tests/entrypoints/test_resolve_dreamzero_config.py new file mode 100644 index 00000000000..dd1185b33b4 --- /dev/null +++ b/tests/entrypoints/test_resolve_dreamzero_config.py @@ -0,0 +1,49 @@ +import os + +import pytest + +from vllm_omni.entrypoints.utils import load_stage_configs_from_model, resolve_model_config_path + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def test_dreamzero_vla_resolves_to_dreamzero_config(monkeypatch): + monkeypatch.setattr( + "vllm_omni.entrypoints.utils.get_config", + lambda _model, trust_remote_code=True: type("Cfg", (), {"model_type": "vla"})(), + ) + monkeypatch.setattr( + "vllm_omni.entrypoints.utils._looks_like_dreamzero", + lambda _model: True, + ) + monkeypatch.setattr( + "vllm_omni.entrypoints.utils.current_omni_platform.get_default_stage_config_path", + lambda: "vllm_omni/model_executor/stage_configs", + ) + + original_exists = os.path.exists + + def 
mock_exists(path): + if "dreamzero.yaml" in str(path): + return True + return original_exists(path) + + monkeypatch.setattr(os.path, "exists", mock_exists) + + result = resolve_model_config_path("GEAR-Dreams/DreamZero-DROID") + + assert result is not None + assert "dreamzero.yaml" in result + + +def test_dreamzero_config_sets_model_class_and_policy_config(monkeypatch): + monkeypatch.setattr( + "vllm_omni.entrypoints.utils.resolve_model_config_path", + lambda _model: "vllm_omni/model_executor/stage_configs/dreamzero.yaml", + ) + + stage_configs = load_stage_configs_from_model("GEAR-Dreams/DreamZero-DROID") + engine_args = stage_configs[0].engine_args + + assert engine_args.model_class_name == "DreamZeroPipeline" + assert engine_args.model_config.policy_server_config.action_space == "joint_position" diff --git a/tests/examples/online_serving/test_dreamzero.py b/tests/examples/online_serving/test_dreamzero.py new file mode 100644 index 00000000000..f268786e8d7 --- /dev/null +++ b/tests/examples/online_serving/test_dreamzero.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Online serving example test: DreamZero. 
+See examples/online_serving/dreamzero/README.md +""" + +from __future__ import annotations + +import os +import socket +import subprocess +import sys +from pathlib import Path + +import pytest + +from tests.conftest import OmniServerParams +from tests.examples.conftest import run_cmd +from tests.utils import hardware_test + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +pytestmark = [pytest.mark.advanced_model, pytest.mark.example] + +MODEL = "GEAR-Dreams/DreamZero-DROID" +EXAMPLE_DIR = Path(__file__).resolve().parents[3] / "examples" / "online_serving" / "dreamzero" + + +def _find_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("127.0.0.1", 0)) + return int(sock.getsockname()[1]) + + +def _pick_test_gpus() -> str: + override = os.environ.get("DREAMZERO_TEST_GPUS") or os.environ.get("OPENPI_E2E_GPUS") + if override: + return override + + try: + query = subprocess.check_output( + [ + "nvidia-smi", + "--query-gpu=index,memory.used", + "--format=csv,noheader,nounits", + ], + text=True, + ) + except Exception: + return "0,1" + + gpu_rows = [] + for line in query.strip().splitlines(): + gpu_index, used_mb = [part.strip() for part in line.split(",", maxsplit=1)] + gpu_rows.append((int(used_mb), gpu_index)) + gpu_rows.sort() + return ",".join(gpu_index for _, gpu_index in gpu_rows[:2]) or "0,1" + +test_params = [ + OmniServerParams( + model=MODEL, + port=8092, + server_args=[ + "--cfg-parallel-size", + "2", + "--enforce-eager", + "--disable-log-stats", + ], + env_dict={ + "ATTENTION_BACKEND": "torch", + "DIFFUSION_ATTENTION_BACKEND": "TORCH_SDPA", + "VLLM_DISABLE_COMPILE_CACHE": "1", + "CUDA_VISIBLE_DEVICES": _pick_test_gpus(), + "MASTER_PORT": str(_find_free_port()), + }, + ) +] + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}, num_cards=2) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_dreamzero_openpi_client_example(omni_server) -> 
None: + command = [ + sys.executable, + str(EXAMPLE_DIR / "openpi_client.py"), + "--host", + omni_server.host, + "--port", + str(omni_server.port), + ] + + result = run_cmd(command) + assert "Server metadata:" in result + assert "Action 0:" in result + assert "Action 1:" in result + assert "Action 2:" in result + assert "Reset status: reset successful" in result diff --git a/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py b/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py index f7cd4a7e714..29ba4b8867e 100644 --- a/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py +++ b/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py @@ -34,6 +34,19 @@ from vllm_omni.diffusion.models.dreamzero.modeling.causal_wan_model import CausalWanModel from vllm_omni.diffusion.models.dreamzero.modeling.image_encoder import DreamZeroImageEncoder from vllm_omni.diffusion.models.dreamzero.state_dreamzero import DreamZeroState +from vllm_omni.diffusion.models.dreamzero.transform import ( + DEFAULT_EMBODIMENT, + ensure_transforms_loaded, +) +from vllm_omni.diffusion.models.dreamzero.transform.base import get_transform +from vllm_omni.diffusion.models.dreamzero.utils import ( + DEFAULT_CFG_SCALE, + DEFAULT_EMBODIMENT_NAME_TO_ID, + DEFAULT_NEGATIVE_PROMPT, + DEFAULT_NUM_INFERENCE_STEPS, + DEFAULT_SEED, + DEFAULT_SIGMA_SHIFT, +) from vllm_omni.diffusion.models.schedulers.scheduling_flow_unipc_multistep import FlowUniPCMultistepScheduler from vllm_omni.diffusion.request import OmniDiffusionRequest @@ -112,6 +125,11 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: model_config = od_config.model_config local_files_only = os.path.exists(model_path) self.od_config = od_config + ensure_transforms_loaded() + self.default_robot_embodiment = model_config.get( + "default_robot_embodiment", + DEFAULT_EMBODIMENT, + ) # ---- Parse root config.json ---- (last_steps.md P0-4) root_cfg = self._load_repo_json(model_path, "config.json", 
local_files_only) @@ -256,9 +274,12 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: # real-world inference loop consumes. Reading the config value here would # incorrectly shorten the denoising loop to 4 steps for the released # DreamZero checkpoint. - self.num_inference_steps: int = model_config.get("num_inference_steps", 16) - self.cfg_scale: float = model_config.get("cfg_scale", 5.0) - self.sigma_shift: float = model_config.get("sigma_shift", 5.0) + self.num_inference_steps: int = model_config.get( + "num_inference_steps", + DEFAULT_NUM_INFERENCE_STEPS, + ) + self.cfg_scale: float = model_config.get("cfg_scale", DEFAULT_CFG_SCALE) + self.sigma_shift: float = model_config.get("sigma_shift", DEFAULT_SIGMA_SHIFT) # Source: `WANPolicyHead.__init__` reads `config.num_frames` # from `action_head_cfg.config.num_frames` (33 for DreamZero DROID), # not from the root HF config. This value feeds `encode_image()` @@ -273,36 +294,20 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: self.video_inference_final_noise: float = ah_config["video_inference_final_noise"] # Fixed seed for deterministic noise generation # L176 - self.seed: int = model_config.get("seed", 1140) + self.seed: int = model_config.get("seed", DEFAULT_SEED) # Model-level constants for state/action padding # dreamzero_cotrain.yaml self.max_state_dim: int = ah_config["max_state_dim"] self.max_action_dim: int = ah_config["max_action_dim"] # Fixed negative prompt for CFG uncond branch # dreamzero_cotrain.py L532 - self.negative_prompt: str = ( - "Vibrant colors, overexposed, static, blurry details, text, subtitles, " - "style, artwork, painting, image, still, grayscale, dull, worst quality, " - "low quality, JPEG artifacts, ugly, mutilated, extra fingers, bad hands, " - "bad face, deformed, disfigured, mutated limbs, fused fingers, stagnant " - "image, cluttered background, three legs, many people in the background, " - "walking backwards." 
- ) + self.negative_prompt: str = model_config.get("negative_prompt", DEFAULT_NEGATIVE_PROMPT) # Embodiment name → numeric ID mapping (model knowledge) # Source: dreamzero transform/base.yaml embodiment_tag_to_projector_index self.embodiment_name_to_id: dict[str, int] = model_config.get( "embodiment_name_to_id", - { - "oxe_droid": 17, - "agibot": 26, - "gr1_unified": 24, - "xdof": 22, - "yam": 32, - "mecka_hands": 27, - "lapa": 27, - "dream": 31, - }, + DEFAULT_EMBODIMENT_NAME_TO_ID, ) # Action normalization stats (per-embodiment, from checkpoint metadata) @@ -341,17 +346,6 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: ), ] - def to(self, *args, **kwargs): - """Defer dtype/device moves to the default module semantics. - - Source: `WANPolicyHead.post_initialize()` - - upstream moves `model / text_encoder / image_encoder / vae` - to `dtype=torch.bfloat16` on the target CUDA device. - - the HF CLIP vision backbone must therefore follow the same dtype - move instead of being pinned to fp32. - """ - return super().to(*args, **kwargs) - # ----------------------------------------------------------------------- # Root config loading # ----------------------------------------------------------------------- @@ -863,19 +857,25 @@ def diffuse( # Main entry point # ----------------------------------------------------------------------- + def _transform_robot_obs(self, robot_obs: dict): + """Select DreamZero robot transform and convert raw obs to model input.""" + embodiment = robot_obs.get("embodiment", self.default_robot_embodiment) + transform = get_transform(embodiment) + return transform, transform.transform_input(robot_obs) + @torch.no_grad() def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: """Full inference step. Called by DiffusionEngine.step(). 
Source: WANPolicyHead.lazy_joint_video_action (L929-1270) """ extra_args = req.sampling_params.extra_args or {} - unified_obs = extra_args.get("unified_obs") - if unified_obs is None: + robot_obs = extra_args.get("robot_obs") + if robot_obs is None: first_prompt = req.prompts[0] if req.prompts else "" prompt = first_prompt if isinstance(first_prompt, str) else (first_prompt.get("prompt") or "") is_dummy_warmup = prompt == "dummy run" and req.sampling_params.num_inference_steps == 1 if is_dummy_warmup: - logger.info("Skipping DreamZero dummy warmup request without unified_obs.") + logger.info("Skipping DreamZero dummy warmup request without robot_obs.") return DiffusionOutput( output={ "actions": np.zeros( @@ -884,7 +884,8 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: ), }, ) - raise KeyError("unified_obs") + raise KeyError("robot_obs") + transform, unified_obs = self._transform_robot_obs(robot_obs) device = get_local_device() # ---- Step 1: Extract inputs from unified observation ---- @@ -937,7 +938,8 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: attention_mask = text_inputs["attention_mask"].to(device) # ---- Step 2: Check reset + accumulate frames ---- (L968-981) - # Explicit reset from serving layer (session switch / client request) + # Explicit reset from OpenPI serving is carried by `extra_args["reset"]` + # on the next inference request after websocket reset/session switch. 
if extra_args.get("reset", False): self.state.reset() # Auto-reset based on model state (before accumulation) @@ -1118,6 +1120,7 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: # Squeeze batch dim for output: (B, horizon, dim) → (horizon, dim) actions_np = action_out.squeeze(0).float().cpu().numpy() # (horizon, max_action_dim) + actions_np = transform.transform_action_output(actions_np) return DiffusionOutput( output={ diff --git a/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py b/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py index aca54db5059..e8ab22dbada 100644 --- a/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py +++ b/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py @@ -88,7 +88,9 @@ def reset(self) -> None: self.stitched_buffer: list[np.ndarray] = [] self.call_count: int = 0 - # KV cache — from WANPolicyHead.__init__ L185-188 + # KV cache — from WANPolicyHead.__init__ L185-188. + # TODO(DreamZero): replace this model-local cache with vLLM's managed + # KV cache once robot-policy diffusion supports that integration. 
self.kv_cache: list[torch.Tensor] | None = None self.kv_cache_neg: list[torch.Tensor] | None = None self.crossattn_cache: list[dict[str, bool | torch.Tensor | None]] | None = None diff --git a/vllm_omni/diffusion/models/dreamzero/transform/__init__.py b/vllm_omni/diffusion/models/dreamzero/transform/__init__.py new file mode 100644 index 00000000000..b6ce4fa6681 --- /dev/null +++ b/vllm_omni/diffusion/models/dreamzero/transform/__init__.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import importlib + +from vllm.logger import init_logger + +from vllm_omni.diffusion.models.dreamzero.transform.base import TRANSFORMS + +logger = init_logger(__name__) + +DEFAULT_EMBODIMENT = "roboarena" +_BUILTIN_TRANSFORM_MODULES = ( + "vllm_omni.diffusion.models.dreamzero.transform.droid", + "vllm_omni.diffusion.models.dreamzero.transform.roboarena", +) + + +def ensure_transforms_loaded() -> None: + """Import DreamZero transform modules and verify registration.""" + for module_name in _BUILTIN_TRANSFORM_MODULES: + try: + importlib.import_module(module_name) + except Exception as exc: + logger.exception("Failed to import DreamZero transform module %s", module_name) + raise RuntimeError( + f"Failed to import DreamZero transform module '{module_name}'." + ) from exc + + if DEFAULT_EMBODIMENT not in TRANSFORMS: + raise RuntimeError( + f"Built-in DreamZero transform '{DEFAULT_EMBODIMENT}' is not registered after import." 
+ ) diff --git a/vllm_omni/entrypoints/openai/realtime/robot/transform/base.py b/vllm_omni/diffusion/models/dreamzero/transform/base.py similarity index 82% rename from vllm_omni/entrypoints/openai/realtime/robot/transform/base.py rename to vllm_omni/diffusion/models/dreamzero/transform/base.py index 1a0ce0e3002..e6a3fee4283 100644 --- a/vllm_omni/entrypoints/openai/realtime/robot/transform/base.py +++ b/vllm_omni/diffusion/models/dreamzero/transform/base.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Base transform interface for robot policy serving. +"""Base transform interface for DreamZero robot policy serving. Transforms handle dataset-specific concerns ONLY: - Observation key mapping @@ -18,12 +18,11 @@ Flow: raw obs (dataset format) - → Transform.transform_input() + → DreamZeroPipeline selects transform by embodiment → unified dict (stitched video, templated prompt str, raw state) - → Pipeline.forward() (tokenize, pad, encode, denoise) - → DiffusionOutput - → Transform.transform_output() - → ndarray (N, action_dim) + → tokenize, pad, encode, denoise + → transform_action_output() + → ndarray (N, action_dim) """ from __future__ import annotations @@ -82,20 +81,8 @@ def transform_input(self, obs: dict) -> dict: unified["session_id"] = obs["session_id"] return unified - def transform_output(self, result: Any) -> np.ndarray: - """Extract action ndarray (N, ACTION_DIM) from model output. - - Engine outputs actions through ``multimodal_output["actions"]``. - Pipeline outputs (horizon, max_action_dim) after batch squeeze. - We slice to actual ACTION_DIM. 
- """ - if not hasattr(result, "multimodal_output") or result.multimodal_output is None: - raise RuntimeError("Missing multimodal_output in robot policy result") - - actions = result.multimodal_output.get("actions") - if actions is None: - raise RuntimeError("Missing multimodal_output['actions'] in robot policy result") - + def transform_action_output(self, actions: Any) -> np.ndarray: + """Adapt model action output to this transform's action dimensions.""" actions = np.asarray(actions, dtype=np.float32) # Handle any remaining batch dims: squeeze to 2D (horizon, dim) while actions.ndim > 2: diff --git a/vllm_omni/entrypoints/openai/realtime/robot/transform/droid.py b/vllm_omni/diffusion/models/dreamzero/transform/droid.py similarity index 99% rename from vllm_omni/entrypoints/openai/realtime/robot/transform/droid.py rename to vllm_omni/diffusion/models/dreamzero/transform/droid.py index c6053c54bd0..ad98a2b7a3a 100644 --- a/vllm_omni/entrypoints/openai/realtime/robot/transform/droid.py +++ b/vllm_omni/diffusion/models/dreamzero/transform/droid.py @@ -29,7 +29,7 @@ import torch import torchvision.transforms.v2 as T -from vllm_omni.entrypoints.openai.realtime.robot.transform.base import ( +from vllm_omni.diffusion.models.dreamzero.transform.base import ( RobotPolicyTransform, register_transform, ) diff --git a/vllm_omni/entrypoints/openai/realtime/robot/transform/roboarena.py b/vllm_omni/diffusion/models/dreamzero/transform/roboarena.py similarity index 89% rename from vllm_omni/entrypoints/openai/realtime/robot/transform/roboarena.py rename to vllm_omni/diffusion/models/dreamzero/transform/roboarena.py index b75b94b7a64..102c08ccb3b 100644 --- a/vllm_omni/entrypoints/openai/realtime/robot/transform/roboarena.py +++ b/vllm_omni/diffusion/models/dreamzero/transform/roboarena.py @@ -11,10 +11,10 @@ from __future__ import annotations -from vllm_omni.entrypoints.openai.realtime.robot.transform.base import ( +from vllm_omni.diffusion.models.dreamzero.transform.base import 
( register_transform, ) -from vllm_omni.entrypoints.openai.realtime.robot.transform.droid import ( +from vllm_omni.diffusion.models.dreamzero.transform.droid import ( DroidTransform, ) diff --git a/vllm_omni/diffusion/models/dreamzero/utils.py b/vllm_omni/diffusion/models/dreamzero/utils.py new file mode 100644 index 00000000000..88b97ac2252 --- /dev/null +++ b/vllm_omni/diffusion/models/dreamzero/utils.py @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""DreamZero model constants shared by the pipeline.""" + +DEFAULT_NUM_INFERENCE_STEPS = 16 +DEFAULT_CFG_SCALE = 5.0 +DEFAULT_SIGMA_SHIFT = 5.0 +DEFAULT_SEED = 1140 + +DEFAULT_NEGATIVE_PROMPT = ( + "Vibrant colors, overexposed, static, blurry details, text, subtitles, " + "style, artwork, painting, image, still, grayscale, dull, worst quality, " + "low quality, JPEG artifacts, ugly, mutilated, extra fingers, bad hands, " + "bad face, deformed, disfigured, mutated limbs, fused fingers, stagnant " + "image, cluttered background, three legs, many people in the background, " + "walking backwards." 
+) + +DEFAULT_EMBODIMENT_NAME_TO_ID = { + "oxe_droid": 17, + "agibot": 26, + "gr1_unified": 24, + "xdof": 22, + "yam": 32, + "mecka_hands": 27, + "lapa": 27, + "dream": 31, +} diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 59fedb71339..ad887098b1d 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -630,7 +630,7 @@ async def omni_init_app_state( ) state.openai_streaming_speech = None state.openai_streaming_video = None - state.openai_serving_realtime_robot = ServingRealtimeRobotOpenPI( + state.openai_serving_realtime_robot = ServingRealtimeRobotOpenPI.create_policy_server( engine_client=engine_client, model_name=model_name, ) diff --git a/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py index ec77f50d8b1..59b6db0d95e 100644 --- a/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py +++ b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py @@ -11,7 +11,7 @@ from __future__ import annotations -import traceback +import asyncio from typing import Any from fastapi import WebSocket @@ -23,18 +23,26 @@ ) logger = init_logger(__name__) +_DEFAULT_IDLE_TIMEOUT = 30.0 +def _get_msgpack_numpy() -> Any: + try: + from openpi_client import msgpack_numpy + except ImportError as exc: + raise ImportError( + "The `/v1/realtime/robot/openpi` endpoint requires the optional " + "`openpi-client` dependency. Install it with `pip install openpi-client`." 
+ ) from exc + + return msgpack_numpy -def _pack(obj: Any) -> bytes: - from openpi_client import msgpack_numpy - return msgpack_numpy.packb(obj) +def _pack(obj: Any) -> bytes: + return _get_msgpack_numpy().packb(obj) def _unpack(data: bytes) -> Any: - from openpi_client import msgpack_numpy - - return msgpack_numpy.unpackb(data) + return _get_msgpack_numpy().unpackb(data) class RobotRealtimeConnection: @@ -44,28 +52,44 @@ def __init__( self, websocket: WebSocket, serving: ServingRealtimeRobotOpenPI, + idle_timeout: float = _DEFAULT_IDLE_TIMEOUT, ) -> None: self.websocket = websocket self.serving = serving + self._idle_timeout = idle_timeout + + async def _send_error(self, message: str) -> None: + await self.websocket.send_bytes(_pack({"type": "error", "message": message})) + + def _unpack_request(self, data: bytes) -> dict[str, Any]: + obs = _unpack(data) + if not isinstance(obs, dict): + raise ValueError("Invalid request payload") + return obs async def handle_connection(self) -> None: """Main loop. Matches DreamZero policy_server.py._handler.""" await self.websocket.accept() try: - # Send metadata (PolicyServerConfig fields) - metadata = { - "image_resolution": (180, 320), - "n_external_cameras": 2, - "needs_wrist_camera": True, - "needs_stereo_camera": False, - "needs_session_id": True, - "action_space": "joint_position", - } + # Send model-specific PolicyServerConfig resolved by serving from + # diffusion od_config.model_config. 
+ metadata = self.serving.policy_server_config.to_dict() await self.websocket.send_bytes(_pack(metadata)) while True: - msg = await self.websocket.receive() + try: + msg = await asyncio.wait_for( + self.websocket.receive(), + timeout=self._idle_timeout, + ) + except asyncio.TimeoutError: + logger.info("Robot OpenPI connection idle timeout after %.1f seconds", self._idle_timeout) + try: + await self.websocket.close() + except Exception: + logger.debug("Failed to close idle robot OpenPI websocket", exc_info=True) + return if msg.get("type") == "websocket.disconnect": break @@ -74,7 +98,16 @@ async def handle_connection(self) -> None: continue try: - obs = _unpack(msg["bytes"]) + obs = self._unpack_request(msg["bytes"]) + except Exception: + logger.exception("Invalid robot OpenPI request payload") + try: + await self._send_error("Invalid request payload") + except Exception: + break + continue + + try: endpoint = obs.pop("endpoint", "infer") if endpoint == "reset": @@ -86,7 +119,7 @@ async def handle_connection(self) -> None: except Exception: logger.exception("Error handling request") try: - await self.websocket.send_text(traceback.format_exc()) + await self._send_error("Internal inference error") except Exception: break diff --git a/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py b/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py index 5b92c79bb7d..7200afe99be 100644 --- a/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py +++ b/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py @@ -3,65 +3,127 @@ """Serving layer for robot policy inference via `/v1/realtime/robot/openpi`. -Flow: raw obs → transform (dataset key mapping) → unified obs → -`DiffusionEngine.step()` → actions. -Transform is stateless and selected per-request via `obs["embodiment"]`. +Flow: raw obs → `DiffusionEngine.step()` → actions. +DreamZero owns dataset transforms inside the diffusion pipeline. 
""" from __future__ import annotations +from collections.abc import Mapping +from dataclasses import dataclass from typing import Any import numpy as np -import torch +from omegaconf import OmegaConf from vllm.logger import init_logger -from vllm_omni.entrypoints.openai.realtime.robot.transform.base import ( - RobotPolicyTransform, - get_transform, -) - logger = init_logger(__name__) -# Default embodiment when not specified in obs -DEFAULT_EMBODIMENT = "roboarena" + +def _to_builtin_container(value: Any) -> Any: + if OmegaConf.is_config(value): + return OmegaConf.to_container(value, resolve=True) + if isinstance(value, Mapping): + return {key: _to_builtin_container(item) for key, item in value.items()} + if isinstance(value, (list, tuple)): + return [_to_builtin_container(item) for item in value] + return value + +@dataclass(frozen=True) +class PolicyServerConfig: + """OpenPI policy server handshake config. + + Values are model-specific and must be provided by the loaded policy model. + """ + + values: dict[str, Any] + + @classmethod + def from_model_config(cls, model_config: Any) -> "PolicyServerConfig": + if isinstance(model_config, Mapping): + raw_config = model_config.get("policy_server_config") + else: + raw_config = getattr(model_config, "policy_server_config", None) + + if raw_config is None: + raise ValueError("Robot OpenPI serving requires policy_server_config.") + if isinstance(raw_config, cls): + return raw_config + if not isinstance(raw_config, Mapping): + raise TypeError("policy_server_config must be a dict.") + return cls(_to_builtin_container(raw_config)) + + def to_dict(self) -> dict[str, Any]: + return _to_builtin_container(self.values) class ServingRealtimeRobotOpenPI: """Robot policy serving layer for OpenPI protocol. - Stateless transform routes by obs["embodiment"]. - Model-specific state (frame buffer, KV cache) lives in pipeline. + Model-specific transform/state lives in the diffusion pipeline. 
""" def __init__( self, engine_client: Any, model_name: str | None = None, - default_embodiment: str = DEFAULT_EMBODIMENT, ) -> None: self.engine_client = engine_client self.model_name = model_name - self.default_embodiment = default_embodiment self._current_session_id: str | None = None self._call_count = 0 + self.policy_server_config = self._get_policy_server_config(engine_client) - # Ensure default transforms are registered - self._ensure_transforms_loaded() + @classmethod + def create_policy_server( + cls, + engine_client: Any, + model_name: str | None = None, + ) -> "ServingRealtimeRobotOpenPI | None": + try: + return cls(engine_client=engine_client, model_name=model_name) + except ValueError as exc: + if "policy_server_config" not in str(exc): + raise + logger.info("Robot OpenPI serving disabled for model %s", model_name) + return None @staticmethod - def _ensure_transforms_loaded() -> None: - """Import transform modules to trigger register_transform calls.""" - import vllm_omni.entrypoints.openai.realtime.robot.transform.droid # noqa: F401 - import vllm_omni.entrypoints.openai.realtime.robot.transform.roboarena # noqa: F401 + def _get_policy_server_config(engine_client: Any) -> PolicyServerConfig: + model_config = None + get_od_config = getattr(engine_client, "get_diffusion_od_config", None) + if callable(get_od_config): + od_config = get_od_config() + model_config = getattr(od_config, "model_config", None) + + if model_config is None: + for stage_config in getattr(engine_client, "stage_configs", []) or []: + if getattr(stage_config, "stage_type", None) != "diffusion": + continue + engine_args = getattr(stage_config, "engine_args", None) + model_config = getattr(engine_args, "model_config", None) + if model_config is not None: + break + + if model_config is None: + od_config = getattr(engine_client, "od_config", None) + model_config = getattr(od_config, "model_config", None) + + if model_config is None: + model_config = getattr(engine_client, "model_config", 
None) + return PolicyServerConfig.from_model_config(model_config) def reset(self, obs: dict) -> None: - """Reset session state.""" + """Reset serving state. + + Engine-side DreamZero state is reset on the next inference request via + `extra_args["reset"]`, not by an immediate websocket-side RPC. + """ self._call_count = 0 self._current_session_id = None async def infer(self, obs: dict) -> np.ndarray: - """raw obs → transform → engine → actions.""" + """raw obs → engine → actions.""" # Session tracking session_id = obs.get("session_id") if session_id is not None and session_id != self._current_session_id: @@ -72,13 +134,12 @@ async def infer(self, obs: dict) -> np.ndarray: self._call_count += 1 - # Transform: dataset format → unified format - transform = self._get_transform(obs) - unified_obs = transform.transform_input(obs) - # Build request, run inference through AsyncOmni - request = self._build_request(unified_obs) + request = self._build_request(obs) result = None + # OpenPI policy serving is one request -> one action reply. AsyncOmni + # exposes an async iterator, so consume it to completion and use the + # final output, matching other non-streaming OpenAI serving paths. async for output in self.engine_client.generate( prompt=request.prompts[0], request_id=request.request_ids[0], @@ -88,16 +149,10 @@ async def infer(self, obs: dict) -> np.ndarray: if result is None: raise RuntimeError("Robot OpenPI request produced no output.") - # Extract actions (via transform or default) - return self._extract_actions(result, transform) + return self._extract_actions(result) - def _get_transform(self, obs: dict) -> RobotPolicyTransform: - """Select transform by obs['embodiment'] or default.""" - embodiment = obs.get("embodiment", self.default_embodiment) - return get_transform(embodiment) - - def _build_request(self, unified_obs: dict) -> Any: - """Build engine request from unified obs. 
+ def _build_request(self, obs: dict) -> Any: + """Build engine request from raw robot obs. Returns an `OmniDiffusionRequest` payload consumed by `AsyncOmni.generate()` and routed to the diffusion stage. @@ -105,13 +160,16 @@ def _build_request(self, unified_obs: dict) -> Any: from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.inputs.data import OmniDiffusionSamplingParams + # `_call_count` is reset by websocket reset/session switches, then + # incremented before this request is built. DreamZero pipeline consumes + # this flag and clears its frame buffer / KV cache before accumulation. extra_args = { "reset": self._call_count <= 1, "session_id": self._current_session_id or "default", - "unified_obs": unified_obs, + "robot_obs": obs, } - prompt = unified_obs["prompt"] + prompt = obs.get("prompt", "") sampling_params = OmniDiffusionSamplingParams(extra_args=extra_args) return OmniDiffusionRequest( prompts=[prompt], @@ -119,14 +177,17 @@ def _build_request(self, unified_obs: dict) -> Any: request_ids=[f"robot-{self._current_session_id or 'default'}"], ) - def _extract_actions(self, result: Any, transform: RobotPolicyTransform) -> np.ndarray: + def _extract_actions(self, result: Any) -> np.ndarray: """Extract actions from engine result.""" if hasattr(result, "__iter__"): result = list(result) if result: result = result[0] - actions = transform.transform_output(result) - if isinstance(actions, torch.Tensor): - return actions.cpu().float().numpy() + if not hasattr(result, "multimodal_output") or result.multimodal_output is None: + raise RuntimeError("Missing multimodal_output in robot policy result") + + actions = result.multimodal_output.get("actions") + if actions is None: + raise RuntimeError("Missing multimodal_output['actions'] in robot policy result") return np.asarray(actions, dtype=np.float32) diff --git a/vllm_omni/entrypoints/openai/realtime/robot/transform/__init__.py b/vllm_omni/entrypoints/openai/realtime/robot/transform/__init__.py 
deleted file mode 100644 index 208f01a7cb5..00000000000 --- a/vllm_omni/entrypoints/openai/realtime/robot/transform/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index de7eb5f4c7e..578d7c04a21 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -14,6 +14,7 @@ from vllm_omni.config.stage_config import StageConfigFactory from vllm_omni.config.yaml_util import create_config, load_yaml_config, merge_configs +from vllm_omni.diffusion.utils.hf_utils import _looks_like_dreamzero from vllm_omni.entrypoints.stage_utils import _to_dict from vllm_omni.inputs.data import OmniSamplingParams from vllm_omni.platforms import current_omni_platform @@ -336,6 +337,9 @@ def resolve_model_config_path(model: str) -> str: ) default_config_path = current_omni_platform.get_default_stage_config_path() + if model_type == "vla" and _looks_like_dreamzero(model): + model_type = "dreamzero" + if model_type in _DIFFUSERS_CLASS_TO_CONFIG: normalized_model_type = _DIFFUSERS_CLASS_TO_CONFIG[model_type] else: diff --git a/vllm_omni/model_executor/stage_configs/dreamzero.yaml b/vllm_omni/model_executor/stage_configs/dreamzero.yaml new file mode 100644 index 00000000000..cd5cb9e9c54 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/dreamzero.yaml @@ -0,0 +1,21 @@ +stage_args: + - stage_id: 0 + stage_type: diffusion + runtime: + devices: "0" + engine_args: + model_stage: diffusion + model_class_name: DreamZeroPipeline + distributed_executor_backend: "mp" + max_num_seqs: 1 + model_config: + default_robot_embodiment: roboarena + policy_server_config: + image_resolution: [180, 320] + n_external_cameras: 2 + needs_wrist_camera: true + needs_stereo_camera: false + needs_session_id: true + action_space: joint_position + final_output: true + final_output_type: image From 
07fa6fd2c8cc45790bfee5ff04e5c58a8826f23f Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 19 Apr 2026 18:25:04 +0000 Subject: [PATCH 04/45] docs(dreamzero): clarify runtime envs and tighten client deps Add concise environment guidance for DreamZero serving, bundled OpenPI client usage, and DROID sim-eval rollout usage. Also guard optional client-side imports in the DreamZero example scripts so missing non-core dependencies fail with explicit messages instead of opaque import errors. Signed-off-by: Yangshen Deng Co-authored-by: Meng --- docs/models/dreamzero/README.md | 4 +- docs/models/dreamzero/quick_start.md | 9 + examples/online_serving/dreamzero/README.md | 219 +++++ .../dreamzero/droid_sim_eval_client.py | 842 ++++++++++++++++++ .../online_serving/dreamzero/openpi_client.py | 6 +- 5 files changed, 1077 insertions(+), 3 deletions(-) create mode 100644 examples/online_serving/dreamzero/droid_sim_eval_client.py diff --git a/docs/models/dreamzero/README.md b/docs/models/dreamzero/README.md index 0b6cb624af0..49791db386f 100644 --- a/docs/models/dreamzero/README.md +++ b/docs/models/dreamzero/README.md @@ -1,4 +1,4 @@ # DreamZero -- `docs/models/dreamzero/quick_start.md`: quick start, standard e2e/example entry points, and optional upstream parity checks -- `examples/online_serving/dreamzero/README.md`: self-contained OpenPI server/client example with bundled real videos +- `docs/models/dreamzero/quick_start.md`: quick start, environment checklist, standard e2e/example entry points, and optional upstream parity checks +- `examples/online_serving/dreamzero/README.md`: self-contained OpenPI server/client example with bundled real videos, plus per-script environment / dependency requirements diff --git a/docs/models/dreamzero/quick_start.md b/docs/models/dreamzero/quick_start.md index 9b97d54b65f..8b674b482fa 100644 --- a/docs/models/dreamzero/quick_start.md +++ b/docs/models/dreamzero/quick_start.md @@ -10,6 +10,15 @@ For the self-contained example, use the 
bundled client and videos under Upstream DreamZero-dependent parity checks are optional and live under `tests/dreamzero/upstream/`. +## Environment checklist + +- Sections 1-4: use the local `vllm-omni` environment. +- Bundled OpenPI client extra deps: `openpi-client`, `websockets`, `opencv-python` +- DROID sim-eval client: use an external Isaac Lab / `sim-evals` environment, + plus `openpi-client`, `websockets`, `opencv-python`, and `mediapy` +- Optional upstream parity tests: also require `~/code/dreamzero` and + `~/code/dreamzero/checkpoints/dreamzero` + --- ## 1. Start the vLLM DreamZero server diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md index cc5b48da1f6..7d8ff099442 100644 --- a/examples/online_serving/dreamzero/README.md +++ b/examples/online_serving/dreamzero/README.md @@ -7,8 +7,36 @@ compatible OpenPI websocket client using bundled real camera videos. - `run_server.sh`: launch DreamZero OpenPI serving - `openpi_client.py`: websocket client that sends real observations +- `droid_sim_eval_client.py`: DROID `sim-evals` rollout client for the vLLM OpenPI server - `assets/`: minimal real camera videos used by the example +## Environment requirements + +- `run_server.sh`, `vllm serve`, `openpi_client.py`, and the standard example/e2e tests: + use the local `vllm-omni` environment. 
+- `openpi_client.py` extra deps: + +```bash +pip install openpi-client websockets opencv-python +``` + +- `droid_sim_eval_client.py` must run in an external Isaac Lab / `sim-evals` + environment, and also needs: + +```bash +pip install openpi-client websockets opencv-python mediapy +``` + +- On Python `< 3.12`, also install: + +```bash +pip install typing-extensions +``` + +- Optional `tests/dreamzero/upstream/*` parity tests also require: + - local upstream repo at `~/code/dreamzero` + - local checkpoint at `~/code/dreamzero/checkpoints/dreamzero` + ## Start the server From the repository root: @@ -34,6 +62,11 @@ The websocket endpoint is: From the repository root: +Environment: + +- run this in the `vllm-omni` repo environment +- if imports are missing, install `openpi-client`, `websockets`, and `opencv-python` + ```bash python examples/online_serving/dreamzero/openpi_client.py \ --host 127.0.0.1 \ @@ -54,6 +87,192 @@ It validates: - finite action values - reset response +## Run DROID sim-eval against the vLLM server + +This is the closest setup to an end-to-end simulated policy rollout. + +### 1. Start the vLLM DreamZero server + +From the repository root: + +Environment: + +- run this in the `vllm-omni` repo environment +- no extra DreamZero-specific client package is needed for the server itself + +```bash +CUDA_VISIBLE_DEVICES=0 \ +ATTENTION_BACKEND=torch \ +DIFFUSION_ATTENTION_BACKEND=TORCH_SDPA \ +vllm serve \ + GEAR-Dreams/DreamZero-DROID \ + --omni \ + --host 127.0.0.1 \ + --port 8000 \ + --served-model-name dreamzero-droid \ + --enforce-eager +``` + +### 2. Start the DROID simulation client + +This step runs from an external `sim-evals` checkout, because Isaac Lab / +Isaac Sim assets and environment registration live there. + +Like upstream DreamZero, this client does not guess any local path for +`sim_evals` or Isaac Lab. Run it from an environment where those packages are +already importable (for example, the `sim-evals` project environment). 
+ +Environment: + +- do **not** run this from the plain `vllm-omni` env unless it already has Isaac Lab and `sim_evals` +- launch it from the Isaac Lab / `sim-evals` environment +- make sure the following imports work there: + - `isaaclab` + - `isaaclab_tasks` + - `sim_evals` + - `gymnasium` + - `openpi_client` + - `websockets` + - `cv2` + - `mediapy` + +Example command: + +```bash +cd /path/to/sim-evals + +CUDA_VISIBLE_DEVICES=1 \ +./submodules/IsaacLab/isaaclab.sh -p \ + /path/to/vllm-omni/examples/online_serving/dreamzero/droid_sim_eval_client.py \ + --host 127.0.0.1 \ + --port 8000 \ + --scene 1 \ + --episodes 1 \ + --headless \ + --device cuda:0 +``` + +Notes: + +- `CUDA_VISIBLE_DEVICES=1` keeps Isaac Sim off the GPU used by the vLLM server. +- `--scene` selects one of the built-in DROID tasks: + - `1`: `put the cube in the bowl` + - `2`: `pick up the can and put it in the mug` + - `3`: `put the banana in the bin` +- The client keeps the upstream DreamZero sim-eval behavior: + - DROID observation extraction from `external_cam`, `external_cam_2`, and `wrist_cam` + - `resize_with_pad(..., 180, 320)` + - `open_loop_horizon=8` + - 24-step action chunks with 8 action dimensions + +### Action chunk vs open-loop horizon + +DreamZero predicts longer action chunks than the number of actions the +sim-eval client executes before replanning: + +- model output action chunk: `(24, 8)` + - `24`: predicted future action horizon + - `8`: action dimension, i.e. 
7 arm joints + 1 gripper +- sim-eval execution horizon: `open_loop_horizon=8` + - after one model call, the client executes only the first `8` actions + - the remaining `16` predicted actions are not consumed + - the client then sends a fresh observation and asks the server for a new + `(24, 8)` chunk + +This follows the upstream DreamZero sim-eval client: + +- `third_party/dreamzero/eval_utils/run_sim_eval.py` defaults + `open_loop_horizon` to `8` +- DreamZero action outputs use `action_horizon=24` + +The split is intentional: `24` lets the model predict a longer future plan, +while `8` keeps execution closed-loop by replanning after roughly half a second +in the DROID simulator. + +## How the sim-eval rollout works + +At a high level, one rollout does the following: + +1. Isaac Lab loads the DROID scene and resets the environment twice. +2. `droid_sim_eval_client.py` reads the current robot observation: + - two external cameras + - one wrist camera + - 7-DoF arm joint positions + - 1-DoF gripper position +3. The client converts the observation into the DreamZero/OpenPI websocket payload: + - `observation/exterior_image_0_left` + - `observation/exterior_image_1_left` + - `observation/wrist_image_left` + - `observation/joint_position` + - `observation/cartesian_position` + - `observation/gripper_position` + - `prompt` + - `session_id` +4. vLLM DreamZero returns one action chunk with shape `(24, 8)`. +5. The sim client consumes that chunk in open loop for `8` control steps. +6. After the local chunk budget is exhausted, the client requests the next action chunk. +7. This repeats until the environment hits its time limit. + +The current DROID sim environment does not expose a built-in task success flag, +so the rollout result should be judged primarily from the video and the final +trajectory JSON. 
+ +## How to read the `runs/` outputs + +By default the client writes results under: + +- `runs/dreamzero_sim_eval/_/` + +The key files are: + +- `episode_00.mp4` + - the rollout video + - this is the first file to inspect +- `episode_00.json` + - per-step trace for one episode + - includes: + - `prompt` + - `steps_executed` + - `server_calls` + - `episode_wall_time_s` + - `server_time_s` + - `trajectory` +- `summary.json` + - top-level run summary across episodes + - includes: + - scene id + - prompt + - server metadata + - per-episode summaries + +Inside `episode_00.json`, the `trajectory` list contains one entry per control +step. Each entry records: + +- `step_index`: control step index +- `used_server_call`: whether this step triggered a new model chunk request +- `chunk_latency_s`: model latency for that chunk request +- `action`: the 8-D action applied to the simulator +- `joint_position`: observed robot joints before the next step +- `gripper_position`: observed gripper state +- `reward`, `terminated`, `truncated` + +Practical reading order: + +1. watch `episode_00.mp4` +2. open `summary.json` and check: + - prompt + - total steps + - total wall time + - total model time + - number of server calls +3. if the behavior looks odd, inspect `episode_00.json` + - check whether actions saturate + - check whether the robot stalls + - check how often a new chunk was requested + +For GitHub issues / PR comments, you can also convert `episode_00.mp4` to a GIF +with `ffmpeg` and attach it directly. 
+ ## Optional upstream parity checks The upstream DreamZero-dependent parity tests are kept under: diff --git a/examples/online_serving/dreamzero/droid_sim_eval_client.py b/examples/online_serving/dreamzero/droid_sim_eval_client.py new file mode 100644 index 00000000000..4518f23eac2 --- /dev/null +++ b/examples/online_serving/dreamzero/droid_sim_eval_client.py @@ -0,0 +1,842 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Run one or more DROID sim-eval rollouts against the vLLM DreamZero server. + +This script is the vLLM/OpenPI adaptation of the upstream DreamZero sim-eval +client: +`third_party/dreamzero/eval_utils/run_sim_eval.py` + +Behavior intentionally kept close to upstream: +- same DROID observation extraction (`external_cam`, `external_cam_2`, + `wrist_cam`, joint position, gripper position) +- same resize-with-pad preprocessing to `(180, 320)` +- same `open_loop_horizon=8` +- same gripper binarization rule (`> 0.5 -> 1`, else `0`) +- same per-scene language prompts + +Unlike upstream DreamZero, vLLM serves the compatible websocket policy endpoint +at `/v1/realtime/robot/openpi`, so this script includes the path suffix in the +client URI. 
+ +Run this script through Isaac Lab's launcher from the external `sim-evals` +checkout, for example: + + cd /path/to/sim-evals + ./submodules/IsaacLab/isaaclab.sh -p \ + /path/to/vllm-omni/examples/online_serving/dreamzero/droid_sim_eval_client.py \ + --host 127.0.0.1 \ + --port 8000 \ + --scene 1 \ + --episodes 1 \ + --headless \ + --device cuda:1 +""" + +from __future__ import annotations + +import argparse +import json +import logging +import sys +import time +import uuid +from dataclasses import dataclass +from datetime import datetime, UTC +from pathlib import Path +from typing import Any + +import numpy as np +import torch + +try: + import cv2 +except ImportError as exc: # pragma: no cover - runtime dependency guard + raise ImportError( + "DreamZero sim-eval client requires `opencv-python`." + ) from exc + +try: + import mediapy +except ImportError as exc: # pragma: no cover - runtime dependency guard + raise ImportError( + "DreamZero sim-eval client requires `mediapy`." + ) from exc + +try: + from typing import override +except ImportError: + try: + from typing_extensions import override + except ImportError as exc: # pragma: no cover - runtime dependency guard + raise ImportError( + "DreamZero sim-eval client requires `typing-extensions` on Python < 3.12." + ) from exc + +try: + import websockets.sync.client +except ImportError as exc: # pragma: no cover - runtime dependency guard + raise ImportError( + "DreamZero sim-eval client requires `websockets`." + ) from exc + +# NOTE: +# This directory already contains a local file named `openpi_client.py`. +# However, what we want here is the *installed* `openpi_client` package from +# upstream OpenPI, not the sibling example file. When a script is executed +# directly, Python often puts the script directory at `sys.path[0]`, which +# would cause `import openpi_client` to resolve to the local example file and +# create a circular import.
+# +# To avoid that ambiguity, temporarily remove the current example directory +# from the front of `sys.path`, import the real package, and then restore the +# path afterwards. +example_dir = str(Path(__file__).resolve().parent) +removed_path = False +if sys.path and sys.path[0] == example_dir: + sys.path.pop(0) + removed_path = True +try: + from openpi_client import image_tools, msgpack_numpy + from openpi_client.base_policy import BasePolicy +except ImportError as exc: # pragma: no cover - runtime dependency guard + raise ImportError( + "DreamZero sim-eval client requires the optional `openpi-client` package." + ) from exc +finally: + if removed_path: + sys.path.insert(0, example_dir) + +# ----------------------------------------------------------------------------- +# Constants +# ----------------------------------------------------------------------------- +# +# These values intentionally mirror the upstream DreamZero sim-eval client +# where possible. The important distinction is: +# +# - ACTION_HORIZON = 24 +# The model returns 24 future actions per inference call. +# - DEFAULT_OPEN_LOOP_HORIZON = 8 +# The sim client only executes the first 8 actions locally before asking +# the server to replan from a fresh observation. +# +# So a single server call predicts a 24x8 chunk (24 actions, 8 dims each), +# but the rollout executes only the first 8 actions before replanning. +PING_INTERVAL_SECS = 60 +PING_TIMEOUT_SECS = 600 +DEFAULT_PATH = "/v1/realtime/robot/openpi" +DEFAULT_OPEN_LOOP_HORIZON = 8 +ACTION_HORIZON = 24 +ACTION_DIM = 8 +DEFAULT_OUTPUT_ROOT = Path("runs") / "dreamzero_sim_eval" +SCENE_PROMPTS = { + 1: "put the cube in the bowl", + 2: "pick up the can and put it in the mug", + 3: "put the banana in the bin", +} + + +@dataclass(frozen=True) +class StepRecord: + """One fully materialized rollout step for later JSON export. + + The `episode_00.json` artifact is intended to be human-readable and + post-process-friendly.
Instead of keeping raw tensors around, each step is + flattened into plain Python types so it can be serialized directly. + """ + + # Index of this control step within the episode. + step_index: int + # Whether this step triggered a fresh model call (as opposed to reusing the + # cached open-loop chunk from the previous server response). + used_server_call: bool + # End-to-end latency of the server call that produced the current chunk. + # This is `None` on steps that only reuse cached actions. + chunk_latency_s: float | None + # The concrete 8-D action sent into the simulator at this step. + action: list[float] + # Observed 7-DoF arm joint positions before the next environment step. + joint_position: list[float] + # Observed gripper scalar before the next environment step. + gripper_position: list[float] + # Reward and termination signals directly returned by the simulator. + reward: float + terminated: bool + truncated: bool + # Optional scene object positions for downstream debugging / success + # heuristics. This may be empty if the environment does not expose them. + scene_objects: dict[str, list[float]] + + +class OpenPIWebsocketClientPolicy(BasePolicy): + """Minimal websocket client for the DreamZero/OpenPI policy protocol. + + Protocol shape: + - connect -> server immediately sends a metadata dict + - infer -> send msgpack observation, receive action chunk + - reset -> send msgpack reset command, receive confirmation string + + This class intentionally stays very small because the more interesting + DreamZero-specific behavior lives one layer above, in + `DreamZeroJointPosClient`. + """ + + def __init__( + self, + host: str = "127.0.0.1", + port: int = 8000, + path: str = DEFAULT_PATH, + ) -> None: + # vLLM serves the robot endpoint under `/v1/realtime/robot/openpi`. + self._uri = f"ws://{host}:{port}{path}" + # Upstream protocol uses msgpack with numpy support, not JSON. 
+ self._packer = msgpack_numpy.Packer() + # Connect immediately and cache the server handshake metadata. + self._ws, self._server_metadata = self._wait_for_server() + + def get_server_metadata(self) -> dict[str, Any]: + """Return a copy of the server handshake metadata.""" + + return dict(self._server_metadata) + + def _wait_for_server(self): + """Connect to the websocket server and read the initial metadata frame.""" + + logging.info("Connecting to %s", self._uri) + conn = websockets.sync.client.connect( + self._uri, + compression=None, + max_size=None, + ping_interval=PING_INTERVAL_SECS, + ping_timeout=PING_TIMEOUT_SECS, + ) + metadata = msgpack_numpy.unpackb(conn.recv()) + if not isinstance(metadata, dict): + raise TypeError(f"Expected dict metadata from server, got {type(metadata)!r}") + return conn, metadata + + @override + def infer(self, obs: dict[str, Any]) -> np.ndarray: + """Send an inference request and return the decoded action chunk.""" + + # Keep the upstream DreamZero/OpenPI convention that the request itself + # tells the server which logical endpoint is being called. 
+ payload = dict(obs) + payload["endpoint"] = "infer" + self._ws.send(self._packer.pack(payload)) + response = self._ws.recv() + if isinstance(response, str): + raise RuntimeError(f"Error in inference server:\n{response}") + return np.asarray(msgpack_numpy.unpackb(response), dtype=np.float32) + + @override + def reset(self, reset_info: dict[str, Any] | None = None) -> str: + """Tell the server to reset its session-side state.""" + + payload = dict(reset_info or {}) + payload["endpoint"] = "reset" + self._ws.send(self._packer.pack(payload)) + response = self._ws.recv() + if not isinstance(response, str): + raise RuntimeError(f"Unexpected reset response: {type(response)!r}") + return response + + def close(self) -> None: + """Close the websocket connection explicitly.""" + + self._ws.close() + + +class DreamZeroJointPosClient: + """DROID sim-eval client that talks to the vLLM OpenPI websocket server. + + This is the main compatibility layer between: + - Isaac Lab DROID observations (`obs["policy"][...]`) + - DreamZero/OpenPI websocket payloads + - local open-loop action reuse across several simulator steps + + In other words: + simulator obs -> websocket request -> action chunk -> one action per step + """ + + def __init__( + self, + remote_host: str = "127.0.0.1", + remote_port: int = 8000, + path: str = DEFAULT_PATH, + open_loop_horizon: int = DEFAULT_OPEN_LOOP_HORIZON, + ) -> None: + # Low-level transport client. + self.client = OpenPIWebsocketClientPolicy(remote_host, remote_port, path=path) + # Number of actions to execute locally before replanning. + self.open_loop_horizon = open_loop_horizon + # Cursor into the currently cached action chunk. + self.actions_from_chunk_completed = 0 + # Most recent `(ACTION_HORIZON, ACTION_DIM)` server response. + self.pred_action_chunk: np.ndarray | None = None + # Session id is part of the DreamZero serving contract. Changing it + # causes the server side to treat the rollout as a fresh episode. 
+ self.session_id = str(uuid.uuid4()) + # Simple runtime stats for reporting. + self.server_calls = 0 + self.last_chunk_latency_s: float | None = None + self.last_used_server_call = False + + def metadata(self) -> dict[str, Any]: + """Expose the server metadata to callers / logs.""" + + return self.client.get_server_metadata() + + def reset(self) -> str: + """Reset local chunk state and remote session state. + + Local reset: + - drop cached action chunk + - rewind chunk cursor + - allocate a fresh session id + + Remote reset: + - send a websocket `reset` message so the server can clear any + request/session-side state it associates with this client + """ + + self.actions_from_chunk_completed = 0 + self.pred_action_chunk = None + self.session_id = str(uuid.uuid4()) + self.last_chunk_latency_s = None + self.last_used_server_call = False + return self.client.reset({}) + + def infer(self, obs: dict[str, Any], instruction: str) -> dict[str, Any]: + """Turn one simulator observation into one executable 8-D action. + + Key behavior: + - call the server only when the local chunk cache is empty/exhausted + - otherwise, keep consuming the cached chunk open-loop + - always return exactly one 8-D action for the current simulator step + """ + + # Convert Isaac Lab observation structure into a plain numpy-friendly + # record that is easier to serialize and visualize. + curr_obs = self._extract_observation(obs) + self.last_used_server_call = False + + # Replan if: + # 1. this is the first step of a rollout / chunk + # 2. we already consumed `open_loop_horizon` actions from the current chunk + # 3. no cached chunk is currently available + if ( + self.actions_from_chunk_completed == 0 + or self.actions_from_chunk_completed >= self.open_loop_horizon + or self.pred_action_chunk is None + ): + self.actions_from_chunk_completed = 0 + # Build the exact DreamZero/OpenPI payload expected by the server. 
+ # + # Notes: + # - images are resized/padded to the serving contract's 180x320 + # - proprio is cast to float64 to match upstream client behavior + # - cartesian_position is currently unused by DreamZero DROID, so + # a dummy zero vector is sent for protocol completeness + request_data = { + "observation/exterior_image_0_left": image_tools.resize_with_pad( + curr_obs["right_image"], 180, 320 + ), + "observation/exterior_image_1_left": image_tools.resize_with_pad( + curr_obs["left_image"], 180, 320 + ), + "observation/wrist_image_left": image_tools.resize_with_pad( + curr_obs["wrist_image"], 180, 320 + ), + "observation/joint_position": curr_obs["joint_position"].astype(np.float64), + "observation/cartesian_position": np.zeros((6,), dtype=np.float64), + "observation/gripper_position": curr_obs["gripper_position"].astype(np.float64), + "prompt": instruction, + "session_id": self.session_id, + } + + # Measure end-to-end server latency for this chunk request. + start = time.perf_counter() + actions = self.client.infer(request_data) + self.last_chunk_latency_s = time.perf_counter() - start + self.last_used_server_call = True + self.server_calls += 1 + + # DreamZero DROID serving is expected to return an action chunk with + # 24 future actions, each action being 8-D. + if actions.ndim != 2: + raise AssertionError(f"Expected 2D action array, got shape {actions.shape}") + if actions.shape != (ACTION_HORIZON, ACTION_DIM): + raise AssertionError( + f"Expected action shape {(ACTION_HORIZON, ACTION_DIM)}, got {actions.shape}" + ) + self.pred_action_chunk = actions + + # Consume exactly one action row from the cached chunk for this + # simulator step. + action = np.array(self.pred_action_chunk[self.actions_from_chunk_completed], copy=True) + self.actions_from_chunk_completed += 1 + + # Upstream DreamZero sim-eval binarizes the gripper command. 
+ action[-1] = 1.0 if action[-1].item() > 0.5 else 0.0 + + # Produce a human-friendly visualization strip for videos: + # right external | wrist | left external + img1 = image_tools.resize_with_pad(curr_obs["right_image"], 224, 224) + img2 = image_tools.resize_with_pad(curr_obs["wrist_image"], 224, 224) + img3 = image_tools.resize_with_pad(curr_obs["left_image"], 224, 224) + viz = np.concatenate([img1, img2, img3], axis=1) + + # Return both the executable action and auxiliary debug info. + return { + "action": action, + "viz": viz, + "joint_position": curr_obs["joint_position"], + "gripper_position": curr_obs["gripper_position"], + "used_server_call": self.last_used_server_call, + "chunk_latency_s": self.last_chunk_latency_s if self.last_used_server_call else None, + } + + @staticmethod + def _extract_observation(obs_dict: dict[str, Any]) -> dict[str, np.ndarray]: + """Extract the pieces DreamZero cares about from Isaac Lab observations. + + `sim-evals` exposes camera frames and robot state inside the + `obs["policy"]` group. This helper converts those tensors into numpy + arrays so they can be fed into image preprocessing / websocket packing. + """ + + policy = obs_dict["policy"] + # Isaac Lab stores camera observations as batched tensors; use env 0. + right_image = policy["external_cam"][0].clone().detach().cpu().numpy() + left_image = policy["external_cam_2"][0].clone().detach().cpu().numpy() + wrist_image = policy["wrist_cam"][0].clone().detach().cpu().numpy() + # Robot proprioception. 
+ joint_position = policy["arm_joint_pos"].clone().detach().cpu().numpy() + gripper_position = policy["gripper_pos"].clone().detach().cpu().numpy() + + return { + "right_image": right_image, + "left_image": left_image, + "wrist_image": wrist_image, + "joint_position": joint_position, + "gripper_position": gripper_position, + } + + +def _scene_instruction(scene: int) -> str: + """Map a numeric scene id onto the fixed language prompt used for rollout.""" + + try: + return SCENE_PROMPTS[scene] + except KeyError as exc: + raise ValueError(f"Unsupported scene {scene}. Available scenes: {sorted(SCENE_PROMPTS)}") from exc + + +def _capture_scene_objects(env: Any) -> dict[str, list[float]]: + """Best-effort extraction of scene object root positions. + + This is only for debugging / reporting. The rollout logic itself does not + depend on these positions. + """ + + objects: dict[str, list[float]] = {} + scene = getattr(env, "scene", None) + if scene is None: + return objects + + # Skip non-task entities such as cameras, the robot, and lighting. + for name in scene.keys(): + if name in {"robot", "external_cam", "external_cam_2", "wrist_cam", "sphere_light", "scene"}: + continue + entity = scene[name] + data = getattr(entity, "data", None) + root_pos_w = getattr(data, "root_pos_w", None) + if root_pos_w is None: + continue + # Convert tensors to plain lists for JSON serialization. + value = root_pos_w[0].detach().cpu().to(torch.float32).tolist() + objects[str(name)] = [float(x) for x in value] + return objects + + +def _maybe_infer_success(scene: int, final_objects: dict[str, list[float]]) -> dict[str, Any]: + """Best-effort geometric heuristic. + + The simulator itself does not expose a built-in success term; this function + provides a transparent fallback for human-readable reporting only. 
+ """ + + task_pairs = { + 1: ("cube", "bowl"), + 2: ("can", "mug"), + 3: ("banana", "bin"), + } + source_name, target_name = task_pairs.get(scene, (None, None)) + if source_name not in final_objects or target_name not in final_objects: + return { + "has_builtin_success": False, + "heuristic_success": None, + "reason": "scene object names unavailable for heuristic", + } + + source = np.asarray(final_objects[source_name], dtype=np.float32) + target = np.asarray(final_objects[target_name], dtype=np.float32) + xy_distance = float(np.linalg.norm(source[:2] - target[:2])) + z_delta = float(source[2] - target[2]) + heuristic_success = bool(xy_distance < 0.12 and z_delta > -0.08) + return { + "has_builtin_success": False, + "heuristic_success": heuristic_success, + "xy_distance": xy_distance, + "z_delta": z_delta, + "source_object": source_name, + "target_object": target_name, + "reason": "no built-in env success flag; using final object pose heuristic", + } + + +def _dump_json(path: Path, payload: dict[str, Any]) -> None: + """Write a JSON file with stable UTF-8 formatting.""" + + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + +def _scalar_from_env_value(value: Any) -> float: + """Normalize simulator scalar outputs into a plain Python float. + + Isaac Lab / Gym values may come back as tensors, numpy arrays, tuples, or + direct Python scalars depending on the wrapper stack. Centralizing the + conversion here makes the rollout loop cleaner and more robust. 
+ """ + + if isinstance(value, torch.Tensor): + return float(value.reshape(-1)[0].detach().cpu().item()) + if isinstance(value, np.ndarray): + return float(value.reshape(-1)[0]) + if isinstance(value, (list, tuple)): + return float(np.asarray(value).reshape(-1)[0]) + return float(value) + + +def _bool_from_env_value(value: Any) -> bool: + """Normalize simulator boolean-like outputs into a plain Python bool.""" + + if isinstance(value, torch.Tensor): + return bool(value.reshape(-1)[0].detach().cpu().item()) + if isinstance(value, np.ndarray): + return bool(value.reshape(-1)[0]) + if isinstance(value, (list, tuple)): + return bool(np.asarray(value).reshape(-1)[0]) + return bool(value) + + +def _make_output_dir(output_root: Path, scene: int) -> Path: + """Create a timestamped output directory for one sim-eval run.""" + + timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") + output_dir = output_root / f"scene{scene}_{timestamp}" + output_dir.mkdir(parents=True, exist_ok=True) + return output_dir + + +def main() -> None: + """Entry point for one or more DROID simulation rollouts. + + High-level flow: + 1. parse command-line flags + 2. bootstrap Isaac Lab / sim-evals imports + 3. create the DROID environment + 4. connect the DreamZero websocket client + 5. run `episodes` rollouts + 6. export videos + JSON summaries + """ + + # Script-level arguments. 
+ parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--episodes", type=int, default=1, help="Number of episodes to run.") + parser.add_argument("--scene", type=int, default=1, help="DROID scene id (1/2/3).") + parser.add_argument("--host", type=str, default="127.0.0.1", help="vLLM DreamZero server host.") + parser.add_argument("--port", type=int, default=8000, help="vLLM DreamZero server port.") + parser.add_argument("--path", type=str, default=DEFAULT_PATH, help="Websocket path suffix.") + parser.add_argument( + "--open-loop-horizon", + type=int, + default=DEFAULT_OPEN_LOOP_HORIZON, + help="How many actions to consume locally before requesting the next chunk.", + ) + parser.add_argument( + "--output-root", + type=Path, + default=DEFAULT_OUTPUT_ROOT, + help="Directory where videos and trajectory logs are stored.", + ) + + try: + from isaaclab.app import AppLauncher + except ImportError as exc: # pragma: no cover - runtime dependency guard + raise ImportError( + "DreamZero sim-eval client requires Isaac Lab (`isaaclab`). " + "Launch it from an Isaac Lab environment, e.g. via `isaaclab.sh -p`." + ) from exc + + # Let Isaac Lab inject its own runtime flags (e.g. headless, device). + AppLauncher.add_app_launcher_args(parser) + args = parser.parse_args() + + # DreamZero sim-eval always needs camera observations enabled. + args.enable_cameras = True + # Boot Isaac Sim / Isaac Lab. + app_launcher = AppLauncher(args) + simulation_app = app_launcher.app + # Set defaults so the `finally` block can clean up safely even if an + # earlier step fails. + env = None + client = None + + # Import simulator modules only *after* the app is launched. This matches + # Isaac Lab's required import ordering. + try: + import gymnasium as gym + except ImportError as exc: # pragma: no cover - runtime dependency guard + raise ImportError( + "DreamZero sim-eval client requires `gymnasium`." 
+ ) from exc + + try: + import sim_evals.environments # noqa: F401 + except ImportError as exc: # pragma: no cover - runtime dependency guard + raise ImportError( + "DreamZero sim-eval client requires the external `sim-evals` package " + "or checkout to be importable." + ) from exc + + try: + from isaaclab_tasks.utils import parse_env_cfg + except ImportError as exc: # pragma: no cover - runtime dependency guard + raise ImportError( + "DreamZero sim-eval client requires `isaaclab_tasks`." + ) from exc + + # Resolve output location and scene prompt. + output_dir = _make_output_dir(args.output_root.expanduser().resolve(), args.scene) + instruction = _scene_instruction(args.scene) + + # Build the DROID environment configuration from `sim-evals`. + env_cfg = parse_env_cfg( + "DROID", + device=args.device, + num_envs=1, + use_fabric=True, + ) + # Select one of the pre-authored scenes/tasks. + env_cfg.set_scene(args.scene) + env = gym.make("DROID", cfg=env_cfg) + + # Upstream sim-evals resets twice so materials / cameras are fully ready. + obs, _ = env.reset() + obs, _ = env.reset() + + # Connect the websocket policy client. + client = DreamZeroJointPosClient( + remote_host=args.host, + remote_port=args.port, + path=args.path, + open_loop_horizon=args.open_loop_horizon, + ) + + # Aggregated per-run results. + all_episode_summaries: list[dict[str, Any]] = [] + max_steps = int(env.env.max_episode_length) + logging.info("DreamZero metadata: %s", client.metadata()) + logging.info("Scene %s prompt: %s", args.scene, instruction) + logging.info("Writing outputs to %s", output_dir) + + try: + # No gradients are needed in inference-only rollout mode. + with torch.no_grad(): + for episode_index in range(args.episodes): + # Per-episode collectors. 
+ frames: list[np.ndarray] = [] + step_records: list[StepRecord] = [] + episode_start = time.perf_counter() + server_time_s = 0.0 + final_reward = 0.0 + terminated = False + truncated = False + + for step_index in range(max_steps): + # Ask the policy for the next action. Internally this may or + # may not trigger a real server request depending on whether + # the local chunk cache has been exhausted. + logging.debug("Episode %d step %d: requesting action", episode_index, step_index) + result = client.infer(obs, instruction) + logging.debug( + "Episode %d step %d: got action (server_call=%s latency=%s)", + episode_index, + step_index, + result["used_server_call"], + result["chunk_latency_s"], + ) + # Save one visualization frame per simulator step. + frames.append(result["viz"]) + + # Isaac Lab expects batched actions, hence `[None]`. + action_tensor = torch.tensor(result["action"], dtype=torch.float32)[None] + logging.debug("Episode %d step %d: stepping env", episode_index, step_index) + obs, reward, term, trunc, info = env.step(action_tensor) + logging.debug("Episode %d step %d: env.step returned", episode_index, step_index) + logging.debug("Episode %d step %d: parsing reward/flags", episode_index, step_index) + logging.debug( + "Episode %d step %d: raw types reward=%s term=%s trunc=%s", + episode_index, + step_index, + type(reward).__name__, + type(term).__name__, + type(trunc).__name__, + ) + # Normalize environment outputs into plain Python scalars so + # the rest of the code does not depend on wrapper-specific types. + reward_value = _scalar_from_env_value(reward) + term_value = _bool_from_env_value(term) + trunc_value = _bool_from_env_value(trunc) + logging.debug( + "Episode %d step %d: parsed reward=%s term=%s trunc=%s", + episode_index, + step_index, + reward_value, + term_value, + trunc_value, + ) + # Keep scene-object capture optional. 
It is useful for + # debugging / success heuristics, but the rollout should not + # fail if the environment does not expose object roots. + scene_objects = _capture_scene_objects(env) + + # Accumulate total server-side time only on steps that + # triggered a fresh chunk inference. + if result["chunk_latency_s"] is not None: + server_time_s += float(result["chunk_latency_s"]) + + # Materialize one JSON-serializable trajectory record. + logging.debug("Episode %d step %d: appending trajectory", episode_index, step_index) + step_records.append( + StepRecord( + step_index=step_index, + used_server_call=bool(result["used_server_call"]), + chunk_latency_s=( + float(result["chunk_latency_s"]) + if result["chunk_latency_s"] is not None + else None + ), + action=[float(x) for x in np.asarray(result["action"], dtype=np.float32).tolist()], + joint_position=[ + float(x) + for x in np.asarray(result["joint_position"], dtype=np.float32).tolist() + ], + gripper_position=[ + float(x) + for x in np.asarray(result["gripper_position"], dtype=np.float32).tolist() + ], + reward=reward_value, + terminated=term_value, + truncated=trunc_value, + scene_objects=scene_objects, + ) + ) + logging.debug("Episode %d step %d: trajectory appended", episode_index, step_index) + + # Track final status for summary export. + final_reward = reward_value + terminated = term_value + truncated = trunc_value + if term_value or trunc_value: + # End the rollout early if the environment terminates. + break + + # Episode-level timing and video export. + episode_wall_time_s = time.perf_counter() - episode_start + video_path = output_dir / f"episode_{episode_index:02d}.mp4" + logging.info("Episode %d: writing video to %s", episode_index, video_path) + mediapy.write_video(video_path, frames, fps=15) + + # Reset the policy server between episodes. 
+ logging.info("Episode %d: sending reset", episode_index) + reset_response = client.reset() + final_objects = step_records[-1].scene_objects if step_records else {} + success_report = _maybe_infer_success(args.scene, final_objects) + + # Assemble the per-episode summary that is written to + # `episode_XX.json`. + episode_summary = { + "episode_index": episode_index, + "prompt": instruction, + "video_path": str(video_path), + "steps_executed": len(step_records), + "max_steps": max_steps, + "terminated": terminated, + "truncated": truncated, + "final_reward": final_reward, + "server_calls": client.server_calls, + "server_time_s": server_time_s, + "episode_wall_time_s": episode_wall_time_s, + "avg_server_time_per_call_s": ( + server_time_s / client.server_calls if client.server_calls else None + ), + "reset_response": reset_response, + "success_report": success_report, + "server_metadata": client.metadata(), + "trajectory": [record.__dict__ for record in step_records], + } + _dump_json(output_dir / f"episode_{episode_index:02d}.json", episode_summary) + all_episode_summaries.append(episode_summary) + + logging.info( + "Episode %d done: steps=%d wall=%.2fs server_calls=%d heuristic_success=%s", + episode_index, + len(step_records), + episode_wall_time_s, + client.server_calls, + success_report.get("heuristic_success"), + ) + + # Reset per-episode counters while keeping the client alive. + client.server_calls = 0 + + # Top-level run summary across all episodes. + summary = { + "scene": args.scene, + "prompt": instruction, + "episodes": args.episodes, + "host": args.host, + "port": args.port, + "path": args.path, + "device": args.device, + "output_dir": str(output_dir), + "server_metadata": client.metadata(), + "episodes_summary": all_episode_summaries, + } + _dump_json(output_dir / "summary.json", summary) + # Also print the summary to stdout so the caller can capture it in logs. 
+ print(json.dumps(summary, ensure_ascii=False, indent=2)) + finally: + # Best-effort cleanup. Avoid masking the main error if cleanup fails. + try: + if client is not None: + client.client.close() + except Exception: + pass + if env is not None: + env.close() + simulation_app.close() + + +if __name__ == "__main__": + # Keep the script-level logs readable. Per-step rollout details are still + # available via `DEBUG` if needed, but websocket / asyncio internals are + # usually too noisy for normal usage. + logging.basicConfig(level=logging.INFO) + logging.getLogger("websockets").setLevel(logging.WARNING) + logging.getLogger("asyncio").setLevel(logging.WARNING) + main() diff --git a/examples/online_serving/dreamzero/openpi_client.py b/examples/online_serving/dreamzero/openpi_client.py index e4503e0b698..86422e6597c 100755 --- a/examples/online_serving/dreamzero/openpi_client.py +++ b/examples/online_serving/dreamzero/openpi_client.py @@ -13,9 +13,13 @@ from pathlib import Path from typing import Any -import cv2 import numpy as np +try: + import cv2 +except ImportError as exc: # pragma: no cover - runtime dependency guard + raise ImportError("DreamZero OpenPI example requires `opencv-python`.") from exc + try: import websockets.sync.client except ImportError as exc: # pragma: no cover - runtime dependency guard From 834d116d67c804bf5e838a284de87831cd9b0ebb Mon Sep 17 00:00:00 2001 From: Meng Date: Sun, 19 Apr 2026 18:31:42 +0000 Subject: [PATCH 05/45] docs(dreamzero): clarify external isaac lab launcher usage Document the sim-eval launch flow without assuming Isaac Lab lives inside the vllm-omni repo, while still assuming commands are run from the repository root. 
Signed-off-by: Meng Co-authored-by: Yangshen Deng --- examples/online_serving/dreamzero/README.md | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md index 7d8ff099442..73cfb2b7a13 100644 --- a/examples/online_serving/dreamzero/README.md +++ b/examples/online_serving/dreamzero/README.md @@ -115,12 +115,8 @@ vllm serve \ ### 2. Start the DROID simulation client -This step runs from an external `sim-evals` checkout, because Isaac Lab / -Isaac Sim assets and environment registration live there. - -Like upstream DreamZero, this client does not guess any local path for -`sim_evals` or Isaac Lab. Run it from an environment where those packages are -already importable (for example, the `sim-evals` project environment). +Run this from an environment where `isaaclab`, `isaaclab_tasks`, +`sim_evals`, and `gymnasium` are already importable. Environment: @@ -136,14 +132,13 @@ Environment: - `cv2` - `mediapy` -Example command: +From the `vllm-omni` repository root, invoke the client through an external +Isaac Lab launcher, for example: ```bash -cd /path/to/sim-evals - CUDA_VISIBLE_DEVICES=1 \ -./submodules/IsaacLab/isaaclab.sh -p \ - /path/to/vllm-omni/examples/online_serving/dreamzero/droid_sim_eval_client.py \ +/path/to/isaaclab.sh -p \ + examples/online_serving/dreamzero/droid_sim_eval_client.py \ --host 127.0.0.1 \ --port 8000 \ --scene 1 \ From 26073b4fb7543c03de884ead92ca571271df34c8 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 19 Apr 2026 18:55:23 +0000 Subject: [PATCH 06/45] chore(dreamzero): remove PR-only source mapping comments Signed-off-by: Yangshen Deng Co-authored-by: Meng --- .../dreamzero/modeling/action_encoder.py | 63 +- .../dreamzero/modeling/causal_wan_model.py | 585 +++++++----------- .../dreamzero/modeling/image_encoder.py | 25 +- .../models/dreamzero/pipeline_dreamzero.py | 426 +++---------- 
.../models/dreamzero/state_dreamzero.py | 43 +- .../models/dreamzero/transform/base.py | 4 +- .../models/dreamzero/transform/droid.py | 68 +- .../models/dreamzero/transform/roboarena.py | 2 - 8 files changed, 370 insertions(+), 846 deletions(-) diff --git a/vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py b/vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py index e9e55da2e46..e5fcc6604f1 100644 --- a/vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py +++ b/vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py @@ -1,14 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Action encoder/decoder for DreamZero. - -Adapted from: -- CategorySpecificLinear/MLP/MultiEmbodimentActionEncoder: - dreamzero/groot/vla/model/dreamzero/modules/wan_video_dit_action_casual_chunk.py L31-90 -- SinusoidalPositionalEncoding/swish: - dreamzero/groot/vla/model/n1_5/modules/action_encoder.py L1-41 -""" +"""Action encoder/decoder for DreamZero.""" from __future__ import annotations @@ -18,37 +11,30 @@ def swish(x: torch.Tensor) -> torch.Tensor: - """swish activation: x * sigmoid(x) - Source: action_encoder.py L6-7 - """ + """swish activation: x * sigmoid(x)""" return x * torch.sigmoid(x) class SinusoidalPositionalEncoding(nn.Module): - """Sinusoidal encoding: (B, T) timesteps → (B, T, dim) - - Source: action_encoder.py L10-40 - """ + """Sinusoidal encoding: (B, T) timesteps → (B, T, dim)""" def __init__(self, embedding_dim: int) -> None: super().__init__() self.embedding_dim = embedding_dim def forward(self, timesteps: torch.Tensor) -> torch.Tensor: - # Source: action_encoder.py L20-40 - timesteps = timesteps.float() # L23: ensure float - half_dim = self.embedding_dim // 2 # L28 - exponent = -torch.arange( # L30-32 - half_dim, dtype=torch.float, device=timesteps.device - ) * (torch.log(torch.tensor(10000.0)) / half_dim) - freqs = timesteps.unsqueeze(-1) * exponent.exp() # 
L34: (B, T, half_dim) - return torch.cat([torch.sin(freqs), torch.cos(freqs)], dim=-1) # L36-38: (B, T, dim) + timesteps = timesteps.float() + half_dim = self.embedding_dim // 2 + exponent = -torch.arange(half_dim, dtype=torch.float, device=timesteps.device) * ( + torch.log(torch.tensor(10000.0)) / half_dim + ) + freqs = timesteps.unsqueeze(-1) * exponent.exp() + return torch.cat([torch.sin(freqs), torch.cos(freqs)], dim=-1) class CategorySpecificLinear(nn.Module): """Per-category linear: W[cat_id] @ x + b[cat_id] - Source: wan_video_dit_action_casual_chunk.py L31-42 Params: W: (num_categories, input_dim, hidden_dim) — note: 0.02 * randn init b: (num_categories, hidden_dim) — zero init @@ -60,17 +46,13 @@ def __init__(self, num_categories: int, input_dim: int, hidden_dim: int) -> None self.b = nn.Parameter(torch.zeros(num_categories, hidden_dim)) def forward(self, x: torch.Tensor, cat_ids: torch.Tensor) -> torch.Tensor: - # Source: wan_video_dit_action_casual_chunk.py L39-42 - selected_W = self.W[cat_ids] # L40: (B, input_dim, hidden_dim) - selected_b = self.b[cat_ids] # L41: (B, hidden_dim) - return torch.bmm(x, selected_W) + selected_b.unsqueeze(1) # L42: (B, T, hidden_dim) + selected_W = self.W[cat_ids] + selected_b = self.b[cat_ids] + return torch.bmm(x, selected_W) + selected_b.unsqueeze(1) class CategorySpecificMLP(nn.Module): - """Two-layer MLP: layer1 (relu) → layer2 - - Source: wan_video_dit_action_casual_chunk.py L45-54 - """ + """Two-layer MLP: layer1 (relu) → layer2""" def __init__(self, num_categories: int, input_dim: int, hidden_dim: int, output_dim: int) -> None: super().__init__() @@ -78,15 +60,13 @@ def __init__(self, num_categories: int, input_dim: int, hidden_dim: int, output_ self.layer2 = CategorySpecificLinear(num_categories, hidden_dim, output_dim) def forward(self, x: torch.Tensor, cat_ids: torch.Tensor) -> torch.Tensor: - # Source: wan_video_dit_action_casual_chunk.py L52-54 - hidden = F.relu(self.layer1(x, cat_ids)) # L53 - return 
self.layer2(hidden, cat_ids) # L54 + hidden = F.relu(self.layer1(x, cat_ids)) + return self.layer2(hidden, cat_ids) class MultiEmbodimentActionEncoder(nn.Module): """Encode actions with embodiment-specific weights + sinusoidal timestep. - Source: wan_video_dit_action_casual_chunk.py L57-90 Flow: actions → W1 → concat(a_emb, pos_enc(timesteps)) → W2 (swish) → W3 Args: @@ -112,10 +92,9 @@ def forward(self, actions: torch.Tensor, timesteps: torch.Tensor, cat_ids: torch Returns: (B, T, hidden_size) """ - # Source: wan_video_dit_action_casual_chunk.py L69-90 - a_emb = self.W1(actions, cat_ids) # L79: (B, T, hidden_size) - tau_emb = self.pos_encoding(timesteps).to(dtype=a_emb.dtype) # L82: (B, T, hidden_size) - x = torch.cat([a_emb, tau_emb], dim=-1) # L85: (B, T, 2*hidden_size) - x = swish(self.W2(x, cat_ids)) # L86: (B, T, hidden_size) - x = self.W3(x, cat_ids) # L89: (B, T, hidden_size) + a_emb = self.W1(actions, cat_ids) + tau_emb = self.pos_encoding(timesteps).to(dtype=a_emb.dtype) + x = torch.cat([a_emb, tau_emb], dim=-1) + x = swish(self.W2(x, cat_ids)) + x = self.W3(x, cat_ids) return x diff --git a/vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py b/vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py index ec8a632d250..4e772967ee7 100644 --- a/vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py +++ b/vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py @@ -3,10 +3,7 @@ """CausalWanModel — 40-layer DiT with causal attention and KV cache. 
-Adapted from: dreamzero/groot/vla/model/dreamzero/modules/ - wan_video_dit_action_casual_chunk.py L1218-2200 - -Key differences from WanTransformer3DModel (wan2_2_transformer.py): +Key differences from WanTransformer3DModel: - Causal self-attention (new frames only see history) - KV cache for streaming inference - Action/state token support (appended after video tokens) @@ -40,49 +37,40 @@ ) # ── RoPE utilities ────────────────────────────────────────────────── -# Source: wan2_1_submodule.py rope_params / rope_action_apply -# wan_video_dit_action_casual_chunk.py L93-185 causal_rope_action_apply def sinusoidal_embedding_1d(dim: int, position: torch.Tensor) -> torch.Tensor: - """Sinusoidal positional embedding for timesteps. - Source: wan2_1_submodule.py L16-26 - """ - assert dim % 2 == 0 # L18 - half = dim // 2 # L19 - position = position.type(torch.float64) # L20 - sinusoid = torch.outer( # L23-24 + """Sinusoidal positional embedding for timesteps.""" + assert dim % 2 == 0 + half = dim // 2 + position = position.type(torch.float64) + sinusoid = torch.outer( position, torch.pow(10000, -torch.arange(half, dtype=position.dtype, device=position.device).div(half)), ) - x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1) # L25 + x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1) return x def rope_params(max_seq_len: int, dim: int) -> torch.Tensor: """Precompute complex-valued RoPE frequencies (polar form). 
- Source: wan2_1_submodule.py L37-44 (rope_params_polar) Returns: complex tensor [max_seq_len, dim // 2] """ - assert dim % 2 == 0 # L38 - freqs = torch.outer( # L39-42 + assert dim % 2 == 0 + freqs = torch.outer( torch.arange(max_seq_len), 1.0 / torch.pow(10000, torch.arange(0, dim, 2).to(torch.float64).div(dim)), ) - freqs = torch.polar(torch.ones_like(freqs), freqs) # L43 + freqs = torch.polar(torch.ones_like(freqs), freqs) return freqs def rope_apply(x: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: - """Apply RoPE to x using precomputed complex freqs. - Source: wan2_1_submodule.py L64-75 (rope_apply_polar) - """ - B, seq_len, n, _ = x.shape # L65 - x = torch.view_as_complex( # L68-70 - x.to(torch.float64).reshape(B, seq_len, n, -1, 2) - ) - freqs = freqs.unsqueeze(0) # L73 - x = torch.view_as_real(x * freqs).flatten(3) # L74 + """Apply RoPE to x using precomputed complex freqs.""" + B, seq_len, n, _ = x.shape + x = torch.view_as_complex(x.to(torch.float64).reshape(B, seq_len, n, -1, 2)) + freqs = freqs.unsqueeze(0) + x = torch.view_as_real(x * freqs).flatten(3) return x @@ -95,26 +83,20 @@ def rope_action_apply( num_action_per_block: int = 32, num_state_per_block: int = 1, ) -> torch.Tensor: - """RoPE with action/state frequency tables for multi-step sequences. 
- Source: wan2_1_submodule.py L130-159 (rope_action_apply_polar) - """ - B, seq_len, n, _ = x.shape # L139 - x = torch.view_as_complex( # L142-144 - x.to(torch.float64).reshape(B, seq_len, n, -1, 2) - ) - if action_register_length is not None: # L146 - assert num_action_per_block is not None # L147 - assert num_state_per_block is not None # L148 - chunk_size = action_register_length // (num_action_per_block + num_state_per_block) # L150 - freqs_1d_action = freqs_action[: chunk_size * num_action_per_block].view( # L152 + """RoPE with action/state frequency tables for multi-step sequences.""" + B, seq_len, n, _ = x.shape + x = torch.view_as_complex(x.to(torch.float64).reshape(B, seq_len, n, -1, 2)) + if action_register_length is not None: + assert num_action_per_block is not None + assert num_state_per_block is not None + chunk_size = action_register_length // (num_action_per_block + num_state_per_block) + freqs_1d_action = freqs_action[: chunk_size * num_action_per_block].view( chunk_size * num_action_per_block, 1, -1 ) - freqs_1d_state = freqs_state[: chunk_size * num_state_per_block].view( # L153 - chunk_size * num_state_per_block, 1, -1 - ) - freqs = torch.cat([freqs, freqs_1d_action, freqs_1d_state], dim=0) # L154 - freqs = freqs.unsqueeze(0) # L157 - x = torch.view_as_real(x * freqs).flatten(3) # L158 + freqs_1d_state = freqs_state[: chunk_size * num_state_per_block].view(chunk_size * num_state_per_block, 1, -1) + freqs = torch.cat([freqs, freqs_1d_action, freqs_1d_state], dim=0) + freqs = freqs.unsqueeze(0) + x = torch.view_as_real(x * freqs).flatten(3) return x @@ -128,37 +110,29 @@ def causal_rope_action_apply( num_state_per_block: int, action_state_index: int, ) -> torch.Tensor: - """RoPE for single inference step (causal / KV-cache mode). 
- Source: wan_video_dit_action_casual_chunk.py L153-185 (causal_rope_action_apply_polar) - """ - B, seq_len, n, _ = x.shape # L163 - x = torch.view_as_complex( # L166-168 - x.to(torch.float64).reshape(B, seq_len, n, -1, 2) - ) - if action_register_length is not None: # L170 - assert action_register_length == (num_action_per_block + num_state_per_block) # L171 - freqs_action = freqs_action[ # L172-174 + """RoPE for single inference step (causal / KV-cache mode).""" + B, seq_len, n, _ = x.shape + x = torch.view_as_complex(x.to(torch.float64).reshape(B, seq_len, n, -1, 2)) + if action_register_length is not None: + assert action_register_length == (num_action_per_block + num_state_per_block) + freqs_action = freqs_action[ action_state_index * num_action_per_block : (action_state_index + 1) * num_action_per_block ] - freqs_state = freqs_state[ # L175-177 + freqs_state = freqs_state[ action_state_index * num_state_per_block : (action_state_index + 1) * num_state_per_block ] - freqs_1d = torch.cat([freqs_action, freqs_state], dim=0).view( # L178 - action_register_length, 1, -1 - ) - freqs = torch.cat([freqs, freqs_1d], dim=0) # L179 - freqs = freqs.unsqueeze(0) # L182 - x = torch.view_as_real(x * freqs).flatten(3) # L183 + freqs_1d = torch.cat([freqs_action, freqs_state], dim=0).view(action_register_length, 1, -1) + freqs = torch.cat([freqs, freqs_1d], dim=0) + freqs = freqs.unsqueeze(0) + x = torch.view_as_real(x * freqs).flatten(3) return x # ── Normalization ─────────────────────────────────────────────────── -# Source: wan2_1_submodule.py L162-178 (WanRMSNorm) -# wan2_2_transformer.py L65-95 (DistributedRMSNorm — TP-aware version) class WanLayerNorm(nn.LayerNorm): - """Source: wan2_1_submodule.py L181-184""" + """LayerNorm wrapper used by DreamZero blocks.""" def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine: bool = False) -> None: super().__init__(dim, eps=eps, elementwise_affine=elementwise_affine) @@ -206,10 +180,6 @@ def forward(self, x: 
torch.Tensor) -> torch.Tensor: global_count = local_count mean_sq = global_sum_sq / global_count - # Keep the same numerical form as upstream `WanRMSNorm._norm()`: - # x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps) - # For TP>1, extend the mean to the global hidden dimension first, then - # apply the same `rsqrt` formulation on each local shard. return (x_float * torch.rsqrt(mean_sq + self.eps)).type_as(x) * self.weight @@ -218,30 +188,29 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class MLPProj(nn.Module): """CLIP feature projection for i2v. - Source: wan2_1_submodule.py L565-577 Uses ColumnParallelLinear + RowParallelLinear (Qwen3_VisionMLP pattern). """ def __init__(self, in_dim: int, out_dim: int) -> None: super().__init__() - self.norm1 = nn.LayerNorm(in_dim) # L571 - self.fc1 = ColumnParallelLinear( # L571 nn.Linear(in_dim, in_dim) + self.norm1 = nn.LayerNorm(in_dim) + self.fc1 = ColumnParallelLinear( in_dim, in_dim, bias=True, return_bias=False, ) - self.act = nn.GELU() # L572 - self.fc2 = RowParallelLinear( # L572 nn.Linear(in_dim, out_dim) + self.act = nn.GELU() + self.fc2 = RowParallelLinear( in_dim, out_dim, bias=True, return_bias=False, ) - self.norm2 = nn.LayerNorm(out_dim) # L573 + self.norm2 = nn.LayerNorm(out_dim) def forward(self, image_embeds: torch.Tensor) -> torch.Tensor: - x = self.norm1(image_embeds) # L576 + x = self.norm1(image_embeds) x = self.fc1(x) x = self.act(x) x = self.fc2(x) @@ -250,19 +219,17 @@ def forward(self, image_embeds: torch.Tensor) -> torch.Tensor: # ── Cross-Attention ───────────────────────────────────────────────── -# Source: wan_video_dit_action_casual_chunk.py L1087-1190 (referenced) # T2V and I2V cross-attention variants class WanT2VCrossAttention(nn.Module): """Text-to-video cross-attention. - Source: wan2_1_submodule.py L243-278 Uses vllm-omni Attention for FlashAttn backend. 
""" def __init__(self, dim: int, num_heads: int, window_size=(-1, -1), qk_norm: bool = True, eps: float = 1e-6) -> None: super().__init__() - assert dim % num_heads == 0 # L195 + assert dim % num_heads == 0 self.dim = dim self.num_heads = num_heads self.head_dim = dim // num_heads @@ -271,12 +238,12 @@ def __init__(self, dim: int, num_heads: int, window_size=(-1, -1), qk_norm: bool raise ValueError(f"num_heads={num_heads} must be divisible by tp_size={tp_size}.") self.tp_num_heads = num_heads // tp_size self.tp_inner_dim = self.tp_num_heads * self.head_dim - self.q = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L205 - self.k = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L206 - self.v = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L207 - self.o = RowParallelLinear(dim, dim, bias=True, input_is_parallel=True, return_bias=False) # L208 - self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() # L209 - self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() # L210 + self.q = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) + self.k = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) + self.v = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) + self.o = RowParallelLinear(dim, dim, bias=True, input_is_parallel=True, return_bias=False) + self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() + self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() self.attn = Attention( self.tp_num_heads, self.head_dim, @@ -292,32 +259,30 @@ def forward( context_lens: torch.Tensor | None = None, crossattn_cache: dict | None = None, ) -> torch.Tensor: - """Source: wan2_1_submodule.py L245-278""" del context_lens - n, d = 
self.tp_num_heads, self.head_dim # L253 - q = self.norm_q(self.q(x)).unflatten(2, (n, d)) # L256 - if crossattn_cache is not None: # L258 - if not crossattn_cache["is_init"]: # L259 - crossattn_cache["is_init"] = True # L260 - k = self.norm_k(self.k(context)).unflatten(2, (n, d)) # L261 - v = self.v(context).unflatten(2, (n, d)) # L262 - crossattn_cache["k"] = k # L263 - crossattn_cache["v"] = v # L264 + n, d = self.tp_num_heads, self.head_dim + q = self.norm_q(self.q(x)).unflatten(2, (n, d)) + if crossattn_cache is not None: + if not crossattn_cache["is_init"]: + crossattn_cache["is_init"] = True + k = self.norm_k(self.k(context)).unflatten(2, (n, d)) + v = self.v(context).unflatten(2, (n, d)) + crossattn_cache["k"] = k + crossattn_cache["v"] = v else: - k = crossattn_cache["k"] # L266 - v = crossattn_cache["v"] # L267 + k = crossattn_cache["k"] + v = crossattn_cache["v"] else: - k = self.norm_k(self.k(context)).unflatten(2, (n, d)) # L269 - v = self.v(context).unflatten(2, (n, d)) # L270 - x = self.attn(q, k, v) # L273 - x = x.flatten(2) # L276 - x = self.o(x) # L277 + k = self.norm_k(self.k(context)).unflatten(2, (n, d)) + v = self.v(context).unflatten(2, (n, d)) + x = self.attn(q, k, v) + x = x.flatten(2) + x = self.o(x) return x class WanI2VCrossAttention(nn.Module): """Image-to-video cross-attention (splits first 257 image tokens). - Source: wan2_1_submodule.py L308-362 Uses vllm-omni Attention for FlashAttn backend. 
""" @@ -332,15 +297,15 @@ def __init__(self, dim: int, num_heads: int, window_size=(-1, -1), qk_norm: bool raise ValueError(f"num_heads={num_heads} must be divisible by tp_size={tp_size}.") self.tp_num_heads = num_heads // tp_size self.tp_inner_dim = self.tp_num_heads * self.head_dim - self.q = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L205 - self.k = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L206 - self.v = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L207 - self.o = RowParallelLinear(dim, dim, bias=True, input_is_parallel=True, return_bias=False) # L208 + self.q = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) + self.k = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) + self.v = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) + self.o = RowParallelLinear(dim, dim, bias=True, input_is_parallel=True, return_bias=False) self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() - self.k_img = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L318 - self.v_img = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) # L319 - self.norm_k_img = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() # L321 + self.k_img = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) + self.v_img = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) + self.norm_k_img = DistributedRMSNorm(self.tp_inner_dim, eps=eps) if qk_norm else nn.Identity() self.attn = Attention( self.tp_num_heads, self.head_dim, @@ -356,13 +321,12 @@ def forward( context_lens: torch.Tensor | None = None, crossattn_cache: dict | 
None = None, ) -> torch.Tensor: - """Source: wan2_1_submodule.py L324-361""" del context_lens - context_img = context[:, :257] # L330 - context = context[:, 257:] # L331 - n, d = self.tp_num_heads, self.head_dim # L332 - q = self.norm_q(self.q(x)).unflatten(2, (n, d)) # L334 - if crossattn_cache is not None: # L336 + context_img = context[:, :257] + context = context[:, 257:] + n, d = self.tp_num_heads, self.head_dim + q = self.norm_q(self.q(x)).unflatten(2, (n, d)) + if crossattn_cache is not None: if not crossattn_cache["is_init"]: crossattn_cache["is_init"] = True k = self.norm_k(self.k(context)).unflatten(2, (n, d)) @@ -373,34 +337,30 @@ def forward( k = crossattn_cache["k"] v = crossattn_cache["v"] else: - k = self.norm_k(self.k(context)).unflatten(2, (n, d)) # L348 - v = self.v(context).unflatten(2, (n, d)) # L349 - x = self.attn(q, k, v) # L350 - k_img = self.norm_k_img(self.k_img(context_img)).unflatten(2, (n, d)) # L352 - v_img = self.v_img(context_img).unflatten(2, (n, d)) # L353 - img_x = self.attn(q, k_img, v_img) # L354 - x = x.flatten(2) # L357 - img_x = img_x.flatten(2) # L358 - x = x + img_x # L359 - x = self.o(x) # L360 + k = self.norm_k(self.k(context)).unflatten(2, (n, d)) + v = self.v(context).unflatten(2, (n, d)) + x = self.attn(q, k, v) + k_img = self.norm_k_img(self.k_img(context_img)).unflatten(2, (n, d)) + v_img = self.v_img(context_img).unflatten(2, (n, d)) + img_x = self.attn(q, k_img, v_img) + x = x.flatten(2) + img_x = img_x.flatten(2) + x = x + img_x + x = self.o(x) return x -WAN_CROSSATTENTION_CLASSES = { # L364-366 +WAN_CROSSATTENTION_CLASSES = { "t2v_cross_attn": WanT2VCrossAttention, "i2v_cross_attn": WanI2VCrossAttention, } # ── Self-Attention with causal masking + KV cache ─────────────────── -# Source: wan_video_dit_action_casual_chunk.py L188-1085 class CausalWanSelfAttention(nn.Module): - """Causal self-attention with KV cache + action/state tokens. 
- Source: wan_video_dit_action_casual_chunk.py L188-1085 - Inference-only implementation (KV cache path, L1008-1084). - """ + """Causal self-attention with KV cache + action/state tokens.""" def __init__( self, @@ -415,25 +375,22 @@ def __init__( num_action_per_block: int = 32, num_state_per_block: int = 1, ) -> None: - assert dim % num_heads == 0 # L201 + assert dim % num_heads == 0 super().__init__() - self.dim = dim # L203 - self.num_heads = num_heads # L204 - self.head_dim = dim // num_heads # L205 + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads tp_size = get_tensor_model_parallel_world_size() if num_heads % tp_size != 0: raise ValueError(f"num_heads={num_heads} must be divisible by tp_size={tp_size}.") self.tp_num_heads = num_heads // tp_size self.tp_inner_dim = self.tp_num_heads * self.head_dim - self.local_attn_size = local_attn_size # L206 - self.num_frame_per_block = num_frame_per_block # L208 - self.frame_seqlen = frame_seqlen # L212 - self.num_action_per_block = num_action_per_block # L213 - self.num_state_per_block = num_state_per_block # L214 - self.max_attention_size = ( # L211 - 21 * frame_seqlen if local_attn_size == -1 else local_attn_size * frame_seqlen - ) - # layers # L216-223 + self.local_attn_size = local_attn_size + self.num_frame_per_block = num_frame_per_block + self.frame_seqlen = frame_seqlen + self.num_action_per_block = num_action_per_block + self.num_state_per_block = num_state_per_block + self.max_attention_size = 21 * frame_seqlen if local_attn_size == -1 else local_attn_size * frame_seqlen self.q = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) self.k = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) self.v = ColumnParallelLinear(dim, dim, bias=True, gather_output=False, return_bias=False) @@ -459,12 +416,9 @@ def forward( current_start_frame: int = 0, is_tf: bool = True, ) -> tuple[torch.Tensor, torch.Tensor | None]: - 
"""Inference-only forward (KV cache path). - Source: wan_video_dit_action_casual_chunk.py L786-1084 (kv_cache branch L1008-1084) - """ - n, d = self.tp_num_heads, self.head_dim # L803 + """Inference-only forward (KV cache path).""" + n, d = self.tp_num_heads, self.head_dim - # QKV # L806-812 q = self.norm_q(self.q(x)).unflatten(2, (n, d)) k = self.norm_k(self.k(x)).unflatten(2, (n, d)) v = self.v(x).unflatten(2, (n, d)) @@ -473,10 +427,9 @@ def forward( assert kv_cache is not None, "Inference only — kv_cache required." if True: - # ── Inference path with KV cache ── L1008-1084 - action_state_index = max(0, (current_start_frame - 1) // self.num_frame_per_block) # L1009 + action_state_index = max(0, (current_start_frame - 1) // self.num_frame_per_block) - roped_query = causal_rope_action_apply( # L1011-1020 + roped_query = causal_rope_action_apply( q, freqs, freqs_action, @@ -486,7 +439,7 @@ def forward( self.num_state_per_block, action_state_index, ).type_as(v) - roped_key = causal_rope_action_apply( # L1021-1030 + roped_key = causal_rope_action_apply( k, freqs, freqs_action, @@ -497,54 +450,47 @@ def forward( action_state_index, ).type_as(v) - # Split action/state tokens from video tokens # L1032-1046 roped_action_query = None roped_action_key = None action_v = None - if action_register_length is not None: # L1037 - roped_action_query = roped_query[:, -action_register_length:] # L1038 - roped_query = roped_query[:, :-action_register_length] # L1039 - roped_action_key = roped_key[:, -action_register_length:] # L1040 - roped_key = roped_key[:, :-action_register_length] # L1041 - action_v = v[:, -action_register_length:] # L1042 - v = v[:, :-action_register_length] # L1043 + if action_register_length is not None: + roped_action_query = roped_query[:, -action_register_length:] + roped_query = roped_query[:, :-action_register_length] + roped_action_key = roped_key[:, -action_register_length:] + roped_key = roped_key[:, :-action_register_length] + action_v = v[:, 
-action_register_length:] + v = v[:, :-action_register_length] - # KV cache update # L1055-1064 updated_k = kv_cache[0] updated_v = kv_cache[1] - new_k = torch.cat([updated_k, roped_key], dim=1) # L1059 - new_v = torch.cat([updated_v, v], dim=1) # L1060 - new_k = new_k[:, -self.max_attention_size :] # L1063 - new_v = new_v[:, -self.max_attention_size :] # L1064 + new_k = torch.cat([updated_k, roped_key], dim=1) + new_v = torch.cat([updated_v, v], dim=1) + new_k = new_k[:, -self.max_attention_size :] + new_v = new_v[:, -self.max_attention_size :] - # Attention # L1066-1077 - if action_register_length is not None: # L1066 + if action_register_length is not None: q_cat = torch.cat([roped_query, roped_action_query], dim=1) k_cat = torch.cat([new_k, roped_action_key], dim=1) v_cat = torch.cat([new_v, action_v], dim=1) - else: # L1072 + else: q_cat = roped_query k_cat = new_k v_cat = new_v - x = self.attn(q_cat, k_cat, v_cat) # L1067-1073 - updated_kv_cache = torch.stack([new_k, new_v], dim=0) # L1078 + x = self.attn(q_cat, k_cat, v_cat) + updated_kv_cache = torch.stack([new_k, new_v], dim=0) - # output # L1082-1083 x = x.flatten(2) x = self.o(x) return x, updated_kv_cache # ── Attention Block ───────────────────────────────────────────────── -# Source: wan_video_dit_action_casual_chunk.py L1087-1190 class CausalWanAttentionBlock(nn.Module): - """Transformer block: self-attn + cross-attn + FFN with 6-param modulation. 
- Source: wan_video_dit_action_casual_chunk.py L1087-1190 - """ + """Transformer block: self-attn + cross-attn + FFN with 6-param modulation.""" def __init__( self, @@ -563,8 +509,8 @@ def __init__( num_state_per_block: int = 1, ) -> None: super().__init__() - self.norm1 = WanLayerNorm(dim, eps) # L1113 - self.self_attn = CausalWanSelfAttention( # L1114-1124 + self.norm1 = WanLayerNorm(dim, eps) + self.self_attn = CausalWanSelfAttention( dim=dim, num_heads=num_heads, frame_seqlen=frame_seqlen, @@ -576,21 +522,15 @@ def __init__( num_action_per_block=num_action_per_block, num_state_per_block=num_state_per_block, ) - self.norm3 = ( # L1126-1128 - WanLayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity() - ) - self.cross_attn = WAN_CROSSATTENTION_CLASSES[cross_attn_type]( # L1129-1133 - dim, num_heads, (-1, -1), qk_norm, eps - ) - self.norm2 = WanLayerNorm(dim, eps) # L1134 - self.ffn = nn.Sequential( # L1135-1137 + self.norm3 = WanLayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity() + self.cross_attn = WAN_CROSSATTENTION_CLASSES[cross_attn_type](dim, num_heads, (-1, -1), qk_norm, eps) + self.norm2 = WanLayerNorm(dim, eps) + self.ffn = nn.Sequential( ColumnParallelLinear(dim, ffn_dim, bias=True, gather_output=False, return_bias=False), nn.GELU(approximate="tanh"), RowParallelLinear(ffn_dim, dim, bias=True, input_is_parallel=True, return_bias=False), ) - self.modulation = nn.Parameter( # L1140 - torch.randn(1, 6, dim) / dim**0.5 - ) + self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5) def forward( self, @@ -606,12 +546,10 @@ def forward( current_start_frame: int = 0, is_tf: bool = True, ) -> tuple[torch.Tensor, torch.Tensor | None]: - """Source: wan_video_dit_action_casual_chunk.py L1142-1187""" - e = (self.modulation.unsqueeze(1) + e).chunk(6, dim=2) # L1162 + e = (self.modulation.unsqueeze(1) + e).chunk(6, dim=2) - # self-attention # L1164-1174 y, updated_kv_cache = self.self_attn( - 
x=(self.norm1(x) * (1 + e[1].squeeze(2)) + e[0].squeeze(2)), # L1166 + x=(self.norm1(x) * (1 + e[1].squeeze(2)) + e[0].squeeze(2)), freqs=freqs, freqs_action=freqs_action, freqs_state=freqs_state, @@ -620,70 +558,50 @@ def forward( is_tf=is_tf, current_start_frame=current_start_frame, ) - x = x + (y * e[2].squeeze(2)) # L1175 + x = x + (y * e[2].squeeze(2)) - # cross-attention + FFN # L1178-1186 - x = x + self.cross_attn(self.norm3(x), context, crossattn_cache=crossattn_cache) # L1179 - y = self.ffn( # L1180-1181 - self.norm2(x) * (1 + e[4].squeeze(2)) + e[3].squeeze(2) - ) - x = x + (y * e[5].squeeze(2)) # L1183 + x = x + self.cross_attn(self.norm3(x), context, crossattn_cache=crossattn_cache) + y = self.ffn(self.norm2(x) * (1 + e[4].squeeze(2)) + e[3].squeeze(2)) + x = x + (y * e[5].squeeze(2)) return x, updated_kv_cache # ── Output Head ───────────────────────────────────────────────────── -# Source: wan_video_dit_action_casual_chunk.py L1190-1215 class CausalHead(nn.Module): """Output norm + linear with 2-param modulation. - Source: wan_video_dit_action_casual_chunk.py L1190-1215 Runs once per step (not TP-critical), uses nn.Linear. 
""" def __init__(self, dim: int, out_dim: int, patch_size: tuple, eps: float = 1e-6) -> None: super().__init__() - self.dim = dim # L1194 - self.out_dim = out_dim # L1195 - self.patch_size = patch_size # L1196 - out_channels = math.prod(patch_size) * out_dim # L1200 - self.norm = WanLayerNorm(dim, eps) # L1201 - self.head = nn.Linear(dim, out_channels) # L1202 - self.modulation = nn.Parameter( # L1205 - torch.randn(1, 2, dim) / dim**0.5 - ) + self.dim = dim + self.out_dim = out_dim + self.patch_size = patch_size + out_channels = math.prod(patch_size) * out_dim + self.norm = WanLayerNorm(dim, eps) + self.head = nn.Linear(dim, out_channels) + self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5) def forward(self, x: torch.Tensor, e: torch.Tensor) -> torch.Tensor: """ Args: x: [B, L1, C] e: [B, F, 1, C] (time embedding, unsqueezed) - Source: wan_video_dit_action_casual_chunk.py L1207-1215 """ - e = (self.modulation.unsqueeze(1) + e).chunk(2, dim=2) # L1213 - x = self.head( # L1214 - self.norm(x) * (1 + e[1].squeeze(2)) + e[0].squeeze(2) - ) + e = (self.modulation.unsqueeze(1) + e).chunk(2, dim=2) + x = self.head(self.norm(x) * (1 + e[1].squeeze(2)) + e[0].squeeze(2)) return x # ── Main Model ────────────────────────────────────────────────────── -# Source: wan_video_dit_action_casual_chunk.py L1218-2200 class CausalWanModel(nn.Module): """Causal video diffusion transformer for DreamZero. 
- Source: wan_video_dit_action_casual_chunk.py L1218-2200 Architecture (14B): 40 layers, dim=5120, heads=40, ffn=13824 - - __init__ params match original L1230-1256: - model_type, patch_size, frame_seqlen, text_len, in_dim, dim, - ffn_dim, freq_dim, text_dim, out_dim, num_heads, num_layers, - max_chunk_size, sink_size, qk_norm, cross_attn_norm, eps, - num_frame_per_block, action_dim, num_registers, max_state_dim, - max_num_embodiments, hidden_size, diffusion_model_pretrained_path, - num_action_per_block, num_state_per_block """ def __init__( @@ -716,26 +634,23 @@ def __init__( num_state_per_block: int = 1, ) -> None: super().__init__() - assert model_type in ["t2v", "i2v", "ti2v"] # L1297 - self.model_type = model_type # L1298 - self.patch_size = patch_size # L1300 - self.frame_seqlen = frame_seqlen # L1301 - self.text_len = text_len # L1302 - self.dim = dim # L1304 - self.freq_dim = freq_dim # L1306 - self.out_dim = out_dim # L1308 - self.num_heads = num_heads # L1309 - self.num_layers = num_layers # L1310 - self.local_attn_size = ( # L1311 - max_chunk_size * num_frame_per_block + 1 if max_chunk_size != -1 else -1 - ) - self.num_frame_per_block = num_frame_per_block # L1315 - self.action_dim = action_dim # L1317 - self.num_action_per_block = num_action_per_block # L1322 - self.num_state_per_block = num_state_per_block # L1323 - - # Action encoder/decoder # L1327-1343 - max_num_embodiments_local = 1 # L1325 + assert model_type in ["t2v", "i2v", "ti2v"] + self.model_type = model_type + self.patch_size = patch_size + self.frame_seqlen = frame_seqlen + self.text_len = text_len + self.dim = dim + self.freq_dim = freq_dim + self.out_dim = out_dim + self.num_heads = num_heads + self.num_layers = num_layers + self.local_attn_size = max_chunk_size * num_frame_per_block + 1 if max_chunk_size != -1 else -1 + self.num_frame_per_block = num_frame_per_block + self.action_dim = action_dim + self.num_action_per_block = num_action_per_block + self.num_state_per_block = 
num_state_per_block + + max_num_embodiments_local = 1 self.state_encoder = CategorySpecificMLP( num_categories=max_num_embodiments_local, input_dim=max_state_dim, @@ -754,39 +669,29 @@ def __init__( output_dim=action_dim, ) - # Embeddings # L1346-1355 - # Upstream DreamZero uses a plain nn.Conv3d here - # (wan_video_dit_action_casual_chunk.py L1386-L1391). - # - # vLLM's Conv3dLayer can rewrite non-overlapping strided convs - # (kernel_size == stride, no padding) into a GEMM fast path. For - # DreamZero's bf16 i2v prefill, that changes accumulation order and - # causes the first-frame KV cache to drift from upstream even though - # the final outputs may still match. Force the native conv path so - # patch embedding stays numerically identical to upstream. + # Disable the Conv3d GEMM rewrite for patch embedding. self.patch_embedding = Conv3dLayer( in_dim, dim, kernel_size=patch_size, stride=patch_size, - ) # L1346 + ) self.patch_embedding.enable_linear = False - self.text_embedding = nn.Sequential( # L1348-1350 + self.text_embedding = nn.Sequential( nn.Linear(text_dim, dim), nn.GELU(approximate="tanh"), nn.Linear(dim, dim), ) - self.time_embedding = nn.Sequential( # L1352-1353 + self.time_embedding = nn.Sequential( nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim), ) - self.time_projection = nn.Sequential( # L1354-1355 + self.time_projection = nn.Sequential( nn.SiLU(), nn.Linear(dim, dim * 6), ) - # Transformer blocks # L1358-1364 cross_attn_type = "t2v_cross_attn" if model_type == "t2v" else "i2v_cross_attn" self.blocks = nn.ModuleList( [ @@ -809,40 +714,28 @@ def __init__( ] ) - # Head # L1367 self.head = CausalHead(dim, out_dim, patch_size, eps) - # RoPE buffers # L1370-1379 assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0 d = dim // num_heads - self.freqs_action = rope_params(1024 * 10, d) # L1373 - self.freqs_state = rope_params(1024, d) # L1374 - self.freqs = [ # L1375-1379 + self.freqs_action = rope_params(1024 * 10, d) + self.freqs_state 
= rope_params(1024, d) + self.freqs = [ rope_params(1024, d - 4 * (d // 6)), rope_params(1024, 2 * (d // 6)), rope_params(1024, 2 * (d // 6)), ] - # Image embedding for i2v only # L1380-1381 if model_type == "i2v": self.img_emb = MLPProj(1280, dim) - # Initialize weights # L1383-1384 self.init_weights() def init_weights(self) -> None: - """Initialize parameters with DreamZero's bare-model scheme. - Source: wan_video_dit_action_casual_chunk.py L2176-2194 - - Upstream initializes every `nn.Linear` with Xavier uniform and - zero bias. This port applies the same rule to vLLM - `ColumnParallelLinear` / `RowParallelLinear`, using the full - unsharded fan-in/fan-out so the TP shards follow the same Xavier - distribution as the original dense weight. - """ + """Initialize parameters.""" def _init_linear_like(module: nn.Module) -> None: - if isinstance(module, nn.Linear): # L2182-2185 + if isinstance(module, nn.Linear): nn.init.xavier_uniform_(module.weight) if module.bias is not None: nn.init.zeros_(module.bias) @@ -856,43 +749,38 @@ def _init_linear_like(module: nn.Module) -> None: if module.bias is not None: nn.init.zeros_(module.bias) - # Basic init # L2181-2185 for module in self.modules(): _init_linear_like(module) - # Patch embedding follows upstream Conv3d handling: Xavier - # for weight, Conv3d-style uniform bias. 
# L2187-2191 nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1)) if self.patch_embedding.bias is not None: fan_in = self.patch_embedding.in_channels * math.prod(self.patch_embedding.kernel_size) bound = 1 / math.sqrt(fan_in) nn.init.uniform_(self.patch_embedding.bias, -bound, bound) - for module in self.text_embedding.modules(): # L2188-2190 + for module in self.text_embedding.modules(): if isinstance(module, nn.Linear): nn.init.normal_(module.weight, std=0.02) - for module in self.time_embedding.modules(): # L2190-2191 + for module in self.time_embedding.modules(): if isinstance(module, nn.Linear): nn.init.normal_(module.weight, std=0.02) - nn.init.zeros_(self.head.head.weight) # L2193 + nn.init.zeros_(self.head.head.weight) def _create_freqs(self, grid_size: torch.Tensor, start_frame: int) -> torch.Tensor: - """Create 3D RoPE frequency tensor. - Source: wan_video_dit_action_casual_chunk.py L2151-2174 - """ - device = self.patch_embedding.weight.device # L2156 - if any(freq.device != device for freq in self.freqs): # L2157-2158 + """Create 3D RoPE frequency tensor.""" + device = self.patch_embedding.weight.device + if any(freq.device != device for freq in self.freqs): self.freqs = [freq.to(device) for freq in self.freqs] - if self.freqs_action.device != device: # L2159-2160 + if self.freqs_action.device != device: self.freqs_action = self.freqs_action.to(device) - if self.freqs_state.device != device: # L2161-2162 + if self.freqs_state.device != device: self.freqs_state = self.freqs_state.to(device) - f, h, w = grid_size.tolist() # L2164 + f, h, w = grid_size.tolist() freqs = torch.cat( - [ # L2165-2172 + [ self.freqs[0][start_frame : start_frame + f].view(f, 1, 1, -1).expand(f, h, w, -1), self.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1), self.freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1), @@ -902,16 +790,14 @@ def _create_freqs(self, grid_size: torch.Tensor, start_frame: int) -> torch.Tens return freqs def unpatchify(self, x: torch.Tensor, 
grid_size: torch.Tensor) -> torch.Tensor: - """Reconstruct video from patch embeddings. - Source: wan_video_dit_action_casual_chunk.py L2127-2149 - """ - B = x.shape[0] # L2142 - c = self.out_dim # L2143 - grid_size = grid_size.tolist() # L2144 - assert x.shape[1] == math.prod(grid_size) # L2145 - x = x.view(B, *grid_size, *self.patch_size, c) # L2146 - x = torch.einsum("bfhwpqrc->bcfphqwr", x) # L2147 - x = x.reshape(B, c, *[i * j for i, j in zip(grid_size, self.patch_size)]) # L2148 + """Reconstruct video from patch embeddings.""" + B = x.shape[0] + c = self.out_dim + grid_size = grid_size.tolist() + assert x.shape[1] == math.prod(grid_size) + x = x.view(B, *grid_size, *self.patch_size, c) + x = torch.einsum("bfhwpqrc->bcfphqwr", x) + x = x.reshape(B, c, *[i * j for i, j in zip(grid_size, self.patch_size)]) return x def _forward_blocks( @@ -929,47 +815,40 @@ def _forward_blocks( kv_cache: list[torch.Tensor], current_start_frame: int, ) -> tuple[torch.Tensor, torch.Tensor | None, list[torch.Tensor]]: - """Source: wan_video_dit_action_casual_chunk.py L1691-1779""" - x = x.flatten(start_dim=2).transpose(1, 2) # L1709 - B = x.shape[0] # L1711 - F_t = timestep.shape[1] # L1712 + x = x.flatten(start_dim=2).transpose(1, 2) + B = x.shape[0] + F_t = timestep.shape[1] - # Action/state encoding # L1714-1726 if action is not None: - embodiment_id = torch.tensor([0], device=x.device).repeat(B) # L1715 - action_features = self.action_encoder(action, timestep_action, embodiment_id) # L1716 - state_features = self.state_encoder(state, embodiment_id) # L1717 - action_register = torch.cat([action_features, state_features], dim=1) # L1718 - action_length = action_features.shape[1] # L1719 - action_register_length = action_register.shape[1] # L1720 - x = torch.cat([x, action_register], dim=1) # L1721 + embodiment_id = torch.tensor([0], device=x.device).repeat(B) + action_features = self.action_encoder(action, timestep_action, embodiment_id) + state_features = 
self.state_encoder(state, embodiment_id) + action_register = torch.cat([action_features, state_features], dim=1) + action_length = action_features.shape[1] + action_register_length = action_register.shape[1] + x = torch.cat([x, action_register], dim=1) else: - action_length = 0 # L1725 - action_register_length = None # L1726 + action_length = 0 + action_register_length = None - # Time embeddings # L1728-1742 - timestep = timestep.unsqueeze(-1).expand(B, F_t, seq_len // F_t).reshape(B, -1) # L1729 + timestep = timestep.unsqueeze(-1).expand(B, F_t, seq_len // F_t).reshape(B, -1) if action is not None: assert timestep_action is not None and state is not None state_features_t = self.state_encoder(state, embodiment_id) - stride = timestep_action.shape[1] // state_features_t.shape[1] # L1734 - timestep_state = timestep_action[:, ::stride] # L1735 - timestep = torch.cat([timestep, timestep_action, timestep_state], dim=1) # L1736 + stride = timestep_action.shape[1] // state_features_t.shape[1] + timestep_state = timestep_action[:, ::stride] + timestep = torch.cat([timestep, timestep_action, timestep_state], dim=1) - e = self.time_embedding( # L1738-1739 - sinusoidal_embedding_1d(self.freq_dim, timestep.flatten()).type_as(x) - ) - e = e.unflatten(dim=0, sizes=(B, -1)) # L1740 - e0 = self.time_projection(e) # L1741 - e0 = e0.unflatten(dim=2, sizes=(6, self.dim)) # L1742 + e = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, timestep.flatten()).type_as(x)) + e = e.unflatten(dim=0, sizes=(B, -1)) + e0 = self.time_projection(e) + e0 = e0.unflatten(dim=2, sizes=(6, self.dim)) - # Context embedding # L1744-1749 - context = self.text_embedding(context) # L1745 - if clip_feature is not None: # L1747 - clip_embedding = self.img_emb(clip_feature) # L1748 - context = torch.cat([clip_embedding, context], dim=1) # L1749 + context = self.text_embedding(context) + if clip_feature is not None: + clip_embedding = self.img_emb(clip_feature) + context = torch.cat([clip_embedding, 
context], dim=1) - # Transformer blocks # L1751-1764 updated_kv_caches: list[torch.Tensor] = [] for block_index, block in enumerate(self.blocks): x, updated_kv_cache = block( @@ -985,16 +864,15 @@ def _forward_blocks( ) updated_kv_caches.append(updated_kv_cache) - # Action decoding # L1766-1770 if action is not None: - action_noise_pred = x[:, seq_len : seq_len + action_length] # L1767 - action_noise_pred = self.action_decoder(action_noise_pred, embodiment_id) # L1768 + action_noise_pred = x[:, seq_len : seq_len + action_length] + action_noise_pred = self.action_decoder(action_noise_pred, embodiment_id) else: - action_noise_pred = None # L1770 + action_noise_pred = None - x_video = x[:, :seq_len] # L1773 - e_video = e[:, :seq_len] # L1774 - x_video = self.head(x_video, e_video.unsqueeze(2)) # L1777 + x_video = x[:, :seq_len] + e_video = e[:, :seq_len] + x_video = self.head(x_video, e_video.unsqueeze(2)) return x_video, action_noise_pred, updated_kv_caches @@ -1014,19 +892,18 @@ def _forward_inference( state: torch.Tensor | None = None, embodiment_id: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor | None, list[torch.Tensor]]: - """Source: wan_video_dit_action_casual_chunk.py L1863-1950""" - if self.model_type == "i2v": # L1910 + if self.model_type == "i2v": assert clip_feature is not None and y is not None - assert context.shape[1] == self.text_len # L1912 + assert context.shape[1] == self.text_len - if y is not None: # L1914 - x = torch.cat([x, y.to(dtype=x.dtype)], dim=1) # L1915 + if y is not None: + x = torch.cat([x, y.to(dtype=x.dtype)], dim=1) - x = self.patch_embedding(x) # L1918 - grid_size = torch.tensor(x.shape[2:], dtype=torch.long) # L1919 - freqs = self._create_freqs(grid_size, current_start_frame) # L1921-1924 + x = self.patch_embedding(x) + grid_size = torch.tensor(x.shape[2:], dtype=torch.long) + freqs = self._create_freqs(grid_size, current_start_frame) - x_video, action_noise_pred, updated_kv_caches = self._forward_blocks( # 
L1926-1939 + x_video, action_noise_pred, updated_kv_caches = self._forward_blocks( x=x, seq_len=seq_len, freqs=freqs, @@ -1041,11 +918,11 @@ def _forward_inference( current_start_frame=current_start_frame, ) - x_video = x_video.clone() # L1942 + x_video = x_video.clone() if action_noise_pred is not None: - action_noise_pred = action_noise_pred.clone() # L1944 + action_noise_pred = action_noise_pred.clone() - video_noise_pred = self.unpatchify(x_video, grid_size) # L1948 + video_noise_pred = self.unpatchify(x_video, grid_size) return video_noise_pred, action_noise_pred, updated_kv_caches def forward(self, *args: Any, **kwargs: Any): diff --git a/vllm_omni/diffusion/models/dreamzero/modeling/image_encoder.py b/vllm_omni/diffusion/models/dreamzero/modeling/image_encoder.py index c41859e2dc2..df0895eaecf 100644 --- a/vllm_omni/diffusion/models/dreamzero/modeling/image_encoder.py +++ b/vllm_omni/diffusion/models/dreamzero/modeling/image_encoder.py @@ -1,15 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""DreamZero image encoder port. - -Corresponds to: -- `wan_video_image_encoder.py` `VisionTransformer` / `AttentionBlock` -- `wan_video_image_encoder.py` `WanImageEncoder.encode_image()` +"""DreamZero image encoder. Only the visual tower used by DreamZero I2V inference is ported here. The -checkpoint keys are kept source-compatible so root `action_head.image_encoder.*` -weights can be loaded by simple prefix stripping. +checkpoint keys under `action_head.image_encoder.*` load via simple prefix +stripping. 
""" from __future__ import annotations @@ -23,14 +19,14 @@ class DreamZeroLayerNorm(nn.LayerNorm): - """Source: `wan_video_image_encoder.py` `LayerNorm`.""" + """LayerNorm that preserves the input dtype.""" def forward(self, x: torch.Tensor) -> torch.Tensor: return super().forward(x).type_as(x) class DreamZeroVisionSelfAttention(nn.Module): - """Source: `wan_video_image_encoder.py` `SelfAttention` (vision branch).""" + """Self-attention for the vision tower.""" def __init__( self, @@ -61,7 +57,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class DreamZeroVisionAttentionBlock(nn.Module): - """Source: `wan_video_image_encoder.py` `AttentionBlock`.""" + """Attention block for the vision tower.""" def __init__( self, @@ -103,7 +99,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class DreamZeroVisionTransformer(nn.Module): - """Source: `wan_video_image_encoder.py` `VisionTransformer`.""" + """Vision transformer used by the image encoder.""" def __init__( self, @@ -183,7 +179,7 @@ def forward(self, x: torch.Tensor, use_31_block: bool = False) -> torch.Tensor: class _DreamZeroCLIPContainer(nn.Module): - """Minimal container matching source checkpoint names under `model.visual.*`.""" + """Container matching checkpoint names under `model.visual.*`.""" def __init__(self) -> None: super().__init__() @@ -207,12 +203,11 @@ def __init__(self) -> None: class DreamZeroImageEncoder(nn.Module): - """Source-equivalent port of `WanImageEncoder`.""" + """Image encoder wrapper.""" def __init__(self) -> None: super().__init__() self.model = _DreamZeroCLIPContainer() - # Source: `clip_xlm_roberta_vit_h_14(..., return_transforms=True)` # returns a composed transform whose last stage is CLIP normalization. 
self.transforms = T.Compose( [ @@ -224,7 +219,7 @@ def __init__(self) -> None: ) def encode_image(self, videos: torch.Tensor) -> torch.Tensor: - """Source: `wan_video_image_encoder.py` `WanImageEncoder.encode_image()`.""" + """Encode images for I2V conditioning.""" size = (self.model.visual.image_size,) * 2 videos = torch.cat( [ diff --git a/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py b/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py index 29ba4b8867e..59011eb22b0 100644 --- a/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py +++ b/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py @@ -3,7 +3,6 @@ """DreamZero pipeline for vllm-omni. -Corresponds to: WANPolicyHead.lazy_joint_video_action (L929-1270) Entry point for DiffusionEngine.step() → pipeline.forward(req) """ @@ -54,14 +53,11 @@ # --------------------------------------------------------------------------- -# VideoActionScheduler — composite scheduler (same pattern as LTX2 PR #2160) # --------------------------------------------------------------------------- class VideoActionScheduler: - """Wraps video + action schedulers into single .step() interface. - Source pattern: LTX2 VideoAudioScheduler (PR #2160) - """ + """Wraps video + action schedulers into single .step() interface.""" def __init__(self, video_scheduler, action_scheduler): self.video_scheduler = video_scheduler @@ -100,7 +96,6 @@ class DreamZeroPipeline(nn.Module, CFGParallelMixin): def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: """Initialize pipeline components. - Source: WANPolicyHead.__init__ (L156-235) DreamZero root checkpoint layout (GEAR-Dreams/DreamZero-DROID): config.json — root config (action_head_cfg, architectures, etc.) 
@@ -121,7 +116,7 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: """ super().__init__() - model_path = od_config.model # last_steps.md P0-3 + model_path = od_config.model model_config = od_config.model_config local_files_only = os.path.exists(model_path) self.od_config = od_config @@ -131,7 +126,6 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: DEFAULT_EMBODIMENT, ) - # ---- Parse root config.json ---- (last_steps.md P0-4) root_cfg = self._load_repo_json(model_path, "config.json", local_files_only) if root_cfg is None: raise ValueError(f"DreamZero requires root config.json in {model_path}.") @@ -139,16 +133,11 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: ah_config = action_head_cfg["config"] diffusion_model_cfg = ah_config["diffusion_model_cfg"] - # ---- Tokenizer ---- (follows wan2_2 convention: pipeline owns tokenizer) - # DreamZero root has no tokenizer/ subfolder; uses google/umt5-xxl - # Source: last_steps.md §2.1.1 B.1 + # ---- Tokenizer ---- tokenizer_source = od_config.model_paths.get("tokenizer", "google/umt5-xxl") self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_source) - # ---- Text encoder ---- (L169) - # Instantiate from config; weights loaded by load_weights() from root checkpoint - # Source key structure: action_head.text_encoder.blocks.{N}.attn.{q,k,v,o}.weight - # UMT5-XXL: d_model=4096, d_ff=10240, num_heads=64, num_layers=24, vocab=256384 + # Instantiate from config; weights load through `load_weights()`. 
umt5_config = UMT5Config( d_model=4096, d_ff=10240, @@ -163,43 +152,9 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: ) self.text_encoder = UMT5EncoderModel(umt5_config) - # ---- Image encoder ---- (L170) - # Source module: `wan_video_image_encoder.py` `WanImageEncoder` - # - # The strict service-path parity check shows that HF `CLIPVisionModel` - # drifts from upstream `WanImageEncoder.encode_image()` on real bf16 - # inference input, even when weights are remapped correctly and - # preprocessing is source-equivalent. We therefore use the local - # source-shaped port `DreamZeroImageEncoder`, whose parameter names stay - # aligned with DreamZero root keys: - # action_head.image_encoder.model.* -> image_encoder.model.* self.image_encoder = DreamZeroImageEncoder() - # ---- VAE ---- (L171) - # DreamZero root checkpoints already carry `action_head.vae.*`, so the - # only thing we need at init time is a compatible module skeleton. - # - # Upstream source path: - # self.vae = instantiate(config.vae_cfg) # L171 - # vae_path = ensure_file(self.vae.vae_pretrained_path, "Wan2.1_VAE.pth") # L249-252 - # self.vae.model.load_state_dict(torch.load(vae_path, ...)) # L253 - # - # In vLLM we run the diffusers-compatible execution module - # `DistributedAutoencoderKLWan`, but the final learned weights still - # come from DreamZero root `action_head.vae.model.*` through - # `load_weights()`. To let users pass only the official DreamZero HF - # repo name, we no longer require a local `vae/` subfolder. - # - # Bootstrapping policy: - # 1. If `od_config.model_paths["vae"]` is explicitly provided, honor - # it and instantiate from that diffusers source. - # 2. Else if a local prepared layout exposes `model_path/vae`, use it. - # 3. Else instantiate `DistributedAutoencoderKLWan()` directly from - # constructor defaults, which match Wan2.1 VAE geometry / latent - # normalization constants. 
- # - # After instantiation, `load_weights()` remaps DreamZero root - # `action_head.vae.model.*` keys onto this module. + # Build a compatible VAE module, then fill it through `load_weights()`. vae_source = od_config.model_paths.get("vae") if vae_source: self.vae = DistributedAutoencoderKLWan.from_pretrained( @@ -219,9 +174,6 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: getattr(od_config, "enable_cpu_offload", False) or getattr(od_config, "enable_layerwise_offload", False) ): self.vae = self.vae.to(device=get_local_device(), dtype=od_config.dtype) - # DreamZero upstream WanVideoVAE.encode() returns normalized mu: - # mu = (mu - mean) / std - # Source: wan_video_vae.py VideoVAE_.encode() self.register_buffer( "vae_latents_mean", torch.tensor(self.vae.config.latents_mean, dtype=torch.float32).view(1, -1, 1, 1, 1), @@ -233,85 +185,50 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: persistent=False, ) - # ---- Transformer (DiT backbone) ---- (L232) - # Config parsed from root config.json -> action_head_cfg.config.diffusion_model_cfg - # Filter out keys not accepted by CausalWanModel.__init__ + # Filter out keys not accepted by `CausalWanModel.__init__`. transformer_kwargs = {k: v for k, v in diffusion_model_cfg.items() if k not in ("_convert_", "_target_")} transformer_kwargs["action_dim"] = ah_config["action_dim"] transformer_kwargs["max_state_dim"] = ah_config["max_state_dim"] transformer_kwargs["num_frame_per_block"] = ah_config["num_frame_per_block"] - # Upstream WANPolicyHead instantiates the DiT strictly from - # `config.diffusion_model_cfg`: - # self.model = instantiate(config.diffusion_model_cfg) - # Source: `third_party/dreamzero/.../wan_flow_matching_action_tf.py:211` - # - # The action-head-level `hidden_size=64` belongs to WANPolicyHead state - # processing, not to `CausalWanModel`. 
The DiT keeps its own constructor - # default `hidden_size=1024`, which is what the root checkpoint weights - # expect (for example `action_decoder.layer1.W` has shape - # `(1, 5120, 1024)`). Passing `ah_config["hidden_size"]` here shrinks the - # local action/state MLPs to 64 and breaks root checkpoint loading. self.transformer = CausalWanModel(**transformer_kwargs) - # ---- Scheduler ---- (L172) self.scheduler = FlowUniPCMultistepScheduler( num_train_timesteps=1000, shift=1, use_dynamic_shifting=False, ) - # ---- Pipeline state ---- (L180-195) self.state = DreamZeroState() - # ---- Inference hyperparams ---- (L175-179) - # Root-config-backed inference geometry must come directly from the - # released DreamZero HF config. Do not fall back to runtime overrides - # or hard-coded defaults for fields that already exist in - # `action_head_cfg.config`. - # Source eager path uses the hard-coded `WANPolicyHead.num_inference_steps = 16` - # (`wan_flow_matching_action_tf.py` L175), while - # `config.num_inference_timesteps` is stored separately but is not what the - # real-world inference loop consumes. Reading the config value here would - # incorrectly shorten the denoising loop to 4 steps for the released - # DreamZero checkpoint. + # Keep runtime inference settings separate from the training-time config. self.num_inference_steps: int = model_config.get( "num_inference_steps", DEFAULT_NUM_INFERENCE_STEPS, ) self.cfg_scale: float = model_config.get("cfg_scale", DEFAULT_CFG_SCALE) self.sigma_shift: float = model_config.get("sigma_shift", DEFAULT_SIGMA_SHIFT) - # Source: `WANPolicyHead.__init__` reads `config.num_frames` - # from `action_head_cfg.config.num_frames` (33 for DreamZero DROID), - # not from the root HF config. This value feeds `encode_image()` - # mask/conditioning construction, so falling back to 81 changes the - # inference trajectory on real checkpoints. 
self.num_frames: int = ah_config["num_frames"] self.num_frame_per_block: int = ah_config["num_frame_per_block"] self.action_horizon: int = ah_config["action_horizon"] - # Decoupled inference noise config # L112-118 self.decouple_inference_noise: bool = ah_config["decouple_inference_noise"] self.video_inference_final_noise: float = ah_config["video_inference_final_noise"] - # Fixed seed for deterministic noise generation # L176 self.seed: int = model_config.get("seed", DEFAULT_SEED) - # Model-level constants for state/action padding # dreamzero_cotrain.yaml + # Model-level constants for state/action padding. self.max_state_dim: int = ah_config["max_state_dim"] self.max_action_dim: int = ah_config["max_action_dim"] - # Fixed negative prompt for CFG uncond branch # dreamzero_cotrain.py L532 self.negative_prompt: str = model_config.get("negative_prompt", DEFAULT_NEGATIVE_PROMPT) # Embodiment name → numeric ID mapping (model knowledge) - # Source: dreamzero transform/base.yaml embodiment_tag_to_projector_index self.embodiment_name_to_id: dict[str, int] = model_config.get( "embodiment_name_to_id", DEFAULT_EMBODIMENT_NAME_TO_ID, ) - # Action normalization stats (per-embodiment, from checkpoint metadata) - # Prefer root experiment_cfg/metadata.json, fall back to model_config path + # Prefer root `experiment_cfg/metadata.json`, then `model_config`. 
stats_path = model_config.get("action_norm_stats_path") metadata = self._load_repo_json(model_path, "experiment_cfg/metadata.json", local_files_only) if metadata is not None: @@ -327,11 +244,8 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: # Whether model uses relative actions (need to add back last state) self.relative_action: bool = model_config.get("relative_action", True) # Number of action dims that are relative (DROID: 7 = joint only, gripper is absolute) - # Source: droid_relative.yaml L11 — relative_action_keys: [joint_position] self.relative_action_dim: int = model_config.get("relative_action_dim", 7) - # ---- Weights sources ---- (last_steps.md P0-5) - # Single source pointing to DreamZero root; load_weights() handles remapping self._weights_sources = [ DiffusersPipelineLoader.ComponentSource( model_or_path=model_path, @@ -373,10 +287,8 @@ def _load_repo_json(model_path: str, relative_path: str, local_files_only: bool) # ----------------------------------------------------------------------- def predict_noise(self, **kwargs) -> tuple[torch.Tensor, torch.Tensor]: - """Call CausalWanModel, return (video_pred, action_pred). 
- Source: _run_diffusion_steps (L852-865) single model call - """ - video_pred, action_pred, updated_kv_caches = self.transformer( # L885-899 + """Call CausalWanModel, return (video_pred, action_pred).""" + video_pred, action_pred, updated_kv_caches = self.transformer( x=kwargs["hidden_states"], timestep=kwargs["timestep_video"], context=kwargs["encoder_hidden_states"], @@ -391,15 +303,14 @@ def predict_noise(self, **kwargs) -> tuple[torch.Tensor, torch.Tensor]: state=kwargs.get("state_features"), embodiment_id=kwargs.get("embodiment_id"), ) - # KV cache update: side effect, write back to state # L856-858 if kwargs.get("update_kv_cache", False) and updated_kv_caches: is_neg = kwargs.get("is_negative", False) for i, kv in enumerate(updated_kv_caches): self.state.update_kv_cache(i, kv, is_negative=is_neg) - video_pred = video_pred.clone() # L859 + video_pred = video_pred.clone() if action_pred is not None: - action_pred = action_pred.clone() # L861 + action_pred = action_pred.clone() else: batch_size = kwargs["hidden_states"].shape[0] action_pred = torch.empty( @@ -419,8 +330,7 @@ def combine_cfg_noise( cfg_normalize: bool = False, ) -> torch.Tensor | tuple[torch.Tensor, ...]: """Video: standard CFG. Action: positive only (no CFG). 
- Source: L1212 — flow_pred = uncond + cfg_scale * (cond - uncond) - action = cond only (no uncond blending) + action = cond only (no uncond blending) """ (video_pos, action_pos) = positive_noise_pred (video_neg, _) = negative_noise_pred @@ -428,7 +338,6 @@ def combine_cfg_noise( return (video_combined, action_pos) # ----------------------------------------------------------------------- - # CFG parallel sync (PR #2160 pattern) # ----------------------------------------------------------------------- def _synchronize_cfg_parallel_step_output( @@ -436,9 +345,7 @@ def _synchronize_cfg_parallel_step_output( latents: tuple[torch.Tensor, torch.Tensor], do_true_cfg: bool, ) -> tuple[torch.Tensor, torch.Tensor]: - """Post-step sync: .contiguous() + cuda.synchronize() - Source: PR #2160 LTX2 _synchronize_cfg_parallel_step_output - """ + """Post-step sync: .contiguous() + cuda.synchronize()""" latents = tuple(t.contiguous() for t in latents) if do_true_cfg and get_classifier_free_guidance_world_size() > 1: device = next((t.device for t in latents if t.is_cuda), None) @@ -451,40 +358,32 @@ def _synchronize_cfg_parallel_step_output( # ----------------------------------------------------------------------- def _preprocess_video(self, videos: torch.Tensor) -> torch.Tensor: - """uint8 [B,T,H,W,C] → bfloat16 [B,C,T,H,W] normalized to [-1,1]. - Source: lazy_joint_video_action L952-966 - """ - videos = videos.permute(0, 4, 1, 2, 3) # L952: b t h w c → b c t h w - if videos.dtype == torch.uint8: # L954 - videos = videos.float() / 255.0 # L955 - # Source eager path casts to bf16 *before* `normalize_video` - # (`wan_flow_matching_action_tf.py:956`). Doing the `* 2 - 1` - # normalization in fp32 and only then casting to bf16 changes the - # rounded input latents on real observations. 
- videos = videos.to(dtype=torch.bfloat16) # L956 - b, c, t, h, w = videos.shape # L957 - videos = videos.permute(0, 2, 1, 3, 4) # L958: b c t h w → b t c h w - videos = videos.reshape(b * t, c, h, w) # L959 - # normalize: (x - 0.5) / 0.5 = x * 2 - 1 # L960 (self.normalize_video) + """uint8 [B,T,H,W,C] → bfloat16 [B,C,T,H,W] normalized to [-1,1].""" + videos = videos.permute(0, 4, 1, 2, 3) + if videos.dtype == torch.uint8: + videos = videos.float() / 255.0 + # Cast to bf16 before normalization to preserve input rounding. + videos = videos.to(dtype=torch.bfloat16) + b, c, t, h, w = videos.shape + videos = videos.permute(0, 2, 1, 3, 4) + videos = videos.reshape(b * t, c, h, w) videos = videos * 2.0 - 1.0 - videos = videos.reshape(b, t, c, h, w).permute(0, 2, 1, 3, 4) # L961: back to b c t h w - return videos.to(dtype=torch.bfloat16) # L966 + videos = videos.reshape(b, t, c, h, w).permute(0, 2, 1, 3, 4) + return videos.to(dtype=torch.bfloat16) # ----------------------------------------------------------------------- # Text encoding # ----------------------------------------------------------------------- def _encode_text(self, text_tokens: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: - """Encode text prompt via UMT5. - Source: encode_prompt (L525-531) - """ - seq_lens = attention_mask.gt(0).sum(dim=1).long() # L526 - prompt_emb = self.text_encoder( # L527 + """Encode text prompt via UMT5.""" + seq_lens = attention_mask.gt(0).sum(dim=1).long() + prompt_emb = self.text_encoder( text_tokens, attention_mask, ).last_hidden_state - prompt_emb = prompt_emb.clone().to(dtype=torch.bfloat16) # L528 - for i, v in enumerate(seq_lens): # L529-530 + prompt_emb = prompt_emb.clone().to(dtype=torch.bfloat16) + for i, v in enumerate(seq_lens): prompt_emb[:, v:] = 0 return prompt_emb @@ -500,32 +399,22 @@ def _encode_image( width: int, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Encode first frame via CLIP + VAE. 
- Source: wan_flow_matching_action_tf.py encode_image (L547-564) - CLIP source: wan_video_image_encoder.py L869-887 (WanImageEncoder.encode_image) Returns: (clip_feas, ys, image_latent) """ device = image.device - batch_size = image.shape[0] # L548 + batch_size = image.shape[0] with torch.amp.autocast(dtype=torch.bfloat16, device_type=device.type): - # CLIP encode # L549 - # Upstream `WanImageEncoder.encode_image()`: - # L872-877: bicubic resize each frame batch to 224x224 - # L879: `self.transforms.transforms[-1](x * 0.5 + 0.5)` - # L882-883: run visual tower - # L886: return `use_31_block=True` output clip_context = self.image_encoder.encode_image(image) - # Build mask # L550-554 msk = torch.ones(batch_size, num_frames, height // 8, width // 8, device=device) msk[:, 1:] = 0 msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1) msk = msk.view(batch_size, msk.shape[1] // 4, 4, height // 8, width // 8) msk = msk.transpose(1, 2) - # VAE encode: first frame + zeros # L556-560 latent_dtype = image.dtype - image_input = image.transpose(1, 2) # L556: B,T,C,H,W → B,C,T,H,W + image_input = image.transpose(1, 2) image_zeros = torch.zeros( batch_size, 3, @@ -534,31 +423,18 @@ def _encode_image( width, dtype=latent_dtype, device=device, - ) # L557 + ) vae_input = torch.concat([image_input, image_zeros], dim=2) - y = self._encode_vae_latents(vae_input) # L560 + y = self._encode_vae_latents(vae_input) y = y.to(dtype=latent_dtype) - new_image = y[:, :, 0:1] # L561 - y = torch.concat([msk, y], dim=1) # L563: [B, 4+C_latent, T, H, W] + new_image = y[:, :, 0:1] + y = torch.concat([msk, y], dim=1) return clip_context, y, new_image def _encode_vae_latents(self, videos: torch.Tensor) -> torch.Tensor: - """Encode videos with DreamZero upstream WanVideoVAE semantics. 
- - Upstream `WanVideoVAE.encode()` does not return the raw posterior mean from - `quant_conv`; it first takes `mu` from `quant_conv(out).chunk(2, dim=1)` and - then applies channel-wise normalization `(mu - mean) * (1 / std)`. - - The multiplication form matters for bf16 parity. Source `WanVideoVAE` - stores `scale = [mean, 1.0 / std]` in fp32 and then casts that - precomputed reciprocal into the runtime dtype before the multiply. - Using bf16 division here introduces a measurable drift versus the - upstream DreamZero server. - - Source: `wan_video_vae.py` `VideoVAE_.encode()` - """ + """Encode videos into normalized VAE latents.""" input_dtype = videos.dtype hidden = self.vae._encode(videos.to(dtype=self.vae.dtype)) mu, _ = hidden.chunk(2, dim=1) @@ -568,26 +444,7 @@ def _encode_vae_latents(self, videos: torch.Tensor) -> torch.Tensor: return mu.to(dtype=input_dtype) def decode_video_latents(self, video_latents: torch.Tensor) -> torch.Tensor: - """Decode DreamZero normalized VAE latents into RGB video tensors. - - `forward()` returns `video` in the same form as upstream - `WANPolicyHead.lazy_joint_video_action()` / `GrootSimPolicy`: normalized - VAE latents shaped `[B, C, T, H, W]`, not decoded RGB frames. Upstream - only decodes those latents when saving the debug video on reset. - - Source correspondence: - - `socket_test_optimized_AR.py` `_reset_state()` calls - `action_head.vae.decode(video_across_time_cat, ...)`. - - `wan_video_vae.py` `WanVideoVAE.decode()` delegates to - `VideoVAE_.decode(z, scale)`. - - `wan_video_vae.py` `VideoVAE_.decode()` first inverts latent - normalization as `z = z / scale[1] + scale[0]`, where `scale[1]` is - the precomputed fp32 reciprocal std cast to the runtime dtype. - - The cast-before-division detail is required for bf16 video parity; doing - the inverse in fp32 and then casting changes RGB frames even though the - action path is unaffected. 
- """ + """Decode normalized VAE latents into RGB video tensors.""" vae_dtype = self.vae.dtype vae_device = next(self.vae.parameters()).device latents = video_latents.to(device=vae_device, dtype=vae_dtype) @@ -611,7 +468,6 @@ def _prefill_kv_cache( do_true_cfg: bool, ) -> None: """Prefill KV cache with first frame and/or current observation. - Source: lazy_joint_video_action L1078-1125 Uses predict_noise_maybe_with_cfg() for CFG parallel — same path as the denoise loop. The mixin handles rank dispatch automatically. @@ -624,7 +480,6 @@ def _prefill_kv_cache( head_dim = self.transformer.dim // self.transformer.num_heads if self.state.current_start_frame == 0: - # First call: create caches + encode first frame # L1051-1063 self.state.create_kv_caches( batch_size, dtype, @@ -637,7 +492,6 @@ def _prefill_kv_cache( zero_t = torch.zeros([batch_size, 1], device=device, dtype=torch.long) y_first = self.state.ys[:, :, 0:1] if self.state.ys is not None else None - # Prefill via predict_noise_maybe_with_cfg # L1080-1097 # KV cache update is a side effect in predict_noise() common = dict( hidden_states=image_latents.transpose(1, 2), @@ -674,9 +528,8 @@ def _prefill_kv_cache( true_cfg_scale=self.cfg_scale, cfg_normalize=False, ) - self.state.current_start_frame = 1 # L1098 + self.state.current_start_frame = 1 - # Subsequent: encode current observation # L1102-1125 if self.state.current_start_frame != 1: csf = self.state.current_start_frame nfpb = self.num_frame_per_block @@ -738,7 +591,6 @@ def diffuse( **kwargs, ) -> tuple[torch.Tensor, torch.Tensor]: """Denoising loop with CFG parallel support. - Source: lazy_joint_video_action L1164-1241 For each timestep: 1. Build positive_kwargs / negative_kwargs @@ -746,9 +598,9 @@ def diffuse( 3. scheduler_step_maybe_with_cfg() → VideoActionScheduler 4. 
_synchronize_cfg_parallel_step_output() """ - seq_len = kwargs["seq_len"] # L1046 - state_features = kwargs.get("state_features") # L950 - embodiment_id = kwargs.get("embodiment_id") # L949 + seq_len = kwargs["seq_len"] + state_features = kwargs.get("state_features") + embodiment_id = kwargs.get("embodiment_id") # Shared kwargs for predict_noise (both cond & uncond branches) common_kwargs = dict( @@ -756,17 +608,16 @@ def diffuse( current_start_frame=self.state.current_start_frame, state_features=state_features, embodiment_id=embodiment_id, - update_kv_cache=False, # L1206: denoising steps don't update KV + update_kv_cache=False, ) - noisy_input = video_latents # L1129 - noisy_input_action = action_latents # L1130 - for index in range(len(timesteps_video)): # L1164 - video_timestep = timesteps_video[index] # L1169 - action_timestep = timesteps_action[index] # L1168 + noisy_input = video_latents + noisy_input_action = action_latents + for index in range(len(timesteps_video)): + video_timestep = timesteps_video[index] + action_timestep = timesteps_action[index] batch_size = noisy_input.shape[0] - # Build per-frame timestep tensors # L1172-1181 timestep = ( torch.ones( [batch_size, self.num_frame_per_block], @@ -784,24 +635,22 @@ def diffuse( * action_timestep ) - # Compute y (image conditioning) slice # L1187-1190 csf = self.state.current_start_frame if csf + self.num_frame_per_block <= self.state.ys.shape[2]: - y = self.state.ys[:, :, csf : csf + self.num_frame_per_block] # L1188 + y = self.state.ys[:, :, csf : csf + self.num_frame_per_block] else: - y = self.state.ys[:, :, -self.num_frame_per_block :] # L1190 + y = self.state.ys[:, :, -self.num_frame_per_block :] - # Positive (cond) kwargs # L1191-1208 positive_kwargs = dict( - hidden_states=noisy_input.transpose(1, 2), # L1192 + hidden_states=noisy_input.transpose(1, 2), timestep_video=timestep, encoder_hidden_states=prompt_embeds, kv_cache=self.state.get_kv_caches(False), 
crossattn_cache=self.state.get_crossattn_caches(False), y=y, clip_feature=self.state.clip_feas, - action=noisy_input_action, # L1194 - timestep_action=timestep_action, # L1195 + action=noisy_input_action, + timestep_action=timestep_action, is_negative=False, **common_kwargs, ) @@ -833,10 +682,9 @@ def diffuse( ) flow_pred, flow_pred_action = noise_pred - # Scheduler step: video + action # L1225-1240 latents = (noisy_input, noisy_input_action) t = (video_timestep, action_timestep) - noise_pred_tuple = (flow_pred.transpose(1, 2), flow_pred_action) # L1226 + noise_pred_tuple = (flow_pred.transpose(1, 2), flow_pred_action) step_output = video_action_scheduler.step( noise_pred_tuple, t, @@ -845,13 +693,12 @@ def diffuse( ) noisy_input, noisy_input_action = step_output[0] - # Post-step sync # PR #2160 noisy_input, noisy_input_action = self._synchronize_cfg_parallel_step_output( (noisy_input, noisy_input_action), do_true_cfg, ) - return noisy_input, noisy_input_action # L1242-1243 + return noisy_input, noisy_input_action # ----------------------------------------------------------------------- # Main entry point @@ -865,9 +712,7 @@ def _transform_robot_obs(self, robot_obs: dict): @torch.no_grad() def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: - """Full inference step. Called by DiffusionEngine.step(). - Source: WANPolicyHead.lazy_joint_video_action (L929-1270) - """ + """Full inference step. Called by DiffusionEngine.step().""" extra_args = req.sampling_params.extra_args or {} robot_obs = extra_args.get("robot_obs") if robot_obs is None: @@ -937,7 +782,6 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: text_tokens = text_inputs["input_ids"].to(device) attention_mask = text_inputs["attention_mask"].to(device) - # ---- Step 2: Check reset + accumulate frames ---- (L968-981) # Explicit reset from OpenPI serving is carried by `extra_args["reset"]` # on the next inference request after websocket reset/session switch. 
if extra_args.get("reset", False): @@ -945,17 +789,15 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: # Auto-reset based on model state (before accumulation) if self.state.should_reset(text_tokens, 0, self.transformer.local_attn_size): self.state.reset() - self.state.language = text_tokens # L970/975 + self.state.language = text_tokens # Frame accumulation: stitched single frame → multi-frame video video_frames = self.state.accumulate_frames(stitched) # (T, H, W, C) videos = torch.from_numpy(video_frames).unsqueeze(0).to(device) # (B=1, T, H, W, C) - # ---- Step 3: Preprocess video ---- (L952-966) videos = self._preprocess_video(videos) # → [B,C,T,H,W] bf16 _, _, num_frames_raw, height, width = videos.shape - # ---- Step 4: Encode text ---- (L986-991) prompt_embeds = self._encode_text(text_tokens, attention_mask) # Negative prompt for CFG uncond branch (model constant) negative_prompt_embeds = None @@ -973,47 +815,42 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: neg_inputs["attention_mask"].to(device), ) - # ---- Step 5: Encode image (first call only) ---- (L1002-1005) # Extract first/last frame for CLIP + VAE encoding - if num_frames_raw == 4 or num_frames_raw == 9: # L996-999 - image = videos[:, :, -1:].transpose(1, 2) # L998: real-world eval + if num_frames_raw == 4 or num_frames_raw == 9: + image = videos[:, :, -1:].transpose(1, 2) else: - image = videos[:, :, :1].transpose(1, 2) # L1000 + image = videos[:, :, :1].transpose(1, 2) - if self.state.current_start_frame == 0: # L1002 + if self.state.current_start_frame == 0: clip_feas, ys, image = self._encode_image( image, self.num_frames, height, width, ) - self.state.clip_feas = clip_feas.to(dtype=image.dtype) # L1004 - self.state.ys = ys.to(dtype=image.dtype) # L1005 + self.state.clip_feas = clip_feas.to(dtype=image.dtype) + self.state.ys = ys.to(dtype=image.dtype) - # ---- Step 6: VAE encode observation frames ---- (L1013-1038) - if 
self.state.current_start_frame != 0: # L1013-1038 + if self.state.current_start_frame != 0: # Subsequent calls: encode current observation via VAE if (num_frames_raw - 1) // 4 == self.num_frame_per_block: - pass # L1020: no further action + pass elif num_frames_raw // 4 != self.num_frame_per_block: - # Repeat to match num_frame_per_block # L1023-1027 repeat_factor = self.num_frame_per_block // (num_frames_raw // 4) videos = torch.repeat_interleave(videos, repeat_factor, dim=2) first_frame = videos[:, :, 0:1] videos = torch.cat([first_frame, videos], dim=2) else: - first_frame = videos[:, :, 0:1] # L1029-1030 + first_frame = videos[:, :, 0:1] videos = torch.cat([first_frame, videos], dim=2) latent_dtype = videos.dtype with torch.no_grad(): - image = self._encode_vae_latents(videos) # L1032-1038 + image = self._encode_vae_latents(videos) image = image.to(dtype=latent_dtype) - # ---- Step 7: Generate noise (deterministic) ---- (L1041-1042, L176, L771) - # Source: wan_flow_matching_action_tf.py L1041 batch_size = image.shape[0] - generator = torch.Generator(device=device).manual_seed(self.seed) # L771 + generator = torch.Generator(device=device).manual_seed(self.seed) noise_obs = torch.randn( batch_size, 16, @@ -1023,8 +860,8 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: device=device, dtype=torch.bfloat16, generator=generator, - ) # L1041 - generator = torch.Generator(device=device).manual_seed(self.seed) # L771 + ) + generator = torch.Generator(device=device).manual_seed(self.seed) noise_action = torch.randn( batch_size, self.action_horizon, @@ -1032,16 +869,15 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: device=device, dtype=torch.bfloat16, generator=generator, - ) # L1042 + ) _, num_channels, num_frames, h_latent, w_latent = noise_obs.shape - frame_seqlen = int(h_latent * w_latent / 4) # L1045 - seq_len = frame_seqlen * num_frames # L1046 + frame_seqlen = int(h_latent * w_latent / 4) + seq_len = 
frame_seqlen * num_frames - image = image.transpose(1, 2) # L1048: [B,C,T,H,W]→[B,T,C,H,W] - noise_obs = noise_obs.transpose(1, 2) # L1049 + image = image.transpose(1, 2) + noise_obs = noise_obs.transpose(1, 2) - # ---- Step 8: Prefill KV cache, ---- (L1078-1125) do_true_cfg = self.cfg_scale > 1.0 and negative_prompt_embeds is not None self._prefill_kv_cache( image, @@ -1052,21 +888,19 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: do_true_cfg, ) - # ---- Step 9: Create schedulers ---- (L1134-1155) - sample_scheduler = copy.deepcopy(self.scheduler) # L1134-1137 - sample_scheduler_action = copy.deepcopy(self.scheduler) # L1138-1141 + sample_scheduler = copy.deepcopy(self.scheduler) + sample_scheduler_action = copy.deepcopy(self.scheduler) sample_scheduler.set_timesteps( self.num_inference_steps, device=device, shift=self.sigma_shift, - ) # L1142-1143 + ) sample_scheduler_action.set_timesteps( self.num_inference_steps, device=device, shift=self.sigma_shift, - ) # L1144-1145 + ) - # Decoupled inference: video sigmas end early # L1150-1157 if self.decouple_inference_noise: video_final_noise = self.video_inference_final_noise sigma_max = sample_scheduler.sigmas[0].item() @@ -1080,10 +914,9 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: sample_scheduler_action, ) - # ---- Step 10: Denoising loop ---- (L1164-1241) video_out, action_out = self.diffuse( - video_latents=noise_obs, # L1129 - action_latents=noise_action, # L1130 + video_latents=noise_obs, + action_latents=noise_action, timesteps_video=sample_scheduler.timesteps, timesteps_action=sample_scheduler_action.timesteps, prompt_embeds=prompt_embeds, @@ -1095,23 +928,19 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: embodiment_id=embodiment_id, ) - # ---- Step 11: Post-process ---- (L1242-1273) - if self.state.current_start_frame == 1: # L1246-1247 + if self.state.current_start_frame == 1: video_out = torch.cat([image, 
video_out], dim=1) - self.state.current_start_frame += self.num_frame_per_block # L1248 + self.state.current_start_frame += self.num_frame_per_block - # ---- Step 12: Action denormalization ---- (sim_policy.py L500-569) # q99 denorm: [-1,1] → real values action_out = self._denormalize_action(action_out.float(), embodiment_name) # Relative → absolute: only for relative_action_keys (joint_position only) - # Source: droid_relative.yaml L11 — relative_action_keys: [joint_position] # gripper_position is NOT relative, so don't add state back to it if self.relative_action and state_for_postprocess is not None: n_relative = self.relative_action_dim # 7 for DROID (joint only) # Use original state precision for post-denorm absolute recovery. # Upstream adds obs state after `eval_transform.unapply()` - # (`sim_policy.py` L511-566), i.e. after the action tensor has left # the bf16 denoising path. last_state = state_for_postprocess[:, 0, :n_relative] # (B, n_relative) action_out[..., :n_relative] = ( @@ -1124,7 +953,7 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: return DiffusionOutput( output={ - "actions": actions_np, # L1273 + "actions": actions_np, # Source `video_pred` is normalized VAE latent output, not RGB. # Use `decode_video_latents()` for DreamZero-equivalent debug # video decoding. @@ -1138,7 +967,6 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: def _load_action_norm_stats(self, stats_path: str) -> dict[str, dict[str, torch.Tensor]]: """Load per-embodiment action normalization stats from metadata.json. 
- Source: metadata.json → statistics.action.{joint_position,gripper_position}.{q01,q99} Returns: {embodiment_name: {"q01": Tensor(action_dim,), "q99": Tensor(action_dim,)}} """ @@ -1166,9 +994,7 @@ def _parse_action_norm_stats(metadata: dict) -> dict[str, dict[str, torch.Tensor @staticmethod def _parse_state_norm_stats(metadata: dict) -> dict[str, dict[str, torch.Tensor]]: - """Load per-embodiment state normalization stats from metadata.json. - Source: `StateActionTransform(normalization_modes=q99)` in eval transform. - """ + """Load per-embodiment state normalization stats from metadata.json.""" result = {} for emb_name, emb_data in metadata.items(): state_stats = emb_data.get("statistics", {}).get("state", {}) @@ -1189,9 +1015,7 @@ def _normalize_state( state: torch.Tensor, embodiment_name: str, ) -> torch.Tensor: - """Normalize state with q99 stats before feeding the model. - Source: `StateActionTransform.apply()` → `Normalizer.forward(mode='q99')`. - """ + """Normalize state with q99 stats before feeding the model.""" state_norm_stats = getattr(self, "state_norm_stats", {}) if embodiment_name not in state_norm_stats: return state @@ -1214,7 +1038,6 @@ def _denormalize_action( embodiment_name: str, ) -> torch.Tensor: """Denormalize action from [-1,1] to real values using q99 mode. - Source: state_action.py Normalizer.inverse() L188-207 Formula: real = (normalized + 1) / 2 * (q99 - q01) + q01 """ @@ -1236,38 +1059,18 @@ def _denormalize_action( @property def weights_sources(self): - """ComponentSource list for DiffusersPipelineLoader. - Source: last_steps.md P0-5 - """ + """ComponentSource list for DiffusersPipelineLoader.""" return self._weights_sources def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - """Load weights from DreamZero root checkpoint with key remapping. - Source: last_steps.md P0-6 - - DreamZero root keys have prefix ``action_head.{component}.*``. 
- This method dispatches each key to the appropriate component converter: - action_head.model.* → transformer.* (6a: prefix strip) - action_head.text_encoder.* → text_encoder.* (6b: UMT5 remapping) - action_head.image_encoder.* → image_encoder.* (6c: CLIP remapping + QKV split) - action_head.vae.* → vae.* (6d: WanVideoVAE -> diffusers remap) - Other keys (e.g. backbone.*) are silently skipped. - """ + """Load checkpoint weights with key remapping.""" loaded: set[str] = set() params = dict(self.named_parameters()) buffers = dict(self.named_buffers()) for name, tensor in weights: if name.startswith("action_head.model."): - # 6a. Transformer: prefix replacement + img_emb remap new_name = "transformer." + name[len("action_head.model.") :] - # DreamZero img_emb uses nn.Sequential (proj.0/1/3/4), - # CausalWanModel uses named layers (norm1/fc1/norm2/fc2) - # Source: wan_video_dit_action_casual_chunk.py L1380 - # DreamZero MLPProj: - # Sequential([0:LN(1280), 1:Linear(1280,1280), 2:GELU, 3:Linear(1280,5120), 4:LN(5120)]) - # CausalWanModel MLPProj: norm1=LN, fc1=ColParallel, act=GELU, fc2=RowParallel, norm2=LN - # Source: wan2_1_submodule.py L570-573 new_name = ( new_name.replace("img_emb.proj.0.", "img_emb.norm1.") .replace("img_emb.proj.1.", "img_emb.fc1.") @@ -1275,7 +1078,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: .replace("img_emb.proj.4.", "img_emb.norm2.") ) if new_name in params: - # Use default_weight_loader for ColumnParallelLinear/RowParallelLinear default_weight_loader(params[new_name], tensor) loaded.add(new_name) elif new_name in buffers: @@ -1283,7 +1085,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded.add(new_name) elif name.startswith("action_head.text_encoder."): - # 6b. 
Text encoder: DreamZero custom naming -> HF UMT5EncoderModel mapped = self._remap_text_encoder_key(name) if mapped is None: continue @@ -1294,13 +1095,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded.add(full_name) elif name.startswith("action_head.image_encoder."): - # 6c. Image encoder: source-shaped local port. - # Root checkpoint keys already match the local module layout: - # action_head.image_encoder.model.* -> image_encoder.model.* self._remap_image_encoder_key(name, tensor, params, loaded) elif name.startswith("action_head.vae."): - # 6d. VAE: DreamZero WanVideoVAE -> diffusers AutoencoderKLWan mapped = self._remap_vae_key(name) if mapped is None: continue @@ -1309,8 +1106,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: params[full_name].data.copy_(tensor) loaded.add(full_name) - # All other keys (backbone.*, etc.) are silently skipped - logger.info( "DreamZero load_weights: loaded %d parameters from root checkpoint", len(loaded), @@ -1318,32 +1113,19 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: return loaded # ----------------------------------------------------------------------- - # 6b. Text encoder key remapping (242 keys) + # Text encoder key remapping # ----------------------------------------------------------------------- @staticmethod def _remap_text_encoder_key(name: str) -> str | list[str] | None: - """Remap a single DreamZero text encoder key to HF UMT5EncoderModel name(s). - - DreamZero text encoder is a custom reimplementation of UMT5. - Source key structure: action_head.text_encoder.{subkey} - Target: UMT5EncoderModel state_dict keys (without 'text_encoder.' prefix) - - Returns target name(s) relative to text_encoder, or None to skip. 
- """ - # Strip the source prefix + """Remap a single text encoder key.""" subkey = name[len("action_head.text_encoder.") :] - # --- Global keys --- if subkey == "token_embedding.weight": - # shared.weight and encoder.embed_tokens.weight are the same tensor (tied); - # only shared.weight appears in named_parameters() return "shared.weight" if subkey == "norm.weight": return "encoder.final_layer_norm.weight" - # --- Per-block keys --- - # Pattern: blocks.{N}.{rest} m = re_module.match(r"blocks\.(\d+)\.(.*)", subkey) if not m: return None @@ -1352,7 +1134,6 @@ def _remap_text_encoder_key(name: str) -> str | list[str] | None: prefix = f"encoder.block.{block_idx}" - # Attention layer (layer.0) if rest == "attn.q.weight": return f"{prefix}.layer.0.SelfAttention.q.weight" if rest == "attn.k.weight": @@ -1366,7 +1147,6 @@ def _remap_text_encoder_key(name: str) -> str | list[str] | None: if rest == "norm1.weight": return f"{prefix}.layer.0.layer_norm.weight" - # FFN layer (layer.1) if rest == "ffn.gate.0.weight": return f"{prefix}.layer.1.DenseReluDense.wi_0.weight" if rest == "ffn.fc1.weight": @@ -1379,17 +1159,12 @@ def _remap_text_encoder_key(name: str) -> str | list[str] | None: return None # ----------------------------------------------------------------------- - # 6d. VAE key remapping (194 keys) + # VAE key remapping # ----------------------------------------------------------------------- @staticmethod def _remap_vae_key(name: str) -> str | None: - """Remap DreamZero WanVideoVAE keys to diffusers AutoencoderKLWan. 
- - Source key structure: `action_head.vae.model.*` - Upstream source: `wan_video_vae.py` `WanVideoVAE` / `VideoVAE_` - Target: diffusers `AutoencoderKLWan` state_dict keys (without `vae.` prefix) - """ + """Remap DreamZero VAE keys to `DistributedAutoencoderKLWan` keys.""" if not name.startswith("action_head.vae.model."): return None @@ -1485,7 +1260,7 @@ def _remap_vae_key(name: str) -> str | None: return None # ----------------------------------------------------------------------- - # 6c. Image encoder key remapping + # Image encoder key remapping # ----------------------------------------------------------------------- def _remap_image_encoder_key( @@ -1495,18 +1270,7 @@ def _remap_image_encoder_key( params: dict[str, torch.nn.Parameter], loaded: set[str], ) -> None: - """Map a DreamZero image encoder key onto the local source-shaped port. - - Source key structure: - action_head.image_encoder.model.* - - Target key structure: - image_encoder.model.* - - Because `DreamZeroImageEncoder` keeps DreamZero's original parameter - layout, this mapping is now a direct prefix strip instead of the older - HF `CLIPVisionModel` remap. - """ + """Map an image encoder key onto the local module.""" if not name.startswith("action_head.image_encoder."): return diff --git a/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py b/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py index e8ab22dbada..676d5c486c8 100644 --- a/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py +++ b/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py @@ -1,13 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""DreamZero pipeline persistent state. 
- -Consolidates all cross-forward() state that was originally scattered across: -- ARDroidRoboarenaPolicy._frame_buffers (socket_test_optimized_AR.py) -- WANPolicyHead.kv_cache1/kv_cache_neg (wan_flow_matching_action_tf.py) -- WANPolicyHead.clip_feas/ys (wan_flow_matching_action_tf.py) -""" +"""DreamZero pipeline persistent state.""" from __future__ import annotations @@ -19,7 +13,6 @@ logger = logging.getLogger(__name__) # Number of frames per chunk for subsequent calls (first call uses 1) -# Corresponds to: ARDroidRoboarenaPolicy.FRAMES_PER_CHUNK = 4 FRAMES_PER_CHUNK = 4 @@ -39,7 +32,6 @@ def __init__(self) -> None: # Frame accumulation (single stitched buffer) # Transform outputs stitched single frame per call. # We accumulate here to build multi-frame video for AR inference. - # Source: socket_test_optimized_AR.py L110-144 (adapted from per-camera to stitched) # ------------------------------------------------------------------ def accumulate_frames(self, stitched: np.ndarray) -> np.ndarray: @@ -74,50 +66,35 @@ def accumulate_frames(self, stitched: np.ndarray) -> np.ndarray: # ------------------------------------------------------------------ # Reset / should_reset - # Source: wan_flow_matching_action_tf.py L968-981 # ------------------------------------------------------------------ def reset(self) -> None: - """Clear all state. - - Source: - - socket_test_optimized_AR.py L302-330: ARDroidRoboarenaPolicy._reset_state - - wan_flow_matching_action_tf.py L185-199: WANPolicyHead.__init__ state fields - """ + """Clear all state.""" # Frame buffer — single stitched buffer self.stitched_buffer: list[np.ndarray] = [] self.call_count: int = 0 - # KV cache — from WANPolicyHead.__init__ L185-188. - # TODO(DreamZero): replace this model-local cache with vLLM's managed # KV cache once robot-policy diffusion supports that integration. 
self.kv_cache: list[torch.Tensor] | None = None self.kv_cache_neg: list[torch.Tensor] | None = None self.crossattn_cache: list[dict[str, bool | torch.Tensor | None]] | None = None self.crossattn_cache_neg: list[dict[str, bool | torch.Tensor | None]] | None = None - self.current_start_frame: int = 0 # WANPolicyHead L199 + self.current_start_frame: int = 0 - # Encoding cache — from WANPolicyHead.__init__ L197-200 self.clip_feas: torch.Tensor | None = None self.ys: torch.Tensor | None = None - self.language: torch.Tensor | None = None # WANPolicyHead L200 + self.language: torch.Tensor | None = None def should_reset(self, text_tokens: torch.Tensor | None, num_video_frames: int, local_attn_size: int) -> bool: - """Determine if state should be reset before this forward(). - - Source: wan_flow_matching_action_tf.py L968-981 - """ - # L968-971: first call (language not set yet) + """Determine if state should be reset before this forward().""" if self.language is None: logger.info("language is None, resetting") return True - # L972-975: language changed if text_tokens is not None and not torch.equal(self.language, text_tokens): logger.info("language changed, resetting") return True - # L976-978: single-frame input (signals new episode in real-world eval) # NOTE: after accumulate_frames, num_video_frames is the accumulated T # (1 for first call, 4 for subsequent). Only reset on true single-frame # which happens when the stitched_buffer was cleared externally. 
@@ -125,7 +102,6 @@ def should_reset(self, text_tokens: torch.Tensor | None, num_video_frames: int, logger.info("single frame input after first call, resetting") return True - # L979-981: KV cache exceeded local attention window if local_attn_size != -1 and self.current_start_frame >= local_attn_size: logger.info( "current_start_frame %d >= local_attn_size %d, resetting", self.current_start_frame, local_attn_size @@ -136,7 +112,6 @@ def should_reset(self, text_tokens: torch.Tensor | None, num_video_frames: int, # ------------------------------------------------------------------ # KV cache management - # Source: wan_flow_matching_action_tf.py L480-512 # ------------------------------------------------------------------ def create_kv_caches( @@ -148,9 +123,7 @@ def create_kv_caches( num_heads: int, head_dim: int, ) -> None: - """Initialize empty KV caches and cross-attention caches. - Source: wan_flow_matching_action_tf.py L480-512 - """ + """Initialize empty KV caches and cross-attention caches.""" self.kv_cache = [ torch.zeros(2, batch_size, 0, num_heads, head_dim, dtype=dtype, device=device) for _ in range(num_layers) ] @@ -167,9 +140,7 @@ def update_kv_cache( updated_kv: torch.Tensor, is_negative: bool = False, ) -> None: - """Update a single layer's KV cache after prefill. 
- Source: wan_flow_matching_action_tf.py L856-858 - """ + """Update a single layer's KV cache after prefill.""" cache = self.kv_cache_neg if is_negative else self.kv_cache assert cache is not None, "KV caches not initialized, call create_kv_caches first" cache[layer_index] = updated_kv.clone() diff --git a/vllm_omni/diffusion/models/dreamzero/transform/base.py b/vllm_omni/diffusion/models/dreamzero/transform/base.py index e6a3fee4283..dee78eb5450 100644 --- a/vllm_omni/diffusion/models/dreamzero/transform/base.py +++ b/vllm_omni/diffusion/models/dreamzero/transform/base.py @@ -51,9 +51,7 @@ class RobotPolicyTransform: ACTION_DIM: int def transform_input(self, obs: dict) -> dict: - """Dataset-specific transform: key map → stitch → template → state. - Source: dreamzero_cotrain.py apply_single() L498-596 - """ + """Dataset-specific transform: key map → stitch → template → state.""" # 1. Map image keys → unified keys images: dict[str, np.ndarray] = {} for src_key, dst_key in self.IMAGE_KEY_MAP.items(): diff --git a/vllm_omni/diffusion/models/dreamzero/transform/droid.py b/vllm_omni/diffusion/models/dreamzero/transform/droid.py index ad98a2b7a3a..981895b58b4 100644 --- a/vllm_omni/diffusion/models/dreamzero/transform/droid.py +++ b/vllm_omni/diffusion/models/dreamzero/transform/droid.py @@ -10,17 +10,6 @@ ├────────────┬────────────┤ │ left ext │ right ext │ └────────────┴────────────┘ - -Direct stitching source: - `third_party/dreamzero/groot/vla/model/dreamzero/transform/dreamzero_cotrain.py:337` - to - `third_party/dreamzero/groot/vla/model/dreamzero/transform/dreamzero_cotrain.py:355` - -Size assumptions for the current DreamZero path: - `third_party/dreamzero/groot/vla/configs/model/dreamzero/action_head/wan_flow_matching_action_tf.yaml:17` - `third_party/dreamzero/scripts/train/droid_training_full_finetune.sh:82` - `third_party/dreamzero/scripts/train/droid_training_full_finetune.sh:83` - `third_party/dreamzero/scripts/train/droid_training_full_finetune.sh:86` """ 
from __future__ import annotations @@ -56,13 +45,7 @@ class DroidTransform(RobotPolicyTransform): @classmethod def _preprocess_view(cls, arr: np.ndarray) -> np.ndarray: - """Match source eval transform for OXE_DROID camera views. - - Source transform chain from `experiment_cfg/conf.yaml`: - `VideoToTensor -> VideoCrop(scale=0.95, eval=center crop) -> - VideoResize(height=176, width=320, interpolation=linear, antialias=True) -> - VideoToNumpy` - """ + """Apply per-view crop and resize before stitching.""" frames = torch.from_numpy(arr).to(torch.float32).permute(0, 3, 1, 2) / 255.0 crop_h = int(arr.shape[1] * cls._VIDEO_CROP_SCALE) crop_w = int(arr.shape[2] * cls._VIDEO_CROP_SCALE) @@ -75,20 +58,7 @@ def _preprocess_view(cls, arr: np.ndarray) -> np.ndarray: return (frames.permute(0, 2, 3, 1) * 255.0).to(torch.uint8).cpu().numpy() def _stitch_views(self, images: dict[str, np.ndarray]) -> np.ndarray: - """OXE_DROID 2x2 stitching: wrist top (2x wide), exteriors bottom. - Direct layout correspondence: - - output canvas `(t, 2H, 2W)` ↔ `dreamzero_cotrain.py:337` - - wrist repeat-along-width ↔ `dreamzero_cotrain.py:339`-`342` - - bottom left/right placement ↔ `dreamzero_cotrain.py:344`-`353` - - The resize-to-176x320 step is not done inside upstream - `_prepare_video()`. Upstream expects the video path to already satisfy - the model's spatial assumptions; for the current DreamZero config that - assumption comes from: - - `wan_flow_matching_action_tf.yaml:17` (`frame_seqlen: 880`) - - `droid_training_full_finetune.sh:82`-`86` - so we materialize that precondition here for online serving. - """ + """OXE_DROID 2x2 stitching: wrist top (2x wide), exteriors bottom.""" left_ext = images.get("images/exterior_0") right_ext = images.get("images/exterior_1") wrist = images.get("images/wrist") @@ -104,21 +74,11 @@ def ensure_4d(arr: np.ndarray | None) -> np.ndarray | None: wrist = ensure_4d(wrist) # Determine shape from first available view. 
- # Upstream `_prepare_video()` assumes views already share the same H/W - # before it allocates `concat_images`; see `dreamzero_cotrain.py:337`. ref = next((v for v in [wrist, left_ext, right_ext] if v is not None), None) if ref is None: - # No direct upstream line: this is a serving-side empty placeholder. - # We choose 352x640 so the empty sample matches the active DreamZero - # DROID path (per-view 176x320 -> stitched 352x640), consistent with - # `droid_training_full_finetune.sh:82`-`86` and - # `wan_flow_matching_action_tf.yaml:17`. return np.zeros((1, 352, 640, 3), dtype=np.uint8) - # Match the source eval transform chain before `ConcatTransform` / - # `DreamTransform`: center crop by 0.95, then resize each view to - # 176x320. This is the actual preprocessing path used by - # `GrootSimPolicy.eval_transform`, not just a serving-side heuristic. + # Apply per-view crop + resize before stitching. def maybe_preprocess(arr: np.ndarray | None) -> np.ndarray | None: if arr is None: return None @@ -131,19 +91,14 @@ def maybe_preprocess(arr: np.ndarray | None) -> np.ndarray | None: assert ref is not None t, h, w, c = ref.shape - # Match upstream canvas dtype exactly: - # `concat_images = np.zeros(..., dtype=images.dtype)` at - # `dreamzero_cotrain.py:337`. out = np.zeros((t, 2 * h, 2 * w, c), dtype=ref.dtype) # (T, 2H, 2W, C) # Top row: wrist repeated 2x along width. - # Corresponds to `dreamzero_cotrain.py:339`-`342`. if wrist is not None: wrist_wide = np.repeat(wrist, 2, axis=2) # (T, H, 2W, C) out[:, :h, :] = wrist_wide # Bottom row: left exterior | right exterior. - # Corresponds to `dreamzero_cotrain.py:344`-`353`. if left_ext is not None: out[:, h:, :w] = left_ext if right_ext is not None: @@ -152,18 +107,7 @@ def maybe_preprocess(arr: np.ndarray | None) -> np.ndarray | None: return out def _language_template(self, prompt: str) -> str: - """Match the source OXE_DROID language prompt expansion exactly. 
- - Source correspondence: - - `dreamzero_cotrain.py:collate()` OXE_DROID branch - - `dreamzero_cotrain.py:HuggingfaceTokenizer(clean='whitespace')` - - Upstream online eval does *not* tokenize the raw instruction directly. - After `DreamTransform.apply_single()` emits the raw language string, - `collate()` rewrites it into the multi-view description below and only - then tokenizes it. Using the raw prompt here changes the token ids and - measurably changes the denoising trajectory. - """ + """Expand the language prompt for the OXE_DROID multi-view format.""" prompt = (prompt or "Perform the default behavior.").strip() prompt_lower = prompt.lower() return ( @@ -179,9 +123,7 @@ def _language_template(self, prompt: str) -> str: ) def _extract_raw_state(self, obs: dict) -> np.ndarray: - """OXE_DROID state: 7 joint + 1 gripper = 8 dims. - Source: dreamzero_cotrain.py _prepare_state() L436-467 - """ + """OXE_DROID state: 7 joint + 1 gripper = 8 dims.""" parts = [] if "observation/joint_position" in obs: parts.append(np.asarray(obs["observation/joint_position"], dtype=np.float64).flatten()) diff --git a/vllm_omni/diffusion/models/dreamzero/transform/roboarena.py b/vllm_omni/diffusion/models/dreamzero/transform/roboarena.py index 102c08ccb3b..d19bb0f8ce7 100644 --- a/vllm_omni/diffusion/models/dreamzero/transform/roboarena.py +++ b/vllm_omni/diffusion/models/dreamzero/transform/roboarena.py @@ -6,7 +6,6 @@ RoboArena uses 0-indexed exterior cameras, 3 views total (OXE_DROID embodiment). Same stitching layout as DROID — both map to OXE_DROID in DreamZero. 
-Source: socket_test_optimized_AR.py L104-108 (key mapping) """ from __future__ import annotations @@ -30,7 +29,6 @@ class RoboArenaTransform(DroidTransform): observation/exterior_image_1_left → right exterior observation/wrist_image_left → wrist - Source: socket_test_optimized_AR.py L104-108 """ IMAGE_KEY_MAP = { From a668866af67a3189317c0549d5fc420a6270b9c9 Mon Sep 17 00:00:00 2001 From: Meng Date: Sun, 19 Apr 2026 19:48:01 +0000 Subject: [PATCH 07/45] tests: fix diffusion scheduler mock imports after rebase Signed-off-by: Meng Co-authored-by: Yangshen Deng --- tests/diffusion/test_diffusion_scheduler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/diffusion/test_diffusion_scheduler.py b/tests/diffusion/test_diffusion_scheduler.py index 9409fb2c993..d2501d0ab0d 100644 --- a/tests/diffusion/test_diffusion_scheduler.py +++ b/tests/diffusion/test_diffusion_scheduler.py @@ -5,6 +5,7 @@ import queue import threading from types import SimpleNamespace +from unittest.mock import Mock, patch import pytest import torch From 87513773a597e7aa718cb3efd6709715f536d8dd Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Mon, 20 Apr 2026 18:59:21 +0000 Subject: [PATCH 08/45] feat: add DreamZero offline video export helpers Add offline example scripts to export DreamZero prediction videos and generate TP/CFG comparison outputs without changing the serving path. Document the workflow in the DreamZero quick start and example README, ignore local generated video artifacts, and add stage YAMLs for TP/CFG variants used by the comparison helper. Also update DreamZero weight loading to honor custom parameter weight loaders during remapped checkpoint loading. 
Signed-off-by: Yangshen Deng Co-authored-by: Meng --- .gitignore | 1 + docs/models/dreamzero/README.md | 4 +- docs/models/dreamzero/quick_start.md | 64 +++- examples/online_serving/dreamzero/README.md | 89 +++++- .../dreamzero/export_prediction_video.py | 293 ++++++++++++++++++ .../dreamzero/generate_comparison_videos.py | 169 ++++++++++ .../models/dreamzero/pipeline_dreamzero.py | 4 +- .../models/dreamzero/video_export_worker.py | 21 ++ .../stage_configs/dreamzero_tp1_cfg2.yaml | 24 ++ .../stage_configs/dreamzero_tp2_cfg1.yaml | 24 ++ .../stage_configs/dreamzero_tp2_cfg2.yaml | 24 ++ 11 files changed, 699 insertions(+), 18 deletions(-) create mode 100644 examples/online_serving/dreamzero/export_prediction_video.py create mode 100644 examples/online_serving/dreamzero/generate_comparison_videos.py create mode 100644 vllm_omni/diffusion/models/dreamzero/video_export_worker.py create mode 100644 vllm_omni/model_executor/stage_configs/dreamzero_tp1_cfg2.yaml create mode 100644 vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg1.yaml create mode 100644 vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg2.yaml diff --git a/.gitignore b/.gitignore index 378de441c7e..11357256016 100644 --- a/.gitignore +++ b/.gitignore @@ -249,6 +249,7 @@ outputs/ results/ generated/ output_*/ +/examples/online_serving/dreamzero/generated_predictions/ # Configuration overrides configs/local.yaml diff --git a/docs/models/dreamzero/README.md b/docs/models/dreamzero/README.md index 49791db386f..92d0b2208ec 100644 --- a/docs/models/dreamzero/README.md +++ b/docs/models/dreamzero/README.md @@ -1,4 +1,4 @@ # DreamZero -- `docs/models/dreamzero/quick_start.md`: quick start, environment checklist, standard e2e/example entry points, and optional upstream parity checks -- `examples/online_serving/dreamzero/README.md`: self-contained OpenPI server/client example with bundled real videos, plus per-script environment / dependency requirements +- `docs/models/dreamzero/quick_start.md`: quick 
start, server/client launch commands, prediction-video export, standard e2e/example entry points, and optional upstream parity checks +- `examples/online_serving/dreamzero/README.md`: self-contained OpenPI server/client example with bundled real videos, video-export helpers, DROID sim-eval notes, and per-script dependency requirements diff --git a/docs/models/dreamzero/quick_start.md b/docs/models/dreamzero/quick_start.md index 8b674b482fa..b450172f02a 100644 --- a/docs/models/dreamzero/quick_start.md +++ b/docs/models/dreamzero/quick_start.md @@ -12,12 +12,14 @@ Upstream DreamZero-dependent parity checks are optional and live under ## Environment checklist -- Sections 1-4: use the local `vllm-omni` environment. +- Sections 1-5: use the local `vllm-omni` environment. - Bundled OpenPI client extra deps: `openpi-client`, `websockets`, `opencv-python` +- Prediction-video export helpers: local `vllm-omni` environment plus + `opencv-python` and `pillow` - DROID sim-eval client: use an external Isaac Lab / `sim-evals` environment, plus `openpi-client`, `websockets`, `opencv-python`, and `mediapy` -- Optional upstream parity tests: also require `~/code/dreamzero` and - `~/code/dreamzero/checkpoints/dreamzero` +- Optional upstream parity tests: also require `DREAMZERO_REPO` with a + checkpoint under `DREAMZERO_REPO/checkpoints/dreamzero` --- @@ -67,7 +69,51 @@ python examples/online_serving/dreamzero/openpi_client.py \ --- -## 3. Standard online e2e test +## 3. Export prediction videos + +The OpenPI client receives only actions. For visual debugging, use the offline +example helper to collect DreamZero `video_pred` latents from vLLM and decode +them to MP4. 
+ +Single `TP=1, CF_P=1` export: + +```bash +python examples/online_serving/dreamzero/export_prediction_video.py \ + --model GEAR-Dreams/DreamZero-DROID \ + --stage-configs-path vllm_omni/model_executor/stage_configs/dreamzero.yaml \ + --output-dir examples/online_serving/dreamzero/generated_predictions/comparison_videos \ + --output-stem tp1_cfg1_vllm_example +``` + +Generate the comparison set: + +```bash +python examples/online_serving/dreamzero/generate_comparison_videos.py \ + --skip-existing \ + --continue-on-error +``` + +Outputs are written to: + +- `examples/online_serving/dreamzero/generated_predictions/comparison_videos/` + +Useful files: + +- `dreamzero_input_reference.mp4`: stitched real camera input +- `tp1_cfg1_vllm_example.mp4` +- `tp1_cfg2_vllm_example.mp4` +- `tp2_cfg1_vllm_example.mp4` +- `tp2_cfg2_vllm_example.mp4` +- `dreamzero_upstream_reference.mp4` when an upstream reference video is supplied with `--upstream-video` +- `manifest.json`: successful videos and any failed variants + +`tp2_cfg2` requires four free GPUs (`TP=2`, `CF_P=2`). If GPU capacity is not +available, the helper can still keep the successful variants and record the +failure in `manifest.json`. + +--- + +## 4. Standard online e2e test The standard self-contained online serving e2e test is: @@ -80,7 +126,7 @@ checks metadata, action output shape, finite values, and reset behavior. --- -## 4. Shared example test +## 5. Shared example test The example test executes the same client script from `examples/`: @@ -90,7 +136,7 @@ PYTHONPATH=. .venv/bin/python -m pytest tests/examples/online_serving/test_dream --- -## 5. Optional upstream parity baseline +## 6. Optional upstream parity baseline The currently validated strict-parity baseline is: @@ -108,7 +154,7 @@ Current status: --- -## 6. Recommended first run +## 7. Recommended first run If you want the least surprising setup, start with: @@ -121,7 +167,7 @@ Then move to `CF_P=2` if you want CFG parallel. --- -## 7. 
Formal upstream end-to-end parity test +## 8. Formal upstream end-to-end parity test The formal server-vs-server parity test is: @@ -146,7 +192,7 @@ This test checks: --- -## 8. Related docs +## 9. Related docs - `docs/models/dreamzero/README.md`: DreamZero documentation index - `examples/online_serving/dreamzero/README.md`: bundled OpenPI example diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md index 73cfb2b7a13..ab6213bbc75 100644 --- a/examples/online_serving/dreamzero/README.md +++ b/examples/online_serving/dreamzero/README.md @@ -7,12 +7,16 @@ compatible OpenPI websocket client using bundled real camera videos. - `run_server.sh`: launch DreamZero OpenPI serving - `openpi_client.py`: websocket client that sends real observations +- `export_prediction_video.py`: offline helper that runs vLLM once and decodes DreamZero `video_pred` latents to MP4 +- `generate_comparison_videos.py`: batch helper for TP/CFG comparison videos - `droid_sim_eval_client.py`: DROID `sim-evals` rollout client for the vLLM OpenPI server - `assets/`: minimal real camera videos used by the example +- `generated_predictions/`: ignored local debug/video outputs; do not upload or rely on this directory for serving ## Environment requirements -- `run_server.sh`, `vllm serve`, `openpi_client.py`, and the standard example/e2e tests: +- `run_server.sh`, `vllm serve`, `openpi_client.py`, `export_prediction_video.py`, + `generate_comparison_videos.py`, and the standard example/e2e tests: use the local `vllm-omni` environment. - `openpi_client.py` extra deps: @@ -20,6 +24,12 @@ compatible OpenPI websocket client using bundled real camera videos. 
pip install openpi-client websockets opencv-python ``` +- video export helper extra deps: + +```bash +pip install opencv-python pillow +``` + - `droid_sim_eval_client.py` must run in an external Isaac Lab / `sim-evals` environment, and also needs: @@ -34,8 +44,8 @@ pip install typing-extensions ``` - Optional `tests/dreamzero/upstream/*` parity tests also require: - - local upstream repo at `~/code/dreamzero` - - local checkpoint at `~/code/dreamzero/checkpoints/dreamzero` + - `DREAMZERO_REPO` pointing to an upstream DreamZero checkout + - an upstream checkpoint at `DREAMZERO_REPO/checkpoints/dreamzero` ## Start the server @@ -87,6 +97,74 @@ It validates: - finite action values - reset response +## Export prediction videos from example inputs + +DreamZero serving returns actions to the websocket client. The model also +produces a latent `video_pred`, but vLLM does **not** auto-save it from the +server path. Use the offline helper below when you want visual debug videos. + +This script: + +1. loads the bundled camera videos from `assets/` +2. builds the same DreamZero/OpenPI observations as the client +3. runs vLLM locally through `Omni` +4. collects `video_pred` latents from `OmniRequestOutput.images` +5. decodes them on the DreamZero worker through `DreamZeroVideoExportWorkerExtension` +6. 
writes an MP4 under `generated_predictions/` + +Single-config export: + +```bash +python examples/online_serving/dreamzero/export_prediction_video.py \ + --model GEAR-Dreams/DreamZero-DROID \ + --stage-configs-path vllm_omni/model_executor/stage_configs/dreamzero.yaml \ + --output-dir examples/online_serving/dreamzero/generated_predictions/comparison_videos \ + --output-stem tp1_cfg1_vllm_example +``` + +Optional flags: + +- `--save-input-video`: also writes a stitched real-input camera video +- `--save-gif`: also writes GIFs for GitHub comments +- `--save-actions`: also writes action chunks as `.npz` + +Batch comparison export: + +```bash +python examples/online_serving/dreamzero/generate_comparison_videos.py \ + --skip-existing \ + --continue-on-error +``` + +The batch helper tries to generate: + +- `dreamzero_input_reference.mp4`: stitched real input video +- `tp1_cfg1_vllm_example.mp4` +- `tp1_cfg2_vllm_example.mp4` +- `tp2_cfg1_vllm_example.mp4` +- `tp2_cfg2_vllm_example.mp4` +- `dreamzero_upstream_reference.mp4`: copied when `--upstream-video` is provided + +Notes: + +- `tp2_cfg2` needs four free GPUs because `TP=2` and `CF_P=2`. +- If a variant fails, `manifest.json` records the failure and keeps all successful videos. +- The helper does not run the upstream DreamZero server. To include an upstream + reference video, pass `--upstream-video /path/to/video.mp4`. + +Current cleaned comparison outputs are under: + +- `examples/online_serving/dreamzero/generated_predictions/comparison_videos/` + +The useful files are: + +- `dreamzero_input_reference.mp4` +- `tp1_cfg1_vllm_example.mp4` +- `tp1_cfg2_vllm_example.mp4` +- `tp2_cfg1_vllm_example.mp4` +- `dreamzero_upstream_reference.mp4` +- `manifest.json` + ## Run DROID sim-eval against the vLLM server This is the closest setup to an end-to-end simulated policy rollout. 
@@ -174,10 +252,9 @@ sim-eval client executes before replanning: - the client then sends a fresh observation and asks the server for a new `(24, 8)` chunk -This follows the upstream DreamZero sim-eval client: +This follows the upstream DreamZero sim-eval client behavior: -- `third_party/dreamzero/eval_utils/run_sim_eval.py` defaults - `open_loop_horizon` to `8` +- the upstream sim-eval default `open_loop_horizon` is `8` - DreamZero action outputs use `action_horizon=24` The split is intentional: `24` lets the model predict a longer future plan, diff --git a/examples/online_serving/dreamzero/export_prediction_video.py b/examples/online_serving/dreamzero/export_prediction_video.py new file mode 100644 index 00000000000..1a57e6eb0f8 --- /dev/null +++ b/examples/online_serving/dreamzero/export_prediction_video.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import argparse +import uuid +from pathlib import Path + +import cv2 +import numpy as np +import torch +from PIL import Image + +from vllm_omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput + +WORKER_EXTENSION = "vllm_omni.diffusion.models.dreamzero.video_export_worker.DreamZeroVideoExportWorkerExtension" +DEFAULT_MODEL = "GEAR-Dreams/DreamZero-DROID" +DEFAULT_PROMPT = "Move the pan forward and use the brush in the middle of the plates to brush the inside of the pan" +DEFAULT_OUTPUT_DIR = Path(__file__).resolve().parent / "generated_predictions" +DEFAULT_OUTPUT_STEM = "dreamzero_prediction" +DEFAULT_SESSION_PREFIX = "dreamzero-export" +ASSETS_DIR = Path(__file__).resolve().parent / "assets" +RELATIVE_OFFSETS = [-23, -16, -8, 0] +ACTION_HORIZON = 24 +CAMERA_FILES = { + "observation/exterior_image_0_left": "exterior_image_1_left.mp4", + "observation/exterior_image_1_left": 
"exterior_image_2_left.mp4", + "observation/wrist_image_left": "wrist_image_left.mp4", +} + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Export DreamZero prediction video from bundled example inputs.") + parser.add_argument("--model", default=DEFAULT_MODEL) + parser.add_argument("--stage-configs-path", type=Path, required=True) + parser.add_argument("--video-dir", type=Path, default=ASSETS_DIR) + parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR) + parser.add_argument("--output-stem", default=DEFAULT_OUTPUT_STEM) + parser.add_argument("--prompt", default=DEFAULT_PROMPT) + parser.add_argument("--session-id", default=None) + parser.add_argument("--save-input-video", action="store_true") + parser.add_argument("--save-gif", action="store_true") + parser.add_argument("--save-actions", action="store_true") + parser.add_argument("--fps", type=int, default=5) + return parser.parse_args() + + +def _load_all_frames(video_path: Path) -> np.ndarray: + cap = cv2.VideoCapture(str(video_path)) + frames = [] + while True: + ok, frame = cap.read() + if not ok: + break + frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + cap.release() + if not frames: + raise RuntimeError(f"No frames loaded from {video_path}") + return np.stack(frames, axis=0) + + +def _load_camera_frames(video_dir: Path) -> dict[str, np.ndarray]: + camera_frames: dict[str, np.ndarray] = {} + for camera_key, file_name in CAMERA_FILES.items(): + video_path = video_dir / file_name + if not video_path.exists(): + raise FileNotFoundError(f"Missing DreamZero example asset: {video_path}") + camera_frames[camera_key] = _load_all_frames(video_path) + return camera_frames + + +def _build_frame_schedule(total_frames: int, num_chunks: int) -> list[list[int]]: + chunks: list[list[int]] = [] + current_frame = 23 + for _ in range(num_chunks): + indices = [max(current_frame + offset, 0) for offset in RELATIVE_OFFSETS] + if indices[-1] >= total_frames: + 
break + chunks.append(indices) + current_frame += ACTION_HORIZON + return chunks + + +def _make_obs_from_video( + camera_frames: dict[str, np.ndarray], + frame_indices: list[int], + *, + prompt: str, + session_id: str, +) -> dict: + obs: dict = {} + for camera_key, all_frames in camera_frames.items(): + selected = all_frames[frame_indices] + obs[camera_key] = selected[0] if len(frame_indices) == 1 else selected + + obs["observation/joint_position"] = np.zeros(7, dtype=np.float32) + obs["observation/cartesian_position"] = np.zeros(6, dtype=np.float32) + obs["observation/gripper_position"] = np.zeros(1, dtype=np.float32) + obs["prompt"] = prompt + obs["session_id"] = session_id + return obs + + +def _build_observations(video_dir: Path, prompt: str, session_id: str) -> tuple[dict[str, np.ndarray], list[dict]]: + camera_frames = _load_camera_frames(video_dir) + total_frames = min(frames.shape[0] for frames in camera_frames.values()) + chunks = _build_frame_schedule(total_frames, 1) + observations = [ + _make_obs_from_video(camera_frames, [0], prompt=prompt, session_id=session_id), + ] + if chunks: + observations.append( + _make_obs_from_video( + camera_frames, + chunks[0], + prompt=prompt, + session_id=session_id, + ) + ) + if len(observations) < 2: + raise RuntimeError("Need at least two DreamZero example observations to export a prediction video.") + return camera_frames, observations[:2] + + +def _extract_latents(output: OmniRequestOutput) -> torch.Tensor: + if not isinstance(output, OmniRequestOutput): + raise TypeError(f"Expected OmniRequestOutput, got {type(output)!r}") + if not output.images: + raise RuntimeError("DreamZero output does not contain video latents in `images`.") + + latents = output.images[0] + if not isinstance(latents, torch.Tensor): + raise TypeError(f"Expected tensor latents, got {type(latents)!r}") + + latents = latents.detach().cpu() + if latents.dim() == 4: + latents = latents.unsqueeze(0) + if latents.dim() != 5: + raise 
ValueError(f"Unexpected latent shape: {tuple(latents.shape)}") + + if latents.shape[1] < latents.shape[2]: + latents = latents.transpose(1, 2).contiguous() + return latents + + +def _write_mp4(path: Path, frames: np.ndarray, fps: int) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + height, width = frames.shape[1:3] + writer = cv2.VideoWriter( + str(path), + cv2.VideoWriter_fourcc(*"mp4v"), + float(fps), + (width, height), + ) + if not writer.isOpened(): + raise RuntimeError(f"Failed to open video writer for {path}") + try: + for frame in frames: + writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) + finally: + writer.release() + + +def _write_gif(path: Path, frames: np.ndarray, fps: int) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + images = [Image.fromarray(frame) for frame in frames] + duration_ms = max(int(round(1000 / max(fps, 1))), 1) + images[0].save( + path, + save_all=True, + append_images=images[1:], + duration=duration_ms, + loop=0, + ) + + +def _stitch_input_frames(camera_frames: dict[str, np.ndarray]) -> np.ndarray: + total_frames = min(frames.shape[0] for frames in camera_frames.values()) + stitched = [] + for frame_index in range(total_frames): + left = camera_frames["observation/exterior_image_0_left"][frame_index] + right = camera_frames["observation/exterior_image_1_left"][frame_index] + wrist = camera_frames["observation/wrist_image_left"][frame_index] + pad = np.zeros((left.shape[0], left.shape[1], 3), dtype=np.uint8) + canvas = np.concatenate([left, right], axis=1) + bottom = np.concatenate([wrist, pad], axis=1) + stitched.append(np.concatenate([canvas, bottom], axis=0)) + return np.stack(stitched, axis=0) + + +def _run_generation( + model: str, stage_configs_path: Path, observations: list[dict] +) -> tuple[Omni, list[OmniRequestOutput]]: + omni = Omni( + model=model, + stage_configs_path=str(stage_configs_path), + enforce_eager=True, + worker_extension_cls=WORKER_EXTENSION, + ) + + outputs: list[OmniRequestOutput] = 
[] + for index, obs in enumerate(observations): + sampling_params = OmniDiffusionSamplingParams( + extra_args={ + "reset": index == 0, + "session_id": obs["session_id"], + "robot_obs": obs, + } + ) + result = omni.generate(obs["prompt"], sampling_params_list=[sampling_params]) + if not result: + raise RuntimeError(f"No output returned for DreamZero request {index}") + outputs.append(result[0]) + return omni, outputs + + +def _decode_with_worker(omni: Omni, full_latents: torch.Tensor) -> np.ndarray: + stage_client = omni.engine.stage_clients[0] + engine = getattr(stage_client, "_engine", None) + if engine is None: + raise RuntimeError("DreamZero export requires inline diffusion stage access.") + + decoded = engine.executor.collective_rpc( + "decode_video_latents_to_uint8", + args=(full_latents,), + unique_reply_rank=0, + exec_all_ranks=True, + ) + if isinstance(decoded, torch.Tensor): + decoded = decoded.numpy() + if not isinstance(decoded, np.ndarray): + raise TypeError(f"Unexpected decoded output type: {type(decoded)!r}") + return decoded + + +def main() -> None: + args = _parse_args() + session_id = args.session_id or f"{DEFAULT_SESSION_PREFIX}-{uuid.uuid4()}" + + camera_frames, observations = _build_observations( + video_dir=args.video_dir, + prompt=args.prompt, + session_id=session_id, + ) + + args.output_dir.mkdir(parents=True, exist_ok=True) + + if args.save_input_video: + input_frames = _stitch_input_frames(camera_frames) + _write_mp4(args.output_dir / f"{args.output_stem}_input.mp4", input_frames, fps=15) + if args.save_gif: + _write_gif(args.output_dir / f"{args.output_stem}_input.gif", input_frames[::3], fps=5) + + omni = None + try: + omni, outputs = _run_generation( + model=args.model, + stage_configs_path=args.stage_configs_path, + observations=observations, + ) + latent_steps = [_extract_latents(output) for output in outputs] + full_latents = torch.cat(latent_steps, dim=2) + frames = _decode_with_worker(omni, full_latents) + finally: + if omni is not 
None: + omni.close() + + mp4_path = args.output_dir / f"{args.output_stem}.mp4" + + _write_mp4(mp4_path, frames, fps=args.fps) + print(f"SAVED_MP4={mp4_path}") + + if args.save_gif: + gif_path = args.output_dir / f"{args.output_stem}.gif" + _write_gif(gif_path, frames, fps=args.fps) + print(f"SAVED_GIF={gif_path}") + + if args.save_actions: + npz_path = args.output_dir / f"{args.output_stem}_actions.npz" + np.savez( + npz_path, + step0=np.asarray(outputs[0].multimodal_output.get("actions")), + step1=np.asarray(outputs[1].multimodal_output.get("actions")), + ) + print(f"SAVED_ACTIONS={npz_path}") + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/dreamzero/generate_comparison_videos.py b/examples/online_serving/dreamzero/generate_comparison_videos.py new file mode 100644 index 00000000000..e604874c71c --- /dev/null +++ b/examples/online_serving/dreamzero/generate_comparison_videos.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import argparse +import json +import shutil +import subprocess +import sys +from pathlib import Path + +import cv2 +import numpy as np + +REPO_ROOT = Path(__file__).resolve().parents[3] +EXAMPLE_DIR = Path(__file__).resolve().parent +EXPORT_SCRIPT = EXAMPLE_DIR / "export_prediction_video.py" +DEFAULT_OUTPUT_DIR = EXAMPLE_DIR / "generated_predictions" / "comparison_videos" +DEFAULT_MODEL = "GEAR-Dreams/DreamZero-DROID" +ASSETS_DIR = EXAMPLE_DIR / "assets" + +CAMERA_FILES = { + "observation/exterior_image_0_left": "exterior_image_1_left.mp4", + "observation/exterior_image_1_left": "exterior_image_2_left.mp4", + "observation/wrist_image_left": "wrist_image_left.mp4", +} + +STAGE_CONFIGS = { + "tp1_cfg1": REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "dreamzero.yaml", + "tp1_cfg2": REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / 
"dreamzero_tp1_cfg2.yaml", + "tp2_cfg1": REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "dreamzero_tp2_cfg1.yaml", + "tp2_cfg2": REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "dreamzero_tp2_cfg2.yaml", +} + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Generate DreamZero comparison videos for four vLLM configs.") + parser.add_argument("--model", default=DEFAULT_MODEL) + parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR) + parser.add_argument("--python", default=sys.executable) + parser.add_argument("--upstream-video", type=Path, default=None) + parser.add_argument("--fps", type=int, default=5) + parser.add_argument("--skip-existing", action="store_true") + parser.add_argument("--continue-on-error", action="store_true") + return parser.parse_args() + + +def _load_all_frames(video_path: Path) -> np.ndarray: + cap = cv2.VideoCapture(str(video_path)) + frames = [] + while True: + ok, frame = cap.read() + if not ok: + break + frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + cap.release() + if not frames: + raise RuntimeError(f"No frames loaded from {video_path}") + return np.stack(frames, axis=0) + + +def _load_camera_frames() -> dict[str, np.ndarray]: + camera_frames: dict[str, np.ndarray] = {} + for camera_key, file_name in CAMERA_FILES.items(): + camera_frames[camera_key] = _load_all_frames(ASSETS_DIR / file_name) + return camera_frames + + +def _write_mp4(path: Path, frames: np.ndarray, fps: int) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + height, width = frames.shape[1:3] + writer = cv2.VideoWriter(str(path), cv2.VideoWriter_fourcc(*"mp4v"), float(fps), (width, height)) + if not writer.isOpened(): + raise RuntimeError(f"Failed to open video writer for {path}") + try: + for frame in frames: + writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) + finally: + writer.release() + + +def _write_input_reference(output_dir: Path) -> Path: + 
output_path = output_dir / "dreamzero_input_reference.mp4" + camera_frames = _load_camera_frames() + total_frames = min(frames.shape[0] for frames in camera_frames.values()) + stitched = [] + for frame_index in range(total_frames): + left = camera_frames["observation/exterior_image_0_left"][frame_index] + right = camera_frames["observation/exterior_image_1_left"][frame_index] + wrist = camera_frames["observation/wrist_image_left"][frame_index] + pad = np.zeros((left.shape[0], left.shape[1], 3), dtype=np.uint8) + top = np.concatenate([left, right], axis=1) + bottom = np.concatenate([wrist, pad], axis=1) + stitched.append(np.concatenate([top, bottom], axis=0)) + _write_mp4(output_path, np.stack(stitched, axis=0), fps=15) + return output_path + + +def _run_export(args: argparse.Namespace, config_name: str, stage_config_path: Path) -> Path: + output_stem = f"{config_name}_vllm_example" + output_path = args.output_dir / f"{output_stem}.mp4" + if args.skip_existing and output_path.exists(): + return output_path + + cmd = [ + args.python, + str(EXPORT_SCRIPT), + "--model", + args.model, + "--stage-configs-path", + str(stage_config_path), + "--output-dir", + str(args.output_dir), + "--output-stem", + output_stem, + "--fps", + str(args.fps), + ] + subprocess.run(cmd, check=True, cwd=REPO_ROOT) + return output_path + + +def _copy_upstream_video(output_dir: Path, upstream_video: Path) -> Path: + output_dir.mkdir(parents=True, exist_ok=True) + dst = output_dir / "dreamzero_upstream_reference.mp4" + shutil.copy2(upstream_video, dst) + return dst + + +def _display_path(path: Path) -> str: + try: + return str(path.resolve().relative_to(REPO_ROOT)) + except ValueError: + return str(path) + + +def main() -> None: + args = _parse_args() + args.output_dir.mkdir(parents=True, exist_ok=True) + + manifest: dict[str, str] = {} + failures: dict[str, str] = {} + manifest["input_reference"] = _display_path(_write_input_reference(args.output_dir)) + + for config_name, stage_config_path in 
STAGE_CONFIGS.items(): + try: + manifest[config_name] = _display_path(_run_export(args, config_name, stage_config_path)) + except subprocess.CalledProcessError as exc: + if not args.continue_on_error: + raise + failures[config_name] = str(exc).replace(str(REPO_ROOT) + "/", "") + + if args.upstream_video is not None: + manifest["upstream_reference"] = _display_path(_copy_upstream_video(args.output_dir, args.upstream_video)) + + manifest_path = args.output_dir / "manifest.json" + manifest_path.write_text(json.dumps({"videos": manifest, "failures": failures}, indent=2) + "\n") + + for name, path in manifest.items(): + print(f"{name}={path}") + for name, error in failures.items(): + print(f"FAILED_{name}={error}") + print(f"manifest={manifest_path}") + + +if __name__ == "__main__": + main() diff --git a/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py b/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py index 59011eb22b0..24834604185 100644 --- a/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py +++ b/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py @@ -1078,7 +1078,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: .replace("img_emb.proj.4.", "img_emb.norm2.") ) if new_name in params: - default_weight_loader(params[new_name], tensor) + param = params[new_name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, tensor) loaded.add(new_name) elif new_name in buffers: buffers[new_name].data.copy_(tensor) diff --git a/vllm_omni/diffusion/models/dreamzero/video_export_worker.py b/vllm_omni/diffusion/models/dreamzero/video_export_worker.py new file mode 100644 index 00000000000..20c8cd8cdc7 --- /dev/null +++ b/vllm_omni/diffusion/models/dreamzero/video_export_worker.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import torch + + +class 
DreamZeroVideoExportWorkerExtension: + """DreamZero worker RPCs used by offline example video export.""" + + def decode_video_latents_to_uint8(self, video_latents: torch.Tensor) -> torch.Tensor: + if self.model_runner is None or self.model_runner.pipeline is None: + raise RuntimeError("DreamZero pipeline is not initialized on this worker.") + + with torch.inference_mode(): + decoded = self.model_runner.pipeline.decode_video_latents(video_latents) + decoded = decoded.squeeze(0).permute(1, 2, 3, 0).contiguous() + decoded = decoded.clamp(-1, 1) * 0.5 + 0.5 + decoded = (decoded * 255.0).round().to(torch.uint8).cpu() + return decoded diff --git a/vllm_omni/model_executor/stage_configs/dreamzero_tp1_cfg2.yaml b/vllm_omni/model_executor/stage_configs/dreamzero_tp1_cfg2.yaml new file mode 100644 index 00000000000..b83f77460f9 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/dreamzero_tp1_cfg2.yaml @@ -0,0 +1,24 @@ +stage_args: + - stage_id: 0 + stage_type: diffusion + runtime: + devices: "0,1" + engine_args: + model_stage: diffusion + model_class_name: DreamZeroPipeline + distributed_executor_backend: "mp" + max_num_seqs: 1 + parallel_config: + tensor_parallel_size: 1 + cfg_parallel_size: 2 + model_config: + default_robot_embodiment: roboarena + policy_server_config: + image_resolution: [180, 320] + n_external_cameras: 2 + needs_wrist_camera: true + needs_stereo_camera: false + needs_session_id: true + action_space: joint_position + final_output: true + final_output_type: image diff --git a/vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg1.yaml b/vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg1.yaml new file mode 100644 index 00000000000..7f5a0c60598 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg1.yaml @@ -0,0 +1,24 @@ +stage_args: + - stage_id: 0 + stage_type: diffusion + runtime: + devices: "0,1" + engine_args: + model_stage: diffusion + model_class_name: DreamZeroPipeline + distributed_executor_backend: "mp" + 
max_num_seqs: 1 + parallel_config: + tensor_parallel_size: 2 + cfg_parallel_size: 1 + model_config: + default_robot_embodiment: roboarena + policy_server_config: + image_resolution: [180, 320] + n_external_cameras: 2 + needs_wrist_camera: true + needs_stereo_camera: false + needs_session_id: true + action_space: joint_position + final_output: true + final_output_type: image diff --git a/vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg2.yaml b/vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg2.yaml new file mode 100644 index 00000000000..20e7c581c9a --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg2.yaml @@ -0,0 +1,24 @@ +stage_args: + - stage_id: 0 + stage_type: diffusion + runtime: + devices: "0,1,2,3" + engine_args: + model_stage: diffusion + model_class_name: DreamZeroPipeline + distributed_executor_backend: "mp" + max_num_seqs: 1 + parallel_config: + tensor_parallel_size: 2 + cfg_parallel_size: 2 + model_config: + default_robot_embodiment: roboarena + policy_server_config: + image_resolution: [180, 320] + n_external_cameras: 2 + needs_wrist_camera: true + needs_stereo_camera: false + needs_session_id: true + action_space: joint_position + final_output: true + final_output_type: image From 857ff5286e4ab8df9f62031edd462452e2378229 Mon Sep 17 00:00:00 2001 From: Meng Date: Mon, 20 Apr 2026 19:01:40 +0000 Subject: [PATCH 09/45] fix: satisfy DreamZero client pre-commit checks Move the upstream DreamZero policy imports in test_openpi_client_ar behind a helper so the file passes E402 without changing behavior, and restore the BasePolicy import while removing the unused cv2 dependency guard in the DROID sim-eval client. 
Signed-off-by: Meng Co-authored-by: Yangshen Deng --- .../dreamzero/droid_sim_eval_client.py | 62 +++++-------------- .../upstream/openpi_test_client_ar.py | 21 +++++-- 2 files changed, 30 insertions(+), 53 deletions(-) diff --git a/examples/online_serving/dreamzero/droid_sim_eval_client.py b/examples/online_serving/dreamzero/droid_sim_eval_client.py index 4518f23eac2..3a4999cc106 100644 --- a/examples/online_serving/dreamzero/droid_sim_eval_client.py +++ b/examples/online_serving/dreamzero/droid_sim_eval_client.py @@ -43,26 +43,17 @@ import time import uuid from dataclasses import dataclass -from datetime import datetime, UTC +from datetime import UTC, datetime from pathlib import Path from typing import Any import numpy as np import torch -try: - import cv2 -except ImportError as exc: # pragma: no cover - runtime dependency guard - raise ImportError( - "DreamZero sim-eval client requires `opencv-python`." - ) from exc - try: import mediapy except ImportError as exc: # pragma: no cover - runtime dependency guard - raise ImportError( - "DreamZero sim-eval client requires `mediapy`." - ) from exc + raise ImportError("DreamZero sim-eval client requires `mediapy`.") from exc try: from typing import override @@ -70,16 +61,12 @@ try: from typing_extensions import override except ImportError as exc: # pragma: no cover - runtime dependency guard - raise ImportError( - "DreamZero sim-eval client requires `typing-extensions` on Python < 3.12." - ) from exc + raise ImportError("DreamZero sim-eval client requires `typing-extensions` on Python < 3.12.") from exc try: import websockets.sync.client except ImportError as exc: # pragma: no cover - runtime dependency guard - raise ImportError( - "DreamZero sim-eval client requires `websockets`." - ) from exc + raise ImportError("DreamZero sim-eval client requires `websockets`.") from exc # NOTE: # This directory already contains a local file named `openpi_client.py`. 
@@ -101,9 +88,7 @@ from openpi_client import image_tools, msgpack_numpy from openpi_client.base_policy import BasePolicy except ImportError as exc: # pragma: no cover - runtime dependency guard - raise ImportError( - "DreamZero sim-eval client requires the optional `openpi-client` package." - ) from exc + raise ImportError("DreamZero sim-eval client requires the optional `openpi-client` package.") from exc finally: if removed_path: sys.path.insert(0, example_dir) @@ -340,15 +325,9 @@ def infer(self, obs: dict[str, Any], instruction: str) -> dict[str, Any]: # - cartesian_position is currently unused by DreamZero DROID, so # a dummy zero vector is sent for protocol completeness request_data = { - "observation/exterior_image_0_left": image_tools.resize_with_pad( - curr_obs["right_image"], 180, 320 - ), - "observation/exterior_image_1_left": image_tools.resize_with_pad( - curr_obs["left_image"], 180, 320 - ), - "observation/wrist_image_left": image_tools.resize_with_pad( - curr_obs["wrist_image"], 180, 320 - ), + "observation/exterior_image_0_left": image_tools.resize_with_pad(curr_obs["right_image"], 180, 320), + "observation/exterior_image_1_left": image_tools.resize_with_pad(curr_obs["left_image"], 180, 320), + "observation/wrist_image_left": image_tools.resize_with_pad(curr_obs["wrist_image"], 180, 320), "observation/joint_position": curr_obs["joint_position"].astype(np.float64), "observation/cartesian_position": np.zeros((6,), dtype=np.float64), "observation/gripper_position": curr_obs["gripper_position"].astype(np.float64), @@ -368,9 +347,7 @@ def infer(self, obs: dict[str, Any], instruction: str) -> dict[str, Any]: if actions.ndim != 2: raise AssertionError(f"Expected 2D action array, got shape {actions.shape}") if actions.shape != (ACTION_HORIZON, ACTION_DIM): - raise AssertionError( - f"Expected action shape {(ACTION_HORIZON, ACTION_DIM)}, got {actions.shape}" - ) + raise AssertionError(f"Expected action shape {(ACTION_HORIZON, ACTION_DIM)}, got 
{actions.shape}") self.pred_action_chunk = actions # Consume exactly one action row from the cached chunk for this @@ -600,24 +577,19 @@ def main() -> None: try: import gymnasium as gym except ImportError as exc: # pragma: no cover - runtime dependency guard - raise ImportError( - "DreamZero sim-eval client requires `gymnasium`." - ) from exc + raise ImportError("DreamZero sim-eval client requires `gymnasium`.") from exc try: import sim_evals.environments # noqa: F401 except ImportError as exc: # pragma: no cover - runtime dependency guard raise ImportError( - "DreamZero sim-eval client requires the external `sim-evals` package " - "or checkout to be importable." + "DreamZero sim-eval client requires the external `sim-evals` package or checkout to be importable." ) from exc try: from isaaclab_tasks.utils import parse_env_cfg except ImportError as exc: # pragma: no cover - runtime dependency guard - raise ImportError( - "DreamZero sim-eval client requires `isaaclab_tasks`." - ) from exc + raise ImportError("DreamZero sim-eval client requires `isaaclab_tasks`.") from exc # Resolve output location and scene prompt. 
output_dir = _make_output_dir(args.output_root.expanduser().resolve(), args.scene) @@ -726,18 +698,14 @@ def main() -> None: step_index=step_index, used_server_call=bool(result["used_server_call"]), chunk_latency_s=( - float(result["chunk_latency_s"]) - if result["chunk_latency_s"] is not None - else None + float(result["chunk_latency_s"]) if result["chunk_latency_s"] is not None else None ), action=[float(x) for x in np.asarray(result["action"], dtype=np.float32).tolist()], joint_position=[ - float(x) - for x in np.asarray(result["joint_position"], dtype=np.float32).tolist() + float(x) for x in np.asarray(result["joint_position"], dtype=np.float32).tolist() ], gripper_position=[ - float(x) - for x in np.asarray(result["gripper_position"], dtype=np.float32).tolist() + float(x) for x in np.asarray(result["gripper_position"], dtype=np.float32).tolist() ], reward=reward_value, terminated=term_value, diff --git a/tests/dreamzero/upstream/openpi_test_client_ar.py b/tests/dreamzero/upstream/openpi_test_client_ar.py index d68af43dd54..8f6de5e487b 100644 --- a/tests/dreamzero/upstream/openpi_test_client_ar.py +++ b/tests/dreamzero/upstream/openpi_test_client_ar.py @@ -45,16 +45,25 @@ import uuid from pathlib import Path -DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() -if DREAMZERO_REPO.exists() and str(DREAMZERO_REPO) not in sys.path: - sys.path.insert(0, str(DREAMZERO_REPO)) - import cv2 -import eval_utils.policy_server as policy_server import numpy as np -from eval_utils.policy_client import WebsocketClientPolicy from openpi_client import msgpack_numpy +DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() + + +def _import_upstream_policy_modules(): + if DREAMZERO_REPO.exists() and str(DREAMZERO_REPO) not in sys.path: + sys.path.insert(0, str(DREAMZERO_REPO)) + + import eval_utils.policy_server as policy_server + from eval_utils.policy_client import WebsocketClientPolicy + + return 
policy_server, WebsocketClientPolicy + + +policy_server, WebsocketClientPolicy = _import_upstream_policy_modules() + VIDEO_DIR = os.environ.get("DREAMZERO_VIDEO_DIR", str(DREAMZERO_REPO / "debug_image")) # roboarena key -> video filename From fc394185326a89f21a54a37e999129473e83c313 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Mon, 20 Apr 2026 19:05:41 +0000 Subject: [PATCH 10/45] style: apply repository-wide pre-commit fixes Run the same pre-commit --all-files pass used by CI and commit the resulting ruff/format adjustments so the DreamZero PR branch is clean under the repo's global hooks. Signed-off-by: Yangshen Deng Co-authored-by: Meng --- .../online_serving/dreamzero/openpi_client.py | 7 ++-- .../upstream/test_openpi_e2e_source_parity.py | 3 +- tests/e2e/online_serving/test_dreamzero.py | 1 + .../openai_api/test_openpi_serving.py | 33 +++++++------------ .../examples/online_serving/test_dreamzero.py | 1 + .../models/dreamzero/transform/__init__.py | 8 ++--- vllm_omni/entrypoints/openai/api_server.py | 4 ++- .../realtime/robot/openpi_connection.py | 1 + .../openai/realtime/robot/openpi_serving.py | 5 +-- 9 files changed, 26 insertions(+), 37 deletions(-) diff --git a/examples/online_serving/dreamzero/openpi_client.py b/examples/online_serving/dreamzero/openpi_client.py index 86422e6597c..a5d9f6a1d78 100755 --- a/examples/online_serving/dreamzero/openpi_client.py +++ b/examples/online_serving/dreamzero/openpi_client.py @@ -44,10 +44,7 @@ DEFAULT_HOST = "127.0.0.1" DEFAULT_PORT = 8000 DEFAULT_PATH = "/v1/realtime/robot/openpi" -DEFAULT_PROMPT = ( - "Move the pan forward and use the brush in the middle of the plates " - "to brush the inside of the pan" -) +DEFAULT_PROMPT = "Move the pan forward and use the brush in the middle of the plates to brush the inside of the pan" ACTION_HORIZON = 24 DEFAULT_ACTION_DIM = 8 RELATIVE_OFFSETS = [-23, -16, -8, 0] @@ -69,7 +66,7 @@ class DreamZeroServerMetadata: action_space: str @classmethod - def from_dict(cls, payload: 
dict[str, Any]) -> "DreamZeroServerMetadata": + def from_dict(cls, payload: dict[str, Any]) -> DreamZeroServerMetadata: required_keys = ( "image_resolution", "n_external_cameras", diff --git a/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py b/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py index 065ff144d00..505c30b365d 100644 --- a/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py +++ b/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py @@ -181,8 +181,7 @@ def _run_vllm_service(port: int, log_path: Path) -> subprocess.Popen[str]: cfg_parallel_size = _cfg_parallel_size() if cfg_parallel_size > len(gpus): raise RuntimeError( - f"cfg_parallel_size={cfg_parallel_size} requires at least {cfg_parallel_size} GPUs, " - f"but only got {gpus}." + f"cfg_parallel_size={cfg_parallel_size} requires at least {cfg_parallel_size} GPUs, but only got {gpus}." ) env["CUDA_VISIBLE_DEVICES"] = ",".join(gpus[:cfg_parallel_size]) env.setdefault("ATTENTION_BACKEND", "torch") diff --git a/tests/e2e/online_serving/test_dreamzero.py b/tests/e2e/online_serving/test_dreamzero.py index 4ed2a4e9b62..a3b46cb7b0b 100644 --- a/tests/e2e/online_serving/test_dreamzero.py +++ b/tests/e2e/online_serving/test_dreamzero.py @@ -56,6 +56,7 @@ def _pick_test_gpus() -> str: gpu_rows.sort() return ",".join(gpu_index for _, gpu_index in gpu_rows[:2]) or "0,1" + test_params = [ OmniServerParams( model=MODEL, diff --git a/tests/entrypoints/openai_api/test_openpi_serving.py b/tests/entrypoints/openai_api/test_openpi_serving.py index 53e9683de89..2b3db13ab9e 100644 --- a/tests/entrypoints/openai_api/test_openpi_serving.py +++ b/tests/entrypoints/openai_api/test_openpi_serving.py @@ -19,9 +19,7 @@ def _engine_with_policy_config(policy_config=None): - od_config = SimpleNamespace( - model_config={"policy_server_config": policy_config or TEST_POLICY_SERVER_CONFIG} - ) + od_config = SimpleNamespace(model_config={"policy_server_config": policy_config or TEST_POLICY_SERVER_CONFIG}) 
return SimpleNamespace(get_diffusion_od_config=lambda: od_config) @@ -54,9 +52,7 @@ def test_policy_server_config_reads_diffusion_model_config(): "n_external_cameras": 1, "custom_model_key": {"nested": True}, } - od_config = SimpleNamespace( - model_config={"policy_server_config": policy_config} - ) + od_config = SimpleNamespace(model_config={"policy_server_config": policy_config}) engine_client = SimpleNamespace(get_diffusion_od_config=lambda: od_config) serving = openpi_serving.ServingRealtimeRobotOpenPI(engine_client=engine_client) @@ -69,9 +65,10 @@ def test_policy_server_config_reads_stage_config_model_config(): engine_client = SimpleNamespace( get_diffusion_od_config=lambda: None, stage_configs=[ - SimpleNamespace(stage_type="diffusion", engine_args=SimpleNamespace(model_config={ - "policy_server_config": policy_config - })) + SimpleNamespace( + stage_type="diffusion", + engine_args=SimpleNamespace(model_config={"policy_server_config": policy_config}), + ) ], ) @@ -86,18 +83,16 @@ def test_policy_server_config_reads_omegaconf_stage_config(): stage_configs=[ SimpleNamespace( stage_type="diffusion", - engine_args=SimpleNamespace(model_config=OmegaConf.create({ - "policy_server_config": {"custom_model_key": "from-omegaconf"} - })), + engine_args=SimpleNamespace( + model_config=OmegaConf.create({"policy_server_config": {"custom_model_key": "from-omegaconf"}}) + ), ) ], ) serving = openpi_serving.ServingRealtimeRobotOpenPI(engine_client=engine_client) - assert serving.policy_server_config.to_dict() == { - "custom_model_key": "from-omegaconf" - } + assert serving.policy_server_config.to_dict() == {"custom_model_key": "from-omegaconf"} def test_policy_server_config_is_required(): @@ -124,9 +119,7 @@ def test_create_policy_server_returns_none_without_policy_config(): def test_policy_server_config_reads_engine_model_config(): policy_config = {"custom_model_key": "custom-value"} - engine_client = SimpleNamespace( - 
model_config=SimpleNamespace(policy_server_config=policy_config) - ) + engine_client = SimpleNamespace(model_config=SimpleNamespace(policy_server_config=policy_config)) serving = openpi_serving.ServingRealtimeRobotOpenPI(engine_client=engine_client) @@ -134,9 +127,7 @@ def test_policy_server_config_reads_engine_model_config(): def test_reset_marks_next_request_for_engine_state_reset(): - serving = openpi_serving.ServingRealtimeRobotOpenPI( - engine_client=_engine_with_policy_config() - ) + serving = openpi_serving.ServingRealtimeRobotOpenPI(engine_client=_engine_with_policy_config()) serving._call_count = 3 serving.reset({}) diff --git a/tests/examples/online_serving/test_dreamzero.py b/tests/examples/online_serving/test_dreamzero.py index f268786e8d7..dfedf1a3535 100644 --- a/tests/examples/online_serving/test_dreamzero.py +++ b/tests/examples/online_serving/test_dreamzero.py @@ -58,6 +58,7 @@ def _pick_test_gpus() -> str: gpu_rows.sort() return ",".join(gpu_index for _, gpu_index in gpu_rows[:2]) or "0,1" + test_params = [ OmniServerParams( model=MODEL, diff --git a/vllm_omni/diffusion/models/dreamzero/transform/__init__.py b/vllm_omni/diffusion/models/dreamzero/transform/__init__.py index b6ce4fa6681..4dec3a72b01 100644 --- a/vllm_omni/diffusion/models/dreamzero/transform/__init__.py +++ b/vllm_omni/diffusion/models/dreamzero/transform/__init__.py @@ -25,11 +25,7 @@ def ensure_transforms_loaded() -> None: importlib.import_module(module_name) except Exception as exc: logger.exception("Failed to import DreamZero transform module %s", module_name) - raise RuntimeError( - f"Failed to import DreamZero transform module '{module_name}'." - ) from exc + raise RuntimeError(f"Failed to import DreamZero transform module '{module_name}'.") from exc if DEFAULT_EMBODIMENT not in TRANSFORMS: - raise RuntimeError( - f"Built-in DreamZero transform '{DEFAULT_EMBODIMENT}' is not registered after import." 
- ) + raise RuntimeError(f"Built-in DreamZero transform '{DEFAULT_EMBODIMENT}' is not registered after import.") diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index ad887098b1d..c86a2718233 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1428,7 +1428,9 @@ async def realtime_robot_openpi(websocket: WebSocket): serving = getattr(websocket.app.state, "openai_serving_realtime_robot", None) if serving is None: await websocket.accept() - await websocket.send_json({"type": "error", "error": "Robot policy not available", "code": "unsupported"}) + await websocket.send_json( + {"type": "error", "error": "Robot policy not available", "code": "unsupported"} + ) await websocket.close() return connection = RobotRealtimeConnection(websocket, serving) diff --git a/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py index 59b6db0d95e..b85033afad7 100644 --- a/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py +++ b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py @@ -25,6 +25,7 @@ logger = init_logger(__name__) _DEFAULT_IDLE_TIMEOUT = 30.0 + def _get_msgpack_numpy() -> Any: try: from openpi_client import msgpack_numpy diff --git a/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py b/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py index 7200afe99be..00ccdcfa616 100644 --- a/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py +++ b/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py @@ -29,6 +29,7 @@ def _to_builtin_container(value: Any) -> Any: return [_to_builtin_container(item) for item in value] return value + @dataclass(frozen=True) class PolicyServerConfig: """OpenPI policy server handshake config. 
@@ -39,7 +40,7 @@ class PolicyServerConfig: values: dict[str, Any] @classmethod - def from_model_config(cls, model_config: Any) -> "PolicyServerConfig": + def from_model_config(cls, model_config: Any) -> PolicyServerConfig: if isinstance(model_config, Mapping): raw_config = model_config.get("policy_server_config") else: @@ -79,7 +80,7 @@ def create_policy_server( cls, engine_client: Any, model_name: str | None = None, - ) -> "ServingRealtimeRobotOpenPI | None": + ) -> ServingRealtimeRobotOpenPI | None: try: return cls(engine_client=engine_client, model_name=model_name) except ValueError as exc: From 905f24b57637ceb983464c961af441a50bc337f4 Mon Sep 17 00:00:00 2001 From: Meng Date: Tue, 12 May 2026 15:26:54 +0000 Subject: [PATCH 11/45] feat: add MolmoSpaces DreamZero evaluation demo and server script Introduce a new evaluation demo script for DreamZero on MolmoSpaces benchmarks, allowing users to run evaluations with specified parameters. Additionally, add a script to facilitate server startup with TP=2 configuration for improved performance. Update README to include instructions for running the evaluation and server setup. 
Signed-off-by: Meng Co-authored-by: Yangshen Deng --- examples/online_serving/dreamzero/README.md | 31 ++++++ .../molmospace_dreamzero_eval_demo.py | 94 +++++++++++++++++++ .../dreamzero/run_server_with_tp2_config.sh | 10 ++ 3 files changed, 135 insertions(+) create mode 100644 examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py create mode 100644 examples/online_serving/dreamzero/run_server_with_tp2_config.sh diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md index ab6213bbc75..37bbdb65c56 100644 --- a/examples/online_serving/dreamzero/README.md +++ b/examples/online_serving/dreamzero/README.md @@ -56,6 +56,12 @@ CUDA_VISIBLE_DEVICES=0,1 \ examples/online_serving/dreamzero/run_server.sh ``` +If you have 2 GPUs with moderate VRAM (less than 80GB), you can use the following command to start the server with TP=2 configuration files: +```bash +CUDA_VISIBLE_DEVICES=0,1 \ +examples/online_serving/dreamzero/run_server_with_tp2_config.sh +``` + If you only want 1 GPU: ```bash @@ -63,6 +69,7 @@ CUDA_VISIBLE_DEVICES=0 \ CFG_PARALLEL_SIZE=1 \ examples/online_serving/dreamzero/run_server.sh ``` +Please note DreamZero requires >=74GB VRAM for single-GPU serving. The websocket endpoint is: @@ -353,3 +360,27 @@ The upstream DreamZero-dependent parity tests are kept under: Those tests require a local upstream DreamZero checkout and are not needed for the standard vLLM example above. + + +# MolmoSpaces DreamZero Evaluation Demo + +This example shows how to use the vllm-host to evaluate DreamZero on molmospaces benchmarks. + +## Files + +- `molmospace_dreamzero_eval_demo.py`: evaluate DreamZero on molmospaces benchmarks + +## Environment requirements + +- Install molmospaces in your python environment by following the instructions in [molmospaces/README.md](https://github.com/allenai/molmospaces/blob/main/README.md) +- Prepare the benchmark/assets directory by following the instructions in molmospaces. 
+ +## Run the evaluation + +From the repository root: + +```bash +python examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py \ + --benchmark_dir /path/to/molmospaces/assets/benchmarks/molmospaces-bench-v2/benchmarks/20260327/ithor/FrankaCloseHardBench/FrankaCloseHardBench_20260206_json_benchmark \ + --output_dir /path/to/eval_output --max_episodes 1 --task_horizon_steps 240 --episode_idx 1 +``` \ No newline at end of file diff --git a/examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py b/examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py new file mode 100644 index 00000000000..cf03d89a475 --- /dev/null +++ b/examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path + +os.environ.setdefault("MUJOCO_GL", "egl") +os.environ.setdefault("PYOPENGL_PLATFORM", "egl") + +_DEMO_HOST = os.environ.get("VLLM_OMNI_DEMO_HOST", "127.0.0.1") +_DEMO_PORT = int(os.environ.get("VLLM_OMNI_DEMO_PORT", "8000")) + +# Import base configs at module top level so the subclasses below are pickle- +# resolvable (worker processes import this module fresh via __main__). +from molmo_spaces.configs.policy_configs_baselines import ( + DreamZeroPolicyConfig, +) +from molmo_spaces.evaluation.configs.evaluation_configs import ( + DreamZeroPolicyEvalConfig, +) + + +# We only need to change the backend host and port to the vllm-host! 
+class DreamZeroVllmOmniPolicyConfig(DreamZeroPolicyConfig): + remote_config: dict = dict(host=_DEMO_HOST, port=_DEMO_PORT) + + +class DreamZeroVllmOmniEvalConfig(DreamZeroPolicyEvalConfig): + policy_config: DreamZeroVllmOmniPolicyConfig = DreamZeroVllmOmniPolicyConfig() + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--host", default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--benchmark_dir", + required=True, + help=( + "Path to benchmark directory, e.g. " + "/path/to/molmospaces/assets/benchmarks/molmospaces-bench-v2/benchmarks/" + "20260327/ithor/FrankaCloseHardBench/FrankaCloseHardBench_20260206_json_benchmark" + ), + ) + parser.add_argument("--max_episodes", type=int, default=1) + parser.add_argument("--task_horizon_steps", type=int, default=80) + parser.add_argument( + "--output_dir", + required=True, + help="Directory to write evaluation outputs (created if missing).", + ) + parser.add_argument("--episode_idx", type=int, default=None) + args = parser.parse_args() + + os.environ["VLLM_OMNI_DEMO_HOST"] = args.host + os.environ["VLLM_OMNI_DEMO_PORT"] = str(args.port) + DreamZeroVllmOmniPolicyConfig.model_fields["remote_config"].default = dict( + host=args.host, port=args.port + ) + + # Import after env vars are set so MuJoCo picks EGL. 
+ from molmo_spaces.evaluation import run_evaluation + + cfg_cls = DreamZeroVllmOmniEvalConfig + + output_dir = args.output_dir + Path(output_dir).mkdir(parents=True, exist_ok=True) + + print(f"[eval] benchmark_dir={args.benchmark_dir}") + print(f"[eval] max_episodes={args.max_episodes} task_horizon_steps={args.task_horizon_steps}") + print(f"[eval] remote policy: ws://{args.host}:{args.port}/v1/realtime/robot/openpi") + + results = run_evaluation( + eval_config_cls=cfg_cls, + benchmark_dir=Path(args.benchmark_dir), + max_episodes=args.max_episodes, + task_horizon_steps=args.task_horizon_steps, + num_workers=1, + use_wandb=False, + output_dir=output_dir, + episode_idx=args.episode_idx, + ) + + print( + f"[eval] success={results.success_count}/{results.total_count} " + f"({results.success_rate:.1%})" + ) + print(f"[eval] output_dir={results.output_dir}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/online_serving/dreamzero/run_server_with_tp2_config.sh b/examples/online_serving/dreamzero/run_server_with_tp2_config.sh new file mode 100644 index 00000000000..8b7701e8d01 --- /dev/null +++ b/examples/online_serving/dreamzero/run_server_with_tp2_config.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euo pipefail + +export PYTORCH_ALLOC_CONF="${PYTORCH_ALLOC_CONF:-expandable_segments:True}" + +CUDA_VISIBLE_DEVICES=0,1 vllm serve GEAR-Dreams/DreamZero-DROID --omni \ + --host 127.0.0.1 --port 8000 \ + --served-model-name dreamzero-droid \ + --enforce-eager --disable-log-stats \ + --stage-configs-path vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg1.yaml \ No newline at end of file From 6233ad8d48484ad4eaa8edcd42f6b005e79d79a6 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sat, 16 May 2026 21:58:48 +0000 Subject: [PATCH 12/45] fix: address DreamZero rebase regressions Set dtype explicitly in the DreamZero legacy stage YAML so omitted --dtype values do not leak through the legacy merge path as None. 
Update DreamZero tests for the async DiffusionEngine.step API and current tests.helpers fixture imports after rebasing onto latest main. Signed-off-by: Yangshen Deng Co-authored-by: Meng --- examples/online_serving/dreamzero/README.md | 2 +- .../dreamzero/molmospace_dreamzero_eval_demo.py | 13 ++++--------- .../dreamzero/run_server_with_tp2_config.sh | 2 +- tests/diffusion/test_diffusion_scheduler.py | 17 +++++++---------- tests/e2e/online_serving/test_dreamzero.py | 4 ++-- tests/examples/online_serving/test_dreamzero.py | 6 +++--- vllm_omni/diffusion/data.py | 3 +-- vllm_omni/diffusion/utils/hf_utils.py | 5 +---- vllm_omni/entrypoints/openai/api_server.py | 4 +--- .../model_executor/stage_configs/dreamzero.yaml | 1 + 10 files changed, 22 insertions(+), 35 deletions(-) diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md index 37bbdb65c56..707f6d986eb 100644 --- a/examples/online_serving/dreamzero/README.md +++ b/examples/online_serving/dreamzero/README.md @@ -383,4 +383,4 @@ From the repository root: python examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py \ --benchmark_dir /path/to/molmospaces/assets/benchmarks/molmospaces-bench-v2/benchmarks/20260327/ithor/FrankaCloseHardBench/FrankaCloseHardBench_20260206_json_benchmark \ --output_dir /path/to/eval_output --max_episodes 1 --task_horizon_steps 240 --episode_idx 1 -``` \ No newline at end of file +``` diff --git a/examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py b/examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py index cf03d89a475..42c80ebb07e 100644 --- a/examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py +++ b/examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py @@ -13,10 +13,10 @@ # Import base configs at module top level so the subclasses below are pickle- # resolvable (worker processes import this module fresh via __main__). 
-from molmo_spaces.configs.policy_configs_baselines import ( +from molmo_spaces.configs.policy_configs_baselines import ( # noqa: E402 DreamZeroPolicyConfig, ) -from molmo_spaces.evaluation.configs.evaluation_configs import ( +from molmo_spaces.evaluation.configs.evaluation_configs import ( # noqa: E402 DreamZeroPolicyEvalConfig, ) @@ -55,9 +55,7 @@ def main() -> int: os.environ["VLLM_OMNI_DEMO_HOST"] = args.host os.environ["VLLM_OMNI_DEMO_PORT"] = str(args.port) - DreamZeroVllmOmniPolicyConfig.model_fields["remote_config"].default = dict( - host=args.host, port=args.port - ) + DreamZeroVllmOmniPolicyConfig.model_fields["remote_config"].default = dict(host=args.host, port=args.port) # Import after env vars are set so MuJoCo picks EGL. from molmo_spaces.evaluation import run_evaluation @@ -82,10 +80,7 @@ def main() -> int: episode_idx=args.episode_idx, ) - print( - f"[eval] success={results.success_count}/{results.total_count} " - f"({results.success_rate:.1%})" - ) + print(f"[eval] success={results.success_count}/{results.total_count} ({results.success_rate:.1%})") print(f"[eval] output_dir={results.output_dir}") return 0 diff --git a/examples/online_serving/dreamzero/run_server_with_tp2_config.sh b/examples/online_serving/dreamzero/run_server_with_tp2_config.sh index 8b7701e8d01..8e33bebcc08 100644 --- a/examples/online_serving/dreamzero/run_server_with_tp2_config.sh +++ b/examples/online_serving/dreamzero/run_server_with_tp2_config.sh @@ -7,4 +7,4 @@ CUDA_VISIBLE_DEVICES=0,1 vllm serve GEAR-Dreams/DreamZero-DROID --omni \ --host 127.0.0.1 --port 8000 \ --served-model-name dreamzero-droid \ --enforce-eager --disable-log-stats \ - --stage-configs-path vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg1.yaml \ No newline at end of file + --stage-configs-path vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg1.yaml diff --git a/tests/diffusion/test_diffusion_scheduler.py b/tests/diffusion/test_diffusion_scheduler.py index d2501d0ab0d..d76211e5994 100644 
--- a/tests/diffusion/test_diffusion_scheduler.py +++ b/tests/diffusion/test_diffusion_scheduler.py @@ -1,11 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import asyncio import queue import threading from types import SimpleNamespace -from unittest.mock import Mock, patch +from unittest.mock import patch import pytest import torch @@ -485,11 +484,7 @@ def test_scheduler_alias_keeps_default_request_scheduler(self) -> None: @pytest.mark.asyncio async def test_step_raises_aborted_error(self, mocker: MockerFixture) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) - engine._closed = False - engine._loop_started = True - engine._init_lock = asyncio.Lock() - engine.main_loop = asyncio.get_running_loop() - engine.stop_event = threading.Event() + engine._check_and_start_background_loop = mocker.AsyncMock() engine.pre_process_func = None engine.async_add_req_and_wait_for_response = mocker.AsyncMock( return_value=DiffusionOutput(aborted=True, abort_message="Request req-abort aborted.") @@ -564,7 +559,8 @@ def test_dummy_run_raises_on_output_error(self, mocker: MockerFixture) -> None: with pytest.raises(RuntimeError, match="Dummy run failed: boom"): engine._dummy_run() - def test_step_multi_request_reuses_multimodal_slice_logic(self) -> None: + @pytest.mark.asyncio + async def test_step_multi_request_reuses_multimodal_slice_logic(self, mocker: MockerFixture) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.od_config = SimpleNamespace( model_class_name="mock_model", @@ -572,7 +568,8 @@ def test_step_multi_request_reuses_multimodal_slice_logic(self) -> None: ) engine.pre_process_func = None engine.post_process_func = None - engine.add_req_and_wait_for_response = Mock( + engine._check_and_start_background_loop = mocker.AsyncMock() + engine.async_add_req_and_wait_for_response = mocker.AsyncMock( return_value=DiffusionOutput( output={ "video": ["frame-0", "frame-1"], @@ -592,7 +589,7 
@@ def test_step_multi_request_reuses_multimodal_slice_logic(self) -> None: ) with patch("vllm_omni.diffusion.diffusion_engine.supports_audio_output", return_value=False): - outputs = engine.step(request) + outputs = await engine.step(request) assert len(outputs) == 2 assert outputs[0].images == ["frame-0"] diff --git a/tests/e2e/online_serving/test_dreamzero.py b/tests/e2e/online_serving/test_dreamzero.py index a3b46cb7b0b..9625216c7f1 100644 --- a/tests/e2e/online_serving/test_dreamzero.py +++ b/tests/e2e/online_serving/test_dreamzero.py @@ -14,8 +14,8 @@ import pytest -from tests.conftest import OmniServerParams -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniServerParams os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" diff --git a/tests/examples/online_serving/test_dreamzero.py b/tests/examples/online_serving/test_dreamzero.py index dfedf1a3535..b586fc88d05 100644 --- a/tests/examples/online_serving/test_dreamzero.py +++ b/tests/examples/online_serving/test_dreamzero.py @@ -16,9 +16,9 @@ import pytest -from tests.conftest import OmniServerParams -from tests.examples.conftest import run_cmd -from tests.utils import hardware_test +from tests.examples.helpers import run_cmd +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniServerParams os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py index c964ca13479..ccb7f75b615 100644 --- a/vllm_omni/diffusion/data.py +++ b/vllm_omni/diffusion/data.py @@ -915,8 +915,7 @@ def enrich_config(self) -> None: == "groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHead" and diffusion_model_cfg.get("_target_") == ( - "groot.vla.model.dreamzero.modules." 
- "wan_video_dit_action_casual_chunk.CausalWanModel" + "groot.vla.model.dreamzero.modules.wan_video_dit_action_casual_chunk.CausalWanModel" ) ) if looks_like_dreamzero or self.model_class_name == "DreamZeroPipeline": diff --git a/vllm_omni/diffusion/utils/hf_utils.py b/vllm_omni/diffusion/utils/hf_utils.py index 0f765f50288..dad02583f08 100644 --- a/vllm_omni/diffusion/utils/hf_utils.py +++ b/vllm_omni/diffusion/utils/hf_utils.py @@ -47,10 +47,7 @@ def _looks_like_dreamzero(model_name: str) -> bool: action_head_cfg.get("_target_") == "groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHead" and diffusion_model_cfg.get("_target_") - == ( - "groot.vla.model.dreamzero.modules." - "wan_video_dit_action_casual_chunk.CausalWanModel" - ) + == ("groot.vla.model.dreamzero.modules.wan_video_dit_action_casual_chunk.CausalWanModel") ) except Exception: return False diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index c86a2718233..ad887098b1d 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1428,9 +1428,7 @@ async def realtime_robot_openpi(websocket: WebSocket): serving = getattr(websocket.app.state, "openai_serving_realtime_robot", None) if serving is None: await websocket.accept() - await websocket.send_json( - {"type": "error", "error": "Robot policy not available", "code": "unsupported"} - ) + await websocket.send_json({"type": "error", "error": "Robot policy not available", "code": "unsupported"}) await websocket.close() return connection = RobotRealtimeConnection(websocket, serving) diff --git a/vllm_omni/model_executor/stage_configs/dreamzero.yaml b/vllm_omni/model_executor/stage_configs/dreamzero.yaml index cd5cb9e9c54..a2a8559ac5c 100644 --- a/vllm_omni/model_executor/stage_configs/dreamzero.yaml +++ b/vllm_omni/model_executor/stage_configs/dreamzero.yaml @@ -4,6 +4,7 @@ stage_args: runtime: devices: "0" engine_args: + dtype: 
bfloat16 model_stage: diffusion model_class_name: DreamZeroPipeline distributed_executor_backend: "mp" From b7fb2b485fabcf07f8ab12131f65607615e31e90 Mon Sep 17 00:00:00 2001 From: Meng Date: Sun, 17 May 2026 00:22:09 +0000 Subject: [PATCH 13/45] docs: move DreamZero usage docs into examples Remove the duplicated docs/models/dreamzero pages so model-specific usage lives under examples/online_serving/dreamzero. Signed-off-by: Meng Co-authored-by: Yangshen Deng --- docs/models/dreamzero/README.md | 4 - docs/models/dreamzero/quick_start.md | 198 --------------------------- 2 files changed, 202 deletions(-) delete mode 100644 docs/models/dreamzero/README.md delete mode 100644 docs/models/dreamzero/quick_start.md diff --git a/docs/models/dreamzero/README.md b/docs/models/dreamzero/README.md deleted file mode 100644 index 92d0b2208ec..00000000000 --- a/docs/models/dreamzero/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# DreamZero - -- `docs/models/dreamzero/quick_start.md`: quick start, server/client launch commands, prediction-video export, standard e2e/example entry points, and optional upstream parity checks -- `examples/online_serving/dreamzero/README.md`: self-contained OpenPI server/client example with bundled real videos, video-export helpers, DROID sim-eval notes, and per-script dependency requirements diff --git a/docs/models/dreamzero/quick_start.md b/docs/models/dreamzero/quick_start.md deleted file mode 100644 index b450172f02a..00000000000 --- a/docs/models/dreamzero/quick_start.md +++ /dev/null @@ -1,198 +0,0 @@ -# DreamZero Quick Start - -This document is the shortest path to launching the DreamZero service and connecting the compatible client. - -The commands below assume you run them from the repository root. - -For the self-contained example, use the bundled client and videos under -`examples/online_serving/dreamzero/`. - -Upstream DreamZero-dependent parity checks are optional and live under -`tests/dreamzero/upstream/`. 
- -## Environment checklist - -- Sections 1-5: use the local `vllm-omni` environment. -- Bundled OpenPI client extra deps: `openpi-client`, `websockets`, `opencv-python` -- Prediction-video export helpers: local `vllm-omni` environment plus - `opencv-python` and `pillow` -- DROID sim-eval client: use an external Isaac Lab / `sim-evals` environment, - plus `openpi-client`, `websockets`, `opencv-python`, and `mediapy` -- Optional upstream parity tests: also require `DREAMZERO_REPO` with a - checkpoint under `DREAMZERO_REPO/checkpoints/dreamzero` - ---- - -## 1. Start the vLLM DreamZero server - -Default example: official HF model + `CF_P=2`. - -```bash -ATTENTION_BACKEND=torch \ -DIFFUSION_ATTENTION_BACKEND=TORCH_SDPA \ -CUDA_VISIBLE_DEVICES=0,1 \ -MASTER_PORT=29628 \ -vllm serve \ - GEAR-Dreams/DreamZero-DROID \ - --omni \ - --host 127.0.0.1 \ - --port 8000 \ - --served-model-name dreamzero-droid \ - --cfg-parallel-size 2 \ - --enforce-eager -``` - -If you only have 1 GPU: - -- change `CUDA_VISIBLE_DEVICES=0,1` to `CUDA_VISIBLE_DEVICES=0` -- remove `--cfg-parallel-size 2` - -OpenPI WebSocket endpoint: - -- `ws://127.0.0.1:8000/v1/realtime/robot/openpi` - ---- - -## 2. Connect the client to the vLLM server - -Use the self-contained DreamZero example client: - -- `examples/online_serving/dreamzero/openpi_client.py` - -When connecting to vLLM, the default websocket path already targets OpenPI: - -```bash -python examples/online_serving/dreamzero/openpi_client.py \ - --host 127.0.0.1 \ - --port 8000 -``` - ---- - -## 3. Export prediction videos - -The OpenPI client receives only actions. For visual debugging, use the offline -example helper to collect DreamZero `video_pred` latents from vLLM and decode -them to MP4. 
- -Single `TP=1, CF_P=1` export: - -```bash -python examples/online_serving/dreamzero/export_prediction_video.py \ - --model GEAR-Dreams/DreamZero-DROID \ - --stage-configs-path vllm_omni/model_executor/stage_configs/dreamzero.yaml \ - --output-dir examples/online_serving/dreamzero/generated_predictions/comparison_videos \ - --output-stem tp1_cfg1_vllm_example -``` - -Generate the comparison set: - -```bash -python examples/online_serving/dreamzero/generate_comparison_videos.py \ - --skip-existing \ - --continue-on-error -``` - -Outputs are written to: - -- `examples/online_serving/dreamzero/generated_predictions/comparison_videos/` - -Useful files: - -- `dreamzero_input_reference.mp4`: stitched real camera input -- `tp1_cfg1_vllm_example.mp4` -- `tp1_cfg2_vllm_example.mp4` -- `tp2_cfg1_vllm_example.mp4` -- `tp2_cfg2_vllm_example.mp4` -- `dreamzero_upstream_reference.mp4` when an upstream reference video is supplied with `--upstream-video` -- `manifest.json`: successful videos and any failed variants - -`tp2_cfg2` requires four free GPUs (`TP=2`, `CF_P=2`). If GPU capacity is not -available, the helper can still keep the successful variants and record the -failure in `manifest.json`. - ---- - -## 4. Standard online e2e test - -The standard self-contained online serving e2e test is: - -```bash -PYTHONPATH=. .venv/bin/python -m pytest tests/e2e/online_serving/test_dreamzero.py -q -``` - -This test starts a real DreamZero server, sends bundled real camera videos, and -checks metadata, action output shape, finite values, and reset behavior. - ---- - -## 5. Shared example test - -The example test executes the same client script from `examples/`: - -```bash -PYTHONPATH=. .venv/bin/python -m pytest tests/examples/online_serving/test_dreamzero.py -q -``` - ---- - -## 6. 
Optional upstream parity baseline - -The currently validated strict-parity baseline is: - -- upstream DreamZero in eager mode -- no `torch.compile` -- no DiT cache / skip schedule -- `TP=1` -- `CF_P=1` or `CF_P=2` - -Current status: - -- `TP=1, CF_P=1`: strict parity -- `TP=1, CF_P=2`: strict parity -- `TP=2, CF_P=1/2`: runs, but strict numerical parity is not guaranteed - ---- - -## 7. Recommended first run - -If you want the least surprising setup, start with: - -- `GEAR-Dreams/DreamZero-DROID` -- `--enforce-eager` -- `TP=1` -- `CF_P=1` - -Then move to `CF_P=2` if you want CFG parallel. - ---- - -## 8. Formal upstream end-to-end parity test - -The formal server-vs-server parity test is: - -```bash -PYTHONPATH=. .venv/bin/python -m pytest tests/dreamzero/upstream/test_openpi_e2e_source_parity.py -q -``` - -Run the same parity test on GPUs `0,1` with `CF_P=2`: - -```bash -OPENPI_E2E_GPUS=0,1 \ -OPENPI_E2E_CFG_PARALLEL_SIZE=2 \ -PYTHONPATH=. .venv/bin/python -m pytest tests/dreamzero/upstream/test_openpi_e2e_source_parity.py -q -``` - -This test checks: - -- upstream DreamZero server -- vLLM DreamZero server -- the same DreamZero-compatible client logic -- strict action-output parity under the non-TP, non-compile baseline - ---- - -## 9. Related docs - -- `docs/models/dreamzero/README.md`: DreamZero documentation index -- `examples/online_serving/dreamzero/README.md`: bundled OpenPI example From ba63e6b8347c4aae0cba625fe7ad233cdf9c077d Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 17 May 2026 00:31:37 +0000 Subject: [PATCH 14/45] examples: use generic DreamZero output directory Move DreamZero video export helper outputs under the existing outputs/ ignore tree instead of relying on a model-specific .gitignore entry. 
Signed-off-by: Yangshen Deng Co-authored-by: Meng --- .gitignore | 1 - examples/online_serving/dreamzero/README.md | 7 +++---- .../online_serving/dreamzero/export_prediction_video.py | 3 ++- .../online_serving/dreamzero/generate_comparison_videos.py | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 11357256016..378de441c7e 100644 --- a/.gitignore +++ b/.gitignore @@ -249,7 +249,6 @@ outputs/ results/ generated/ output_*/ -/examples/online_serving/dreamzero/generated_predictions/ # Configuration overrides configs/local.yaml diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md index 707f6d986eb..53793b4782b 100644 --- a/examples/online_serving/dreamzero/README.md +++ b/examples/online_serving/dreamzero/README.md @@ -11,7 +11,6 @@ compatible OpenPI websocket client using bundled real camera videos. - `generate_comparison_videos.py`: batch helper for TP/CFG comparison videos - `droid_sim_eval_client.py`: DROID `sim-evals` rollout client for the vLLM OpenPI server - `assets/`: minimal real camera videos used by the example -- `generated_predictions/`: ignored local debug/video outputs; do not upload or rely on this directory for serving ## Environment requirements @@ -117,7 +116,7 @@ This script: 3. runs vLLM locally through `Omni` 4. collects `video_pred` latents from `OmniRequestOutput.images` 5. decodes them on the DreamZero worker through `DreamZeroVideoExportWorkerExtension` -6. writes an MP4 under `generated_predictions/` +6. 
writes an MP4 under `outputs/dreamzero/generated_predictions/` Single-config export: @@ -125,7 +124,7 @@ Single-config export: python examples/online_serving/dreamzero/export_prediction_video.py \ --model GEAR-Dreams/DreamZero-DROID \ --stage-configs-path vllm_omni/model_executor/stage_configs/dreamzero.yaml \ - --output-dir examples/online_serving/dreamzero/generated_predictions/comparison_videos \ + --output-dir outputs/dreamzero/comparison_videos \ --output-stem tp1_cfg1_vllm_example ``` @@ -161,7 +160,7 @@ Notes: Current cleaned comparison outputs are under: -- `examples/online_serving/dreamzero/generated_predictions/comparison_videos/` +- `outputs/dreamzero/comparison_videos/` The useful files are: diff --git a/examples/online_serving/dreamzero/export_prediction_video.py b/examples/online_serving/dreamzero/export_prediction_video.py index 1a57e6eb0f8..b5dafb369c7 100644 --- a/examples/online_serving/dreamzero/export_prediction_video.py +++ b/examples/online_serving/dreamzero/export_prediction_video.py @@ -20,7 +20,8 @@ WORKER_EXTENSION = "vllm_omni.diffusion.models.dreamzero.video_export_worker.DreamZeroVideoExportWorkerExtension" DEFAULT_MODEL = "GEAR-Dreams/DreamZero-DROID" DEFAULT_PROMPT = "Move the pan forward and use the brush in the middle of the plates to brush the inside of the pan" -DEFAULT_OUTPUT_DIR = Path(__file__).resolve().parent / "generated_predictions" +REPO_ROOT = Path(__file__).resolve().parents[3] +DEFAULT_OUTPUT_DIR = REPO_ROOT / "outputs" / "dreamzero" / "generated_predictions" DEFAULT_OUTPUT_STEM = "dreamzero_prediction" DEFAULT_SESSION_PREFIX = "dreamzero-export" ASSETS_DIR = Path(__file__).resolve().parent / "assets" diff --git a/examples/online_serving/dreamzero/generate_comparison_videos.py b/examples/online_serving/dreamzero/generate_comparison_videos.py index e604874c71c..55726c81430 100644 --- a/examples/online_serving/dreamzero/generate_comparison_videos.py +++ b/examples/online_serving/dreamzero/generate_comparison_videos.py @@ 
-17,7 +17,7 @@ REPO_ROOT = Path(__file__).resolve().parents[3] EXAMPLE_DIR = Path(__file__).resolve().parent EXPORT_SCRIPT = EXAMPLE_DIR / "export_prediction_video.py" -DEFAULT_OUTPUT_DIR = EXAMPLE_DIR / "generated_predictions" / "comparison_videos" +DEFAULT_OUTPUT_DIR = REPO_ROOT / "outputs" / "dreamzero" / "comparison_videos" DEFAULT_MODEL = "GEAR-Dreams/DreamZero-DROID" ASSETS_DIR = EXAMPLE_DIR / "assets" From f977bd95f1777e66a1d53567f84f95c79ce5ac6c Mon Sep 17 00:00:00 2001 From: Meng Date: Sun, 17 May 2026 00:49:54 +0000 Subject: [PATCH 15/45] config: migrate DreamZero to deploy pipeline config Register DreamZero as an explicit single-stage diffusion PipelineConfig so generic HF VLA checkpoints route through the new registry/deploy path instead of legacy stage_configs. Move DreamZero TP/CFG variants to vllm_omni/deploy and update examples/tests to pass --deploy-config. Signed-off-by: Meng Co-authored-by: Yangshen Deng --- examples/online_serving/dreamzero/README.md | 4 +-- .../dreamzero/export_prediction_video.py | 8 +++--- .../dreamzero/generate_comparison_videos.py | 20 +++++++------- .../online_serving/dreamzero/run_server.sh | 7 ++--- .../dreamzero/run_server_with_tp2_config.sh | 2 +- tests/e2e/online_serving/test_dreamzero.py | 4 +-- .../test_resolve_dreamzero_config.py | 26 +++++-------------- .../examples/online_serving/test_dreamzero.py | 4 +-- vllm_omni/config/pipeline_registry.py | 13 ++++++---- vllm_omni/config/stage_config.py | 5 ++++ vllm_omni/deploy/dreamzero.yaml | 25 ++++++++++++++++++ vllm_omni/deploy/dreamzero_tp1_cfg2.yaml | 25 ++++++++++++++++++ vllm_omni/deploy/dreamzero_tp2_cfg1.yaml | 25 ++++++++++++++++++ vllm_omni/deploy/dreamzero_tp2_cfg2.yaml | 25 ++++++++++++++++++ .../models/dreamzero/__init__.py | 6 +++++ .../models/dreamzero/pipeline.py | 25 ++++++++++++++++++ .../stage_configs/dreamzero.yaml | 22 ---------------- .../stage_configs/dreamzero_tp1_cfg2.yaml | 24 ----------------- .../stage_configs/dreamzero_tp2_cfg1.yaml | 24 
----------------- .../stage_configs/dreamzero_tp2_cfg2.yaml | 24 ----------------- 20 files changed, 174 insertions(+), 144 deletions(-) create mode 100644 vllm_omni/deploy/dreamzero.yaml create mode 100644 vllm_omni/deploy/dreamzero_tp1_cfg2.yaml create mode 100644 vllm_omni/deploy/dreamzero_tp2_cfg1.yaml create mode 100644 vllm_omni/deploy/dreamzero_tp2_cfg2.yaml create mode 100644 vllm_omni/model_executor/models/dreamzero/__init__.py create mode 100644 vllm_omni/model_executor/models/dreamzero/pipeline.py delete mode 100644 vllm_omni/model_executor/stage_configs/dreamzero.yaml delete mode 100644 vllm_omni/model_executor/stage_configs/dreamzero_tp1_cfg2.yaml delete mode 100644 vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg1.yaml delete mode 100644 vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg2.yaml diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md index 53793b4782b..e324f8cf2ed 100644 --- a/examples/online_serving/dreamzero/README.md +++ b/examples/online_serving/dreamzero/README.md @@ -65,7 +65,7 @@ If you only want 1 GPU: ```bash CUDA_VISIBLE_DEVICES=0 \ -CFG_PARALLEL_SIZE=1 \ +DEPLOY_CONFIG=vllm_omni/deploy/dreamzero.yaml \ examples/online_serving/dreamzero/run_server.sh ``` Please note DreamZero requires >=74GB VRAM for single-GPU serving. 
@@ -123,7 +123,7 @@ Single-config export: ```bash python examples/online_serving/dreamzero/export_prediction_video.py \ --model GEAR-Dreams/DreamZero-DROID \ - --stage-configs-path vllm_omni/model_executor/stage_configs/dreamzero.yaml \ + --deploy-config vllm_omni/deploy/dreamzero.yaml \ --output-dir outputs/dreamzero/comparison_videos \ --output-stem tp1_cfg1_vllm_example ``` diff --git a/examples/online_serving/dreamzero/export_prediction_video.py b/examples/online_serving/dreamzero/export_prediction_video.py index b5dafb369c7..b35937298e6 100644 --- a/examples/online_serving/dreamzero/export_prediction_video.py +++ b/examples/online_serving/dreamzero/export_prediction_video.py @@ -37,7 +37,7 @@ def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Export DreamZero prediction video from bundled example inputs.") parser.add_argument("--model", default=DEFAULT_MODEL) - parser.add_argument("--stage-configs-path", type=Path, required=True) + parser.add_argument("--deploy-config", type=Path, required=True) parser.add_argument("--video-dir", type=Path, default=ASSETS_DIR) parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR) parser.add_argument("--output-stem", default=DEFAULT_OUTPUT_STEM) @@ -194,11 +194,11 @@ def _stitch_input_frames(camera_frames: dict[str, np.ndarray]) -> np.ndarray: def _run_generation( - model: str, stage_configs_path: Path, observations: list[dict] + model: str, deploy_config_path: Path, observations: list[dict] ) -> tuple[Omni, list[OmniRequestOutput]]: omni = Omni( model=model, - stage_configs_path=str(stage_configs_path), + deploy_config=str(deploy_config_path), enforce_eager=True, worker_extension_cls=WORKER_EXTENSION, ) @@ -260,7 +260,7 @@ def main() -> None: try: omni, outputs = _run_generation( model=args.model, - stage_configs_path=args.stage_configs_path, + deploy_config_path=args.deploy_config, observations=observations, ) latent_steps = [_extract_latents(output) for output in 
outputs] diff --git a/examples/online_serving/dreamzero/generate_comparison_videos.py b/examples/online_serving/dreamzero/generate_comparison_videos.py index 55726c81430..cd06669253e 100644 --- a/examples/online_serving/dreamzero/generate_comparison_videos.py +++ b/examples/online_serving/dreamzero/generate_comparison_videos.py @@ -27,11 +27,11 @@ "observation/wrist_image_left": "wrist_image_left.mp4", } -STAGE_CONFIGS = { - "tp1_cfg1": REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "dreamzero.yaml", - "tp1_cfg2": REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "dreamzero_tp1_cfg2.yaml", - "tp2_cfg1": REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "dreamzero_tp2_cfg1.yaml", - "tp2_cfg2": REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "dreamzero_tp2_cfg2.yaml", +DEPLOY_CONFIGS = { + "tp1_cfg1": REPO_ROOT / "vllm_omni" / "deploy" / "dreamzero.yaml", + "tp1_cfg2": REPO_ROOT / "vllm_omni" / "deploy" / "dreamzero_tp1_cfg2.yaml", + "tp2_cfg1": REPO_ROOT / "vllm_omni" / "deploy" / "dreamzero_tp2_cfg1.yaml", + "tp2_cfg2": REPO_ROOT / "vllm_omni" / "deploy" / "dreamzero_tp2_cfg2.yaml", } @@ -98,7 +98,7 @@ def _write_input_reference(output_dir: Path) -> Path: return output_path -def _run_export(args: argparse.Namespace, config_name: str, stage_config_path: Path) -> Path: +def _run_export(args: argparse.Namespace, config_name: str, deploy_config_path: Path) -> Path: output_stem = f"{config_name}_vllm_example" output_path = args.output_dir / f"{output_stem}.mp4" if args.skip_existing and output_path.exists(): @@ -109,8 +109,8 @@ def _run_export(args: argparse.Namespace, config_name: str, stage_config_path: P str(EXPORT_SCRIPT), "--model", args.model, - "--stage-configs-path", - str(stage_config_path), + "--deploy-config", + str(deploy_config_path), "--output-dir", str(args.output_dir), "--output-stem", @@ -144,9 +144,9 @@ def main() -> None: failures: dict[str, str] = {} manifest["input_reference"] = 
_display_path(_write_input_reference(args.output_dir)) - for config_name, stage_config_path in STAGE_CONFIGS.items(): + for config_name, deploy_config_path in DEPLOY_CONFIGS.items(): try: - manifest[config_name] = _display_path(_run_export(args, config_name, stage_config_path)) + manifest[config_name] = _display_path(_run_export(args, config_name, deploy_config_path)) except subprocess.CalledProcessError as exc: if not args.continue_on_error: raise diff --git a/examples/online_serving/dreamzero/run_server.sh b/examples/online_serving/dreamzero/run_server.sh index 49785463db5..c28e0aee5da 100755 --- a/examples/online_serving/dreamzero/run_server.sh +++ b/examples/online_serving/dreamzero/run_server.sh @@ -4,7 +4,7 @@ set -euo pipefail MODEL="${MODEL:-GEAR-Dreams/DreamZero-DROID}" HOST="${HOST:-127.0.0.1}" PORT="${PORT:-8000}" -CFG_PARALLEL_SIZE="${CFG_PARALLEL_SIZE:-2}" +DEPLOY_CONFIG="${DEPLOY_CONFIG:-vllm_omni/deploy/dreamzero_tp1_cfg2.yaml}" SERVED_MODEL_NAME="${SERVED_MODEL_NAME:-dreamzero-droid}" args=( @@ -14,14 +14,11 @@ args=( --host "$HOST" --port "$PORT" --served-model-name "$SERVED_MODEL_NAME" + --deploy-config "$DEPLOY_CONFIG" --enforce-eager --disable-log-stats ) -if [[ -n "$CFG_PARALLEL_SIZE" ]]; then - args+=(--cfg-parallel-size "$CFG_PARALLEL_SIZE") -fi - ATTENTION_BACKEND="${ATTENTION_BACKEND:-torch}" \ DIFFUSION_ATTENTION_BACKEND="${DIFFUSION_ATTENTION_BACKEND:-TORCH_SDPA}" \ vllm "${args[@]}" diff --git a/examples/online_serving/dreamzero/run_server_with_tp2_config.sh b/examples/online_serving/dreamzero/run_server_with_tp2_config.sh index 8e33bebcc08..2510192b775 100644 --- a/examples/online_serving/dreamzero/run_server_with_tp2_config.sh +++ b/examples/online_serving/dreamzero/run_server_with_tp2_config.sh @@ -7,4 +7,4 @@ CUDA_VISIBLE_DEVICES=0,1 vllm serve GEAR-Dreams/DreamZero-DROID --omni \ --host 127.0.0.1 --port 8000 \ --served-model-name dreamzero-droid \ --enforce-eager --disable-log-stats \ - --stage-configs-path 
vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg1.yaml + --deploy-config vllm_omni/deploy/dreamzero_tp2_cfg1.yaml diff --git a/tests/e2e/online_serving/test_dreamzero.py b/tests/e2e/online_serving/test_dreamzero.py index 9625216c7f1..0084af15216 100644 --- a/tests/e2e/online_serving/test_dreamzero.py +++ b/tests/e2e/online_serving/test_dreamzero.py @@ -62,8 +62,8 @@ def _pick_test_gpus() -> str: model=MODEL, port=8091, server_args=[ - "--cfg-parallel-size", - "2", + "--deploy-config", + "vllm_omni/deploy/dreamzero_tp1_cfg2.yaml", "--enforce-eager", "--disable-log-stats", ], diff --git a/tests/entrypoints/test_resolve_dreamzero_config.py b/tests/entrypoints/test_resolve_dreamzero_config.py index dd1185b33b4..52581d27236 100644 --- a/tests/entrypoints/test_resolve_dreamzero_config.py +++ b/tests/entrypoints/test_resolve_dreamzero_config.py @@ -1,5 +1,3 @@ -import os - import pytest from vllm_omni.entrypoints.utils import load_stage_configs_from_model, resolve_model_config_path @@ -16,30 +14,20 @@ def test_dreamzero_vla_resolves_to_dreamzero_config(monkeypatch): "vllm_omni.entrypoints.utils._looks_like_dreamzero", lambda _model: True, ) - monkeypatch.setattr( - "vllm_omni.entrypoints.utils.current_omni_platform.get_default_stage_config_path", - lambda: "vllm_omni/model_executor/stage_configs", - ) - - original_exists = os.path.exists - - def mock_exists(path): - if "dreamzero.yaml" in str(path): - return True - return original_exists(path) - - monkeypatch.setattr(os.path, "exists", mock_exists) - result = resolve_model_config_path("GEAR-Dreams/DreamZero-DROID") assert result is not None - assert "dreamzero.yaml" in result + assert result.endswith("vllm_omni/deploy/dreamzero.yaml") def test_dreamzero_config_sets_model_class_and_policy_config(monkeypatch): monkeypatch.setattr( - "vllm_omni.entrypoints.utils.resolve_model_config_path", - lambda _model: "vllm_omni/model_executor/stage_configs/dreamzero.yaml", + 
"vllm_omni.config.stage_config.StageConfigFactory._auto_detect_model_type", + classmethod(lambda _cls, _model, trust_remote_code=True: ("vla", None)), + ) + monkeypatch.setattr( + "vllm_omni.diffusion.utils.hf_utils._looks_like_dreamzero", + lambda _model: True, ) stage_configs = load_stage_configs_from_model("GEAR-Dreams/DreamZero-DROID") diff --git a/tests/examples/online_serving/test_dreamzero.py b/tests/examples/online_serving/test_dreamzero.py index b586fc88d05..6f305d3eea7 100644 --- a/tests/examples/online_serving/test_dreamzero.py +++ b/tests/examples/online_serving/test_dreamzero.py @@ -64,8 +64,8 @@ def _pick_test_gpus() -> str: model=MODEL, port=8092, server_args=[ - "--cfg-parallel-size", - "2", + "--deploy-config", + "vllm_omni/deploy/dreamzero_tp1_cfg2.yaml", "--enforce-eager", "--disable-log-stats", ], diff --git a/vllm_omni/config/pipeline_registry.py b/vllm_omni/config/pipeline_registry.py index 555f35e173a..c88dcfd5ddd 100644 --- a/vllm_omni/config/pipeline_registry.py +++ b/vllm_omni/config/pipeline_registry.py @@ -17,11 +17,10 @@ ``vllm_omni/.../pipeline.py``. 2. Add one line to ``_OMNI_PIPELINES`` below. -Single-stage diffusion models continue to use the -``_create_default_diffusion_stage_cfg`` fallback in -``async_omni_engine.py`` — they don't need a registry entry. The empty -``_DIFFUSION_PIPELINES`` placeholder previously here (#2915) was removed -once #2987 (which would have populated it) was deferred. +Plain single-stage diffusion models continue to use the +``_create_default_diffusion_stage_cfg`` fallback in ``async_omni_engine.py``. +The empty ``_DIFFUSION_PIPELINES`` placeholder previously here (#2915) was +removed once #2987 (which would have populated it) was deferred. 
``register_pipeline(config)`` in ``stage_config`` is still supported for out-of-tree plugins and tests that create pipelines at runtime; those override @@ -65,6 +64,10 @@ "vllm_omni.model_executor.models.bagel.pipeline", "BAGEL_SINGLE_STAGE_PIPELINE", ), + "dreamzero": ( + "vllm_omni.model_executor.models.dreamzero.pipeline", + "DREAMZERO_PIPELINE", + ), "glm_image": ( "vllm_omni.model_executor.models.glm_image.pipeline", "GLM_IMAGE_PIPELINE", diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index c459ecabe73..aa142a9ffed 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -1093,6 +1093,11 @@ def create_from_model( # --- New path: check pipeline registry by model_type first --- model_type, hf_config = cls._auto_detect_model_type(model, trust_remote_code=trust_remote_code) + if model_type == "vla": + from vllm_omni.diffusion.utils.hf_utils import _looks_like_dreamzero + + if _looks_like_dreamzero(model): + model_type = "dreamzero" if model_type and model_type in _PIPELINE_REGISTRY: return cls._create_from_registry(model_type, cli_overrides, deploy_config_path) diff --git a/vllm_omni/deploy/dreamzero.yaml b/vllm_omni/deploy/dreamzero.yaml new file mode 100644 index 00000000000..e77afad8409 --- /dev/null +++ b/vllm_omni/deploy/dreamzero.yaml @@ -0,0 +1,25 @@ +# DreamZero-DROID deploy: single diffusion stage. +# +# Topology is declared in vllm_omni/model_executor/models/dreamzero/pipeline.py. +# This default uses one GPU with TP=1 and CFG parallel disabled. 
+ +pipeline: dreamzero +async_chunk: false +distributed_executor_backend: mp +dtype: bfloat16 + +stages: + - stage_id: 0 + devices: "0" + max_num_seqs: 1 + enforce_eager: true + model_class_name: DreamZeroPipeline + model_config: + default_robot_embodiment: roboarena + policy_server_config: + image_resolution: [180, 320] + n_external_cameras: 2 + needs_wrist_camera: true + needs_stereo_camera: false + needs_session_id: true + action_space: joint_position diff --git a/vllm_omni/deploy/dreamzero_tp1_cfg2.yaml b/vllm_omni/deploy/dreamzero_tp1_cfg2.yaml new file mode 100644 index 00000000000..ed9988c92a3 --- /dev/null +++ b/vllm_omni/deploy/dreamzero_tp1_cfg2.yaml @@ -0,0 +1,25 @@ +# DreamZero-DROID deploy: TP=1, CFG parallel size=2. + +pipeline: dreamzero +async_chunk: false +distributed_executor_backend: mp +dtype: bfloat16 + +stages: + - stage_id: 0 + devices: "0,1" + max_num_seqs: 1 + enforce_eager: true + model_class_name: DreamZeroPipeline + parallel_config: + tensor_parallel_size: 1 + cfg_parallel_size: 2 + model_config: + default_robot_embodiment: roboarena + policy_server_config: + image_resolution: [180, 320] + n_external_cameras: 2 + needs_wrist_camera: true + needs_stereo_camera: false + needs_session_id: true + action_space: joint_position diff --git a/vllm_omni/deploy/dreamzero_tp2_cfg1.yaml b/vllm_omni/deploy/dreamzero_tp2_cfg1.yaml new file mode 100644 index 00000000000..16f0aa7ef21 --- /dev/null +++ b/vllm_omni/deploy/dreamzero_tp2_cfg1.yaml @@ -0,0 +1,25 @@ +# DreamZero-DROID deploy: TP=2, CFG parallel disabled. 
+ +pipeline: dreamzero +async_chunk: false +distributed_executor_backend: mp +dtype: bfloat16 + +stages: + - stage_id: 0 + devices: "0,1" + max_num_seqs: 1 + enforce_eager: true + model_class_name: DreamZeroPipeline + parallel_config: + tensor_parallel_size: 2 + cfg_parallel_size: 1 + model_config: + default_robot_embodiment: roboarena + policy_server_config: + image_resolution: [180, 320] + n_external_cameras: 2 + needs_wrist_camera: true + needs_stereo_camera: false + needs_session_id: true + action_space: joint_position diff --git a/vllm_omni/deploy/dreamzero_tp2_cfg2.yaml b/vllm_omni/deploy/dreamzero_tp2_cfg2.yaml new file mode 100644 index 00000000000..76e57a8ca49 --- /dev/null +++ b/vllm_omni/deploy/dreamzero_tp2_cfg2.yaml @@ -0,0 +1,25 @@ +# DreamZero-DROID deploy: TP=2, CFG parallel size=2. + +pipeline: dreamzero +async_chunk: false +distributed_executor_backend: mp +dtype: bfloat16 + +stages: + - stage_id: 0 + devices: "0,1,2,3" + max_num_seqs: 1 + enforce_eager: true + model_class_name: DreamZeroPipeline + parallel_config: + tensor_parallel_size: 2 + cfg_parallel_size: 2 + model_config: + default_robot_embodiment: roboarena + policy_server_config: + image_resolution: [180, 320] + n_external_cameras: 2 + needs_wrist_camera: true + needs_stereo_camera: false + needs_session_id: true + action_space: joint_position diff --git a/vllm_omni/model_executor/models/dreamzero/__init__.py b/vllm_omni/model_executor/models/dreamzero/__init__.py new file mode 100644 index 00000000000..0897383fc8d --- /dev/null +++ b/vllm_omni/model_executor/models/dreamzero/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm_omni.model_executor.models.dreamzero.pipeline import DREAMZERO_PIPELINE + +__all__ = ["DREAMZERO_PIPELINE"] diff --git a/vllm_omni/model_executor/models/dreamzero/pipeline.py b/vllm_omni/model_executor/models/dreamzero/pipeline.py new file mode 100644 index 
00000000000..bc815dedcf7 --- /dev/null +++ b/vllm_omni/model_executor/models/dreamzero/pipeline.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""DreamZero single-stage diffusion topology.""" + +from vllm_omni.config.stage_config import ( + PipelineConfig, + StageExecutionType, + StagePipelineConfig, +) + +DREAMZERO_PIPELINE = PipelineConfig( + model_type="dreamzero", + model_arch="DreamZeroPipeline", + stages=( + StagePipelineConfig( + stage_id=0, + model_stage="diffusion", + execution_type=StageExecutionType.DIFFUSION, + input_sources=(), + final_output=True, + final_output_type="image", + model_arch="DreamZeroPipeline", + ), + ), +) diff --git a/vllm_omni/model_executor/stage_configs/dreamzero.yaml b/vllm_omni/model_executor/stage_configs/dreamzero.yaml deleted file mode 100644 index a2a8559ac5c..00000000000 --- a/vllm_omni/model_executor/stage_configs/dreamzero.yaml +++ /dev/null @@ -1,22 +0,0 @@ -stage_args: - - stage_id: 0 - stage_type: diffusion - runtime: - devices: "0" - engine_args: - dtype: bfloat16 - model_stage: diffusion - model_class_name: DreamZeroPipeline - distributed_executor_backend: "mp" - max_num_seqs: 1 - model_config: - default_robot_embodiment: roboarena - policy_server_config: - image_resolution: [180, 320] - n_external_cameras: 2 - needs_wrist_camera: true - needs_stereo_camera: false - needs_session_id: true - action_space: joint_position - final_output: true - final_output_type: image diff --git a/vllm_omni/model_executor/stage_configs/dreamzero_tp1_cfg2.yaml b/vllm_omni/model_executor/stage_configs/dreamzero_tp1_cfg2.yaml deleted file mode 100644 index b83f77460f9..00000000000 --- a/vllm_omni/model_executor/stage_configs/dreamzero_tp1_cfg2.yaml +++ /dev/null @@ -1,24 +0,0 @@ -stage_args: - - stage_id: 0 - stage_type: diffusion - runtime: - devices: "0,1" - engine_args: - model_stage: diffusion - model_class_name: DreamZeroPipeline - 
distributed_executor_backend: "mp" - max_num_seqs: 1 - parallel_config: - tensor_parallel_size: 1 - cfg_parallel_size: 2 - model_config: - default_robot_embodiment: roboarena - policy_server_config: - image_resolution: [180, 320] - n_external_cameras: 2 - needs_wrist_camera: true - needs_stereo_camera: false - needs_session_id: true - action_space: joint_position - final_output: true - final_output_type: image diff --git a/vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg1.yaml b/vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg1.yaml deleted file mode 100644 index 7f5a0c60598..00000000000 --- a/vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg1.yaml +++ /dev/null @@ -1,24 +0,0 @@ -stage_args: - - stage_id: 0 - stage_type: diffusion - runtime: - devices: "0,1" - engine_args: - model_stage: diffusion - model_class_name: DreamZeroPipeline - distributed_executor_backend: "mp" - max_num_seqs: 1 - parallel_config: - tensor_parallel_size: 2 - cfg_parallel_size: 1 - model_config: - default_robot_embodiment: roboarena - policy_server_config: - image_resolution: [180, 320] - n_external_cameras: 2 - needs_wrist_camera: true - needs_stereo_camera: false - needs_session_id: true - action_space: joint_position - final_output: true - final_output_type: image diff --git a/vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg2.yaml b/vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg2.yaml deleted file mode 100644 index 20e7c581c9a..00000000000 --- a/vllm_omni/model_executor/stage_configs/dreamzero_tp2_cfg2.yaml +++ /dev/null @@ -1,24 +0,0 @@ -stage_args: - - stage_id: 0 - stage_type: diffusion - runtime: - devices: "0,1,2,3" - engine_args: - model_stage: diffusion - model_class_name: DreamZeroPipeline - distributed_executor_backend: "mp" - max_num_seqs: 1 - parallel_config: - tensor_parallel_size: 2 - cfg_parallel_size: 2 - model_config: - default_robot_embodiment: roboarena - policy_server_config: - image_resolution: [180, 320] - n_external_cameras: 
2 - needs_wrist_camera: true - needs_stereo_camera: false - needs_session_id: true - action_space: joint_position - final_output: true - final_output_type: image From 285cc577db76f017470a1579f7c60405bb8e10fe Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 17 May 2026 01:30:11 +0000 Subject: [PATCH 16/45] docs: clarify OpenPI robot serving ownership Keep preprocessing ownership in the loaded policy pipeline and avoid implying that the OpenPI websocket route is model-specific. Signed-off-by: Yangshen Deng Co-authored-by: Meng --- vllm_omni/entrypoints/openai/api_server.py | 2 +- .../openai/realtime/robot/openpi_connection.py | 4 ++-- .../entrypoints/openai/realtime/robot/openpi_serving.py | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index ad887098b1d..d406b3a0dc0 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1417,7 +1417,7 @@ async def realtime_websocket(websocket: WebSocket): async def realtime_robot_openpi(websocket: WebSocket): """WebSocket endpoint for robot policy inference (OpenPI protocol). - Binary frames: msgpack observation/action (DreamZero/OpenPI compatible). + Binary frames: msgpack observation/action (OpenPI compatible). Text frames: JSON control events (session.update, etc.). See realtime.robot.openpi_connection.py for protocol details. """ diff --git a/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py index b85033afad7..dca8306ff41 100644 --- a/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py +++ b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py @@ -3,7 +3,7 @@ """WebSocket connection for robot policy inference (OpenPI protocol). 
-Protocol (compatible with DreamZero test_client_AR.py): +Protocol (compatible with OpenPI policy clients): Connect -> server sends msgpack(PolicyServerConfig fields) Infer -> client sends msgpack(obs), server sends msgpack(ndarray) Reset -> client sends msgpack({endpoint:reset}), server sends "reset successful" @@ -69,7 +69,7 @@ def _unpack_request(self, data: bytes) -> dict[str, Any]: return obs async def handle_connection(self) -> None: - """Main loop. Matches DreamZero policy_server.py._handler.""" + """Main loop for OpenPI-compatible policy serving.""" await self.websocket.accept() try: diff --git a/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py b/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py index 00ccdcfa616..f8f878de031 100644 --- a/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py +++ b/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py @@ -3,8 +3,8 @@ """Serving layer for robot policy inference via `/v1/realtime/robot/openpi`. -Flow: raw obs → `DiffusionEngine.step()` → actions. -DreamZero owns dataset transforms inside the diffusion pipeline. +Flow: raw obs → engine request → actions. +The loaded policy model owns dataset transforms inside its pipeline. """ from __future__ import annotations @@ -117,7 +117,7 @@ def _get_policy_server_config(engine_client: Any) -> PolicyServerConfig: def reset(self, obs: dict) -> None: """Reset serving state. - Engine-side DreamZero state is reset on the next inference request via + Engine-side policy state is reset on the next inference request via `extra_args["reset"]`, not by an immediate websocket-side RPC. """ self._call_count = 0 @@ -162,7 +162,7 @@ def _build_request(self, obs: dict) -> Any: from vllm_omni.inputs.data import OmniDiffusionSamplingParams # `_call_count` is reset by websocket reset/session switches, then - # incremented before this request is built. DreamZero pipeline consumes + # incremented before this request is built. 
The policy pipeline consumes # this flag and clears its frame buffer / KV cache before accumulation. extra_args = { "reset": self._call_count <= 1, From ae73883ead68280a5bc4f78d6c67cffa50e83ae2 Mon Sep 17 00:00:00 2001 From: Meng Date: Sun, 17 May 2026 01:38:11 +0000 Subject: [PATCH 17/45] examples: remove DreamZero comparison batch helper Drop the local-only batch video comparison wrapper from the online serving example and keep the README focused on serving, client usage, and the single-config export helper. Signed-off-by: Meng Co-authored-by: Yangshen Deng --- examples/online_serving/dreamzero/README.md | 44 +---- .../dreamzero/generate_comparison_videos.py | 169 ------------------ 2 files changed, 3 insertions(+), 210 deletions(-) delete mode 100644 examples/online_serving/dreamzero/generate_comparison_videos.py diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md index e324f8cf2ed..468c83d3ed7 100644 --- a/examples/online_serving/dreamzero/README.md +++ b/examples/online_serving/dreamzero/README.md @@ -8,14 +8,13 @@ compatible OpenPI websocket client using bundled real camera videos. - `run_server.sh`: launch DreamZero OpenPI serving - `openpi_client.py`: websocket client that sends real observations - `export_prediction_video.py`: offline helper that runs vLLM once and decodes DreamZero `video_pred` latents to MP4 -- `generate_comparison_videos.py`: batch helper for TP/CFG comparison videos - `droid_sim_eval_client.py`: DROID `sim-evals` rollout client for the vLLM OpenPI server - `assets/`: minimal real camera videos used by the example ## Environment requirements -- `run_server.sh`, `vllm serve`, `openpi_client.py`, `export_prediction_video.py`, - `generate_comparison_videos.py`, and the standard example/e2e tests: +- `run_server.sh`, `vllm serve`, `openpi_client.py`, + `export_prediction_video.py`, and the standard example/e2e tests: use the local `vllm-omni` environment. 
- `openpi_client.py` extra deps: @@ -124,7 +123,7 @@ Single-config export: python examples/online_serving/dreamzero/export_prediction_video.py \ --model GEAR-Dreams/DreamZero-DROID \ --deploy-config vllm_omni/deploy/dreamzero.yaml \ - --output-dir outputs/dreamzero/comparison_videos \ + --output-dir outputs/dreamzero/generated_predictions \ --output-stem tp1_cfg1_vllm_example ``` @@ -134,43 +133,6 @@ Optional flags: - `--save-gif`: also writes GIFs for GitHub comments - `--save-actions`: also writes action chunks as `.npz` -Batch comparison export: - -```bash -python examples/online_serving/dreamzero/generate_comparison_videos.py \ - --skip-existing \ - --continue-on-error -``` - -The batch helper tries to generate: - -- `dreamzero_input_reference.mp4`: stitched real input video -- `tp1_cfg1_vllm_example.mp4` -- `tp1_cfg2_vllm_example.mp4` -- `tp2_cfg1_vllm_example.mp4` -- `tp2_cfg2_vllm_example.mp4` -- `dreamzero_upstream_reference.mp4`: copied when `--upstream-video` is provided - -Notes: - -- `tp2_cfg2` needs four free GPUs because `TP=2` and `CF_P=2`. -- If a variant fails, `manifest.json` records the failure and keeps all successful videos. -- The helper does not run the upstream DreamZero server. To include an upstream - reference video, pass `--upstream-video /path/to/video.mp4`. - -Current cleaned comparison outputs are under: - -- `outputs/dreamzero/comparison_videos/` - -The useful files are: - -- `dreamzero_input_reference.mp4` -- `tp1_cfg1_vllm_example.mp4` -- `tp1_cfg2_vllm_example.mp4` -- `tp2_cfg1_vllm_example.mp4` -- `dreamzero_upstream_reference.mp4` -- `manifest.json` - ## Run DROID sim-eval against the vLLM server This is the closest setup to an end-to-end simulated policy rollout. 
diff --git a/examples/online_serving/dreamzero/generate_comparison_videos.py b/examples/online_serving/dreamzero/generate_comparison_videos.py deleted file mode 100644 index cd06669253e..00000000000 --- a/examples/online_serving/dreamzero/generate_comparison_videos.py +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from __future__ import annotations - -import argparse -import json -import shutil -import subprocess -import sys -from pathlib import Path - -import cv2 -import numpy as np - -REPO_ROOT = Path(__file__).resolve().parents[3] -EXAMPLE_DIR = Path(__file__).resolve().parent -EXPORT_SCRIPT = EXAMPLE_DIR / "export_prediction_video.py" -DEFAULT_OUTPUT_DIR = REPO_ROOT / "outputs" / "dreamzero" / "comparison_videos" -DEFAULT_MODEL = "GEAR-Dreams/DreamZero-DROID" -ASSETS_DIR = EXAMPLE_DIR / "assets" - -CAMERA_FILES = { - "observation/exterior_image_0_left": "exterior_image_1_left.mp4", - "observation/exterior_image_1_left": "exterior_image_2_left.mp4", - "observation/wrist_image_left": "wrist_image_left.mp4", -} - -DEPLOY_CONFIGS = { - "tp1_cfg1": REPO_ROOT / "vllm_omni" / "deploy" / "dreamzero.yaml", - "tp1_cfg2": REPO_ROOT / "vllm_omni" / "deploy" / "dreamzero_tp1_cfg2.yaml", - "tp2_cfg1": REPO_ROOT / "vllm_omni" / "deploy" / "dreamzero_tp2_cfg1.yaml", - "tp2_cfg2": REPO_ROOT / "vllm_omni" / "deploy" / "dreamzero_tp2_cfg2.yaml", -} - - -def _parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Generate DreamZero comparison videos for four vLLM configs.") - parser.add_argument("--model", default=DEFAULT_MODEL) - parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR) - parser.add_argument("--python", default=sys.executable) - parser.add_argument("--upstream-video", type=Path, default=None) - parser.add_argument("--fps", type=int, default=5) - parser.add_argument("--skip-existing", 
action="store_true") - parser.add_argument("--continue-on-error", action="store_true") - return parser.parse_args() - - -def _load_all_frames(video_path: Path) -> np.ndarray: - cap = cv2.VideoCapture(str(video_path)) - frames = [] - while True: - ok, frame = cap.read() - if not ok: - break - frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) - cap.release() - if not frames: - raise RuntimeError(f"No frames loaded from {video_path}") - return np.stack(frames, axis=0) - - -def _load_camera_frames() -> dict[str, np.ndarray]: - camera_frames: dict[str, np.ndarray] = {} - for camera_key, file_name in CAMERA_FILES.items(): - camera_frames[camera_key] = _load_all_frames(ASSETS_DIR / file_name) - return camera_frames - - -def _write_mp4(path: Path, frames: np.ndarray, fps: int) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - height, width = frames.shape[1:3] - writer = cv2.VideoWriter(str(path), cv2.VideoWriter_fourcc(*"mp4v"), float(fps), (width, height)) - if not writer.isOpened(): - raise RuntimeError(f"Failed to open video writer for {path}") - try: - for frame in frames: - writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) - finally: - writer.release() - - -def _write_input_reference(output_dir: Path) -> Path: - output_path = output_dir / "dreamzero_input_reference.mp4" - camera_frames = _load_camera_frames() - total_frames = min(frames.shape[0] for frames in camera_frames.values()) - stitched = [] - for frame_index in range(total_frames): - left = camera_frames["observation/exterior_image_0_left"][frame_index] - right = camera_frames["observation/exterior_image_1_left"][frame_index] - wrist = camera_frames["observation/wrist_image_left"][frame_index] - pad = np.zeros((left.shape[0], left.shape[1], 3), dtype=np.uint8) - top = np.concatenate([left, right], axis=1) - bottom = np.concatenate([wrist, pad], axis=1) - stitched.append(np.concatenate([top, bottom], axis=0)) - _write_mp4(output_path, np.stack(stitched, axis=0), fps=15) - return output_path - - 
-def _run_export(args: argparse.Namespace, config_name: str, deploy_config_path: Path) -> Path: - output_stem = f"{config_name}_vllm_example" - output_path = args.output_dir / f"{output_stem}.mp4" - if args.skip_existing and output_path.exists(): - return output_path - - cmd = [ - args.python, - str(EXPORT_SCRIPT), - "--model", - args.model, - "--deploy-config", - str(deploy_config_path), - "--output-dir", - str(args.output_dir), - "--output-stem", - output_stem, - "--fps", - str(args.fps), - ] - subprocess.run(cmd, check=True, cwd=REPO_ROOT) - return output_path - - -def _copy_upstream_video(output_dir: Path, upstream_video: Path) -> Path: - output_dir.mkdir(parents=True, exist_ok=True) - dst = output_dir / "dreamzero_upstream_reference.mp4" - shutil.copy2(upstream_video, dst) - return dst - - -def _display_path(path: Path) -> str: - try: - return str(path.resolve().relative_to(REPO_ROOT)) - except ValueError: - return str(path) - - -def main() -> None: - args = _parse_args() - args.output_dir.mkdir(parents=True, exist_ok=True) - - manifest: dict[str, str] = {} - failures: dict[str, str] = {} - manifest["input_reference"] = _display_path(_write_input_reference(args.output_dir)) - - for config_name, deploy_config_path in DEPLOY_CONFIGS.items(): - try: - manifest[config_name] = _display_path(_run_export(args, config_name, deploy_config_path)) - except subprocess.CalledProcessError as exc: - if not args.continue_on_error: - raise - failures[config_name] = str(exc).replace(str(REPO_ROOT) + "/", "") - - if args.upstream_video is not None: - manifest["upstream_reference"] = _display_path(_copy_upstream_video(args.output_dir, args.upstream_video)) - - manifest_path = args.output_dir / "manifest.json" - manifest_path.write_text(json.dumps({"videos": manifest, "failures": failures}, indent=2) + "\n") - - for name, path in manifest.items(): - print(f"{name}={path}") - for name, error in failures.items(): - print(f"FAILED_{name}={error}") - print(f"manifest={manifest_path}") 
- - -if __name__ == "__main__": - main() From 07ca1a68ea38a289ca247c0ebe512dc5d4b30b99 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 17 May 2026 01:47:36 +0000 Subject: [PATCH 18/45] tests: move DreamZero config regression out of stage proc Keep StageDiffusionProc tests focused on runtime batching behavior, move the DreamZero-specific config enrichment regression to the DreamZero config tests, and remove the unrelated OmniVoice regression. Signed-off-by: Yangshen Deng Co-authored-by: Meng --- tests/diffusion/test_stage_diffusion_proc.py | 40 ------------------- .../test_resolve_dreamzero_config.py | 19 +++++++++ 2 files changed, 19 insertions(+), 40 deletions(-) diff --git a/tests/diffusion/test_stage_diffusion_proc.py b/tests/diffusion/test_stage_diffusion_proc.py index e2cb8087d59..36ceb6cfc49 100644 --- a/tests/diffusion/test_stage_diffusion_proc.py +++ b/tests/diffusion/test_stage_diffusion_proc.py @@ -9,7 +9,6 @@ import pytest -from vllm_omni.diffusion.data import OmniDiffusionConfig from vllm_omni.diffusion.stage_diffusion_proc import StageDiffusionProc from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -147,42 +146,3 @@ async def run_task(req_data): time_gap = elapsed_time - base_time assert time_gap > time_gap_std - eps and time_gap < time_gap_std + eps base_time = elapsed_time - - -def test_enrich_config_preserves_explicit_model_class_name(monkeypatch): - monkeypatch.setattr( - "vllm.transformers_utils.config.get_hf_file_to_dict", - lambda path, _model: None if path == "model_index.json" else {"model_type": "vla", "architectures": ["VLA"]}, - ) - - od_config = OmniDiffusionConfig( - model="GEAR-Dreams/DreamZero-DROID", - model_class_name="DreamZeroPipeline", - ) - proc = StageDiffusionProc(od_config.model, od_config) - - proc._enrich_config() - - assert od_config.model_class_name == "DreamZeroPipeline" - - -def test_enrich_config_keeps_omnivoice_architecture_behavior(monkeypatch): - monkeypatch.setattr( - 
"vllm.transformers_utils.config.get_hf_file_to_dict", - lambda path, _model: None - if path == "model_index.json" - else { - "model_type": "omnivoice", - "architectures": ["OmniVoice"], - }, - ) - - od_config = OmniDiffusionConfig( - model="k2-fsa/OmniVoice", - model_class_name="OmniVoicePipeline", - ) - proc = StageDiffusionProc(od_config.model, od_config) - - proc._enrich_config() - - assert od_config.model_class_name == "OmniVoice" diff --git a/tests/entrypoints/test_resolve_dreamzero_config.py b/tests/entrypoints/test_resolve_dreamzero_config.py index 52581d27236..a962c5262c1 100644 --- a/tests/entrypoints/test_resolve_dreamzero_config.py +++ b/tests/entrypoints/test_resolve_dreamzero_config.py @@ -1,5 +1,7 @@ import pytest +from vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.stage_diffusion_proc import StageDiffusionProc from vllm_omni.entrypoints.utils import load_stage_configs_from_model, resolve_model_config_path pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -35,3 +37,20 @@ def test_dreamzero_config_sets_model_class_and_policy_config(monkeypatch): assert engine_args.model_class_name == "DreamZeroPipeline" assert engine_args.model_config.policy_server_config.action_space == "joint_position" + + +def test_dreamzero_enrich_config_preserves_explicit_model_class_name(monkeypatch): + monkeypatch.setattr( + "vllm.transformers_utils.config.get_hf_file_to_dict", + lambda path, _model: None if path == "model_index.json" else {"model_type": "vla", "architectures": ["VLA"]}, + ) + + od_config = OmniDiffusionConfig( + model="GEAR-Dreams/DreamZero-DROID", + model_class_name="DreamZeroPipeline", + ) + proc = StageDiffusionProc(od_config.model, od_config) + + proc._enrich_config() + + assert od_config.model_class_name == "DreamZeroPipeline" From ae5673487a6953d274b5bc8053f244ad3ec2130d Mon Sep 17 00:00:00 2001 From: Meng Date: Sun, 17 May 2026 01:51:13 +0000 Subject: [PATCH 19/45] docs: remove local absolute paths from DreamZero 
examples Replace machine-specific and placeholder absolute paths in DreamZero example docs and helper usage strings with repository-relative paths or environment variables. Signed-off-by: Meng Co-authored-by: Yangshen Deng --- examples/online_serving/dreamzero/README.md | 9 ++++++--- .../online_serving/dreamzero/droid_sim_eval_client.py | 9 ++++----- .../dreamzero/molmospace_dreamzero_eval_demo.py | 6 +++--- .../upstream/upstream_socket_server_no_compile.py | 4 ++-- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md index 468c83d3ed7..13704fe28de 100644 --- a/examples/online_serving/dreamzero/README.md +++ b/examples/online_serving/dreamzero/README.md @@ -183,7 +183,7 @@ Isaac Lab launcher, for example: ```bash CUDA_VISIBLE_DEVICES=1 \ -/path/to/isaaclab.sh -p \ +"${ISAACLAB_LAUNCHER}" -p \ examples/online_serving/dreamzero/droid_sim_eval_client.py \ --host 127.0.0.1 \ --port 8000 \ @@ -342,6 +342,9 @@ From the repository root: ```bash python examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py \ - --benchmark_dir /path/to/molmospaces/assets/benchmarks/molmospaces-bench-v2/benchmarks/20260327/ithor/FrankaCloseHardBench/FrankaCloseHardBench_20260206_json_benchmark \ - --output_dir /path/to/eval_output --max_episodes 1 --task_horizon_steps 240 --episode_idx 1 + --benchmark_dir "${MOLMOSPACES_BENCHMARK_DIR}/20260327/ithor/FrankaCloseHardBench/FrankaCloseHardBench_20260206_json_benchmark" \ + --output_dir outputs/dreamzero/molmospaces \ + --max_episodes 1 \ + --task_horizon_steps 240 \ + --episode_idx 1 ``` diff --git a/examples/online_serving/dreamzero/droid_sim_eval_client.py b/examples/online_serving/dreamzero/droid_sim_eval_client.py index 3a4999cc106..c1910282f47 100644 --- a/examples/online_serving/dreamzero/droid_sim_eval_client.py +++ b/examples/online_serving/dreamzero/droid_sim_eval_client.py @@ -20,12 +20,11 @@ at `/v1/realtime/robot/openpi`, 
so this script includes the path suffix in the client URI. -Run this script through Isaac Lab's launcher from the external `sim-evals` -checkout, for example: +Run this script through Isaac Lab's launcher from the vLLM-Omni repository +root, for example: - cd /path/to/sim-evals - ./submodules/IsaacLab/isaaclab.sh -p \ - /path/to/vllm-omni-wm/examples/online_serving/dreamzero/droid_sim_eval_client.py \ + "${ISAACLAB_LAUNCHER}" -p \ + examples/online_serving/dreamzero/droid_sim_eval_client.py \ --host 127.0.0.1 \ --port 8000 \ --scene 1 \ diff --git a/examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py b/examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py index 42c80ebb07e..11f5dc68b86 100644 --- a/examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py +++ b/examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py @@ -38,9 +38,9 @@ def main() -> int: "--benchmark_dir", required=True, help=( - "Path to benchmark directory, e.g. " - "/path/to/molmospaces/assets/benchmarks/molmospaces-bench-v2/benchmarks/" - "20260327/ithor/FrankaCloseHardBench/FrankaCloseHardBench_20260206_json_benchmark" + "Path to a MolmoSpaces benchmark directory, for example " + "$MOLMOSPACES_BENCHMARK_DIR/20260327/ithor/FrankaCloseHardBench/" + "FrankaCloseHardBench_20260206_json_benchmark" ), ) parser.add_argument("--max_episodes", type=int, default=1) diff --git a/tests/dreamzero/upstream/upstream_socket_server_no_compile.py b/tests/dreamzero/upstream/upstream_socket_server_no_compile.py index 8ea38237b67..82ed6866bba 100644 --- a/tests/dreamzero/upstream/upstream_socket_server_no_compile.py +++ b/tests/dreamzero/upstream/upstream_socket_server_no_compile.py @@ -22,10 +22,10 @@ PyTorch SDPA (`ATTENTION_BACKEND=torch`) for this subprocess only. 
Usage: - PYTHONPATH=/home/yangshen/code/dreamzero \\ + PYTHONPATH="${DREAMZERO_REPO}" \\ .venv/bin/python -m torch.distributed.run --standalone --nproc_per_node=1 \\ tests/dreamzero/upstream/upstream_socket_server_no_compile.py --port 18081 \\ - --model_path /home/yangshen/code/dreamzero/checkpoints/dreamzero + --model_path "${DREAMZERO_REPO}/checkpoints/dreamzero" """ from __future__ import annotations From 79820d479e5d8c934846413af00cbfeb6a36c09b Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 17 May 2026 01:55:54 +0000 Subject: [PATCH 20/45] docs: clarify DreamZero sim-eval optional dependencies Document that DROID sim-eval requires an external Isaac Lab/sim-evals environment while ordinary serving and e2e usage do not, and align import error messages with that optional dependency boundary. Signed-off-by: Yangshen Deng Co-authored-by: Meng --- examples/online_serving/dreamzero/README.md | 32 ++++++++----------- .../dreamzero/droid_sim_eval_client.py | 16 +++++----- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md index 13704fe28de..2414f0da207 100644 --- a/examples/online_serving/dreamzero/README.md +++ b/examples/online_serving/dreamzero/README.md @@ -28,18 +28,23 @@ pip install openpi-client websockets opencv-python pip install opencv-python pillow ``` +Optional DROID sim-eval dependencies: + +- Plain serving, `openpi_client.py`, and standard e2e tests do **not** require + Isaac Lab or `sim-evals`. 
- `droid_sim_eval_client.py` must run in an external Isaac Lab / `sim-evals` - environment, and also needs: + environment where these imports already work: + - `isaaclab` + - `isaaclab_tasks` + - `sim_evals` + - `gymnasium` +- In that simulator environment, also install the OpenPI/client-side helpers: ```bash -pip install openpi-client websockets opencv-python mediapy +pip install openpi-client websockets opencv-python mediapy typing-extensions ``` -- On Python `< 3.12`, also install: - -```bash -pip install typing-extensions -``` +- `typing-extensions` is only needed on Python `< 3.12`. - Optional `tests/dreamzero/upstream/*` parity tests also require: - `DREAMZERO_REPO` pointing to an upstream DreamZero checkout @@ -161,22 +166,11 @@ vllm serve \ ### 2. Start the DROID simulation client -Run this from an environment where `isaaclab`, `isaaclab_tasks`, -`sim_evals`, and `gymnasium` are already importable. - Environment: - do **not** run this from the plain `vllm-omni` env unless it already has Isaac Lab and `sim_evals` - launch it from the Isaac Lab / `sim-evals` environment -- make sure the following imports work there: - - `isaaclab` - - `isaaclab_tasks` - - `sim_evals` - - `gymnasium` - - `openpi_client` - - `websockets` - - `cv2` - - `mediapy` +- see the optional DROID sim-eval dependencies above From the `vllm-omni` repository root, invoke the client through an external Isaac Lab launcher, for example: diff --git a/examples/online_serving/dreamzero/droid_sim_eval_client.py b/examples/online_serving/dreamzero/droid_sim_eval_client.py index c1910282f47..f3c00c4592d 100644 --- a/examples/online_serving/dreamzero/droid_sim_eval_client.py +++ b/examples/online_serving/dreamzero/droid_sim_eval_client.py @@ -52,7 +52,7 @@ try: import mediapy except ImportError as exc: # pragma: no cover - runtime dependency guard - raise ImportError("DreamZero sim-eval client requires `mediapy`.") from exc + raise ImportError("Optional DROID sim-eval client requires `mediapy`.") 
from exc try: from typing import override @@ -60,12 +60,12 @@ try: from typing_extensions import override except ImportError as exc: # pragma: no cover - runtime dependency guard - raise ImportError("DreamZero sim-eval client requires `typing-extensions` on Python < 3.12.") from exc + raise ImportError("Optional DROID sim-eval client requires `typing-extensions` on Python < 3.12.") from exc try: import websockets.sync.client except ImportError as exc: # pragma: no cover - runtime dependency guard - raise ImportError("DreamZero sim-eval client requires `websockets`.") from exc + raise ImportError("Optional DROID sim-eval client requires `websockets`.") from exc # NOTE: # This directory already contains a local file named `openpi_client.py`. @@ -87,7 +87,7 @@ from openpi_client import image_tools, msgpack_numpy from openpi_client.base_policy import BasePolicy except ImportError as exc: # pragma: no cover - runtime dependency guard - raise ImportError("DreamZero sim-eval client requires the optional `openpi-client` package.") from exc + raise ImportError("Optional DROID sim-eval client requires the `openpi-client` package.") from exc finally: if removed_path: sys.path.insert(0, example_dir) @@ -553,7 +553,7 @@ def main() -> None: from isaaclab.app import AppLauncher except ImportError as exc: # pragma: no cover - runtime dependency guard raise ImportError( - "DreamZero sim-eval client requires Isaac Lab (`isaaclab`). " + "Optional DROID sim-eval client requires Isaac Lab (`isaaclab`). " "Launch it from an Isaac Lab environment, e.g. via `isaaclab.sh -p`." 
) from exc @@ -576,19 +576,19 @@ def main() -> None: try: import gymnasium as gym except ImportError as exc: # pragma: no cover - runtime dependency guard - raise ImportError("DreamZero sim-eval client requires `gymnasium`.") from exc + raise ImportError("Optional DROID sim-eval client requires `gymnasium`.") from exc try: import sim_evals.environments # noqa: F401 except ImportError as exc: # pragma: no cover - runtime dependency guard raise ImportError( - "DreamZero sim-eval client requires the external `sim-evals` package or checkout to be importable." + "Optional DROID sim-eval client requires the external `sim-evals` package or checkout to be importable." ) from exc try: from isaaclab_tasks.utils import parse_env_cfg except ImportError as exc: # pragma: no cover - runtime dependency guard - raise ImportError("DreamZero sim-eval client requires `isaaclab_tasks`.") from exc + raise ImportError("Optional DROID sim-eval client requires `isaaclab_tasks`.") from exc # Resolve output location and scene prompt. output_dir = _make_output_dir(args.output_root.expanduser().resolve(), args.scene) From a3c458989cec19ac9fd57861ef28c7d803c608f8 Mon Sep 17 00:00:00 2001 From: Meng Date: Sun, 17 May 2026 02:06:31 +0000 Subject: [PATCH 21/45] tests: keep only DreamZero upstream e2e parity Remove smaller upstream-dependent DreamZero parity tests and keep the formal OpenPI server-vs-server e2e parity test plus its helpers. Require DREAMZERO_REPO explicitly instead of assuming a local checkout path. 
Signed-off-by: Meng Co-authored-by: Yangshen Deng --- .../upstream/openpi_test_client_ar.py | 10 +- .../upstream/test_client_ar_path_parity.py | 171 ------------------ .../upstream/test_openpi_e2e_source_parity.py | 23 ++- .../test_roboarena_transform_source_parity.py | 121 ------------- .../test_video_preprocess_source_parity.py | 68 ------- .../upstream_socket_server_no_compile.py | 5 +- 6 files changed, 27 insertions(+), 371 deletions(-) delete mode 100644 tests/dreamzero/upstream/test_client_ar_path_parity.py delete mode 100644 tests/dreamzero/upstream/test_roboarena_transform_source_parity.py delete mode 100644 tests/dreamzero/upstream/test_video_preprocess_source_parity.py diff --git a/tests/dreamzero/upstream/openpi_test_client_ar.py b/tests/dreamzero/upstream/openpi_test_client_ar.py index 8f6de5e487b..7c5aea1d901 100644 --- a/tests/dreamzero/upstream/openpi_test_client_ar.py +++ b/tests/dreamzero/upstream/openpi_test_client_ar.py @@ -49,10 +49,13 @@ import numpy as np from openpi_client import msgpack_numpy -DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() +_DREAMZERO_REPO_ENV = os.environ.get("DREAMZERO_REPO") +DREAMZERO_REPO = Path(_DREAMZERO_REPO_ENV).expanduser() if _DREAMZERO_REPO_ENV else None def _import_upstream_policy_modules(): + if DREAMZERO_REPO is None: + raise ImportError("Set DREAMZERO_REPO to an upstream DreamZero checkout before using this helper.") if DREAMZERO_REPO.exists() and str(DREAMZERO_REPO) not in sys.path: sys.path.insert(0, str(DREAMZERO_REPO)) @@ -64,7 +67,10 @@ def _import_upstream_policy_modules(): policy_server, WebsocketClientPolicy = _import_upstream_policy_modules() -VIDEO_DIR = os.environ.get("DREAMZERO_VIDEO_DIR", str(DREAMZERO_REPO / "debug_image")) +VIDEO_DIR = os.environ.get( + "DREAMZERO_VIDEO_DIR", + str(DREAMZERO_REPO / "debug_image") if DREAMZERO_REPO is not None else "debug_image", +) # roboarena key -> video filename CAMERA_FILES = { diff --git 
a/tests/dreamzero/upstream/test_client_ar_path_parity.py b/tests/dreamzero/upstream/test_client_ar_path_parity.py deleted file mode 100644 index 0705b9e88ed..00000000000 --- a/tests/dreamzero/upstream/test_client_ar_path_parity.py +++ /dev/null @@ -1,171 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -"""Checks that the copied DreamZero client keeps identical logic across paths. - -The only difference between talking to the upstream DreamZero websocket server -and the vLLM OpenPI websocket server should be the websocket URI suffix: - -- upstream DreamZero: ``ws://host:port`` -- vLLM OpenPI: ``ws://host:port/v1/realtime/robot/openpi`` - -This file verifies that `tests/dreamzero/upstream/openpi_test_client_ar.py` preserves the same -observation / infer / reset flow for both cases. -""" - -from __future__ import annotations - -import importlib.util -import os -import sys -import uuid -from pathlib import Path - -import numpy as np -import pytest - -DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() -CLIENT_SCRIPT = Path(__file__).resolve().with_name("openpi_test_client_ar.py") - -pytestmark = pytest.mark.skipif( - not DREAMZERO_REPO.exists(), - reason="DreamZero source repo is required", -) - -if str(DREAMZERO_REPO) not in sys.path: - sys.path.insert(0, str(DREAMZERO_REPO)) - - -def _load_client_module(): - spec = importlib.util.spec_from_file_location( - "dreamzero_test_client_ar_module", - CLIENT_SCRIPT, - ) - assert spec is not None - assert spec.loader is not None - module = importlib.util.module_from_spec(spec) - try: - spec.loader.exec_module(module) - except ModuleNotFoundError as exc: - pytest.skip(f"DreamZero client dependency is missing: {exc.name}") - return module - - -def _snapshot_obs(obs: dict) -> dict: - snapshot = {} - for key, value in obs.items(): - if isinstance(value, np.ndarray): - snapshot[key] = value.copy() - else: - snapshot[key] = 
value - return snapshot - - -def _assert_obs_sequence_equal(actual: list[dict], expected: list[dict]) -> None: - assert len(actual) == len(expected) - for actual_obs, expected_obs in zip(actual, expected, strict=True): - assert set(actual_obs) == set(expected_obs) - for key in actual_obs: - actual_value = actual_obs[key] - expected_value = expected_obs[key] - if isinstance(actual_value, np.ndarray): - assert isinstance(expected_value, np.ndarray) - assert actual_value.dtype == expected_value.dtype - assert actual_value.shape == expected_value.shape - assert np.array_equal(actual_value, expected_value) - else: - assert actual_value == expected_value - - -def test_websocket_uri_differs_only_by_path(monkeypatch) -> None: - client_mod = _load_client_module() - monkeypatch.setattr( - client_mod.OpenPIWebsocketClientPolicy, - "_wait_for_server", - lambda self: (object(), {}), - raising=False, - ) - - upstream = client_mod.OpenPIWebsocketClientPolicy( - host="127.0.0.1", - port=8000, - path="", - ) - vllm = client_mod.OpenPIWebsocketClientPolicy( - host="127.0.0.1", - port=8000, - path="/v1/realtime/robot/openpi", - ) - - assert upstream._uri == "ws://127.0.0.1:8000" - assert vllm._uri == "ws://127.0.0.1:8000/v1/realtime/robot/openpi" - - -def test_zero_image_client_flow_is_identical_across_server_paths(monkeypatch) -> None: - client_mod = _load_client_module() - fixed_session_id = uuid.UUID("12345678-1234-5678-1234-567812345678") - monkeypatch.setattr(uuid, "uuid4", lambda: fixed_session_id) - monkeypatch.setattr(client_mod, "_log_action", lambda actions, dt: None) - - instances = [] - - class FakeClient: - def __init__(self, host: str, port: int, path: str) -> None: - self.host = host - self.port = port - self.path = path - self.metadata_calls = 0 - self.infer_obs = [] - self.reset_payloads = [] - instances.append(self) - - def get_server_metadata(self) -> dict: - self.metadata_calls += 1 - return { - "image_resolution": [180, 320], - "n_external_cameras": 2, - 
"needs_wrist_camera": True, - "needs_stereo_camera": False, - "needs_session_id": True, - "action_space": "joint_position", - } - - def infer(self, obs: dict) -> np.ndarray: - self.infer_obs.append(_snapshot_obs(obs)) - return np.zeros((24, 8), dtype=np.float32) - - def reset(self, payload: dict) -> str: - self.reset_payloads.append(dict(payload)) - return "reset successful" - - monkeypatch.setattr(client_mod, "OpenPIWebsocketClientPolicy", FakeClient) - - client_mod.test_ar_droid_policy_server( - host="127.0.0.1", - port=8000, - path="", - num_chunks=2, - prompt="pick up the object", - use_zero_images=True, - ) - client_mod.test_ar_droid_policy_server( - host="127.0.0.1", - port=8000, - path="/v1/realtime/robot/openpi", - num_chunks=2, - prompt="pick up the object", - use_zero_images=True, - ) - - assert len(instances) == 2 - upstream, vllm = instances - - assert upstream.path == "" - assert vllm.path == "/v1/realtime/robot/openpi" - assert upstream.metadata_calls == 1 - assert vllm.metadata_calls == 1 - assert len(upstream.infer_obs) == 2 - assert len(vllm.infer_obs) == 2 - _assert_obs_sequence_equal(upstream.infer_obs, vllm.infer_obs) - assert upstream.reset_payloads == [{}] - assert vllm.reset_payloads == [{}] diff --git a/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py b/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py index 505c30b365d..3b4f2d38ae5 100644 --- a/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py +++ b/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py @@ -4,8 +4,8 @@ """Formal OpenPI end-to-end parity: upstream DreamZero server vs `vllm serve`. This test uses DreamZero's own client-side observation builders from -`~/code/dreamzero/test_client_AR.py`, and client-side websocket protocol from -`~/code/dreamzero/eval_utils/policy_client.py`. +`${DREAMZERO_REPO}/test_client_AR.py`, and client-side websocket protocol from +`${DREAMZERO_REPO}/eval_utils/policy_client.py`. 
The only client-side adaptation for vLLM is the websocket path: DreamZero's upstream server serves at `/`, while vLLM serves OpenPI at @@ -39,8 +39,9 @@ msgpack_numpy = pytest.importorskip("openpi_client.msgpack_numpy") -DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() -if str(DREAMZERO_REPO) not in sys.path: +_DREAMZERO_REPO_ENV = os.environ.get("DREAMZERO_REPO") +DREAMZERO_REPO = Path(_DREAMZERO_REPO_ENV).expanduser() if _DREAMZERO_REPO_ENV else None +if DREAMZERO_REPO is not None and str(DREAMZERO_REPO) not in sys.path: sys.path.insert(0, str(DREAMZERO_REPO)) try: @@ -49,8 +50,9 @@ except Exception: # pragma: no cover - guarded by pytest skip below dreamzero_client = None WebsocketClientPolicy = None +_BaseWebsocketClientPolicy = WebsocketClientPolicy if WebsocketClientPolicy is not None else object -CHECKPOINT_DIR = DREAMZERO_REPO / "checkpoints" / "dreamzero" +CHECKPOINT_DIR = DREAMZERO_REPO / "checkpoints" / "dreamzero" if DREAMZERO_REPO is not None else None VLLM_MODEL = os.environ.get("VLLM_DREAMZERO_MODEL", "GEAR-Dreams/DreamZero-DROID") SERVICE_READY_TIMEOUT_S = int(os.environ.get("OPENPI_SERVICE_READY_TIMEOUT_S", "900")) PROMPT = "Move the pan forward and use the brush in the middle of the plates to brush the inside of the pan" @@ -62,12 +64,17 @@ dreamzero_client is None or WebsocketClientPolicy is None, reason="DreamZero client modules are required on PYTHONPATH", ), - pytest.mark.skipif(not DREAMZERO_REPO.exists(), reason="DreamZero source repo is required at ~/code/dreamzero"), - pytest.mark.skipif(not CHECKPOINT_DIR.exists(), reason="DreamZero local checkpoint is required"), + pytest.mark.skipif( + DREAMZERO_REPO is None or not DREAMZERO_REPO.exists(), + reason="DreamZero source repo is required at DREAMZERO_REPO", + ), + pytest.mark.skipif( + CHECKPOINT_DIR is None or not CHECKPOINT_DIR.exists(), reason="DreamZero local checkpoint is required" + ), ] -class OpenPIWebsocketClientPolicy(WebsocketClientPolicy): 
+class OpenPIWebsocketClientPolicy(_BaseWebsocketClientPolicy): """DreamZero client protocol with an OpenPI websocket path suffix.""" def __init__( diff --git a/tests/dreamzero/upstream/test_roboarena_transform_source_parity.py b/tests/dreamzero/upstream/test_roboarena_transform_source_parity.py deleted file mode 100644 index 6a97a882643..00000000000 --- a/tests/dreamzero/upstream/test_roboarena_transform_source_parity.py +++ /dev/null @@ -1,121 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -"""Source parity checks for the RoboArena → DreamZero input path. - -This test targets the non-model part of the OpenPI DreamZero chain: - -- `socket_test_optimized_AR.py:ARDroidRoboarenaPolicy._convert_observation()` -- upstream `eval_transform.apply()` -- local `RoboArenaTransform.transform_input()` -- local prompt tokenization + state normalization path used by - `DreamZeroPipeline.forward()` - -The goal is to make sure the local serving pre-processing feeds the same -stitched video, prompt tokens, and normalized state into the model as the -upstream DreamZero source server. 
-""" - -from __future__ import annotations - -import json -import os -import sys -from pathlib import Path - -import numpy as np -import pytest -import torch - -from vllm_omni.diffusion.models.dreamzero.pipeline_dreamzero import DreamZeroPipeline -from vllm_omni.diffusion.models.dreamzero.transform.roboarena import ( - RoboArenaTransform, -) - -instantiate = pytest.importorskip("hydra.utils").instantiate -OmegaConf = pytest.importorskip("omegaconf").OmegaConf -AutoTokenizer = pytest.importorskip("transformers").AutoTokenizer - -DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() -CHECKPOINT_DIR = DREAMZERO_REPO / "checkpoints" / "dreamzero" -PROMPT = "Move the pan forward and use the brush in the middle of the plates to brush the inside of the pan" -SESSION_ID = "roboarena-transform-source-parity" - -pytestmark = [ - pytest.mark.skipif(not DREAMZERO_REPO.exists(), reason="DreamZero source repo is required at ~/code/dreamzero"), - pytest.mark.skipif(not CHECKPOINT_DIR.exists(), reason="DreamZero local checkpoint is required"), -] - - -def _load_source_normalized_input(): - if str(DREAMZERO_REPO) not in sys.path: - sys.path.insert(0, str(DREAMZERO_REPO)) - - import test_client_AR as dreamzero_client - from groot.vla.data.schema import DatasetMetadata, EmbodimentTag - from groot.vla.model.n1_5.sim_policy import unsqueeze_dict_values - from socket_test_optimized_AR import ARDroidRoboarenaPolicy - - class DummyPolicy: - pass - - camera_frames = dreamzero_client.load_camera_frames() - obs0 = dreamzero_client._make_obs_from_video(camera_frames, [0], PROMPT, SESSION_ID) - - adapter = ARDroidRoboarenaPolicy(groot_policy=DummyPolicy(), signal_group=None) - converted = unsqueeze_dict_values(adapter._convert_observation(dict(obs0))) - - train_cfg = OmegaConf.load(CHECKPOINT_DIR / "experiment_cfg" / "conf.yaml") - with open(CHECKPOINT_DIR / "experiment_cfg" / "metadata.json") as f: - metadatas = json.load(f) - - metadata = 
DatasetMetadata.model_validate(metadatas[EmbodimentTag.OXE_DROID.value]) - eval_transform = instantiate(train_cfg.transforms[EmbodimentTag.OXE_DROID.value]) - eval_transform.set_metadata(metadata) - eval_transform.eval() - normalized = eval_transform.apply(dict(converted)) - - return obs0, metadatas, normalized - - -def test_roboarena_transform_matches_source_video_prompt_and_state(): - obs0, metadatas, source_normalized = _load_source_normalized_input() - - local_transform = RoboArenaTransform() - local_unified = local_transform.transform_input(dict(obs0)) - - source_images = source_normalized["images"].cpu().numpy() - if source_images.ndim == 5 and source_images.shape[0] == 1: - source_images = source_images[0] - assert np.array_equal(local_unified["images"], source_images) - - tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl") - local_text = tokenizer( - local_unified["prompt"], - max_length=512, - padding="max_length", - truncation=True, - return_tensors="pt", - add_special_tokens=True, - ) - assert torch.equal(local_text["input_ids"], source_normalized["text"].cpu()) - assert torch.equal( - local_text["attention_mask"], - source_normalized["text_attention_mask"].cpu(), - ) - - pipe = DreamZeroPipeline.__new__(DreamZeroPipeline) - pipe.state_norm_stats = DreamZeroPipeline._parse_state_norm_stats(metadatas) - - raw_state = np.asarray(local_unified["state"], dtype=np.float64) - padded = np.zeros(64, dtype=np.float64) - padded[: len(raw_state)] = raw_state - local_state = torch.from_numpy(padded).reshape(1, 1, 64).to(dtype=torch.float32) - local_state = DreamZeroPipeline._normalize_state(pipe, local_state, "oxe_droid") - - torch.testing.assert_close( - local_state, - source_normalized["state"].float(), - atol=1e-7, - rtol=0.0, - ) diff --git a/tests/dreamzero/upstream/test_video_preprocess_source_parity.py b/tests/dreamzero/upstream/test_video_preprocess_source_parity.py deleted file mode 100644 index 69d76d1e240..00000000000 --- 
a/tests/dreamzero/upstream/test_video_preprocess_source_parity.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -"""Parity test for DreamZero video preprocessing order. - -The upstream eager path in -`groot/vla/model/dreamzero/action_head/wan_flow_matching_action_tf.py:952-966` -casts the input video to `bfloat16` *before* applying `normalize_video` -(`x * 2 - 1`). That cast order matters on CUDA and must be preserved for -end-to-end parity. -""" - -from __future__ import annotations - -import os -import sys -from pathlib import Path - -import pytest -import torch - -from vllm_omni.diffusion.models.dreamzero.pipeline_dreamzero import ( - DreamZeroPipeline, -) -from vllm_omni.diffusion.models.dreamzero.transform.roboarena import ( - RoboArenaTransform, -) - -DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() -PROMPT = "Move the pan forward and use the brush in the middle of the plates to brush the inside of the pan" -SESSION_ID = "video-preprocess-source-parity" - -pytestmark = [ - pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU required"), - pytest.mark.skipif(not DREAMZERO_REPO.exists(), reason="DreamZero source repo is required at ~/code/dreamzero"), -] - - -def _load_real_video() -> torch.Tensor: - if str(DREAMZERO_REPO) not in sys.path: - sys.path.insert(0, str(DREAMZERO_REPO)) - - import test_client_AR as dreamzero_client - - camera_frames = dreamzero_client.load_camera_frames() - obs = dreamzero_client._make_obs_from_video(camera_frames, [0], PROMPT, SESSION_ID) - stitched = RoboArenaTransform().transform_input(obs)["images"] - return torch.from_numpy(stitched).unsqueeze(0).to(device="cuda:0") - - -def test_preprocess_video_matches_source_bf16_cast_order() -> None: - videos = _load_real_video() # uint8 [B, T, H, W, C] - - actual = DreamZeroPipeline._preprocess_video(None, videos).float() - - expected = 
videos.permute(0, 4, 1, 2, 3) - expected = expected.float() / 255.0 - expected = expected.to(dtype=torch.bfloat16) - batch_size, channels, num_frames, height, width = expected.shape - expected = expected.permute(0, 2, 1, 3, 4) - expected = expected.reshape(batch_size * num_frames, channels, height, width) - expected = expected * 2.0 - 1.0 - expected = expected.reshape(batch_size, num_frames, channels, height, width) - expected = expected.permute(0, 2, 1, 3, 4).to(dtype=torch.bfloat16).float() - - diff = (actual - expected).abs() - assert diff.max().item() == 0.0 - assert diff.mean().item() == 0.0 diff --git a/tests/dreamzero/upstream/upstream_socket_server_no_compile.py b/tests/dreamzero/upstream/upstream_socket_server_no_compile.py index 82ed6866bba..bfb4d5f14c9 100644 --- a/tests/dreamzero/upstream/upstream_socket_server_no_compile.py +++ b/tests/dreamzero/upstream/upstream_socket_server_no_compile.py @@ -52,7 +52,10 @@ def deco(fn): os.environ.setdefault("NO_ALBUMENTATIONS_UPDATE", "1") -DREAMZERO_REPO = Path(os.environ.get("DREAMZERO_REPO", "~/code/dreamzero")).expanduser() +DREAMZERO_REPO_ENV = os.environ.get("DREAMZERO_REPO") +if not DREAMZERO_REPO_ENV: + raise RuntimeError("Set DREAMZERO_REPO to an upstream DreamZero checkout before launching this helper.") +DREAMZERO_REPO = Path(DREAMZERO_REPO_ENV).expanduser() if str(DREAMZERO_REPO) not in sys.path: sys.path.insert(0, str(DREAMZERO_REPO)) From ba4c7c1e51b804358c70a6065540f7afee60a486 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 17 May 2026 02:17:39 +0000 Subject: [PATCH 22/45] tests: remove duplicate DreamZero example serving test Drop the example-level DreamZero OpenPI serving test because the e2e online serving test already starts the server and validates the same client path with stronger structured checks. 
Signed-off-by: Yangshen Deng Co-authored-by: Meng --- .../examples/online_serving/test_dreamzero.py | 102 ------------------ 1 file changed, 102 deletions(-) delete mode 100644 tests/examples/online_serving/test_dreamzero.py diff --git a/tests/examples/online_serving/test_dreamzero.py b/tests/examples/online_serving/test_dreamzero.py deleted file mode 100644 index 6f305d3eea7..00000000000 --- a/tests/examples/online_serving/test_dreamzero.py +++ /dev/null @@ -1,102 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -""" -Online serving example test: DreamZero. -See examples/online_serving/dreamzero/README.md -""" - -from __future__ import annotations - -import os -import socket -import subprocess -import sys -from pathlib import Path - -import pytest - -from tests.examples.helpers import run_cmd -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniServerParams - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - -pytestmark = [pytest.mark.advanced_model, pytest.mark.example] - -MODEL = "GEAR-Dreams/DreamZero-DROID" -EXAMPLE_DIR = Path(__file__).resolve().parents[3] / "examples" / "online_serving" / "dreamzero" - - -def _find_free_port() -> int: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.bind(("127.0.0.1", 0)) - return int(sock.getsockname()[1]) - - -def _pick_test_gpus() -> str: - override = os.environ.get("DREAMZERO_TEST_GPUS") or os.environ.get("OPENPI_E2E_GPUS") - if override: - return override - - try: - query = subprocess.check_output( - [ - "nvidia-smi", - "--query-gpu=index,memory.used", - "--format=csv,noheader,nounits", - ], - text=True, - ) - except Exception: - return "0,1" - - gpu_rows = [] - for line in query.strip().splitlines(): - gpu_index, used_mb = [part.strip() for part in line.split(",", maxsplit=1)] - gpu_rows.append((int(used_mb), gpu_index)) - gpu_rows.sort() - return ",".join(gpu_index for _, gpu_index in 
gpu_rows[:2]) or "0,1" - - -test_params = [ - OmniServerParams( - model=MODEL, - port=8092, - server_args=[ - "--deploy-config", - "vllm_omni/deploy/dreamzero_tp1_cfg2.yaml", - "--enforce-eager", - "--disable-log-stats", - ], - env_dict={ - "ATTENTION_BACKEND": "torch", - "DIFFUSION_ATTENTION_BACKEND": "TORCH_SDPA", - "VLLM_DISABLE_COMPILE_CACHE": "1", - "CUDA_VISIBLE_DEVICES": _pick_test_gpus(), - "MASTER_PORT": str(_find_free_port()), - }, - ) -] - - -@pytest.mark.advanced_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "H100"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_dreamzero_openpi_client_example(omni_server) -> None: - command = [ - sys.executable, - str(EXAMPLE_DIR / "openpi_client.py"), - "--host", - omni_server.host, - "--port", - str(omni_server.port), - ] - - result = run_cmd(command) - assert "Server metadata:" in result - assert "Action 0:" in result - assert "Action 1:" in result - assert "Action 2:" in result - assert "Reset status: reset successful" in result From fffafe234a3c7348d6bb885fbb7c7eb6f8ff89e5 Mon Sep 17 00:00:00 2001 From: Meng Date: Sun, 17 May 2026 02:30:25 +0000 Subject: [PATCH 23/45] examples: host DreamZero videos outside the repo Remove bundled DreamZero MP4 assets from the repository, point the example client/export helpers at downloaded Hugging Face assets under outputs/dreamzero/assets, and make the e2e online serving test generate synthetic MP4 inputs locally. 
Signed-off-by: Meng Co-authored-by: Yangshen Deng --- examples/online_serving/dreamzero/README.md | 30 +++++++++++++++--- .../assets/exterior_image_1_left.mp4 | Bin 13104 -> 0 bytes .../assets/exterior_image_2_left.mp4 | Bin 11847 -> 0 bytes .../dreamzero/assets/wrist_image_left.mp4 | Bin 27370 -> 0 bytes .../dreamzero/export_prediction_video.py | 15 ++++++--- .../online_serving/dreamzero/openpi_client.py | 18 ++++++++--- tests/e2e/online_serving/test_dreamzero.py | 29 +++++++++++++++-- 7 files changed, 76 insertions(+), 16 deletions(-) delete mode 100644 examples/online_serving/dreamzero/assets/exterior_image_1_left.mp4 delete mode 100644 examples/online_serving/dreamzero/assets/exterior_image_2_left.mp4 delete mode 100644 examples/online_serving/dreamzero/assets/wrist_image_left.mp4 diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md index 2414f0da207..3eec430dabb 100644 --- a/examples/online_serving/dreamzero/README.md +++ b/examples/online_serving/dreamzero/README.md @@ -1,7 +1,8 @@ # DreamZero OpenPI Example This example shows how to serve DreamZero with `vllm serve --omni` and connect a -compatible OpenPI websocket client using bundled real camera videos. +compatible OpenPI websocket client using real camera videos downloaded from +Hugging Face. ## Files @@ -9,7 +10,6 @@ compatible OpenPI websocket client using bundled real camera videos. - `openpi_client.py`: websocket client that sends real observations - `export_prediction_video.py`: offline helper that runs vLLM once and decodes DreamZero `video_pred` latents to MP4 - `droid_sim_eval_client.py`: DROID `sim-evals` rollout client for the vLLM OpenPI server -- `assets/`: minimal real camera videos used by the example ## Environment requirements @@ -19,7 +19,7 @@ compatible OpenPI websocket client using bundled real camera videos. 
- `openpi_client.py` extra deps: ```bash -pip install openpi-client websockets opencv-python +pip install openpi-client websockets opencv-python huggingface-hub ``` - video export helper extra deps: @@ -78,6 +78,26 @@ The websocket endpoint is: - `ws://127.0.0.1:8000/v1/realtime/robot/openpi` +## Download example videos + +The real camera videos are hosted outside this repository: + +- https://huggingface.co/datasets/YangshenDeng/vllm-omni-dreamzero-assets + +Download them into the default example location: + +```bash +hf download YangshenDeng/vllm-omni-dreamzero-assets \ + --repo-type dataset \ + --local-dir outputs/dreamzero/assets +``` + +The expected files are: + +- `outputs/dreamzero/assets/exterior_image_1_left.mp4` +- `outputs/dreamzero/assets/exterior_image_2_left.mp4` +- `outputs/dreamzero/assets/wrist_image_left.mp4` + ## Run the client From the repository root: @@ -93,6 +113,8 @@ python examples/online_serving/dreamzero/openpi_client.py \ --port 8000 ``` +If you keep the videos elsewhere, pass `--video-dir`. + The client sends: - one initial single-frame observation @@ -115,7 +137,7 @@ server path. Use the offline helper below when you want visual debug videos. This script: -1. loads the bundled camera videos from `assets/` +1. loads the downloaded camera videos from `outputs/dreamzero/assets/` 2. builds the same DreamZero/OpenPI observations as the client 3. runs vLLM locally through `Omni` 4.
collects `video_pred` latents from `OmniRequestOutput.images` diff --git a/examples/online_serving/dreamzero/assets/exterior_image_1_left.mp4 b/examples/online_serving/dreamzero/assets/exterior_image_1_left.mp4 deleted file mode 100644 index baecf8fa3760c090e7f918975e3fb6f56c5a5444..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13104 zcmaL81yo$kvM@ThI|K)m?kfo;^Jv5D0AT`OekO!^H^%f(5-`0JtFDmYhzmJe(j92-(^l z3I&1KemPk|JOP~wM1<$((iNHgww*=!##ELC@FMl`i4PYa5118f>EaFra{{D0Hz$Vx zn41&IDQGPS2;|uS1$Gs6897!Su!OD@plJoQ1O!qpuJ0V7)}CN44i0WsE)FhkfNAUL z=_bom zW$6j9EZv-dp%oNj_1?u9aL>uY$qBZGczBw*dN|m*zG(c*ft#zDi?y`})Duwmv~>pz zJVg1xjxH_^5L;ki_8%T6*u&Az5(wo#3=Xif`#)1y+BrcyU&63+_Jq1SLI5RzZ{g_W z4tZy0>Eh%HdGT!ttdOTW#LgLT0VujdUUaP8Ax=;aQEsq>nd>`%wzC2j!omz<1#x{z z!otkL4)T(gm7P2EpS0dW?QCp4EdZH|E7aM{#>Ewo{)_pNlLPb};9Hc7m*d~0nUkHf zC?_x2!xHKYwe<27<>7b<(;f1%qV7-+Tfp4i((M1HE6VvY7PWM@20K{*%leWQFc9Su zVB-M0y(A;b!Nv!Wt}l-N!4Mx&K0$!t;R$sW+h`UHi6b{(TJ`at~+ZeU{wk)f4Hw44k0-# z?+l>ZvG_dMv~0-}q`TxUOOMT^I;7g~asOHYQ}s+@6v+$EbvS*|_`C^pxU1N#R9Xnz z+53@8inAU9%lQswLvQ`Kkb0D^nw92`)l3I`uZfWr!)SK{2Sg zKjKHdapkP)?hDa>z&FfM+J9$@$djpT_l7l8s~I7MEP?5{Ja;2vDB50Q{J8vA4hUi% zq^0gQ#gllR(#p;;nVj>ot3hSJ40 z%dx_t3h0SG+|lfbYjPi?x9DX*2`XyNwk_Z_AuX03PK@s!li(S$y|*|(p4*82(PBju zqlwHe(r49SwaPact_2qcVrS^3`JUZ(^H%jS`&eeHW_o=A&Qzh+3xC-25s!xmLxr;E zo+5U*s;=J`A?5p3;U_t5Yx~|{T;6P-i#9aWDchdvIlO~t{1TZCdf##VV?GC25fG4J@0Vq>!)(P1q^Lchf!oF{Bzt{~%%0B2w@ z33)2vfO*S(Sa*=?f5dWKnn{A5MljsfoYsEBz#7lR7FwCS9T7t)PrI zoI|a1f7Go>S6hXANz@1fEzbi_YhNi61zo}83lQcS{GbIp<^9re*n2;hU%*9l_nJ-A z0$i{D3${X)+3r-Ve#U|8P2NkLw^uVswpZmX!67L%rR^B_3 zFWwuh82R%Zmh@t~`9qJd%H{+9Mn*i5m$t}qfk}bfYG++Umw>~}LiA@!3xmuh)vYYz zA*JC2uJrsFbz*hiMmatX{2AD}=|E+D2#GO`*AGNy?qO5gHHe z!drf4ws8_0AVd4Ha-?z>8ltCgFyJ5JuVFg=%@&l{>~!paokJFcEQl?F|KZA#-Xyhb zji5~?c~j%TEMD-p`fc&m_j$MRhvTmEpbLj-=kr7{$K zrPrptRv=AGqhDm#+o(}4ev4PJY;yp&!~S!))mW|1RPT zy6mkLi7KMR*HBYZfv*N4Z8G5rsNXq9c+;a=Yz4LWRLE6iW(Z4cpFmW<9A`~!{*mVy 
z3dtwQfcczZe`-MeNzbjpnogr|IJ2C8>0E7T)j-UDcF^icp+1|!qyh<*&(HMdekj%c z-KJ1#eT(`o+M=8;yh1I?t$PO+rM)1(#9BjimSNw|AoRuXpip?(i(Q6ZBYEY39*FQp z0;!eb?w$Ud=>1cvnnI&glZeV(2r&}TsM$91k9xtXmAWsB=3fIIC2uK*`RF3`@djZ2 z;vI&xMnAAsptx7tN@>>C`6_nyNXa1nO7cx?WO{H(QHpzaQC>m-d3FxCGa*(Il6lm4 zDrwy=D#geo=zR)WjJ>qY!(3*2h7ZJ0>wxG?-m2RWtpB!{`bZ!u^77mg>*&VS?0>j@)MQ!sbhwp9PIP@wwvnY z+0YVku|!XWtfz?nXf1mY)&G3Gj(;ebb|I>boFlVh$oy>~98r~NfPj(vUCvya> z(Rc`TBOx(|-_(ps=G3dm2U<_EVH?T9(w8M%=!q|E(2K84>&@C-{gFj@{mq|(kmw0@ zzw%tv5APlo)O*BGYHqUTYBvqO#@1hPII+i*ns26X%IOtCE0G=d>hI*Ere$ru_$_mx zVcCubheMi2azWl5F-8`C^6O6@me(h;W7F~F>g`yA$+4rxnn2mCtWIZD4lPs|XVkk4 zmqxk;P7KN>cjd$Z&^9uKqzh2@$DZ}$Bc@x+nU89F=NpLZZ7 zznOWP`X`UrP`^X>7BOq~X6ik#=&K`cs&!3v_Qhd^z{usqHN=z`? zgt{ueC-g48F=QBEmebuvh5zE-vMY!3^HYIQCTY|5@}PZh>jsRGo$X{&^_cZ^_((Ry zAlQCBc|hnWs+sL{F-UkRHg&bO<77a<{K$_w7vggcp3$k!(l>vY^mct+8KaPW@;U$b z#yZFh47IfbB_V-{@@-v5_cP+K`ST3fBDneVH9NF)3_=jvjlQ! z;E8{9T*@1AU6(7y$qgrDkMwqhLzYCsdsVrDMaXE+5>q(m91~62x$V-<{_i!V&o&}i zO>X)3FM}(x>vW$zS>HdneY0fWpLI6lv{O}>4`Z#9EGJC%{G6BVJUZ3cb56~!EqUs2 zJsp}}X6|s1-08_FDNo2im3SJJx5>wZ-xww6;ZiFlRG#*uf^q>Hfvb%tNtZIAxhdg=%=R2rXZv?B*S;$U5MVOvgJ)?S}|KxEo6@3vA zCO;;!k3>e1btBAB$@DQFF9Xrm^gMdNi>Y|L_q{s@Uk|~(5W#dO;r_RN|3jWTJ@Kvd z1oK(@-bMPszH!4_T~N|8V*O8rqt6#S@-o*;N*{P9_uSoRz-KN4uvm!UZ{sN2)cUQt z2R5y$Ptd0ivW|WimmDnW{d`grdvlc#gU?wo)(l_9Z9q7IN!3|A2>K|ONjCh31Fk|c z)K5_ObDSgb3_8oD06Na|3^?qmEWDxWG#^prlzN&2H=v#w+pgir9KAva8fp3NNI>vo z1)N8Tn`Zh%_+ZKVx3A$11e3z}?Hf!l`M5PJw&J@LsnhC{gHP7Ft zJ>cUTnfh{#8Dp~&qUQUnTO&CGod%sSRGDfExT4Kkh)(d)$)(S$pHhiGqZLsiD+7Yx zc?Y^DMq*G5`e~&qf(#u9;Wk$-yCHHpxhu-`3+y7gC04YNIMfuBIVoLks*L@pSCp1Y zoW36Fcsp<7-vr{+#z37LXM|&1EH56#;MB%4kIw}tLeo^oq6tt7JJS2RaqnFnN|PV6 zWc;4r?AA_cc_A+-i>XMZS}Cpxv2`Iuyu~D$+_1=isEn?Cr$8fewQ2e->DoitM6Fgm zCTKV^@+MeBax0}BhUeqEGjcx03}pr7Hnt#DHh7r2_CZRvMcvKR@HO?Cg(Mc#uS4Ck zb3cj|#N;=(vug~Us{T5eJMTa$^ahlsP#(s0`8Hk-L)3W5t+{#M>-=Brg zliF;_+;(PU%j*S+a1K>W&`a67pRuic8KpYI<1zL^%C&y|Kt8#pNK7PVlnruKglBR< zKT6WAFz;~8)D44!{kuFEX^#D;$z-8K53E6Sef_Sv(d<6exv5`JE|F8r?bW5{VTw_z 
zH6u&Z>0hg_)V2z!?HEuOnCKR(ig_$UwH4{Rhm87b7|zyR<>(;O{1Zh4WplYsu*_2T z4t|03oIIr#N7Ni*eA{p0EaCT#PF9)fcfE!-pb~WycV#tv1HGzx(fEO@?@dWQsef#j zGloMMyW}F<1eBG@bzC6k%w_^9%SJjE1awbQ@g3a`<|!iUuY}L`5G^qXWoNw4kMeg? zVQ22Ky-8&&65{P|tBb^oO|Q*qL0&3zFsSZ#qso~H5~qLXx?@-x6b*}oCxoPzh1iY! zNh%PCuugu4OVY>S-uqpls6D%GU49$2Ubp@2@bThHy*oO{hEN|n7@M|W1Mbt z+(os@k53eowO5!R>Hg;^~ohB2RAK2yxqn#U7K9HQIDGp}?!?-qC%Kn5u&fD$u9>Ew&#_^3w z)p zh+AhtT9pZwCg;wF4Q#`w+wrjaov2`5zQCFh#@L&L-E%d;%7H(28tW}vuQ%KrNo2`P zy(4k`SxNR_S1~q5#Knt{8dfip(S-qZzVCds_u&I3k%h5~dh#aWx)X7NJ3jkEvc2%s zI8U976H1{<12m0xw&@DKPMqNCpy9R0?ln#xyi-A~IHtBpg31TeteczF@{*m-LIO$G zTlIcCk)nW!ggGd9X5_g!8zYLdAIaTz%+fKq^G|sqi`0s5R!Iiz2UH4EWBgbbuoqJR zUx%AaJl1RSLJW)pyZPtX_UOmdXfO&AL(R&9hJY_6PEeNPs&I}{M(k@go@3tm%510& z(?UOZO*&MD_$W=9V(!92oFFeBJMO&z-i9gE8dUb>b-23V^j@EG&! zK8s`usQ%zN6bv!8>_*cShEcfG?t(O}n~Gs4+Cur{Zv`oJk+c zJ(3I}wV;uriiDhq^O3sKEcu1=Fd7aViYA@IWR9Mn8D{%ip_8sL$Pd$r!3tOU%}6^A z#O)rSDOS~WEk~qncMzle=`;xr6PNt>`-|I0vv@mB+6`lJv7j^Zu8!HS#$alv*-ePu zXy?Ze0iB&nl)*5n%9J0>wRE>~db|{;J%kwqis#7M1QzHDpQtK&TM4Ga8>kGd^A4;$ zq>w_GFGwl__$jjG3Vxj?MWRSL#c|fjpc_$}`QH=Tf{Z_&zs)4d=;LxUJox)#dG0JR z7(ZaX{NxkOlqEARCWFD~cA}VG=RAtZM(^&Y(&FPR@fM$5&JN$ZJ2l)beb2HpV>nR9 z;)Yz>RjaGCjGRHe8Z`LZ68W3=r_ZH=xTu#M_!}e_pQhXz)2TEPHK~|zYSmg|Do0C2 zQj}#`oCd`abf#tb)>0N&CpH^8e#bj`;H$60Glo#B_cva@m(Y9Q4-oyWqjGc`gQQ2H z{=@dqO~M6=PapUEmp#b$-q^o87Vo^K;F){WW@+o#$V>U9#M)ri7l_1%2ZziC%rTKw z^x;yJj@8rUz*SUpHx$^+awgSSyNoRar4}}7K^(j3nlg8httLX+W7W{JkZMEzIprkKm>-YuPbzSL>ggyi>cIZ zA$niJ@{Gpa#6y&D-vcRTI3_;2iwKEmgO1~Lllm#PR$8ZH%*J4Z#Q%u_F>;Cq@7aK0 z)amatz1c_x@~AjzRt_&X;Y1)|s`(f*JK3!nN&HQvczD>?ir1NuDRT#khUw#3T}#<2 zUyKKIGNUI~L6E=RHr=&2^gH9J*2Lu0K&1FiAJSml8u+*MnxVy{x&fxbgC@@JW3eug zHbz>^PU(3KrVNxSpYyYo@{ZRLI~m zJC|Xb=~rhkoVLpk7e!9swf|*Jk1sO6X2HW}EU%I98X>O`XEqaCt5Vc8-}o>vsT*tANb_0HGA;zHV zp(Xa&*eO=od|O%0HuvT*EcnN|^< zVrrxk>PBIFLJv?_`ipWZR3|}(C=+myH7l$a_4Lzc+!oa!5p%I7xAWpFR)X-b-M!z+ zSeyVoSSU~M?GE#E)4WL+p+&D+y8JYfKMHp8W-vImqGi0MAVl1kO8%8mDU%ALJN{dv 
zJX9s!P_KF2kCxgF6+<8CZUtVw0xwz`HK@M9*955scmCZ^5&WfJwz*0firiPnWx-Ic zODyW{sG|4l%ib-Her9#-laefzi_5~O#~?2=Vi)ksRis1)6;b3fD0+q1yppP1zeJDY zo_2z}pse$L9r9ac+;@T#S98lWFp@Fr^bh_06a0I%_c%Y0a^zf-oK?aIk3?wPGTxAg z*cO%X)$?%rjqO3Vka0wZ@@bF>FE=OCZXEwShhDC${btkCI2WoiY^+W<-Fw}C8;KRzIWoVO>pLY zh_P|nyZw_A-6{+rK=@k=_dqB{C`a z0DAfcCtLQ=SirDNC>xEjsYIQ8HhN1=u0j@{l_7;)0i>a{R}P;D z+zOmvN}0xI?L@cu<-9;3m|}?xD6($SC(H?6C`GD-Q`2Bi?S;-@JT+9UrX zi4xQ!yKS^f^Sn>qoT*j_zEf-BBk)s;e9Y%XA}ALVC6?>IVFiA&b)8JC*?*!#%hi=B;B8SO+tcP~03C9cd2q58--pRc>_Ki1jonYY~4^ z%SYVkyl7J=`XV=T)lf8kLDz6ZqVks~2M@O0-T7GBHsc#!r=jlYWo0xo^G}Q`F1%$PbdIh?~M|y5j zHJ4f?%_Nn!FS+_!U3&W2*#_$8@n4sMfy#uvkjRJtE}f?I%eX4~K?s_o-(vbghnK_+ zhxJ)bH1?s@Kr?@X*u8JrGMr)g+r-0m>1R{yZ0WdY#{?#6vn_x4dzm&3nflX*t0-uL z3bU7@87bAGYxl2(hQU_<@>suM7F1tvpS*x~M6Kq{-X}RikkY|+oz!Dw5^9iz=f(c& zZm(;^S>V;c%xG76;rC_3H5IET#A%26LepWg=Ao=Y#jQWu4lo=^hguF8{!ky&w)X3_ z@XSr{ydZTelR(i4$3gzr^*3oAZ$7V+&sPhI{`rDxBf|M(FSuN-XbST*_!yj_&Vhc7 zo10pH$+EJ4mbs4VjK)POuuUC>M<%%Z_y*~lVRdoD^#YRJh8?~*`SkZ}O#`HMJH6PG zb8%2hXu^1E=2d^tb0?U<O zycP%X(@0sfNEp10;4f;JpT^Z#N7#!c5&a4yTYtY!NE^M@CHx$lLDv)|%dE6BypNmj zB;=DmTKC$GK)_Dt?-`U%aGv$T7={-%XEAdr{;b>7*;l|kK5ZBr;9O?|d%dCkUW62P zGx$dOGq}w{{)PdoN5SvL;Ngzq)Ajb%s!;k5^C$g(jXwTcK8ezet4`BnX zJpl)M1e@51!DXh99I>=%n;D9kBu~CtBAijDlH}y_4`x3DLbLU?SjMZ6WQ~&&#y;a) z%W!O-^zbO z;Rp#~NmcjA!xMbuzO$QMxJO^U$qY`jow!l`gw--%_Zshy)Rd+@Jj?v^Ax3SLAAthF zUZnA`G`yyzL8r`HclH4CLVRI*cFUSmD4k9qCUFjfm?4C2Qwm7vF!(5e~j4;C@%Ib?Hj33Y^=xlF=#-%Fxh&)l0y18+7M+_dr@DI=vpYH z;0cX1{J~@&k@z-2KzWPTji~$*;Q;;|(@yeQY5zC^tLnbax;f9)tcMI|Xwp4XcbdQ{ z+LF9jvfEtp5!cN#;-UveJ5LGM;8Baft?PA%enx9L7Z`%&-H~{ryHU9uAy2!6Ml#z zKMmZm@`siY!tO8yA$4mqhY9FT!;at24aC%4k&}W_Z=}a$UgyfN!69Jr$FmTi3CHkxC<3s*4L!2$tcB(0j)TJ%B?fn@O{#3B$B;3 z$KSZ`2C~&SQzey%($&PCt3wE1Ti6)vJmKsmnW0z<_|YVHl&Ro)HxB(dXOQq%5=tTn z1cxn=G0LMX{fxwk`<6YFtT1ZnVve~C!Y5JDz~JYlL|<4v+XrV-V_7K`CUHIxx{C5LbRs@{TFfCb8uDnG+RM>VRKVp>v>$Ibgw)0&*%m^hhe zUdQiuj$I@hN>BT@fr(;?rG#{rFYBe@S#k3yT|Drt)8HNhN}bpDnsnS%-VF1kfBm+T 
z;gj;%g&8_OJbz~Y{)$tA8n$Usmx!OJX~Z$BiQSc-V@o8~q*!*l@FLxd*4fB&7y17;gi_B!4RQvTtJlOQxYfs6b;Lny)vv-ox?;>lz1h)C- zKBrk!g?@#`BKdhdYlh+mjl$<#F1r~kvaRt2q;fQMi28M2WcAJXLKX&0ZO6Z_OMK>w zkQef0QM2+Y$?m_rnHjfamU#Y*?kKrjvb$H&l$9Y)#{5;r0<=cQL@++pyvuP=F9B=s zw(ndXPKmIax9UxoICW%wgU(4qB4H~s*-zFM!wNO8u;)9RS}8Iza~FQj$zB6v=D0H zaCx{(TFy9dxX%8D5=X{*iyI^1hhc3;Z!EqZ2OV zfPf)pOs7Iq*0XZilV-A@o3-jFNh6JzRGk~K3WVCH!(0qFO>i<;e_%yY<9ngII8_BfYMMs9v-h3<0W4KaplQ4+I@UE-y(<{b*a z(sF1tF2hw%JYn%m=C$gpT=xjL3kz6SGS$`7&r6#=$Mz->4PlzRjU$IAZTWfoUXI~o zBr(h%R9p>q-AU~g8Vu#>S9CLcg9t+-1|~w$gM#6Ea_FjynDCpkm(~kGUNY z-p@jzG@3P4x3l;9!?5{xl)ujCEdIa^3ILJ%!sMXI1eW2r;(p*QOMwx+{~+i)_-O-8 zp1)5D|NG#Nw)EQKsro7dHC4qc&J>KH6`Hhi9ISL_v{m8VMkLW9O1NAc8~U&gZhw*h z@>gxk0r0{LVbGJ0#Z&?|DD-td2M}9E27{BVG?n%*JA?SuG&Iyc8xDr$-Bd{y0w3-lC<2FbBz)%cqc`K$<-C`mtZESA<`&2`e3I*m2F}Q5}SE+D>@l zB)|;Z&pS=4nSMtoI+V)vRH|+YX2^fze8fkwmGab{<9Zj)eRsnU8OqfMZ|Zuck|%w( zwQD9_)$#i!lDj4D!-9xi7lH-7P7+hvY{$P%x8lu^O4eV#Ti)75VV$N26X`UO^4M72 zi>3!D!i9yd4#gcUt*REAf4>imydt;9#MsclN)QmGdY|Jd(he&8_(yvRCI^+dcpsLu z6Zy3cG~rD&?^~?1n3C`xzVli4yfjuW0^HbaT7874${w{v-IzpiGAhgv8Iu(=7>vO2 z5~dM~2%Dd`VKeSJLbE9&BehxfTAesmcW<(fkm5T&5zKW};p30A+AS@$Bt=Tjm-OB8 zITj*V|5;0>LK!Wj3c@=lj;YspMvTQnzn)V@PM5Fv=Kt*tA$DoPWCYKzHu>34BrtK5 zzdpv2YLmI#i0EYLA56ZJx!qZ-l}ma15zMMuPlC=VZdc75zVkJ}Nk(!Srzp&D;MR3( z(V0nDQr}@X)R1pPe|T5*;ZgXs&g9D+=<^n_X^*io#$%@TqJI3!60yJcBTxdi5}9m* zoOsW~1TfO?7e!EbCp7rhd)ZFpbSOy``~%=nqUM~J&A`?$QX-W&^`r5QD#YhElrRMh zzMHMp5gg=q^hEG6VtIA(XKPc|(6k|rnZ$6XJgOfhXRJSDIJ7);puW@Ao$Hivf;$%Jjd~NB z28SFX0dp$r675Mx+ zdcN4)i06zmF!7FcS$y36-9_p~D50E##pghG-}d-z$H*rVDOfS}e5XN?PUGrntMXpp z`Ls7rYz8fTM~*#0h_$in%|uQTL(rFDsaOeyFY-Bj!hdg*a?KbHe(J%%N zEhtp;Dlf6xH(*@A<;P4%aILf}D5p{)v*U3Yk^woRz~s2d1gd>Yfd>YB5}9rw`_HU}Y{bN&rhriOfWz zoM#XnSULnnAC}Pw>t32U&?N0^0sUM_x0~#*CP0Z)BD3~IiFTFlfu52kZ-empiKMh# zLs6VX&lQksN@UIwORELKPWdXvT9fU}ARIxcoJkhR%M&B<$H_wOA_qO-jgTD3sh9Hx zT0%zxWaLpNa?iBd;n3p_eg@2E7;cG2-6iTWs>^?;QsyglUJL@fBE#fdO6P!tfNx8n zB#F!$Vl{f??>$}cs^r8~K_FIwea}2Sv+7ELd(mL?=0i`jC-P!Xj?Aj{Ps`jBiQ3(o 
z%uHG*5F6>=-VKKj9Vz$v5BA18S_vV6*z$`q+Jc@RDbwIMa>PF8uuN{?8MMR7C0exo zaW`_F8@EYkSQ1X%*!;lzwDJfkuxrhgcJ&2g2fl$$llB zqtcOgU|Nt*Jpby84BpggK*-Ja-2)Sh~gj^;0E$v-2XWM1ph+_0*U{(?*E&H1)k`&fFsckfXLX> z;h#C-{!8uO-hg@kcl%%S{C~|0^I}!^k0A}z%F_d&NTH4%FYh5=8sOo@FY>=*h8=OT zvV#D8u#?sQ&Rqmxn}c+E0R#fE+FCif|3d-Y%fbBr&@Wi`<)FSi#M#Ob`l18t4V=}s z1_*p_r+=LOci1#m|B{hf0SCfgjDhwdNB44f1pkYM<@4~ga0KW|4^Iy(0Kc>so$QzY zCj2w{cliIN|3N_Fbs%|we1Vq+L!b=k^Kr0obFy)AaDg4|EMERZ@Gs#%J<;DL!yk{ohM^zzqVu z0Eo!L^PlB@k-tPs1eDebjSpn;vJEfoh5rK8mj90~{=ay@mi!|_|2O`}7ATef@L$wk zZ2vQU8NQ7F(F5@Rul&FK{15*?+5N}g|KJ1u{>PXU$iecT8UGbOAb-j4pIkfv+53OX zUJ$@1`3B$^0A7MR0o2NXs<;90%PWK~0P2Bt1!OS5{Gb=zKtK-AFpdCy2>=EFLIBVL z00ux8fR}*M0N?`vSW`WAb1070D*%>V!Z diff --git a/examples/online_serving/dreamzero/assets/exterior_image_2_left.mp4 b/examples/online_serving/dreamzero/assets/exterior_image_2_left.mp4 deleted file mode 100644 index 10b9cb03cabf4f3a3b0b59128b7829492c8f9795..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11847 zcmaKS1yogCx99<+L%NZcjzc#bNkO^-Oa%Ej(lJc2!v+g z41<9{tOEAtP**^w90mE|p?F^QPwVE4LPHY!G|deC{(&ct03QtpjTzh-Mgs;&XI?P3 z5DhOF1{Ssu1_TP6fC87Ynyfqr9}PrD8qhR{nE?W6xTBXH%)*t1hnt(1gNK`k7hqbs zx;lz*ad~)naJpNY!{ByM2Tr)NCD-FAoK~*(c7P7t(bXF6;37g}1~rA6iGyjJVHUvC z9A;_=7! zZsrQG%$)3jtvL*8{ub^4xCgU?!88_77gu9P7h7w`M~#0RI5`@_Ei7DMu7J9$l`~-A zA}&B<2Z!52t$>a3zj`m{~lpxZ4Y&Q48z*N73ORQ1(X23shyiM z)XUfmZtnymIut*(#q8okii{c4#t*nM?m@y^D!q|m>1w%oQI$LU!<|UwSzdA zpT@-u<^VHua~0>~ehkwY`dCqCn2Qx)?rdiKU+Id2ANS&B&K5NGra)OA(*ic)JVKn@ zG)|Anh;wra0Houi@xIWtVmOT&ozqw3bDr6DkuTZvYXtSM+v>8d~rKdvTY1Haq8FiGnm7#M^qT3 zhC}tv;erG{8$tN z+pLLn-)tnsN=BI4UCLDegdg$e!c3DX%fdS$5Jg^skPXRKbXl{CJLWL3{_hDT{pE%a z4hU()XWpuTd|8Xvs8a_o_Svu*l|Y9SCkUf#8`$n2f9%AncBUpT zaZ=e7jaT{iiHT$EHad)|efQIk~!G6ra94tRKmD<9tI&*aQYkG^h(aEs`(pQLyV# z(0}*W0LO`_K*=A~H&!d>s>bcPOE? 
zoF%UMUdVN3msGBR;j^wF^*pxFvrr|`Ji<@2*X{x-G*0zs-^9yDdwrL>F0IZSlw#i6 zkl3QVs$cD}h#ukX-4GKtwz0jUuByJtrgpPDFMZZ@X2!wMUCfD$k!Oq;?x zM|2<|{T@rm!+|#mx`)JVw3c?p79;t|I-Aj*D@LL(@=?4Z#fxaQN)V|n zeHr0--IXG@psVrv=p6E*eM0oJN>zeqx4g`kdIZHtV?_-$4AR6LR(8DN?Z6@WCSwm8af=wS_LKIZx~xPJaNQ>1*hM?Sc&yMDbG|OkFubni?$TcQj{cCU<}Sp+dHp{<=n@4| zO(3;At)OaF{qb_;j^lwhRL6A+A9Weo8VocWI!XrC#jnTP6jF19m?*pVZ3S4=ie3_E zu6jJ_Zk1Ju$ItcMbIUf|QtJ|2`GR3%01qRK=<*mC5U;vHrDmT|OJSrI$&T%z*^Ver zPsJ0VsGIe>ffci_iUj-3k2G>zAaTh~5AF+g)2&Qe{rSx8E#1o`A z>)$~vGU+R2J`K0xM-sRlXmn#X=It@wlrtxk4vJ>?@PsG$_jo}eO;x2z5UuX z?wFvJHWqZKIT6?X)9B#B2=V zwsQ5=j%K{Ja90$P=(ztZmL$O!CT^8_Fn>Vm z@|skufUj&Lg_2^JZZmiP5BDs;CQZVXZ=-`8M+46Bpq?`&IB#1jJJT?$CC7{Im#jui zmqRAXc-FM_n8DDw#7ns|3@Oy_TY+Nhd7R?Acq-L6Ui};aR6Syu=n9=4UtRDHh`w!l z@2@3H57|f%H*r*jJ$Gmv*V4=Psp_BUpbKSZotl*QUHB-xrAN{mU)>{_M{`qt=p3n| zpp0|gwx5mF>l{%WA0hOjaz8MV`{>HxhWw*X75HQ;IZADu_(vk4J?j>mIu*FaO1D30 z9O}Sgwn*4Nq@y;_`I&mW$*}i5iPPQfEW)d&^|gClzFre;MV%)p-{qRiRCbq#=sMDb zufEgAm&L^Hl#1Y3`XyRWkP>Nb?f9!=gQg5v61voAC@RFE6MD!*l}R_W2#EM(cbF^W z(QgaY!emifhm=|_Vl8xuG|lDwKIC6c~VnUC?(wREz+&#~j}dlMhb9FQ_zQhjHpp zM|>zmV5|y(nI#;md9$*N+Zyn!)R0o24}VuyN+Ul;P^OfywF}jB{VdV$C*CSi;@ce) z8+Xd%p7x$&m~vj}_fQx^L90onFS9!lJ##+H);bjdpF?SS=i6%r6Y+ONAwM>G>89ls94AG8?cCL8r@ zPZ;WDo@lWTyqoRr|E8x$F>5_Bmg3py{*fz&i>1*8hvb)ff9&>#%#*u7bRw1bZ-YrU zPYWb7ieLK>PoiVCy)ujp{dCIm%<~rQ&rFeJV@Orih1a0vLr8e27&&Lu{Q(np)fHnV zPF{v7%E7|3wO=@s9?qUz7d3%}GSM+vMc7tlG3NvB(a*Fs(}yKfGUEL`U-GBq>Hais z%0qxBRCbM=P~VR+s-@$kgySmH3aGVQdDG9zyY%=FLT5tGMb zYWBwZxHfaJut*WP&|LcYQ1lc2LH%?YZ>$ppHPK4Px#*K?fVnAACl+s~#wjZyfie<1 zVN4;pNWrnhp|83d8z_C1=?U{E-(GI_MncTfCF9$Tmv+6!YKNUIqWXS3L&+|gZ%JB^ z_KJ$;+iKJbq1=PGF|*8CDx*}-nRTpWy*wM{}|kf$BfKcRXUIS*$^&5FsGP2Rdx~GAj$WW`s6iY(h}SX`zjx+ zV-MFMXg|;vR&*NuB&xZ^ewf*)iu9W=#!Zg29|0*}+VBKdq~)l+yUR5zi-@y7D6{#c zEgeCC%hJyUKVn$gPqIoc_e|Yo865&zP^=h88!U1bi1M-A$=CPZTlr5xGE06}`)L{0uE_<97~D>T>r)XE(AT-! 
zVZ|h@6aC;6=f#cGTv84WWQLb|=)C(7KS@!nyzEcjo_4Q<&k?0M(uaP*Ahp`nEFUyC ztupzvmtw_6f(&HPM{iFQ*FKvo$VejSDR)r}Rt^+iC$|P6fN0tWd zs`I;^eW9Akp%=>O$LBpE7*Dz$;?ngh_>_UkdM`TMad7Z`*dfiSw^*>XPwfp};;mBU z2w73R`O!&HIM?r**-@C;gr2e#jxW0b&EO842*2y=jYc!_(eoif*S`4cQXhkUr}S_O zk|C;bv7r`awjXQ5sRc>;#*gxMTW^&o)j=C9pbBR8O6hH3qtpPcwNe2FE7w=-Rp?s6ComR zb%gu2@UHL`PeIBTnZci`CJP&P2WaZ9%=lSn=(((guxM?Y2@HV}@Ve5s)C$B%brVmN zy_>{svH{``B-QhbBrCGKS2{mjW(XR3J_e6}KLtsTr)+_BcKj~Kp0P>_Sm6)(%rOjL z*(6pfZ+mxp4wk{#4#nG>YWJ2^^UbrMC2pG3QbyudilQ_!%>+Z$D%#lSFUs zLzyIg*-@LT)kp13oM|dnJ?WAO_dB1oeEHIO;jQFFR&{^9RLGl-e1{qo0d4mpQPs*o zMZe!*OCBDYScO}J+IS-8j~>CHCls$Ht8>+Yxb1WD?oN1`kBw}gmI~q+*12`*f39RV zy0P30>MBoEr;uwXLSFxBXhqqoR|?zwOqEH~8Y^0K zhl$F+6xGwALbR{UkQqpXDZ`k#n}c5y)KO2wN&8;&X34#oI2G4s6hrN<=o6J;D?uhz z@*3yd2YQ#Q^wn(my|nIVh4rt20e6RNIoMY2sU83$ z^^=@oBw<7ywhdn3ODB2;Z(~bRI!44jDXHv-nwZRHa#qSUkDW=oXS1Y~DM6pL>_ZDV z#Xmi?2E6AzYLBHx7SZjGMG2e_AjVj(5~GaM&P(f&uCKPybL#6u9#khjKWY)I9|WCO z2w(6rt4r5Pgn=PWhnkZZOa`nZDjg>k#;VI${-$6jMm#UY{{3 z4&*ojrU}Lp+!w*U*QOGv*NckTNk31+nPQ3%@#p;1j`L4X>MKWr$V7q$GK!eZQF7~> z`B@-g!)*bn*ydzMfopTf5}D|%?6f{I;XFH!riZF+@@?wm2W|SBd$f$% zKGb<9Lq=h*O*Pj8@mllb6*}cnF!EGwxw$4`7zg*)2wGTMZ=ro8FBkzfrhTPp5u_DD7)%e=59~;tZ*X&L8Z@ zy-zbvcJ}*?M5TT(n72F1nb>ew3x^u>`QYp#`zfMv@(=Nfwl`-P`El^SIknlJ?AGj9 zHQ=Duzf<`k_9x-g=L?4R33Ks+a(84Yu9ht_)a$4(!rx5WhTHZLqkLymS_q>Ftd34EQn8-+_SHtCK?D9j34 z#1Mi#Eh^*T1#2eRh5M)dYXpS6H_3^YGivf0eDuh@2s#o|c|tn0`2O=Nj^0+Rv0|~c zf^SA&J6#aVB+_Y~cuQbPI;EBhj&;AUaE@ZG6_AEPGyI$@NRv_MMM{Szn%PKlOzY*s} zX0nM+m3$~0;4>#rK*CkSXcwVeDRP_gY4fCaI(z zH8a!!QMala##!}-UFtKAQ~|akKSEyCk&Xolyvpfmbvq^fp8oemL2062%b6+HF2ZMB z?O-utcG4+=I6JRPb<5|zv3$e<{*(iYwE| zkbM%;Fo(P5<&a+39pz^5;w$g1MS)kn??A+V;^c`{WRr2QJuVtluEcxP=5_K|LJc=* z8Lmg!XxbcgcU}g$MUUw`%#Xg}f%bh%qk}~p`m1UeGx43wHuDyO}3%8Cai5tGQamcS`p8|4qCK- zvPHF?yQt1dyUzwweMGk!&-&fkVvmLFq=#ZK+nr2N)_yEFu^H6<%4hH62k~b0fTqEK zAPDgHzXc?fjx_$EcoHlW#vUZ zm`Eg(BH1sID+fL@JSY2uu$HRF&uaW+YlM3dntAUbw?O+FVQJ*+?qQgSoO~=V$^PQPxw=p zQljyli%B4uNKB$a^ocNAoCb=Lhn9`M=MMErND7=5&tut89Y=P}X$H^DNcH*0CSGPe 
z11m?v9uqI(##=1^ZN&%&X5NgFy^V4jXrx*);keXoDmXj+AeO6=4q-rsapg3~-=KtO z)w_z_dTxy#eG#__Uh${3{0Es3=d#<>DT6#j_t)`RddqR$ceAbU8CUJXd7YH*F`D`v z!~(}vVbv?N4)YH=+!PlnJvH;HX$f?iiXmMu`a&0u*dlxu%!_{GJ&=%`e2Hg?Qyouk z^dHD#oHxED`E{$D-1a=wmobw1fFRZWc1ybAq&9}1n&>ked7DG87nhQ(mqw-?DDn&> z)tt04=h}A7vX}Au1X)Iz`m(#sir0qHiM0J|p~85J<1H0<&w9xil{(ppDje!=7f`me z@`TL=eW%TG>rKHEwu-A+Om(8==l3|jj;1>oLJJrUW1p#Lp_HY$3gjGhOwCk98Fc0o*`N>@G0q0fw_ztma@?`dO0lr zjjN`10I$@VP0NUdL?7K(Ucu0LZugPog&EPcaW+0IVvIz5+p;`NPQQl9V)B|=$M9Pe z8_aS{zPQR{>&wgO-P#eiqv#6Pd6cY=^LBy_+Fp`dy>0N_W|gu6B%y6=SK$JZ3r99| zcVhhFT{0%Q-%M9wJd1zMv7-eN6_9rCU8q}fQP~sbX1fqP7gwnDeQBlKLbVzQoO9@u z3}$JUbpp6aYo>zoo_%&?Lu}2@p=69#qvU1lY(sPn*zEXjFNF?N$P{Vx3 z_O4E0n^M{tvORsK#qTlMeSa!p?G%4+Db(3lDTHBoZpaa$038h`&sX%W%N69zC0ZH9-1{H$Yg{oDQ zwI9hYf1Czaqo^#S-o}Z9r9Tlf-VvFXE~%HCBJ_v(icynY&@sJjK0Aul{c7doU9SKK~|@_U#v%e^73xcO?NYD~y6qp#Q| zZs(8lSbJ(nY9DE3cc24i-ho{RNqXe&%Quaz_21a%Rv`RAj8C*2TpKM?>So8%E!R@7 zk`4>2X?Hf*ZjV8uf5`1P$YtPqoTx%j_T{5w^rs>gvSmHioGCoN!LJwR){oFC`a}9g z?qqHYrL(wDwdF49V~;hd7CwhiWqH}ea&I)JPkiEReNANeNqFXqr0KHjBm*f=&-J)W zT<+9V1wZQNcfAmLluw^t9P+TIDQz|MAo=pZREH7~e&TIkDHjx(-$&ryvQnYb7I*Q@ zO})aKOyfTr_Zh#6y?R&ECB$`pNAbLAxvt>Gbr@`N-B}}riJ;6ZZ*7u0>F9OGS_FPY zoBee)clg?>-SkT~>d`~e&oqa4B7?QIT5ab7`;Dz9sAJ21K#4v;QnyGeb5VwM{j7bC z2F5i?_J(#M;V%r^J}iLc4(KSCUsh=Tx;|H(ezuQHd4NCC7v;a)P@nA>WcEwy8N$a~ zhX>_x)K}v3W^66G^>B=o%#?E3TXux^?xVs#sK`Hn$JDLc#u%CqB0W`z>V&^ln%mZT z2iJV+b!N8--b_30(U>~Lz^Y74U@jW!KbGn6J6$!R$tX0%P4QuzmSr6)&M0mAsbRTv zD-MeC@zXwCoI9B`4cHNVk)^<67829te{ZA4MtHy2PStuA#0if6s+0TG+R?8}Y}YG8 zHqIjAYoHAqqcxrDWdzrZF!IfYCPn=egET~QCf-9`GW^n4z5+SmGzWr;-U(CuhnowvKtGFWF3*^OoZOMlFfe}HuOn$OV`+=A-c4l^smJd`HjK4{JC7agOBFv zF*Q~vo}lY(*}fXm*G*0kdIqOI(2aUX8aioZOcJr7?&Q}IPQ1jfFB7Iul;aYgyq9-2 zLo|>X_}z=!5FtaZEg>a~Zjc=sIbBhO4f&G}YBW_om`A4QQraFk+?1UU#E`y00cF@C zYO8I`nWC1uw1l*4aY!?ds9H*Gr?E*Duj^3ARf@sJaq9hfZ9rpE(dJX>f$ruSZXZI2 zsQ3mJiUidF$5j&TDV^7C|;4PK2jQ21Bk5>gL#Jc<$q!{Z*lkdi7j+!mNH- zLknRxvz`3;$D7`dXBJwCfru`N9yS76zkWuJ8i66-W=dI&suj)+Rp=5Th2BpvGOv*t 
z%Cv?PbpRhtBMy@6rRq{mKiE7KD)x4#$q7y}>mNu+uhIPcO{N$epS^sMivx^k;6c*Z z$YfNF&BTZq^o5jVGRqeqB(M})$xU7`Fbg*NMqvvPLG_>H=&(D=i6pDooUU@BnWW3N z5WRG|cSuS-Zhj%RqH*Ua3_c-Bb-l4b|ETpw>w0}bfV)s3Q++8`XEiqSv^QBgImXkj z=ey2g9`UnftM9i9L$b<)rx}>MsBT{RLu@~-XrmmFkPl?1!ckJbA+wYB?~}-Cl@3i4 zm3Tk>k?c(IISsnazmaBxDJ2zqwnOCZn`WO8_lN#0;P#1}LZ(bRVhyR zoI&58hho*3AAJg&r72u?q)sm0`Oy3~%Y%OfqE=wfQ+qsi2e z-Sa!yCs@fM0baTVH|crMu;_KFl{k@sN!Ga`mp_?gYy z_fyybOy3I?6v*Jx9CR$9RKq0Kgt7ayDP93$YE0bA%V%UE)p6Z)Qpja^J$uC6uhKgqxryzH z_jlejKgY1tQz<;u>du_TJf@SD4iiDINL9W3W$HAseR{WA#?34s5g?f-8eh!WB-W`t>%_@T#1>(>f?OP!bG<7n9(p0!NAN%Mu&;kKSnmuVH zbPk!(R46ZxN|n~@2X%gteS`;lunnE}wpgAYU!fe*5iuhS~NWUse>T_<7qddQV{}_8~{o;P$ zsO?y)I9)!G=cZ|u&Et1;tx)Rt@S$1V7ic3R31Royj=o)U2Wwq5j!o7~%9|_T>i>eI zMFYQKox{=&DP^)Y;>5O+QWiR8kDS!U8CK#ZncQ0)xk!|CufpjzDp;V`ccc@DuZjW_ zw}U2KD9+z6^K=?!Zap7)&JWhL~*fZ~hByt>0?wK1ks2xagpZOShL`E2`<#6CaFUV-JVB11H$o zyIYw9_&*jB&`lI65CZUO`w#Aa-vEODCIo?`{!{mV(}=*C1i+-BEg*X1YWw$0r2kO+ zmp9oaLi2upZ@DZq9Zz|Ii5MF0Q6_0A1nY>S7My$MvX_`8e|XclWv05>Num=ny+Lt|%c`Z$~WkMQr4^x*-(e1MiTf;kB5VFg5H3E;9B z2qY^LxSmL$#|Ogl(Th(e2%slc2jPQvmzR&Ggun>sze+l`2?9L=h}6aPZ@C}kkI|BX zKu;cNLLiIBHaxCJ{v$Y={a;mP=k2-&CQUA8;|H~)*m;S%)AGiNy5BT}Z zrvzd!`z!e?`x^rg-(Tzs$lm@-_J{yJ$pL_E0Kf)RUITamz;pn-1ArcYXaJgkP7?t# z34jg)Km?$t04N3E@g*b;fH(k=08jw{eD(ju`Tz!MVg?8B<9e(IHvpz?=B`ix>)XTr zVt@qE{^zv=uBtQC(ed$S{MX{J{^xEp7Q4b99Z~@U;Ezrooq~Y=TLAqB3jqD+{yz}e B{igr` diff --git a/examples/online_serving/dreamzero/assets/wrist_image_left.mp4 b/examples/online_serving/dreamzero/assets/wrist_image_left.mp4 deleted file mode 100644 index 0d7677816ed635f06a574dc4aa8073a3ee7a8a36..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 27370 zcmX_nb6{pY&~I%UTibT~)VA$DwQYNAZtZT3t!;a2+s4+oeZTMC`^U+dOnx)TBqt}C z$vGe(AjIabUXE5S4t5|QU?Bh9zb{r}cT*NSM|Ktv5D*x1XCM#+#C*)o%-HoCQw;_A z^;Nzhe%f`kD%Fxkze2o9c6IH^%E?a5Ky2#Z3?ydxCY{+>n7N7BSb!`%<~-j8QjFgK zCV3@sNd|UeVGYr5q#4lkyFk>z(aRQS?n=zc%*@8X%FN34&9roNb>wAY^6>Crbhk1C 
zI@lW9Gdeh1F#T5xqou2z?Kj53(bdYq-i4Rg)Y!z>RDgxp8EF2UngLC09ZYQmSa_Lv znTd_AEtek=WW0nWd%EHpscDm2?aOJ8Of8D55B-O#Ztq*H8V=l2%__YH$T7WBt%xI{#55}pV|{w zad(GCdO&zcdrbq|44^V;9=mx?ITs=nR?${d{E*0=McklB2P+E+)?KA;HA~qN9(XEY znpItit4qJqt?a+B$KlEExyi_uMx0mC@kK%EEBxEqW4Wx<9Vd!&CDOkmJN*P>1slb; zj)H?p)I|lmlFYR~eI0>3$MWCloqK<1T1VFLCTW)?@Zdgd8M8`tM1`b8Au>eQOle(< z{GZVO2I^dAGHy@pCf6i;coq99uW zI=U$Fd$d)NAn((q>D-v`zFa|nB{c1&j8wIZAxLPHb41nrQUcpKc4H_67z2z`YA#z0 z?06l<0-O=q^lrx>!`~=|=g_tP zHg-2d*KwRi?9hUe`5KX2l=VHClnUQb);Z+ZIdo^D%feYp>Ce2LHE7j@nao1nz`b_= zMP4Ip*&{0MXv&mO+hwn@toyBEmSd_dae!3C2(>yzlrZ>L%~uS}O!AVmBHH}+sM_{G zdm4d-`p@3ZqK1*1uT$(HS2;NvL~tKJ+)d-NysC7HXXM}hyXa*|zCD)itVF$nKKhs; zJx$?p+TgfR72j2UMkxdmQN`Z)nYu5fsi``g-N)Y$V}}eQ&#h&sjGewuX5tcJXNk*F zZDR3%#Gq^q&9eSc>bD6iT6&A(rAa?^Dx-#YJEMsg8p$_C-dW$1g1e&?Mf^DGSQtS$ z=C17hb*e^-6Tp@!jExl>Dbs4DN1KITbV`c|F7mV^-{feMr{1rmug*R?<&6kwEnB|d zx1H$jiG-t`h+LM(SUyXk7#YegHNL)cE5hAuR;3Y3B&;`~%x&SIO&X@9#caA&e_*7Y zCBR+X&(Bhn$(?mZ!#F7ZqHOh5aL7JEs~ZK|8Mz@H_vf^#m^R&?Vpy6 zi&!3}NMUo1BKV6O*Z#u&ykx~W1!lO(Y>R_UGwW%%R}3PaKc_u%Ftk?1i#0<&jvy2t zFA23yto6Epo8hr5=L*A_3DM3jkLuk`2R4LKbd4U<)lpqSMiO4Pbqs~jF`{I{KoDn) zIaIf!^m5@xa|W@xzw3^INgD?ZnfKM&ObFJKRNk*<7`d#nU$K}sir0)q-5A{^lYhKK< ze~xlX7Q4!ESCb9h4cCL0)~j?6-4@Q&2EiSo^aUN#Y}tXB681k<^5?Crn2Xr zv~;zetC3^Xi$gk z%9kTBJctN8v}8f9U@-m&?F>dmz|gn&3%0DOi#*RcotCR4?e~Qm7lQZaT;Eq;*{|Cz zmX)solIe@2Adoab;i=}-PExS--|HSahXL81J*r5-7%x^54RqNFvCRbb;W2N_UeB%f zeDKuI4Ntg?!L=xc)<}O}K6PE8HGe?b+w(xIaQyL)*d51)Q}$Ki{7U%5=i+@P*^@x= z=8_^xuhkItizk&?&g6gNGj|n*IVoYrQd;n!M)O@1e<#P3(E@skLo1%*Gx!uo*UBwZ zXU|N-wvehm$jkl4&)YbZg@&9oDhU>jkDBZ9-N|}pW43DHrB<7j5G2xJ&Blf01MrT;;-Z~Hd=vL5D( z&`LE$oJ;*gRDM49J|h&dTGd*$gi4$8s8CBOQ{+nY$teVJ$CmVO{H&A`1s6d`^0uMl z_dv(MXtCac)a;aKDS5^innd5dKaqx>4)?JCTU4ngg37_9&Vaw(1@ecQovH>h zDeBdp;KE0n=3nsXYr_&6t=g^MaC_RPbr+Ew=VEU}^W4Ke^p74mOR^lHgydwr3Dkrb zOf4FNquxD6A7ji0C=e#x>9ICt-}y~LFnyN8q0uZBr^jnCthCVEABo;Nc$qP~o9@*U zqwsgFX>)(nw|Ni-*klO(#FUkA49Q7JepHU2mC4R|_>UQ8CHV04o&}L1@%R5=>+B9( 
zb-$xF(2jQ2C7jT$JpGcqO)q1Ct?9ndhukeVu{s+D$5_SCGgQ?0a`(xgS)V6W9hv2^G%1hvpjug0xAEVb3q0DYG~ zA~IrB-E0fiCv3q+0tBzl9;q9gxd|R~I`ioX&)Gd%RH}6;nSVmyh}fzl50kjh%__mT%L+;j^s;bIK!Rk;(y-i31UQ`GLt`~TU<%Y;-#H#1t9!OE$I~X;x%`AlDVtS zio|^s@7Ba{_2p*Z=?;b(F>SBhtg&@?4TRBpPAd{>a`gz6v-T%wJA z^O`hiEnG0b`rDmZf6hQG+Aq3iOm`cTj)%85p2aJ$Bv2cz(wsA5fvcqgD}0`lCE<64 zSyUo5v^n%u(nMm8Lb<%eQE`Z0W9Q4!sDzvYHEq>M9(?ek!=#yE1nwQ}XAm~dQ)(uB2zstI6bzuFpjK24JhSBrel&Vnwj=Wtpl%#8oVz3=&5w+ZxL@)d2}K?=clAZ4MCC?EJb-wM28Lgi!1)1R_t z<-%33CA&Q`(ZuQ~;Bunczk+U|>*uv8uC=?!LWuKl{us|DxU0qgCk)9nV7qsXaRf;> zl!~qSuVs#Dm*G2}QYV~Ah7l6OXdb;0Bk7pXa?qC{3HJiPWadOI@-Ro$I#W`7!dr7x8Zuv{v~% zd(Tz9fr}%iv)2|peHa4^a!B7Y*DJ2Ck%K6WT}!9`Q*F$X9KX0jl>;BXxi`5t1?&UtR!&B!K6U@sm-EpS%iK(t!hgiMDQHNWNa?UgwLpBvGn(^&=J!@ zT8fGrZR4>yTtV37e(}YSwIN?Ji*B94H+rMeGU{ZGEf3VvOY--gZn}lYUu0`4epRR| zSF`bHww?KIDVV?#V3a=ls+bj=klRPz@6tPFY=)xsipIjQnK%}ax5eazdvIW)%G z?F_|LR!+e^Ox^qopb~gDT6__(T7W_%kI5G&xaYA)E5S-OGb(TlH<0~ItjE5qvFcf@ zY3EAbuU?3dy<+byd9p<^;vsi&{9!$&3qgDfPSX z$}0G;(Vq$-!dnY4iz^DxLyH8=sNhZq#t_RQ!3^@VeJ?JP?ZI5?wREyknP1rX!TOqY`_##u`CQ-BPEMw zC{(vCgy{x%h={3Bd`E@A(S$S2*yTjNj-cpS$>ep~4Z~&yb!6t3zP~ zJD+Tx9^*U6Dbkz;@NZ_u&iL<+&!4z`inW`YE`SWm4lUBsWjoO26)VuvgQRwAh%j?) 
zq3Igj2u(tBwFTcA&4&?tnU1cY5Qg5B=}M1x(*py)_&lWq4%a5JnL~Hfn;3g`+4C>|s&5{sMQ!_0h+=}; zoX(9d(B(!v-+`o~%qNl0wn$!07x+XFYechWnLgHzZ{D}I z6Ra-((3~W&reO)ig^E}Rta6_Hdle-LGO-e7$Ut2lCg4ZOqxI)hlrL^UktoGumz|#A zOturErDuNC!=sspcFuvKT#q?LRZ17NzhW<1*4K|P$FJXzpw%-wK6l}^NJ0LpwaN_SAd;e zdu|-K(Dh=Mn9EoI;d^;V$^qSJV1+cKMh?^mtDBaJaIOfk3>Gn+eH8_=)zfT{Gx;ru zpdOlhvrm^Lo%B){rGu5$t&V(Gtg}pfmi-@wdJ5nn*=^7eZs5GUC@`~%%D4Luwd??3 zYrDZ~6)y4#3Nm71+Wc*L>0@Y;7$jC>RhSu;`$omOB~=z_E@hhG+LYTiuYg%$i@CWKDNXSx}yvyni=gThS$Rprg9kCSg z$(nQEK4Iw4r|9JiW$xkJpN-2;=H}IkpBOJs(q9r9+egyOt0q0cfqMo!mmBkN=^rtt3Vx%15 z-m9Q~)5EI*I8AXIELatl-c;~y{*Co_upc~2I5Uig1>wt^(bse?2Ba-;`(KyY3!#4p zDOtFkPvcPe*S=~qt>QyP;VrXTTda1!iI_vu?^2o!tK(}_`uG@@*E7`la;FqqqjwuK zL$t5<*J3f(&(Xo>CZPla>?{U##nlI{+dWp15W_d=>aR;_s9%-vWPF>B)LKMnlgo5z zQkO?<#$3TIFAp_<*BNt>r7$T*9h?aJr|N{|l_^G7fL4{OEf{bYzc zPn2H6@Acvfk02)pUmnzzR*JvJBKwHW;}_~xfX)>`022?BCPYHf1xe`X87}q0^t4bp zE;{PX2I_>cc7trl`r--`;!c8=owQl-5- zDTyE#zQ4FdI6!eMEsXq^Q!Vqw9dBSQffRV3l$N>o0uqXmc8c+y6&9vI2#(Gjx1SR3 zJ&w5A>uOBM&?y?fSZepu)~d9GQ3U>P`yF-{9WWB1ekD8#u*3eexvW9`xy!na&_MD; z^;1WXnc_MCEoMtWv!-*Q9{W8$K18PRhT#vslCeU>u%;mtn=0s>cJ1tQ11el0MORDN z=FfNxj?ACbb_9$H7<#8M%WFsk@VOA73158SjD)J3{vLDhw3-lZ;Fjm!OWrA*v0&zu zZ*G(ZGHlh!$7YJ>;Iy{2#V4;t< zP{tPTQKq5yUy?qDD#POc(fR~+9I;JU=&2v{-&J3g{^|y-bN6Jh-lB2}(RR}SxW(B1 zxzFw0LeX?7_M06~V151Izx55{^eig@Pd-$!uxo5`HtAl)G?`t-eug8l`z$(}*9U_) z0?Wj6yMLQ3pGfg29-Xu;G^0C(3T0ydeQwK80xc@WaEmI8VF7=0f_s0?-zH_iln2h< zs&q(RS%o%)?CmbYua}r4`8_?og)>Y4@AO<97654@H$y}LA>!jY-N1`cLZw zupN7-Xh}*2iea>+5^N&8VjTRIsGK6~jrM_5^I&#`LNZC$>obEC22X{=6 zxX;_WkBNnRi=87wyJ!+E`V-;(MD5Ar{1-qijAWJJ3f?pC>rd}fkW5$od?~2=XZ0!n zaQ(LOvH=ptgVU%tZQuvoY#jP0rm9==oxta#`JT2C1D=k*q1L`6Q%>%~`gvx?`?=!? 
zKJwek-*{PV#@|15`E87d?iuP!Y+0vXZrAI~aF=ddLtRU<0i`0Ip`1AFfQ%dcc80d4 zJXY#r7Jt>~w&mM-MEE|-gB;d9U^6ZOzA({UmCaGi%@X9ggpD?Y9IzN(29TXS(!-~W z(d*LlU`jRY+X15q(&BT+gbH=mW4VOv=nSi%t6pO75F~jtoG?gmRPg@ykEtI4I9S$O}bM=baPTtEobWTi-sQdsIqf%;#O zg@Q+zb?+x0&*nn7w z*ezb3fAphU^3;s<0ADF~5(!=k079Fd9yB=ey62L~eT0v`LrJZlOq=NswHsBOBM&}w zW@~z03`Yj=?XQt*{gA3YFYmH+R5{ZS<{H$wkKx%Sh})`7Ke_G9S2(K1dS5mZF@eM+ z;`VO8q|F3yX~w)6jeT>gWRW)F$KVtiJX|VjklvpIze&%eZQB{E>c2p}gE6rk*EF>f z(GKd?!ZFbv?^gL@&<`~rsz0jpm{khKwEH5?%lEuWFS!oB=XgVJvT=_fRMo0sV4fpU6g#p}mF_T;-5nvF-lW*KCxcRRNs=V(VNW z2zR|;Y2%pf5kU))&oVC3Vj_zVHqNYXf*R9vkT}iMO~Xw4OcYeC?rM&%hWM&;Wzg zU_M%6CVvkJ?#Vm!PJ@ioZ~3pBoX&9YGfamFvW6mDQ;(7_zGJK6Q&O6B^Z0DeWB=~O z#3{Q?yPCFcZ6-&41|#opfyizI7TL=*w1&$c$VpyptYHsJX~@|zXs`{vCYFH^QqqkA zr~*2|@YyL9NXT0F?Jkx*bO=(I3D@8_E0tr$eER5&fJ#t&o)ZlBG!!y-JO5(o^4Vfhplgl})p z+*Eg2jPe}SU&Y^%V_n%~4?1L^Kf)$?XpImWN+11yurS{N#-FQu54TnK*fKo%FMB@L z7fQjsy^*M>l{-zfQ7s6KGttTXoPzc%W&Pl9a5fJQpP`zq$CF!_e=Y+BND@h<^73_0 zl5$3N>Z8LzS^u%~XdEePx$Yn2L3C zpgk{XiHMcr+#vKGH*wWeBu-Q3$|Gk)o9D!J2{Qi@phyiOy(D}{NSF2a`>*Y2)KCmW z@u@~vG5soci|uFaIj3;37h|j?I@u_W`~}#?&=7bei%F6?lKIL*^Eua5dX<$=jF(WI zGuf~CcEK43Y__2our(z|p2$`&+*ARkaFc^g%_KZekx$}@aTKCJzA5Zkjrr7Qt9}hU zUr4ddshss;eX;&TMI1F*cLtbk*}j;(qM67rMh)xIT>DKE-fTtkSx9`+H|3K=Bp;z< z8CI0PFB?^vV)bOPxi|N>dQOwb)sgWmns=7Z?vV8+Jvf5X;N`V{#I^yD>aq((sdpTB zjB(gOyElw=%fy4s|F(&~c>4y5m`l2y$W!IrdGjkTo#u$<-w6F2910kVcI6fjA{7jD zt(TpUuFBPSBBY+hMeVpIB_tRwm7l#50z9tL@=DojF)b~B#UwuS8;92?mqpqWH4e#f8W{_rmiS^J0NRh zTIA&ehEQ4SdOJc^goip4Fuojt@6`+LrK8w;@%MGm09QY$J%ckcHr1EqlZ>fbspd!1o{(zOfgJq1BY@=z{>OL*4!5L5kTBu>87@l^@xHG7q_76Nx1>RK)l*qaY zmavnFxy6>;u~2drE;n|JRK!QJ(Aw7)Q3zc2Wob8NA1m_FqrtlX8j`mUk>1}?BdC^} zg?`(Q1Y4sZNg5DdV?Ed2hEr;E8=i+u7HGOmKWj>55 zp!L0+O6mt*l^$10%on~OLp0T=oQ2t~@R+{icDJ*r^OQ?UTwVN>;N+2w3e1WL#HVsy z;)g}}YfcnM69%)2u%)g=kO12>UG*C$%(i&oE*wecSn_seAJrt1Lap`Smtkz*;$Rn~ zQME*<%;I|f$GojZ>`z73*A?ev(ttI#^ZT-gVk}J+B;Y{qsb>@hua%~VV%7#u7le1N zyZYbOD*735N!t2xNj5v{Pn&2`9=P$U!9VEEQUZ3IXG&shKjqt_U*oeai{No%mzQI7 
zBdRRFlu01+?g`l>7D2_&i=|hM(gY~DXqrny(IM*sd=Mi*Z*m_jK_B~X-kB-dy!Td_ zx@L4vR8zTud+jHi>~^b>;R+2qpAOk)MiKhB7Aac}^sg#10H>rFVFI#F(m%dF+$#Nu zawyvz=l0&dW1yQ3=u)kPP^3FyLp2~o-T_~xVqqAqPkFS1V^fPcWLJj>pHa2;jFz|q zupGtN#Esd{PzA)?2I4qB_7EM3l!V%DPU_VFt)CJ4zfk?;9-7&6CkHmiTdH$9ss8Nt zk0F&;^B`7LE{us)h`2s#^-RY+M6v?nBHO7UOFQKMRH6^x=?3Cz-jb?u&rb^~!n1M@ zwV{76gCxS4C*RB9NlJZYnB%y=3hOKgR2uRo8;1#3Z`2}a$H4r9r*#9Er^gL+AEF1} z7+X&lsBu>mNX-=8y-FNS(&027G%X}}Z15}9UB7ZH3c}_vl#{n`2%Xc<i>*V@j|j~@^LCBt=U*3(o~#j%_{GzWfjitZVo*2yzZ5;T^SenR(k|`Gl7~IN*KxK zYFPx6G@Cz>dLVC!&S+^*e$?i1z{vq8>Rb~yQ|*kRwi1_;ky(oW>9J_G)T*()q zVZBJW-SYyLIKPX@s-*bLt%!hcA6pagk4{!@!zXI7{? z$Z3IE@UK*=K<{sAYobDu;mJVKs}Wf#dVtH*4WJRfTj9Ix+U-k`Po@iz@=}zlYMG>e;9i`&I1=-3$JpJ!JZD;2C275trN%)rk^JZri@j4KT*Tu)rh zfUWSkG|HL|^`o9PAMnKql-=a3XoeVSsm;2%-SoIYYdxQt2{s?jDxgK;CMws#kA5sk z(+{(=3_Mdgl>Sxc4*Pd6sb2e0R;3~a_H_L2V^+eI@AHPyGmtTphi<$6`||}@0jn-7beIsxqBFPkN^znWB8&-Ki9*i*Q7~fOFkP>ZB_V5|}hD^Q}8boaJJe z7U-nRJ}(N>f)_ZQnuTiTkd7>w=UAW>%luJ-crc`fNp%-Xje#y+Wf=`M9BRB$o1Ycf zyG+2o?myERi1UX{bC2530UQJ#KIo#?W4L8=M??mThf>Z=KXa}yxR9fWg&F=ZLEn7F z0rb*L@b21^rO4z!)2)DH5G5i^R|PZ2$0>ru zYNQmwc))y)npg*FQt{UKhc}fvI@6+h5-DNg_J8do)b60ma z_V~7K5l%5c+5MuL?QT6Rgrk`+W|4K?tgor$8XiC%(TebAqg3=z$J(b409}? 
zBFjsSgx+F6g&@Mqs|#=xPd{*329u4_0NUZyy6!7VTZiK7U8Efbs5)5ceO4I+Zn0Yx zv0bHat%%0c{QS9rZ>(T00e<;b1K4W8joaArSLuzhVU_f%GCG1K=$}X?vsLL(!pU5& z4pRfazuTO-g_Bv;qHv85HJ*DMYn3u|KY(W7O{D6^lpn=ENg?@yhkdewg;y6R@gCe@ zMNk#Pjm13-wj@>_N@D{*^~2=z)f5)(2!d<%E2q~j?{W-s{e*8+K9lMj>>zC4i4#5} zw8hs$WhB7ZCE7>NX*_qnMdBd{^Zr4xjbCO|-?z6&N~RXUMYh1iojn-y*8m6Fvqs(a z*RY^iVJ~hOp6Bhc`jmhk(y-<^-1fHE<#Z;S3b{Y(_48?Zt7ffHA!4Mg_FWJ2rtL6<0_PH_uQdT8LZ=ADj7^xpGjC)7p1)RBEH>)W!fy6GFD1?S%74aR)L|qNGECibyypE$0Th1;{>eng zw$T11QrpYcP-V4kd=2E4I1tdZyAH9ZO)SmX!Rm;?W;VNBV_|}w!3JbX+UtxSsOyl_ z*d6oYgZ4AWxl@qb%6EyRFD$xA4=ZQs+{+ly$;MjC_JY zQTxPIuYVO%!Bi{CC^s9;cuwDPvap6EUk5S|j=fcFFcx8Vl=#U+oB);z+#RSYa_0vH z#iRIt#J1p*{FH1T7w`T|lor8t8QtUSHYJE0l?pj|%LmbFAWL(I)K+ZI1Sc&>l*8bpoJ#Gkuj(!sX>ER`C$?j% z#cx5{0)heZU}c*FfMe^c<%}aXmO;u^VIOsh&Y_Rq*h>DY@Tb^YveyMDv&(EiDf2Gy z_H$Z@C4x5x;vsU#EqGyGn`e^@#O~IVnY9STgiI-)f&|;oc>wY1L<^Dzj~RIAITi!D z7rQYuPoHjk8z=trT1|EEPu|)l-xH$tt*S**a^pj$*dfBLi^>sIlP=8SbC1~i!>ed8 zT2voduwWd*K6E5KSbn|zY@4aGO7~%i6V>eF6!?hob9Bri^7E%hQ2p~9@ehLMq{pVh-O7h!nrUus<29S>m!AXkJo;R-y53 zV(kXofe^No>!`kJ%~}KV4431>Y8kjEAMH6|V?%M$q7+iZ2Bu-PEq0@lsxP}I2ZRVD zwtkxkh$t~tX%drzwn#s8$h|F?Y3Q2SUH3Wsmi&}i6dC`F~apuRahv!)|F$`plj2O^y9oNUUp>9M!9^|9K@kMuDN98lug zi0V&7@M!3gY=jDphcnlV4n~%M*5lh7R#3yJ2MyC-7b!yDE}aE-uUbSNDk1u=RA95R&=*ME@quA3X-|U& z6|ZrNY>Sp141!fFy(c6p0{#UwuG_f8d_s92GrYS9YSV_-pH;=kAvZ{j29@ z#>|zaWnt4?ik8KIlg~(O)dUu3_`^PxN;8SMJZ8FM2d4gepbR$56Vs1|oEM^<8Ch{e zbT||c%ly+MF7LgcCWRWO@(SbmM>JXKXr6Y15~o>VnRZ&C;k4xdBbAC)4s!%488Z_> z+?ak3U^v;(jlW}5Y*NiMR>usPpsP0x0EJ{h^7Ehz%cm`4c7g`A%nl^ebaEnKMFH?K zJ;lhYiIx9vp+#2hA*mjvYA1<^l}hjxHw^P%h)LX6OZr@O+IAyL&?TXY# z0_TO z!hb_7h&WlZ1yE`o*tb}3WHWy1IL9JC*Ix(B>?g0#ySrQ-@S~J7>=kV_5sf~k*u-P9 z$bCxwdlIkvLUAzo!|({2(VD`y$-W&o!^EAGtgR77DGwK}niI&>5dwPSWZ0N-S7d{*6UZ^UTwJE=^efrip#= zYv$lYBP7w|Bi!r*HLZXRelkqg2-)U_;sA@W|9<$6c>yyc{({?aazNa3exMHl`TFNmVErj0oS2Q(D0d z{>Hp2Ri~s}XNiJ6d8beRip^UF8t5o1>WaYtn>3^{+w1UOs=bVF6ih;zwCV&ESk9CY 
z-W_TNItk7EfpQ~hHp}}iJ|LXKejwN@f!KiW)PL{3&mquRap2(F`L0|3ox?P3fh;i0|Gt@OR&ptMw?ScNPo4BnoGB+Dbr?ssVSQ2o#Xm_9|#}2aO?6)o(lpqzcz_R9eGsHAI}PZebH*#XVQ7M1H>D%2IwNn z#}W<1Q;dE2@&Eptwz7q~Y?jgB!AKfFAGmn{*;dslq>=*M`cZ)J@=MmFfJDK5hr)Nx z&|6#OX;CYAK(Uk}HXiun!~nBnmEwa5jU3U?FelU1zXnhnJ1K6hf;GZ=?N60w;OlU=Vn9iB2=*N$jk1E9Bt8LW`C}D_gG= zi_;8J;8p=0#(1buHHO@*^$|9epz zYoG3^a)BUrVALq!CssTB_TDn;Kf>Z{uz};?gVe#r>nuQEW^m5u2(2TsNWi@t+_VC% zid3u9SGx6y+g2PDrLQ9tjnrMiE=ZN<5h}`v*s=kZg(6aPO2k1}o*j4DnnTgc0#s-e z-bRoc%v-o=GhVeMzA&qLJuoGc4kBO8x%k%n_UG{?sJ`ch3ci(U5YF zFZjo8KbUs@EDZ&sU6*Fy~2ep^Pis&)Dv(8g!5*JNrsp}ZSlfudKpo|eA%MO~# zhINAcw()}3!eM3_DV^(nAY4g}V44g80!(&Zzlt{+?2fC}&po(B%pzD;&$-fa!J>Ko znFWO_Ax-QQIdyer7rlBpm&8J<)=6H2%7WHg$7J3voMN(p1b%H)RN>btKWhKA*7ImO zlk@c}-Ps@W%#Zra^8JE0^`l(v+wvYa*|_PrkdM~|t*$5cjg4Mf&3fAoftX{gwNy_M z&D?XRI22?djr_&3(45u4u*($iPKw`Y* z5M?hxzVYtuxCo1K^Bji|KU9i;VIojoPLJ;rM>Z0!8676XT&^VQS0GK;f}p%%ztTAE zh>`-G4zcRM2cVwK ztUl~xIM1}xZBji2M;^q4r}wovcAA)@#xvkPbQfnwGS%WBPm*kp0lB^6lLvL9v|yke zlOOLce}boBgNR$YdSKV)rj)zfjS0-yu6@Lijlh4V`RAA!<~CT0f9^(I`H@1u&4h8I z%$JrZ5-rw0{^gV+vb195bbQMuuYJpnv+QbzeN_6S6oy(3yPJM}kD8I_gb%R5<6>+a zj4nWJhxC%cbfm?so;brjVz4+a8L4=iSTY0U=TsS57dk&9vpr??lSD*Gm7Ykx5KWQs z?hEP~r)6n!KaU|ELk_s4tZLQ6iD-iXXm?}vw>d)EsqbNX#sxh@{F5Ti&xE#+S~AuT zgh2g+s{um-dR#0n{=R`~w%I5pC%!kn4LB%* zSok4Lol%2297E@KGHd?Y_DL zt72XlkCd|^v#VKvRU?;@{qySgOSnn37l4VW+rydG1?tfB@-=Kir>rc@GsfH|~I zvn=|w+0exIA0ROAj6#1zrKM$AGEPZDNU{bI(wX7FJ6DQ>Dna{>g8(o&T&X!=$RrMimR`qAcARx$k+pP9=&o$D&&*3z{}K3Avj|8iavhmKiFh^AAD5^C}1nB0LJ zalQ$6Wn+DG6u2vnCdjSGYe;$)h0`Izls8+Zo>61nybQmX;FJ>iq_5okb$?zL?DM2c zucYxHQOXBat-q}681|8lj!2A6nq10|i4>`1N~G`24Pm00$(<9QvOsx&MRvQmlzIL1 zNJwl91NGtv0&!z8p&;D}--$JH0LR2-50HK!0(a?!tDPWOOfM0VfP2r=EvKWWjej<> zzjfAFn>@o6qflePNa|(@7`xcD95%ZB+6~V=iruZ7*`lZ2$|vjL0w~xv1?q{6K2Z>l zu)bNtkh}P-OPB|NEcN`%$oDFXPGgvrgHle+ewGxDsJ&8f2S zwb9Gq1Pg}ZyD9206w5%Jy(@_4c-F22C~^LIiw!0RgUMYFXIs*zhXCWrU%}FMvVWAP)%E**>nx!$ z4U;!0nXl|D&7_is+!ZN=GzHPTTG&NqMnL<3$PIVPQ!vrxmR!q*3(v;?$%v;Hg6xRap)pz|#1q 
z0)5>jGW~~xK5bv{uj|{z2Wzu#mP#kZkc{k`cs}$>($wi(bJ;aIG4HVP2#KAK4RtM7 zAy|zmMc=2;Ld3@HWUI|q`_^vhgsOFcq%P8m>1cK<)+rZOa#lFLrVXfvk z!1EVj^ou>rrKdWEwtnySVu}X(N+W?rJ>~mt7xJ;!vVr073$1lq5erufX}I%KQyMiP zoZxeXYLu-=-g-8S5PJ$pl(zO^zwD-kWM6P*I2WfaA34jeec|J@1Iv|^;At+*K{|F z>0Z00WE=J=&i20=nou_>f8>6yYX&C@vh082aL+VO2o_K?THT=})64jNy%YJ&<|poV z>=sdK$K}{xDKIkWVywVm_J(1wrP#o1SEvwK{4130zz~k2DJ9xuQZ|dx6QY4E-%zbO zo7a&*6XAA1#uT*?@4?_145t>cv|Nis=&8cUW%<+S?;frp;-jBJX8LO!1jYQ;T@#;E zUNB?wN%oY@liQ+nHGREVNdkLDQd)j0SqMQm3Q(G278IBr38C9iXfiM6+c=+Z?b<;U zi_U1uE`{$d87i}K>YTZe-{V$cH$lT~nR`d?Y~Oe#sPd5`nh(lq>mDi0Rc zYC))$vmDinlwpWat;*Ee{AHWD@6yF^#BWka>2Fo0&kX20b-xcq(-sWdRK1>0Ms+WR z_%)c9#O;*ZYR>JUGcI*s+oUXW&0Pz=pA`0N_Zekupd_}kmCrpq)30&PO0#gB6bZM& zdYS7FwZzsp17^Vjre1-lWx|m>eushbe4rc{G$iyjvF`8fzH76YzPAi0hZua$!ohm< zKvp1Mw4&Gd`az*t{?7i#Q4_O!mW`3>M}ac{k}%EB0a#WpO9sE^quN7W_gyA9l~5R| zKmkYSV#7aL(R>X%GGy>SzJ4n?53s8Ul=uapZ7b87<-268j2dx?#q1vX!vu&GBB=H-K_u5_zEI8fCwEGA;GVN`7B z7NY(p*Q;%j&`k7|82N(=a9lBgn>dg~#JrAEVkgKGdn0~8Hr?v?qWp}W!}*{VqfHDX zDmQ`mxR(jiVG^L22Oth3bIXH)fyWDHM~b2IbG-ei_HLbjRc4R+HeYr8?2eHmf$4OE zhq!!rlE4z#;FWVL{pXV!3T72=Mzmt1$o;n)0V^oJK9!|BDWLI*YuIV_$iEgrBp1Te&C7Uqs3qd0!|qrur6pSpUOoYBF^ zj(Tgzt?&4ha?PuJsX7r#PapU51-I)}Az##m-_iS~gRk zz3CL}II=2|+$(0R=3s~QFFouDFMf15odTJ>>7gtQCJ2y@nlfkAwLG(!5*bsl0e7^< zQ!~WwkHfXfHaD_6Q=Wvs*hac?M{BD*Dvxg+jtvud7VDup*BSIJRu&f!QGYB#wN+oO z5uB%gBw;eciFgEU&|qD`v&%sV4x(61$nrBKKZ1W2hC|OC(Ip1}4SD0A?%KDsu#a3k z!I;W&@KrmiFz8r&=_)oaMLIF&5_spVw@}pPuC! zp5~ju?ev9{A!e2N6PdFb26VwdUqbc4du`bOKJv4Zagx>Dv3d(Rq)PJeF9+X@WyP@Q zjHSrn2nX0xi?YD(agi{M+P;TCuA}hI$Kias#tp|q)ru;d-Bpm4u%<74Q`@Vv+5K`B z%vGmel+gwzc5Nouw!1}QPtanz9jwa9TBBbQ*TGCtN+x6-@!+v;AX`upJbYBLU1=C! 
zm)uvjV?qkN7{f-GcP!q^^WRd=6l?MS@tprPtH-V^r^dYfe59oFLWF%TmCk4C$fa6N zoyubr0VX=3Pt^Il7*5NX68)Vov zq}`37iqk89@0(^u$jC?Wo=Tmwlq+;$3PQF)1XR;`Z|d^++NDwA5?P^euRElxU;c@f z`laQVRU*beU-T$>CCfy#KS;x`=+rSN>+0saWLV=p09nUAbG1?Jd$X0$sI01H8T#Ix zv$LUwtC)Y%t+^5h9b>j?5%694n(LNHt@cfQ08goaf*v(jk`0T`K82&AXWrUUEK4W} z`dH1XA}n>{>Roxo$4a?LF|vsBDh}MIwH(iYAUtM`A>sA^XiCqpZ5nPPfHgi-SyzH; zb5x8u?R_*gmarrBDc?rOb;Ic%sW9<)yV~YNq>gwUcoLVHmHNw9CBTfZWy1Vg+#&Sf zBQO0-3AmOnwW9-ab#-W>tj#S9!aOW8L7cqA*wom}tW8^c5_`w}jL2Xg3~YqElQHSY zL!~2^MGTP|HZ+^PFY3iuCJoxnc0f4yPwKHCv2iKkta z#YADQq+m{NQoFh&);Hl)x(i1bv1L<1(%vrx{jpR~_Zr3+10|KuM@AW=OHiaE*wc;! za*zvr-6t_m%t*k^O5!H1uFrb#rvr!PXCA#D!l$2#t>II1Q<#BrmBG~8KwJc zcZ5T@XGxDq3m&-{4olAveWIf`-j$n(+rRU5aOwNJ@^noce^-H755O{4DUIk* z-?lD7S<&iU&naP2cd}m|mx5hQC1q*PsHfehZfQKY+TDk-dmZi>D{W!&)4L*7u3jrH{ z5aFAG9&4vKym*h?+|At{I?zbf>Xu8x&vKg}1IHE_Bl#GsWmZ+U{g%Lms!>qeLzTJM z%6T=upKReZH~2P?EHql5)_dOK%*>uWefm(nJydFqw924ax71JN%1?-(=>X|QRNA)y zc}41ZfZe@!;&(sVGG-KFJUtN}%aQm(Vf)hU#ta})*ycyD)$Z*r+A{**a`n|hF`ha7 zSmI+_H!fo&m$XHXc05LQ9;8es^CMKZth-ha7Yrd`qmCeyWY}J3m`defiYE99&g!&9 z$er|P>t<8-K?{2_RC~!J z^aRV34Fh@4>u3m&hBjW*HXmZ489vIHdhY z(s8rSA<=zs9Po=v$r^}vQmjnDm%M~bxpJ+@t0pIz`q)p=_OT?y?(}hudX|jT!4n5L zBj46YYHm)<_@bH^bZW(DXbgeQ)e$CR_H=P9Qy&(VBcu&8DkZ(uGwMi-P{!xWJJjZ6Ox7f~{=jAWd_kEadRHpc&ZHf@^SYparm z7{69|+!9_7_arE1R#G^<>Z5E?@Y{|`hd5mhKG6WZ5vQ-1S%h}F#;Y56k=7q`-i6Ts z7=WZcX%hn?ED;LW_|eTS6npysFjA3P+CalUUU$kbbAXjBHR(E5UDmBacg>^r$M=>w zdfl540mSB@7|0Y>a-GgBV^*!)4kUwchF7af@2RyH=Un}fzpYZTu zBC+otdB;igE;qIn%Wenevr-N&E0dp0YOOar(oD0qe3!p0#biP_v+`Vr-KFabRLyNS z^X*he>Jvj6>&gGrQ-b$xogCXl`4Me$)3ffo@DO1_)T&Ekf{{Qu|0ofs#qw$lw-JfK zl3k1blr=eK9^U0OyJHFT+FD+6BCxHIA}+@9N7pRC<8DZoZi@cR+$k|u%W;kFvxF%* zZu9L6dH8z%SLmtl)T_KzdwY3Vn?ZYYc!i+~>t}8lZ+#H=4yC>OcOEhrL(c9^e(q^4 z6JsVY(U{FJ5<#wev{f=iN9i8z2n(D(Jml)l2-ZHWs-{+H#6}fUIT5MUffIFJPqN}6 zrO!8XL{eUYfpZi9_O;*JPkI$|Ys4`??FgBw;Ph^BElmW=8{BR9Poi>M7Y#v6X_`%P zPWX5u_5wOHlvEiDL~9#(&hfMbP;Pu>koIrO9i|ckF3)Viv)Fn5>*Mnv&TX2ag)+h+-y9tF&rHs)X|nO z$m^F 
zm6h=4`Lyg!SJwa~;$&s^gmm>0GP5E(ufjLV2AhLe*1gtX?jjK94<#<34Yj$VA*lj* zgHXO%)YO$dRYu_rjR1!G2clWq`hk$Zd%`RyAK>OGieOOZWyLvv z5Cc}f@lTTvJ~71y0kq(jAxd{0yWf$n^S=(^VH;bvYjRseZ3iOA5i_~&LQW{PT$n3o z4!H+zHjQx6Xw#9ms~1jp>cj`^xSAdZ?&eIF(+E^ZVw-y|=5XG0aqU}{Jehjfpnrr= z=SWj>u^Y6;3ZJZhw&ZLLyX3}HtxC@2xj$h`vwy@C5rL89aEXu@S8_2u*RVRu3+j=)~2a!bXO96_tNaj0-B7_^Om@eDxGDWq4PvPt9yRk(!Jpyo;C{W@yY*`H!UHxKvZ#Y4pDwJ@N$ zpshO28dHeVlQbHv>Qy5%63i2J|C@X8&~XtRGvYK)(rwI_bRQ6TftVw#bPD{$pwZqY zWlpk?p&vRJ^pRXlPXe_hbVn&ZRy{@tM%=8@0gIYdta4A;cUggDC%5^CaV?oT<0W-T zO}J&kr~n3{odj~l?W@RgQo+bjRt)5HOqMC7$NIvuO<8kB9O<+OKS5M(z zjPwFr`k1v~k0A!voZiS1u?@&g5>-sl;*-zZ!8(F^}Ed$K#eWqmJMimmpS^yY8~_ zX*C7W(24ijb&3<^f`+w_v%;2fH}sLz|^DO&bAsCD{K zu)EXBlAT{5(EK;ijy!dp`^F8S3#AotlDBGy(J1Y0ws$X-CbVeU;BTP{eLNe5xbHHw z1k6fb@+gWsbLX318Z)xFSxvN;n|tf0@-DwP<`r-_$JF4(?o8Sew4$kBT+Fx7_DlG& z@?=KW`8dUMHN>`ZX35R$v0LYV2ilQ+?u;66&q=O8MbSC;Iurk#Gx%+yZn3rtTH<8z z7)L)g|3XE$PD~l;nmlTYqQPWfIZ(g#-VK4 z)D@W!o}V%9Xi4n4l{?HxSj#Y z&*{t3QnulTv_jsXP=w9e%9JG14}=JRYI_G$3efDpsZg5YGq4-5okKSco>ADVYO?*g zwBr~>z|&oXe%ciqML5u9h0haBH;^$EpF@YB2U@rdTojLDu?x55Ko7=!e9 z6Ht8)ClUijITT^qdVKvo>e=N*W7|Xh#S{Ie+67l*q}*TFds9Nb_nVMeevZs}p76}4 ztbBvhK%#x!K`aHrw)E72Ciiw;#|` zbA(_XvEEcg3lz-SDI!z2i*hyidGy3zL}9jMiUkAnv{@69XMcu%8>A7gld9}ayXoa` zQcwH{nho~i=^1Vuhgl-`0u+;Y+O97dInu3}74mM3yRqy6_>{$kEl2z|ES;G3T*L!W zVeSsxrp;XpY++-4t4AiUrol&9I1Kag2n+Sldmh4u_!I`$y*Nqx+-<7eZ!_a{GKcp2 z-%+NnQTRKlhcr-E7SYZ}zCbn3bcn!Ekd02GX|6d@*)O zAK9LWyWxdGk#375)A`>unJG+%r zz{_u4K9W-#*3fG@9wUf0(Y2ORvj>)=c&}(@{RY!0@F%jgwa`QC#B04uxRHx8Y>qKt zTzE{(tU#z5P#fxv^K+Ml@Is1J-O_*(Z7PyW_rI)I?mgs40t6FpIf)iGd2W(XZN@}?%7e$&x9%*rxn^lJ$-`Ne|6 zweDx;u@e=>i>{2gJVu2s$jQQZINxVg+qIuz*vi`tZL`rsv}Ich3IG~WOO2L2zWIak zE8-kdhI|%V3#`E|C)b_SAD#xmMwx1`F*DU?8cqhR-sCVS_?vA~W><1Flp%g%(=`1* zQk%NG%C6Jm>6n|qjW@)QV7AFR#VcL}BOs0>mR(`Nz%<7Rx>~(X_=c9Q##ueLNrQ`@ zIn~;k@ZXB31i-aXd-oD))NoL{R&b_u_Iyha2U=~Wk(i5L=~mFRfHf0NyI%Z&c9T=(Xe!-bvNIpqyeF@GK6{ z`wy@tAxSf_9{DrOVl+iyBb}B;k6FyxX=p4XYA>f2>?e}n>GlKR93PH&X@x5Iq;wlH 
zhV{#_dN8vIpmNMxhIayHucoX@CX&@jJF!C+pFdr`Uf1y@da+{z7UP4B;9<0LN$0GE zip)Gx*J!e%RqrQz_dikIj0!OtN05ZpkgKEX_N5c&l@Em zCo=#qNNby>lAz+ZD#jgVK$whi>D4CS*I*W_CIslml$V%iH0hWy)C`0AWiuU!*}NWq z-hcn7VXqH8zx|vfF1Sa`zo!i-qNp|=Ht>Thil!2;0rS3Ow6yK6On`@qd~v7nfJVmN z?!2-Kmf-;)AvVq1i7e*8vCId&t`=>?iv~K_AtpM&i?augV5z9F`-kKBaI8n$Pga+v zCP>U(c(QtXom&kw5TmS4+sc>WPoolr7Q>bro1R4BdVJKH9wDFI%HPmMLUFwzUf_`u zU}p1f@8P|Jmq*aZAPYN&+(LmZhSW;&2!JjdD>*!q{&C$b`Orb}TgY+S?hdtzbjjy4=G>7s9*R012txt5%Gs2c)?$iLK zc&O+k>#iLK<>gCU-+4NI&M;>-nUAa^{)Gwx^5$ZGoBsNl$s*3T(1MBs?2Gs^zRs>4 zl^Py}X?5s~y(oe9K9ur1Wfmy5G-#$2(4Wd@;WRGLwlnDM*xGBY30%bo^-IFGlL@AF zr(ym6qz8gU1=cQ+Z&QAK9SkTdofl9lJf+X;U~PU=ZH3)nqk!L@XgMUWP65#aC}q07 z@QRP*tW&$PPxMElT_kN&1LGO7GPyyo)0&r(Hq)upv&KC6U{!BO&bG@c2v+Rl`BQd{ z3@58G;*}KY1Ia=~#{#Z7I;xS0A zwU;)U5X_p!3ZOOh)(gE&BlTI2nNCDj}2J-0lL?%h!Mn6hoZq#zBT} zqgB}uDaL}s2{B}qZai_!4|M%z6)Dn|I=CvKK44gfo$_3Lq};UYlqL1~fMmi*J@YZc zaaNX3JUJVd?QlYR%+FnK`Z&@Wn`Ad4IPQC6WMJ^sctRa#koJ#ATh8w`CJzeUW{n8@ zR%G07pZn3K6+@&N8!0g1E%AQl#79qwIs#ICaN#L_<*11x>EsU{owj(iZLD1&VQ^Ix zT!t(@xG&{~UrgBz)a6x-jBh7P<#WgFx45+S$6M$uE2pfPA&=G;>=lDgh5!ymvQ+%# zbb`JO#j(N|8W^w=nV4aw)3u7SmZ=K~`F^s^rd}N!-lX^ZRY)&}C6ZCY=e+!wb+JPz zN5vX%9^|OtOl!ZgIz}%u2CZqs`#~Lr@9yx!N3||0I3zps-@&>^HdCov;^5rmgv#-k zfK(`gfrz-S7n~^mLg{&}wL^`O*oU~WpK zZRWR$$)(4UazNHH9;w3{*L8iT=a8W>J={wSdo(|Hy30iRL7NlYhI;ZQ{KoR{6bQPe zU6n1roQS`@_3h=*#ZfQ;@k)lDEq|Z2fc}j|R67Q zajMX5*AdRLt1_RdFso8}KIlIRXS+bH$HxFEu2iH^TfP;r0E@(1qsE>W@568ctYHxK z;>I8~6fYSwoNt|OYpm4#o6xJh_EU^LfNhD^=i8B#Sg<`$#CmyUphXBVE(5$H44jh;Wil`KYwyW6H+kdp? z!$}v6rg`q}F9QiMM6lKT^vpVRgXG2sguc zqq{)s7%1xkKW(sMmCizce2BGohd0vYHi2H$w+iE$mj0zW-9{Z(sB{F8mL%aXmf}Q! 
z0ow_8FFJbb=|(!gdQYl7%%!Nmtc+p=qb6`7T`1KaY?+?TiA3X`Ux|l6MDb>QLnyvz z(X7Vq%M7Sp1Nv5{Sm3$|;H%=btT_Q`tfH=nVL$2?nEmIrC{po;B z(^umA5@!SGNGrHQDwsI4lC?=#A)F|K0p`93^h?u6qA^m7oM|hK2uwt((2vUer7=ph z;PWMF?_1?q_5fm--_2n8f-%3(3_?Usm_@ zk95g{>VD=5(X@`qReGXI{6fVvGC{`!VcnRG=3pwRFeWDC6>qml1gi}GJ7ju^ zOEB3<>dTvOcY$Dp&2*i+p99}XYc<;Y-MCWrqN2v@Jha7Dc2Mmk5gGUnh)Y z{*D#LoG{ecCzFcy$ji_=xmuNbF=?b*gPvF5`9<4=mM`vYX%8p(<&ce1XG#pJBzlUS!=2(dt&t6vop;--lkz}EYFH*n zHb8#P9K}!fla0jblkM>B=fdwSa5oYWRvHN5flGh*Lt=s}@&Q{Zx;m~&h_$*g5@D%X zO3@(`T%C0zI6z~pB=T`I(ZrzxccG$IJ--MFJr=ri7={c}KwhUSoQ(dsv(nK+p;=J0 zWY6{bMfFM>i??5i?&s8S=24rDlOJ*?fVbhFsHMr`xO~5sj1wKU zoVnIh0yyL^&IPaxshT7gaLM7-D;Cn&dq2F4ZhXr4RDWbv#WO?0Y@oLM4bFUY1;Y`O zD88$$&#a9fAwfrWeo?->*l%t5Ors!#hus#J&(aa8egtvPge?Z!@3lPiqVx7@2ty zM*P`34Rrxc`J(&r(T=)LlBlRLI%T2ut#pCWT!#n?sM+%bOS_ZBz@|VJpt9#|x8dnm z%rRy_etcgO%pa;Dm;sGA@4SUYax?jmO>H^wqTaO(5oMoV>n3mOY7>#5Dh-)3Q0ON` zxS+3!LWN+R-+#IIY__c;F1XphSRvEntST)xeFU}`db5TQN=s~+{u<}wLp-3Q#y}gs z;>MfK!S9}#6+v?;eXjmd0Cajt$8eo;g5(Q zl99_E$>oIH-YUo{mDsAVik!-hRzOwYsRSAHLMaP%l)Ev1(CpcZBauq!hut@dlFm!;ol z==p?irSUC>8hWAvzT08=6ccBX*YHD|`bK!noPTHSp?vV~5GD2p4MmD&b7o?`c`qqI|t$5)7`B*%R2oNazOG1<&Qi zQL;qJc>_C0Dr~`V$BzOVQQXW+wR1ymZN?7cg_2ybYx+FSFaw~2)$hV-8oX?XZ+q+# zP1>VfQsrO-X8`RE>kID%(M~6WF3qD_S*B)Wv~rl_Zuqd8eG1#?mfjQt7Mmq|&hNdm znk~8_UnBkdywf~W=^*SwSw^ndG(gs9<~7(GSPoB6TaalE?-N{R=vFcoYoL$^-+z^eOOBcI0=`{?hl^MCwy zvvpeBys1W}MRbQr`1ynAF`p{fj8Yq}+=bAHE#J;1sYh`5=0M-rX!g>Z#C#LZ>sl4k zZ4ar>K+mWwAg?6QPmZN&Ktn>R^dAs*)*zJDdFDXn9LY}?T@Bz(dONE4v@W;S-Kw!M=U5xi?M0%clv9_Kz*pCeb+_@ng=()*#FfWymKy12<+d2qL5Ba6~JNNup zhe^(9p_9!kK;Cx3YFN2Y~Ru0pvdD z4dpW!IB4$vjr-qkK!X1!1Oxm0@4Ek!h5+?R0V;m-4J6WW{Pwp`$bVD&mp4$q{|W!m z&;Qmh_`6lzUqeb0fFlq@5t>*5L4|t$0?5O=U$}q843TaPurLHkNUZ_?`Rt(VFfao! 
zwVuCrOy&S9hrbl)@?L`Hzvy?Y{$7$t($EHAW%8~AAr7j_VhSQYI9va9`tM;=0{)Q^ z0zjo--i<-)Jj-`+s@BS=dG8~}#&wgHA`2Et{F!ot~E zXrNgFd%u8#N)DtNfP+DqpM?#-+~2>OegIXa`DaKWzAK==P&dW4{u zjNcuA;J;!5#rv0b1j*e0O8&Wk_~?qDklq&+1*qkK#snMGvO&!o^t&g};T9|vDBcuM zHeoRczkeC)2gp@22z!F;WI;A0ptc3#d4ZOnAIOdY)I>q_F^HE0l7Ys^*cOD};XNK! zP&0A@I2wYmwzbJ$3?jkE{(TvOmac=Lo!$FN`M argparse.Namespace: - parser = argparse.ArgumentParser(description="Export DreamZero prediction video from bundled example inputs.") + parser = argparse.ArgumentParser(description="Export DreamZero prediction video from downloaded example inputs.") parser.add_argument("--model", default=DEFAULT_MODEL) parser.add_argument("--deploy-config", type=Path, required=True) - parser.add_argument("--video-dir", type=Path, default=ASSETS_DIR) + parser.add_argument( + "--video-dir", type=Path, default=DEFAULT_VIDEO_DIR, help="Directory containing the three camera MP4 files." + ) parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR) parser.add_argument("--output-stem", default=DEFAULT_OUTPUT_STEM) parser.add_argument("--prompt", default=DEFAULT_PROMPT) @@ -69,7 +72,11 @@ def _load_camera_frames(video_dir: Path) -> dict[str, np.ndarray]: for camera_key, file_name in CAMERA_FILES.items(): video_path = video_dir / file_name if not video_path.exists(): - raise FileNotFoundError(f"Missing DreamZero example asset: {video_path}") + raise FileNotFoundError( + f"Missing DreamZero example asset: {video_path}. 
" + "Download the example videos with: " + f"`hf download {ASSET_REPO_ID} --repo-type dataset --local-dir {video_dir}`" + ) camera_frames[camera_key] = _load_all_frames(video_path) return camera_frames diff --git a/examples/online_serving/dreamzero/openpi_client.py b/examples/online_serving/dreamzero/openpi_client.py index a5d9f6a1d78..246bfb2777a 100755 --- a/examples/online_serving/dreamzero/openpi_client.py +++ b/examples/online_serving/dreamzero/openpi_client.py @@ -48,7 +48,9 @@ ACTION_HORIZON = 24 DEFAULT_ACTION_DIM = 8 RELATIVE_OFFSETS = [-23, -16, -8, 0] -ASSETS_DIR = Path(__file__).resolve().parent / "assets" +REPO_ROOT = Path(__file__).resolve().parents[3] +ASSET_REPO_ID = "YangshenDeng/vllm-omni-dreamzero-assets" +DEFAULT_VIDEO_DIR = REPO_ROOT / "outputs" / "dreamzero" / "assets" CAMERA_FILES = { "observation/exterior_image_0_left": "exterior_image_1_left.mp4", "observation/exterior_image_1_left": "exterior_image_2_left.mp4", @@ -163,7 +165,11 @@ def load_camera_frames(video_dir: Path) -> dict[str, np.ndarray]: for camera_key, file_name in CAMERA_FILES.items(): video_path = video_dir / file_name if not video_path.exists(): - raise FileNotFoundError(f"Missing DreamZero example asset: {video_path}") + raise FileNotFoundError( + f"Missing DreamZero example asset: {video_path}. 
" + "Download the example videos with: " + f"`hf download {ASSET_REPO_ID} --repo-type dataset --local-dir {video_dir}`" + ) camera_frames[camera_key] = load_all_frames(video_path) return camera_frames @@ -268,7 +274,7 @@ def run_policy_session( host: str = DEFAULT_HOST, port: int = DEFAULT_PORT, path: str = DEFAULT_PATH, - video_dir: Path = ASSETS_DIR, + video_dir: Path = DEFAULT_VIDEO_DIR, prompt: str = DEFAULT_PROMPT, session_id: str | None = None, num_chunks: int = 2, @@ -306,11 +312,13 @@ def format_action_summary(index: int, action: np.ndarray) -> str: def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="DreamZero OpenPI client example with bundled real videos.") + parser = argparse.ArgumentParser(description="DreamZero OpenPI client example with downloaded real videos.") parser.add_argument("--host", default=DEFAULT_HOST) parser.add_argument("--port", type=int, default=DEFAULT_PORT) parser.add_argument("--path", default=DEFAULT_PATH) - parser.add_argument("--video-dir", type=Path, default=ASSETS_DIR) + parser.add_argument( + "--video-dir", type=Path, default=DEFAULT_VIDEO_DIR, help="Directory containing the three camera MP4 files." 
+ ) parser.add_argument("--prompt", default=DEFAULT_PROMPT) parser.add_argument("--session-id", default=None) parser.add_argument("--num-chunks", type=int, default=2) diff --git a/tests/e2e/online_serving/test_dreamzero.py b/tests/e2e/online_serving/test_dreamzero.py index 0084af15216..799fb78773d 100644 --- a/tests/e2e/online_serving/test_dreamzero.py +++ b/tests/e2e/online_serving/test_dreamzero.py @@ -12,6 +12,7 @@ import sys from pathlib import Path +import numpy as np import pytest from tests.helpers.mark import hardware_test @@ -23,7 +24,6 @@ MODEL = "GEAR-Dreams/DreamZero-DROID" EXAMPLE_DIR = Path(__file__).resolve().parents[3] / "examples" / "online_serving" / "dreamzero" CLIENT_SCRIPT = EXAMPLE_DIR / "openpi_client.py" -ASSETS_DIR = EXAMPLE_DIR / "assets" def _find_free_port() -> int: @@ -91,16 +91,39 @@ def _load_client_module(): return module +def _write_synthetic_video(path: Path, cv2_module, *, channel: int) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + height, width, num_frames = 180, 320, 24 + writer = cv2_module.VideoWriter(str(path), cv2_module.VideoWriter_fourcc(*"mp4v"), 15.0, (width, height)) + if not writer.isOpened(): + raise RuntimeError(f"Failed to open video writer for {path}") + try: + for frame_idx in range(num_frames): + frame = np.zeros((height, width, 3), dtype=np.uint8) + frame[..., channel] = (frame_idx * 7) % 255 + frame[..., (channel + 1) % 3] = 64 + writer.write(cv2_module.cvtColor(frame, cv2_module.COLOR_RGB2BGR)) + finally: + writer.release() + + +def _write_synthetic_dreamzero_videos(client_mod, video_dir: Path) -> None: + for channel, file_name in enumerate(client_mod.CAMERA_FILES.values()): + _write_synthetic_video(video_dir / file_name, client_mod.cv2, channel=channel) + + @pytest.mark.advanced_model @pytest.mark.diffusion @hardware_test(res={"cuda": "H100"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_dreamzero_openpi_online(omni_server) -> None: +def 
test_dreamzero_openpi_online(omni_server, tmp_path: Path) -> None: client_mod = _load_client_module() + video_dir = tmp_path / "dreamzero_videos" + _write_synthetic_dreamzero_videos(client_mod, video_dir) result = client_mod.run_policy_session( host=omni_server.host, port=omni_server.port, - video_dir=ASSETS_DIR, + video_dir=video_dir, session_id="dreamzero-online-e2e", ) From 92d3b7b111715eb70e054d1d0cc2ad82c9557dd7 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 17 May 2026 16:02:29 +0000 Subject: [PATCH 24/45] docs: mark DreamZero evaluation demos optional Keep the DROID sim-eval and MolmoSpaces helpers in the DreamZero example directory, but document them as optional external evaluation demos instead of core online serving flow. Signed-off-by: Yangshen Deng Co-authored-by: Meng --- examples/online_serving/dreamzero/README.md | 200 +++----------------- 1 file changed, 23 insertions(+), 177 deletions(-) diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md index 3eec430dabb..495d806eb55 100644 --- a/examples/online_serving/dreamzero/README.md +++ b/examples/online_serving/dreamzero/README.md @@ -160,42 +160,16 @@ Optional flags: - `--save-gif`: also writes GIFs for GitHub comments - `--save-actions`: also writes action chunks as `.npz` -## Run DROID sim-eval against the vLLM server +## Optional Evaluation Demos -This is the closest setup to an end-to-end simulated policy rollout. +The files below are optional external evaluation demos kept with the DreamZero +example for discoverability. They are not required for the basic online serving +flow above, and their simulator dependencies are not vLLM-Omni dependencies. -### 1. 
Start the vLLM DreamZero server +### DROID Sim-Eval -From the repository root: - -Environment: - -- run this in the `vllm-omni` repo environment -- no extra DreamZero-specific client package is needed for the server itself - -```bash -CUDA_VISIBLE_DEVICES=0 \ -ATTENTION_BACKEND=torch \ -DIFFUSION_ATTENTION_BACKEND=TORCH_SDPA \ -vllm serve \ - GEAR-Dreams/DreamZero-DROID \ - --omni \ - --host 127.0.0.1 \ - --port 8000 \ - --served-model-name dreamzero-droid \ - --enforce-eager -``` - -### 2. Start the DROID simulation client - -Environment: - -- do **not** run this from the plain `vllm-omni` env unless it already has Isaac Lab and `sim_evals` -- launch it from the Isaac Lab / `sim-evals` environment -- see the optional DROID sim-eval dependencies above - -From the `vllm-omni` repository root, invoke the client through an external -Isaac Lab launcher, for example: +`droid_sim_eval_client.py` runs a DROID rollout through Isaac Lab / `sim-evals` +against an already running vLLM DreamZero OpenPI server. ```bash CUDA_VISIBLE_DEVICES=1 \ @@ -209,152 +183,15 @@ CUDA_VISIBLE_DEVICES=1 \ --device cuda:0 ``` -Notes: - -- `CUDA_VISIBLE_DEVICES=1` keeps Isaac Sim off the GPU used by the vLLM server. -- `--scene` selects one of the built-in DROID tasks: - - `1`: `put the cube in the bowl` - - `2`: `pick up the can and put it in the mug` - - `3`: `put the banana in the bin` -- The client keeps the upstream DreamZero sim-eval behavior: - - DROID observation extraction from `external_cam`, `external_cam_2`, and `wrist_cam` - - `resize_with_pad(..., 180, 320)` - - `open_loop_horizon=8` - - 24-step action chunks with 8 action dimensions - -### Action chunk vs open-loop horizon - -DreamZero predicts longer action chunks than the number of actions the -sim-eval client executes before replanning: - -- model output action chunk: `(24, 8)` - - `24`: predicted future action horizon - - `8`: action dimension, i.e. 
7 arm joints + 1 gripper -- sim-eval execution horizon: `open_loop_horizon=8` - - after one model call, the client executes only the first `8` actions - - the remaining `16` predicted actions are not consumed - - the client then sends a fresh observation and asks the server for a new - `(24, 8)` chunk - -This follows the upstream DreamZero sim-eval client behavior: - -- the upstream sim-eval default `open_loop_horizon` is `8` -- DreamZero action outputs use `action_horizon=24` - -The split is intentional: `24` lets the model predict a longer future plan, -while `8` keeps execution closed-loop by replanning after roughly half a second -in the DROID simulator. - -## How the sim-eval rollout works - -At a high level, one rollout does the following: - -1. Isaac Lab loads the DROID scene and resets the environment twice. -2. `droid_sim_eval_client.py` reads the current robot observation: - - two external cameras - - one wrist camera - - 7-DoF arm joint positions - - 1-DoF gripper position -3. The client converts the observation into the DreamZero/OpenPI websocket payload: - - `observation/exterior_image_0_left` - - `observation/exterior_image_1_left` - - `observation/wrist_image_left` - - `observation/joint_position` - - `observation/cartesian_position` - - `observation/gripper_position` - - `prompt` - - `session_id` -4. vLLM DreamZero returns one action chunk with shape `(24, 8)`. -5. The sim client consumes that chunk in open loop for `8` control steps. -6. After the local chunk budget is exhausted, the client requests the next action chunk. -7. This repeats until the environment hits its time limit. - -The current DROID sim environment does not expose a built-in task success flag, -so the rollout result should be judged primarily from the video and the final -trajectory JSON. 
- -## How to read the `runs/` outputs - -By default the client writes results under: - -- `runs/dreamzero_sim_eval/_/` - -The key files are: - -- `episode_00.mp4` - - the rollout video - - this is the first file to inspect -- `episode_00.json` - - per-step trace for one episode - - includes: - - `prompt` - - `steps_executed` - - `server_calls` - - `episode_wall_time_s` - - `server_time_s` - - `trajectory` -- `summary.json` - - top-level run summary across episodes - - includes: - - scene id - - prompt - - server metadata - - per-episode summaries - -Inside `episode_00.json`, the `trajectory` list contains one entry per control -step. Each entry records: - -- `step_index`: control step index -- `used_server_call`: whether this step triggered a new model chunk request -- `chunk_latency_s`: model latency for that chunk request -- `action`: the 8-D action applied to the simulator -- `joint_position`: observed robot joints before the next step -- `gripper_position`: observed gripper state -- `reward`, `terminated`, `truncated` - -Practical reading order: - -1. watch `episode_00.mp4` -2. open `summary.json` and check: - - prompt - - total steps - - total wall time - - total model time - - number of server calls -3. if the behavior looks odd, inspect `episode_00.json` - - check whether actions saturate - - check whether the robot stalls - - check how often a new chunk was requested - -For GitHub issues / PR comments, you can also convert `episode_00.mp4` to a GIF -with `ffmpeg` and attach it directly. - -## Optional upstream parity checks - -The upstream DreamZero-dependent parity tests are kept under: - -- `tests/dreamzero/upstream/` - -Those tests require a local upstream DreamZero checkout and are not needed for -the standard vLLM example above. - - -# MolmoSpaces DreamZero Evaluation Demo - -This example shows how to use the vllm-host to evaluate DreamZero on molmospaces benchmarks. 
- -## Files - -- `molmospace_dreamzero_eval_demo.py`: evaluate DreamZero on molmospaces benchmarks - -## Environment requirements - -- Install molmospaces in your python environment by following the instructions in [molmospaces/README.md](https://github.com/allenai/molmospaces/blob/main/README.md) -- Prepare the benchmark/assets directory by following the instructions in molmospaces. +The client keeps the upstream DreamZero sim-eval behavior: 24-step action +chunks, 8 executed open-loop control steps before replanning, and DROID camera +observation extraction from `external_cam`, `external_cam_2`, and `wrist_cam`. -## Run the evaluation +### MolmoSpaces Evaluation -From the repository root: +`molmospace_dreamzero_eval_demo.py` evaluates DreamZero through the same vLLM +OpenPI server on MolmoSpaces benchmarks. Install MolmoSpaces and prepare its +benchmark assets by following the upstream MolmoSpaces documentation. ```bash python examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py \ @@ -364,3 +201,12 @@ python examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py \ --task_horizon_steps 240 \ --episode_idx 1 ``` + +## Optional upstream parity checks + +The upstream DreamZero-dependent parity tests are kept under: + +- `tests/dreamzero/upstream/` + +Those tests require a local upstream DreamZero checkout and are not needed for +the standard vLLM example above. From 94b62b69f92bc65e869fdf91b1667789a97cb3ec Mon Sep 17 00:00:00 2001 From: Meng Date: Sun, 17 May 2026 16:09:59 +0000 Subject: [PATCH 25/45] tests: add DreamZero expansion e2e to nightly Rename the DreamZero OpenPI online serving e2e test to the expansion naming convention, mark it as a full-model distributed diffusion test, and add an isolated two-GPU H100 nightly Buildkite step. 
Signed-off-by: Meng Co-authored-by: Yangshen Deng --- .buildkite/test-nightly.yml | 42 +++++++++++++++++++ ...eamzero.py => test_dreamzero_expansion.py} | 3 +- 2 files changed, 44 insertions(+), 1 deletion(-) rename tests/e2e/online_serving/{test_dreamzero.py => test_dreamzero_expansion.py} (98%) diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 82e872581c7..9fb48dc1f3a 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -603,6 +603,48 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate + - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100 · Multi-GPU DreamZero" + timeout_in_minutes: 120 + commands: + - >- + pytest -sv + tests/e2e/online_serving/test_dreamzero_expansion.py + -m "full_model and diffusion and H100 and distributed_cuda" + --run-level "full_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4" timeout_in_minutes: 60 diff --git a/tests/e2e/online_serving/test_dreamzero.py b/tests/e2e/online_serving/test_dreamzero_expansion.py similarity index 98% rename from tests/e2e/online_serving/test_dreamzero.py rename to tests/e2e/online_serving/test_dreamzero_expansion.py index 799fb78773d..a5df57a38b3 100644 --- a/tests/e2e/online_serving/test_dreamzero.py +++ 
b/tests/e2e/online_serving/test_dreamzero_expansion.py @@ -112,8 +112,9 @@ def _write_synthetic_dreamzero_videos(client_mod, video_dir: Path) -> None: _write_synthetic_video(video_dir / file_name, client_mod.cv2, channel=channel) -@pytest.mark.advanced_model +@pytest.mark.full_model @pytest.mark.diffusion +@pytest.mark.distributed_cuda @hardware_test(res={"cuda": "H100"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_dreamzero_openpi_online(omni_server, tmp_path: Path) -> None: From e55739bf597cb418b7e2f9cb164c6e4d11fe0e47 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 17 May 2026 16:15:20 +0000 Subject: [PATCH 26/45] docs: document DreamZero ActionGen support Add DreamZero-DROID to the supported models table and list its diffusion feature coverage under a dedicated ActionGen category. Signed-off-by: Yangshen Deng Co-authored-by: Meng --- docs/models/supported_models.md | 1 + docs/user_guide/diffusion_features.md | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index b0637a5d798..ae835025106 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -21,6 +21,7 @@ th { | `Qwen2_5OmniForConditionalGeneration` | Qwen2.5-Omni | `Qwen/Qwen2.5-Omni-7B`, `Qwen/Qwen2.5-Omni-3B` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `BagelForConditionalGeneration` | BAGEL (DiT-only) | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ | | ✅︎ | | `InternVLAA1Pipeline` | InternVLA-A1 | `InternRobotics/InternVLA-A1-3B` | ✅︎ | ✅︎ | | | +| `DreamZeroPipeline` | DreamZero-DROID | `GEAR-Dreams/DreamZero-DROID` | ✅︎ | | | | | `HunyuanImage3ForCausalMM` | HunyuanImage3.0 (DiT-only) | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `QwenImagePipeline` | Qwen-Image | `Qwen/Qwen-Image` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `QwenImagePipeline` | Qwen-Image-2512 | `Qwen/Qwen-Image-2512` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | diff --git 
a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 2a296a05970..d4939859f19 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -161,6 +161,14 @@ The following tables show which models support each feature: |-----------------------|:---------:|:----------:|:---------------------:|:--------------:|:-----------------:|:-------------------:|:------:|:-------------------------:|:--------------------:|:--------------:|:----------------:| | **Stable-Audio-Open** | ✅ | ❌ | ❓ | ❓ | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | +### ActionGen + +| Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | +|-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| +| **DreamZero-DROID** | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | + +DreamZero-DROID is a robot policy / action generation pipeline served through the OpenPI-compatible websocket API. See the [DreamZero online serving example](../../examples/online_serving/dreamzero/README.md) for startup commands, client usage, and known precision notes. + ## Feature Compatibility From 660b4d15b6d3af20d20ce85d178c3674658d663e Mon Sep 17 00:00:00 2001 From: Meng Date: Sun, 17 May 2026 16:32:48 +0000 Subject: [PATCH 27/45] tests: use pytest mock in diffusion scheduler test Replace the remaining unittest.mock.patch usage with pytest-mock in the diffusion scheduler test. 
Signed-off-by: Meng Co-authored-by: Yangshen Deng --- tests/diffusion/test_diffusion_scheduler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/diffusion/test_diffusion_scheduler.py b/tests/diffusion/test_diffusion_scheduler.py index d76211e5994..82c6f087075 100644 --- a/tests/diffusion/test_diffusion_scheduler.py +++ b/tests/diffusion/test_diffusion_scheduler.py @@ -4,7 +4,6 @@ import queue import threading from types import SimpleNamespace -from unittest.mock import patch import pytest import torch @@ -588,8 +587,8 @@ async def test_step_multi_request_reuses_multimodal_slice_logic(self, mocker: Mo request_ids=["req-0", "req-1"], ) - with patch("vllm_omni.diffusion.diffusion_engine.supports_audio_output", return_value=False): - outputs = await engine.step(request) + mocker.patch("vllm_omni.diffusion.diffusion_engine.supports_audio_output", return_value=False) + outputs = await engine.step(request) assert len(outputs) == 2 assert outputs[0].images == ["frame-0"] From e906813f1f7cb5845ad9259816b38741fc1bd4ec Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 17 May 2026 16:41:45 +0000 Subject: [PATCH 28/45] tests: reuse common open port helper Replace DreamZero test-local socket helpers with tests.helpers.runtime.get_open_port to match existing test infrastructure. 
Signed-off-by: Yangshen Deng Co-authored-by: Meng --- .../upstream/test_openpi_e2e_source_parity.py | 15 +++++---------- .../online_serving/test_dreamzero_expansion.py | 11 ++--------- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py b/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py index 3b4f2d38ae5..2a041a79cd3 100644 --- a/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py +++ b/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py @@ -27,7 +27,6 @@ import os import shutil -import socket import subprocess import sys import time @@ -37,6 +36,8 @@ import pytest import torch +from tests.helpers.runtime import get_open_port + msgpack_numpy = pytest.importorskip("openpi_client.msgpack_numpy") _DREAMZERO_REPO_ENV = os.environ.get("DREAMZERO_REPO") @@ -88,12 +89,6 @@ def __init__( self._ws, self._server_metadata = self._wait_for_server() -def _find_free_port() -> int: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.bind(("127.0.0.1", 0)) - return int(sock.getsockname()[1]) - - def _vllm_executable() -> str: fallback = Path(sys.executable).with_name("vllm") if fallback.exists(): @@ -193,7 +188,7 @@ def _run_vllm_service(port: int, log_path: Path) -> subprocess.Popen[str]: env["CUDA_VISIBLE_DEVICES"] = ",".join(gpus[:cfg_parallel_size]) env.setdefault("ATTENTION_BACKEND", "torch") env.setdefault("DIFFUSION_ATTENTION_BACKEND", "TORCH_SDPA") - env.setdefault("MASTER_PORT", str(_find_free_port())) + env.setdefault("MASTER_PORT", str(get_open_port())) argv = [ _vllm_executable(), "serve", @@ -307,7 +302,7 @@ def test_openpi_service_matches_upstream_server_noncompile(tmp_path: Path) -> No "action_space": "joint_position", } - upstream_port = _find_free_port() + upstream_port = get_open_port() upstream_log = tmp_path / "dreamzero_upstream.log" upstream_proc = _run_upstream_service(upstream_port, upstream_log) try: @@ -323,7 +318,7 @@ def 
test_openpi_service_matches_upstream_server_noncompile(tmp_path: Path) -> No _assert_logs_clean(upstream_log) _assert_upstream_log_matches_vllm_baseline(upstream_log) - vllm_port = _find_free_port() + vllm_port = get_open_port() vllm_log = tmp_path / "vllm_openpi.log" vllm_proc = _run_vllm_service(vllm_port, vllm_log) try: diff --git a/tests/e2e/online_serving/test_dreamzero_expansion.py b/tests/e2e/online_serving/test_dreamzero_expansion.py index a5df57a38b3..f7b00115012 100644 --- a/tests/e2e/online_serving/test_dreamzero_expansion.py +++ b/tests/e2e/online_serving/test_dreamzero_expansion.py @@ -7,7 +7,6 @@ import importlib.util import os -import socket import subprocess import sys from pathlib import Path @@ -16,7 +15,7 @@ import pytest from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniServerParams +from tests.helpers.runtime import OmniServerParams, get_open_port os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" @@ -26,12 +25,6 @@ CLIENT_SCRIPT = EXAMPLE_DIR / "openpi_client.py" -def _find_free_port() -> int: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.bind(("127.0.0.1", 0)) - return int(sock.getsockname()[1]) - - def _pick_test_gpus() -> str: override = os.environ.get("DREAMZERO_TEST_GPUS") or os.environ.get("OPENPI_E2E_GPUS") if override: @@ -72,7 +65,7 @@ def _pick_test_gpus() -> str: "DIFFUSION_ATTENTION_BACKEND": "TORCH_SDPA", "VLLM_DISABLE_COMPILE_CACHE": "1", "CUDA_VISIBLE_DEVICES": _pick_test_gpus(), - "MASTER_PORT": str(_find_free_port()), + "MASTER_PORT": str(get_open_port()), }, ) ] From c08d1ca4d51ed9ef730f6487ce3d856fc64c3d52 Mon Sep 17 00:00:00 2001 From: Meng Date: Sun, 17 May 2026 17:28:31 +0000 Subject: [PATCH 29/45] tests: decouple DreamZero e2e from example client Move the OpenPI websocket client utilities used by the DreamZero online e2e test into a test helper so CI no longer imports the user-facing example script. 
Signed-off-by: Meng Co-authored-by: Yangshen Deng --- tests/dreamzero/openpi_client_helper.py | 296 ++++++++++++++++++ .../test_dreamzero_expansion.py | 30 +- 2 files changed, 305 insertions(+), 21 deletions(-) create mode 100644 tests/dreamzero/openpi_client_helper.py diff --git a/tests/dreamzero/openpi_client_helper.py b/tests/dreamzero/openpi_client_helper.py new file mode 100644 index 00000000000..e7889c48628 --- /dev/null +++ b/tests/dreamzero/openpi_client_helper.py @@ -0,0 +1,296 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import uuid +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import numpy as np + +try: + import cv2 +except ImportError: # pragma: no cover - optional e2e dependency + cv2 = None + +try: + import websockets.sync.client as websockets_client +except ImportError: # pragma: no cover - optional e2e dependency + websockets_client = None + +try: + from openpi_client import msgpack_numpy +except ImportError: # pragma: no cover - optional e2e dependency + msgpack_numpy = None + +PING_INTERVAL_SECS = 300 +PING_TIMEOUT_SECS = 3600 +DEFAULT_HOST = "127.0.0.1" +DEFAULT_PORT = 8000 +DEFAULT_PATH = "/v1/realtime/robot/openpi" +DEFAULT_PROMPT = "Move the pan forward and use the brush in the middle of the plates to brush the inside of the pan" +ACTION_HORIZON = 24 +DEFAULT_ACTION_DIM = 8 +RELATIVE_OFFSETS = [-23, -16, -8, 0] +CAMERA_FILES = { + "observation/exterior_image_0_left": "exterior_image_1_left.mp4", + "observation/exterior_image_1_left": "exterior_image_2_left.mp4", + "observation/wrist_image_left": "wrist_image_left.mp4", +} + + +def require_dependencies() -> None: + missing = [] + if cv2 is None: + missing.append("opencv-python") + if websockets_client is None: + missing.append("websockets") + if msgpack_numpy is None: + missing.append("openpi-client") + if missing: + raise 
ModuleNotFoundError(f"DreamZero OpenPI test dependencies are missing: {', '.join(missing)}") + + +@dataclass(frozen=True) +class DreamZeroServerMetadata: + image_resolution: tuple[int, int] + n_external_cameras: int + needs_wrist_camera: bool + needs_stereo_camera: bool + needs_session_id: bool + action_space: str + + @classmethod + def from_dict(cls, payload: dict[str, Any]) -> DreamZeroServerMetadata: + required_keys = ( + "image_resolution", + "n_external_cameras", + "needs_wrist_camera", + "needs_stereo_camera", + "needs_session_id", + "action_space", + ) + missing_keys = [key for key in required_keys if key not in payload] + if missing_keys: + raise ValueError(f"Missing DreamZero metadata keys: {missing_keys}") + + image_resolution = payload["image_resolution"] + if not isinstance(image_resolution, (list, tuple)) or len(image_resolution) != 2: + raise ValueError(f"Invalid image_resolution: {image_resolution!r}") + + return cls( + image_resolution=(int(image_resolution[0]), int(image_resolution[1])), + n_external_cameras=int(payload["n_external_cameras"]), + needs_wrist_camera=bool(payload["needs_wrist_camera"]), + needs_stereo_camera=bool(payload["needs_stereo_camera"]), + needs_session_id=bool(payload["needs_session_id"]), + action_space=str(payload["action_space"]), + ) + + +class OpenPIWebsocketClient: + def __init__( + self, + *, + host: str = DEFAULT_HOST, + port: int = DEFAULT_PORT, + path: str = DEFAULT_PATH, + ) -> None: + require_dependencies() + self._uri = f"ws://{host}:{port}{path}" + self._packer = msgpack_numpy.Packer() + self._ws, self._server_metadata = self._connect() + + def _connect(self): + conn = websockets_client.connect( + self._uri, + compression=None, + max_size=None, + ping_interval=PING_INTERVAL_SECS, + ping_timeout=PING_TIMEOUT_SECS, + ) + metadata = msgpack_numpy.unpackb(conn.recv()) + if not isinstance(metadata, dict): + raise TypeError(f"Expected dict metadata from server, got {type(metadata)!r}") + return conn, metadata + + def 
get_server_metadata(self) -> dict[str, Any]: + return dict(self._server_metadata) + + def infer(self, obs: dict[str, Any]) -> np.ndarray: + payload = dict(obs) + payload["endpoint"] = "infer" + self._ws.send(self._packer.pack(payload)) + response = self._ws.recv() + if isinstance(response, str): + raise RuntimeError(f"Inference failed: {response}") + return np.asarray(msgpack_numpy.unpackb(response), dtype=np.float32) + + def reset(self, reset_info: dict[str, Any] | None = None) -> str: + payload = dict(reset_info or {}) + payload["endpoint"] = "reset" + self._ws.send(self._packer.pack(payload)) + response = self._ws.recv() + if not isinstance(response, str): + raise RuntimeError(f"Unexpected reset response: {type(response)!r}") + return response + + def close(self) -> None: + self._ws.close() + + +def load_all_frames(video_path: Path) -> np.ndarray: + require_dependencies() + cap = cv2.VideoCapture(str(video_path)) + frames = [] + while True: + ok, frame = cap.read() + if not ok: + break + frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + cap.release() + if not frames: + raise RuntimeError(f"No frames loaded from {video_path}") + return np.stack(frames, axis=0) + + +def load_camera_frames(video_dir: Path) -> dict[str, np.ndarray]: + camera_frames: dict[str, np.ndarray] = {} + for camera_key, file_name in CAMERA_FILES.items(): + video_path = video_dir / file_name + if not video_path.exists(): + raise FileNotFoundError(f"Missing DreamZero test asset: {video_path}") + camera_frames[camera_key] = load_all_frames(video_path) + return camera_frames + + +def build_frame_schedule(total_frames: int, num_chunks: int) -> list[list[int]]: + chunks: list[list[int]] = [] + current_frame = 23 + for _ in range(num_chunks): + indices = [max(current_frame + offset, 0) for offset in RELATIVE_OFFSETS] + if indices[-1] >= total_frames: + break + chunks.append(indices) + current_frame += ACTION_HORIZON + return chunks + + +def make_obs_from_video( + camera_frames: dict[str, 
np.ndarray], + frame_indices: list[int], + *, + prompt: str, + session_id: str, +) -> dict[str, Any]: + obs: dict[str, Any] = {} + for camera_key, all_frames in camera_frames.items(): + selected = all_frames[frame_indices] + obs[camera_key] = selected[0] if len(frame_indices) == 1 else selected + + obs["observation/joint_position"] = np.zeros(7, dtype=np.float32) + obs["observation/cartesian_position"] = np.zeros(6, dtype=np.float32) + obs["observation/gripper_position"] = np.zeros(1, dtype=np.float32) + obs["prompt"] = prompt + obs["session_id"] = session_id + return obs + + +def build_demo_observations( + camera_frames: dict[str, np.ndarray], + *, + prompt: str, + session_id: str, + num_chunks: int = 2, +) -> list[dict[str, Any]]: + if num_chunks < 1: + raise ValueError("num_chunks must be at least 1") + + total_frames = min(frames.shape[0] for frames in camera_frames.values()) + observations = [ + make_obs_from_video( + camera_frames, + [0], + prompt=prompt, + session_id=session_id, + ) + ] + for indices in build_frame_schedule(total_frames, num_chunks - 1): + observations.append( + make_obs_from_video( + camera_frames, + indices, + prompt=prompt, + session_id=session_id, + ) + ) + return observations + + +def validate_session_result( + result: dict[str, Any], + *, + expected_action_horizon: int = ACTION_HORIZON, + expected_action_dim: int = DEFAULT_ACTION_DIM, +) -> None: + metadata = DreamZeroServerMetadata.from_dict(result["metadata"]) + if metadata.image_resolution != (180, 320): + raise AssertionError(f"Unexpected image_resolution: {metadata.image_resolution}") + if metadata.n_external_cameras != 2: + raise AssertionError(f"Unexpected n_external_cameras: {metadata.n_external_cameras}") + if not metadata.needs_wrist_camera: + raise AssertionError("DreamZero test expects wrist camera metadata") + if metadata.action_space != "joint_position": + raise AssertionError(f"Unexpected action_space: {metadata.action_space}") + + actions = result["actions"] + if 
len(actions) != 3: + raise AssertionError(f"Expected 3 action tensors, got {len(actions)}") + for index, action in enumerate(actions): + if action.shape != (expected_action_horizon, expected_action_dim): + raise AssertionError( + f"Action {index} shape mismatch: expected " + f"{(expected_action_horizon, expected_action_dim)}, got {action.shape}" + ) + if not np.isfinite(action).all(): + raise AssertionError(f"Action {index} contains non-finite values") + + if result["reset_status"] != "reset successful": + raise AssertionError(f"Unexpected reset status: {result['reset_status']!r}") + + +def run_policy_session( + *, + host: str = DEFAULT_HOST, + port: int = DEFAULT_PORT, + path: str = DEFAULT_PATH, + video_dir: Path, + prompt: str = DEFAULT_PROMPT, + session_id: str | None = None, + num_chunks: int = 2, +) -> dict[str, Any]: + session_id = session_id or str(uuid.uuid4()) + camera_frames = load_camera_frames(video_dir) + observations = build_demo_observations( + camera_frames, + prompt=prompt, + session_id=session_id, + num_chunks=num_chunks, + ) + + client = OpenPIWebsocketClient(host=host, port=port, path=path) + try: + metadata = client.get_server_metadata() + actions = [client.infer(obs) for obs in observations] + reset_status = client.reset({}) + actions.append(client.infer(observations[0])) + return { + "metadata": metadata, + "actions": actions, + "reset_status": reset_status, + "session_id": session_id, + } + finally: + client.close() diff --git a/tests/e2e/online_serving/test_dreamzero_expansion.py b/tests/e2e/online_serving/test_dreamzero_expansion.py index f7b00115012..7a2fd3ac98f 100644 --- a/tests/e2e/online_serving/test_dreamzero_expansion.py +++ b/tests/e2e/online_serving/test_dreamzero_expansion.py @@ -5,15 +5,14 @@ from __future__ import annotations -import importlib.util import os import subprocess -import sys from pathlib import Path import numpy as np import pytest +from tests.dreamzero import openpi_client_helper as openpi_client from 
tests.helpers.mark import hardware_test from tests.helpers.runtime import OmniServerParams, get_open_port @@ -21,8 +20,6 @@ os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" MODEL = "GEAR-Dreams/DreamZero-DROID" -EXAMPLE_DIR = Path(__file__).resolve().parents[3] / "examples" / "online_serving" / "dreamzero" -CLIENT_SCRIPT = EXAMPLE_DIR / "openpi_client.py" def _pick_test_gpus() -> str: @@ -71,19 +68,6 @@ def _pick_test_gpus() -> str: ] -def _load_client_module(): - spec = importlib.util.spec_from_file_location("dreamzero_openpi_example_client", CLIENT_SCRIPT) - assert spec is not None - assert spec.loader is not None - module = importlib.util.module_from_spec(spec) - sys.modules[spec.name] = module - try: - spec.loader.exec_module(module) - except ModuleNotFoundError as exc: - pytest.skip(f"DreamZero OpenPI example dependency is missing: {exc.name}") - return module - - def _write_synthetic_video(path: Path, cv2_module, *, channel: int) -> None: path.parent.mkdir(parents=True, exist_ok=True) height, width, num_frames = 180, 320, 24 @@ -111,17 +95,21 @@ def _write_synthetic_dreamzero_videos(client_mod, video_dir: Path) -> None: @hardware_test(res={"cuda": "H100"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_dreamzero_openpi_online(omni_server, tmp_path: Path) -> None: - client_mod = _load_client_module() + try: + openpi_client.require_dependencies() + except ModuleNotFoundError as exc: + pytest.skip(str(exc)) + video_dir = tmp_path / "dreamzero_videos" - _write_synthetic_dreamzero_videos(client_mod, video_dir) - result = client_mod.run_policy_session( + _write_synthetic_dreamzero_videos(openpi_client, video_dir) + result = openpi_client.run_policy_session( host=omni_server.host, port=omni_server.port, video_dir=video_dir, session_id="dreamzero-online-e2e", ) - client_mod.validate_session_result(result) + openpi_client.validate_session_result(result) metadata = result["metadata"] assert metadata["needs_session_id"] is True 
From dd452bdf65ddf0c93590a0d70a5ed29ce99e3fe8 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 17 May 2026 17:54:42 +0000 Subject: [PATCH 30/45] fix: clarify DreamZero local adapter category id Keep DreamZero's upstream single-adapter behavior while avoiding mutation of the caller-provided global embodiment id in CausalWanModel. Signed-off-by: Yangshen Deng Co-authored-by: Meng --- .../models/dreamzero/modeling/causal_wan_model.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py b/vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py index 4e772967ee7..37974e9f41e 100644 --- a/vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py +++ b/vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py @@ -820,9 +820,11 @@ def _forward_blocks( F_t = timestep.shape[1] if action is not None: - embodiment_id = torch.tensor([0], device=x.device).repeat(B) - action_features = self.action_encoder(action, timestep_action, embodiment_id) - state_features = self.state_encoder(state, embodiment_id) + # Current DreamZero checkpoints have one local action/state adapter. + # Global embodiment IDs are used by transforms and normalization. 
+ adapter_category_id = torch.zeros(B, dtype=torch.long, device=x.device) + action_features = self.action_encoder(action, timestep_action, adapter_category_id) + state_features = self.state_encoder(state, adapter_category_id) action_register = torch.cat([action_features, state_features], dim=1) action_length = action_features.shape[1] action_register_length = action_register.shape[1] @@ -834,7 +836,7 @@ def _forward_blocks( timestep = timestep.unsqueeze(-1).expand(B, F_t, seq_len // F_t).reshape(B, -1) if action is not None: assert timestep_action is not None and state is not None - state_features_t = self.state_encoder(state, embodiment_id) + state_features_t = self.state_encoder(state, adapter_category_id) stride = timestep_action.shape[1] // state_features_t.shape[1] timestep_state = timestep_action[:, ::stride] timestep = torch.cat([timestep, timestep_action, timestep_state], dim=1) @@ -866,7 +868,7 @@ def _forward_blocks( if action is not None: action_noise_pred = x[:, seq_len : seq_len + action_length] - action_noise_pred = self.action_decoder(action_noise_pred, embodiment_id) + action_noise_pred = self.action_decoder(action_noise_pred, adapter_category_id) else: action_noise_pred = None From 8d5f61375743571aeb1f869c5aeb88b9121933b8 Mon Sep 17 00:00:00 2001 From: Meng Date: Sun, 17 May 2026 18:09:40 +0000 Subject: [PATCH 31/45] fix: isolate DreamZero OpenPI session state Move OpenPI websocket session counters into the per-connection object and key DreamZero pipeline state by session id so frame buffers and KV caches do not leak across clients. 
Signed-off-by: Meng Co-authored-by: Yangshen Deng --- tests/dreamzero/test_pipeline_state.py | 44 +++++++ .../openai_api/test_openpi_connection.py | 80 +++++++++++- .../openai_api/test_openpi_serving.py | 13 +- .../models/dreamzero/pipeline_dreamzero.py | 116 +++++++++++------- .../realtime/robot/openpi_connection.py | 25 +++- .../openai/realtime/robot/openpi_serving.py | 35 ++---- 6 files changed, 234 insertions(+), 79 deletions(-) create mode 100644 tests/dreamzero/test_pipeline_state.py diff --git a/tests/dreamzero/test_pipeline_state.py b/tests/dreamzero/test_pipeline_state.py new file mode 100644 index 00000000000..f63d5c3b3b2 --- /dev/null +++ b/tests/dreamzero/test_pipeline_state.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections import OrderedDict + +import pytest + +from vllm_omni.diffusion.models.dreamzero.pipeline_dreamzero import DreamZeroPipeline + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def _empty_pipeline() -> DreamZeroPipeline: + pipeline = DreamZeroPipeline.__new__(DreamZeroPipeline) + pipeline._states = OrderedDict() + pipeline._max_session_states = 2 + return pipeline + + +def test_dreamzero_pipeline_state_is_session_keyed() -> None: + pipeline = _empty_pipeline() + + session_a = pipeline._get_or_create_state("session-a") + session_b = pipeline._get_or_create_state("session-b") + session_a.call_count = 7 + session_b.call_count = 3 + + assert pipeline._get_or_create_state("session-a") is session_a + assert pipeline._get_or_create_state("session-b") is session_b + assert session_a.call_count == 7 + assert session_b.call_count == 3 + + +def test_dreamzero_pipeline_state_lru_caps_retained_sessions() -> None: + pipeline = _empty_pipeline() + + session_a = pipeline._get_or_create_state("session-a") + pipeline._get_or_create_state("session-b") + assert pipeline._get_or_create_state("session-a") is session_a + + 
pipeline._get_or_create_state("session-c") + + assert list(pipeline._states) == ["session-a", "session-c"] + assert "session-b" not in pipeline._states diff --git a/tests/entrypoints/openai_api/test_openpi_connection.py b/tests/entrypoints/openai_api/test_openpi_connection.py index 18391dc66f5..7c952434e37 100644 --- a/tests/entrypoints/openai_api/test_openpi_connection.py +++ b/tests/entrypoints/openai_api/test_openpi_connection.py @@ -36,6 +36,22 @@ async def close(self): self.closed = True +def _serving_mock(): + serving = MagicMock() + serving.policy_server_config = PolicyServerConfig( + { + "image_resolution": (180, 320), + "n_external_cameras": 2, + "needs_wrist_camera": True, + "needs_stereo_camera": False, + "needs_session_id": True, + "action_space": "joint_position", + } + ) + serving.infer = AsyncMock(return_value=[0.0]) + return serving + + def test_pack_reports_clear_error_when_openpi_client_is_missing(monkeypatch): real_import = builtins.__import__ @@ -130,7 +146,11 @@ def test_handle_connection_returns_structured_error_for_infer_exception(monkeypa assert websocket.sent_bytes[1] == {"type": "error", "message": "Internal inference error"} assert "secret traceback text" not in str(websocket.sent_bytes[1]) assert websocket.sent_texts == [] - serving.infer.assert_awaited_once_with({"prompt": "pick up the object"}) + serving.infer.assert_awaited_once_with( + {"prompt": "pick up the object"}, + session_id="default", + reset=True, + ) def test_handle_connection_closes_websocket_on_idle_timeout(monkeypatch): @@ -166,3 +186,61 @@ async def never_receives(): assert websocket.closed is True assert websocket.sent_texts == [] serving.infer.assert_not_called() + + +def test_handle_connection_keeps_session_state_per_websocket(monkeypatch): + monkeypatch.setattr(openpi_connection, "_pack", lambda obj: obj) + requests = { + b"a1": {"prompt": "first", "session_id": "session-a"}, + b"a2": {"prompt": "second", "session_id": "session-a"}, + b"b1": {"prompt": "other", 
"session_id": "session-b"}, + } + monkeypatch.setattr(openpi_connection, "_unpack", lambda data: dict(requests[data])) + serving = _serving_mock() + + websocket_a = FakeWebSocket( + [ + {"type": "websocket.receive", "bytes": b"a1"}, + {"type": "websocket.receive", "bytes": b"a2"}, + {"type": "websocket.disconnect"}, + ] + ) + websocket_b = FakeWebSocket( + [ + {"type": "websocket.receive", "bytes": b"b1"}, + {"type": "websocket.disconnect"}, + ] + ) + + asyncio.run(openpi_connection.RobotRealtimeConnection(websocket_a, serving).handle_connection()) + asyncio.run(openpi_connection.RobotRealtimeConnection(websocket_b, serving).handle_connection()) + + calls = serving.infer.await_args_list + assert calls[0].kwargs == {"session_id": "session-a", "reset": True} + assert calls[1].kwargs == {"session_id": "session-a", "reset": False} + assert calls[2].kwargs == {"session_id": "session-b", "reset": True} + + +def test_handle_connection_reset_endpoint_resets_next_infer(monkeypatch): + monkeypatch.setattr(openpi_connection, "_pack", lambda obj: obj) + requests = { + b"a1": {"prompt": "first", "session_id": "session-a"}, + b"reset": {"endpoint": "reset"}, + b"a2": {"prompt": "second", "session_id": "session-a"}, + } + monkeypatch.setattr(openpi_connection, "_unpack", lambda data: dict(requests[data])) + serving = _serving_mock() + websocket = FakeWebSocket( + [ + {"type": "websocket.receive", "bytes": b"a1"}, + {"type": "websocket.receive", "bytes": b"reset"}, + {"type": "websocket.receive", "bytes": b"a2"}, + {"type": "websocket.disconnect"}, + ] + ) + + asyncio.run(openpi_connection.RobotRealtimeConnection(websocket, serving).handle_connection()) + + assert [call.kwargs["reset"] for call in serving.infer.await_args_list] == [True, True] + serving.reset.assert_called_once_with({}) + assert websocket.sent_texts == ["reset successful"] diff --git a/tests/entrypoints/openai_api/test_openpi_serving.py b/tests/entrypoints/openai_api/test_openpi_serving.py index 
2b3db13ab9e..670f3023b1c 100644 --- a/tests/entrypoints/openai_api/test_openpi_serving.py +++ b/tests/entrypoints/openai_api/test_openpi_serving.py @@ -126,13 +126,16 @@ def test_policy_server_config_reads_engine_model_config(): assert serving.policy_server_config.to_dict() == policy_config -def test_reset_marks_next_request_for_engine_state_reset(): +def test_build_request_forwards_connection_session_state(): serving = openpi_serving.ServingRealtimeRobotOpenPI(engine_client=_engine_with_policy_config()) - serving._call_count = 3 - serving.reset({}) - serving._call_count += 1 - request = serving._build_request({"prompt": "pick up the object"}) + request = serving._build_request( + {"prompt": "pick up the object"}, + session_id="session-a", + reset=True, + ) assert request.sampling_params.extra_args["reset"] is True + assert request.sampling_params.extra_args["session_id"] == "session-a" assert request.sampling_params.extra_args["robot_obs"]["prompt"] == "pick up the object" + assert request.request_ids == ["robot-session-a"] diff --git a/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py b/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py index 24834604185..9d2cec8d357 100644 --- a/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py +++ b/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py @@ -13,6 +13,7 @@ import logging import os import re as re_module +from collections import OrderedDict from collections.abc import Iterable import numpy as np @@ -50,6 +51,7 @@ from vllm_omni.diffusion.request import OmniDiffusionRequest logger = logging.getLogger(__name__) +MAX_DREAMZERO_SESSIONS = 64 # --------------------------------------------------------------------------- @@ -198,7 +200,9 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: use_dynamic_shifting=False, ) - self.state = DreamZeroState() + self._states: OrderedDict[str, DreamZeroState] = OrderedDict() + self._max_session_states = 
MAX_DREAMZERO_SESSIONS + self.state = self._get_or_create_state("default") # Keep runtime inference settings separate from the training-time config. self.num_inference_steps: int = model_config.get( @@ -260,6 +264,19 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = "") -> None: ), ] + def _get_or_create_state(self, session_id: str | None) -> DreamZeroState: + session_key = str(session_id or "default") + state = self._states.get(session_key) + if state is None: + state = DreamZeroState() + self._states[session_key] = state + max_states = getattr(self, "_max_session_states", MAX_DREAMZERO_SESSIONS) + while len(self._states) > max_states: + self._states.popitem(last=False) + else: + self._states.move_to_end(session_key) + return state + # ----------------------------------------------------------------------- # Root config loading # ----------------------------------------------------------------------- @@ -304,9 +321,10 @@ def predict_noise(self, **kwargs) -> tuple[torch.Tensor, torch.Tensor]: embodiment_id=kwargs.get("embodiment_id"), ) if kwargs.get("update_kv_cache", False) and updated_kv_caches: + state = kwargs.get("dreamzero_state", self.state) is_neg = kwargs.get("is_negative", False) for i, kv in enumerate(updated_kv_caches): - self.state.update_kv_cache(i, kv, is_negative=is_neg) + state.update_kv_cache(i, kv, is_negative=is_neg) video_pred = video_pred.clone() if action_pred is not None: @@ -466,6 +484,7 @@ def _prefill_kv_cache( frame_seqlen: int, seq_len: int, do_true_cfg: bool, + state: DreamZeroState, ) -> None: """Prefill KV cache with first frame and/or current observation. 
@@ -479,8 +498,8 @@ def _prefill_kv_cache( num_heads = getattr(self.transformer.blocks[0].self_attn, "tp_num_heads", self.transformer.num_heads) head_dim = self.transformer.dim // self.transformer.num_heads - if self.state.current_start_frame == 0: - self.state.create_kv_caches( + if state.current_start_frame == 0: + state.create_kv_caches( batch_size, dtype, device, @@ -490,7 +509,7 @@ def _prefill_kv_cache( ) zero_t = torch.zeros([batch_size, 1], device=device, dtype=torch.long) - y_first = self.state.ys[:, :, 0:1] if self.state.ys is not None else None + y_first = state.ys[:, :, 0:1] if state.ys is not None else None # KV cache update is a side effect in predict_noise() common = dict( @@ -499,21 +518,22 @@ def _prefill_kv_cache( seq_len=frame_seqlen, current_start_frame=0, y=y_first, - clip_feature=self.state.clip_feas, + clip_feature=state.clip_feas, update_kv_cache=True, + dreamzero_state=state, ) positive_kwargs = dict( encoder_hidden_states=prompt_embeds, - kv_cache=self.state.get_kv_caches(False), - crossattn_cache=self.state.get_crossattn_caches(False), + kv_cache=state.get_kv_caches(False), + crossattn_cache=state.get_crossattn_caches(False), is_negative=False, **common, ) negative_kwargs = ( dict( encoder_hidden_states=negative_prompt_embeds, - kv_cache=self.state.get_kv_caches(True), - crossattn_cache=self.state.get_crossattn_caches(True), + kv_cache=state.get_kv_caches(True), + crossattn_cache=state.get_crossattn_caches(True), is_negative=True, **common, ) @@ -528,16 +548,16 @@ def _prefill_kv_cache( true_cfg_scale=self.cfg_scale, cfg_normalize=False, ) - self.state.current_start_frame = 1 + state.current_start_frame = 1 - if self.state.current_start_frame != 1: - csf = self.state.current_start_frame + if state.current_start_frame != 1: + csf = state.current_start_frame nfpb = self.num_frame_per_block current_ref = image_latents[:, -nfpb:] - if self.state.ys is not None and csf <= self.state.ys.shape[2]: - y = self.state.ys[:, :, csf - nfpb : csf] - 
elif self.state.ys is not None: - y = self.state.ys[:, :, -nfpb:] + if state.ys is not None and csf <= state.ys.shape[2]: + y = state.ys[:, :, csf - nfpb : csf] + elif state.ys is not None: + y = state.ys[:, :, -nfpb:] else: y = None @@ -548,21 +568,22 @@ def _prefill_kv_cache( seq_len=seq_len, current_start_frame=csf - nfpb, y=y, - clip_feature=self.state.clip_feas, + clip_feature=state.clip_feas, update_kv_cache=True, + dreamzero_state=state, ) positive_kwargs = dict( encoder_hidden_states=prompt_embeds, - kv_cache=self.state.get_kv_caches(False), - crossattn_cache=self.state.get_crossattn_caches(False), + kv_cache=state.get_kv_caches(False), + crossattn_cache=state.get_crossattn_caches(False), is_negative=False, **common, ) negative_kwargs = ( dict( encoder_hidden_states=negative_prompt_embeds, - kv_cache=self.state.get_kv_caches(True), - crossattn_cache=self.state.get_crossattn_caches(True), + kv_cache=state.get_kv_caches(True), + crossattn_cache=state.get_crossattn_caches(True), is_negative=True, **common, ) @@ -588,6 +609,7 @@ def diffuse( negative_prompt_embeds: torch.Tensor | None, video_action_scheduler: VideoActionScheduler, do_true_cfg: bool, + state: DreamZeroState, **kwargs, ) -> tuple[torch.Tensor, torch.Tensor]: """Denoising loop with CFG parallel support. 
@@ -605,10 +627,11 @@ def diffuse( # Shared kwargs for predict_noise (both cond & uncond branches) common_kwargs = dict( seq_len=seq_len, - current_start_frame=self.state.current_start_frame, + current_start_frame=state.current_start_frame, state_features=state_features, embodiment_id=embodiment_id, update_kv_cache=False, + dreamzero_state=state, ) noisy_input = video_latents @@ -635,20 +658,20 @@ def diffuse( * action_timestep ) - csf = self.state.current_start_frame - if csf + self.num_frame_per_block <= self.state.ys.shape[2]: - y = self.state.ys[:, :, csf : csf + self.num_frame_per_block] + csf = state.current_start_frame + if csf + self.num_frame_per_block <= state.ys.shape[2]: + y = state.ys[:, :, csf : csf + self.num_frame_per_block] else: - y = self.state.ys[:, :, -self.num_frame_per_block :] + y = state.ys[:, :, -self.num_frame_per_block :] positive_kwargs = dict( hidden_states=noisy_input.transpose(1, 2), timestep_video=timestep, encoder_hidden_states=prompt_embeds, - kv_cache=self.state.get_kv_caches(False), - crossattn_cache=self.state.get_crossattn_caches(False), + kv_cache=state.get_kv_caches(False), + crossattn_cache=state.get_crossattn_caches(False), y=y, - clip_feature=self.state.clip_feas, + clip_feature=state.clip_feas, action=noisy_input_action, timestep_action=timestep_action, is_negative=False, @@ -661,10 +684,10 @@ def diffuse( hidden_states=noisy_input.transpose(1, 2), timestep_video=timestep, encoder_hidden_states=negative_prompt_embeds, - kv_cache=self.state.get_kv_caches(True), - crossattn_cache=self.state.get_crossattn_caches(True), + kv_cache=state.get_kv_caches(True), + crossattn_cache=state.get_crossattn_caches(True), y=y, - clip_feature=self.state.clip_feas, + clip_feature=state.clip_feas, action=noisy_input_action, timestep_action=timestep_action, is_negative=True, @@ -730,6 +753,9 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: }, ) raise KeyError("robot_obs") + session_id = 
str(extra_args.get("session_id") or "default") + state = self._get_or_create_state(session_id) + self.state = state transform, unified_obs = self._transform_robot_obs(robot_obs) device = get_local_device() @@ -785,14 +811,14 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: # Explicit reset from OpenPI serving is carried by `extra_args["reset"]` # on the next inference request after websocket reset/session switch. if extra_args.get("reset", False): - self.state.reset() + state.reset() # Auto-reset based on model state (before accumulation) - if self.state.should_reset(text_tokens, 0, self.transformer.local_attn_size): - self.state.reset() - self.state.language = text_tokens + if state.should_reset(text_tokens, 0, self.transformer.local_attn_size): + state.reset() + state.language = text_tokens # Frame accumulation: stitched single frame → multi-frame video - video_frames = self.state.accumulate_frames(stitched) # (T, H, W, C) + video_frames = state.accumulate_frames(stitched) # (T, H, W, C) videos = torch.from_numpy(video_frames).unsqueeze(0).to(device) # (B=1, T, H, W, C) videos = self._preprocess_video(videos) # → [B,C,T,H,W] bf16 @@ -821,17 +847,17 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: else: image = videos[:, :, :1].transpose(1, 2) - if self.state.current_start_frame == 0: + if state.current_start_frame == 0: clip_feas, ys, image = self._encode_image( image, self.num_frames, height, width, ) - self.state.clip_feas = clip_feas.to(dtype=image.dtype) - self.state.ys = ys.to(dtype=image.dtype) + state.clip_feas = clip_feas.to(dtype=image.dtype) + state.ys = ys.to(dtype=image.dtype) - if self.state.current_start_frame != 0: + if state.current_start_frame != 0: # Subsequent calls: encode current observation via VAE if (num_frames_raw - 1) // 4 == self.num_frame_per_block: pass @@ -886,6 +912,7 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: frame_seqlen, seq_len, 
do_true_cfg, + state, ) sample_scheduler = copy.deepcopy(self.scheduler) @@ -923,14 +950,15 @@ def forward(self, req: OmniDiffusionRequest, **kwargs) -> DiffusionOutput: negative_prompt_embeds=negative_prompt_embeds, video_action_scheduler=video_action_scheduler, do_true_cfg=do_true_cfg, + state=state, seq_len=seq_len, state_features=state_features, embodiment_id=embodiment_id, ) - if self.state.current_start_frame == 1: + if state.current_start_frame == 1: video_out = torch.cat([image, video_out], dim=1) - self.state.current_start_frame += self.num_frame_per_block + state.current_start_frame += self.num_frame_per_block # q99 denorm: [-1,1] → real values action_out = self._denormalize_action(action_out.float(), embodiment_name) diff --git a/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py index dca8306ff41..9a2f009fdc9 100644 --- a/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py +++ b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py @@ -58,6 +58,12 @@ def __init__( self.websocket = websocket self.serving = serving self._idle_timeout = idle_timeout + self._current_session_id: str | None = None + self._call_count = 0 + + def reset(self) -> None: + self._current_session_id = None + self._call_count = 0 async def _send_error(self, message: str) -> None: await self.websocket.send_bytes(_pack({"type": "error", "message": message})) @@ -112,10 +118,27 @@ async def handle_connection(self) -> None: endpoint = obs.pop("endpoint", "infer") if endpoint == "reset": + self.reset() self.serving.reset(obs) await self.websocket.send_text("reset successful") else: - actions = await self.serving.infer(obs) + session_id = str(obs.get("session_id") or self._current_session_id or "default") + if session_id != self._current_session_id: + if self._current_session_id is not None: + logger.info( + "Robot OpenPI session changed %s -> %s", + self._current_session_id, + 
session_id, + ) + self._current_session_id = session_id + self._call_count = 0 + + self._call_count += 1 + actions = await self.serving.infer( + obs, + session_id=session_id, + reset=self._call_count <= 1, + ) await self.websocket.send_bytes(_pack(actions)) except Exception: logger.exception("Error handling request") diff --git a/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py b/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py index f8f878de031..b58cbd988e2 100644 --- a/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py +++ b/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py @@ -71,8 +71,6 @@ def __init__( ) -> None: self.engine_client = engine_client self.model_name = model_name - self._current_session_id: str | None = None - self._call_count = 0 self.policy_server_config = self._get_policy_server_config(engine_client) @classmethod @@ -115,28 +113,12 @@ def _get_policy_server_config(engine_client: Any) -> PolicyServerConfig: return PolicyServerConfig.from_model_config(model_config) def reset(self, obs: dict) -> None: - """Reset serving state. + """Compatibility hook; per-connection state lives in RobotRealtimeConnection.""" - Engine-side policy state is reset on the next inference request via - `extra_args["reset"]`, not by an immediate websocket-side RPC. 
- """ - self._call_count = 0 - self._current_session_id = None - - async def infer(self, obs: dict) -> np.ndarray: + async def infer(self, obs: dict, *, session_id: str, reset: bool) -> np.ndarray: """raw obs → engine → actions.""" - # Session tracking - session_id = obs.get("session_id") - if session_id is not None and session_id != self._current_session_id: - if self._current_session_id is not None: - logger.info("Session changed %s → %s", self._current_session_id, session_id) - self.reset({}) - self._current_session_id = session_id - - self._call_count += 1 - # Build request, run inference through AsyncOmni - request = self._build_request(obs) + request = self._build_request(obs, session_id=session_id, reset=reset) result = None # OpenPI policy serving is one request -> one action reply. AsyncOmni # exposes an async iterator, so consume it to completion and use the @@ -152,7 +134,7 @@ async def infer(self, obs: dict) -> np.ndarray: return self._extract_actions(result) - def _build_request(self, obs: dict) -> Any: + def _build_request(self, obs: dict, *, session_id: str, reset: bool) -> Any: """Build engine request from raw robot obs. Returns an `OmniDiffusionRequest` payload consumed by @@ -161,12 +143,9 @@ def _build_request(self, obs: dict) -> Any: from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.inputs.data import OmniDiffusionSamplingParams - # `_call_count` is reset by websocket reset/session switches, then - # incremented before this request is built. The policy pipeline consumes - # this flag and clears its frame buffer / KV cache before accumulation. 
extra_args = { - "reset": self._call_count <= 1, - "session_id": self._current_session_id or "default", + "reset": reset, + "session_id": session_id, "robot_obs": obs, } @@ -175,7 +154,7 @@ def _build_request(self, obs: dict) -> Any: return OmniDiffusionRequest( prompts=[prompt], sampling_params=sampling_params, - request_ids=[f"robot-{self._current_session_id or 'default'}"], + request_ids=[f"robot-{session_id}"], ) def _extract_actions(self, result: Any) -> np.ndarray: From 1152e4afb875bb8ef5ff5f60512b8c8b357093d2 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 17 May 2026 19:16:33 +0000 Subject: [PATCH 32/45] fix: cap DreamZero stitched frame buffer Keep DreamZero frame accumulation bounded by retaining only the latest FRAMES_PER_CHUNK stitched frames. The padding behavior remains aligned with upstream: early calls pad with the oldest retained frame before enough history exists. Signed-off-by: Yangshen Deng Co-authored-by: Meng --- .../diffusion/models/dreamzero/state_dreamzero.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py b/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py index 676d5c486c8..ecc536b8ce2 100644 --- a/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py +++ b/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py @@ -6,6 +6,7 @@ from __future__ import annotations import logging +from collections import deque import numpy as np import torch @@ -53,13 +54,13 @@ def accumulate_frames(self, stitched: np.ndarray) -> np.ndarray: num_frames = 1 if self.call_count == 0 else FRAMES_PER_CHUNK - if len(self.stitched_buffer) >= num_frames: - frames = self.stitched_buffer[-num_frames:] + buffer_frames = list(self.stitched_buffer) + if len(buffer_frames) >= num_frames: + frames = buffer_frames[-num_frames:] else: - # Pad by repeating first frame - frames = list(self.stitched_buffer) + frames = buffer_frames while len(frames) < num_frames: - 
frames.insert(0, self.stitched_buffer[0]) + frames.insert(0, buffer_frames[0]) self.call_count += 1 return np.stack(frames, axis=0) # (T, H, W, C) @@ -70,8 +71,7 @@ def accumulate_frames(self, stitched: np.ndarray) -> np.ndarray: def reset(self) -> None: """Clear all state.""" - # Frame buffer — single stitched buffer - self.stitched_buffer: list[np.ndarray] = [] + self.stitched_buffer: deque[np.ndarray] = deque(maxlen=FRAMES_PER_CHUNK) self.call_count: int = 0 # KV cache once robot-policy diffusion supports that integration. From a5653e5013125a52d79ca29bf57071f8c2f64ae9 Mon Sep 17 00:00:00 2001 From: Meng Date: Sun, 17 May 2026 19:23:19 +0000 Subject: [PATCH 33/45] fix: reuse exact DreamZero config detection Route the legacy VLA diffusion config path through the shared DreamZero HF config detector instead of keeping a second inline target check. This keeps detection on exact upstream class paths and avoids a manual model_class_name override bypass. Signed-off-by: Meng Co-authored-by: Yangshen Deng --- vllm_omni/diffusion/data.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py index ccb7f75b615..3dfb6c7c617 100644 --- a/vllm_omni/diffusion/data.py +++ b/vllm_omni/diffusion/data.py @@ -902,23 +902,9 @@ def enrich_config(self) -> None: self.tf_model_config = TransformerConfig() self.update_multimodal_support() elif model_type == "vla": - action_head_cfg = cfg.get("action_head_cfg") or {} - looks_like_dreamzero = False - if isinstance(action_head_cfg, Mapping): - action_head_cfg_config = action_head_cfg.get("config") or {} - diffusion_model_cfg = {} - if isinstance(action_head_cfg_config, Mapping): - diffusion_model_cfg = action_head_cfg_config.get("diffusion_model_cfg") or {} - if isinstance(diffusion_model_cfg, Mapping): - looks_like_dreamzero = ( - action_head_cfg.get("_target_") - == "groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHead" - 
and diffusion_model_cfg.get("_target_") - == ( - "groot.vla.model.dreamzero.modules.wan_video_dit_action_casual_chunk.CausalWanModel" - ) - ) - if looks_like_dreamzero or self.model_class_name == "DreamZeroPipeline": + from vllm_omni.diffusion.utils.hf_utils import _looks_like_dreamzero + + if _looks_like_dreamzero(self.model): self.model_class_name = "DreamZeroPipeline" self.set_tf_model_config(TransformerConfig()) self.update_multimodal_support() From 2dd504e38616c94e613911af35c93a6fdbfa8c63 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 17 May 2026 19:26:42 +0000 Subject: [PATCH 34/45] fix: guard OpenPI request payload size Reject oversized OpenPI websocket payloads before msgpack deserialization to avoid unnecessary CPU and memory pressure. The existing structured invalid-payload response path is preserved. Signed-off-by: Yangshen Deng Co-authored-by: Meng --- .../openai_api/test_openpi_connection.py | 23 +++++++++++++++++++ .../realtime/robot/openpi_connection.py | 3 +++ 2 files changed, 26 insertions(+) diff --git a/tests/entrypoints/openai_api/test_openpi_connection.py b/tests/entrypoints/openai_api/test_openpi_connection.py index 7c952434e37..cfe23b7b0c2 100644 --- a/tests/entrypoints/openai_api/test_openpi_connection.py +++ b/tests/entrypoints/openai_api/test_openpi_connection.py @@ -123,6 +123,29 @@ def test_handle_connection_returns_structured_error_for_invalid_payload(monkeypa serving.reset.assert_not_called() +def test_handle_connection_rejects_oversized_payload_before_unpack(monkeypatch): + unpack_mock = MagicMock(side_effect=AssertionError("_unpack should not be called")) + monkeypatch.setattr(openpi_connection, "_pack", lambda obj: obj) + monkeypatch.setattr(openpi_connection, "_unpack", unpack_mock) + monkeypatch.setattr(openpi_connection, "MAX_OPENPI_PAYLOAD_BYTES", 4) + + websocket = FakeWebSocket( + [ + {"type": "websocket.receive", "bytes": b"too-large"}, + {"type": "websocket.disconnect"}, + ] + ) + serving = MagicMock() + connection 
= openpi_connection.RobotRealtimeConnection(websocket, serving) + + asyncio.run(connection.handle_connection()) + + assert websocket.sent_bytes[1] == {"type": "error", "message": "Invalid request payload"} + unpack_mock.assert_not_called() + serving.infer.assert_not_called() + serving.reset.assert_not_called() + + def test_handle_connection_returns_structured_error_for_infer_exception(monkeypatch): monkeypatch.setattr(openpi_connection, "_pack", lambda obj: obj) monkeypatch.setattr( diff --git a/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py index 9a2f009fdc9..b9b513434a8 100644 --- a/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py +++ b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py @@ -24,6 +24,7 @@ logger = init_logger(__name__) _DEFAULT_IDLE_TIMEOUT = 30.0 +MAX_OPENPI_PAYLOAD_BYTES = 64 * 1024 * 1024 def _get_msgpack_numpy() -> Any: @@ -69,6 +70,8 @@ async def _send_error(self, message: str) -> None: await self.websocket.send_bytes(_pack({"type": "error", "message": message})) def _unpack_request(self, data: bytes) -> dict[str, Any]: + if len(data) > MAX_OPENPI_PAYLOAD_BYTES: + raise ValueError("OpenPI request payload too large") obs = _unpack(data) if not isinstance(obs, dict): raise ValueError("Invalid request payload") From fedc00ea712b5d98fc10795ab4dacd2e0bab2401 Mon Sep 17 00:00:00 2001 From: Suli Wang <50828270+wsl2000@users.noreply.github.com> Date: Sun, 17 May 2026 19:47:37 +0000 Subject: [PATCH 35/45] fix: return msgpack OpenPI reset status Make the vLLM DreamZero reset response consistent with the rest of the websocket protocol by returning msgpack bytes. Keep the example and test clients compatible with upstream DreamZero's legacy text reset response. 
Signed-off-by: Suli Wang <50828270+wsl2000@users.noreply.github.com> Co-authored-by: Yangshen Deng --- .../online_serving/dreamzero/droid_sim_eval_client.py | 9 ++++++--- examples/online_serving/dreamzero/openpi_client.py | 9 ++++++--- tests/dreamzero/openpi_client_helper.py | 9 ++++++--- tests/entrypoints/openai_api/test_openpi_connection.py | 3 ++- .../openai/realtime/robot/openpi_connection.py | 4 ++-- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/examples/online_serving/dreamzero/droid_sim_eval_client.py b/examples/online_serving/dreamzero/droid_sim_eval_client.py index f3c00c4592d..da29ab84dba 100644 --- a/examples/online_serving/dreamzero/droid_sim_eval_client.py +++ b/examples/online_serving/dreamzero/droid_sim_eval_client.py @@ -222,9 +222,12 @@ def reset(self, reset_info: dict[str, Any] | None = None) -> str: payload["endpoint"] = "reset" self._ws.send(self._packer.pack(payload)) response = self._ws.recv() - if not isinstance(response, str): - raise RuntimeError(f"Unexpected reset response: {type(response)!r}") - return response + if isinstance(response, str): + return response + decoded = msgpack_numpy.unpackb(response) + if not isinstance(decoded, dict) or decoded.get("status") != "reset successful": + raise RuntimeError(f"Unexpected reset response: {decoded!r}") + return str(decoded["status"]) def close(self) -> None: """Close the websocket connection explicitly.""" diff --git a/examples/online_serving/dreamzero/openpi_client.py b/examples/online_serving/dreamzero/openpi_client.py index 246bfb2777a..3e1eab51834 100755 --- a/examples/online_serving/dreamzero/openpi_client.py +++ b/examples/online_serving/dreamzero/openpi_client.py @@ -138,9 +138,12 @@ def reset(self, reset_info: dict[str, Any] | None = None) -> str: payload["endpoint"] = "reset" self._ws.send(self._packer.pack(payload)) response = self._ws.recv() - if not isinstance(response, str): - raise RuntimeError(f"Unexpected reset response: {type(response)!r}") - return response 
+ if isinstance(response, str): + return response + decoded = msgpack_numpy.unpackb(response) + if not isinstance(decoded, dict) or decoded.get("status") != "reset successful": + raise RuntimeError(f"Unexpected reset response: {decoded!r}") + return str(decoded["status"]) def close(self) -> None: self._ws.close() diff --git a/tests/dreamzero/openpi_client_helper.py b/tests/dreamzero/openpi_client_helper.py index e7889c48628..dc769ec1337 100644 --- a/tests/dreamzero/openpi_client_helper.py +++ b/tests/dreamzero/openpi_client_helper.py @@ -133,9 +133,12 @@ def reset(self, reset_info: dict[str, Any] | None = None) -> str: payload["endpoint"] = "reset" self._ws.send(self._packer.pack(payload)) response = self._ws.recv() - if not isinstance(response, str): - raise RuntimeError(f"Unexpected reset response: {type(response)!r}") - return response + if isinstance(response, str): + return response + decoded = msgpack_numpy.unpackb(response) + if not isinstance(decoded, dict) or decoded.get("status") != "reset successful": + raise RuntimeError(f"Unexpected reset response: {decoded!r}") + return str(decoded["status"]) def close(self) -> None: self._ws.close() diff --git a/tests/entrypoints/openai_api/test_openpi_connection.py b/tests/entrypoints/openai_api/test_openpi_connection.py index cfe23b7b0c2..124ae60014d 100644 --- a/tests/entrypoints/openai_api/test_openpi_connection.py +++ b/tests/entrypoints/openai_api/test_openpi_connection.py @@ -266,4 +266,5 @@ def test_handle_connection_reset_endpoint_resets_next_infer(monkeypatch): assert [call.kwargs["reset"] for call in serving.infer.await_args_list] == [True, True] serving.reset.assert_called_once_with({}) - assert websocket.sent_texts == ["reset successful"] + assert websocket.sent_bytes[2] == {"status": "reset successful"} + assert websocket.sent_texts == [] diff --git a/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py index 
b9b513434a8..2ede4fcd7ad 100644 --- a/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py +++ b/vllm_omni/entrypoints/openai/realtime/robot/openpi_connection.py @@ -6,7 +6,7 @@ Protocol (compatible with OpenPI policy clients): Connect -> server sends msgpack(PolicyServerConfig fields) Infer -> client sends msgpack(obs), server sends msgpack(ndarray) - Reset -> client sends msgpack({endpoint:reset}), server sends "reset successful" + Reset -> client sends msgpack({endpoint:reset}), server sends msgpack(status) """ from __future__ import annotations @@ -123,7 +123,7 @@ async def handle_connection(self) -> None: if endpoint == "reset": self.reset() self.serving.reset(obs) - await self.websocket.send_text("reset successful") + await self.websocket.send_bytes(_pack({"status": "reset successful"})) else: session_id = str(obs.get("session_id") or self._current_session_id or "default") if session_id != self._current_session_id: From 21d0634adebcaa22acb37c861615e64a6498dd7c Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Sun, 17 May 2026 20:25:08 +0000 Subject: [PATCH 36/45] docs: fix DreamZero docs build warnings Make the DreamZero example pointer a plain repository path so MkDocs strict link validation does not treat it as a docs-local page. Add package markers for the DreamZero diffusion modules so api-autonav documents them and can resolve generated API references. Also adjust the action encoder docstring so griffe does not interpret internal attributes as constructor parameters. 
Signed-off-by: Yangshen Deng Co-authored-by: Meng --- docs/user_guide/diffusion_features.md | 2 +- vllm_omni/diffusion/models/dreamzero/__init__.py | 2 ++ vllm_omni/diffusion/models/dreamzero/modeling/__init__.py | 2 ++ vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 vllm_omni/diffusion/models/dreamzero/__init__.py create mode 100644 vllm_omni/diffusion/models/dreamzero/modeling/__init__.py diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index d4939859f19..450affbbc96 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -167,7 +167,7 @@ The following tables show which models support each feature: |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| | **DreamZero-DROID** | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | -DreamZero-DROID is a robot policy / action generation pipeline served through the OpenPI-compatible websocket API. See the [DreamZero online serving example](../../examples/online_serving/dreamzero/README.md) for startup commands, client usage, and known precision notes. +DreamZero-DROID is a robot policy / action generation pipeline served through the OpenPI-compatible websocket API. See `examples/online_serving/dreamzero/README.md` for startup commands, client usage, and known precision notes. 
## Feature Compatibility diff --git a/vllm_omni/diffusion/models/dreamzero/__init__.py b/vllm_omni/diffusion/models/dreamzero/__init__.py new file mode 100644 index 00000000000..208f01a7cb5 --- /dev/null +++ b/vllm_omni/diffusion/models/dreamzero/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm_omni/diffusion/models/dreamzero/modeling/__init__.py b/vllm_omni/diffusion/models/dreamzero/modeling/__init__.py new file mode 100644 index 00000000000..208f01a7cb5 --- /dev/null +++ b/vllm_omni/diffusion/models/dreamzero/modeling/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py b/vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py index e5fcc6604f1..b35bec436ac 100644 --- a/vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py +++ b/vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py @@ -35,7 +35,7 @@ def forward(self, timesteps: torch.Tensor) -> torch.Tensor: class CategorySpecificLinear(nn.Module): """Per-category linear: W[cat_id] @ x + b[cat_id] - Params: + Attributes: W: (num_categories, input_dim, hidden_dim) — note: 0.02 * randn init b: (num_categories, hidden_dim) — zero init """ From 2b8b72d2e9eb4466862945e4dc3fde1e2e5bfc23 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Fri, 22 May 2026 13:27:42 +0000 Subject: [PATCH 37/45] tests: adjust DreamZero upstream parity baseline Co-authored-by: Meng Signed-off-by: Yangshen Deng --- .../upstream/test_openpi_e2e_source_parity.py | 30 +++++++++++++------ .../upstream_socket_server_no_compile.py | 6 +++- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py b/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py index 2a041a79cd3..e41cff9032c 100644 
--- a/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py +++ b/tests/dreamzero/upstream/test_openpi_e2e_source_parity.py @@ -12,7 +12,7 @@ `/v1/realtime/robot/openpi`. Current scope for this test: -- non-TP (`nproc_per_node=1` on upstream, single-GPU or cfg-parallel `vllm serve`) +- default two-GPU run (`nproc_per_node=2` on upstream, `--cfg-parallel-size 2` on `vllm serve`) - non-`torch.compile` (upstream launched through `upstream_socket_server_no_compile.py`, vLLM with `--enforce-eager`) - non-DiT-cache / non-skip-schedule (`NUM_DIT_STEPS=16`) @@ -100,19 +100,20 @@ def _vllm_executable() -> str: def _cfg_parallel_size() -> int: - return int(os.environ.get("OPENPI_E2E_CFG_PARALLEL_SIZE", "1")) + return int(os.environ.get("OPENPI_E2E_CFG_PARALLEL_SIZE", "2")) def _pick_test_gpus() -> list[str]: + cfg_parallel_size = _cfg_parallel_size() override = os.environ.get("OPENPI_E2E_GPUS") or os.environ.get("OPENPI_E2E_GPU") if override is not None: gpus = [part.strip() for part in override.split(",") if part.strip()] if not gpus: raise ValueError("OPENPI_E2E_GPUS is set but empty.") + if len(gpus) < cfg_parallel_size: + raise RuntimeError(f"Need {cfg_parallel_size} GPUs, but OPENPI_E2E_GPUS only provided {gpus}.") return gpus - cfg_parallel_size = _cfg_parallel_size() - query = subprocess.check_output( [ "nvidia-smi", @@ -141,7 +142,8 @@ def _torchrun_argv(script: str, port: int) -> list[str]: "-m", "torch.distributed.run", "--standalone", - "--nproc_per_node=1", + "--nproc_per_node", + str(_cfg_parallel_size()), script, "--port", str(port), @@ -154,8 +156,9 @@ def _run_upstream_service(port: int, log_path: Path) -> subprocess.Popen[str]: env = os.environ.copy() env.setdefault("PYTHONPATH", "") env["PYTHONPATH"] = f"{Path.cwd()}:{DREAMZERO_REPO}:{env['PYTHONPATH']}".rstrip(":") - env["CUDA_VISIBLE_DEVICES"] = _pick_test_gpus()[0] + env["CUDA_VISIBLE_DEVICES"] = ",".join(_pick_test_gpus()) env.setdefault("NO_ALBUMENTATIONS_UPDATE", "1") + env["ATTENTION_BACKEND"] = 
"torch" env.setdefault("ENABLE_TENSORRT", "false") env["ENABLE_DIT_CACHE"] = "false" env["NUM_DIT_STEPS"] = "16" @@ -265,12 +268,21 @@ def _collect_outputs_with_client(client) -> tuple[dict, list[np.ndarray]]: client.infer(dict(obs0)), client.infer(dict(obs1)), ] - assert client.reset({}) == "reset successful" + assert _normalize_reset_response(client.reset({})) == "reset successful" outputs.append(client.infer(dict(obs0))) client._ws.close() return metadata, outputs +def _normalize_reset_response(response) -> str: + if isinstance(response, str): + return response + decoded = msgpack_numpy.unpackb(response) + if isinstance(decoded, dict): + return str(decoded.get("status")) + return str(decoded) + + def _normalize_metadata(metadata: dict) -> dict: normalized = dict(metadata) if isinstance(normalized.get("image_resolution"), tuple): @@ -340,7 +352,7 @@ def test_openpi_service_matches_upstream_server_noncompile(tmp_path: Path) -> No np.testing.assert_allclose( actual, expected, - rtol=0.0, - atol=0.0, + rtol=1e-2, + atol=1e-3, err_msg=f"OpenPI step {idx} output mismatch", ) diff --git a/tests/dreamzero/upstream/upstream_socket_server_no_compile.py b/tests/dreamzero/upstream/upstream_socket_server_no_compile.py index bfb4d5f14c9..a9ef7a3d00d 100644 --- a/tests/dreamzero/upstream/upstream_socket_server_no_compile.py +++ b/tests/dreamzero/upstream/upstream_socket_server_no_compile.py @@ -23,7 +23,7 @@ Usage: PYTHONPATH="${DREAMZERO_REPO}" \\ - .venv/bin/python -m torch.distributed.run --standalone --nproc_per_node=1 \\ + .venv/bin/python -m torch.distributed.run --standalone --nproc_per_node=2 \\ tests/dreamzero/upstream/upstream_socket_server_no_compile.py --port 18081 \\ --model_path "${DREAMZERO_REPO}/checkpoints/dreamzero" """ @@ -63,6 +63,7 @@ def deco(fn): import tyro # noqa: E402 from groot.vla.model.dreamzero.modules import attention as upstream_attention # noqa: E402 from groot.vla.model.dreamzero.modules import wan2_1_submodule as upstream_submodule # noqa: 
E402 +from groot.vla.model.dreamzero.modules import wan_video_dit as upstream_wan_video_dit # noqa: E402 def _torch_varlen_flash_attention( @@ -97,6 +98,9 @@ def _torch_varlen_flash_attention( upstream_attention.flash_attention = _torch_varlen_flash_attention upstream_submodule.flash_attention = _torch_varlen_flash_attention +upstream_wan_video_dit.FLASH_ATTN_3_AVAILABLE = False +upstream_wan_video_dit.FLASH_ATTN_2_AVAILABLE = False +upstream_wan_video_dit.SAGE_ATTN_AVAILABLE = False def main(args: upstream.Args) -> None: From 443a4867c5e101928852846ebf2df3795fd08651 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Fri, 22 May 2026 13:50:41 +0000 Subject: [PATCH 38/45] chore: remove DreamZero docs from PR Co-authored-by: Meng Signed-off-by: Yangshen Deng --- docs/models/supported_models.md | 1 - docs/user_guide/diffusion_features.md | 8 - examples/online_serving/dreamzero/README.md | 212 -------------------- 3 files changed, 221 deletions(-) delete mode 100644 examples/online_serving/dreamzero/README.md diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index ae835025106..b0637a5d798 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -21,7 +21,6 @@ th { | `Qwen2_5OmniForConditionalGeneration` | Qwen2.5-Omni | `Qwen/Qwen2.5-Omni-7B`, `Qwen/Qwen2.5-Omni-3B` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `BagelForConditionalGeneration` | BAGEL (DiT-only) | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ | | ✅︎ | | `InternVLAA1Pipeline` | InternVLA-A1 | `InternRobotics/InternVLA-A1-3B` | ✅︎ | ✅︎ | | | -| `DreamZeroPipeline` | DreamZero-DROID | `GEAR-Dreams/DreamZero-DROID` | ✅︎ | | | | | `HunyuanImage3ForCausalMM` | HunyuanImage3.0 (DiT-only) | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `QwenImagePipeline` | Qwen-Image | `Qwen/Qwen-Image` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `QwenImagePipeline` | Qwen-Image-2512 | `Qwen/Qwen-Image-2512` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | diff --git 
a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 450affbbc96..2a296a05970 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -161,14 +161,6 @@ The following tables show which models support each feature: |-----------------------|:---------:|:----------:|:---------------------:|:--------------:|:-----------------:|:-------------------:|:------:|:-------------------------:|:--------------------:|:--------------:|:----------------:| | **Stable-Audio-Open** | ✅ | ❌ | ❓ | ❓ | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | -### ActionGen - -| Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | -|-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| -| **DreamZero-DROID** | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | - -DreamZero-DROID is a robot policy / action generation pipeline served through the OpenPI-compatible websocket API. See `examples/online_serving/dreamzero/README.md` for startup commands, client usage, and known precision notes. - ## Feature Compatibility diff --git a/examples/online_serving/dreamzero/README.md b/examples/online_serving/dreamzero/README.md deleted file mode 100644 index 495d806eb55..00000000000 --- a/examples/online_serving/dreamzero/README.md +++ /dev/null @@ -1,212 +0,0 @@ -# DreamZero OpenPI Example - -This example shows how to serve DreamZero with `vllm serve --omni` and connect a -compatible OpenPI websocket client using real camera videos downloaded from -Hugging Face. 
- -## Files - -- `run_server.sh`: launch DreamZero OpenPI serving -- `openpi_client.py`: websocket client that sends real observations -- `export_prediction_video.py`: offline helper that runs vLLM once and decodes DreamZero `video_pred` latents to MP4 -- `droid_sim_eval_client.py`: DROID `sim-evals` rollout client for the vLLM OpenPI server - -## Environment requirements - -- `run_server.sh`, `vllm serve`, `openpi_client.py`, - `export_prediction_video.py`, and the standard example/e2e tests: - use the local `vllm-omni` environment. -- `openpi_client.py` extra deps: - -```bash -pip install openpi-client websockets opencv-python huggingface-hub -``` - -- video export helper extra deps: - -```bash -pip install opencv-python pillow -``` - -Optional DROID sim-eval dependencies: - -- Plain serving, `openpi_client.py`, and standard e2e tests do **not** require - Isaac Lab or `sim-evals`. -- `droid_sim_eval_client.py` must run in an external Isaac Lab / `sim-evals` - environment where these imports already work: - - `isaaclab` - - `isaaclab_tasks` - - `sim_evals` - - `gymnasium` -- In that simulator environment, also install the OpenPI/client-side helpers: - -```bash -pip install openpi-client websockets opencv-python mediapy typing-extensions -``` - -- `typing-extensions` is only needed on Python `< 3.12`. 
- -- Optional `tests/dreamzero/upstream/*` parity tests also require: - - `DREAMZERO_REPO` pointing to an upstream DreamZero checkout - - an upstream checkpoint at `DREAMZERO_REPO/checkpoints/dreamzero` - -## Start the server - -From the repository root: - -```bash -CUDA_VISIBLE_DEVICES=0,1 \ -examples/online_serving/dreamzero/run_server.sh -``` - -If you have 2 GPUs with moderate VRAM (less than 80GB), you can use the following command to start the server with TP=2 configuration files: -```bash -CUDA_VISIBLE_DEVICES=0,1 \ -examples/online_serving/dreamzero/run_server_with_tp2_config.sh -``` - -If you only want 1 GPU: - -```bash -CUDA_VISIBLE_DEVICES=0 \ -DEPLOY_CONFIG=vllm_omni/deploy/dreamzero.yaml \ -examples/online_serving/dreamzero/run_server.sh -``` -Please note DreamZero requires >=74GB VRAM for single-GPU serving. - -The websocket endpoint is: - -- `ws://127.0.0.1:8000/v1/realtime/robot/openpi` - -## Download example videos - -The real camera videos are hosted outside this repository: - -- https://huggingface.co/datasets/YangshenDeng/vllm-omni-dreamzero-assets - -Download them into the default example location: - -```bash -hf download YangshenDeng/vllm-omni-dreamzero-assets \ - --repo-type dataset \ - --local-dir outputs/dreamzero/assets -``` - -The expected files are: - -- `outputs/dreamzero/assets/exterior_image_1_left.mp4` -- `outputs/dreamzero/assets/exterior_image_2_left.mp4` -- `outputs/dreamzero/assets/wrist_image_left.mp4` - -## Run the client - -From the repository root: - -Environment: - -- run this in the `vllm-omni` repo environment -- if imports are missing, install `openpi-client`, `websockets`, and `opencv-python` - -```bash -python examples/online_serving/dreamzero/openpi_client.py \ - --host 127.0.0.1 \ - --port 8000 -``` - -If you keep the videos elsewhere, pass `--video-dir`.
- -The client sends: - -- one initial single-frame observation -- one four-frame observation -- one websocket reset -- one post-reset single-frame observation - -It validates: - -- DreamZero metadata contract -- action tensor shape `(24, 8)` -- finite action values -- reset response - -## Export prediction videos from example inputs - -DreamZero serving returns actions to the websocket client. The model also -produces a latent `video_pred`, but vLLM does **not** auto-save it from the -server path. Use the offline helper below when you want visual debug videos. - -This script: - -1. loads the downloaded camera videos from `outputs/dreamzero/assets/` -2. builds the same DreamZero/OpenPI observations as the client -3. runs vLLM locally through `Omni` -4. collects `video_pred` latents from `OmniRequestOutput.images` -5. decodes them on the DreamZero worker through `DreamZeroVideoExportWorkerExtension` -6. writes an MP4 under `outputs/dreamzero/generated_predictions/` - -Single-config export: - -```bash -python examples/online_serving/dreamzero/export_prediction_video.py \ - --model GEAR-Dreams/DreamZero-DROID \ - --deploy-config vllm_omni/deploy/dreamzero.yaml \ - --output-dir outputs/dreamzero/generated_predictions \ - --output-stem tp1_cfg1_vllm_example -``` - -Optional flags: - -- `--save-input-video`: also writes a stitched real-input camera video -- `--save-gif`: also writes GIFs for GitHub comments -- `--save-actions`: also writes action chunks as `.npz` - -## Optional Evaluation Demos - -The files below are optional external evaluation demos kept with the DreamZero -example for discoverability. They are not required for the basic online serving -flow above, and their simulator dependencies are not vLLM-Omni dependencies. - -### DROID Sim-Eval - -`droid_sim_eval_client.py` runs a DROID rollout through Isaac Lab / `sim-evals` -against an already running vLLM DreamZero OpenPI server. 
- -```bash -CUDA_VISIBLE_DEVICES=1 \ -"${ISAACLAB_LAUNCHER}" -p \ - examples/online_serving/dreamzero/droid_sim_eval_client.py \ - --host 127.0.0.1 \ - --port 8000 \ - --scene 1 \ - --episodes 1 \ - --headless \ - --device cuda:0 -``` - -The client keeps the upstream DreamZero sim-eval behavior: 24-step action -chunks, 8 executed open-loop control steps before replanning, and DROID camera -observation extraction from `external_cam`, `external_cam_2`, and `wrist_cam`. - -### MolmoSpaces Evaluation - -`molmospace_dreamzero_eval_demo.py` evaluates DreamZero through the same vLLM -OpenPI server on MolmoSpaces benchmarks. Install MolmoSpaces and prepare its -benchmark assets by following the upstream MolmoSpaces documentation. - -```bash -python examples/online_serving/dreamzero/molmospace_dreamzero_eval_demo.py \ - --benchmark_dir "${MOLMOSPACES_BENCHMARK_DIR}/20260327/ithor/FrankaCloseHardBench/FrankaCloseHardBench_20260206_json_benchmark" \ - --output_dir outputs/dreamzero/molmospaces \ - --max_episodes 1 \ - --task_horizon_steps 240 \ - --episode_idx 1 -``` - -## Optional upstream parity checks - -The upstream DreamZero-dependent parity tests are kept under: - -- `tests/dreamzero/upstream/` - -Those tests require a local upstream DreamZero checkout and are not needed for -the standard vLLM example above. 
From bcec16607c489e78aac8c2c2a7f065e136eed55c Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Fri, 22 May 2026 14:00:30 +0000 Subject: [PATCH 39/45] refactor: flatten DreamZero model modules Co-authored-by: Meng Signed-off-by: Yangshen Deng --- .../diffusion/models/dreamzero/{modeling => }/action_encoder.py | 0 .../models/dreamzero/{modeling => }/causal_wan_model.py | 0 .../diffusion/models/dreamzero/{modeling => }/image_encoder.py | 0 vllm_omni/diffusion/models/dreamzero/modeling/__init__.py | 2 -- 4 files changed, 2 deletions(-) rename vllm_omni/diffusion/models/dreamzero/{modeling => }/action_encoder.py (100%) rename vllm_omni/diffusion/models/dreamzero/{modeling => }/causal_wan_model.py (100%) rename vllm_omni/diffusion/models/dreamzero/{modeling => }/image_encoder.py (100%) delete mode 100644 vllm_omni/diffusion/models/dreamzero/modeling/__init__.py diff --git a/vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py b/vllm_omni/diffusion/models/dreamzero/action_encoder.py similarity index 100% rename from vllm_omni/diffusion/models/dreamzero/modeling/action_encoder.py rename to vllm_omni/diffusion/models/dreamzero/action_encoder.py diff --git a/vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py b/vllm_omni/diffusion/models/dreamzero/causal_wan_model.py similarity index 100% rename from vllm_omni/diffusion/models/dreamzero/modeling/causal_wan_model.py rename to vllm_omni/diffusion/models/dreamzero/causal_wan_model.py diff --git a/vllm_omni/diffusion/models/dreamzero/modeling/image_encoder.py b/vllm_omni/diffusion/models/dreamzero/image_encoder.py similarity index 100% rename from vllm_omni/diffusion/models/dreamzero/modeling/image_encoder.py rename to vllm_omni/diffusion/models/dreamzero/image_encoder.py diff --git a/vllm_omni/diffusion/models/dreamzero/modeling/__init__.py b/vllm_omni/diffusion/models/dreamzero/modeling/__init__.py deleted file mode 100644 index 208f01a7cb5..00000000000 --- 
a/vllm_omni/diffusion/models/dreamzero/modeling/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project From 6251092b9078e6fcf4b2a7266e1fd486a11efdf4 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Fri, 22 May 2026 14:01:33 +0000 Subject: [PATCH 40/45] fix: update DreamZero flattened imports Co-authored-by: Meng Signed-off-by: Yangshen Deng --- vllm_omni/diffusion/models/dreamzero/causal_wan_model.py | 2 +- vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_omni/diffusion/models/dreamzero/causal_wan_model.py b/vllm_omni/diffusion/models/dreamzero/causal_wan_model.py index 37974e9f41e..d8561fc58b7 100644 --- a/vllm_omni/diffusion/models/dreamzero/causal_wan_model.py +++ b/vllm_omni/diffusion/models/dreamzero/causal_wan_model.py @@ -31,7 +31,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm_omni.diffusion.attention.layer import Attention -from vllm_omni.diffusion.models.dreamzero.modeling.action_encoder import ( +from vllm_omni.diffusion.models.dreamzero.action_encoder import ( CategorySpecificMLP, MultiEmbodimentActionEncoder, ) diff --git a/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py b/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py index 9d2cec8d357..6bb97aab640 100644 --- a/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py +++ b/vllm_omni/diffusion/models/dreamzero/pipeline_dreamzero.py @@ -31,8 +31,8 @@ from vllm_omni.diffusion.distributed.parallel_state import get_classifier_free_guidance_world_size from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader -from vllm_omni.diffusion.models.dreamzero.modeling.causal_wan_model import CausalWanModel -from vllm_omni.diffusion.models.dreamzero.modeling.image_encoder import 
DreamZeroImageEncoder +from vllm_omni.diffusion.models.dreamzero.causal_wan_model import CausalWanModel +from vllm_omni.diffusion.models.dreamzero.image_encoder import DreamZeroImageEncoder from vllm_omni.diffusion.models.dreamzero.state_dreamzero import DreamZeroState from vllm_omni.diffusion.models.dreamzero.transform import ( DEFAULT_EMBODIMENT, From 9fa3896ef99ba9d61d8f3587df1f49a4762a01a5 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Tue, 26 May 2026 00:18:39 +0000 Subject: [PATCH 41/45] fix: replace DreamZero runtime asserts Convert DreamZero runtime assertions into explicit exceptions so validation survives python -O optimization. Remove the no-op if True wrapper in the CausalWan KV-cache attention path and keep the behavior unchanged for valid inputs. Add regression coverage for accessing DreamZeroState KV and cross-attention caches before initialization. Co-authored-by: Meng Signed-off-by: Yangshen Deng --- dreamzero_hsliu_review_todo.md | 110 ++++++++++++ tests/dreamzero/test_pipeline_state.py | 15 ++ .../models/dreamzero/causal_wan_model.py | 158 ++++++++++-------- .../models/dreamzero/image_encoder.py | 9 +- .../models/dreamzero/state_dreamzero.py | 9 +- .../models/dreamzero/transform/droid.py | 3 +- 6 files changed, 229 insertions(+), 75 deletions(-) create mode 100644 dreamzero_hsliu_review_todo.md diff --git a/dreamzero_hsliu_review_todo.md b/dreamzero_hsliu_review_todo.md new file mode 100644 index 00000000000..5e415fbfeee --- /dev/null +++ b/dreamzero_hsliu_review_todo.md @@ -0,0 +1,110 @@ +# DreamZero PR #2162 hsliuustc0106 Review TODO + +Scope: review comments by `hsliuustc0106` starting from: +`assert is stripped under python -O. This check (and the 14 others in this file)...` + +Current status: H01, H02, and H04 implemented; remaining items pending discussion and confirmation. + +## H01. 
Runtime `assert` in `causal_wan_model.py` + +- Review: `assert` is stripped under `python -O`; `assert kv_cache is not None` and the other runtime checks in this file should be explicit `if ...: raise ...`. +- Link: https://github.com/vllm-project/vllm-omni/pull/2162#discussion_r3294053083 +- File: `vllm_omni/diffusion/models/dreamzero/causal_wan_model.py` +- Current code status: implemented. Runtime asserts in `causal_wan_model.py` were replaced with explicit exceptions. +- Risk: production runs with optimization flags can silently remove these checks, turning intended validation failures into later `NoneType`, shape, or tensor operation errors. +- Proposed fix: + - Convert runtime validation asserts to explicit exceptions. + - Use `RuntimeError` for violated inference state invariants, such as missing `kv_cache`. + - Use `ValueError` for invalid config, unsupported model type, invalid tensor shapes, and invalid caller inputs. + - Keep test-file asserts unchanged. +- Test plan: + - Done: `python -O -m py_compile vllm_omni/diffusion/models/dreamzero/causal_wan_model.py`. + - Done: `rg` confirms no remaining `assert` or `if True` in `vllm_omni/diffusion/models/dreamzero`. +- Confirmation: confirmed and implemented. + +## H02. Runtime `assert` in `state_dreamzero.py` + +- Review: same issue for three `assert cache is not None` guards in `state_dreamzero.py`. +- Link: https://github.com/vllm-project/vllm-omni/pull/2162#discussion_r3294053104 +- File: `vllm_omni/diffusion/models/dreamzero/state_dreamzero.py` +- Current code status: implemented. The three cache initialization asserts were replaced with `RuntimeError`. +- Risk: under `python -O`, uninitialized cache access may propagate `None` and fail later with less actionable errors. +- Proposed fix: + - Replace the three asserts with explicit `if cache is None: raise RuntimeError(...)`. + - Keep messages specific to KV cache vs cross-attention cache. 
+- Test plan: + - Done: added `DreamZeroState` tests that call cache methods before `create_kv_caches()` and verify `RuntimeError`. + - Done: `python -O -m py_compile vllm_omni/diffusion/models/dreamzero/state_dreamzero.py`. +- Confirmation: confirmed and implemented. + +## H03. Duplicate embodiment ID for `mecka_hands` and `lapa` + +- Review: `mecka_hands` and `lapa` both map to embodiment ID `27`; if they should have distinct action-head weights, one prediction path is wrong. +- Link: https://github.com/vllm-project/vllm-omni/pull/2162#discussion_r3294053140 +- File: `vllm_omni/diffusion/models/dreamzero/utils.py` +- Current code status: both names map to `27`. +- Risk: if upstream expects different IDs, action conditioning selects the wrong embodiment embedding/domain for one of the two names. +- Proposed fix options: + - Option A: verify upstream DreamZero config/checkpoint mapping and update one ID if it is a typo. + - Option B: if the duplicate is intentional aliasing, keep the mapping and add a short comment explaining that `lapa` is an alias of the same embodiment ID. + - Option C: if unsupported/unknown, remove one alias and require explicit `embodiment_id`. +- Test plan: + - Update `tests/dreamzero/test_utils.py` to assert the intended mapping or alias behavior. +- Confirmation: pending. + +## H04. No-op `if True:` wrapper in `causal_wan_model.py` + +- Review: `if True:` is a no-op wrapper and should be removed with the body dedented. +- Link: https://github.com/vllm-project/vllm-omni/pull/2162#discussion_r3294060321 +- File: `vllm_omni/diffusion/models/dreamzero/causal_wan_model.py` +- Current code status: implemented. The no-op `if True:` wrapper was removed and the body was dedented. +- Risk: no runtime behavior issue, but it is dead/refactor artifact code and reduces readability. +- Proposed fix: + - Remove `if True:`. + - Dedent the body. + - Combine with H01 because both touch the same block. 
+- Test plan: + - Done: `python -O -m py_compile vllm_omni/diffusion/models/dreamzero/causal_wan_model.py`. + - Done: `tests/dreamzero/test_pipeline_state.py tests/dreamzero/test_utils.py`. +- Confirmation: confirmed and implemented. + +## H05. OpenPI serving reuses `session_id` as engine `request_id` + +- Review: the server uses long-lived `session_id` as per-inference engine `request_id`, causing duplicate active request IDs for concurrent clients or repeated calls. +- Link: https://github.com/vllm-project/vllm-omni/pull/2162#discussion_r3294078365 +- File: `vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py` +- Current code status: `_build_request()` still sets `request_ids=[f"robot-{session_id}"]`. +- Risk: + - Two websocket clients without explicit `session_id` both use `robot-default`. + - Two clients sharing a logical session reuse the same engine request ID. + - One client can reuse the same active ID across sequential calls if a previous generation has not fully drained. + - Diffusion scheduler and `AsyncOmni.request_states` expect request IDs to be unique per inference. +- Proposed fix: + - Keep `session_id` only in `sampling_params.extra_args` for DreamZero state lookup. + - Generate a unique engine request ID per inference, for example `robot-{session_id}-{counter}` or `robot-{session_id}-{uuid}`. + - Prefer a per-serving-instance monotonic counter or UUID to avoid cross-connection collisions. + - Ensure logs still include session information for debugging. +- Test plan: + - Unit test `_build_request()` twice with the same `session_id` and verify `request_ids[0]` differs while `extra_args["session_id"]` stays the same. + - Add a concurrency-oriented serving test if feasible, or at least a regression test for duplicate ID avoidance. +- Confirmation: pending. + +## H06. 
OpenPI clients do not surface msgpack structured errors + +- Review: server errors are msgpack dicts like `{"type": "error", "message": ...}`, but clients only treat text frames as errors and then try to convert decoded dicts to action arrays. +- Link: https://github.com/vllm-project/vllm-omni/pull/2162#discussion_r3294078370 +- Files: + - `examples/online_serving/dreamzero/openpi_client.py` + - `tests/dreamzero/openpi_client_helper.py` + - `examples/online_serving/dreamzero/droid_sim_eval_client.py` +- Current code status: all three call `msgpack_numpy.unpackb(response)` and immediately convert to `np.asarray(..., dtype=np.float32)` in `infer()`. +- Risk: server-side inference errors are reported to users as confusing NumPy conversion `TypeError` instead of the real server error message. +- Proposed fix: + - Decode binary responses first. + - If decoded payload is a dict with `type == "error"`, raise `RuntimeError(decoded["message"])`. + - Otherwise convert decoded action payload to `np.float32`. + - Apply the same helper logic to all three clients to avoid drift. +- Test plan: + - Add or update client unit tests for msgpack structured error payloads. + - Verify normal array payloads still convert to `np.float32`. +- Confirmation: pending. 
diff --git a/tests/dreamzero/test_pipeline_state.py b/tests/dreamzero/test_pipeline_state.py index f63d5c3b3b2..a3a5aaec03c 100644 --- a/tests/dreamzero/test_pipeline_state.py +++ b/tests/dreamzero/test_pipeline_state.py @@ -4,8 +4,10 @@ from collections import OrderedDict import pytest +import torch from vllm_omni.diffusion.models.dreamzero.pipeline_dreamzero import DreamZeroPipeline +from vllm_omni.diffusion.models.dreamzero.state_dreamzero import DreamZeroState pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -42,3 +44,16 @@ def test_dreamzero_pipeline_state_lru_caps_retained_sessions() -> None: assert list(pipeline._states) == ["session-a", "session-c"] assert "session-b" not in pipeline._states + + +def test_dreamzero_state_cache_access_requires_initialization() -> None: + state = DreamZeroState() + + with pytest.raises(RuntimeError, match="KV caches not initialized"): + state.get_kv_caches() + + with pytest.raises(RuntimeError, match="Cross-attn caches not initialized"): + state.get_crossattn_caches() + + with pytest.raises(RuntimeError, match="create_kv_caches first"): + state.update_kv_cache(0, torch.empty(0)) diff --git a/vllm_omni/diffusion/models/dreamzero/causal_wan_model.py b/vllm_omni/diffusion/models/dreamzero/causal_wan_model.py index d8561fc58b7..37598ae0c9b 100644 --- a/vllm_omni/diffusion/models/dreamzero/causal_wan_model.py +++ b/vllm_omni/diffusion/models/dreamzero/causal_wan_model.py @@ -41,7 +41,8 @@ def sinusoidal_embedding_1d(dim: int, position: torch.Tensor) -> torch.Tensor: """Sinusoidal positional embedding for timesteps.""" - assert dim % 2 == 0 + if dim % 2 != 0: + raise ValueError(f"dim must be even, got {dim}.") half = dim // 2 position = position.type(torch.float64) sinusoid = torch.outer( @@ -56,7 +57,8 @@ def rope_params(max_seq_len: int, dim: int) -> torch.Tensor: """Precompute complex-valued RoPE frequencies (polar form). 
Returns: complex tensor [max_seq_len, dim // 2] """ - assert dim % 2 == 0 + if dim % 2 != 0: + raise ValueError(f"dim must be even, got {dim}.") freqs = torch.outer( torch.arange(max_seq_len), 1.0 / torch.pow(10000, torch.arange(0, dim, 2).to(torch.float64).div(dim)), @@ -87,8 +89,10 @@ def rope_action_apply( B, seq_len, n, _ = x.shape x = torch.view_as_complex(x.to(torch.float64).reshape(B, seq_len, n, -1, 2)) if action_register_length is not None: - assert num_action_per_block is not None - assert num_state_per_block is not None + if num_action_per_block is None: + raise ValueError("num_action_per_block is required when action_register_length is set.") + if num_state_per_block is None: + raise ValueError("num_state_per_block is required when action_register_length is set.") chunk_size = action_register_length // (num_action_per_block + num_state_per_block) freqs_1d_action = freqs_action[: chunk_size * num_action_per_block].view( chunk_size * num_action_per_block, 1, -1 @@ -114,7 +118,12 @@ def causal_rope_action_apply( B, seq_len, n, _ = x.shape x = torch.view_as_complex(x.to(torch.float64).reshape(B, seq_len, n, -1, 2)) if action_register_length is not None: - assert action_register_length == (num_action_per_block + num_state_per_block) + expected_length = num_action_per_block + num_state_per_block + if action_register_length != expected_length: + raise ValueError( + f"action_register_length must equal num_action_per_block + num_state_per_block " + f"({expected_length}), got {action_register_length}." 
+ ) freqs_action = freqs_action[ action_state_index * num_action_per_block : (action_state_index + 1) * num_action_per_block ] @@ -229,7 +238,8 @@ class WanT2VCrossAttention(nn.Module): def __init__(self, dim: int, num_heads: int, window_size=(-1, -1), qk_norm: bool = True, eps: float = 1e-6) -> None: super().__init__() - assert dim % num_heads == 0 + if dim % num_heads != 0: + raise ValueError(f"dim={dim} must be divisible by num_heads={num_heads}.") self.dim = dim self.num_heads = num_heads self.head_dim = dim // num_heads @@ -288,7 +298,8 @@ class WanI2VCrossAttention(nn.Module): def __init__(self, dim: int, num_heads: int, window_size=(-1, -1), qk_norm: bool = True, eps: float = 1e-6) -> None: super().__init__() - assert dim % num_heads == 0 + if dim % num_heads != 0: + raise ValueError(f"dim={dim} must be divisible by num_heads={num_heads}.") self.dim = dim self.num_heads = num_heads self.head_dim = dim // num_heads @@ -375,8 +386,9 @@ def __init__( num_action_per_block: int = 32, num_state_per_block: int = 1, ) -> None: - assert dim % num_heads == 0 super().__init__() + if dim % num_heads != 0: + raise ValueError(f"dim={dim} must be divisible by num_heads={num_heads}.") self.dim = dim self.num_heads = num_heads self.head_dim = dim // num_heads @@ -425,61 +437,62 @@ def forward( updated_kv_cache: torch.Tensor | None = None - assert kv_cache is not None, "Inference only — kv_cache required." 
- if True: - action_state_index = max(0, (current_start_frame - 1) // self.num_frame_per_block) - - roped_query = causal_rope_action_apply( - q, - freqs, - freqs_action, - freqs_state, - action_register_length, - self.num_action_per_block, - self.num_state_per_block, - action_state_index, - ).type_as(v) - roped_key = causal_rope_action_apply( - k, - freqs, - freqs_action, - freqs_state, - action_register_length, - self.num_action_per_block, - self.num_state_per_block, - action_state_index, - ).type_as(v) - - roped_action_query = None - roped_action_key = None - action_v = None - - if action_register_length is not None: - roped_action_query = roped_query[:, -action_register_length:] - roped_query = roped_query[:, :-action_register_length] - roped_action_key = roped_key[:, -action_register_length:] - roped_key = roped_key[:, :-action_register_length] - action_v = v[:, -action_register_length:] - v = v[:, :-action_register_length] - - updated_k = kv_cache[0] - updated_v = kv_cache[1] - new_k = torch.cat([updated_k, roped_key], dim=1) - new_v = torch.cat([updated_v, v], dim=1) - new_k = new_k[:, -self.max_attention_size :] - new_v = new_v[:, -self.max_attention_size :] - - if action_register_length is not None: - q_cat = torch.cat([roped_query, roped_action_query], dim=1) - k_cat = torch.cat([new_k, roped_action_key], dim=1) - v_cat = torch.cat([new_v, action_v], dim=1) - else: - q_cat = roped_query - k_cat = new_k - v_cat = new_v + if kv_cache is None: + raise RuntimeError("Inference only: kv_cache is required.") + + action_state_index = max(0, (current_start_frame - 1) // self.num_frame_per_block) + + roped_query = causal_rope_action_apply( + q, + freqs, + freqs_action, + freqs_state, + action_register_length, + self.num_action_per_block, + self.num_state_per_block, + action_state_index, + ).type_as(v) + roped_key = causal_rope_action_apply( + k, + freqs, + freqs_action, + freqs_state, + action_register_length, + self.num_action_per_block, + self.num_state_per_block, 
+ action_state_index, + ).type_as(v) + + roped_action_query = None + roped_action_key = None + action_v = None + + if action_register_length is not None: + roped_action_query = roped_query[:, -action_register_length:] + roped_query = roped_query[:, :-action_register_length] + roped_action_key = roped_key[:, -action_register_length:] + roped_key = roped_key[:, :-action_register_length] + action_v = v[:, -action_register_length:] + v = v[:, :-action_register_length] + + updated_k = kv_cache[0] + updated_v = kv_cache[1] + new_k = torch.cat([updated_k, roped_key], dim=1) + new_v = torch.cat([updated_v, v], dim=1) + new_k = new_k[:, -self.max_attention_size :] + new_v = new_v[:, -self.max_attention_size :] + + if action_register_length is not None: + q_cat = torch.cat([roped_query, roped_action_query], dim=1) + k_cat = torch.cat([new_k, roped_action_key], dim=1) + v_cat = torch.cat([new_v, action_v], dim=1) + else: + q_cat = roped_query + k_cat = new_k + v_cat = new_v - x = self.attn(q_cat, k_cat, v_cat) - updated_kv_cache = torch.stack([new_k, new_v], dim=0) + x = self.attn(q_cat, k_cat, v_cat) + updated_kv_cache = torch.stack([new_k, new_v], dim=0) x = x.flatten(2) x = self.o(x) @@ -634,7 +647,8 @@ def __init__( num_state_per_block: int = 1, ) -> None: super().__init__() - assert model_type in ["t2v", "i2v", "ti2v"] + if model_type not in ["t2v", "i2v", "ti2v"]: + raise ValueError(f"Unsupported model_type={model_type!r}; expected one of ['t2v', 'i2v', 'ti2v'].") self.model_type = model_type self.patch_size = patch_size self.frame_seqlen = frame_seqlen @@ -716,7 +730,10 @@ def __init__( self.head = CausalHead(dim, out_dim, patch_size, eps) - assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0 + if dim % num_heads != 0: + raise ValueError(f"dim={dim} must be divisible by num_heads={num_heads}.") + if (dim // num_heads) % 2 != 0: + raise ValueError(f"dim // num_heads must be even, got {dim // num_heads}.") d = dim // num_heads self.freqs_action = 
rope_params(1024 * 10, d) self.freqs_state = rope_params(1024, d) @@ -794,7 +811,9 @@ def unpatchify(self, x: torch.Tensor, grid_size: torch.Tensor) -> torch.Tensor: B = x.shape[0] c = self.out_dim grid_size = grid_size.tolist() - assert x.shape[1] == math.prod(grid_size) + expected_seq_len = math.prod(grid_size) + if x.shape[1] != expected_seq_len: + raise ValueError(f"x sequence length must equal product(grid_size)={expected_seq_len}, got {x.shape[1]}.") x = x.view(B, *grid_size, *self.patch_size, c) x = torch.einsum("bfhwpqrc->bcfphqwr", x) x = x.reshape(B, c, *[i * j for i, j in zip(grid_size, self.patch_size)]) @@ -835,7 +854,8 @@ def _forward_blocks( timestep = timestep.unsqueeze(-1).expand(B, F_t, seq_len // F_t).reshape(B, -1) if action is not None: - assert timestep_action is not None and state is not None + if timestep_action is None or state is None: + raise RuntimeError("timestep_action and state are required when action is provided.") state_features_t = self.state_encoder(state, adapter_category_id) stride = timestep_action.shape[1] // state_features_t.shape[1] timestep_state = timestep_action[:, ::stride] @@ -895,8 +915,10 @@ def _forward_inference( embodiment_id: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor | None, list[torch.Tensor]]: if self.model_type == "i2v": - assert clip_feature is not None and y is not None - assert context.shape[1] == self.text_len + if clip_feature is None or y is None: + raise RuntimeError("clip_feature and y are required for i2v inference.") + if context.shape[1] != self.text_len: + raise ValueError(f"context length must be {self.text_len}, got {context.shape[1]}.") if y is not None: x = torch.cat([x, y.to(dtype=x.dtype)], dim=1) diff --git a/vllm_omni/diffusion/models/dreamzero/image_encoder.py b/vllm_omni/diffusion/models/dreamzero/image_encoder.py index df0895eaecf..8a70b5f1eaf 100644 --- a/vllm_omni/diffusion/models/dreamzero/image_encoder.py +++ 
b/vllm_omni/diffusion/models/dreamzero/image_encoder.py @@ -35,7 +35,8 @@ def __init__( proj_dropout: float = 0.0, ) -> None: super().__init__() - assert dim % num_heads == 0 + if dim % num_heads != 0: + raise ValueError(f"dim={dim} must be divisible by num_heads={num_heads}.") self.dim = dim self.num_heads = num_heads self.head_dim = dim // num_heads @@ -70,7 +71,8 @@ def __init__( norm_eps: float = 1e-5, ) -> None: super().__init__() - assert activation == "gelu", "DreamZero image encoder uses GELU." + if activation != "gelu": + raise ValueError(f"DreamZero image encoder uses GELU; got activation={activation!r}.") self.post_norm = post_norm hidden_dim = int(dim * mlp_ratio) @@ -119,7 +121,8 @@ def __init__( norm_eps: float = 1e-5, ) -> None: super().__init__() - assert pool_type == "token" + if pool_type != "token": + raise ValueError(f"DreamZero image encoder only supports pool_type='token', got {pool_type!r}.") self.image_size = image_size self.patch_size = patch_size self.num_patches = (image_size // patch_size) ** 2 diff --git a/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py b/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py index ecc536b8ce2..0972d1b5a2c 100644 --- a/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py +++ b/vllm_omni/diffusion/models/dreamzero/state_dreamzero.py @@ -142,17 +142,20 @@ def update_kv_cache( ) -> None: """Update a single layer's KV cache after prefill.""" cache = self.kv_cache_neg if is_negative else self.kv_cache - assert cache is not None, "KV caches not initialized, call create_kv_caches first" + if cache is None: + raise RuntimeError("KV caches not initialized, call create_kv_caches first.") cache[layer_index] = updated_kv.clone() def get_kv_caches(self, is_negative: bool = False) -> list[torch.Tensor]: """Get KV caches for the specified branch.""" cache = self.kv_cache_neg if is_negative else self.kv_cache - assert cache is not None, "KV caches not initialized" + if cache is None: + raise 
RuntimeError("KV caches not initialized.") return cache def get_crossattn_caches(self, is_negative: bool = False) -> list[dict[str, bool | torch.Tensor | None]]: """Get cross-attention caches for the specified branch.""" cache = self.crossattn_cache_neg if is_negative else self.crossattn_cache - assert cache is not None, "Cross-attn caches not initialized" + if cache is None: + raise RuntimeError("Cross-attn caches not initialized.") return cache diff --git a/vllm_omni/diffusion/models/dreamzero/transform/droid.py b/vllm_omni/diffusion/models/dreamzero/transform/droid.py index 981895b58b4..b116c54d56c 100644 --- a/vllm_omni/diffusion/models/dreamzero/transform/droid.py +++ b/vllm_omni/diffusion/models/dreamzero/transform/droid.py @@ -88,7 +88,8 @@ def maybe_preprocess(arr: np.ndarray | None) -> np.ndarray | None: right_ext = maybe_preprocess(right_ext) wrist = maybe_preprocess(wrist) ref = next((v for v in [wrist, left_ext, right_ext] if v is not None), None) - assert ref is not None + if ref is None: + raise RuntimeError("Expected at least one DROID camera view after preprocessing.") t, h, w, c = ref.shape out = np.zeros((t, 2 * h, 2 * w, c), dtype=ref.dtype) # (T, 2H, 2W, C) From ec89c0850408878e23d68d4d0bb15dac828d51cf Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Tue, 26 May 2026 00:43:21 +0000 Subject: [PATCH 42/45] fix: make OpenPI engine request ids unique Keep the robot session id only as DreamZero state metadata and generate a unique engine request id for each OpenPI inference to avoid scheduler collisions. Add regression coverage for repeated requests on the same session and concurrent websocket clients that omit session_id. 
Co-authored-by: Meng Signed-off-by: Yangshen Deng --- dreamzero_hsliu_review_todo.md | 8 +- .../openai_api/test_openpi_serving.py | 158 +++++++++++++++++- .../openai/realtime/robot/openpi_serving.py | 8 +- 3 files changed, 162 insertions(+), 12 deletions(-) diff --git a/dreamzero_hsliu_review_todo.md b/dreamzero_hsliu_review_todo.md index 5e415fbfeee..5a5bab88348 100644 --- a/dreamzero_hsliu_review_todo.md +++ b/dreamzero_hsliu_review_todo.md @@ -3,7 +3,7 @@ Scope: review comments by `hsliuustc0106` starting from: `assert is stripped under python -O. This check (and the 14 others in this file)...` -Current status: H01, H02, and H04 implemented; remaining items pending discussion and confirmation. +Current status: H01, H02, H04, and H05 implemented; H03 skipped by decision; remaining items pending discussion and confirmation. ## H01. Runtime `assert` in `causal_wan_model.py` @@ -50,7 +50,7 @@ Current status: H01, H02, and H04 implemented; remaining items pending discussio - Option C: if unsupported/unknown, remove one alias and require explicit `embodiment_id`. - Test plan: - Update `tests/dreamzero/test_utils.py` to assert the intended mapping or alias behavior. -- Confirmation: pending. +- Confirmation: skipped by decision; no code change. ## H04. No-op `if True:` wrapper in `causal_wan_model.py` @@ -73,7 +73,7 @@ Current status: H01, H02, and H04 implemented; remaining items pending discussio - Review: the server uses long-lived `session_id` as per-inference engine `request_id`, causing duplicate active request IDs for concurrent clients or repeated calls. - Link: https://github.com/vllm-project/vllm-omni/pull/2162#discussion_r3294078365 - File: `vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py` -- Current code status: `_build_request()` still sets `request_ids=[f"robot-{session_id}"]`. +- Current code status: implemented. `_build_request()` now keeps `session_id` in `extra_args` and generates a unique engine request ID per inference. 
- Risk: - Two websocket clients without explicit `session_id` both use `robot-default`. - Two clients sharing a logical session reuse the same engine request ID. @@ -87,7 +87,7 @@ Current status: H01, H02, and H04 implemented; remaining items pending discussio - Test plan: - Unit test `_build_request()` twice with the same `session_id` and verify `request_ids[0]` differs while `extra_args["session_id"]` stays the same. - Add a concurrency-oriented serving test if feasible, or at least a regression test for duplicate ID avoidance. -- Confirmation: pending. +- Confirmation: confirmed and implemented. ## H06. OpenPI clients do not surface msgpack structured errors diff --git a/tests/entrypoints/openai_api/test_openpi_serving.py b/tests/entrypoints/openai_api/test_openpi_serving.py index 670f3023b1c..8f482e55ea7 100644 --- a/tests/entrypoints/openai_api/test_openpi_serving.py +++ b/tests/entrypoints/openai_api/test_openpi_serving.py @@ -1,10 +1,17 @@ +import asyncio +import pickle +import threading +from concurrent.futures import ThreadPoolExecutor from types import SimpleNamespace +import numpy as np import pytest +from fastapi import FastAPI, WebSocket from omegaconf import OmegaConf +from starlette.testclient import TestClient from vllm_omni.diffusion.models.dreamzero import transform as dreamzero_transform -from vllm_omni.entrypoints.openai.realtime.robot import openpi_serving +from vllm_omni.entrypoints.openai.realtime.robot import openpi_connection, openpi_serving pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -23,6 +30,63 @@ def _engine_with_policy_config(policy_config=None): return SimpleNamespace(get_diffusion_od_config=lambda: od_config) +class RecordingEngine: + def __init__(self): + self.od_config = SimpleNamespace(model_config={"policy_server_config": TEST_POLICY_SERVER_CONFIG}) + self.generate_calls = [] + + def get_diffusion_od_config(self): + return self.od_config + + def generate(self, *, prompt, request_id, sampling_params_list): + async 
def _generate(): + self.generate_calls.append( + { + "prompt": prompt, + "request_id": request_id, + "sampling_params_list": sampling_params_list, + } + ) + yield SimpleNamespace(multimodal_output={"actions": [0.0]}) + + return _generate() + + +class ConcurrentRecordingEngine(RecordingEngine): + def __init__(self, *, expected_calls: int): + super().__init__() + self.expected_calls = expected_calls + self.condition = threading.Condition() + self.saw_overlap = False + + def _wait_for_expected_calls(self): + with self.condition: + completed = self.condition.wait_for( + lambda: len(self.generate_calls) >= self.expected_calls, + timeout=5.0, + ) + self.saw_overlap = self.saw_overlap or completed + + def generate(self, *, prompt, request_id, sampling_params_list): + async def _generate(): + with self.condition: + self.generate_calls.append( + { + "prompt": prompt, + "request_id": request_id, + "sampling_params_list": sampling_params_list, + } + ) + if len(self.generate_calls) >= self.expected_calls: + self.saw_overlap = True + self.condition.notify_all() + + await asyncio.to_thread(self._wait_for_expected_calls) + yield SimpleNamespace(multimodal_output={"actions": [0.0]}) + + return _generate() + + def test_ensure_transforms_loaded_fails_fast_on_import_error(monkeypatch): def fail_import(module_name): raise ModuleNotFoundError(f"missing module: {module_name}") @@ -126,16 +190,96 @@ def test_policy_server_config_reads_engine_model_config(): assert serving.policy_server_config.to_dict() == policy_config -def test_build_request_forwards_connection_session_state(): +def test_build_request_uses_unique_engine_request_id_per_inference(): serving = openpi_serving.ServingRealtimeRobotOpenPI(engine_client=_engine_with_policy_config()) - request = serving._build_request( + request_a = serving._build_request( {"prompt": "pick up the object"}, session_id="session-a", reset=True, ) + request_b = serving._build_request( + {"prompt": "pick up the object"}, + session_id="session-a", + 
reset=False, + ) - assert request.sampling_params.extra_args["reset"] is True - assert request.sampling_params.extra_args["session_id"] == "session-a" - assert request.sampling_params.extra_args["robot_obs"]["prompt"] == "pick up the object" - assert request.request_ids == ["robot-session-a"] + assert request_a.sampling_params.extra_args["reset"] is True + assert request_b.sampling_params.extra_args["reset"] is False + assert request_a.sampling_params.extra_args["session_id"] == "session-a" + assert request_b.sampling_params.extra_args["session_id"] == "session-a" + assert request_a.sampling_params.extra_args["robot_obs"]["prompt"] == "pick up the object" + assert request_b.sampling_params.extra_args["robot_obs"]["prompt"] == "pick up the object" + + assert request_a.request_ids == ["robot-session-a-0"] + assert request_b.request_ids == ["robot-session-a-1"] + assert request_a.request_ids[0] != request_b.request_ids[0] + + +def test_infer_keeps_session_state_but_uses_unique_engine_request_ids(): + engine = RecordingEngine() + serving = openpi_serving.ServingRealtimeRobotOpenPI(engine_client=engine) + + async def run_requests(): + await serving.infer({"prompt": "pick up the object"}, session_id="session-a", reset=True) + await serving.infer({"prompt": "pick up the object"}, session_id="session-a", reset=False) + + asyncio.run(run_requests()) + + assert [call["request_id"] for call in engine.generate_calls] == [ + "robot-session-a-0", + "robot-session-a-1", + ] + assert engine.generate_calls[0]["request_id"] != engine.generate_calls[1]["request_id"] + + sampling_params_a = engine.generate_calls[0]["sampling_params_list"][0] + sampling_params_b = engine.generate_calls[1]["sampling_params_list"][0] + assert sampling_params_a.extra_args["session_id"] == "session-a" + assert sampling_params_b.extra_args["session_id"] == "session-a" + assert sampling_params_a.extra_args["reset"] is True + assert sampling_params_b.extra_args["reset"] is False + + +def 
test_two_websocket_clients_without_session_id_do_not_conflict(monkeypatch): + monkeypatch.setattr(openpi_connection, "_pack", pickle.dumps) + monkeypatch.setattr(openpi_connection, "_unpack", pickle.loads) + + engine = ConcurrentRecordingEngine(expected_calls=2) + serving = openpi_serving.ServingRealtimeRobotOpenPI(engine_client=engine) + app = FastAPI() + + @app.websocket("/v1/realtime/robot/openpi") + async def openpi_endpoint(websocket: WebSocket): + connection = openpi_connection.RobotRealtimeConnection(websocket, serving) + await connection.handle_connection() + + def run_client(prompt: str): + with TestClient(app) as client: + with client.websocket_connect("/v1/realtime/robot/openpi") as websocket: + metadata = pickle.loads(websocket.receive_bytes()) + assert metadata["needs_session_id"] is True + + websocket.send_bytes(pickle.dumps({"prompt": prompt})) + actions = pickle.loads(websocket.receive_bytes()) + np.testing.assert_array_equal( + np.asarray(actions, dtype=np.float32), + np.asarray([0.0], dtype=np.float32), + ) + + with ThreadPoolExecutor(max_workers=2) as executor: + futures = [ + executor.submit(run_client, "first client"), + executor.submit(run_client, "second client"), + ] + for future in futures: + future.result(timeout=10.0) + + request_ids = [call["request_id"] for call in engine.generate_calls] + assert len(request_ids) == 2 + assert len(set(request_ids)) == 2 + assert all(request_id.startswith("robot-default-") for request_id in request_ids) + assert engine.saw_overlap is True + + sampling_params = [call["sampling_params_list"][0] for call in engine.generate_calls] + assert [params.extra_args["session_id"] for params in sampling_params] == ["default", "default"] + assert [params.extra_args["reset"] for params in sampling_params] == [True, True] diff --git a/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py b/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py index b58cbd988e2..46e46276a9c 100644 --- 
a/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py +++ b/vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py @@ -11,6 +11,7 @@ from collections.abc import Mapping from dataclasses import dataclass +from itertools import count from typing import Any import numpy as np @@ -72,6 +73,7 @@ def __init__( self.engine_client = engine_client self.model_name = model_name self.policy_server_config = self._get_policy_server_config(engine_client) + self._request_counter = count() @classmethod def create_policy_server( @@ -134,6 +136,10 @@ async def infer(self, obs: dict, *, session_id: str, reset: bool) -> np.ndarray: return self._extract_actions(result) + def _next_request_id(self, session_id: str) -> str: + """Return a unique engine request id while keeping session_id stateful.""" + return f"robot-{session_id}-{next(self._request_counter)}" + def _build_request(self, obs: dict, *, session_id: str, reset: bool) -> Any: """Build engine request from raw robot obs. @@ -154,7 +160,7 @@ def _build_request(self, obs: dict, *, session_id: str, reset: bool) -> Any: return OmniDiffusionRequest( prompts=[prompt], sampling_params=sampling_params, - request_ids=[f"robot-{session_id}"], + request_ids=[self._next_request_id(session_id)], ) def _extract_actions(self, result: Any) -> np.ndarray: From 90bf888a43fc7e896d5472c251c5321f976a76af Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Tue, 26 May 2026 00:49:51 +0000 Subject: [PATCH 43/45] fix: surface OpenPI structured client errors Decode msgpack error responses before converting OpenPI inference replies to action arrays so clients report the server error message instead of a NumPy conversion failure. Apply the same response handling to the DreamZero example client, test helper, and DROID sim-eval client. Add regression coverage for structured error and normal action payloads. 
Co-authored-by: Meng Signed-off-by: Yangshen Deng --- dreamzero_hsliu_review_todo.md | 10 ++--- .../dreamzero/droid_sim_eval_client.py | 14 ++++-- .../online_serving/dreamzero/openpi_client.py | 14 ++++-- tests/dreamzero/openpi_client_helper.py | 14 ++++-- tests/dreamzero/test_openpi_client_helper.py | 44 +++++++++++++++++++ 5 files changed, 82 insertions(+), 14 deletions(-) create mode 100644 tests/dreamzero/test_openpi_client_helper.py diff --git a/dreamzero_hsliu_review_todo.md b/dreamzero_hsliu_review_todo.md index 5a5bab88348..38c7f8a01e6 100644 --- a/dreamzero_hsliu_review_todo.md +++ b/dreamzero_hsliu_review_todo.md @@ -3,7 +3,7 @@ Scope: review comments by `hsliuustc0106` starting from: `assert is stripped under python -O. This check (and the 14 others in this file)...` -Current status: H01, H02, H04, and H05 implemented; H03 skipped by decision; remaining items pending discussion and confirmation. +Current status: H01, H02, H04, H05, and H06 implemented; H03 skipped by decision; remaining items pending discussion and confirmation. ## H01. Runtime `assert` in `causal_wan_model.py` @@ -97,7 +97,7 @@ Current status: H01, H02, H04, and H05 implemented; H03 skipped by decision; rem - `examples/online_serving/dreamzero/openpi_client.py` - `tests/dreamzero/openpi_client_helper.py` - `examples/online_serving/dreamzero/droid_sim_eval_client.py` -- Current code status: all three call `msgpack_numpy.unpackb(response)` and immediately convert to `np.asarray(..., dtype=np.float32)` in `infer()`. +- Current code status: implemented. All three clients now decode action responses through a helper that surfaces msgpack error dicts before converting normal action payloads to `np.float32`. - Risk: server-side inference errors are reported to users as confusing NumPy conversion `TypeError` instead of the real server error message. - Proposed fix: - Decode binary responses first. 
@@ -105,6 +105,6 @@ Current status: H01, H02, H04, and H05 implemented; H03 skipped by decision; rem - Otherwise convert decoded action payload to `np.float32`. - Apply the same helper logic to all three clients to avoid drift. - Test plan: - - Add or update client unit tests for msgpack structured error payloads. - - Verify normal array payloads still convert to `np.float32`. -- Confirmation: pending. + - Done: added `tests/dreamzero/test_openpi_client_helper.py` coverage for msgpack structured error payloads. + - Done: verified normal action payloads still convert to `np.float32`. +- Confirmation: confirmed and implemented. diff --git a/examples/online_serving/dreamzero/droid_sim_eval_client.py b/examples/online_serving/dreamzero/droid_sim_eval_client.py index da29ab84dba..40f0f417cb5 100644 --- a/examples/online_serving/dreamzero/droid_sim_eval_client.py +++ b/examples/online_serving/dreamzero/droid_sim_eval_client.py @@ -121,6 +121,16 @@ } +def _decode_action_response(response: bytes | str) -> np.ndarray: + if isinstance(response, str): + raise RuntimeError(f"Error in inference server:\n{response}") + decoded = msgpack_numpy.unpackb(response) + if isinstance(decoded, dict) and decoded.get("type") == "error": + message = decoded.get("message", decoded) + raise RuntimeError(f"Error in inference server:\n{message}") + return np.asarray(decoded, dtype=np.float32) + + @dataclass(frozen=True) class StepRecord: """One fully materialized rollout step for later JSON export. 
@@ -210,9 +220,7 @@ def infer(self, obs: dict[str, Any]) -> np.ndarray: payload["endpoint"] = "infer" self._ws.send(self._packer.pack(payload)) response = self._ws.recv() - if isinstance(response, str): - raise RuntimeError(f"Error in inference server:\n{response}") - return np.asarray(msgpack_numpy.unpackb(response), dtype=np.float32) + return _decode_action_response(response) @override def reset(self, reset_info: dict[str, Any] | None = None) -> str: diff --git a/examples/online_serving/dreamzero/openpi_client.py b/examples/online_serving/dreamzero/openpi_client.py index 3e1eab51834..114817a9b18 100755 --- a/examples/online_serving/dreamzero/openpi_client.py +++ b/examples/online_serving/dreamzero/openpi_client.py @@ -58,6 +58,16 @@ } +def _decode_action_response(response: bytes | str) -> np.ndarray: + if isinstance(response, str): + raise RuntimeError(f"Inference failed: {response}") + decoded = msgpack_numpy.unpackb(response) + if isinstance(decoded, dict) and decoded.get("type") == "error": + message = decoded.get("message", decoded) + raise RuntimeError(f"Inference failed: {message}") + return np.asarray(decoded, dtype=np.float32) + + @dataclass(frozen=True) class DreamZeroServerMetadata: image_resolution: tuple[int, int] @@ -129,9 +139,7 @@ def infer(self, obs: dict[str, Any]) -> np.ndarray: payload["endpoint"] = "infer" self._ws.send(self._packer.pack(payload)) response = self._ws.recv() - if isinstance(response, str): - raise RuntimeError(f"Inference failed: {response}") - return np.asarray(msgpack_numpy.unpackb(response), dtype=np.float32) + return _decode_action_response(response) def reset(self, reset_info: dict[str, Any] | None = None) -> str: payload = dict(reset_info or {}) diff --git a/tests/dreamzero/openpi_client_helper.py b/tests/dreamzero/openpi_client_helper.py index dc769ec1337..8918e5c36c5 100644 --- a/tests/dreamzero/openpi_client_helper.py +++ b/tests/dreamzero/openpi_client_helper.py @@ -41,6 +41,16 @@ } +def 
_decode_action_response(response: bytes | str) -> np.ndarray: + if isinstance(response, str): + raise RuntimeError(f"Inference failed: {response}") + decoded = msgpack_numpy.unpackb(response) + if isinstance(decoded, dict) and decoded.get("type") == "error": + message = decoded.get("message", decoded) + raise RuntimeError(f"Inference failed: {message}") + return np.asarray(decoded, dtype=np.float32) + + def require_dependencies() -> None: missing = [] if cv2 is None: @@ -124,9 +134,7 @@ def infer(self, obs: dict[str, Any]) -> np.ndarray: payload["endpoint"] = "infer" self._ws.send(self._packer.pack(payload)) response = self._ws.recv() - if isinstance(response, str): - raise RuntimeError(f"Inference failed: {response}") - return np.asarray(msgpack_numpy.unpackb(response), dtype=np.float32) + return _decode_action_response(response) def reset(self, reset_info: dict[str, Any] | None = None) -> str: payload = dict(reset_info or {}) diff --git a/tests/dreamzero/test_openpi_client_helper.py b/tests/dreamzero/test_openpi_client_helper.py new file mode 100644 index 00000000000..92d64061570 --- /dev/null +++ b/tests/dreamzero/test_openpi_client_helper.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pickle + +import numpy as np +import pytest + +from tests.dreamzero import openpi_client_helper + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +class FakeMsgpackNumpy: + @staticmethod + def packb(obj): + return pickle.dumps(obj) + + @staticmethod + def unpackb(data): + return pickle.loads(data) + + +def test_decode_action_response_surfaces_structured_error(monkeypatch): + monkeypatch.setattr(openpi_client_helper, "msgpack_numpy", FakeMsgpackNumpy) + payload = FakeMsgpackNumpy.packb( + { + "type": "error", + "message": "Internal inference error", + } + ) + + with pytest.raises(RuntimeError, match="Internal inference error"): + 
openpi_client_helper._decode_action_response(payload) + + +def test_decode_action_response_converts_action_payload_to_float32(monkeypatch): + monkeypatch.setattr(openpi_client_helper, "msgpack_numpy", FakeMsgpackNumpy) + payload = FakeMsgpackNumpy.packb(np.asarray([[1.0, 2.0]], dtype=np.float64)) + + actions = openpi_client_helper._decode_action_response(payload) + + assert actions.dtype == np.float32 + np.testing.assert_array_equal(actions, np.asarray([[1.0, 2.0]], dtype=np.float32)) From f90c51b5be033a62e26feb037ad9a86649c54827 Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Wed, 27 May 2026 11:22:13 +0000 Subject: [PATCH 44/45] remove md --- dreamzero_hsliu_review_todo.md | 110 --------------------------------- 1 file changed, 110 deletions(-) delete mode 100644 dreamzero_hsliu_review_todo.md diff --git a/dreamzero_hsliu_review_todo.md b/dreamzero_hsliu_review_todo.md deleted file mode 100644 index 38c7f8a01e6..00000000000 --- a/dreamzero_hsliu_review_todo.md +++ /dev/null @@ -1,110 +0,0 @@ -# DreamZero PR #2162 hsliuustc0106 Review TODO - -Scope: review comments by `hsliuustc0106` starting from: -`assert is stripped under python -O. This check (and the 14 others in this file)...` - -Current status: H01, H02, H04, H05, and H06 implemented; H03 skipped by decision; remaining items pending discussion and confirmation. - -## H01. Runtime `assert` in `causal_wan_model.py` - -- Review: `assert` is stripped under `python -O`; `assert kv_cache is not None` and the other runtime checks in this file should be explicit `if ...: raise ...`. -- Link: https://github.com/vllm-project/vllm-omni/pull/2162#discussion_r3294053083 -- File: `vllm_omni/diffusion/models/dreamzero/causal_wan_model.py` -- Current code status: implemented. Runtime asserts in `causal_wan_model.py` were replaced with explicit exceptions. 
-- Risk: production runs with optimization flags can silently remove these checks, turning intended validation failures into later `NoneType`, shape, or tensor operation errors. -- Proposed fix: - - Convert runtime validation asserts to explicit exceptions. - - Use `RuntimeError` for violated inference state invariants, such as missing `kv_cache`. - - Use `ValueError` for invalid config, unsupported model type, invalid tensor shapes, and invalid caller inputs. - - Keep test-file asserts unchanged. -- Test plan: - - Done: `python -O -m py_compile vllm_omni/diffusion/models/dreamzero/causal_wan_model.py`. - - Done: `rg` confirms no remaining `assert` or `if True` in `vllm_omni/diffusion/models/dreamzero`. -- Confirmation: confirmed and implemented. - -## H02. Runtime `assert` in `state_dreamzero.py` - -- Review: same issue for three `assert cache is not None` guards in `state_dreamzero.py`. -- Link: https://github.com/vllm-project/vllm-omni/pull/2162#discussion_r3294053104 -- File: `vllm_omni/diffusion/models/dreamzero/state_dreamzero.py` -- Current code status: implemented. The three cache initialization asserts were replaced with `RuntimeError`. -- Risk: under `python -O`, uninitialized cache access may propagate `None` and fail later with less actionable errors. -- Proposed fix: - - Replace the three asserts with explicit `if cache is None: raise RuntimeError(...)`. - - Keep messages specific to KV cache vs cross-attention cache. -- Test plan: - - Done: added `DreamZeroState` tests that call cache methods before `create_kv_caches()` and verify `RuntimeError`. - - Done: `python -O -m py_compile vllm_omni/diffusion/models/dreamzero/state_dreamzero.py`. -- Confirmation: confirmed and implemented. - -## H03. Duplicate embodiment ID for `mecka_hands` and `lapa` - -- Review: `mecka_hands` and `lapa` both map to embodiment ID `27`; if they should have distinct action-head weights, one prediction path is wrong. 
-- Link: https://github.com/vllm-project/vllm-omni/pull/2162#discussion_r3294053140 -- File: `vllm_omni/diffusion/models/dreamzero/utils.py` -- Current code status: both names map to `27`. -- Risk: if upstream expects different IDs, action conditioning selects the wrong embodiment embedding/domain for one of the two names. -- Proposed fix options: - - Option A: verify upstream DreamZero config/checkpoint mapping and update one ID if it is a typo. - - Option B: if the duplicate is intentional aliasing, keep the mapping and add a short comment explaining that `lapa` is an alias of the same embodiment ID. - - Option C: if unsupported/unknown, remove one alias and require explicit `embodiment_id`. -- Test plan: - - Update `tests/dreamzero/test_utils.py` to assert the intended mapping or alias behavior. -- Confirmation: skipped by decision; no code change. - -## H04. No-op `if True:` wrapper in `causal_wan_model.py` - -- Review: `if True:` is a no-op wrapper and should be removed with the body dedented. -- Link: https://github.com/vllm-project/vllm-omni/pull/2162#discussion_r3294060321 -- File: `vllm_omni/diffusion/models/dreamzero/causal_wan_model.py` -- Current code status: implemented. The no-op `if True:` wrapper was removed and the body was dedented. -- Risk: no runtime behavior issue, but it is dead/refactor artifact code and reduces readability. -- Proposed fix: - - Remove `if True:`. - - Dedent the body. - - Combine with H01 because both touch the same block. -- Test plan: - - Done: `python -O -m py_compile vllm_omni/diffusion/models/dreamzero/causal_wan_model.py`. - - Done: `tests/dreamzero/test_pipeline_state.py tests/dreamzero/test_utils.py`. -- Confirmation: confirmed and implemented. - -## H05. OpenPI serving reuses `session_id` as engine `request_id` - -- Review: the server uses long-lived `session_id` as per-inference engine `request_id`, causing duplicate active request IDs for concurrent clients or repeated calls. 
-- Link: https://github.com/vllm-project/vllm-omni/pull/2162#discussion_r3294078365 -- File: `vllm_omni/entrypoints/openai/realtime/robot/openpi_serving.py` -- Current code status: implemented. `_build_request()` now keeps `session_id` in `extra_args` and generates a unique engine request ID per inference. -- Risk: - - Two websocket clients without explicit `session_id` both use `robot-default`. - - Two clients sharing a logical session reuse the same engine request ID. - - One client can reuse the same active ID across sequential calls if a previous generation has not fully drained. - - Diffusion scheduler and `AsyncOmni.request_states` expect request IDs to be unique per inference. -- Proposed fix: - - Keep `session_id` only in `sampling_params.extra_args` for DreamZero state lookup. - - Generate a unique engine request ID per inference, for example `robot-{session_id}-{counter}` or `robot-{session_id}-{uuid}`. - - Prefer a per-serving-instance monotonic counter or UUID to avoid cross-connection collisions. - - Ensure logs still include session information for debugging. -- Test plan: - - Unit test `_build_request()` twice with the same `session_id` and verify `request_ids[0]` differs while `extra_args["session_id"]` stays the same. - - Add a concurrency-oriented serving test if feasible, or at least a regression test for duplicate ID avoidance. -- Confirmation: confirmed and implemented. - -## H06. OpenPI clients do not surface msgpack structured errors - -- Review: server errors are msgpack dicts like `{"type": "error", "message": ...}`, but clients only treat text frames as errors and then try to convert decoded dicts to action arrays. -- Link: https://github.com/vllm-project/vllm-omni/pull/2162#discussion_r3294078370 -- Files: - - `examples/online_serving/dreamzero/openpi_client.py` - - `tests/dreamzero/openpi_client_helper.py` - - `examples/online_serving/dreamzero/droid_sim_eval_client.py` -- Current code status: implemented. 
All three clients now decode action responses through a helper that surfaces msgpack error dicts before converting normal action payloads to `np.float32`. -- Risk: server-side inference errors are reported to users as confusing NumPy conversion `TypeError` instead of the real server error message. -- Proposed fix: - - Decode binary responses first. - - If decoded payload is a dict with `type == "error"`, raise `RuntimeError(decoded["message"])`. - - Otherwise convert decoded action payload to `np.float32`. - - Apply the same helper logic to all three clients to avoid drift. -- Test plan: - - Done: added `tests/dreamzero/test_openpi_client_helper.py` coverage for msgpack structured error payloads. - - Done: verified normal action payloads still convert to `np.float32`. -- Confirmation: confirmed and implemented. From 141a5b1ec548be6c672f25260d732578e321d03d Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Wed, 27 May 2026 12:27:49 +0000 Subject: [PATCH 45/45] remove tp run server --- .../dreamzero/run_server_with_tp2_config.sh | 10 ---------- third_party/dreamzero | 1 + third_party/lerobot | 1 + third_party/openpi | 1 + 4 files changed, 3 insertions(+), 10 deletions(-) delete mode 100644 examples/online_serving/dreamzero/run_server_with_tp2_config.sh create mode 160000 third_party/dreamzero create mode 160000 third_party/lerobot create mode 160000 third_party/openpi diff --git a/examples/online_serving/dreamzero/run_server_with_tp2_config.sh b/examples/online_serving/dreamzero/run_server_with_tp2_config.sh deleted file mode 100644 index 2510192b775..00000000000 --- a/examples/online_serving/dreamzero/run_server_with_tp2_config.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -export PYTORCH_ALLOC_CONF="${PYTORCH_ALLOC_CONF:-expandable_segments:True}" - -CUDA_VISIBLE_DEVICES=0,1 vllm serve GEAR-Dreams/DreamZero-DROID --omni \ - --host 127.0.0.1 --port 8000 \ - --served-model-name dreamzero-droid \ - --enforce-eager --disable-log-stats \ - 
--deploy-config vllm_omni/deploy/dreamzero_tp2_cfg1.yaml diff --git a/third_party/dreamzero b/third_party/dreamzero new file mode 160000 index 00000000000..d70a8025ac7 --- /dev/null +++ b/third_party/dreamzero @@ -0,0 +1 @@ +Subproject commit d70a8025ac77f7486f38032c718a1ca814a4d8c7 diff --git a/third_party/lerobot b/third_party/lerobot new file mode 160000 index 00000000000..017ff73fbfe --- /dev/null +++ b/third_party/lerobot @@ -0,0 +1 @@ +Subproject commit 017ff73fbfe46bf9a673cd9b402988dcb79151f7 diff --git a/third_party/openpi b/third_party/openpi new file mode 160000 index 00000000000..54cbaee6ae0 --- /dev/null +++ b/third_party/openpi @@ -0,0 +1 @@ +Subproject commit 54cbaee6ae0c010a1ed431871cdaa8f4684ac709