diff --git a/examples/online_serving/text_to_video/README.md b/examples/online_serving/text_to_video/README.md new file mode 100644 index 00000000000..1eeda09a871 --- /dev/null +++ b/examples/online_serving/text_to_video/README.md @@ -0,0 +1,143 @@ +# Text-To-Video + +This example demonstrates how to deploy Wan2.2 video models for online video generation +using vLLM-Omni. The API base is `v1/chat/completions`. + +## Start Server + +### Text-to-Video (T2V) + +```bash +vllm serve Wan-AI/Wan2.2-T2V-A14B-Diffusers --omni --port 8093 \ + --boundary-ratio 0.875 \ + --flow-shift 5.0 +``` + +### Image-to-Video (I2V) + +```bash +vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers --omni --port 8094 \ + --boundary-ratio 0.875 \ + --flow-shift 5.0 +``` + +Or use the startup script: + +```bash +bash run_server.sh +``` + +## API Calls + +### Method 1: Using curl (Text-to-Video) + +```bash +bash run_curl_text_to_video.sh +``` + +### Method 2: Using curl (Image-to-Video) + +```bash +bash run_curl_image_to_video.sh input.png "A cinematic slow zoom into the scene" +``` + +## Request Format + +### Text-to-Video + +```json +{ + "messages": [ + {"role": "user", "content": "A serene lakeside sunrise with mist over the water."} + ], + "extra_body": { + "height": 720, + "width": 1280, + "num_frames": 81, + "num_inference_steps": 40, + "guidance_scale": 4.0, + "guidance_scale_2": 4.0, + "seed": 42, + "fps": 24 + } +} +``` + +### Image-to-Video + +```json +{ + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Make the scene come alive with gentle motion"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,..." }} + ] + } + ], + "extra_body": { + "height": 720, + "width": 1280, + "num_frames": 81, + "num_inference_steps": 40, + "guidance_scale": 4.0, + "seed": 42, + "fps": 24 + } +} +``` + +## Generation Parameters (extra_body) + +| Parameter | Type | Default | Description | +| ------------------------ | ----- | ------- | ---------------------------------------------- | +| `height` | int | None | Video height in pixels | +| `width` | int | None | Video width in pixels | +| `num_frames` | int | None | Number of frames to generate | +| `num_inference_steps` | int | 50 | Number of denoising steps | +| `guidance_scale` | float | None | CFG scale | +| `guidance_scale_2` | float | None | Optional high-noise CFG (Wan2.2) | +| `seed` | int | None | Random seed (reproducible) | +| `negative_prompt` | str | None | Negative prompt | +| `num_outputs_per_prompt` | int | 1 | Number of videos to generate | +| `fps` | int | 24 | Output video FPS (used for MP4 encoding only) | + +## Response Format + +```json +{ + "id": "chatcmpl-xxx", + "created": 1234567890, + "model": "Wan-AI/Wan2.2-T2V-A14B-Diffusers", + "choices": [{ + "index": 0, + "message": { + "role": "assistant", + "content": [{ + "type": "video_url", + "video_url": { + "url": "data:video/mp4;base64,..." + } + }] + }, + "finish_reason": "stop" + }], + "usage": {...} +} +``` + +## Extract Video + +```bash +cat response.json | jq -r '.choices[0].message.content[0].video_url.url' \ + | sed 's/^data:video[^,]*,\s*//' | base64 -d > output.mp4 +``` + +## File Description + +| File | Description | +| ---------------------------- | ------------------------------ | +| `run_server.sh` | Server startup script | +| `run_curl_text_to_video.sh` | Text-to-video curl example | +| `run_curl_image_to_video.sh` | Image-to-video curl example | diff --git a/examples/online_serving/text_to_video/run_curl_image_to_video.sh b/examples/online_serving/text_to_video/run_curl_image_to_video.sh new file mode 100644 index 00000000000..f3421a0fab9 --- /dev/null +++ b/examples/online_serving/text_to_video/run_curl_image_to_video.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Wan2.2 image-to-video curl example + +SERVER="${SERVER:-http://localhost:8094}" +INPUT_IMAGE="${1:-input.png}" +PROMPT="${2:-Make the scene come alive with gentle motion.}" +CURRENT_TIME=$(date +%Y%m%d%H%M%S) +OUTPUT="${OUTPUT:-wan22_i2v_${CURRENT_TIME}.mp4}" + +if [ ! -f "$INPUT_IMAGE" ]; then + echo "Input image not found: $INPUT_IMAGE" + exit 1 +fi + +IMG_B64=$(base64 -w0 "$INPUT_IMAGE") + +echo "Generating video..." +echo "Prompt: $PROMPT" +echo "Input: $INPUT_IMAGE" +echo "Output: $OUTPUT" + +curl -s "$SERVER/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"messages\": [{ + \"role\": \"user\", + \"content\": [ + {\"type\": \"text\", \"text\": \"$PROMPT\"}, + {\"type\": \"image_url\", \"image_url\": {\"url\": \"data:image/png;base64,$IMG_B64\"}} + ] + }], + \"extra_body\": { + \"height\": 720, + \"width\": 1280, + \"num_frames\": 81, + \"num_inference_steps\": 40, + \"guidance_scale\": 4.0, + \"seed\": 42, + \"fps\": 24 + } + }" | jq -r '.choices[0].message.content[0].video_url.url' \ + | sed 's/^data:video[^,]*,\s*//' | base64 -d > "$OUTPUT" + +if [ -f "$OUTPUT" ]; then + echo "Video saved to: $OUTPUT" + echo "Size: $(du -h "$OUTPUT" | cut -f1)" +else + echo "Failed to generate video" + exit 1 +fi diff --git a/examples/online_serving/text_to_video/run_curl_text_to_video.sh b/examples/online_serving/text_to_video/run_curl_text_to_video.sh new file mode 100644 index 00000000000..4b2cdf7b228 --- /dev/null +++ b/examples/online_serving/text_to_video/run_curl_text_to_video.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Wan2.2 text-to-video curl example + +SERVER="${SERVER:-http://localhost:8093}" +PROMPT="${PROMPT:-A serene lakeside sunrise with mist over the water.}" +CURRENT_TIME=$(date +%Y%m%d%H%M%S) +OUTPUT="${OUTPUT:-wan22_t2v_${CURRENT_TIME}.mp4}" + +echo "Generating video..." +echo "Prompt: $PROMPT" +echo "Output: $OUTPUT" + +curl -s "$SERVER/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"messages\": [ + {\"role\": \"user\", \"content\": \"$PROMPT\"} + ], + \"extra_body\": { + \"height\": 720, + \"width\": 1280, + \"num_frames\": 81, + \"num_inference_steps\": 40, + \"guidance_scale\": 4.0, + \"guidance_scale_2\": 4.0, + \"seed\": 42, + \"fps\": 24 + } + }" | jq -r '.choices[0].message.content[0].video_url.url' \ + | sed 's/^data:video[^,]*,\s*//' | base64 -d > "$OUTPUT" + +if [ -f "$OUTPUT" ]; then + echo "Video saved to: $OUTPUT" + echo "Size: $(du -h "$OUTPUT" | cut -f1)" +else + echo "Failed to generate video" + exit 1 +fi diff --git a/examples/online_serving/text_to_video/run_server.sh b/examples/online_serving/text_to_video/run_server.sh new file mode 100644 index 00000000000..f77f5bc83af --- /dev/null +++ b/examples/online_serving/text_to_video/run_server.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Wan2.2 video generation online serving startup script + +MODEL="${MODEL:-Wan-AI/Wan2.2-T2V-A14B-Diffusers}" +PORT="${PORT:-8093}" +BOUNDARY_RATIO="${BOUNDARY_RATIO:-0.875}" +FLOW_SHIFT="${FLOW_SHIFT:-5.0}" + +echo "Starting Wan2.2 server..." +echo "Model: $MODEL" +echo "Port: $PORT" +echo "Boundary ratio: $BOUNDARY_RATIO" +echo "Flow shift: $FLOW_SHIFT" + +vllm serve "$MODEL" --omni \ + --port "$PORT" \ + --boundary-ratio "$BOUNDARY_RATIO" \ + --flow-shift "$FLOW_SHIFT" diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 1f225e5b979..152bddb73d4 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -86,7 +86,7 @@ class OmniOpenAIServingChat(OpenAIServingChat, AudioMixin): This class extends OpenAIServingChat to support: - Standard LLM chat completions - - Diffusion model image generation via chat interface + - Diffusion model image/video generation via chat interface For diffusion mode, use the `for_diffusion` class method to create an instance. """ @@ -133,7 +133,7 @@ async def create_chat_completion( for the API specification. This API mimics the OpenAI Chat Completion API. - For diffusion models, this generates images and returns them + For diffusion models, this generates images or videos and returns them in a chat completion response format. """ # Handle diffusion mode @@ -1007,6 +1007,174 @@ def _create_image_choice(self, omni_outputs: OmniRequestOutput, role: str): return choices + def _extract_diffusion_outputs(self, result: Any) -> list[Any]: + outputs: list[Any] = [] + if result is None: + return outputs + + if isinstance(result, OmniRequestOutput): + if result.images: + outputs.extend(result.images) + elif result.request_output is not None: + outputs.extend(self._extract_diffusion_outputs(result.request_output)) + return outputs + + if isinstance(result, dict): + if result.get("images"): + outputs.extend(result["images"]) + return outputs + + if hasattr(result, "images") and result.images: + outputs.extend(result.images) + elif hasattr(result, "request_output") and result.request_output is not None: + outputs.extend(self._extract_diffusion_outputs(result.request_output)) + + return outputs + + def _is_pil_sequence(self, item: Any) -> bool: + return isinstance(item, (list, tuple)) and item and all(isinstance(x, Image.Image) for x in item) + + def _is_array_like(self, item: Any) -> bool: + try: + import torch + except ImportError: + torch = None + import numpy as np + + array_types = (np.ndarray,) if torch is None else (np.ndarray, torch.Tensor) + return isinstance(item, array_types) + + def _get_array_ndim(self, item: Any) -> int | None: + import numpy as np + + if isinstance(item, np.ndarray): + return item.ndim + try: + import torch + except ImportError: + return None + if isinstance(item, torch.Tensor): + return item.dim() + return None + + def _split_video_batch(self, video: Any) -> list[Any]: + ndim = self._get_array_ndim(video) + if ndim == 5: + return [video[i] for i in range(video.shape[0])] + return [video] + + def _split_video_sequence(self, items: list[Any]) -> list[Any]: + has_batched_videos = any((self._get_array_ndim(item) or 0) >= 4 for item in items) + if has_batched_videos: + videos = [] + for item in items: + videos.extend(self._split_video_batch(item)) + return videos + return [items] + + def _split_diffusion_visual_outputs(self, outputs: list[Any]) -> tuple[list[Image.Image], list[Any]]: + images: list[Image.Image] = [] + videos: list[Any] = [] + + for item in outputs: + if isinstance(item, Image.Image): + images.append(item) + continue + if self._is_pil_sequence(item): + images.extend(item) + continue + if isinstance(item, (list, tuple)) and item and all(self._is_array_like(x) for x in item): + videos.extend(self._split_video_sequence(list(item))) + continue + if self._is_array_like(item): + videos.extend(self._split_video_batch(item)) + continue + logger.warning("Unsupported diffusion output type: %s", type(item)) + + return images, videos + + def _normalize_video_frame(self, frame): + import numpy as np + + if frame.dtype == np.uint8: + return frame.astype(np.float32) / 255.0 + + frame = frame.astype(np.float32) + if frame.min() < 0: + frame = np.clip(frame, -1.0, 1.0) + frame = frame * 0.5 + 0.5 + elif frame.max() > 1.0: + frame = np.clip(frame, 0.0, 255.0) / 255.0 + else: + frame = np.clip(frame, 0.0, 1.0) + return frame + + def _ensure_frame_hwc(self, frame): + import numpy as np + + if frame.ndim == 2: + return np.expand_dims(frame, axis=-1) + if frame.ndim == 3 and frame.shape[0] in (1, 3, 4) and frame.shape[-1] not in (1, 3, 4): + return np.transpose(frame, (1, 2, 0)) + return frame + + def _coerce_video_frames(self, video: Any) -> list[Any]: + import numpy as np + + try: + import torch + except ImportError: + torch = None + + def to_numpy(item): + if torch is not None and isinstance(item, torch.Tensor): + return item.detach().cpu().numpy() + if isinstance(item, np.ndarray): + return item + return np.asarray(item) + + if isinstance(video, (list, tuple)): + frames = [] + for frame in video: + frame_arr = self._ensure_frame_hwc(to_numpy(frame)) + frames.append(self._normalize_video_frame(frame_arr)) + return frames + + arr = to_numpy(video) + if arr.ndim == 5: + arr = arr[0] + + if arr.ndim == 4: + if arr.shape[0] in (1, 3, 4) and arr.shape[-1] not in (1, 3, 4): + arr = np.transpose(arr, (1, 2, 3, 0)) + elif arr.shape[-1] not in (1, 3, 4) and arr.shape[1] in (1, 3, 4): + arr = np.transpose(arr, (0, 2, 3, 1)) + return [self._normalize_video_frame(self._ensure_frame_hwc(arr[i])) for i in range(arr.shape[0])] + + if arr.ndim == 3: + return [self._normalize_video_frame(self._ensure_frame_hwc(arr))] + + raise ValueError(f"Unsupported video shape: {getattr(arr, 'shape', None)}") + + def _encode_video_base64(self, video: Any, fps: int) -> str: + try: + from diffusers.utils import export_to_video + except ImportError as exc: + raise RuntimeError("diffusers is required for export_to_video.") from exc + + frames = self._coerce_video_frames(video) + if not frames: + raise ValueError("No frames available for video export") + + import os + import tempfile + + with tempfile.TemporaryDirectory() as tmpdir: + video_path = os.path.join(tmpdir, f"video_{uuid.uuid4().hex}.mp4") + export_to_video(frames, video_path, fps=fps) + with open(video_path, "rb") as f: + return base64.b64encode(f.read()).decode("utf-8") + # ==================== Diffusion Mode Methods ==================== async def _create_diffusion_chat_completion( @@ -1014,7 +1182,7 @@ async def _create_diffusion_chat_completion( request: ChatCompletionRequest, raw_request: Request | None = None, ) -> ChatCompletionResponse | ErrorResponse: - """Generate images via chat completion interface for diffusion models. + """Generate images or videos via chat completion interface for diffusion models. Args: request: Chat completion request @@ -1070,6 +1238,7 @@ async def _create_diffusion_chat_completion( # Text-to-video parameters (ref: text_to_video.py) num_frames = extra_body.get("num_frames") + fps = extra_body.get("fps") guidance_scale_2 = extra_body.get("guidance_scale_2") # For video high-noise CFG logger.info( @@ -1111,6 +1280,8 @@ async def _create_diffusion_chat_completion( # Add video generation parameters if set if num_frames is not None: gen_kwargs["num_frames"] = num_frames + if fps is not None: + gen_kwargs["fps"] = fps if guidance_scale_2 is not None: gen_kwargs["guidance_scale_2"] = guidance_scale_2 @@ -1150,33 +1321,46 @@ async def _create_diffusion_chat_completion( else: # AsyncOmniDiffusion: direct call result = await self._diffusion_engine.generate(**gen_kwargs) - # Extract images from result - # Handle nested OmniRequestOutput structure where images might be in request_output - images: list[Image.Image] = [] - if result.request_output["images"]: - images = result.request_output["images"] + outputs = self._extract_diffusion_outputs(result) + images, videos = self._split_diffusion_visual_outputs(outputs) + + video_contents: list[dict[str, Any]] = [] + if videos: + video_fps = int(fps) if fps is not None else 24 + for video in videos: + video_base64 = self._encode_video_base64(video, video_fps) + video_contents.append( + { + "type": "video_url", + "video_url": { + "url": f"data:video/mp4;base64,{video_base64}", + }, + } + ) - # Convert images to base64 content image_contents: list[dict[str, Any]] = [] - for img in images: - with BytesIO() as buffer: - img.save(buffer, format="PNG") - img_bytes = buffer.getvalue() - img_base64 = base64.b64encode(img_bytes).decode("utf-8") - image_contents.append( - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{img_base64}", - }, - } - ) + if images and not video_contents: + for img in images: + with BytesIO() as buffer: + img.save(buffer, format="PNG") + img_bytes = buffer.getvalue() + img_base64 = base64.b64encode(img_bytes).decode("utf-8") + image_contents.append( + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{img_base64}", + }, + } + ) # Build response - if not image_contents: - content = "Image generation completed but no images were produced." - else: + if video_contents: + content = video_contents + elif image_contents: content = image_contents + else: + content = "Generation completed but no visual outputs were produced." # Use model_construct to bypass validation for multimodal content # (ChatMessage.content only accepts str, but we need list for images) @@ -1211,9 +1395,10 @@ async def _create_diffusion_chat_completion( ) logger.info( - "Diffusion chat completed for request %s: %d images", + "Diffusion chat completed for request %s: %d images, %d videos", request_id, len(images), + len(video_contents), ) return response @@ -1221,7 +1406,7 @@ async def _create_diffusion_chat_completion( except Exception as e: logger.exception("Diffusion chat completion failed: %s", e) return self._create_error_response( - f"Image generation failed: {str(e)}", + f"Diffusion generation failed: {str(e)}", status_code=500, )