107 changes: 107 additions & 0 deletions examples/online_serving/hcx_omni/README.md
@@ -0,0 +1,107 @@
# HyperCLOVAX-SEED-Omni-8B with vLLM-Omni

[HyperCLOVAX-SEED-Omni-8B](https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B)
is an omni-modal model by NAVER Cloud that supports:

| Input | Output |
|--------|-----------------|
| Text | Text |
| Audio | Text + Audio |
| Image | Text |
| Text | Text + Image |
| Audio | Text + Audio + Image |
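
Output modalities are selected per request through the `modalities` field of the chat completions API (see the curl examples below). A minimal sketch with the OpenAI Python client, assuming the server from the Quick Start is running on localhost:

```python
from openai import OpenAI

# Point the standard OpenAI client at the vLLM-Omni server.
client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

response = client.chat.completions.create(
    model="naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
    modalities=["text", "image"],  # request text plus a generated image
    messages=[{"role": "user", "content": "Draw a cat."}],
)
print(response.choices[0].message.content)
```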

## Architecture

The model uses a 3-stage pipeline:

```
Stage 0 (Thinker) ──→ Stage 1 (Vision Decoder, diffusion)
        └───────────→ Stage 2 (Audio Decoder, unit-BigVGAN)
```

- **Thinker**: Qwen2.5-VL vision encoder + Qwen2Audio encoder + HyperCLOVAX language model.
  Emits text tokens as well as discrete audio/vision codes drawn from its token vocabulary.
- **Vision Decoder**: Diffusion-based image generator conditioned on 729 discrete TA-Tok codes.
- **Audio Decoder**: Unit-BigVGAN vocoder that synthesizes waveforms from CosyVoice2 FSQ discrete audio codes.
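
A conceptual sketch of the flow between stages (illustrative pseudocode only; the object and method names here are hypothetical, not the vLLM-Omni API):

```python
# Hypothetical names; sketches the stage routing described above.
def generate(request):
    # Stage 0: the thinker emits text tokens plus discrete
    # vision/audio codes from its shared vocabulary.
    text, vision_codes, audio_codes = thinker.generate(request)

    image = audio = None
    if "image" in request.modalities and vision_codes:
        # Stage 1: diffusion decoder conditioned on the 729 TA-Tok codes.
        image = vision_decoder.decode(vision_codes)
    if "audio" in request.modalities and audio_codes:
        # Stage 2: unit-BigVGAN vocoder renders FSQ codes to a waveform.
        audio = audio_decoder.decode(audio_codes)
    return text, image, audio
```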

## Hardware Requirements

| Setup | GPUs |
|-----------|---------------------------------------------|
| Default | 6 × GPU ≥24 GB (4 for thinker tensor parallelism, 1 each for the vision and audio decoders) |
| Minimal | 3 × GPU ≥24 GB (1 for the thinker, 1 each for the vision and audio decoders) |
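
A quick pre-flight check that enough GPUs are visible (a sketch using PyTorch; lower `required` to 3 for the minimal setup):

```python
import torch

required = 6  # default stage config; 3 for the minimal setup
found = torch.cuda.device_count()
assert found >= required, f"need {required} GPUs, found {found}"
```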

## Quick Start

### 1. Start the Server

```bash
# 6-GPU setup (production)
./run_server.sh --model naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B

# Custom GPU allocation
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 ./run_server.sh
```
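
All three stages load at startup, so the server can take a while to come up. One way to wait for readiness, assuming vLLM-Omni exposes the standard `/v1/models` endpoint like vLLM:

```python
import time

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
while True:
    try:
        client.models.list()  # succeeds once the server is serving
        break
    except Exception:
        time.sleep(5)  # still loading; retry
```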

### 2. Run the Client Demo

```bash
# All modes: text-only, text-to-vision, speech-to-speech
python client_demo.py --base-url http://localhost:8000/v1

# Speech-to-Speech with your own audio file
python client_demo.py --mode s2s --audio-file /path/to/speech.wav

# Text-to-Vision
python client_demo.py --mode t2v --prompt "고양이 그림을 그려줘"  # "Draw a picture of a cat"
```

### 3. Use the OpenAI API Directly

**Speech-to-Speech** (the Korean prompt asks "What does this audio contain?"):
```bash
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
    "modalities": ["text", "audio"],
    "messages": [{
      "role": "user",
      "content": [
        {"type": "input_audio", "input_audio": {"data": "<base64-wav>", "format": "wav"}},
        {"type": "text", "text": "이 오디오에 무슨 내용이 있나요?"}
      ]
    }]
  }'
```
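
The `<base64-wav>` placeholder is the audio file's raw bytes, base64-encoded, for example:

```python
import base64

with open("speech.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode()
```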

**Text-to-Vision** (the Korean prompt asks for "a picture of a cute puppy playing in a park"):
```bash
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
    "modalities": ["text", "image"],
    "messages": [{
      "role": "user",
      "content": [
        {"type": "text", "text": "귀여운 강아지 한 마리가 공원에서 뛰노는 그림을 그려줘."}
      ]
    }]
  }'
```
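
The generated image comes back base64-encoded in the response. A standard-library sketch that sends the same request and saves the image, assuming the `message.image.data` field used by `client_demo.py`:

```python
import base64
import json
import urllib.request

payload = {
    "model": "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
    "modalities": ["text", "image"],
    "messages": [{"role": "user", "content": "Draw a cute puppy playing in a park."}],
}
req = urllib.request.Request(
    "http://localhost:8000/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    message = json.load(resp)["choices"][0]["message"]
if message.get("image"):  # response field name assumed from client_demo.py
    with open("generated.png", "wb") as f:
        f.write(base64.b64decode(message["image"]["data"]))
```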

## Stage Config

The default stage config is at
`vllm_omni/model_executor/stage_configs/hcx_omni.yaml`.

Key parameters:

| Stage | Type | `model_arch` / `model_class_name` | GPU |
|-------|-----------|------------------------------------|-------|
| 0 | LLM | `HCXVisionV2ForCausalLM` | 0-3 |
| 1 | Diffusion | `HyperCLOVAXVisionPipeline` | 4 |
| 2 | Diffusion | `HyperCLOVAXAudioPipeline` | 5 |
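
To inspect the full stage config beyond these fields, loading the YAML directly is enough; a minimal sketch:

```python
import pprint

import yaml  # pip install pyyaml

with open("vllm_omni/model_executor/stage_configs/hcx_omni.yaml") as f:
    pprint.pp(yaml.safe_load(f))
```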
153 changes: 153 additions & 0 deletions examples/online_serving/hcx_omni/client_demo.py
@@ -0,0 +1,153 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""HyperCLOVAX-SEED-Omni-8B client demo.

Demonstrates Speech-to-Speech and Text-to-Vision via the OpenAI-compatible
HTTP API provided by vLLM-Omni.

Usage:
    # Start the server first (see run_server.sh), then:
    python client_demo.py --base-url http://localhost:8000/v1

    # With a local audio file:
    python client_demo.py --audio-file path/to/speech.wav

    # Text-to-Vision only ("Draw a picture of a cat"):
    python client_demo.py --mode t2v --prompt "고양이 그림을 그려줘"
"""
import argparse
import base64
import io
import sys
from pathlib import Path

try:
    from openai import OpenAI
except ImportError:
    print("Please install openai: pip install openai")
    sys.exit(1)


def encode_audio_file(path: str) -> str:
    """Base64-encode a WAV/MP3 file."""
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode()


def encode_audio_array(array, sample_rate: int = 16000) -> str:
    """Base64-encode a numpy audio array as WAV."""
    import numpy as np
    import scipy.io.wavfile as wav

    if not isinstance(array, np.ndarray):
        array = np.array(array)
    buf = io.BytesIO()
    # Clip to [-1, 1] before scaling so loud samples don't overflow int16.
    wav.write(buf, sample_rate, (np.clip(array, -1.0, 1.0) * 32767).astype(np.int16))
    return base64.b64encode(buf.getvalue()).decode()


def speech_to_speech(client: OpenAI, audio_b64: str, prompt: str = "이 오디오에 무슨 내용이 있나요?"):
    """Send audio → receive text + audio (default prompt asks "What does this audio contain?")."""
    print(f"\n[Speech-to-Speech] prompt: {prompt!r}")
    response = client.chat.completions.create(
        model="naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
        modalities=["text", "audio"],
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_audio",
                        "input_audio": {"data": audio_b64, "format": "wav"},
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
    )
    choice = response.choices[0]
    print(f"Text response: {choice.message.content}")
    if hasattr(choice.message, "audio") and choice.message.audio:
        audio_data = base64.b64decode(choice.message.audio.data)
        out_path = Path("/tmp/hcx_omni_response.wav")
        out_path.write_bytes(audio_data)
        print(f"Audio saved to: {out_path}")
    return response


def text_to_vision(client: OpenAI, prompt: str = "귀여운 강아지 한 마리가 공원에서 뛰노는 그림을 그려줘."):
    """Send text → receive text + image (default prompt: "Draw a cute puppy playing in a park.")."""
    print(f"\n[Text-to-Vision] prompt: {prompt!r}")
    response = client.chat.completions.create(
        model="naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
        modalities=["text", "image"],
        messages=[
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}],
            }
        ],
    )
    choice = response.choices[0]
    print(f"Text response: {choice.message.content}")
    if hasattr(choice.message, "image") and choice.message.image:
        img_data = base64.b64decode(choice.message.image.data)
        out_path = Path("/tmp/hcx_omni_generated.png")
        out_path.write_bytes(img_data)
        print(f"Image saved to: {out_path}")
    return response


def text_only(client: OpenAI, prompt: str = "대한민국의 수도는 어디인가요?"):
    """Pure text conversation, thinker only (default prompt: "What is the capital of South Korea?")."""
    print(f"\n[Text-only] prompt: {prompt!r}")
    response = client.chat.completions.create(
        model="naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
        modalities=["text"],
        messages=[
            {"role": "user", "content": prompt}
        ],
    )
    print(f"Response: {response.choices[0].message.content}")
    return response


def main():
    parser = argparse.ArgumentParser(description="HyperCLOVAX-SEED-Omni-8B demo")
    parser.add_argument("--base-url", default="http://localhost:8000/v1")
    parser.add_argument(
        "--mode",
        choices=["s2s", "t2v", "text", "all"],
        default="all",
        help="Demo mode: s2s=Speech-to-Speech, t2v=Text-to-Vision, text=Text-only, all=run all demos",
    )
    parser.add_argument("--audio-file", default=None, help="Path to input audio file")
    parser.add_argument("--prompt", default=None, help="Text prompt override")
    args = parser.parse_args()

    # "EMPTY" is a placeholder; the key is ignored unless the server enforces one.
    client = OpenAI(api_key="EMPTY", base_url=args.base_url)

    if args.mode in ("text", "all"):
        text_only(client, prompt=args.prompt or "대한민국의 수도는 어디인가요?")

    if args.mode in ("t2v", "all"):
        text_to_vision(client, prompt=args.prompt or "귀여운 강아지 한 마리가 공원에서 뛰노는 그림을 그려줘.")

    if args.mode in ("s2s", "all"):
        if args.audio_file:
            audio_b64 = encode_audio_file(args.audio_file)
        else:
            # Generate a synthetic 1-second 440 Hz sine wave as fallback input.
            try:
                import numpy as np
                t = np.linspace(0, 1, 16000, endpoint=False)
                audio_array = np.sin(2 * np.pi * 440 * t).astype(np.float32)
                audio_b64 = encode_audio_array(audio_array)
            except ImportError:
                print("numpy not available, skipping S2S demo")
                return
        speech_to_speech(client, audio_b64, prompt=args.prompt or "이 오디오에 무슨 내용이 있나요?")


if __name__ == "__main__":
    main()
52 changes: 52 additions & 0 deletions examples/online_serving/hcx_omni/run_server.sh
@@ -0,0 +1,52 @@
#!/bin/bash
# Launch HyperCLOVAX-SEED-Omni-8B with vLLM-Omni.
#
# Requirements:
#   - 6× GPUs (≥24 GB VRAM each):
#       GPU 0-3: Thinker (tensor_parallel_size=4)
#       GPU 4  : Vision decoder
#       GPU 5  : Audio decoder
#   - HF model: naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B
#
# Usage:
#   ./run_server.sh [--model MODEL] [--port PORT] [--host HOST] [--stage-configs-path PATH]

set -euo pipefail

MODEL="${MODEL:-naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B}"
PORT="${PORT:-8000}"
HOST="${HOST:-0.0.0.0}"
STAGE_CONFIG="${STAGE_CONFIG:-}"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DEFAULT_STAGE_CONFIG="$SCRIPT_DIR/../../../vllm_omni/model_executor/stage_configs/hcx_omni.yaml"

while [[ $# -gt 0 ]]; do
    case $1 in
        --model) MODEL="$2"; shift 2 ;;
        --port) PORT="$2"; shift 2 ;;
        --host) HOST="$2"; shift 2 ;;
        --stage-configs-path) STAGE_CONFIG="$2"; shift 2 ;;
        --help)
            echo "Usage: $0 [--model MODEL] [--port PORT] [--host HOST] [--stage-configs-path PATH]"
            exit 0 ;;
        *) echo "Unknown option: $1" >&2; exit 1 ;;
    esac
done

[[ -z "$STAGE_CONFIG" ]] && STAGE_CONFIG="$DEFAULT_STAGE_CONFIG"

echo "================================================="
echo " HyperCLOVAX-SEED-Omni-8B vLLM-Omni Server"
echo "================================================="
echo " Model : $MODEL"
echo " Stage config: $STAGE_CONFIG"
echo " Endpoint : http://$HOST:$PORT/v1"
echo "================================================="

python -m vllm_omni.entrypoints.openai.api_server \
    --model "$MODEL" \
    --stage-configs-path "$STAGE_CONFIG" \
    --port "$PORT" \
    --host "$HOST" \
    --trust-remote-code