vllm-project · Miguel0312 · May 6, 2026 · May 19, 2026 · May 19, 2026 · May 26, 2026
@@ -73,5 +73,6 @@ th {
 |`DyninOmniForConditionalGeneration` | Dynin-Omni | `snu-aidas/Dynin-Omni` | ✅︎ | | | |
 | `ErnieImagePipeline` | ERNIE-Image | `baidu/ERNIE-Image`, `baidu/ERNIE-Image-Turbo` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 |`HiDreamImagePipeline` | HiDream-I1-Full | `HiDream-ai/HiDream-I1-Full` | ✅︎ | ✅︎ | | |
+|`LingbotWorldFastPipeline`| Lingbot-World-Fast | `robbyant/lingbot-world-fast`|✅︎ | | | |
 
 ✅︎ indicates the model is supported on that backend. Empty cells mean not listed as supported on that backend.
@@ -0,0 +1,49 @@
+# Lingbot World Fast Offline Inference
+
+Lingbot World Fast is an autoregressive diffusion model that uses a reference image, a text prompt and a set of camera positions to generate a video.
+
+## Video Generation
+
+First, download the model weights using `examples/offline_inference/lingbot_world_fast/download_lingbot_world_fast.py`.
+
+The simplest way to run offline generation is to use the script at `examples/offline_inference/lingbot_world_fast/end2end.py`. The core of this script is:
+
+```python
+from vllm_omni.entrypoints.omni import Omni
+
+if __name__ == "__main__":
+    omni = Omni(model="lingbot_world/lingbot-world-base-cam/Lingbot-World-Fast", model_class_name="LingbotWorldFastPipeline")
+    outputs = omni.generate(
+        {
+            "prompt": "A journey along the Great Wall of China",
+            "multi_modal_data": {
+                "image": "input.png",
+                "camera": {
+                    "poses": np.load("path/to/poses.npy"),
+                    "intrinsics": np.load("path/to/intrinsics.npy"),
+                }
+            },
+        },
+        OmniDiffusionSamplingParams(
+            height=height,
+            width=width,
+            num_frames=num_frames,
+            frame_rate=fps,
+        ),
+    )
+    export_to_video(outputs[0], "output.mp4")
+```
+
+## Generation Parameters
+
+| Parameter             | Type  | Default | Description                         |
+| --------------------- | ----- | ------- | ----------------------------------- |
+| `height`              | int   | None (computed from image)    | Image height in pixels              |
+| `width`               | int   | None (computed from image)    | Image width in pixels               |
+| `num_frames`          | int   | 81                            | Number of frames to generate        |
+| `fps`                 | int   | 16                            | Frames per second                   |
+| `seed`                | int   | 42                            | Optional random seed                |
+| `prompt`              | str   | ""                            | Text prompt                         |
+| `negative_prompt`     | str   | None                          | Negative prompt                     |
+| `image`               | str   | Required                      | Path to reference image             |
+|`camera-path`          | str   | Required                      | Path to folder with `poses.npy` and `intrinsics.npy`|
@@ -0,0 +1,51 @@
+# Lingbot World Fast Online Serving
+
+Lingbot World Fast is an autoregressive diffusion model that uses a reference image, a text prompt and a set of camera positions to generate a video. The online serving mode of this model adds a feature that is not implemented in the original model: video extension.
+
+## Quickstart
+
+The easiest way to launch a server running the Lingbot World Fast model is by using the script `examples/online_serving/lingbot_world_fast/run_server.sh`.
+
+Once the server is launched, the client can send requests to its websocket at `/v1/realtime/world/camera`. The easiest way to interact with the server is using the script `examples/online_serving/lingbot_world_fast/openai_client.py`. Its command line options are described below.
+
+| Parameter             | Type  | Default | Description                         |
+| --------------------- | ----- | ------- | ----------------------------------- |
+| `height`              | int   | None (computed from image)    | Image height in pixels              |
+| `width`               | int   | None (computed from image)    | Image width in pixels               |
+| `num_frames`          | int   | 81                            | Number of frames to generate        |
+| `fps`                 | int   | 16                            | Frames per second                   |
+| `seed`                | int   | 42                            | Optional random seed                |
+| `prompt`              | str   | ""                            | Text prompt                         |
+| `negative_prompt`     | str   | None                          | Negative prompt                     |
+| `image`               | str   | Required                      | Path to reference image             |
+|`camera-path`          | str   | Required                      | Path to folder with `poses.npy` and `intrinsics.npy`|
+| `num-calls`               | int   | 1                      | Total number of calls; after the first one, makes `num-calls - 1` additional video extension calls, each with `num_frames` frames         |
+| `num-skip-frames`               | int   | 4                      | Number of leading frames to discard from each extension call, since the first few frames of an extension contain artifacts |
+| `session-id`               | str   | None                      | Session id to control whether to trigger a video extension call             |
+
+## Video Extension
+
+The idea of video extension is to allow the user to generate further frames for the same video efficiently. To make this possible, the vllm-omni implementation stores the KV-cache of the generated video by default. If the next request uses the same session-id, the pipeline enters extension mode, so the newly generated frames use the previously generated frames as context. Only the KV-cache is stored: no frame information, whether in latent space or RGB values, is kept on the server.
+
+This feature is limited by the fact that the model has not been trained to perform this task, so the user's ability to steer the generation is limited. Namely, the reference image and changes to the text prompt are ignored. The best tool the user has is to provide camera positions. In the end, video extension is more of a demonstration of the power and features of vLLM-Omni than of Lingbot World itself.
+
+## API
+
+The server uses a websocket endpoint located at `/v1/realtime/world/camera`. It makes available two tasks: `infer` and `reset` which can be controlled by the "endpoint" key of the request.
+
+By default, the server uses the `infer` task, which checks the `session-id` field and compares it to the one used on the last infer call. If they are the same, it triggers an extension call at the pipeline level; otherwise, it generates the video from scratch. Note that only the KV-cache of the last request is stored, to mitigate out-of-memory problems on the GPU. Also note that when doing an extension task, no reference image should be provided (it would be ignored anyway).
+
+The `reset` endpoint does not immediately evict the KV cache in the GPU, but instead it forces a reset on the next `infer` call independently of the value of `session-id`.
+
+The endpoint sends the resulting frames in groups of 4 to mitigate packet loss problems. It is the client's role to concatenate the different frames to obtain the final video.
+
+## Example materials
+
+??? abstract "run_server.sh"
+    ``````sh
+    --8<-- "examples/online_serving/lingbot_world_fast/run_server.sh"
+    ``````
+??? abstract "openai_client.py"
+    ``````py
+    --8<-- "examples/online_serving/lingbot_world_fast/openai_client.py"
+    ``````
@@ -0,0 +1,79 @@
+import argparse
+import json
+import os
+import tempfile
+import time
+from pathlib import Path
+
+from huggingface_hub import snapshot_download
+
+# NOTE(review): none of the constants below are referenced anywhere else in
+# this script as shown — confirm whether they are still needed.
+# Upstream Lingbot World repository URL and the branch name to use.
+DEPENDENCY_REPO = "https://github.com/Robbyant/lingbot-world"
+DEPENDENCY_BRANCH = "main"
+# Directory under the system temporary directory, plus a lock-file path and a
+# checkout path located inside it.
+CACHE_DIR = Path(tempfile.gettempdir()) / "vllm-omni-dependency"
+LOCK_FILE = CACHE_DIR / ".install.lock"
+DEPENDENCY_DIR = CACHE_DIR / "Lingbot-World"
+
+
+def timed_download(repo_id: str, local_dir: str, allow_patterns: list | None = None):
+    """Download files from HF repo and log time + destination."""
+    if os.path.exists(local_dir):
+        print(f"Directory {local_dir} already exists. Skipping download.")
+        return
+    print(f"Starting download from {repo_id} into {local_dir}")
+    start_time = time.time()
+
+    snapshot_download(
+        repo_id=repo_id,
+        local_dir=local_dir,
+        allow_patterns=allow_patterns,
+    )
+
+    elapsed = time.time() - start_time
+    print(f"✅ Finished downloading {repo_id} in {elapsed:.2f} seconds. Files saved at: {local_dir}")
+
+
+def main(output_dir: str):
+    lingbot_base_dir = os.path.join(output_dir, "lingbot-world-base-cam")
+
+    # Base Model
+    timed_download(
+        repo_id="robbyant/lingbot-world-base-cam",
+        local_dir=lingbot_base_dir,
+        allow_patterns=["google/*", "models_t5_umt5-xxl-enc-bf16.pth", "Wan2.1_VAE.pth"],
+    )
+
+    lingbot_fast_dir = os.path.join(lingbot_base_dir, "Lingbot-World-Fast")
+
+    timed_download(repo_id="robbyant/lingbot-world-fast", local_dir=lingbot_fast_dir)
+
+    # Lingbot World does not come with config.json which is required by diffusers
+    config = {
+        "_class_name": "WanModel",
+        "_diffusers_version": "0.33.0",
+        "dim": 5120,
+        "eps": 1e-06,
+        "ffn_dim": 13824,
+        "freq_dim": 256,
+        "in_dim": 36,
+        "model_type": "lingbot_world_fast",
+        "num_heads": 40,
+        "num_layers": 40,
+        "out_dim": 16,
+        "text_len": 512,
+    }
+
+    config_path = os.path.join(output_dir, "lingbot-world-base-cam", "Lingbot-World-Fast", "config.json")
+
+    with open(config_path, "w", encoding="utf-8") as f:
+        json.dump(config, f, indent=2)
+
+    print(f"config.json created at {config_path}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Download models from Hugging Face")
+    parser.add_argument(
+        "--output-dir", type=str, default="./lingbot_world", help="Base directory to save downloaded models"
+    )
+    args = parser.parse_args()
+    main(args.output_dir)