From 925bb6c9b0b898083e437f1fdcd585d20b4a0a23 Mon Sep 17 00:00:00 2001 From: knlnguyen1802 Date: Tue, 6 Jan 2026 10:55:39 +0800 Subject: [PATCH 01/12] Add docs for wakeup and sleep Signed-off-by: knlnguyen1802 --- vllm_omni/diffusion/worker/gpu_worker.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/vllm_omni/diffusion/worker/gpu_worker.py b/vllm_omni/diffusion/worker/gpu_worker.py index 0f8a9b8553f..e267241befd 100644 --- a/vllm_omni/diffusion/worker/gpu_worker.py +++ b/vllm_omni/diffusion/worker/gpu_worker.py @@ -143,6 +143,16 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: return self.pipeline.load_weights(weights) def sleep(self, level: int = 1) -> bool: + """ + Put the worker to sleep. The worker should not process any requests. + The caller should guarantee that no requests are being processed + during the sleep period, before `wake_up` is called. + + Args: + level: The sleep level. Level 1 sleep will offload the model + weights and discard the kv cache. + Currently only level 1 is supported. + """ from vllm.device_allocator.cumem import CuMemAllocator free_bytes_before_sleep = torch.cuda.mem_get_info()[0] @@ -166,6 +176,17 @@ def sleep(self, level: int = 1) -> bool: return True def wake_up(self, tags: list[str] | None = None) -> bool: + """ + Wake up the worker from sleep mode. See the `sleep` + method for more details. + + Args: + tags: An optional list of tags to reallocate the worker memory + for specific memory allocations. Values must be in + `("weights",)`. If None, all memory is reallocated. + wake_up should be called with all tags (or None) before the + worker is used again. 
+ """ from vllm.device_allocator.cumem import CuMemAllocator allocator = CuMemAllocator.get_instance() From a294c13c4e32d5351c6f8af6ef33294dfe9773d0 Mon Sep 17 00:00:00 2001 From: knlnguyen1802 Date: Tue, 6 Jan 2026 11:42:54 +0800 Subject: [PATCH 02/12] Update docs Signed-off-by: knlnguyen1802 --- docs/features/sleep_mode.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 docs/features/sleep_mode.md diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md new file mode 100644 index 00000000000..dcffa14bce6 --- /dev/null +++ b/docs/features/sleep_mode.md @@ -0,0 +1,27 @@ +# Sleep Mode + +vLLM-Omni’s **Sleep Mode** allows you to temporarily release most GPU memory used by a model—such as model weights and key-value (KV) caches (for autoregressive models)—**without stopping the server or unloading the Docker container**. + +This feature is inherited from [vLLM’s Sleep Mode](https://blog.vllm.ai/2025/10/26/sleep-mode.html), which provides zero-reload model switching for multi-model serving. +It is especially useful in **RLHF**, **training**, or **cost-saving scenarios**, where GPU resources must be freed between inference workloads. + +--- + +## 🧠 Diffusion Model Extension + +We added Sleep Mode support for **diffusion models**, which previously lacked this functionality. +In diffusion pipelines, this currently only offloads **model weight memory**, as these models typically do not use KV caches. + +This means: +- Diffusion models can now enter Level 1 sleep. +- Pipeline states (e.g., noise schedulers, buffers) remain intact after waking. +- Useful for releasing VRAM between image generation or training cycles. 
+ +--- + +## Enable sleep mode +To enable sleep mode, set the `enable_sleep_mode` in `engine_args` to True +Example: +``` +omni = Omni(model=...,enable_sleep_mode=True) +``` \ No newline at end of file From 32317300fa118a2e7077daecf613f8adb0e6027d Mon Sep 17 00:00:00 2001 From: knlnguyen1802 Date: Tue, 6 Jan 2026 13:21:16 +0800 Subject: [PATCH 03/12] Update nav.yaml Signed-off-by: knlnguyen1802 --- docs/.nav.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/.nav.yml b/docs/.nav.yml index c85a01116d9..dfc21218bd0 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -33,6 +33,8 @@ nav: - Parallelism Acceleration: user_guide/acceleration/parallelism_acceleration.md - Models: - models/supported_models.md + - Features: + - Sleep Mode: features/sleep_mode.md - Developer Guide: - General: - contributing/README.md From abb17f63db2cf552d854ac5d9709a3a64ba633e9 Mon Sep 17 00:00:00 2001 From: knlnguyen1802 Date: Tue, 6 Jan 2026 13:22:21 +0800 Subject: [PATCH 04/12] Fix pre-commit Signed-off-by: knlnguyen1802 --- docs/features/sleep_mode.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md index dcffa14bce6..2696c7db116 100644 --- a/docs/features/sleep_mode.md +++ b/docs/features/sleep_mode.md @@ -24,4 +24,4 @@ To enable sleep mode, set the `enable_sleep_mode` in `engine_args` to True Example: ``` omni = Omni(model=...,enable_sleep_mode=True) -``` \ No newline at end of file +``` From f924bda110f67da8c15adb29c9761de30a92ea52 Mon Sep 17 00:00:00 2001 From: knlnguyen1802 Date: Tue, 6 Jan 2026 13:24:44 +0800 Subject: [PATCH 05/12] Update docs Signed-off-by: knlnguyen1802 --- docs/features/sleep_mode.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md index 2696c7db116..47aefcc2e59 100644 --- a/docs/features/sleep_mode.md +++ b/docs/features/sleep_mode.md @@ -7,6 +7,13 @@ It is especially useful in **RLHF**, **training**, or 
**cost-saving scenarios**, --- +## 🧠 Omni Model + +Omni models inherit this feature from vLLM’s Sleep Mode + +This mean: +- Both Level 1 and Level 2 sleep are supported, allowing model weights and the KV cache to be released and reset + ## 🧠 Diffusion Model Extension We added Sleep Mode support for **diffusion models**, which previously lacked this functionality. From d3a69504b98126ff16071b0cc2de22970de7f65f Mon Sep 17 00:00:00 2001 From: knlnguyen1802 Date: Tue, 6 Jan 2026 13:25:20 +0800 Subject: [PATCH 06/12] Fix pre-commit Signed-off-by: knlnguyen1802 --- docs/features/sleep_mode.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md index 47aefcc2e59..c6d3747d799 100644 --- a/docs/features/sleep_mode.md +++ b/docs/features/sleep_mode.md @@ -7,7 +7,7 @@ It is especially useful in **RLHF**, **training**, or **cost-saving scenarios**, --- -## 🧠 Omni Model +## 🧠 Omni Model Omni models inherit this feature from vLLM’s Sleep Mode From b54e2c31f39e4256695d5f8f4d93f9b52dac07f0 Mon Sep 17 00:00:00 2001 From: knlnguyen1802 Date: Tue, 6 Jan 2026 14:54:24 +0800 Subject: [PATCH 07/12] Update docs/features/sleep_mode.md Co-authored-by: Jiangyun Zhu Signed-off-by: knlnguyen1802 --- docs/features/sleep_mode.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md index c6d3747d799..81448c4b5e8 100644 --- a/docs/features/sleep_mode.md +++ b/docs/features/sleep_mode.md @@ -7,7 +7,7 @@ It is especially useful in **RLHF**, **training**, or **cost-saving scenarios**, --- -## 🧠 Omni Model +## Omni Model Omni models inherit this feature from vLLM’s Sleep Mode From 4944af223a6d5b16126360b901430ebd34e4cc64 Mon Sep 17 00:00:00 2001 From: knlnguyen1802 Date: Tue, 6 Jan 2026 14:54:32 +0800 Subject: [PATCH 08/12] Update docs/features/sleep_mode.md Co-authored-by: Jiangyun Zhu Signed-off-by: knlnguyen1802 --- docs/features/sleep_mode.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md index 81448c4b5e8..88646a71fd5 100644 --- a/docs/features/sleep_mode.md +++ b/docs/features/sleep_mode.md @@ -14,7 +14,7 @@ Omni models inherit this feature from vLLM’s Sleep Mode This mean: - Both Level 1 and Level 2 sleep are supported, allowing model weights and the KV cache to be released and reset -## 🧠 Diffusion Model Extension +## Diffusion Model Extension We added Sleep Mode support for **diffusion models**, which previously lacked this functionality. In diffusion pipelines, this currently only offloads **model weight memory**, as these models typically do not use KV caches. From 4b849b5cd6968807af35e2ef47b92427cd349445 Mon Sep 17 00:00:00 2001 From: knlnguyen1802 Date: Tue, 6 Jan 2026 14:54:54 +0800 Subject: [PATCH 09/12] Update docs/features/sleep_mode.md Co-authored-by: Jiangyun Zhu Signed-off-by: knlnguyen1802 --- docs/features/sleep_mode.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md index 88646a71fd5..f493828d365 100644 --- a/docs/features/sleep_mode.md +++ b/docs/features/sleep_mode.md @@ -29,6 +29,6 @@ This means: ## Enable sleep mode To enable sleep mode, set the `enable_sleep_mode` in `engine_args` to True Example: -``` +```python omni = Omni(model=...,enable_sleep_mode=True) ``` From 3c1b322a7bcf9f6615b8d34d8df1f80c211575f6 Mon Sep 17 00:00:00 2001 From: knlnguyen1802 Date: Tue, 6 Jan 2026 15:00:10 +0800 Subject: [PATCH 10/12] Fix docs gen Signed-off-by: knlnguyen1802 --- docs/features/sleep_mode.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md index f493828d365..83e0674575c 100644 --- a/docs/features/sleep_mode.md +++ b/docs/features/sleep_mode.md @@ -11,7 +11,8 @@ It is especially useful in **RLHF**, **training**, or **cost-saving scenarios**, Omni models inherit this feature from vLLM’s Sleep Mode -This 
mean: +This means: + - Both Level 1 and Level 2 sleep are supported, allowing model weights and the KV cache to be released and reset ## Diffusion Model Extension @@ -20,6 +21,7 @@ We added Sleep Mode support for **diffusion models**, which previously lacked th In diffusion pipelines, this currently only offloads **model weight memory**, as these models typically do not use KV caches. This means: + - Diffusion models can now enter Level 1 sleep. - Pipeline states (e.g., noise schedulers, buffers) remain intact after waking. - Useful for releasing VRAM between image generation or training cycles. From 3f58466592a88e417a4c2b6428cc34fb5b0d6ec3 Mon Sep 17 00:00:00 2001 From: knlnguyen1802 Date: Tue, 6 Jan 2026 15:00:53 +0800 Subject: [PATCH 11/12] Fix docs gen and pre-commit Signed-off-by: knlnguyen1802 --- docs/features/sleep_mode.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md index 83e0674575c..f4894b39c4d 100644 --- a/docs/features/sleep_mode.md +++ b/docs/features/sleep_mode.md @@ -29,7 +29,9 @@ This means: --- ## Enable sleep mode -To enable sleep mode, set the `enable_sleep_mode` in `engine_args` to True +To enable sleep mode, set the `enable_sleep_mode` in `engine_args` to `True` + + Example: ```python omni = Omni(model=...,enable_sleep_mode=True) From 9acafed6b27b2e28d6bda21df9d85ca79b077289 Mon Sep 17 00:00:00 2001 From: knlnguyen1802 Date: Tue, 6 Jan 2026 15:34:01 +0800 Subject: [PATCH 12/12] Resolve comment Signed-off-by: knlnguyen1802 --- docs/features/sleep_mode.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md index f4894b39c4d..41aa48c1735 100644 --- a/docs/features/sleep_mode.md +++ b/docs/features/sleep_mode.md @@ -3,6 +3,7 @@ vLLM-Omni’s **Sleep Mode** allows you to temporarily release most GPU memory used by a model—such as model weights and key-value (KV) caches (for autoregressive models)—**without stopping the server 
or unloading the Docker container**. This feature is inherited from [vLLM’s Sleep Mode](https://blog.vllm.ai/2025/10/26/sleep-mode.html), which provides zero-reload model switching for multi-model serving. + It is especially useful in **RLHF**, **training**, or **cost-saving scenarios**, where GPU resources must be freed between inference workloads. ---