From 925bb6c9b0b898083e437f1fdcd585d20b4a0a23 Mon Sep 17 00:00:00 2001
From: knlnguyen1802 <knlnguyen1802@gmail.com>
Date: Tue, 6 Jan 2026 10:55:39 +0800
Subject: [PATCH 01/12] Add docs for wakeup and sleep

Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
---
 vllm_omni/diffusion/worker/gpu_worker.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/vllm_omni/diffusion/worker/gpu_worker.py b/vllm_omni/diffusion/worker/gpu_worker.py
index 0f8a9b8553f..e267241befd 100644
--- a/vllm_omni/diffusion/worker/gpu_worker.py
+++ b/vllm_omni/diffusion/worker/gpu_worker.py
@@ -143,6 +143,16 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         return self.pipeline.load_weights(weights)
 
     def sleep(self, level: int = 1) -> bool:
+        """
+        Put the worker to sleep. The worker should not process any requests.
+        The caller should guarantee that no requests are being processed
+        during the sleep period, before `wake_up` is called.
+
+        Args:
+            level: The sleep level. Level 1 sleep will offload the model
+                weights and discard the kv cache.
+                Currently, only level 1 is supported.
+        """
         from vllm.device_allocator.cumem import CuMemAllocator
 
         free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
@@ -166,6 +176,17 @@ def sleep(self, level: int = 1) -> bool:
         return True
 
     def wake_up(self, tags: list[str] | None = None) -> bool:
+        """
+        Wake up the worker from sleep mode. See the `sleep`
+        method for more details.
+
+        Args:
+            tags: An optional list of tags to reallocate the worker memory
+                for specific memory allocations. Values must be in
+                `("weights",)`. If None, all memory is reallocated.
+                wake_up should be called with all tags (or None) before the
+                worker is used again.
+        """
         from vllm.device_allocator.cumem import CuMemAllocator
 
         allocator = CuMemAllocator.get_instance()

From a294c13c4e32d5351c6f8af6ef33294dfe9773d0 Mon Sep 17 00:00:00 2001
From: knlnguyen1802 <knlnguyen1802@gmail.com>
Date: Tue, 6 Jan 2026 11:42:54 +0800
Subject: [PATCH 02/12] Update docs

Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
---
 docs/features/sleep_mode.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 docs/features/sleep_mode.md

diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md
new file mode 100644
index 00000000000..dcffa14bce6
--- /dev/null
+++ b/docs/features/sleep_mode.md
@@ -0,0 +1,27 @@
+# Sleep Mode
+
+vLLM-Omni’s **Sleep Mode** allows you to temporarily release most GPU memory used by a model—such as model weights and key-value (KV) caches (for autoregressive models)—**without stopping the server or unloading the Docker container**.
+
+This feature is inherited from [vLLM’s Sleep Mode](https://blog.vllm.ai/2025/10/26/sleep-mode.html), which provides zero-reload model switching for multi-model serving.  
+It is especially useful in **RLHF**, **training**, or **cost-saving scenarios**, where GPU resources must be freed between inference workloads.
+
+---
+
+## 🧠 Diffusion Model Extension
+
+We added Sleep Mode support for **diffusion models**, which previously lacked this functionality.  
+In diffusion pipelines, this currently only offloads **model weight memory**, as these models typically do not use KV caches.
+
+This means:
+- Diffusion models can now enter Level 1 sleep.
+- Pipeline states (e.g., noise schedulers, buffers) remain intact after waking.
+- Useful for releasing VRAM between image generation or training cycles.
+
+---
+
+## Enable sleep mode
+To enable sleep mode, set the `enable_sleep_mode` in `engine_args` to True
+Example:
+```
+omni = Omni(model=...,enable_sleep_mode=True)
+```
\ No newline at end of file

From 32317300fa118a2e7077daecf613f8adb0e6027d Mon Sep 17 00:00:00 2001
From: knlnguyen1802 <knlnguyen1802@gmail.com>
Date: Tue, 6 Jan 2026 13:21:16 +0800
Subject: [PATCH 03/12] Update nav.yaml

Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
---
 docs/.nav.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/.nav.yml b/docs/.nav.yml
index c85a01116d9..dfc21218bd0 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -33,6 +33,8 @@ nav:
       - Parallelism Acceleration: user_guide/acceleration/parallelism_acceleration.md
   - Models:
     - models/supported_models.md
+  - Features:
+    - Sleep Mode: features/sleep_mode.md
 - Developer Guide:
   - General:
     - contributing/README.md

From abb17f63db2cf552d854ac5d9709a3a64ba633e9 Mon Sep 17 00:00:00 2001
From: knlnguyen1802 <knlnguyen1802@gmail.com>
Date: Tue, 6 Jan 2026 13:22:21 +0800
Subject: [PATCH 04/12] Fix pre-commit

Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
---
 docs/features/sleep_mode.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md
index dcffa14bce6..2696c7db116 100644
--- a/docs/features/sleep_mode.md
+++ b/docs/features/sleep_mode.md
@@ -24,4 +24,4 @@ To enable sleep mode, set the `enable_sleep_mode` in `engine_args` to True
 Example:
 ```
 omni = Omni(model=...,enable_sleep_mode=True)
-```
\ No newline at end of file
+```

From f924bda110f67da8c15adb29c9761de30a92ea52 Mon Sep 17 00:00:00 2001
From: knlnguyen1802 <knlnguyen1802@gmail.com>
Date: Tue, 6 Jan 2026 13:24:44 +0800
Subject: [PATCH 05/12] Update docs

Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
---
 docs/features/sleep_mode.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md
index 2696c7db116..47aefcc2e59 100644
--- a/docs/features/sleep_mode.md
+++ b/docs/features/sleep_mode.md
@@ -7,6 +7,13 @@ It is especially useful in **RLHF**, **training**, or **cost-saving scenarios**,
 
 ---
 
+## 🧠 Omni Model 
+
+Omni model inherit the feature from vLLM' Sleep Mode
+
+This mean:
+- Support both Level 1 and Level 2 sleep, allow to release and reset both model weights and KV Cache
+
 ## 🧠 Diffusion Model Extension
 
 We added Sleep Mode support for **diffusion models**, which previously lacked this functionality.  

From d3a69504b98126ff16071b0cc2de22970de7f65f Mon Sep 17 00:00:00 2001
From: knlnguyen1802 <knlnguyen1802@gmail.com>
Date: Tue, 6 Jan 2026 13:25:20 +0800
Subject: [PATCH 06/12] Fix pre-commit

Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
---
 docs/features/sleep_mode.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md
index 47aefcc2e59..c6d3747d799 100644
--- a/docs/features/sleep_mode.md
+++ b/docs/features/sleep_mode.md
@@ -7,7 +7,7 @@ It is especially useful in **RLHF**, **training**, or **cost-saving scenarios**,
 
 ---
 
-## 🧠 Omni Model 
+## 🧠 Omni Model
 
 Omni model inherit the feature from vLLM' Sleep Mode
 

From b54e2c31f39e4256695d5f8f4d93f9b52dac07f0 Mon Sep 17 00:00:00 2001
From: knlnguyen1802 <knlnguyen1802@gmail.com>
Date: Tue, 6 Jan 2026 14:54:24 +0800
Subject: [PATCH 07/12] Update docs/features/sleep_mode.md

Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
---
 docs/features/sleep_mode.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md
index c6d3747d799..81448c4b5e8 100644
--- a/docs/features/sleep_mode.md
+++ b/docs/features/sleep_mode.md
@@ -7,7 +7,7 @@ It is especially useful in **RLHF**, **training**, or **cost-saving scenarios**,
 
 ---
 
-## 🧠 Omni Model
+## Omni Model
 
 Omni model inherit the feature from vLLM' Sleep Mode
 

From 4944af223a6d5b16126360b901430ebd34e4cc64 Mon Sep 17 00:00:00 2001
From: knlnguyen1802 <knlnguyen1802@gmail.com>
Date: Tue, 6 Jan 2026 14:54:32 +0800
Subject: [PATCH 08/12] Update docs/features/sleep_mode.md

Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
---
 docs/features/sleep_mode.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md
index 81448c4b5e8..88646a71fd5 100644
--- a/docs/features/sleep_mode.md
+++ b/docs/features/sleep_mode.md
@@ -14,7 +14,7 @@ Omni model inherit the feature from vLLM' Sleep Mode
 This mean:
 - Support both Level 1 and Level 2 sleep, allow to release and reset both model weights and KV Cache
 
-## 🧠 Diffusion Model Extension
+## Diffusion Model Extension
 
 We added Sleep Mode support for **diffusion models**, which previously lacked this functionality.  
 In diffusion pipelines, this currently only offloads **model weight memory**, as these models typically do not use KV caches.

From 4b849b5cd6968807af35e2ef47b92427cd349445 Mon Sep 17 00:00:00 2001
From: knlnguyen1802 <knlnguyen1802@gmail.com>
Date: Tue, 6 Jan 2026 14:54:54 +0800
Subject: [PATCH 09/12] Update docs/features/sleep_mode.md

Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
---
 docs/features/sleep_mode.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md
index 88646a71fd5..f493828d365 100644
--- a/docs/features/sleep_mode.md
+++ b/docs/features/sleep_mode.md
@@ -29,6 +29,6 @@ This means:
 ## Enable sleep mode
 To enable sleep mode, set the `enable_sleep_mode` in `engine_args` to True
 Example:
-```
+```python
 omni = Omni(model=...,enable_sleep_mode=True)
 ```

From 3c1b322a7bcf9f6615b8d34d8df1f80c211575f6 Mon Sep 17 00:00:00 2001
From: knlnguyen1802 <knlnguyen1802@gmail.com>
Date: Tue, 6 Jan 2026 15:00:10 +0800
Subject: [PATCH 10/12] Fix docs gen

Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
---
 docs/features/sleep_mode.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md
index f493828d365..83e0674575c 100644
--- a/docs/features/sleep_mode.md
+++ b/docs/features/sleep_mode.md
@@ -11,7 +11,8 @@ It is especially useful in **RLHF**, **training**, or **cost-saving scenarios**,
 
 Omni model inherit the feature from vLLM' Sleep Mode
 
-This mean:
+This means:
+
 - Support both Level 1 and Level 2 sleep, allow to release and reset both model weights and KV Cache
 
 ## Diffusion Model Extension
@@ -20,6 +21,7 @@ We added Sleep Mode support for **diffusion models**, which previously lacked th
 In diffusion pipelines, this currently only offloads **model weight memory**, as these models typically do not use KV caches.
 
 This means:
+
 - Diffusion models can now enter Level 1 sleep.
 - Pipeline states (e.g., noise schedulers, buffers) remain intact after waking.
 - Useful for releasing VRAM between image generation or training cycles.

From 3f58466592a88e417a4c2b6428cc34fb5b0d6ec3 Mon Sep 17 00:00:00 2001
From: knlnguyen1802 <knlnguyen1802@gmail.com>
Date: Tue, 6 Jan 2026 15:00:53 +0800
Subject: [PATCH 11/12] Fix docs gen and pre-commit

Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
---
 docs/features/sleep_mode.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md
index 83e0674575c..f4894b39c4d 100644
--- a/docs/features/sleep_mode.md
+++ b/docs/features/sleep_mode.md
@@ -29,7 +29,9 @@ This means:
 ---
 
 ## Enable sleep mode
-To enable sleep mode, set the `enable_sleep_mode` in `engine_args` to True
+To enable sleep mode, set `enable_sleep_mode` in `engine_args` to `True`.
+
+
 Example:
 ```python
 omni = Omni(model=...,enable_sleep_mode=True)

From 9acafed6b27b2e28d6bda21df9d85ca79b077289 Mon Sep 17 00:00:00 2001
From: knlnguyen1802 <knlnguyen1802@gmail.com>
Date: Tue, 6 Jan 2026 15:34:01 +0800
Subject: [PATCH 12/12] Resolve comment

Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
---
 docs/features/sleep_mode.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md
index f4894b39c4d..41aa48c1735 100644
--- a/docs/features/sleep_mode.md
+++ b/docs/features/sleep_mode.md
@@ -3,6 +3,7 @@
 vLLM-Omni’s **Sleep Mode** allows you to temporarily release most GPU memory used by a model—such as model weights and key-value (KV) caches (for autoregressive models)—**without stopping the server or unloading the Docker container**.
 
 This feature is inherited from [vLLM’s Sleep Mode](https://blog.vllm.ai/2025/10/26/sleep-mode.html), which provides zero-reload model switching for multi-model serving.  
+
 It is especially useful in **RLHF**, **training**, or **cost-saving scenarios**, where GPU resources must be freed between inference workloads.
 
 ---