Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/go-ci-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ jobs:

# necessary for testing proxy/Process swapping
- name: Create simple-responder
if: steps.restore-simple-responder.outputs.cache-hit != 'true'
shell: bash
run: make simple-responder-windows

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@ Built in Go for performance and simplicity, llama-swap has zero dependencies and
- `/ui` - web UI
- `/upstream/:model_id` - direct access to upstream server ([demo](https://github.com/mostlygeek/llama-swap/pull/31))
- `/models/unload` - manually unload running models ([#58](https://github.com/mostlygeek/llama-swap/issues/58))
- `/models/sleep/:model_id` - put a model to sleep (requires sleep/wake configuration)
- `/running` - list currently running models ([#61](https://github.com/mostlygeek/llama-swap/issues/61))
- `/log` - remote log monitoring
- `/health` - just returns "OK"
- ✅ Customizable
  - Run multiple models at once with `Groups` ([#107](https://github.com/mostlygeek/llama-swap/issues/107))
- Automatic unloading of models after timeout by setting a `ttl`
- Fast model switching with sleep/wake support (vLLM sleep mode, offload memory instead of full restart)
- Reliable Docker and Podman support using `cmd` and `cmdStop` together
  - Preload models on startup with `hooks` ([#235](https://github.com/mostlygeek/llama-swap/pull/235))

Expand Down
27 changes: 27 additions & 0 deletions cmd/simple-responder/simple-responder.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"net/http"
"os"
"os/signal"
"strings"
"syscall"
"time"

Expand Down Expand Up @@ -264,6 +265,32 @@ func main() {
c.JSON(200, gin.H{"status": "ok"})
})

// Sleep/wake endpoints
r.POST("/sleep", func(c *gin.Context) {
c.Status(http.StatusOK)
})

r.POST("/wake_up", func(c *gin.Context) {
c.Status(http.StatusOK)
})

r.POST("/wake_up_fail", func(c *gin.Context) {
c.Status(http.StatusInternalServerError)
})

r.POST("/collective_rpc", func(c *gin.Context) {
body, _ := io.ReadAll(c.Request.Body)
if strings.Contains(string(body), "reload_weights") {
c.Status(http.StatusOK)
} else {
c.Status(http.StatusBadRequest)
}
})

r.POST("/reset_prefix_cache", func(c *gin.Context) {
c.Status(http.StatusOK)
})

r.GET("/", func(c *gin.Context) {
c.Header("Content-Type", "text/plain")
c.String(200, fmt.Sprintf("%s %s", c.Request.Method, c.Request.URL.Path))
Expand Down
86 changes: 86 additions & 0 deletions config-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,18 @@
"default": 120,
"description": "Number of seconds to wait for a model to be ready to serve requests."
},
"sleepRequestTimeout": {
"type": "integer",
"minimum": 1,
"default": 10,
"description": "Number of seconds to wait for each sleep HTTP request to complete. Applies globally to all sleep endpoints unless overridden per-endpoint with timeout field."
},
"wakeRequestTimeout": {
"type": "integer",
"minimum": 1,
"default": 10,
"description": "Number of seconds to wait for each wake HTTP request to complete. Applies globally to all wake endpoints unless overridden per-endpoint with timeout field."
},
"logLevel": {
"type": "string",
"enum": [
Expand Down Expand Up @@ -214,6 +226,80 @@
"type": "boolean",
"default": false,
"description": "If true the model will not show up in /v1/models responses. It can still be used as normal in API requests."
},
"sleepMode": {
"type": "string",
"enum": ["enable", "disable"],
"default": "disable",
"description": "Explicitly controls sleep/wake behavior. 'enable' activates sleep/wake functionality and requires sleepEndpoints and wakeEndpoints to be defined."
},
"sleepEndpoints": {
"type": "array",
"items": {
"type": "object",
"required": ["endpoint"],
"properties": {
"endpoint": {
"type": "string",
"minLength": 1,
"description": "URL path for the sleep endpoint (e.g., /sleep?level=1)."
},
"method": {
"type": "string",
"enum": ["GET", "POST", "PUT", "PATCH"],
"default": "POST",
"description": "HTTP method to use for the request."
},
"body": {
"type": "string",
"default": "",
"description": "Optional request body (JSON string)."
},
"timeout": {
"type": "integer",
"minimum": 0,
"default": 0,
"description": "Optional per-endpoint timeout in seconds. 0 uses global sleepRequestTimeout."
}
},
"additionalProperties": false
},
"default": [],
"description": "Array of HTTP endpoints to call for putting the model to sleep. Requires sleepMode to be 'enable'. Endpoints are called sequentially in array order. Used instead of cmdStop during model swapping."
},
"wakeEndpoints": {
"type": "array",
"items": {
"type": "object",
"required": ["endpoint"],
"properties": {
"endpoint": {
"type": "string",
"minLength": 1,
"description": "URL path for the wake endpoint (e.g., /wake_up)."
},
"method": {
"type": "string",
"enum": ["GET", "POST", "PUT", "PATCH"],
"default": "POST",
"description": "HTTP method to use for the request."
},
"body": {
"type": "string",
"default": "",
"description": "Optional request body (JSON string)."
},
"timeout": {
"type": "integer",
"minimum": 0,
"default": 0,
"description": "Optional per-endpoint timeout in seconds. 0 uses global wakeRequestTimeout."
}
},
"additionalProperties": false
},
"default": [],
"description": "Array of HTTP endpoints to call for waking the model. Required when sleepMode is 'enable'. Endpoints are called sequentially in array order."
}
}
}
Expand Down
99 changes: 99 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,18 @@
# - minimum value is 15 seconds, anything less will be set to this value
healthCheckTimeout: 500

# sleepRequestTimeout: number of seconds to wait for each sleep HTTP request to complete
# - optional, default: 10
# - applies globally to all sleep endpoints unless overridden per-endpoint with timeout field
# - used when putting a model to sleep during model swapping
sleepRequestTimeout: 20

# wakeRequestTimeout: number of seconds to wait for each wake HTTP request to complete
# - optional, default: 10
# - applies globally to all wake endpoints unless overridden per-endpoint with timeout field
# - used when waking a model from sleep
wakeRequestTimeout: 20

# logLevel: sets the logging value
# - optional, default: info
# - Valid log levels: debug, info, warn, error
Expand Down Expand Up @@ -243,6 +255,93 @@ models:
# - processes have 5 seconds to shutdown until forceful termination is attempted
cmdStop: docker stop ${MODEL_ID}

# vLLM Sleep Mode Example - Level 1:
# vLLM supports sleep/wake functionality for fast model switching
# See: https://docs.vllm.ai/en/stable/features/sleep_mode.html
# Level 1: offload weights to CPU RAM (faster wake, higher RAM usage, single-step wake)
"vllm-sleep-level1":
# sleepMode: explicitly controls sleep/wake behavior
# - "enable": activates sleep/wake - requires sleepEndpoints and wakeEndpoints
# - "disable": disables sleep/wake - uses cmdStop instead
    # - (unset): defaults to "disable" - sleep mode disabled
sleepMode: enable

cmd: |
uv run python -m vllm.entrypoints.openai.api_server
--model /path/to/models/my-model
--served-model-name ${MODEL_ID}
--port ${PORT}
--enable-sleep-mode
env:
# Required to enable sleep mode in vLLM
- "VLLM_SERVER_DEV_MODE=1"

# sleepEndpoints: array of HTTP endpoints to call for putting the model to sleep
# - optional, default: []
# - if defined along with wakeEndpoints, used instead of cmdStop during model swapping
# - HTTP requests are sent to proxy base URL + endpoint
# - endpoints are called sequentially in array order
# - supports macro substitution: ${PORT}, ${MODEL_ID}
# - each endpoint can include query parameters: /sleep?level=1
# - vLLM sleep levels:
# - level 1: offload weights to CPU RAM (faster wake, higher RAM usage)
# - level 2: discard weights entirely (slower wake, minimal RAM usage)
sleepEndpoints:
- endpoint: /sleep?level=1
method: POST
# body is optional
# timeout is optional - overrides global sleepRequestTimeout for this specific endpoint

# wakeEndpoints: array of HTTP endpoints to call for waking the model
# - required if sleepEndpoints is defined
# - used when loading a sleeping model
# - HTTP requests are sent to proxy base URL + endpoint
# - endpoints are called sequentially in array order
# - level 1 sleep requires only single wake step
wakeEndpoints:
- endpoint: /wake_up
method: POST
# timeout is optional - overrides global wakeRequestTimeout for this specific endpoint

# vLLM Sleep Mode Example - Level 2:
# Level 2: discard weights entirely (slower wake, minimal RAM usage, multi-step wake)
# Requires a 3-step wake sequence to fully restore the model
"vllm-sleep-level2":
# Enable sleep/wake functionality
sleepMode: enable

cmd: |
uv run python -m vllm.entrypoints.openai.api_server
--model /path/to/models/my-large-model
--served-model-name ${MODEL_ID}
--port ${PORT}
--enable-sleep-mode
env:
# Required to enable sleep mode in vLLM
- "VLLM_SERVER_DEV_MODE=1"

# Level 2 sleep endpoint - discards weights for minimal RAM usage
sleepEndpoints:
- endpoint: /sleep?level=2
method: POST
# Optional: override global sleepRequestTimeout
timeout: 15

# Level 2 wake requires multi-step sequence to reload weights and reset cache
wakeEndpoints:
# Step 1: Wake the model
- endpoint: /wake_up
method: POST
# Step 2: Reload weights
- endpoint: /collective_rpc
method: POST
body: '{"method": "reload_weights"}'
# Optional: override timeout for this specific endpoint
timeout: 12
# Step 3: Reset the prefix cache
- endpoint: /reset_prefix_cache
method: POST

# groups: a dictionary of group settings
# - optional, default: empty dictionary
# - provides advanced controls over model swapping behaviour
Expand Down
Loading