Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/go-ci-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ jobs:

# necessary for testing proxy/Process swapping
- name: Create simple-responder
if: steps.restore-simple-responder.outputs.cache-hit != 'true'
shell: bash
run: make simple-responder-windows

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@ Built in Go for performance and simplicity, llama-swap has zero dependencies and
- `/ui` - web UI
- `/upstream/:model_id` - direct access to upstream server ([demo](https://github.com/mostlygeek/llama-swap/pull/31))
- `/models/unload` - manually unload running models ([#58](https://github.com/mostlygeek/llama-swap/issues/58))
- `/models/sleep/:model_id` - put a model to sleep (requires sleep/wake configuration)
- `/running` - list currently running models ([#61](https://github.com/mostlygeek/llama-swap/issues/61))
- `/log` - remote log monitoring
- `/health` - just returns "OK"
- ✅ Customizable
  - Run multiple models at once with `Groups` ([#107](https://github.com/mostlygeek/llama-swap/issues/107))
- Automatic unloading of models after timeout by setting a `ttl`
- Fast model switching with sleep/wake support (vLLM sleep mode, offload memory instead of full restart)
- Reliable Docker and Podman support using `cmd` and `cmdStop` together
  - Preload models on startup with `hooks` ([#235](https://github.com/mostlygeek/llama-swap/pull/235))

Expand Down
27 changes: 27 additions & 0 deletions cmd/simple-responder/simple-responder.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"net/http"
"os"
"os/signal"
"strings"
"syscall"
"time"

Expand Down Expand Up @@ -264,6 +265,32 @@ func main() {
c.JSON(200, gin.H{"status": "ok"})
})

// Sleep/wake endpoints
r.POST("/sleep", func(c *gin.Context) {
c.Status(http.StatusOK)
})

r.POST("/wake_up", func(c *gin.Context) {
c.Status(http.StatusOK)
})

r.POST("/wake_up_fail", func(c *gin.Context) {
c.Status(http.StatusInternalServerError)
})

r.POST("/collective_rpc", func(c *gin.Context) {
body, _ := io.ReadAll(c.Request.Body)
if strings.Contains(string(body), "reload_weights") {
c.Status(http.StatusOK)
} else {
c.Status(http.StatusBadRequest)
}
})

r.POST("/reset_prefix_cache", func(c *gin.Context) {
c.Status(http.StatusOK)
})

r.GET("/", func(c *gin.Context) {
c.Header("Content-Type", "text/plain")
c.String(200, fmt.Sprintf("%s %s", c.Request.Method, c.Request.URL.Path))
Expand Down
86 changes: 86 additions & 0 deletions config-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,18 @@
"default": 120,
"description": "Number of seconds to wait for a model to be ready to serve requests."
},
"sleepRequestTimeout": {
"type": "integer",
"minimum": 1,
"default": 10,
"description": "Number of seconds to wait for each sleep HTTP request to complete. Applies globally to all sleep endpoints unless overridden per-endpoint with timeout field."
},
"wakeRequestTimeout": {
"type": "integer",
"minimum": 1,
"default": 10,
"description": "Number of seconds to wait for each wake HTTP request to complete. Applies globally to all wake endpoints unless overridden per-endpoint with timeout field."
},
"logLevel": {
"type": "string",
"enum": [
Expand Down Expand Up @@ -214,6 +226,80 @@
"type": "boolean",
"default": false,
"description": "If true the model will not show up in /v1/models responses. It can still be used as normal in API requests."
},
"sleepMode": {
"type": "string",
"enum": ["enable", "disable"],
"default": "disable",
"description": "Explicitly controls sleep/wake behavior. 'enable' activates sleep/wake functionality and requires sleepEndpoints and wakeEndpoints to be defined."
},
"sleepEndpoints": {
"type": "array",
"items": {
"type": "object",
"required": ["endpoint"],
"properties": {
"endpoint": {
"type": "string",
"minLength": 1,
"description": "URL path for the sleep endpoint (e.g., /sleep?level=1)."
},
"method": {
"type": "string",
"enum": ["GET", "POST", "PUT", "PATCH"],
"default": "POST",
"description": "HTTP method to use for the request."
},
"body": {
"type": "string",
"default": "",
"description": "Optional request body (JSON string)."
},
"timeout": {
"type": "integer",
"minimum": 0,
"default": 0,
"description": "Optional per-endpoint timeout in seconds. 0 uses global sleepRequestTimeout."
}
},
"additionalProperties": false
},
"default": [],
"description": "Array of HTTP endpoints to call for putting the model to sleep. Requires sleepMode to be 'enable'. Endpoints are called sequentially in array order. Used instead of cmdStop during model swapping."
},
"wakeEndpoints": {
"type": "array",
"items": {
"type": "object",
"required": ["endpoint"],
"properties": {
"endpoint": {
"type": "string",
"minLength": 1,
"description": "URL path for the wake endpoint (e.g., /wake_up)."
},
"method": {
"type": "string",
"enum": ["GET", "POST", "PUT", "PATCH"],
"default": "POST",
"description": "HTTP method to use for the request."
},
"body": {
"type": "string",
"default": "",
"description": "Optional request body (JSON string)."
},
"timeout": {
"type": "integer",
"minimum": 0,
"default": 0,
"description": "Optional per-endpoint timeout in seconds. 0 uses global wakeRequestTimeout."
}
},
"additionalProperties": false
},
"default": [],
"description": "Array of HTTP endpoints to call for waking the model. Required when sleepMode is 'enable'. Endpoints are called sequentially in array order."
}
}
}
Expand Down
99 changes: 99 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,18 @@
# - minimum value is 15 seconds, anything less will be set to this value
healthCheckTimeout: 500

# sleepRequestTimeout: number of seconds to wait for each sleep HTTP request to complete
# - optional, default: 10
# - applies globally to all sleep endpoints unless overridden per-endpoint with timeout field
# - used when putting a model to sleep during model swapping
sleepRequestTimeout: 20

# wakeRequestTimeout: number of seconds to wait for each wake HTTP request to complete
# - optional, default: 10
# - applies globally to all wake endpoints unless overridden per-endpoint with timeout field
# - used when waking a model from sleep
wakeRequestTimeout: 20

# logLevel: sets the logging value
# - optional, default: info
# - Valid log levels: debug, info, warn, error
Expand Down Expand Up @@ -243,6 +255,93 @@ models:
# - processes have 5 seconds to shutdown until forceful termination is attempted
cmdStop: docker stop ${MODEL_ID}

# vLLM Sleep Mode Example - Level 1:
# vLLM supports sleep/wake functionality for fast model switching
# See: https://docs.vllm.ai/en/stable/features/sleep_mode.html
# Level 1: offload weights to CPU RAM (faster wake, higher RAM usage, single-step wake)
"vllm-sleep-level1":
# sleepMode: explicitly controls sleep/wake behavior
# - "enable": activates sleep/wake - requires sleepEndpoints and wakeEndpoints
# - "disable": disables sleep/wake - uses cmdStop instead
    # - (unset): defaults to "disable" - sleep mode disabled
sleepMode: enable

cmd: |
uv run python -m vllm.entrypoints.openai.api_server
--model /path/to/models/my-model
--served-model-name ${MODEL_ID}
--port ${PORT}
--enable-sleep-mode
env:
# Required to enable sleep mode in vLLM
- "VLLM_SERVER_DEV_MODE=1"

# sleepEndpoints: array of HTTP endpoints to call for putting the model to sleep
# - optional, default: []
# - if defined along with wakeEndpoints, used instead of cmdStop during model swapping
# - HTTP requests are sent to proxy base URL + endpoint
# - endpoints are called sequentially in array order
# - supports macro substitution: ${PORT}, ${MODEL_ID}
# - each endpoint can include query parameters: /sleep?level=1
# - vLLM sleep levels:
# - level 1: offload weights to CPU RAM (faster wake, higher RAM usage)
# - level 2: discard weights entirely (slower wake, minimal RAM usage)
sleepEndpoints:
- endpoint: /sleep?level=1
method: POST
# body is optional
# timeout is optional - overrides global sleepRequestTimeout for this specific endpoint

# wakeEndpoints: array of HTTP endpoints to call for waking the model
# - required if sleepEndpoints is defined
# - used when loading a sleeping model
# - HTTP requests are sent to proxy base URL + endpoint
# - endpoints are called sequentially in array order
# - level 1 sleep requires only single wake step
wakeEndpoints:
- endpoint: /wake_up
method: POST
# timeout is optional - overrides global wakeRequestTimeout for this specific endpoint

# vLLM Sleep Mode Example - Level 2:
# Level 2: discard weights entirely (slower wake, minimal RAM usage, multi-step wake)
# Requires a 3-step wake sequence to fully restore the model
"vllm-sleep-level2":
# Enable sleep/wake functionality
sleepMode: enable

cmd: |
uv run python -m vllm.entrypoints.openai.api_server
--model /path/to/models/my-large-model
--served-model-name ${MODEL_ID}
--port ${PORT}
--enable-sleep-mode
env:
# Required to enable sleep mode in vLLM
- "VLLM_SERVER_DEV_MODE=1"

# Level 2 sleep endpoint - discards weights for minimal RAM usage
sleepEndpoints:
- endpoint: /sleep?level=2
method: POST
# Optional: override global sleepRequestTimeout
timeout: 15

# Level 2 wake requires multi-step sequence to reload weights and reset cache
wakeEndpoints:
# Step 1: Wake the model
- endpoint: /wake_up
method: POST
# Step 2: Reload weights
- endpoint: /collective_rpc
method: POST
body: '{"method": "reload_weights"}'
# Optional: override timeout for this specific endpoint
timeout: 12
# Step 3: Reset the prefix cache
- endpoint: /reset_prefix_cache
method: POST

# groups: a dictionary of group settings
# - optional, default: empty dictionary
# - provides advanced controls over model swapping behaviour
Expand Down
Loading