Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Built in Go for performance and simplicity, llama-swap has zero dependencies and
- Request timeout protection with `requestTimeout` to prevent runaway inference
- Reliable Docker and Podman support using `cmd` and `cmdStop` together
- Preload models on startup with `hooks` ([#235](https://github.com/mostlygeek/llama-swap/pull/235))
- RPC health checking for distributed inference - conditionally expose models based on RPC server availability

### Web UI

Expand Down Expand Up @@ -175,6 +176,7 @@ Almost all configuration settings are optional and can be added one step at a ti
- `useModelName` to override model names sent to upstream servers
- `${PORT}` automatic port variables for dynamic port assignment
- `filters` rewrite parts of requests before sending to the upstream server
- `rpcHealthCheck` monitor RPC server health for distributed inference models

See the [configuration documentation](docs/configuration.md) for all options.

Expand Down
5 changes: 5 additions & 0 deletions config-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,11 @@
"type": "boolean",
"default": false,
"description": "If true the model will not show up in /v1/models responses. It can still be used as normal in API requests."
},
"rpcHealthCheck": {
"type": "boolean",
"default": false,
"description": "Enable TCP health checks for RPC endpoints specified in cmd. When enabled, parses --rpc host:port[,host:port,...] from cmd and performs health checks every 30 seconds. Models with unhealthy RPC endpoints are filtered from /v1/models and return 503 on inference requests."
}
}
}
Expand Down
18 changes: 18 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,24 @@ models:
unlisted: true
cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0

# RPC health check example for distributed inference:
"qwen-distributed":
# rpcHealthCheck: enable TCP health checks for RPC endpoints
# - optional, default: false
# - when enabled, parses --rpc host:port[,host:port,...] from cmd
# - performs TCP connectivity checks every 30 seconds
# - model is only listed in /v1/models when ALL RPC endpoints are healthy
# - inference requests to unhealthy models return HTTP 503
# - useful for distributed inference with llama.cpp's rpc-server
rpcHealthCheck: true
cmd: |
llama-server --port ${PORT}
--rpc 192.168.1.10:50051,192.168.1.11:50051
-m Qwen2.5-32B-Instruct-Q4_K_M.gguf
-ngl 99
name: "Qwen 32B (Distributed)"
description: "Large model using distributed RPC inference"

# Docker example:
# container runtimes like Docker and Podman can be used reliably with
# a combination of cmd, cmdStop, and ${MODEL_ID}
Expand Down
21 changes: 11 additions & 10 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,17 @@ models:

llama-swap supports many more features to customize how you want to manage your environment.

| Feature | Description |
| --------- | ---------------------------------------------- |
| `ttl` | automatic unloading of models after a timeout |
| `macros` | reusable snippets to use in configurations |
| `groups` | run multiple models at a time |
| `hooks` | event driven functionality |
| `env` | define environment variables per model |
| `aliases` | serve a model with different names |
| `filters` | modify requests before sending to the upstream |
| `...` | And many more tweaks |
| Feature | Description |
| ----------------- | ------------------------------------------------------- |
| `ttl` | automatic unloading of models after a timeout |
| `macros` | reusable snippets to use in configurations |
| `groups` | run multiple models at a time |
| `hooks` | event driven functionality |
| `env` | define environment variables per model |
| `aliases` | serve a model with different names |
| `filters` | modify requests before sending to the upstream |
| `rpcHealthCheck` | monitor RPC server health for distributed inference |
| `...` | And many more tweaks |

## Full Configuration Example

Expand Down
58 changes: 58 additions & 0 deletions proxy/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package config
import (
"fmt"
"io"
"net"
"net/url"
"os"
"regexp"
Expand Down Expand Up @@ -533,6 +534,63 @@ func SanitizeCommand(cmdStr string) ([]string, error) {
return args, nil
}

// ParseRPCEndpoints extracts RPC endpoints from a command string.
// Handles: --rpc host:port,host2:port2 or --rpc=host:port or -rpc host:port
// (and the single-dash -rpc= form). If the flag appears more than once, the
// last occurrence wins, matching typical CLI flag semantics.
//
// A command with no RPC flag returns a nil slice and no error. Every parsed
// endpoint must be a valid host:port pair with a non-empty host and port;
// otherwise an error is returned.
func ParseRPCEndpoints(cmdStr string) ([]string, error) {
	args, err := SanitizeCommand(cmdStr)
	if err != nil {
		return nil, err
	}

	var endpoints []string
	for i, arg := range args {
		switch {
		case arg == "--rpc" || arg == "-rpc":
			// Space-separated form: the value is the next argument, if any.
			if i+1 < len(args) {
				endpoints = parseEndpointList(args[i+1])
			}
		case strings.HasPrefix(arg, "--rpc="):
			endpoints = parseEndpointList(strings.TrimPrefix(arg, "--rpc="))
		case strings.HasPrefix(arg, "-rpc="):
			endpoints = parseEndpointList(strings.TrimPrefix(arg, "-rpc="))
		}
	}

	// Validate each endpoint. net.SplitHostPort accepts an empty host
	// (":50051") and an empty port ("host:") without error, but neither is
	// usable for a TCP health check, so reject those explicitly too.
	for _, ep := range endpoints {
		host, port, err := net.SplitHostPort(ep)
		if err != nil {
			return nil, fmt.Errorf("invalid RPC endpoint %q: %w", ep, err)
		}
		if host == "" || port == "" {
			return nil, fmt.Errorf("invalid RPC endpoint %q: host and port must be non-empty", ep)
		}
	}

	return endpoints, nil
}

// parseEndpointList splits a comma-separated list of host:port endpoints
// into individual entries, trimming whitespace and stray quote characters.
//
// A matching pair of surrounding quotes (single or double) around the whole
// value is removed first, handling inputs like "host:port,host2:port2".
// Each entry is then trimmed of whitespace and any leftover quotes — this
// covers Windows, where shlex does not treat single quotes specially and may
// split 'host:port, host2:port' into "'host:port," and "host2:port'".
// Empty entries are dropped.
func parseEndpointList(s string) []string {
	s = strings.TrimSpace(s)

	if n := len(s); n >= 2 {
		first, last := s[0], s[n-1]
		if first == last && (first == '\'' || first == '"') {
			s = s[1 : n-1]
		}
	}

	var endpoints []string
	for _, item := range strings.Split(s, ",") {
		item = strings.Trim(strings.TrimSpace(item), "'\"")
		if item != "" {
			endpoints = append(endpoints, item)
		}
	}
	return endpoints
}

func StripComments(cmdStr string) string {
var cleanedLines []string
for _, line := range strings.Split(cmdStr, "\n") {
Expand Down
105 changes: 105 additions & 0 deletions proxy/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1309,3 +1309,108 @@ peers:
assert.Contains(t, err.Error(), "unknown macro")
})
}

// TestParseRPCEndpoints_ValidFormats verifies that every supported spelling
// of the RPC flag (--rpc, --rpc=, -rpc, -rpc=) parses correctly, including
// comma-separated lists, quoted values with spaces, and IPv6 addresses.
func TestParseRPCEndpoints_ValidFormats(t *testing.T) {
	type testCase struct {
		name     string
		cmd      string
		expected []string
	}

	cases := []testCase{
		{
			name:     "single endpoint with --rpc",
			cmd:      "llama-server --rpc localhost:50051 -ngl 99",
			expected: []string{"localhost:50051"},
		},
		{
			name:     "single endpoint with --rpc=",
			cmd:      "llama-server --rpc=192.168.1.100:50051 -ngl 99",
			expected: []string{"192.168.1.100:50051"},
		},
		{
			name:     "single endpoint with -rpc",
			cmd:      "llama-server -rpc localhost:50051 -ngl 99",
			expected: []string{"localhost:50051"},
		},
		{
			name:     "single endpoint with -rpc=",
			cmd:      "llama-server -rpc=localhost:50051 -ngl 99",
			expected: []string{"localhost:50051"},
		},
		{
			name:     "multiple endpoints comma-separated",
			cmd:      "llama-server --rpc 192.168.1.10:50051,192.168.1.11:50051 -ngl 99",
			expected: []string{"192.168.1.10:50051", "192.168.1.11:50051"},
		},
		{
			name:     "multiple endpoints with spaces trimmed",
			cmd:      "llama-server --rpc '192.168.1.10:50051, 192.168.1.11:50051' -ngl 99",
			expected: []string{"192.168.1.10:50051", "192.168.1.11:50051"},
		},
		{
			name:     "IPv6 endpoint",
			cmd:      "llama-server --rpc [::1]:50051 -ngl 99",
			expected: []string{"[::1]:50051"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := ParseRPCEndpoints(tc.cmd)
			assert.NoError(t, err)
			assert.Equal(t, tc.expected, got)
		})
	}
}

// TestParseRPCEndpoints_NoRPCFlag verifies that a command without any RPC
// flag yields no endpoints and no error.
func TestParseRPCEndpoints_NoRPCFlag(t *testing.T) {
	got, err := ParseRPCEndpoints("llama-server -ngl 99 -m model.gguf")
	assert.NoError(t, err)
	assert.Empty(t, got)
}

// TestParseRPCEndpoints_InvalidFormats verifies that endpoints which are
// not valid host:port pairs are rejected with a descriptive error.
func TestParseRPCEndpoints_InvalidFormats(t *testing.T) {
	type testCase struct {
		name    string
		cmd     string
		wantErr string
	}

	cases := []testCase{
		{
			name:    "missing port",
			cmd:     "llama-server --rpc localhost -ngl 99",
			wantErr: "invalid RPC endpoint",
		},
		{
			name:    "invalid host:port format",
			cmd:     "llama-server --rpc not-a-valid-endpoint -ngl 99",
			wantErr: "invalid RPC endpoint",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			_, err := ParseRPCEndpoints(tc.cmd)
			assert.Error(t, err)
			assert.Contains(t, err.Error(), tc.wantErr)
		})
	}
}

// TestParseRPCEndpoints_EmptyEndpointsFiltered verifies that empty entries
// produced by trailing or doubled commas are dropped from the result.
func TestParseRPCEndpoints_EmptyEndpointsFiltered(t *testing.T) {
	got, err := ParseRPCEndpoints("llama-server --rpc 'localhost:50051,,' -ngl 99")
	assert.NoError(t, err)
	assert.Equal(t, []string{"localhost:50051"}, got)
}

// TestParseRPCEndpoints_MultilineCommand verifies that backslash line
// continuations in a multi-line command do not interfere with RPC flag
// parsing.
func TestParseRPCEndpoints_MultilineCommand(t *testing.T) {
	cmd := `llama-server \
	--rpc localhost:50051 \
	-ngl 99 \
	-m model.gguf`

	got, err := ParseRPCEndpoints(cmd)
	assert.NoError(t, err)
	assert.Equal(t, []string{"localhost:50051"}, got)
}
3 changes: 3 additions & 0 deletions proxy/config/model_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ type ModelConfig struct {
// override global setting
SendLoadingState *bool `yaml:"sendLoadingState"`

// RPC health checking
RPCHealthCheck bool `yaml:"rpcHealthCheck"`
// Maximum time in seconds for a request to complete before killing the process
// 0 means no timeout (default)
RequestTimeout int `yaml:"requestTimeout"`
Expand All @@ -57,6 +59,7 @@ func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
ConcurrencyLimit: 0,
Name: "",
Description: "",
RPCHealthCheck: false,
RequestTimeout: 0,
}

Expand Down
Loading