Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Built in Go for performance and simplicity, llama-swap has zero dependencies and
- Request timeout protection with `requestTimeout` to prevent runaway inference
- Reliable Docker and Podman support using `cmd` and `cmdStop` together
- Preload models on startup with `hooks` ([#235](https://github.com/mostlygeek/llama-swap/pull/235))
- RPC health checking for distributed inference - conditionally expose models based on RPC server availability

### Web UI

Expand Down Expand Up @@ -175,6 +176,7 @@ Almost all configuration settings are optional and can be added one step at a ti
- `useModelName` to override model names sent to upstream servers
- `${PORT}` automatic port variables for dynamic port assignment
- `filters` rewrite parts of requests before sending to the upstream server
- `rpcHealthCheck` monitor RPC server health for distributed inference models

See the [configuration documentation](docs/configuration.md) for all options.

Expand Down
5 changes: 5 additions & 0 deletions config-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,11 @@
"type": "boolean",
"default": false,
"description": "If true the model will not show up in /v1/models responses. It can still be used as normal in API requests."
},
"rpcHealthCheck": {
"type": "boolean",
"default": false,
"description": "Enable TCP health checks for RPC endpoints specified in cmd. When enabled, parses --rpc host:port[,host:port,...] from cmd and performs health checks every 30 seconds. Models with unhealthy RPC endpoints are filtered from /v1/models and return 503 on inference requests."
}
}
}
Expand Down
18 changes: 18 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,24 @@ models:
unlisted: true
cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0

# RPC health check example for distributed inference:
"qwen-distributed":
# rpcHealthCheck: enable TCP health checks for RPC endpoints
# - optional, default: false
# - when enabled, parses --rpc host:port[,host:port,...] from cmd
# - performs TCP connectivity checks every 30 seconds
# - model is only listed in /v1/models when ALL RPC endpoints are healthy
# - inference requests to unhealthy models return HTTP 503
# - useful for distributed inference with llama.cpp's rpc-server
rpcHealthCheck: true
cmd: |
llama-server --port ${PORT}
--rpc 192.168.1.10:50051,192.168.1.11:50051
-m Qwen2.5-32B-Instruct-Q4_K_M.gguf
-ngl 99
name: "Qwen 32B (Distributed)"
description: "Large model using distributed RPC inference"

# Docker example:
# container runtimes like Docker and Podman can be used reliably with
# a combination of cmd, cmdStop, and ${MODEL_ID}
Expand Down
21 changes: 11 additions & 10 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,17 @@ models:

llama-swap supports many more features to customize how you want to manage your environment.

| Feature | Description |
| --------- | ---------------------------------------------- |
| `ttl` | automatic unloading of models after a timeout |
| `macros` | reusable snippets to use in configurations |
| `groups` | run multiple models at a time |
| `hooks` | event driven functionality |
| `env` | define environment variables per model |
| `aliases` | serve a model with different names |
| `filters` | modify requests before sending to the upstream |
| `...` | And many more tweaks |
| Feature | Description |
| ----------------- | ------------------------------------------------------- |
| `ttl` | automatic unloading of models after a timeout |
| `macros` | reusable snippets to use in configurations |
| `groups` | run multiple models at a time |
| `hooks` | event driven functionality |
| `env` | define environment variables per model |
| `aliases` | serve a model with different names |
| `filters` | modify requests before sending to the upstream |
| `rpcHealthCheck` | monitor RPC server health for distributed inference |
| `...` | And many more tweaks |

## Full Configuration Example

Expand Down
58 changes: 58 additions & 0 deletions proxy/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package config
import (
"fmt"
"io"
"net"
"net/url"
"os"
"regexp"
Expand Down Expand Up @@ -533,6 +534,63 @@ func SanitizeCommand(cmdStr string) ([]string, error) {
return args, nil
}

// ParseRPCEndpoints extracts RPC endpoints from a command string.
// Handles: --rpc host:port,host2:port2 or --rpc=host:port or -rpc host:port
// (and the single-dash -rpc= form). If the flag appears more than once, the
// last occurrence wins, matching typical CLI flag semantics.
//
// A command with no RPC flag returns a nil slice and no error. Every parsed
// endpoint must be a valid host:port pair with a non-empty host and port;
// otherwise an error is returned.
func ParseRPCEndpoints(cmdStr string) ([]string, error) {
	args, err := SanitizeCommand(cmdStr)
	if err != nil {
		return nil, err
	}

	var endpoints []string
	for i, arg := range args {
		switch {
		case arg == "--rpc" || arg == "-rpc":
			// Space-separated form: the value is the next argument, if any.
			if i+1 < len(args) {
				endpoints = parseEndpointList(args[i+1])
			}
		case strings.HasPrefix(arg, "--rpc="):
			endpoints = parseEndpointList(strings.TrimPrefix(arg, "--rpc="))
		case strings.HasPrefix(arg, "-rpc="):
			endpoints = parseEndpointList(strings.TrimPrefix(arg, "-rpc="))
		}
	}

	// Validate each endpoint. net.SplitHostPort accepts an empty host
	// (":50051") and an empty port ("host:") without error, but neither is
	// usable for a TCP health check, so reject those explicitly too.
	for _, ep := range endpoints {
		host, port, err := net.SplitHostPort(ep)
		if err != nil {
			return nil, fmt.Errorf("invalid RPC endpoint %q: %w", ep, err)
		}
		if host == "" || port == "" {
			return nil, fmt.Errorf("invalid RPC endpoint %q: host and port must be non-empty", ep)
		}
	}

	return endpoints, nil
}

// parseEndpointList splits a comma-separated list of host:port endpoints
// into individual entries, trimming whitespace and stray quote characters.
//
// A matching pair of surrounding quotes (single or double) around the whole
// value is removed first, handling inputs like "host:port,host2:port2".
// Each entry is then trimmed of whitespace and any leftover quotes — this
// covers Windows, where shlex does not treat single quotes specially and may
// split 'host:port, host2:port' into "'host:port," and "host2:port'".
// Empty entries are dropped.
func parseEndpointList(s string) []string {
	s = strings.TrimSpace(s)

	if n := len(s); n >= 2 {
		first, last := s[0], s[n-1]
		if first == last && (first == '\'' || first == '"') {
			s = s[1 : n-1]
		}
	}

	var endpoints []string
	for _, item := range strings.Split(s, ",") {
		item = strings.Trim(strings.TrimSpace(item), "'\"")
		if item != "" {
			endpoints = append(endpoints, item)
		}
	}
	return endpoints
}

func StripComments(cmdStr string) string {
var cleanedLines []string
for _, line := range strings.Split(cmdStr, "\n") {
Expand Down
105 changes: 105 additions & 0 deletions proxy/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1309,3 +1309,108 @@ peers:
assert.Contains(t, err.Error(), "unknown macro")
})
}

// TestParseRPCEndpoints_ValidFormats verifies that every supported spelling
// of the RPC flag (--rpc, --rpc=, -rpc, -rpc=) parses correctly, including
// comma-separated lists, quoted values with spaces, and IPv6 addresses.
func TestParseRPCEndpoints_ValidFormats(t *testing.T) {
	type testCase struct {
		name     string
		cmd      string
		expected []string
	}

	cases := []testCase{
		{
			name:     "single endpoint with --rpc",
			cmd:      "llama-server --rpc localhost:50051 -ngl 99",
			expected: []string{"localhost:50051"},
		},
		{
			name:     "single endpoint with --rpc=",
			cmd:      "llama-server --rpc=192.168.1.100:50051 -ngl 99",
			expected: []string{"192.168.1.100:50051"},
		},
		{
			name:     "single endpoint with -rpc",
			cmd:      "llama-server -rpc localhost:50051 -ngl 99",
			expected: []string{"localhost:50051"},
		},
		{
			name:     "single endpoint with -rpc=",
			cmd:      "llama-server -rpc=localhost:50051 -ngl 99",
			expected: []string{"localhost:50051"},
		},
		{
			name:     "multiple endpoints comma-separated",
			cmd:      "llama-server --rpc 192.168.1.10:50051,192.168.1.11:50051 -ngl 99",
			expected: []string{"192.168.1.10:50051", "192.168.1.11:50051"},
		},
		{
			name:     "multiple endpoints with spaces trimmed",
			cmd:      "llama-server --rpc '192.168.1.10:50051, 192.168.1.11:50051' -ngl 99",
			expected: []string{"192.168.1.10:50051", "192.168.1.11:50051"},
		},
		{
			name:     "IPv6 endpoint",
			cmd:      "llama-server --rpc [::1]:50051 -ngl 99",
			expected: []string{"[::1]:50051"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := ParseRPCEndpoints(tc.cmd)
			assert.NoError(t, err)
			assert.Equal(t, tc.expected, got)
		})
	}
}

// TestParseRPCEndpoints_NoRPCFlag verifies that a command without any RPC
// flag yields no endpoints and no error.
func TestParseRPCEndpoints_NoRPCFlag(t *testing.T) {
	got, err := ParseRPCEndpoints("llama-server -ngl 99 -m model.gguf")
	assert.NoError(t, err)
	assert.Empty(t, got)
}

// TestParseRPCEndpoints_InvalidFormats verifies that endpoints which are
// not valid host:port pairs are rejected with a descriptive error.
func TestParseRPCEndpoints_InvalidFormats(t *testing.T) {
	type testCase struct {
		name    string
		cmd     string
		wantErr string
	}

	cases := []testCase{
		{
			name:    "missing port",
			cmd:     "llama-server --rpc localhost -ngl 99",
			wantErr: "invalid RPC endpoint",
		},
		{
			name:    "invalid host:port format",
			cmd:     "llama-server --rpc not-a-valid-endpoint -ngl 99",
			wantErr: "invalid RPC endpoint",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			_, err := ParseRPCEndpoints(tc.cmd)
			assert.Error(t, err)
			assert.Contains(t, err.Error(), tc.wantErr)
		})
	}
}

// TestParseRPCEndpoints_EmptyEndpointsFiltered verifies that empty entries
// produced by trailing or doubled commas are dropped from the result.
func TestParseRPCEndpoints_EmptyEndpointsFiltered(t *testing.T) {
	got, err := ParseRPCEndpoints("llama-server --rpc 'localhost:50051,,' -ngl 99")
	assert.NoError(t, err)
	assert.Equal(t, []string{"localhost:50051"}, got)
}

// TestParseRPCEndpoints_MultilineCommand verifies that backslash line
// continuations in a multi-line command do not interfere with RPC flag
// parsing.
func TestParseRPCEndpoints_MultilineCommand(t *testing.T) {
	cmd := `llama-server \
	--rpc localhost:50051 \
	-ngl 99 \
	-m model.gguf`

	got, err := ParseRPCEndpoints(cmd)
	assert.NoError(t, err)
	assert.Equal(t, []string{"localhost:50051"}, got)
}
3 changes: 3 additions & 0 deletions proxy/config/model_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ type ModelConfig struct {
// override global setting
SendLoadingState *bool `yaml:"sendLoadingState"`

// RPC health checking
RPCHealthCheck bool `yaml:"rpcHealthCheck"`
// Maximum time in seconds for a request to complete before killing the process
// 0 means no timeout (default)
RequestTimeout int `yaml:"requestTimeout"`
Expand All @@ -57,6 +59,7 @@ func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
ConcurrencyLimit: 0,
Name: "",
Description: "",
RPCHealthCheck: false,
RequestTimeout: 0,
}

Expand Down
Loading