Implement Multi-Process Handling (#7)
Refactor code to support starting multiple back-end llama.cpp servers. This functionality is exposed as `profiles` to keep the configuration format simple.

Changes: 

* refactor proxy tests to get ready for multi-process support
* update proxy/ProxyManager to support multiple processes (#7)
* Add support for Groups in configuration
* improve handling of Model alias configs
* implement multi-model swapping
* improve code clarity for swapModel
* improve docs, rename groups to profiles in config
mostlygeek authored Nov 24, 2024
1 parent 533162c commit 73ad85e
Showing 10 changed files with 355 additions and 118 deletions.
46 changes: 30 additions & 16 deletions README.md
@@ -2,17 +2,22 @@

![llama-swap header image](header.jpeg)

[llama.cpp's server](https://github.com/ggerganov/llama.cpp/tree/master/examples/server) can't swap models on demand. So let's swap the server on demand instead!
llama-swap is a golang server that automatically swaps the llama.cpp server on demand. Since [llama.cpp's server](https://github.com/ggerganov/llama.cpp/tree/master/examples/server) can't swap models, let's swap the server instead!

llama-swap is a proxy server that sits in front of llama-server. When a request for `/v1/chat/completions` comes in it will extract the `model` requested and change the underlying llama-server automatically.
Features:

- ✅ easy to deploy: single binary with no dependencies
- ✅ full control over llama-server's startup settings
- ✅ ❤️ for users who rely on llama.cpp for LLM inference
- ✅ Easy to deploy: single binary with no dependencies
- ✅ Single yaml configuration file
- ✅ Automatic switching between models
- ✅ Full control over llama.cpp server settings per model
- ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
- ✅ Multiple GPU support
- ✅ Run multiple models at once with `profiles`
- ✅ Remote log monitoring at `/log`

## config.yaml

llama-swap's configuration purposefully simple.
llama-swap's configuration is purposefully simple.

```yaml
# Seconds to wait for llama.cpp to load and be ready to serve requests
@@ -24,25 +29,24 @@ models:
"llama":
cmd: llama-server --port 8999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf

# where to reach the server started by cmd
# where to reach the server started by cmd, make sure the ports match
proxy: http://127.0.0.1:8999

# aliases model names to use this configuration for
# alias names to use for this model
aliases:
- "gpt-4o-mini"
- "gpt-3.5-turbo"

# wait for this path to return an HTTP 200 before serving requests
# defaults to /health to match llama.cpp
#
# use "none" to skip endpoint checking. This may cause requests to fail
# until the server is ready
# check this path for an HTTP 200 OK before serving requests
# default: /health to match llama.cpp
# use "none" to skip endpoint checking, but may cause HTTP errors
# until the model is ready
checkEndpoint: /custom-endpoint

# automatically unload the model after 10 seconds
# automatically unload the model after this many seconds
# ttl must be a value greater than 0
# default: 0 = never unload model
ttl: 5
ttl: 60

"qwen":
# environment variables to pass to the command
@@ -53,8 +57,18 @@ models:
cmd: >
llama-server --port 8999
--model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
proxy: http://127.0.0.1:8999

# profiles make it easy to manage multi-model (and GPU) configurations.
#
# Tips:
# - each model must be listening on a unique address and port
# - the model name is in this format: "profile_name/model", like "coding/qwen"
# - the profile will load and unload all models in the profile at the same time
profiles:
coding:
- "qwen"
- "llama"
```
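
To make the profile behaviour concrete, here is a minimal client sketch. It assumes llama-swap is listening on `http://localhost:8080` (the listen address is not shown in the config above) and uses the `coding` profile defined there; requesting `coding/qwen` starts every model in the profile and routes the request to the qwen backend.

```go
// Minimal sketch of an OpenAI-style request through llama-swap.
// Assumption: llama-swap itself listens on localhost:8080.
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	body := []byte(`{
		"model": "coding/qwen",
		"messages": [{"role": "user", "content": "Write a haiku about Go."}]
	}`)

	// llama-swap loads all models in the "coding" profile, waits for the
	// qwen backend's health check, then proxies the request to it.
	resp, err := http.Post("http://localhost:8080/v1/chat/completions",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```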
## Installation
16 changes: 11 additions & 5 deletions config.example.yaml
@@ -6,9 +6,9 @@ models:
"llama":
cmd: >
models/llama-server-osx
--port 8999
--port 9001
-m models/Llama-3.2-1B-Instruct-Q4_0.gguf
proxy: http://127.0.0.1:8999
proxy: http://127.0.0.1:9001

# list of model name aliases this llama.cpp instance can serve
aliases:
@@ -21,8 +21,8 @@ models:
ttl: 5

"qwen":
cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
proxy: http://127.0.0.1:8999
cmd: models/llama-server-osx --port 9002 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
proxy: http://127.0.0.1:9002
aliases:
- gpt-3.5-turbo

@@ -44,4 +44,10 @@ models:
proxy: http://127.0.0.1:8999
"broken_timeout":
cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
proxy: http://127.0.0.1:9000
proxy: http://127.0.0.1:9000

# creating a coding profile with models for code generation and general questions
profiles:
coding:
- "qwen"
- "llama"
15 changes: 12 additions & 3 deletions misc/simple-responder/simple-responder.go
@@ -16,12 +16,16 @@ func main() {

flag.Parse() // Parse the command-line flags

// Set up the handler function using the provided response message
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
responseMessageHandler := func(w http.ResponseWriter, r *http.Request) {
// Set the header to text/plain
w.Header().Set("Content-Type", "text/plain")
fmt.Fprintln(w, *responseMessage)
})
}

// Set up the handler function using the provided response message
http.HandleFunc("/v1/chat/completions", responseMessageHandler)
http.HandleFunc("/v1/completions", responseMessageHandler)
http.HandleFunc("/test", responseMessageHandler)

http.HandleFunc("/env", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain")
@@ -43,6 +47,11 @@ func main() {
w.Write([]byte(response))
})

http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain")
fmt.Fprintf(w, "%s %s", r.Method, r.URL.Path)
})

address := "127.0.0.1:" + *port // Address with the specified port
fmt.Printf("Server is listening on port %s\n", *port)

40 changes: 26 additions & 14 deletions proxy/config.go
@@ -22,26 +22,30 @@ func (m *ModelConfig) SanitizedCommand() ([]string, error) {
}

type Config struct {
Models map[string]ModelConfig `yaml:"models"`
HealthCheckTimeout int `yaml:"healthCheckTimeout"`
Models map[string]ModelConfig `yaml:"models"`
Profiles map[string][]string `yaml:"profiles"`

// map aliases to actual model IDs
aliases map[string]string
}

func (c *Config) FindConfig(modelName string) (ModelConfig, string, bool) {
modelConfig, found := c.Models[modelName]
if found {
return modelConfig, modelName, true
func (c *Config) RealModelName(search string) (string, bool) {
if _, found := c.Models[search]; found {
return search, true
} else if name, found := c.aliases[search]; found {
return name, found
} else {
return "", false
}
}

// Search through aliases to find the right config
for actual, config := range c.Models {
for _, alias := range config.Aliases {
if alias == modelName {
return config, actual, true
}
}
func (c *Config) FindConfig(modelName string) (ModelConfig, string, bool) {
if realName, found := c.RealModelName(modelName); !found {
return ModelConfig{}, "", false
} else {
return c.Models[realName], realName, true
}

return ModelConfig{}, "", false
}

func LoadConfig(path string) (*Config, error) {
@@ -60,6 +64,14 @@ func LoadConfig(path string) (*Config, error) {
config.HealthCheckTimeout = 15
}

// Populate the aliases map
config.aliases = make(map[string]string)
for modelName, modelConfig := range config.Models {
for _, alias := range modelConfig.Aliases {
config.aliases[alias] = modelName
}
}

return &config, nil
}
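
The alias map built in `LoadConfig` is what lets `RealModelName` and `FindConfig` accept either a real model ID or one of its aliases. A hypothetical usage sketch follows; the import path and the `gpt-4o-mini` alias are assumptions taken from the repository layout and the README example, not from this file.

```go
// Hypothetical caller of the config API above.
// Assumption: the module path is github.com/mostlygeek/llama-swap.
package main

import (
	"fmt"
	"log"

	"github.com/mostlygeek/llama-swap/proxy"
)

func main() {
	cfg, err := proxy.LoadConfig("config.yaml")
	if err != nil {
		log.Fatal(err)
	}

	// Resolve an alias back to its real model ID
	// ("gpt-4o-mini" -> "llama" in the README example).
	if name, ok := cfg.RealModelName("gpt-4o-mini"); ok {
		fmt.Println("real model:", name)
	}

	// FindConfig builds on RealModelName and also returns the ModelConfig,
	// e.g. the proxy address the backend will be reachable on.
	if mc, realName, ok := cfg.FindConfig("gpt-4o-mini"); ok {
		fmt.Println(realName, "->", mc.Proxy)
	}
}
```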

48 changes: 43 additions & 5 deletions proxy/config_test.go
@@ -8,7 +8,7 @@ import (
"github.com/stretchr/testify/assert"
)

func TestLoadConfig(t *testing.T) {
func TestConfig_Load(t *testing.T) {
// Create a temporary YAML file for testing
tempDir, err := os.MkdirTemp("", "test-config")
if err != nil {
@@ -17,7 +17,8 @@ func TestLoadConfig(t *testing.T) {
defer os.RemoveAll(tempDir)

tempFile := filepath.Join(tempDir, "config.yaml")
content := `models:
content := `
models:
model1:
cmd: path/to/cmd --arg1 one
proxy: "http://localhost:8080"
@@ -28,7 +29,17 @@ func TestLoadConfig(t *testing.T) {
- "VAR1=value1"
- "VAR2=value2"
checkEndpoint: "/health"
model2:
cmd: path/to/cmd --arg1 one
proxy: "http://localhost:8081"
aliases:
- "m2"
checkEndpoint: "/"
healthCheckTimeout: 15
profiles:
test:
- model1
- model2
`

if err := os.WriteFile(tempFile, []byte(content), 0644); err != nil {
@@ -50,14 +61,33 @@ healthCheckTimeout: 15
Env: []string{"VAR1=value1", "VAR2=value2"},
CheckEndpoint: "/health",
},
"model2": {
Cmd: "path/to/cmd --arg1 one",
Proxy: "http://localhost:8081",
Aliases: []string{"m2"},
Env: nil,
CheckEndpoint: "/",
},
},
HealthCheckTimeout: 15,
Profiles: map[string][]string{
"test": {"model1", "model2"},
},
aliases: map[string]string{
"m1": "model1",
"model-one": "model1",
"m2": "model2",
},
}

assert.Equal(t, expected, config)

realname, found := config.RealModelName("m1")
assert.True(t, found)
assert.Equal(t, "model1", realname)
}

func TestModelConfigSanitizedCommand(t *testing.T) {
func TestConfig_ModelConfigSanitizedCommand(t *testing.T) {
config := &ModelConfig{
Cmd: `python model1.py \
--arg1 value1 \
@@ -69,7 +99,10 @@ func TestModelConfigSanitizedCommand(t *testing.T) {
assert.Equal(t, []string{"python", "model1.py", "--arg1", "value1", "--arg2", "value2"}, args)
}

func TestFindConfig(t *testing.T) {
func TestConfig_FindConfig(t *testing.T) {

// TODO?
// make this shared between the different tests
config := &Config{
Models: map[string]ModelConfig{
"model1": {
@@ -88,6 +121,11 @@ func TestFindConfig(t *testing.T) {
},
},
HealthCheckTimeout: 10,
aliases: map[string]string{
"m1": "model1",
"model-one": "model1",
"m2": "model2",
},
}

// Test finding a model by its name
@@ -109,7 +147,7 @@ func TestFindConfig(t *testing.T) {
assert.Equal(t, ModelConfig{}, modelConfig)
}

func TestSanitizeCommand(t *testing.T) {
func TestConfig_SanitizeCommand(t *testing.T) {
// Test a simple command
args, err := SanitizeCommand("python model1.py")
assert.NoError(t, err)
58 changes: 58 additions & 0 deletions proxy/helpers_test.go
@@ -0,0 +1,58 @@
package proxy

import (
"fmt"
"os"
"path/filepath"
"runtime"
"sync"
"testing"

"github.com/gin-gonic/gin"
)

var (
nextTestPort int = 12000
portMutex sync.Mutex
)

// Check if the binary exists
func TestMain(m *testing.M) {
binaryPath := getSimpleResponderPath()
if _, err := os.Stat(binaryPath); os.IsNotExist(err) {
fmt.Printf("simple-responder not found at %s, did you `make simple-responder`?\n", binaryPath)
os.Exit(1)
}

gin.SetMode(gin.TestMode)

m.Run()
}

// Helper function to get the binary path
func getSimpleResponderPath() string {
goos := runtime.GOOS
goarch := runtime.GOARCH
return filepath.Join("..", "build", fmt.Sprintf("simple-responder_%s_%s", goos, goarch))
}

func getTestSimpleResponderConfig(expectedMessage string) ModelConfig {
portMutex.Lock()
defer portMutex.Unlock()

port := nextTestPort
nextTestPort++

return getTestSimpleResponderConfigPort(expectedMessage, port)
}

func getTestSimpleResponderConfigPort(expectedMessage string, port int) ModelConfig {
binaryPath := getSimpleResponderPath()

// Create a process configuration
return ModelConfig{
Cmd: fmt.Sprintf("%s --port %d --respond '%s'", binaryPath, port, expectedMessage),
Proxy: fmt.Sprintf("http://127.0.0.1:%d", port),
CheckEndpoint: "/health",
}
}
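
A hypothetical sketch (not part of this commit) of how these helpers could be exercised from another test in the `proxy` package; it only touches functions visible in this diff plus `ModelConfig.SanitizedCommand` from proxy/config.go, and the test name and assertions are illustrative.

```go
// Hypothetical test sketch; lives alongside the helpers in package proxy.
func TestHelpers_SimpleResponderConfig(t *testing.T) {
	cfg := getTestSimpleResponderConfig("hello")

	// The generated command should parse cleanly and point at the test binary.
	args, err := cfg.SanitizedCommand()
	if err != nil {
		t.Fatalf("SanitizedCommand: %v", err)
	}
	if len(args) == 0 || args[0] != getSimpleResponderPath() {
		t.Errorf("expected command to start with %s, got %v", getSimpleResponderPath(), args)
	}

	// Each call reserves a fresh port, so two configs never share a proxy address.
	if other := getTestSimpleResponderConfig("hello"); other.Proxy == cfg.Proxy {
		t.Errorf("expected unique proxy addresses, got %s twice", cfg.Proxy)
	}
}
```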
2 changes: 1 addition & 1 deletion proxy/process.go
@@ -113,7 +113,7 @@ func (p *Process) Stop() {
p.Lock()
defer p.Unlock()

if !p.isRunning {
if !p.isRunning || p.cmd == nil || p.cmd.Process == nil {
return
}
