From 3989c17d1b63f1f9ec90f5820eb27fd674b38530 Mon Sep 17 00:00:00 2001
From: overcuriousity <overcuriousity@posteo.org>
Date: Fri, 30 Jan 2026 01:29:53 +0100
Subject: [PATCH 1/3] add timeout feature

---
 config-schema.json           |  6 ++++++
 config.example.yaml          | 10 ++++++++++
 docs/configuration.md        | 10 ++++++++++
 proxy/config/model_config.go |  5 +++++
 proxy/process.go             | 26 ++++++++++++++++++++++++++
 5 files changed, 57 insertions(+)

diff --git a/config-schema.json b/config-schema.json
index 8baa0cc43..9b77344ad 100644
--- a/config-schema.json
+++ b/config-schema.json
@@ -216,6 +216,12 @@
                         "type": "boolean",
                         "description": "Overrides the global sendLoadingState for this model. Ommitting this property will use the global setting."
                     },
+                    "requestTimeout": {
+                        "type": "integer",
+                        "minimum": 0,
+                        "default": 0,
+                        "description": "Maximum time in seconds for a single request to complete before forcefully killing the model process. This prevents runaway inference processes from blocking the GPU indefinitely. 0 disables timeout (default). When exceeded, the process is terminated and must be restarted for the next request."
+                    },
                     "unlisted": {
                         "type": "boolean",
                         "default": false,
diff --git a/config.example.yaml b/config.example.yaml
index d8282fc17..0ef80c02a 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -249,6 +249,16 @@ models:
     # - recommended to be omitted and the default used
     concurrencyLimit: 0
 
+    # requestTimeout: maximum time in seconds for a single request to complete
+    # - optional, default: 0 (no timeout)
+    # - useful for preventing runaway inference processes that never complete
+    # - when exceeded, the model process is forcefully stopped
+    # - protects against GPU overheating and blocking from stuck processes
+    # - the process must be restarted for the next request
+    # - set to 0 to disable timeout
+    # - recommended for models that may have infinite loops or excessive generation
+    requestTimeout: 0  # disabled by default, set to e.g., 300 for 5 minutes
+
     # sendLoadingState: overrides the global sendLoadingState setting for this model
     # - optional, default: undefined (use global setting)
     sendLoadingState: false
diff --git a/docs/configuration.md b/docs/configuration.md
index 5aac2706c..32713d577 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -319,6 +319,16 @@ models:
     # - recommended to be omitted and the default used
     concurrencyLimit: 0
 
+    # requestTimeout: maximum time in seconds for a single request to complete
+    # - optional, default: 0 (no timeout)
+    # - useful for preventing runaway inference processes that never complete
+    # - when exceeded, the model process is forcefully stopped
+    # - protects against GPU overheating and blocking from stuck processes
+    # - the process must be restarted for the next request
+    # - set to 0 to disable timeout
+    # - recommended for models that may have infinite loops or excessive generation
+    requestTimeout: 300  # 5 minutes
+
     # sendLoadingState: overrides the global sendLoadingState setting for this model
     # - optional, default: undefined (use global setting)
     sendLoadingState: false
diff --git a/proxy/config/model_config.go b/proxy/config/model_config.go
index 9dc37aea6..6b2ba742a 100644
--- a/proxy/config/model_config.go
+++ b/proxy/config/model_config.go
@@ -36,6 +36,10 @@ type ModelConfig struct {
 
 	// override global setting
 	SendLoadingState *bool `yaml:"sendLoadingState"`
+
+	// Maximum time in seconds for a request to complete before killing the process
+	// 0 means no timeout (default)
+	RequestTimeout int `yaml:"requestTimeout"`
 }
 
 func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
@@ -53,6 +57,7 @@ func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
 		ConcurrencyLimit: 0,
 		Name:             "",
 		Description:      "",
+		RequestTimeout:   0,
 	}
 
 	// the default cmdStop to taskkill /f /t /pid ${PID}
diff --git a/proxy/process.go b/proxy/process.go
index 414270595..5ada9723b 100644
--- a/proxy/process.go
+++ b/proxy/process.go
@@ -500,6 +500,32 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
 		p.inFlightRequests.Done()
 	}()
 
+	// Start timeout monitoring if requestTimeout is configured
+	var timeoutCancel context.CancelFunc
+	if p.config.RequestTimeout > 0 {
+		timeoutCtx, cancel := context.WithCancel(context.Background())
+		timeoutCancel = cancel
+
+		go func() {
+			timeoutDuration := time.Duration(p.config.RequestTimeout) * time.Second
+			timer := time.NewTimer(timeoutDuration)
+			defer timer.Stop()
+
+			select {
+			case <-timer.C:
+				p.proxyLogger.Warnf("<%s> Request timeout exceeded (%v), force stopping process to prevent GPU blocking", p.ID, timeoutDuration)
+				// Force stop the process - this will kill the underlying inference process
+				p.StopImmediately()
+			case <-timeoutCtx.Done():
+				// Request completed normally, cancel timeout
+				return
+			}
+		}()
+
+		// Ensure timeout goroutine is cancelled when request completes
+		defer timeoutCancel()
+	}
+
 	// for #366
 	// - extract streaming param from request context, should have been set by proxymanager
 	var srw *statusResponseWriter

From e6f9f9a9990cea88228dc67a62e072ef0854da5e Mon Sep 17 00:00:00 2001
From: overcuriousity <overcuriousity@posteo.org>
Date: Sat, 31 Jan 2026 00:27:33 +0000
Subject: [PATCH 2/3] proxy: fix requestTimeout feature to actually terminate
 requests

The requestTimeout feature was not working because the timeout
context was not connected to the HTTP request. When the timeout
fired, it attempted to kill the process but the reverse proxy
continued waiting for a response indefinitely.

- Use context.WithTimeout() to create a timeout context for the HTTP request
- Clone the request with the timeout context before proxying
- When timeout fires, the HTTP request is immediately cancelled
- Fix StopImmediately() to handle timeouts during model loading (StateStarting)
- Add unit test to verify timeout behavior

Before: requests would run for 60+ seconds despite requestTimeout: 20
After: requests terminate in exactly 20 seconds as configured

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 proxy/process.go              |  36 ++++++-----
 proxy/process_timeout_test.go | 109 ++++++++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+), 15 deletions(-)
 create mode 100644 proxy/process_timeout_test.go

diff --git a/proxy/process.go b/proxy/process.go
index 5ada9723b..7e311d11c 100644
--- a/proxy/process.go
+++ b/proxy/process.go
@@ -381,13 +381,17 @@ func (p *Process) Stop() {
 // StopImmediately will transition the process to the stopping state and stop the process with a SIGTERM.
 // If the process does not stop within the specified timeout, it will be forcefully stopped with a SIGKILL.
 func (p *Process) StopImmediately() {
-	if !isValidTransition(p.CurrentState(), StateStopping) {
+	currentState := p.CurrentState()
+	if !isValidTransition(currentState, StateStopping) {
 		return
 	}
 
-	p.proxyLogger.Debugf("<%s> Stopping process, current state: %s", p.ID, p.CurrentState())
-	if curState, err := p.swapState(StateReady, StateStopping); err != nil {
-		p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState)
+	p.proxyLogger.Debugf("<%s> Stopping process, current state: %s", p.ID, currentState)
+
+	// Try to transition from current state to StateStopping
+	// Process might be in StateReady or StateStarting when timeout fires
+	if _, err := p.swapState(currentState, StateStopping); err != nil {
+		p.proxyLogger.Infof("<%s> Stop() %s -> StateStopping err: %v", p.ID, currentState, err)
 		return
 	}
 
@@ -502,30 +506,32 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
 
 	// Start timeout monitoring if requestTimeout is configured
 	var timeoutCancel context.CancelFunc
+	var requestCtx context.Context = r.Context()
+
 	if p.config.RequestTimeout > 0 {
-		timeoutCtx, cancel := context.WithCancel(context.Background())
+		timeoutDuration := time.Duration(p.config.RequestTimeout) * time.Second
+		var cancel context.CancelFunc
+		requestCtx, cancel = context.WithTimeout(r.Context(), timeoutDuration)
 		timeoutCancel = cancel
 
 		go func() {
-			timeoutDuration := time.Duration(p.config.RequestTimeout) * time.Second
-			timer := time.NewTimer(timeoutDuration)
-			defer timer.Stop()
-
-			select {
-			case <-timer.C:
+			<-requestCtx.Done()
+			if requestCtx.Err() == context.DeadlineExceeded {
 				p.proxyLogger.Warnf("<%s> Request timeout exceeded (%v), force stopping process to prevent GPU blocking", p.ID, timeoutDuration)
 				// Force stop the process - this will kill the underlying inference process
 				p.StopImmediately()
-			case <-timeoutCtx.Done():
-				// Request completed normally, cancel timeout
-				return
 			}
 		}()
 
-		// Ensure timeout goroutine is cancelled when request completes
+		// Ensure timeout is cancelled when request completes
 		defer timeoutCancel()
 	}
 
+	// Create a new request with the timeout context
+	if requestCtx != r.Context() {
+		r = r.Clone(requestCtx)
+	}
+
 	// for #366
 	// - extract streaming param from request context, should have been set by proxymanager
 	var srw *statusResponseWriter
diff --git a/proxy/process_timeout_test.go b/proxy/process_timeout_test.go
new file mode 100644
index 000000000..9f048d9e2
--- /dev/null
+++ b/proxy/process_timeout_test.go
@@ -0,0 +1,109 @@
+package proxy
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/mostlygeek/llama-swap/proxy/config"
+)
+
+// TestProcess_RequestTimeout verifies that requestTimeout actually kills the process
+func TestProcess_RequestTimeout(t *testing.T) {
+	// Create a mock server that simulates a long-running inference
+	mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		t.Logf("Mock server received request")
+
+		// Simulate streaming response that takes 60 seconds
+		w.Header().Set("Content-Type", "text/event-stream")
+		w.WriteHeader(http.StatusOK)
+
+		flusher, ok := w.(http.Flusher)
+		if !ok {
+			t.Fatal("Expected http.ResponseWriter to be an http.Flusher")
+		}
+
+		// Stream data for 60 seconds
+		for i := 0; i < 60; i++ {
+			select {
+			case <-r.Context().Done():
+				t.Logf("Mock server: client disconnected after %d seconds", i)
+				return
+			default:
+				fmt.Fprintf(w, "data: token %d\n\n", i)
+				flusher.Flush()
+				time.Sleep(1 * time.Second)
+			}
+		}
+		t.Logf("Mock server completed full 60 second response")
+	}))
+	defer mockServer.Close()
+
+	// Setup process logger - use NewLogMonitor() to avoid race in test
+	processLogger := NewLogMonitor()
+	proxyLogger := NewLogMonitor()
+
+	// Create process with 5 second request timeout
+	cfg := config.ModelConfig{
+		Proxy:          mockServer.URL,
+		CheckEndpoint:  "none", // skip health check
+		RequestTimeout: 5,      // 5 second timeout
+	}
+
+	p := NewProcess("test-timeout", 30, cfg, processLogger, proxyLogger)
+	p.gracefulStopTimeout = 2 * time.Second // shorter for testing
+
+	// Manually set state to ready (skip actual process start)
+	p.forceState(StateReady)
+
+	// Make a request that should timeout
+	req := httptest.NewRequest("POST", "/v1/chat/completions", nil)
+	w := httptest.NewRecorder()
+
+	start := time.Now()
+	var wg sync.WaitGroup
+	wg.Add(1)
+
+	go func() {
+		defer wg.Done()
+		p.ProxyRequest(w, req)
+	}()
+
+	// Wait for either completion or timeout
+	done := make(chan struct{})
+	go func() {
+		wg.Wait()
+		close(done)
+	}()
+
+	select {
+	case <-done:
+		elapsed := time.Since(start)
+		t.Logf("Request completed after %v", elapsed)
+
+		// Request should complete within timeout + gracefulStopTimeout + some buffer
+		maxExpected := time.Duration(cfg.RequestTimeout+2)*time.Second + 3*time.Second
+		if elapsed > maxExpected {
+			t.Errorf("Request took %v, expected less than %v with 5s timeout", elapsed, maxExpected)
+		} else {
+			t.Logf("✓ Request was properly terminated by timeout")
+		}
+
+	case <-time.After(15 * time.Second):
+		t.Fatalf("Test timed out after 15 seconds - request should have been killed by requestTimeout")
+	}
+}
+
+// TestProcess_RequestTimeoutWithRealProcess tests with an actual process
+func TestProcess_RequestTimeoutWithRealProcess(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping test with real process in short mode")
+	}
+
+	// This test would require a real llama.cpp server or similar
+	// For now, we can skip it or mock it
+	t.Skip("Requires real inference server")
+}

From 0e86bbcb688011bd625f23bf4b1a19f4d3aafae4 Mon Sep 17 00:00:00 2001
From: overcuriousity <overcuriousity@posteo.org>
Date: Sat, 31 Jan 2026 00:33:41 +0000
Subject: [PATCH 3/3] docs: add requestTimeout to README features list

Add brief mention of requestTimeout feature in the customizable
features section of README.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c26962351..8d372c108 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,8 @@ Built in Go for performance and simplicity, llama-swap has zero dependencies and
 - ✅ API Key support - define keys to restrict access to API endpoints
 - ✅ Customizable
   - Run multiple models at once with `Groups` ([#107](https://github.com/mostlygeek/llama-swap/issues/107))
-  - Automatic unloading of models after timeout by setting a `ttl`
+  - Automatic unloading of models after idle timeout by setting a `ttl`
+  - Request timeout protection with `requestTimeout` to prevent runaway inference
   - Reliable Docker and Podman support using `cmd` and `cmdStop` together
   - Preload models on startup with `hooks` ([#235](https://github.com/mostlygeek/llama-swap/pull/235))