From 5befa0a2c06ee2150bee8654274122e85200585a Mon Sep 17 00:00:00 2001
From: Benson Wong
Date: Sat, 29 Nov 2025 21:17:23 -0800
Subject: [PATCH 1/2] proxy: add support for anthropic v1/messages api

---
 proxy/proxymanager.go | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/proxy/proxymanager.go b/proxy/proxymanager.go
index 39295e0b..c8ce0d42 100644
--- a/proxy/proxymanager.go
+++ b/proxy/proxymanager.go
@@ -236,27 +236,29 @@ func (pm *ProxyManager) setupGinEngine() {
 	})

 	// Set up routes using the Gin engine
-	pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/chat/completions", pm.proxyInferenceHandler)
 	// Support legacy /v1/completions api, see issue #12
-	pm.ginEngine.POST("/v1/completions", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/completions", pm.proxyInferenceHandler)
+	// Support anthropic /v1/messages (added https://github.com/ggml-org/llama.cpp/pull/17570)
+	pm.ginEngine.POST("/v1/messages", pm.proxyInferenceHandler)

 	// Support embeddings and reranking
-	pm.ginEngine.POST("/v1/embeddings", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/embeddings", pm.proxyInferenceHandler)

 	// llama-server's /reranking endpoint + aliases
-	pm.ginEngine.POST("/reranking", pm.proxyOAIHandler)
-	pm.ginEngine.POST("/rerank", pm.proxyOAIHandler)
-	pm.ginEngine.POST("/v1/rerank", pm.proxyOAIHandler)
-	pm.ginEngine.POST("/v1/reranking", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/reranking", pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/rerank", pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/rerank", pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/reranking", pm.proxyInferenceHandler)

 	// llama-server's /infill endpoint for code infilling
-	pm.ginEngine.POST("/infill", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/infill", pm.proxyInferenceHandler)

 	// llama-server's /completion endpoint
-	pm.ginEngine.POST("/completion", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/completion", pm.proxyInferenceHandler)

 	// Support audio/speech endpoint
-	pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/audio/speech", pm.proxyInferenceHandler)
 	pm.ginEngine.POST("/v1/audio/transcriptions", pm.proxyOAIPostFormHandler)

 	pm.ginEngine.GET("/v1/models", pm.listModelsHandler)
@@ -545,7 +547,7 @@ func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
 	}
 }

-func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
+func (pm *ProxyManager) proxyInferenceHandler(c *gin.Context) {
 	bodyBytes, err := io.ReadAll(c.Request.Body)
 	if err != nil {
 		pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body")

From 4e8623952f809f803e1bb245ecdac9b72217532d Mon Sep 17 00:00:00 2001
From: Benson Wong
Date: Sat, 29 Nov 2025 21:53:08 -0800
Subject: [PATCH 2/2] proxy: restrict loading message to /v1/chat/completions

---
 proxy/process.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/proxy/process.go b/proxy/process.go
index 2c05f943..640ba34a 100644
--- a/proxy/process.go
+++ b/proxy/process.go
@@ -507,7 +507,10 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {

 	// add a sync so the streaming client only runs when the goroutine has exited
 	isStreaming, _ := r.Context().Value(proxyCtxKey("streaming")).(bool)
-	if p.config.SendLoadingState != nil && *p.config.SendLoadingState && isStreaming {
+
+	// PR #417 (no support for anthropic v1/messages yet)
+	isChatCompletions := strings.HasPrefix(r.URL.Path, "/v1/chat/completions")
+	if p.config.SendLoadingState != nil && *p.config.SendLoadingState && isStreaming && isChatCompletions {
 		srw = newStatusResponseWriter(p, w)
 		go srw.statusUpdates(swapCtx)
 	} else {