From 5befa0a2c06ee2150bee8654274122e85200585a Mon Sep 17 00:00:00 2001
From: Benson Wong
Date: Sat, 29 Nov 2025 21:17:23 -0800
Subject: [PATCH 1/2] proxy: add support for anthropic v1/messages api

---
 proxy/proxymanager.go | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/proxy/proxymanager.go b/proxy/proxymanager.go
index 39295e0b..c8ce0d42 100644
--- a/proxy/proxymanager.go
+++ b/proxy/proxymanager.go
@@ -236,27 +236,29 @@ func (pm *ProxyManager) setupGinEngine() {
 	})

 	// Set up routes using the Gin engine
-	pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/chat/completions", pm.proxyInferenceHandler)
 	// Support legacy /v1/completions api, see issue #12
-	pm.ginEngine.POST("/v1/completions", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/completions", pm.proxyInferenceHandler)
+	// Support anthropic /v1/messages (added https://github.com/ggml-org/llama.cpp/pull/17570)
+	pm.ginEngine.POST("/v1/messages", pm.proxyInferenceHandler)

 	// Support embeddings and reranking
-	pm.ginEngine.POST("/v1/embeddings", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/embeddings", pm.proxyInferenceHandler)

 	// llama-server's /reranking endpoint + aliases
-	pm.ginEngine.POST("/reranking", pm.proxyOAIHandler)
-	pm.ginEngine.POST("/rerank", pm.proxyOAIHandler)
-	pm.ginEngine.POST("/v1/rerank", pm.proxyOAIHandler)
-	pm.ginEngine.POST("/v1/reranking", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/reranking", pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/rerank", pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/rerank", pm.proxyInferenceHandler)
+	pm.ginEngine.POST("/v1/reranking", pm.proxyInferenceHandler)

 	// llama-server's /infill endpoint for code infilling
-	pm.ginEngine.POST("/infill", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/infill", pm.proxyInferenceHandler)

 	// llama-server's /completion endpoint
-	pm.ginEngine.POST("/completion", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/completion", pm.proxyInferenceHandler)

 	// Support audio/speech endpoint
-	pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
+	pm.ginEngine.POST("/v1/audio/speech", pm.proxyInferenceHandler)
 	pm.ginEngine.POST("/v1/audio/transcriptions", pm.proxyOAIPostFormHandler)

 	pm.ginEngine.GET("/v1/models", pm.listModelsHandler)
@@ -545,7 +547,7 @@ func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
 	}
 }

-func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
+func (pm *ProxyManager) proxyInferenceHandler(c *gin.Context) {
 	bodyBytes, err := io.ReadAll(c.Request.Body)
 	if err != nil {
 		pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body")

From 4e8623952f809f803e1bb245ecdac9b72217532d Mon Sep 17 00:00:00 2001
From: Benson Wong
Date: Sat, 29 Nov 2025 21:53:08 -0800
Subject: [PATCH 2/2] proxy: restrict loading message to /v1/chat/completions

---
 proxy/process.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/proxy/process.go b/proxy/process.go
index 2c05f943..640ba34a 100644
--- a/proxy/process.go
+++ b/proxy/process.go
@@ -507,7 +507,10 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {

 	// add a sync so the streaming client only runs when the goroutine has exited
 	isStreaming, _ := r.Context().Value(proxyCtxKey("streaming")).(bool)
-	if p.config.SendLoadingState != nil && *p.config.SendLoadingState && isStreaming {
+
+	// PR #417 (no support for anthropic v1/messages yet)
+	isChatCompletions := strings.HasPrefix(r.URL.Path, "/v1/chat/completions")
+	if p.config.SendLoadingState != nil && *p.config.SendLoadingState && isStreaming && isChatCompletions {
 		srw = newStatusResponseWriter(p, w)
 		go srw.statusUpdates(swapCtx)
 	} else {