From fc04a3a3d6524459fd5a31e27b8e7fc9e71ebf3c Mon Sep 17 00:00:00 2001
From: Roo Code
Date: Sat, 25 Oct 2025 11:10:34 +0000
Subject: [PATCH 1/4] fix: adjust GLM-4.6-turbo max output tokens to 40k

Fixes issue where GLM-4.6-turbo was requesting the entire context window
(202752 tokens) for output, leaving no room for input tokens. Now set to
40960 tokens (20% of 200k context) to allow sufficient input space.

Fixes #8821
---
 packages/types/src/providers/chutes.ts | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/packages/types/src/providers/chutes.ts b/packages/types/src/providers/chutes.ts
index 237cb76e804c..0e209ad87431 100644
--- a/packages/types/src/providers/chutes.ts
+++ b/packages/types/src/providers/chutes.ts
@@ -88,7 +88,8 @@ export const chutesModels = {
 		supportsPromptCache: false,
 		inputPrice: 0.23,
 		outputPrice: 0.9,
-		description: "DeepSeek‑V3.1‑Terminus is an update to V3.1 that improves language consistency by reducing CN/EN mix‑ups and eliminating random characters, while strengthening agent capabilities with notably better Code Agent and Search Agent performance.",
+		description:
+			"DeepSeek‑V3.1‑Terminus is an update to V3.1 that improves language consistency by reducing CN/EN mix‑ups and eliminating random characters, while strengthening agent capabilities with notably better Code Agent and Search Agent performance.",
 	},
 	"deepseek-ai/DeepSeek-V3.1-turbo": {
 		maxTokens: 32768,
@@ -97,7 +98,8 @@ export const chutesModels = {
 		supportsPromptCache: false,
 		inputPrice: 1.0,
 		outputPrice: 3.0,
-		description: "DeepSeek-V3.1-turbo is an FP8, speculative-decoding turbo variant optimized for ultra-fast single-shot queries (~200 TPS), with outputs close to the originals and solid function calling/reasoning/structured output, priced at $1/M input and $3/M output tokens, using 2× quota per request and not intended for bulk workloads.",
+		description:
+			"DeepSeek-V3.1-turbo is an FP8, speculative-decoding turbo variant optimized for ultra-fast single-shot queries (~200 TPS), with outputs close to the originals and solid function calling/reasoning/structured output, priced at $1/M input and $3/M output tokens, using 2× quota per request and not intended for bulk workloads.",
 	},
 	"deepseek-ai/DeepSeek-V3.2-Exp": {
 		maxTokens: 163840,
@@ -106,7 +108,8 @@ export const chutesModels = {
 		supportsPromptCache: false,
 		inputPrice: 0.25,
 		outputPrice: 0.35,
-		description: "DeepSeek-V3.2-Exp is an experimental LLM that introduces DeepSeek Sparse Attention to improve long‑context training and inference efficiency while maintaining performance comparable to V3.1‑Terminus.",
+		description:
+			"DeepSeek-V3.2-Exp is an experimental LLM that introduces DeepSeek Sparse Attention to improve long‑context training and inference efficiency while maintaining performance comparable to V3.1‑Terminus.",
 	},
 	"unsloth/Llama-3.3-70B-Instruct": {
 		maxTokens: 32768, // From Groq
@@ -328,7 +331,7 @@ export const chutesModels = {
 			"GLM-4.6 introduces major upgrades over GLM-4.5, including a longer 200K-token context window for complex tasks, stronger coding performance in benchmarks and real-world tools (such as Claude Code, Cline, Roo Code, and Kilo Code), improved reasoning with tool use during inference, more capable and efficient agent integration, and refined writing that better matches human style, readability, and natural role-play scenarios.",
 	},
 	"zai-org/GLM-4.6-turbo": {
-		maxTokens: 202752, // From Chutes /v1/models: max_output_length
+		maxTokens: 40960, // 20% of 200K context window to leave room for input
 		contextWindow: 202752,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -397,8 +400,9 @@ export const chutesModels = {
 		contextWindow: 262144,
 		supportsImages: true,
 		supportsPromptCache: false,
-		inputPrice: 0.1600,
-		outputPrice: 0.6500,
-		description: "Qwen3‑VL‑235B‑A22B‑Thinking is an open‑weight MoE vision‑language model (235B total, ~22B activated) optimized for deliberate multi‑step reasoning with strong text‑image‑video understanding and long‑context capabilities.",
+		inputPrice: 0.16,
+		outputPrice: 0.65,
+		description:
+			"Qwen3‑VL‑235B‑A22B‑Thinking is an open‑weight MoE vision‑language model (235B total, ~22B activated) optimized for deliberate multi‑step reasoning with strong text‑image‑video understanding and long‑context capabilities.",
 	},
 } as const satisfies Record

From 442a8074119005de760477fc110b8c1c44ff3a86 Mon Sep 17 00:00:00 2001
From: Roo Code
Date: Sat, 25 Oct 2025 15:40:34 +0000
Subject: [PATCH 2/4] =?UTF-8?q?fix:=20enforce=20centralized=2020%=20output?=
 =?UTF-8?q?-token=20cap=20for=20OpenAI-compatible=20providers=20(Chutes)\n?=
 =?UTF-8?q?\nRoot=20cause:=20BaseOpenAiCompatibleProvider=20sent=20model.i?=
 =?UTF-8?q?nfo.maxTokens=20directly,=20bypassing=20getModelMaxOutputTokens?=
 =?UTF-8?q?,=20so=20the=2020%=20context-window=20cap=20wasn=E2=80=99t=20ap?=
 =?UTF-8?q?plied=20for=20Chutes=20(e.g.,=20GLM-4.6-turbo).=20Fix=20applies?=
 =?UTF-8?q?=20getModelMaxOutputTokens=20with=20format=3D"openai"=20before?=
 =?UTF-8?q?=20sending=20max=5Ftokens,=20preventing=20context=20limit=20ove?=
 =?UTF-8?q?rruns.\n\nTests:=20update=20Chutes=20spec=20to=20expect=20clamp?=
 =?UTF-8?q?ed=20max=5Ftokens;=20all=20affected=20tests=20pass.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/api/providers/__tests__/chutes.spec.ts        |  5 ++++-
 .../providers/base-openai-compatible-provider.ts  | 16 +++++++++++-----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/api/providers/__tests__/chutes.spec.ts b/src/api/providers/__tests__/chutes.spec.ts
index 70ee06a923c1..5c5550333861 100644
--- a/src/api/providers/__tests__/chutes.spec.ts
+++ b/src/api/providers/__tests__/chutes.spec.ts
@@ -460,10 +460,13 @@ describe("ChutesHandler", () => {
 		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
 		await messageGenerator.next()
 
+		// Centralized 20% cap should apply to OpenAI-compatible providers like Chutes
+		const expectedMaxTokens = Math.min(modelInfo.maxTokens, Math.ceil(modelInfo.contextWindow * 0.2))
+
 		expect(mockCreate).toHaveBeenCalledWith(
 			expect.objectContaining({
 				model: modelId,
-				max_tokens: modelInfo.maxTokens,
+				max_tokens: expectedMaxTokens,
 				temperature: 0.5,
 				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
 				stream: true,
diff --git a/src/api/providers/base-openai-compatible-provider.ts b/src/api/providers/base-openai-compatible-provider.ts
index fb6c5d03770e..9fe516ed09a3 100644
--- a/src/api/providers/base-openai-compatible-provider.ts
+++ b/src/api/providers/base-openai-compatible-provider.ts
@@ -3,7 +3,7 @@ import OpenAI from "openai"
 
 import type { ModelInfo } from "@roo-code/types"
 
-import type { ApiHandlerOptions } from "../../shared/api"
+import { type ApiHandlerOptions, getModelMaxOutputTokens } from "../../shared/api"
 
 import { ApiStream } from "../transform/stream"
 import { convertToOpenAiMessages } from "../transform/openai-format"
@@ -69,10 +69,16 @@ export abstract class BaseOpenAiCompatibleProvider
 		metadata?: ApiHandlerCreateMessageMetadata,
 		requestOptions?: OpenAI.RequestOptions,
 	) {
-		const {
-			id: model,
-			info: { maxTokens: max_tokens },
-		} = this.getModel()
+		const { id: model, info } = this.getModel()
+
+		// Centralized cap: clamp to 20% of the context window (unless provider-specific exceptions apply)
+		const max_tokens =
+			getModelMaxOutputTokens({
+				modelId: model,
+				model: info,
+				settings: this.options,
+				format: "openai",
+			}) ?? undefined
 
 		const temperature = this.options.modelTemperature ?? this.defaultTemperature
 

From e47fdc5c38a65994097ebd4d28c86dd0ee8a41a2 Mon Sep 17 00:00:00 2001
From: Roo Code
Date: Sat, 25 Oct 2025 15:47:08 +0000
Subject: [PATCH 3/4] test: align Z AI tests with centralized 20% output-token cap\n\nUpdate zai.spec to expect clamped max_tokens (min(model.maxTokens, ceil(contextWindow*0.2))) for OpenAI-compatible requests.

---
 src/api/providers/__tests__/zai.spec.ts | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/api/providers/__tests__/zai.spec.ts b/src/api/providers/__tests__/zai.spec.ts
index 14e3e2465388..ec3c9dbe0e4a 100644
--- a/src/api/providers/__tests__/zai.spec.ts
+++ b/src/api/providers/__tests__/zai.spec.ts
@@ -280,10 +280,13 @@ describe("ZAiHandler", () => {
 		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
 		await messageGenerator.next()
 
+		// Centralized 20% cap should apply to OpenAI-compatible providers like Z AI
+		const expectedMaxTokens = Math.min(modelInfo.maxTokens, Math.ceil(modelInfo.contextWindow * 0.2))
+
 		expect(mockCreate).toHaveBeenCalledWith(
 			expect.objectContaining({
 				model: modelId,
-				max_tokens: modelInfo.maxTokens,
+				max_tokens: expectedMaxTokens,
 				temperature: ZAI_DEFAULT_TEMPERATURE,
 				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
 				stream: true,

From d461d3ec1ac77642a164344759cce0a84ffa20e7 Mon Sep 17 00:00:00 2001
From: Roo Code
Date: Sat, 25 Oct 2025 16:06:50 +0000
Subject: [PATCH 4/4] revert: undo fc04a3a3d6524459fd5a31e27b8e7fc9e71ebf3c change to GLM-4.6-turbo maxTokens; restore 202752 in types; rely on centralized 20% cap at request time

---
 packages/types/src/providers/chutes.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/types/src/providers/chutes.ts b/packages/types/src/providers/chutes.ts
index 0e209ad87431..20fe15017f90 100644
--- a/packages/types/src/providers/chutes.ts
+++ b/packages/types/src/providers/chutes.ts
@@ -331,7 +331,7 @@ export const chutesModels = {
 			"GLM-4.6 introduces major upgrades over GLM-4.5, including a longer 200K-token context window for complex tasks, stronger coding performance in benchmarks and real-world tools (such as Claude Code, Cline, Roo Code, and Kilo Code), improved reasoning with tool use during inference, more capable and efficient agent integration, and refined writing that better matches human style, readability, and natural role-play scenarios.",
 	},
 	"zai-org/GLM-4.6-turbo": {
-		maxTokens: 40960, // 20% of 200K context window to leave room for input
+		maxTokens: 202752, // From Chutes /v1/models: max_output_length
 		contextWindow: 202752,
 		supportsImages: false,
 		supportsPromptCache: false,
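
For reference, a minimal sketch of the clamp this series centralizes, based only on what the updated specs assert (the minimum of the model's maxTokens and 20% of its context window). ModelLimits and clampedMaxOutputTokens are illustrative names, not identifiers from the codebase, and the real getModelMaxOutputTokens may apply provider-specific exceptions beyond this.

// Mirrors the expectation in chutes.spec.ts / zai.spec.ts, not the actual implementation.
interface ModelLimits {
	maxTokens: number
	contextWindow: number
}

function clampedMaxOutputTokens({ maxTokens, contextWindow }: ModelLimits): number {
	// min(model.maxTokens, ceil(contextWindow * 0.2))
	return Math.min(maxTokens, Math.ceil(contextWindow * 0.2))
}

// "zai-org/GLM-4.6-turbo" from chutes.ts: maxTokens 202752, contextWindow 202752
console.log(clampedMaxOutputTokens({ maxTokens: 202752, contextWindow: 202752 }))
// -> 40551, leaving roughly 80% of the window for input instead of 0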