Merged
5 changes: 5 additions & 0 deletions .changeset/tender-otters-pay.md
@@ -0,0 +1,5 @@
---
"kilo-code": patch
---

fix: cap qwen3-max-thinking max_tokens to provider limit
38 changes: 38 additions & 0 deletions src/shared/__tests__/api.spec.ts
@@ -211,6 +211,44 @@ describe("getModelMaxOutputTokens", () => {
})
})

// kilocode_change start
test("should cap qwen3-max-thinking to provider max output limit of 32,768", () => {
const model: ModelInfo = {
contextWindow: 300_000,
supportsPromptCache: false,
maxTokens: 200_000,
}

const result = getModelMaxOutputTokens({
modelId: "qwen/qwen3-max-thinking",
model,
settings: {},
format: "openrouter",
})

// 20% cap would be 60,000, but model-specific provider cap is 32,768.
expect(result).toBe(32_768)
})

test("should still honor lower context-based cap for qwen3-max-thinking", () => {
const model: ModelInfo = {
contextWindow: 100_000,
supportsPromptCache: false,
maxTokens: 200_000,
}

const result = getModelMaxOutputTokens({
modelId: "qwen/qwen3-max-thinking",
model,
settings: {},
format: "openrouter",
})

// 20% cap is 20,000 which is lower than 32,768.
expect(result).toBe(20_000)
})
Comment on lines +215 to +249
Copilot AI Feb 15, 2026
The new qwen3-max-thinking tests add Kilocode-specific behavior but aren’t marked with // kilocode_change comments. Please wrap these new test cases (or annotate the added lines) so downstream merges can distinguish fork changes from upstream Roo.

Author
Addressed in 13caa14: wrapped the two qwen3-max-thinking tests with // kilocode_change start/end markers in src/shared/__tests__/api.spec.ts.

// kilocode_change end

test("should handle GPT-5 models with various max token configurations", () => {
const testCases = [
{
18 changes: 17 additions & 1 deletion src/shared/api.ts
@@ -108,6 +108,9 @@ export const shouldUseReasoningEffort = ({
export const DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS = 16_384
export const DEFAULT_HYBRID_REASONING_MODEL_THINKING_TOKENS = 8_192
export const GEMINI_25_PRO_MIN_THINKING_TOKENS = 128
// kilocode_change start
const QWEN3_MAX_THINKING_OUTPUT_TOKEN_LIMIT = 32_768
// kilocode_change end

// Max Tokens

@@ -143,6 +146,10 @@ export const getModelMaxOutputTokens = ({
return ANTHROPIC_DEFAULT_MAX_TOKENS
}

// kilocode_change start
const isQwen3MaxThinkingModel = modelId.toLowerCase().includes("qwen3-max-thinking")
P1: Mark shared src changes with kilocode_change comments

The new qwen3-max-thinking logic is added in src/ without kilocode_change markers, but the repository guideline in /workspace/kilocode/AGENTS.md requires all core-extension edits under src/ to be wrapped so upstream fork merges can isolate Kilo-specific patches; leaving this unmarked increases the chance of merge conflicts or accidental overwrite during the scripted Roo sync process.

Author
Done in 13caa14. Added explicit // kilocode_change annotations for the qwen3-max-thinking logic in src/shared/api.ts and corresponding tests.

// kilocode_change end

// If model has explicit maxTokens, clamp it to 20% of the context window
// Exception: GPT-5 models should use their exact configured max output tokens
if (model.maxTokens) {
@@ -154,8 +161,17 @@
return model.maxTokens
}

const contextCappedMaxTokens = Math.min(model.maxTokens, Math.ceil(model.contextWindow * 0.2))

// kilocode_change start
// qwen3-max-thinking currently rejects values above 32,768 (upstream provider constraint).
if (isQwen3MaxThinkingModel) {
return Math.min(contextCappedMaxTokens, QWEN3_MAX_THINKING_OUTPUT_TOKEN_LIMIT)
}
Comment on lines 112 to +170
Copilot AI Feb 15, 2026

New Kilocode-specific logic/constants should be marked with // kilocode_change (or a start/end block) to keep future merges with upstream Roo manageable. Please annotate the newly added Qwen3 provider cap constant and the qwen3-max-thinking special-case branch accordingly.

Author
Addressed in 13caa14: added // kilocode_change markers around the new qwen3-specific constant and branch in src/shared/api.ts.

// kilocode_change end

// All other models are clamped to 20% of context window
- return Math.min(model.maxTokens, Math.ceil(model.contextWindow * 0.2))
+ return contextCappedMaxTokens
}

// For non-Anthropic formats without explicit maxTokens, return undefined
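Taken together, the capping behavior in this PR can be sketched as a small standalone function. This is a simplified illustration, not the real implementation: the hypothetical helper name `capMaxOutputTokens` and its flat parameter list are assumptions for the sketch, while the actual logic lives inside `getModelMaxOutputTokens` in `src/shared/api.ts` and handles additional cases (Anthropic defaults, GPT-5 exceptions, reasoning-budget settings).

```typescript
// Simplified sketch of the max_tokens capping logic from this PR.
// Hypothetical helper; the real code is getModelMaxOutputTokens in src/shared/api.ts.

const QWEN3_MAX_THINKING_OUTPUT_TOKEN_LIMIT = 32_768;

function capMaxOutputTokens(modelId: string, maxTokens: number, contextWindow: number): number {
	// General rule: clamp an explicit maxTokens to 20% of the context window.
	const contextCappedMaxTokens = Math.min(maxTokens, Math.ceil(contextWindow * 0.2));

	// qwen3-max-thinking currently rejects values above 32,768 (provider constraint),
	// so apply the stricter of the two caps.
	if (modelId.toLowerCase().includes("qwen3-max-thinking")) {
		return Math.min(contextCappedMaxTokens, QWEN3_MAX_THINKING_OUTPUT_TOKEN_LIMIT);
	}

	return contextCappedMaxTokens;
}

// Mirrors the two new test cases:
console.log(capMaxOutputTokens("qwen/qwen3-max-thinking", 200_000, 300_000)); // 32768 (provider cap wins over 60,000)
console.log(capMaxOutputTokens("qwen/qwen3-max-thinking", 200_000, 100_000)); // 20000 (context cap is already lower)
```

Note the ordering: the 20% context cap is computed first, and the provider limit is applied on top of it, so whichever cap is lower always wins.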