From 913a39ae5548807f08028405a00212d541341d4f Mon Sep 17 00:00:00 2001
From: Olusammytee
Date: Sun, 15 Feb 2026 11:02:11 -0500
Subject: [PATCH 1/3] fix: cap qwen3-max-thinking max tokens to provider limit

---
 src/shared/__tests__/api.spec.ts | 36 ++++++++++++++++++++++++++++++++
 src/shared/api.ts                | 12 ++++++++++-
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/src/shared/__tests__/api.spec.ts b/src/shared/__tests__/api.spec.ts
index 9f85b19e33b..0857f785c46 100644
--- a/src/shared/__tests__/api.spec.ts
+++ b/src/shared/__tests__/api.spec.ts
@@ -211,6 +211,42 @@ describe("getModelMaxOutputTokens", () => {
 		})
 	})
 
+	test("should cap qwen3-max-thinking to provider max output limit of 32,768", () => {
+		const model: ModelInfo = {
+			contextWindow: 300_000,
+			supportsPromptCache: false,
+			maxTokens: 200_000,
+		}
+
+		const result = getModelMaxOutputTokens({
+			modelId: "qwen/qwen3-max-thinking",
+			model,
+			settings: {},
+			format: "openrouter",
+		})
+
+		// 20% cap would be 60,000, but model-specific provider cap is 32,768.
+		expect(result).toBe(32_768)
+	})
+
+	test("should still honor lower context-based cap for qwen3-max-thinking", () => {
+		const model: ModelInfo = {
+			contextWindow: 100_000,
+			supportsPromptCache: false,
+			maxTokens: 200_000,
+		}
+
+		const result = getModelMaxOutputTokens({
+			modelId: "qwen/qwen3-max-thinking",
+			model,
+			settings: {},
+			format: "openrouter",
+		})
+
+		// 20% cap is 20,000 which is lower than 32,768.
+		expect(result).toBe(20_000)
+	})
+
 	test("should handle GPT-5 models with various max token configurations", () => {
 		const testCases = [
 			{
diff --git a/src/shared/api.ts b/src/shared/api.ts
index 97162006001..59977d5f379 100644
--- a/src/shared/api.ts
+++ b/src/shared/api.ts
@@ -108,6 +108,7 @@ export const shouldUseReasoningEffort = ({
 export const DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS = 16_384
 export const DEFAULT_HYBRID_REASONING_MODEL_THINKING_TOKENS = 8_192
 export const GEMINI_25_PRO_MIN_THINKING_TOKENS = 128
+const QWEN3_MAX_THINKING_OUTPUT_TOKEN_LIMIT = 32_768
 
 // Max Tokens
 
@@ -143,6 +144,8 @@ export const getModelMaxOutputTokens = ({
 		return ANTHROPIC_DEFAULT_MAX_TOKENS
 	}
 
+	const isQwen3MaxThinkingModel = modelId.toLowerCase().includes("qwen3-max-thinking")
+
 	// If model has explicit maxTokens, clamp it to 20% of the context window
 	// Exception: GPT-5 models should use their exact configured max output tokens
 	if (model.maxTokens) {
@@ -154,8 +157,15 @@ export const getModelMaxOutputTokens = ({
 			return model.maxTokens
 		}
 
+		const contextCappedMaxTokens = Math.min(model.maxTokens, Math.ceil(model.contextWindow * 0.2))
+
+		// qwen3-max-thinking currently rejects values above 32,768 (upstream provider constraint).
+		if (isQwen3MaxThinkingModel) {
+			return Math.min(contextCappedMaxTokens, QWEN3_MAX_THINKING_OUTPUT_TOKEN_LIMIT)
+		}
+
 		// All other models are clamped to 20% of context window
-		return Math.min(model.maxTokens, Math.ceil(model.contextWindow * 0.2))
+		return contextCappedMaxTokens
 	}
 
 	// For non-Anthropic formats without explicit maxTokens, return undefined

From 13caa14120681648ce71733dcfb06ea603d5b852 Mon Sep 17 00:00:00 2001
From: Olusammytee
Date: Mon, 16 Feb 2026 00:11:48 -0500
Subject: [PATCH 2/3] chore: mark qwen token-cap changes as kilocode specific

---
 src/shared/__tests__/api.spec.ts | 2 ++
 src/shared/api.ts                | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/src/shared/__tests__/api.spec.ts b/src/shared/__tests__/api.spec.ts
index 0857f785c46..5092cf146ae 100644
--- a/src/shared/__tests__/api.spec.ts
+++ b/src/shared/__tests__/api.spec.ts
@@ -211,6 +211,7 @@ describe("getModelMaxOutputTokens", () => {
 		})
 	})
 
+	// kilocode_change start
 	test("should cap qwen3-max-thinking to provider max output limit of 32,768", () => {
 		const model: ModelInfo = {
 			contextWindow: 300_000,
@@ -246,6 +247,7 @@ describe("getModelMaxOutputTokens", () => {
 		// 20% cap is 20,000 which is lower than 32,768.
 		expect(result).toBe(20_000)
 	})
+	// kilocode_change end
 
 	test("should handle GPT-5 models with various max token configurations", () => {
 		const testCases = [
diff --git a/src/shared/api.ts b/src/shared/api.ts
index 59977d5f379..da618713258 100644
--- a/src/shared/api.ts
+++ b/src/shared/api.ts
@@ -108,7 +108,9 @@ export const shouldUseReasoningEffort = ({
 export const DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS = 16_384
 export const DEFAULT_HYBRID_REASONING_MODEL_THINKING_TOKENS = 8_192
 export const GEMINI_25_PRO_MIN_THINKING_TOKENS = 128
+// kilocode_change start
 const QWEN3_MAX_THINKING_OUTPUT_TOKEN_LIMIT = 32_768
+// kilocode_change end
 
 // Max Tokens
 
@@ -144,7 +146,9 @@ export const getModelMaxOutputTokens = ({
 		return ANTHROPIC_DEFAULT_MAX_TOKENS
 	}
 
+	// kilocode_change start
 	const isQwen3MaxThinkingModel = modelId.toLowerCase().includes("qwen3-max-thinking")
+	// kilocode_change end
 
 	// If model has explicit maxTokens, clamp it to 20% of the context window
 	// Exception: GPT-5 models should use their exact configured max output tokens
@@ -159,10 +163,12 @@ export const getModelMaxOutputTokens = ({
 
 		const contextCappedMaxTokens = Math.min(model.maxTokens, Math.ceil(model.contextWindow * 0.2))
 
+		// kilocode_change start
 		// qwen3-max-thinking currently rejects values above 32,768 (upstream provider constraint).
 		if (isQwen3MaxThinkingModel) {
 			return Math.min(contextCappedMaxTokens, QWEN3_MAX_THINKING_OUTPUT_TOKEN_LIMIT)
 		}
+		// kilocode_change end
 
 		// All other models are clamped to 20% of context window
 		return contextCappedMaxTokens

From c7d5865ac2f7004b31a31f1f28886987294f62a1 Mon Sep 17 00:00:00 2001
From: Kevin van Dijk
Date: Sat, 21 Feb 2026 21:07:25 +0100
Subject: [PATCH 3/3] Add changeset

---
 .changeset/tender-otters-pay.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .changeset/tender-otters-pay.md

diff --git a/.changeset/tender-otters-pay.md b/.changeset/tender-otters-pay.md
new file mode 100644
index 00000000000..23ad8cee966
--- /dev/null
+++ b/.changeset/tender-otters-pay.md
@@ -0,0 +1,5 @@
+---
+"kilo-code": patch
+---
+
+fix: cap qwen3-max-thinking max_tokens to provider limit