diff --git a/packages/types/src/providers/chutes.ts b/packages/types/src/providers/chutes.ts
index 237cb76e804c..20fe15017f90 100644
--- a/packages/types/src/providers/chutes.ts
+++ b/packages/types/src/providers/chutes.ts
@@ -88,7 +88,8 @@ export const chutesModels = {
 		supportsPromptCache: false,
 		inputPrice: 0.23,
 		outputPrice: 0.9,
-		description: "DeepSeek‑V3.1‑Terminus is an update to V3.1 that improves language consistency by reducing CN/EN mix‑ups and eliminating random characters, while strengthening agent capabilities with notably better Code Agent and Search Agent performance.",
+		description:
+			"DeepSeek‑V3.1‑Terminus is an update to V3.1 that improves language consistency by reducing CN/EN mix‑ups and eliminating random characters, while strengthening agent capabilities with notably better Code Agent and Search Agent performance.",
 	},
 	"deepseek-ai/DeepSeek-V3.1-turbo": {
 		maxTokens: 32768,
@@ -97,7 +98,8 @@ export const chutesModels = {
 		supportsPromptCache: false,
 		inputPrice: 1.0,
 		outputPrice: 3.0,
-		description: "DeepSeek-V3.1-turbo is an FP8, speculative-decoding turbo variant optimized for ultra-fast single-shot queries (~200 TPS), with outputs close to the originals and solid function calling/reasoning/structured output, priced at $1/M input and $3/M output tokens, using 2× quota per request and not intended for bulk workloads.",
+		description:
+			"DeepSeek-V3.1-turbo is an FP8, speculative-decoding turbo variant optimized for ultra-fast single-shot queries (~200 TPS), with outputs close to the originals and solid function calling/reasoning/structured output, priced at $1/M input and $3/M output tokens, using 2× quota per request and not intended for bulk workloads.",
 	},
 	"deepseek-ai/DeepSeek-V3.2-Exp": {
 		maxTokens: 163840,
@@ -106,7 +108,8 @@ export const chutesModels = {
 		supportsPromptCache: false,
 		inputPrice: 0.25,
 		outputPrice: 0.35,
-		description: "DeepSeek-V3.2-Exp is an experimental LLM that introduces DeepSeek Sparse Attention to improve long‑context training and inference efficiency while maintaining performance comparable to V3.1‑Terminus.",
+		description:
+			"DeepSeek-V3.2-Exp is an experimental LLM that introduces DeepSeek Sparse Attention to improve long‑context training and inference efficiency while maintaining performance comparable to V3.1‑Terminus.",
 	},
 	"unsloth/Llama-3.3-70B-Instruct": {
 		maxTokens: 32768, // From Groq
@@ -397,8 +400,9 @@ export const chutesModels = {
 		contextWindow: 262144,
 		supportsImages: true,
 		supportsPromptCache: false,
-		inputPrice: 0.1600,
-		outputPrice: 0.6500,
-		description: "Qwen3‑VL‑235B‑A22B‑Thinking is an open‑weight MoE vision‑language model (235B total, ~22B activated) optimized for deliberate multi‑step reasoning with strong text‑image‑video understanding and long‑context capabilities.",
+		inputPrice: 0.16,
+		outputPrice: 0.65,
+		description:
+			"Qwen3‑VL‑235B‑A22B‑Thinking is an open‑weight MoE vision‑language model (235B total, ~22B activated) optimized for deliberate multi‑step reasoning with strong text‑image‑video understanding and long‑context capabilities.",
 	},
 } as const satisfies Record<string, ModelInfo>
diff --git a/src/api/providers/__tests__/chutes.spec.ts b/src/api/providers/__tests__/chutes.spec.ts
index 70ee06a923c1..5c5550333861 100644
--- a/src/api/providers/__tests__/chutes.spec.ts
+++ b/src/api/providers/__tests__/chutes.spec.ts
@@ -460,10 +460,13 @@ describe("ChutesHandler", () => {
 		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
 		await messageGenerator.next()
 
+		// Centralized 20% cap should apply to OpenAI-compatible providers like Chutes
+		const expectedMaxTokens = Math.min(modelInfo.maxTokens, Math.ceil(modelInfo.contextWindow * 0.2))
+
 		expect(mockCreate).toHaveBeenCalledWith(
 			expect.objectContaining({
 				model: modelId,
-				max_tokens: modelInfo.maxTokens,
+				max_tokens: expectedMaxTokens,
 				temperature: 0.5,
 				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
 				stream: true,
diff --git a/src/api/providers/__tests__/zai.spec.ts b/src/api/providers/__tests__/zai.spec.ts
index 14e3e2465388..ec3c9dbe0e4a 100644
--- a/src/api/providers/__tests__/zai.spec.ts
+++ b/src/api/providers/__tests__/zai.spec.ts
@@ -280,10 +280,13 @@ describe("ZAiHandler", () => {
 		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
 		await messageGenerator.next()
 
+		// Centralized 20% cap should apply to OpenAI-compatible providers like Z AI
+		const expectedMaxTokens = Math.min(modelInfo.maxTokens, Math.ceil(modelInfo.contextWindow * 0.2))
+
 		expect(mockCreate).toHaveBeenCalledWith(
 			expect.objectContaining({
 				model: modelId,
-				max_tokens: modelInfo.maxTokens,
+				max_tokens: expectedMaxTokens,
 				temperature: ZAI_DEFAULT_TEMPERATURE,
 				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
 				stream: true,
diff --git a/src/api/providers/base-openai-compatible-provider.ts b/src/api/providers/base-openai-compatible-provider.ts
index fb6c5d03770e..9fe516ed09a3 100644
--- a/src/api/providers/base-openai-compatible-provider.ts
+++ b/src/api/providers/base-openai-compatible-provider.ts
@@ -3,7 +3,7 @@ import OpenAI from "openai"
 
 import type { ModelInfo } from "@roo-code/types"
 
-import type { ApiHandlerOptions } from "../../shared/api"
+import { type ApiHandlerOptions, getModelMaxOutputTokens } from "../../shared/api"
 
 import { ApiStream } from "../transform/stream"
 import { convertToOpenAiMessages } from "../transform/openai-format"
@@ -69,10 +69,16 @@ export abstract class BaseOpenAiCompatibleProvider
 		metadata?: ApiHandlerCreateMessageMetadata,
 		requestOptions?: OpenAI.RequestOptions,
 	) {
-		const {
-			id: model,
-			info: { maxTokens: max_tokens },
-		} = this.getModel()
+		const { id: model, info } = this.getModel()
+
+		// Centralized cap: clamp to 20% of the context window (unless provider-specific exceptions apply)
+		const max_tokens =
+			getModelMaxOutputTokens({
+				modelId: model,
+				model: info,
+				settings: this.options,
+				format: "openai",
+			}) ?? undefined
 
 		const temperature = this.options.modelTemperature ?? this.defaultTemperature
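
For reviewers, the expectation both updated specs encode reduces to a single clamp. The sketch below is a minimal, hypothetical TypeScript illustration of that calculation; `capMaxOutputTokens` and `ModelInfoLike` are made-up names for this example, not the real helper — the actual behaviour comes from `getModelMaxOutputTokens` in `src/shared/api`, which also accounts for user settings and provider-specific exceptions not shown here.

```ts
// Hypothetical, self-contained sketch of the 20% cap asserted by the tests above.
// The real logic lives in getModelMaxOutputTokens (src/shared/api).
interface ModelInfoLike {
	maxTokens: number
	contextWindow: number
}

// Clamp the model's advertised max output tokens to 20% of its context window.
function capMaxOutputTokens(info: ModelInfoLike): number {
	return Math.min(info.maxTokens, Math.ceil(info.contextWindow * 0.2))
}

// Example: a model advertising 32,768 output tokens with a 65,536-token context
// window is capped at ceil(65536 * 0.2) = 13108 output tokens.
console.log(capMaxOutputTokens({ maxTokens: 32768, contextWindow: 65536 })) // 13108
```

In practice, OpenAI-compatible providers such as Chutes and Z AI no longer request a model's full advertised output budget when it exceeds 20% of the context window, which is what the `expectedMaxTokens` assertions in both specs verify.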