Skip to content

Commit c5dc8bf

Browse files
refactor: move conservative maxTokens to types file per review feedback
Instead of clamping max_tokens in the provider, set maxTokens to 8192 directly in cerebras.ts types file. This is cleaner and keeps the rate-limiting logic in one place.
1 parent 8858b0a commit c5dc8bf

File tree

2 files changed

+6
-15
lines changed

2 files changed

+6
-15
lines changed

packages/types/src/providers/cerebras.ts

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ export const cerebrasDefaultModelId: CerebrasModelId = "gpt-oss-120b"

 export const cerebrasModels = {
 	"zai-glm-4.6": {
-		maxTokens: 16384, // consistent with their other models
+		maxTokens: 8192, // Conservative default to avoid premature rate limiting (Cerebras reserves quota upfront)
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -17,7 +17,7 @@ export const cerebrasModels = {
 		description: "Highly intelligent general purpose model with up to 1,000 tokens/s",
 	},
 	"qwen-3-235b-a22b-instruct-2507": {
-		maxTokens: 64000,
+		maxTokens: 8192, // Conservative default to avoid premature rate limiting
 		contextWindow: 64000,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -27,7 +27,7 @@ export const cerebrasModels = {
 		description: "Intelligent model with ~1400 tokens/s",
 	},
 	"llama-3.3-70b": {
-		maxTokens: 64000,
+		maxTokens: 8192, // Conservative default to avoid premature rate limiting
 		contextWindow: 64000,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -37,7 +37,7 @@ export const cerebrasModels = {
 		description: "Powerful model with ~2600 tokens/s",
 	},
 	"qwen-3-32b": {
-		maxTokens: 64000,
+		maxTokens: 8192, // Conservative default to avoid premature rate limiting
 		contextWindow: 64000,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -47,7 +47,7 @@ export const cerebrasModels = {
 		description: "SOTA coding performance with ~2500 tokens/s",
 	},
 	"gpt-oss-120b": {
-		maxTokens: 8000,
+		maxTokens: 8192, // Conservative default to avoid premature rate limiting
 		contextWindow: 64000,
 		supportsImages: false,
 		supportsPromptCache: false,

src/api/providers/cerebras.ts

Lines changed: 1 addition & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -16,13 +16,6 @@ import { t } from "../../i18n"
1616
const CEREBRAS_BASE_URL = "https://api.cerebras.ai/v1"
1717
const CEREBRAS_DEFAULT_TEMPERATURE = 0
1818

19-
/**
20-
* Conservative max_tokens for Cerebras to avoid premature rate limiting.
21-
* Cerebras rate limiter estimates token consumption using max_completion_tokens upfront,
22-
* so requesting the model maximum (e.g., 64K) reserves that quota even if actual usage is low.
23-
* 8K is sufficient for most agentic tool use while preserving rate limit headroom.
24-
*/
25-
const CEREBRAS_DEFAULT_MAX_TOKENS = 8_192
2619
const CEREBRAS_INTEGRATION_HEADER = "X-Cerebras-3rd-Party-Integration"
2720
const CEREBRAS_INTEGRATION_NAME = "roocode"
2821

@@ -116,14 +109,12 @@ export class CerebrasHandler extends BaseProvider implements SingleCompletionHan
 		const openaiMessages = convertToOpenAiMessages(messages)

 		// Prepare request body following Cerebras API specification exactly
-		// Use conservative default to avoid premature rate limiting (Cerebras reserves quota upfront)
-		const effectiveMaxTokens = Math.min(max_tokens || CEREBRAS_DEFAULT_MAX_TOKENS, CEREBRAS_DEFAULT_MAX_TOKENS)
 		const requestBody: Record<string, any> = {
 			model,
 			messages: [{ role: "system", content: systemPrompt }, ...openaiMessages],
 			stream: true,
 			// Use max_completion_tokens (Cerebras-specific parameter)
-			...(effectiveMaxTokens > 0 ? { max_completion_tokens: effectiveMaxTokens } : {}),
+			...(max_tokens && max_tokens > 0 && max_tokens <= 32768 ? { max_completion_tokens: max_tokens } : {}),
 			// Clamp temperature to Cerebras range (0 to 1.5)
 			...(temperature !== undefined && temperature !== CEREBRAS_DEFAULT_TEMPERATURE
 				? {

0 commit comments

Comments (0)