Skip to content

Commit c5dc8bf

Browse files
refactor: move conservative maxTokens to types file per review feedback
Instead of clamping max_tokens in the provider, set maxTokens to 8192 directly in cerebras.ts types file. This is cleaner and keeps the rate-limiting logic in one place.
1 parent 8858b0a commit c5dc8bf

File tree

2 files changed

+6
-15
lines changed

2 files changed

+6
-15
lines changed

packages/types/src/providers/cerebras.ts

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ export const cerebrasDefaultModelId: CerebrasModelId = "gpt-oss-120b"

 export const cerebrasModels = {
 	"zai-glm-4.6": {
-		maxTokens: 16384, // consistent with their other models
+		maxTokens: 8192, // Conservative default to avoid premature rate limiting (Cerebras reserves quota upfront)
 		contextWindow: 131072,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -17,7 +17,7 @@ export const cerebrasModels = {
 		description: "Highly intelligent general purpose model with up to 1,000 tokens/s",
 	},
 	"qwen-3-235b-a22b-instruct-2507": {
-		maxTokens: 64000,
+		maxTokens: 8192, // Conservative default to avoid premature rate limiting
 		contextWindow: 64000,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -27,7 +27,7 @@ export const cerebrasModels = {
 		description: "Intelligent model with ~1400 tokens/s",
 	},
 	"llama-3.3-70b": {
-		maxTokens: 64000,
+		maxTokens: 8192, // Conservative default to avoid premature rate limiting
 		contextWindow: 64000,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -37,7 +37,7 @@ export const cerebrasModels = {
 		description: "Powerful model with ~2600 tokens/s",
 	},
 	"qwen-3-32b": {
-		maxTokens: 64000,
+		maxTokens: 8192, // Conservative default to avoid premature rate limiting
 		contextWindow: 64000,
 		supportsImages: false,
 		supportsPromptCache: false,
@@ -47,7 +47,7 @@ export const cerebrasModels = {
 		description: "SOTA coding performance with ~2500 tokens/s",
 	},
 	"gpt-oss-120b": {
-		maxTokens: 8000,
+		maxTokens: 8192, // Conservative default to avoid premature rate limiting
 		contextWindow: 64000,
 		supportsImages: false,
 		supportsPromptCache: false,

src/api/providers/cerebras.ts

Lines changed: 1 addition & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -16,13 +16,6 @@ import { t } from "../../i18n"
1616
const CEREBRAS_BASE_URL = "https://api.cerebras.ai/v1"
1717
const CEREBRAS_DEFAULT_TEMPERATURE = 0
1818

19-
/**
20-
* Conservative max_tokens for Cerebras to avoid premature rate limiting.
21-
* Cerebras rate limiter estimates token consumption using max_completion_tokens upfront,
22-
* so requesting the model maximum (e.g., 64K) reserves that quota even if actual usage is low.
23-
* 8K is sufficient for most agentic tool use while preserving rate limit headroom.
24-
*/
25-
const CEREBRAS_DEFAULT_MAX_TOKENS = 8_192
2619
const CEREBRAS_INTEGRATION_HEADER = "X-Cerebras-3rd-Party-Integration"
2720
const CEREBRAS_INTEGRATION_NAME = "roocode"
2821

@@ -116,14 +109,12 @@ export class CerebrasHandler extends BaseProvider implements SingleCompletionHan
 		const openaiMessages = convertToOpenAiMessages(messages)

 		// Prepare request body following Cerebras API specification exactly
-		// Use conservative default to avoid premature rate limiting (Cerebras reserves quota upfront)
-		const effectiveMaxTokens = Math.min(max_tokens || CEREBRAS_DEFAULT_MAX_TOKENS, CEREBRAS_DEFAULT_MAX_TOKENS)
 		const requestBody: Record<string, any> = {
 			model,
 			messages: [{ role: "system", content: systemPrompt }, ...openaiMessages],
 			stream: true,
 			// Use max_completion_tokens (Cerebras-specific parameter)
-			...(effectiveMaxTokens > 0 ? { max_completion_tokens: effectiveMaxTokens } : {}),
+			...(max_tokens && max_tokens > 0 && max_tokens <= 32768 ? { max_completion_tokens: max_tokens } : {}),
 			// Clamp temperature to Cerebras range (0 to 1.5)
 			...(temperature !== undefined && temperature !== CEREBRAS_DEFAULT_TEMPERATURE
 				? {

0 commit comments

Comments (0)