From c04e01953a92ff467a8b6868b256a1c423d5ed51 Mon Sep 17 00:00:00 2001
From: Roo Code
Date: Tue, 12 Aug 2025 09:02:03 +0000
Subject: [PATCH 1/4] fix: use max_completion_tokens for GPT-5 models in LiteLLM provider

- GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
- Added detection for GPT-5 model variants (gpt-5, gpt5, GPT-5, etc.)
- Updated both createMessage and completePrompt methods to handle GPT-5 models
- Added comprehensive tests for GPT-5 model handling

Fixes #6979
---
 src/api/providers/__tests__/lite-llm.spec.ts | 178 +++++++++++++++++--
 src/api/providers/lite-llm.ts                |  23 ++-
 2 files changed, 187 insertions(+), 14 deletions(-)

diff --git a/src/api/providers/__tests__/lite-llm.spec.ts b/src/api/providers/__tests__/lite-llm.spec.ts
index 26ebbc35258a..0056619e46c4 100644
--- a/src/api/providers/__tests__/lite-llm.spec.ts
+++ b/src/api/providers/__tests__/lite-llm.spec.ts
@@ -10,15 +10,9 @@ import { litellmDefaultModelId, litellmDefaultModelInfo } from "@roo-code/types"
 vi.mock("vscode", () => ({}))
 
 // Mock OpenAI
-vi.mock("openai", () => {
-	const mockStream = {
-		[Symbol.asyncIterator]: vi.fn(),
-	}
-
-	const mockCreate = vi.fn().mockReturnValue({
-		withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
-	})
+const mockCreate = vi.fn()
 
+vi.mock("openai", () => {
 	return {
 		default: vi.fn().mockImplementation(() => ({
 			chat: {
@@ -35,6 +29,15 @@ vi.mock("../fetchers/modelCache", () => ({
 	getModels: vi.fn().mockImplementation(() => {
 		return Promise.resolve({
 			[litellmDefaultModelId]: litellmDefaultModelInfo,
+			"gpt-5": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			gpt5: { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"GPT-5": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-5-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt5-preview": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-4": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"claude-3-opus": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"llama-3": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-4-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 		})
 	}),
 }))
@@ -42,7 +45,6 @@ vi.mock("../fetchers/modelCache", () => ({
 describe("LiteLLMHandler", () => {
 	let handler: LiteLLMHandler
 	let mockOptions: ApiHandlerOptions
-	let mockOpenAIClient: any
 
 	beforeEach(() => {
 		vi.clearAllMocks()
@@ -52,7 +54,6 @@ describe("LiteLLMHandler", () => {
 			litellmModelId: litellmDefaultModelId,
 		}
 		handler = new LiteLLMHandler(mockOptions)
-		mockOpenAIClient = new OpenAI()
 	})
 
 	describe("prompt caching", () => {
@@ -85,7 +86,7 @@ describe("LiteLLMHandler", () => {
 				},
 			}
 
-			mockOpenAIClient.chat.completions.create.mockReturnValue({
+			mockCreate.mockReturnValue({
 				withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
 			})
 
@@ -96,7 +97,7 @@ describe("LiteLLMHandler", () => {
 			}
 
 			// Verify that create was called with cache control headers
-			const createCall = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
+			const createCall = mockCreate.mock.calls[0][0]
 
 			// Check system message has cache control in the proper format
 			expect(createCall.messages[0]).toMatchObject({
@@ -155,4 +156,157 @@ describe("LiteLLMHandler", () => {
 			})
 		})
 	})
+
+	describe("GPT-5 model handling", () => {
+		it("should use max_completion_tokens instead of max_tokens for GPT-5 models", async () => {
+			const optionsWithGPT5: ApiHandlerOptions = {
+				...mockOptions,
+				litellmModelId: "gpt-5",
+			}
+			handler = new LiteLLMHandler(optionsWithGPT5)
+
+			const systemPrompt = "You are a helpful assistant"
helpful assistant" + const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }] + + // Mock the stream response + const mockStream = { + async *[Symbol.asyncIterator]() { + yield { + choices: [{ delta: { content: "Hello!" } }], + usage: { + prompt_tokens: 10, + completion_tokens: 5, + }, + } + }, + } + + mockCreate.mockReturnValue({ + withResponse: vi.fn().mockResolvedValue({ data: mockStream }), + }) + + const generator = handler.createMessage(systemPrompt, messages) + const results = [] + for await (const chunk of generator) { + results.push(chunk) + } + + // Verify that create was called with max_completion_tokens instead of max_tokens + const createCall = mockCreate.mock.calls[0][0] + + // Should have max_completion_tokens, not max_tokens + expect(createCall.max_completion_tokens).toBeDefined() + expect(createCall.max_tokens).toBeUndefined() + }) + + it("should use max_completion_tokens for various GPT-5 model variations", async () => { + const gpt5Variations = ["gpt-5", "gpt5", "GPT-5", "gpt-5-turbo", "gpt5-preview"] + + for (const modelId of gpt5Variations) { + vi.clearAllMocks() + + const optionsWithGPT5: ApiHandlerOptions = { + ...mockOptions, + litellmModelId: modelId, + } + handler = new LiteLLMHandler(optionsWithGPT5) + + const systemPrompt = "You are a helpful assistant" + const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test" }] + + // Mock the stream response + const mockStream = { + async *[Symbol.asyncIterator]() { + yield { + choices: [{ delta: { content: "Response" } }], + usage: { + prompt_tokens: 10, + completion_tokens: 5, + }, + } + }, + } + + mockCreate.mockReturnValue({ + withResponse: vi.fn().mockResolvedValue({ data: mockStream }), + }) + + const generator = handler.createMessage(systemPrompt, messages) + for await (const chunk of generator) { + // Consume the generator + } + + // Verify that create was called with max_completion_tokens for this model variation + const createCall = mockCreate.mock.calls[0][0] + + expect(createCall.max_completion_tokens).toBeDefined() + expect(createCall.max_tokens).toBeUndefined() + } + }) + + it("should still use max_tokens for non-GPT-5 models", async () => { + const nonGPT5Models = ["gpt-4", "claude-3-opus", "llama-3", "gpt-4-turbo"] + + for (const modelId of nonGPT5Models) { + vi.clearAllMocks() + + const options: ApiHandlerOptions = { + ...mockOptions, + litellmModelId: modelId, + } + handler = new LiteLLMHandler(options) + + const systemPrompt = "You are a helpful assistant" + const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test" }] + + // Mock the stream response + const mockStream = { + async *[Symbol.asyncIterator]() { + yield { + choices: [{ delta: { content: "Response" } }], + usage: { + prompt_tokens: 10, + completion_tokens: 5, + }, + } + }, + } + + mockCreate.mockReturnValue({ + withResponse: vi.fn().mockResolvedValue({ data: mockStream }), + }) + + const generator = handler.createMessage(systemPrompt, messages) + for await (const chunk of generator) { + // Consume the generator + } + + // Verify that create was called with max_tokens for non-GPT-5 models + const createCall = mockCreate.mock.calls[0][0] + + expect(createCall.max_tokens).toBeDefined() + expect(createCall.max_completion_tokens).toBeUndefined() + } + }) + + it("should use max_completion_tokens in completePrompt for GPT-5 models", async () => { + const optionsWithGPT5: ApiHandlerOptions = { + ...mockOptions, + litellmModelId: "gpt-5", + } + handler = new 
+
+			mockCreate.mockResolvedValue({
+				choices: [{ message: { content: "Test response" } }],
+			})
+
+			await handler.completePrompt("Test prompt")
+
+			// Verify that create was called with max_completion_tokens
+			const createCall = mockCreate.mock.calls[0][0]
+
+			expect(createCall.max_completion_tokens).toBeDefined()
+			expect(createCall.max_tokens).toBeUndefined()
+		})
+	})
 })
diff --git a/src/api/providers/lite-llm.ts b/src/api/providers/lite-llm.ts
index 7cea7411febc..a26e22cbfbce 100644
--- a/src/api/providers/lite-llm.ts
+++ b/src/api/providers/lite-llm.ts
@@ -107,9 +107,11 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 		// Required by some providers; others default to max tokens allowed
 		let maxTokens: number | undefined = info.maxTokens ?? undefined
 
+		// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
+		const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
+
 		const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
 			model: modelId,
-			max_tokens: maxTokens,
 			messages: [systemMessage, ...enhancedMessages],
 			stream: true,
 			stream_options: {
@@ -117,6 +119,14 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 			},
 		}
 
+		// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
+		if (isGPT5Model && maxTokens) {
+			// @ts-ignore - max_completion_tokens is not in the OpenAI types yet but is supported
+			requestOptions.max_completion_tokens = maxTokens
+		} else if (maxTokens) {
+			requestOptions.max_tokens = maxTokens
+		}
+
 		if (this.supportsTemperature(modelId)) {
 			requestOptions.temperature = this.options.modelTemperature ?? 0
 		}
@@ -179,6 +189,9 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 	async completePrompt(prompt: string): Promise<string> {
 		const { id: modelId, info } = await this.fetchModel()
 
+		// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
+		const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
+
 		try {
 			const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
 				model: modelId,
@@ -189,7 +202,13 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 				requestOptions.temperature = this.options.modelTemperature ?? 0
 			}
 
-			requestOptions.max_tokens = info.maxTokens
+			// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
+			if (isGPT5Model && info.maxTokens) {
+				// @ts-ignore - max_completion_tokens is not in the OpenAI types yet but is supported
+				requestOptions.max_completion_tokens = info.maxTokens
+			} else if (info.maxTokens) {
+				requestOptions.max_tokens = info.maxTokens
+			}
 
 			const response = await this.client.chat.completions.create(requestOptions)
 			return response.choices[0]?.message.content || ""

From 3d576b18c6d33d3988315017252242df0daf5525 Mon Sep 17 00:00:00 2001
From: daniel-lxs
Date: Thu, 25 Sep 2025 14:31:15 -0500
Subject: [PATCH 2/4] fix: remove TypeScript ignore for max_completion_tokens in GPT-5 model handling

---
 src/api/providers/lite-llm.ts | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/api/providers/lite-llm.ts b/src/api/providers/lite-llm.ts
index a26e22cbfbce..677f9354c4f5 100644
--- a/src/api/providers/lite-llm.ts
+++ b/src/api/providers/lite-llm.ts
@@ -204,7 +204,6 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 
 			// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
 			if (isGPT5Model && info.maxTokens) {
-				// @ts-ignore - max_completion_tokens is not in the OpenAI types yet but is supported
 				requestOptions.max_completion_tokens = info.maxTokens
 			} else if (info.maxTokens) {
 				requestOptions.max_tokens = info.maxTokens

From f89b23ca635a8705b69ef82d74f35059993bdb9f Mon Sep 17 00:00:00 2001
From: daniel-lxs
Date: Thu, 25 Sep 2025 14:48:40 -0500
Subject: [PATCH 3/4] refactor(litellm): centralize GPT-5 detection; expand variants; add undefined maxTokens guards and tests

---
 src/api/providers/__tests__/lite-llm.spec.ts | 81 +++++++++++++++++++-
 src/api/providers/lite-llm.ts                | 10 ++-
 2 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/src/api/providers/__tests__/lite-llm.spec.ts b/src/api/providers/__tests__/lite-llm.spec.ts
index 0056619e46c4..0b16285f3f34 100644
--- a/src/api/providers/__tests__/lite-llm.spec.ts
+++ b/src/api/providers/__tests__/lite-llm.spec.ts
@@ -34,6 +34,9 @@ vi.mock("../fetchers/modelCache", () => ({
 			"GPT-5": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 			"gpt-5-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 			"gpt5-preview": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-5o": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-5.1": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-5-mini": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 			"gpt-4": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 			"claude-3-opus": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 			"llama-3": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 			"gpt-4-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
@@ -200,7 +203,16 @@ describe("LiteLLMHandler", () => {
 		})
 
 		it("should use max_completion_tokens for various GPT-5 model variations", async () => {
-			const gpt5Variations = ["gpt-5", "gpt5", "GPT-5", "gpt-5-turbo", "gpt5-preview"]
+			const gpt5Variations = [
+				"gpt-5",
+				"gpt5",
+				"GPT-5",
+				"gpt-5-turbo",
+				"gpt5-preview",
+				"gpt-5o",
+				"gpt-5.1",
+				"gpt-5-mini",
+			]
 
 			for (const modelId of gpt5Variations) {
 				vi.clearAllMocks()
@@ -308,5 +320,72 @@ describe("LiteLLMHandler", () => {
 			expect(createCall.max_completion_tokens).toBeDefined()
 			expect(createCall.max_tokens).toBeUndefined()
 		})
+
+		it("should not set any max token fields when maxTokens is undefined (GPT-5 streaming)", async () => {
+			const optionsWithGPT5: ApiHandlerOptions = {
+				...mockOptions,
+				litellmModelId: "gpt-5",
+			}
+			handler = new LiteLLMHandler(optionsWithGPT5)
+
+			// Force fetchModel to return undefined maxTokens
+			vi.spyOn(handler as any, "fetchModel").mockResolvedValue({
+				id: "gpt-5",
+				info: { ...litellmDefaultModelInfo, maxTokens: undefined },
+			})
+
+			// Mock the stream response
+			const mockStream = {
+				async *[Symbol.asyncIterator]() {
+					yield {
+						choices: [{ delta: { content: "Hello!" } }],
+						usage: {
+							prompt_tokens: 10,
+							completion_tokens: 5,
+						},
+					}
+				},
+			}
+
+			mockCreate.mockReturnValue({
+				withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
+			})
+
+			const generator = handler.createMessage("You are a helpful assistant", [
+				{ role: "user", content: "Hello" } as unknown as Anthropic.Messages.MessageParam,
+			])
+			for await (const _chunk of generator) {
+				// consume
+			}
+
+			// Should not include either token field
+			const createCall = mockCreate.mock.calls[0][0]
+			expect(createCall.max_tokens).toBeUndefined()
+			expect(createCall.max_completion_tokens).toBeUndefined()
+		})
+
+		it("should not set any max token fields when maxTokens is undefined (GPT-5 completePrompt)", async () => {
+			const optionsWithGPT5: ApiHandlerOptions = {
+				...mockOptions,
+				litellmModelId: "gpt-5",
+			}
+			handler = new LiteLLMHandler(optionsWithGPT5)
+
+			// Force fetchModel to return undefined maxTokens
+			vi.spyOn(handler as any, "fetchModel").mockResolvedValue({
+				id: "gpt-5",
+				info: { ...litellmDefaultModelInfo, maxTokens: undefined },
+			})
+
+			mockCreate.mockResolvedValue({
+				choices: [{ message: { content: "Ok" } }],
+			})
+
+			await handler.completePrompt("Test prompt")
+
+			const createCall = mockCreate.mock.calls[0][0]
+			expect(createCall.max_tokens).toBeUndefined()
+			expect(createCall.max_completion_tokens).toBeUndefined()
+		})
 	})
 })
diff --git a/src/api/providers/lite-llm.ts b/src/api/providers/lite-llm.ts
index 677f9354c4f5..cd6f5fead8f8 100644
--- a/src/api/providers/lite-llm.ts
+++ b/src/api/providers/lite-llm.ts
@@ -32,6 +32,12 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 		})
 	}
 
+	private isGpt5(modelId: string): boolean {
+		// Match gpt-5, gpt5, and variants like gpt-5o, gpt-5-turbo, gpt5-preview, gpt-5.1
+		// Avoid matching gpt-50, gpt-500, etc.
+		return /\bgpt-?5(?!\d)/i.test(modelId)
+	}
+
 	override async *createMessage(
 		systemPrompt: string,
 		messages: Anthropic.Messages.MessageParam[],
@@ -108,7 +114,7 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 		let maxTokens: number | undefined = info.maxTokens ?? undefined
 
 		// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
-		const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
+		const isGPT5Model = this.isGpt5(modelId)
 
 		const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
 			model: modelId,
@@ -190,7 +196,7 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 		const { id: modelId, info } = await this.fetchModel()
 
 		// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
-		const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
+		const isGPT5Model = this.isGpt5(modelId)
 
 		try {
 			const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
 				model: modelId,

From 822886b3d6a4353e959fef14680b8f34c25ee521 Mon Sep 17 00:00:00 2001
From: daniel-lxs
Date: Thu, 25 Sep 2025 14:53:52 -0500
Subject: [PATCH 4/4] fix: remove TypeScript ignore for max_completion_tokens in GPT-5 model handling

---
 src/api/providers/lite-llm.ts | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/api/providers/lite-llm.ts b/src/api/providers/lite-llm.ts
index cd6f5fead8f8..9f58f092234d 100644
--- a/src/api/providers/lite-llm.ts
+++ b/src/api/providers/lite-llm.ts
@@ -127,7 +127,6 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 
 		// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
 		if (isGPT5Model && maxTokens) {
-			// @ts-ignore - max_completion_tokens is not in the OpenAI types yet but is supported
 			requestOptions.max_completion_tokens = maxTokens
 		} else if (maxTokens) {
 			requestOptions.max_tokens = maxTokens