From c04e01953a92ff467a8b6868b256a1c423d5ed51 Mon Sep 17 00:00:00 2001
From: Roo Code
Date: Tue, 12 Aug 2025 09:02:03 +0000
Subject: [PATCH 1/4] fix: use max_completion_tokens for GPT-5 models in LiteLLM provider

- GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
- Added detection for GPT-5 model variants (gpt-5, gpt5, GPT-5, etc.)
- Updated both createMessage and completePrompt methods to handle GPT-5 models
- Added comprehensive tests for GPT-5 model handling

Fixes #6979
---
 src/api/providers/__tests__/lite-llm.spec.ts | 178 +++++++++++++++++--
 src/api/providers/lite-llm.ts                |  23 ++-
 2 files changed, 187 insertions(+), 14 deletions(-)

diff --git a/src/api/providers/__tests__/lite-llm.spec.ts b/src/api/providers/__tests__/lite-llm.spec.ts
index 26ebbc35258a..0056619e46c4 100644
--- a/src/api/providers/__tests__/lite-llm.spec.ts
+++ b/src/api/providers/__tests__/lite-llm.spec.ts
@@ -10,15 +10,9 @@ import { litellmDefaultModelId, litellmDefaultModelInfo } from "@roo-code/types"
 vi.mock("vscode", () => ({}))
 
 // Mock OpenAI
-vi.mock("openai", () => {
-	const mockStream = {
-		[Symbol.asyncIterator]: vi.fn(),
-	}
-
-	const mockCreate = vi.fn().mockReturnValue({
-		withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
-	})
+const mockCreate = vi.fn()
 
+vi.mock("openai", () => {
 	return {
 		default: vi.fn().mockImplementation(() => ({
 			chat: {
@@ -35,6 +29,15 @@ vi.mock("../fetchers/modelCache", () => ({
 	getModels: vi.fn().mockImplementation(() => {
 		return Promise.resolve({
 			[litellmDefaultModelId]: litellmDefaultModelInfo,
+			"gpt-5": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			gpt5: { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"GPT-5": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-5-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt5-preview": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-4": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"claude-3-opus": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"llama-3": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-4-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 		})
 	}),
 }))
@@ -42,7 +45,6 @@ vi.mock("../fetchers/modelCache", () => ({
 describe("LiteLLMHandler", () => {
 	let handler: LiteLLMHandler
 	let mockOptions: ApiHandlerOptions
-	let mockOpenAIClient: any
 
 	beforeEach(() => {
 		vi.clearAllMocks()
@@ -52,7 +54,6 @@ describe("LiteLLMHandler", () => {
 			litellmModelId: litellmDefaultModelId,
 		}
 		handler = new LiteLLMHandler(mockOptions)
-		mockOpenAIClient = new OpenAI()
 	})
 
 	describe("prompt caching", () => {
@@ -85,7 +86,7 @@ describe("LiteLLMHandler", () => {
 				},
 			}
 
-			mockOpenAIClient.chat.completions.create.mockReturnValue({
+			mockCreate.mockReturnValue({
 				withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
 			})
 
@@ -96,7 +97,7 @@ describe("LiteLLMHandler", () => {
 			}
 
 			// Verify that create was called with cache control headers
-			const createCall = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
+			const createCall = mockCreate.mock.calls[0][0]
 
 			// Check system message has cache control in the proper format
 			expect(createCall.messages[0]).toMatchObject({
@@ -155,4 +156,157 @@ describe("LiteLLMHandler", () => {
 			})
 		})
 	})
+
+	describe("GPT-5 model handling", () => {
+		it("should use max_completion_tokens instead of max_tokens for GPT-5 models", async () => {
+			const optionsWithGPT5: ApiHandlerOptions = {
+				...mockOptions,
+				litellmModelId: "gpt-5",
+			}
+			handler = new LiteLLMHandler(optionsWithGPT5)
+
+			const systemPrompt = "You are a helpful assistant"
helpful assistant" + const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }] + + // Mock the stream response + const mockStream = { + async *[Symbol.asyncIterator]() { + yield { + choices: [{ delta: { content: "Hello!" } }], + usage: { + prompt_tokens: 10, + completion_tokens: 5, + }, + } + }, + } + + mockCreate.mockReturnValue({ + withResponse: vi.fn().mockResolvedValue({ data: mockStream }), + }) + + const generator = handler.createMessage(systemPrompt, messages) + const results = [] + for await (const chunk of generator) { + results.push(chunk) + } + + // Verify that create was called with max_completion_tokens instead of max_tokens + const createCall = mockCreate.mock.calls[0][0] + + // Should have max_completion_tokens, not max_tokens + expect(createCall.max_completion_tokens).toBeDefined() + expect(createCall.max_tokens).toBeUndefined() + }) + + it("should use max_completion_tokens for various GPT-5 model variations", async () => { + const gpt5Variations = ["gpt-5", "gpt5", "GPT-5", "gpt-5-turbo", "gpt5-preview"] + + for (const modelId of gpt5Variations) { + vi.clearAllMocks() + + const optionsWithGPT5: ApiHandlerOptions = { + ...mockOptions, + litellmModelId: modelId, + } + handler = new LiteLLMHandler(optionsWithGPT5) + + const systemPrompt = "You are a helpful assistant" + const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test" }] + + // Mock the stream response + const mockStream = { + async *[Symbol.asyncIterator]() { + yield { + choices: [{ delta: { content: "Response" } }], + usage: { + prompt_tokens: 10, + completion_tokens: 5, + }, + } + }, + } + + mockCreate.mockReturnValue({ + withResponse: vi.fn().mockResolvedValue({ data: mockStream }), + }) + + const generator = handler.createMessage(systemPrompt, messages) + for await (const chunk of generator) { + // Consume the generator + } + + // Verify that create was called with max_completion_tokens for this model variation + const createCall = mockCreate.mock.calls[0][0] + + expect(createCall.max_completion_tokens).toBeDefined() + expect(createCall.max_tokens).toBeUndefined() + } + }) + + it("should still use max_tokens for non-GPT-5 models", async () => { + const nonGPT5Models = ["gpt-4", "claude-3-opus", "llama-3", "gpt-4-turbo"] + + for (const modelId of nonGPT5Models) { + vi.clearAllMocks() + + const options: ApiHandlerOptions = { + ...mockOptions, + litellmModelId: modelId, + } + handler = new LiteLLMHandler(options) + + const systemPrompt = "You are a helpful assistant" + const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test" }] + + // Mock the stream response + const mockStream = { + async *[Symbol.asyncIterator]() { + yield { + choices: [{ delta: { content: "Response" } }], + usage: { + prompt_tokens: 10, + completion_tokens: 5, + }, + } + }, + } + + mockCreate.mockReturnValue({ + withResponse: vi.fn().mockResolvedValue({ data: mockStream }), + }) + + const generator = handler.createMessage(systemPrompt, messages) + for await (const chunk of generator) { + // Consume the generator + } + + // Verify that create was called with max_tokens for non-GPT-5 models + const createCall = mockCreate.mock.calls[0][0] + + expect(createCall.max_tokens).toBeDefined() + expect(createCall.max_completion_tokens).toBeUndefined() + } + }) + + it("should use max_completion_tokens in completePrompt for GPT-5 models", async () => { + const optionsWithGPT5: ApiHandlerOptions = { + ...mockOptions, + litellmModelId: "gpt-5", + } + handler = new 
+
+			mockCreate.mockResolvedValue({
+				choices: [{ message: { content: "Test response" } }],
+			})
+
+			await handler.completePrompt("Test prompt")
+
+			// Verify that create was called with max_completion_tokens
+			const createCall = mockCreate.mock.calls[0][0]
+
+			expect(createCall.max_completion_tokens).toBeDefined()
+			expect(createCall.max_tokens).toBeUndefined()
+		})
+	})
 })
diff --git a/src/api/providers/lite-llm.ts b/src/api/providers/lite-llm.ts
index 7cea7411febc..a26e22cbfbce 100644
--- a/src/api/providers/lite-llm.ts
+++ b/src/api/providers/lite-llm.ts
@@ -107,9 +107,11 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 		// Required by some providers; others default to max tokens allowed
 		let maxTokens: number | undefined = info.maxTokens ?? undefined
 
+		// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
+		const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
+
 		const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
 			model: modelId,
-			max_tokens: maxTokens,
 			messages: [systemMessage, ...enhancedMessages],
 			stream: true,
 			stream_options: {
@@ -117,6 +119,14 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 			},
 		}
 
+		// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
+		if (isGPT5Model && maxTokens) {
+			// @ts-ignore - max_completion_tokens is not in the OpenAI types yet but is supported
+			requestOptions.max_completion_tokens = maxTokens
+		} else if (maxTokens) {
+			requestOptions.max_tokens = maxTokens
+		}
+
 		if (this.supportsTemperature(modelId)) {
 			requestOptions.temperature = this.options.modelTemperature ?? 0
 		}
@@ -179,6 +189,9 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 	async completePrompt(prompt: string): Promise<string> {
 		const { id: modelId, info } = await this.fetchModel()
 
+		// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
+		const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
+
 		try {
 			const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
 				model: modelId,
@@ -189,7 +202,13 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 				requestOptions.temperature = this.options.modelTemperature ?? 0
 			}
 
-			requestOptions.max_tokens = info.maxTokens
+			// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
+			if (isGPT5Model && info.maxTokens) {
+				// @ts-ignore - max_completion_tokens is not in the OpenAI types yet but is supported
+				requestOptions.max_completion_tokens = info.maxTokens
+			} else if (info.maxTokens) {
+				requestOptions.max_tokens = info.maxTokens
+			}
 
 			const response = await this.client.chat.completions.create(requestOptions)
 			return response.choices[0]?.message.content || ""

From 3d576b18c6d33d3988315017252242df0daf5525 Mon Sep 17 00:00:00 2001
From: daniel-lxs
Date: Thu, 25 Sep 2025 14:31:15 -0500
Subject: [PATCH 2/4] fix: remove TypeScript ignore for max_completion_tokens in GPT-5 model handling

---
 src/api/providers/lite-llm.ts | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/api/providers/lite-llm.ts b/src/api/providers/lite-llm.ts
index a26e22cbfbce..677f9354c4f5 100644
--- a/src/api/providers/lite-llm.ts
+++ b/src/api/providers/lite-llm.ts
@@ -204,7 +204,6 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 
 			// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
 			if (isGPT5Model && info.maxTokens) {
-				// @ts-ignore - max_completion_tokens is not in the OpenAI types yet but is supported
 				requestOptions.max_completion_tokens = info.maxTokens
 			} else if (info.maxTokens) {
 				requestOptions.max_tokens = info.maxTokens

From f89b23ca635a8705b69ef82d74f35059993bdb9f Mon Sep 17 00:00:00 2001
From: daniel-lxs
Date: Thu, 25 Sep 2025 14:48:40 -0500
Subject: [PATCH 3/4] refactor(litellm): centralize GPT-5 detection; expand variants; add undefined maxTokens guards and tests

---
 src/api/providers/__tests__/lite-llm.spec.ts | 81 +++++++++++++++++++-
 src/api/providers/lite-llm.ts                | 10 ++-
 2 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/src/api/providers/__tests__/lite-llm.spec.ts b/src/api/providers/__tests__/lite-llm.spec.ts
index 0056619e46c4..0b16285f3f34 100644
--- a/src/api/providers/__tests__/lite-llm.spec.ts
+++ b/src/api/providers/__tests__/lite-llm.spec.ts
@@ -34,6 +34,9 @@ vi.mock("../fetchers/modelCache", () => ({
 			"GPT-5": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 			"gpt-5-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 			"gpt5-preview": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-5o": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-5.1": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-5-mini": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 			"gpt-4": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 			"claude-3-opus": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 			"llama-3": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 			"gpt-4-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
@@ -200,7 +203,16 @@ describe("LiteLLMHandler", () => {
 		})
 
 		it("should use max_completion_tokens for various GPT-5 model variations", async () => {
-			const gpt5Variations = ["gpt-5", "gpt5", "GPT-5", "gpt-5-turbo", "gpt5-preview"]
+			const gpt5Variations = [
+				"gpt-5",
+				"gpt5",
+				"GPT-5",
+				"gpt-5-turbo",
+				"gpt5-preview",
+				"gpt-5o",
+				"gpt-5.1",
+				"gpt-5-mini",
+			]
 
 			for (const modelId of gpt5Variations) {
 				vi.clearAllMocks()
@@ -308,5 +320,72 @@ describe("LiteLLMHandler", () => {
 			expect(createCall.max_completion_tokens).toBeDefined()
 			expect(createCall.max_tokens).toBeUndefined()
 		})
+
+		it("should not set any max token fields when maxTokens is undefined (GPT-5 streaming)", async () => {
+			const optionsWithGPT5: ApiHandlerOptions = {
+				...mockOptions,
+				litellmModelId: "gpt-5",
+			}
+			handler = new LiteLLMHandler(optionsWithGPT5)
+
+			// Force fetchModel to return undefined maxTokens
+			vi.spyOn(handler as any, "fetchModel").mockResolvedValue({
+				id: "gpt-5",
+				info: { ...litellmDefaultModelInfo, maxTokens: undefined },
+			})
+
+			// Mock the stream response
+			const mockStream = {
+				async *[Symbol.asyncIterator]() {
+					yield {
+						choices: [{ delta: { content: "Hello!" } }],
+						usage: {
+							prompt_tokens: 10,
+							completion_tokens: 5,
+						},
+					}
+				},
+			}
+
+			mockCreate.mockReturnValue({
+				withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
+			})
+
+			const generator = handler.createMessage("You are a helpful assistant", [
+				{ role: "user", content: "Hello" } as unknown as Anthropic.Messages.MessageParam,
+			])
+			for await (const _chunk of generator) {
+				// consume
+			}
+
+			// Should not include either token field
+			const createCall = mockCreate.mock.calls[0][0]
+			expect(createCall.max_tokens).toBeUndefined()
+			expect(createCall.max_completion_tokens).toBeUndefined()
+		})
+
+		it("should not set any max token fields when maxTokens is undefined (GPT-5 completePrompt)", async () => {
+			const optionsWithGPT5: ApiHandlerOptions = {
+				...mockOptions,
+				litellmModelId: "gpt-5",
+			}
+			handler = new LiteLLMHandler(optionsWithGPT5)
+
+			// Force fetchModel to return undefined maxTokens
+			vi.spyOn(handler as any, "fetchModel").mockResolvedValue({
+				id: "gpt-5",
+				info: { ...litellmDefaultModelInfo, maxTokens: undefined },
+			})
+
+			mockCreate.mockResolvedValue({
+				choices: [{ message: { content: "Ok" } }],
+			})
+
+			await handler.completePrompt("Test prompt")
+
+			const createCall = mockCreate.mock.calls[0][0]
+			expect(createCall.max_tokens).toBeUndefined()
+			expect(createCall.max_completion_tokens).toBeUndefined()
+		})
 	})
 })
diff --git a/src/api/providers/lite-llm.ts b/src/api/providers/lite-llm.ts
index 677f9354c4f5..cd6f5fead8f8 100644
--- a/src/api/providers/lite-llm.ts
+++ b/src/api/providers/lite-llm.ts
@@ -32,6 +32,12 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 		})
 	}
 
+	private isGpt5(modelId: string): boolean {
+		// Match gpt-5, gpt5, and variants like gpt-5o, gpt-5-turbo, gpt5-preview, gpt-5.1
+		// Avoid matching gpt-50, gpt-500, etc.
+		return /\bgpt-?5(?!\d)/i.test(modelId)
+	}
+
 	override async *createMessage(
 		systemPrompt: string,
 		messages: Anthropic.Messages.MessageParam[],
@@ -108,7 +114,7 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 		let maxTokens: number | undefined = info.maxTokens ?? undefined
 
 		// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
-		const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
+		const isGPT5Model = this.isGpt5(modelId)
 
 		const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
 			model: modelId,
@@ -190,7 +196,7 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 		const { id: modelId, info } = await this.fetchModel()
 
 		// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
-		const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
+		const isGPT5Model = this.isGpt5(modelId)
 
 		try {
 			const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
 				model: modelId,

From 822886b3d6a4353e959fef14680b8f34c25ee521 Mon Sep 17 00:00:00 2001
From: daniel-lxs
Date: Thu, 25 Sep 2025 14:53:52 -0500
Subject: [PATCH 4/4] fix: remove TypeScript ignore for max_completion_tokens in GPT-5 model handling

---
 src/api/providers/lite-llm.ts | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/api/providers/lite-llm.ts b/src/api/providers/lite-llm.ts
index cd6f5fead8f8..9f58f092234d 100644
--- a/src/api/providers/lite-llm.ts
+++ b/src/api/providers/lite-llm.ts
@@ -127,7 +127,6 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 
 		// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
 		if (isGPT5Model && maxTokens) {
-			// @ts-ignore - max_completion_tokens is not in the OpenAI types yet but is supported
 			requestOptions.max_completion_tokens = maxTokens
 		} else if (maxTokens) {
 			requestOptions.max_tokens = maxTokens