1 change: 0 additions & 1 deletion core/llm/index.ts
@@ -996,7 +996,6 @@ export abstract class BaseLLM implements ILLM {
return completionOptions;
}

// Update the processChatChunk method:
private processChatChunk(
chunk: ChatMessage,
interaction: ILLMInteractionLog | undefined,
143 changes: 141 additions & 2 deletions core/llm/llms/Vllm.ts
@@ -1,5 +1,12 @@
import { Chunk, LLMOptions } from "../../index.js";
import {
ChatMessage,
Chunk,
CompletionOptions,
LLMOptions,
} from "../../index.js";

import { LlmApiRequestType } from "../openaiTypeConverters.js";
import { ThinkingTagExtractor } from "../thinkingTagExtractor.js";
import OpenAI from "./OpenAI.js";

// vLLM-specific rerank response types
@@ -20,16 +27,148 @@ interface VllmRerankResponse {
results: VllmRerankItem[];
}

/**
* vLLM-specific options for thinking output extraction.
* These options allow configuring custom tags to extract thinking content from the response.
*/
export interface VllmOptions extends LLMOptions {
/**
* Custom opening tag for extracting thinking/reasoning content from streamed responses.
* Used with models that output thinking content wrapped in custom tags (e.g., `<think>`, `<reasoning>`).
* Must be used together with `thinkingCloseTag`.
*/
thinkingOpenTag?: string;
/**
* Custom closing tag for extracting thinking/reasoning content from streamed responses.
* Must be used together with `thinkingOpenTag`.
*/
thinkingCloseTag?: string;
}

/**
* vLLM provider for Continue.
*
* vLLM supports thinking/reasoning outputs in two ways:
* 1. Via the `reasoning_content` field in the response (vLLM's default when a reasoning parser is enabled)
* 2. Via custom tags in the response content (configurable)
*
* For custom thinking tag formats, you can configure `thinkingOpenTag` and `thinkingCloseTag`
* in the model options. For example:
*
* ```yaml
* models:
* - provider: vllm
* model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
* apiBase: http://localhost:8000
* thinkingOpenTag: "<think>"
* thinkingCloseTag: "</think>"
* ```
*
* See vLLM documentation for more details:
* https://docs.vllm.ai/en/latest/features/reasoning_outputs.html
*/
class Vllm extends OpenAI {
static providerName = "vllm";
constructor(options: LLMOptions) {

// vLLM-specific options for thinking tag extraction
private _thinkingOpenTag?: string;
private _thinkingCloseTag?: string;

// Override useOpenAIAdapterFor to NOT include "streamChat".
// vLLM uses the reasoning_content field for thinking output (via vLLM's reasoning parser),
// which is not part of the standard OpenAI SDK types. By excluding "streamChat", we force
// the use of the parent class's _streamChat method which uses streamSse for direct SSE
// parsing. This ensures proper handling of reasoning_content in streaming responses,
// as streamSse parses JSON directly and preserves all fields including non-standard ones.
protected override useOpenAIAdapterFor: (LlmApiRequestType | "*")[] = [
"chat",
"embed",
"list",
"rerank",
"streamFim",
];

constructor(options: VllmOptions) {
super(options);

// Validate that thinking tags are provided together
if (
(options.thinkingOpenTag && !options.thinkingCloseTag) ||
(!options.thinkingOpenTag && options.thinkingCloseTag)
) {
throw new Error(
"vLLM: Both thinkingOpenTag and thinkingCloseTag must be provided together",
);
}

// Store vLLM-specific options
this._thinkingOpenTag = options.thinkingOpenTag;
this._thinkingCloseTag = options.thinkingCloseTag;

if (options.isFromAutoDetect) {
this._setupCompletionOptions();
}
}

/**
* Override _streamChat to handle thinking tag extraction if configured.
* This allows vLLM to support models that use custom tags (like <think>...</think>)
* instead of the standard reasoning_content field.
*/
protected async *_streamChat(
messages: ChatMessage[],
signal: AbortSignal,
options: CompletionOptions,
): AsyncGenerator<ChatMessage> {
// If no custom thinking tags configured, use parent implementation
if (!this._thinkingOpenTag || !this._thinkingCloseTag) {
for await (const chunk of super._streamChat(messages, signal, options)) {
yield chunk;
}
return;
}

// Use thinking tag extractor for custom tag formats
const extractor = new ThinkingTagExtractor(
this._thinkingOpenTag,
this._thinkingCloseTag,
);

for await (const chunk of super._streamChat(messages, signal, options)) {
if (chunk.role === "assistant" && typeof chunk.content === "string") {
const extracted = extractor.process(chunk.content);

// Yield thinking content first
if (extracted.thinking) {
yield {
role: "thinking",
content: extracted.thinking,
};
}

// Yield regular content if present
if (extracted.content) {
yield {
...chunk,
content: extracted.content,
};
}
} else {
// Pass through non-assistant chunks unchanged
yield chunk;
}
}

// Flush any remaining content from the extractor
const flushed = extractor.flush();
if (flushed.thinking) {
yield { role: "thinking", content: flushed.thinking };
}
if (flushed.content) {
yield { role: "assistant", content: flushed.content };
}
}

supportsFim(): boolean {
return false;
}
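For illustration, a minimal sketch (not part of this diff) of the chunk-level behavior the override above is expected to produce when custom tags are configured; the chunk contents and boundaries are hypothetical:

```ts
// Hypothetical SSE chunks from a model that wraps its reasoning in <think>
// tags, with thinkingOpenTag: "<think>" and thinkingCloseTag: "</think>":
//
//   in:  { role: "assistant", content: "<think>compare the two " }
//   in:  { role: "assistant", content: "options</think>Use a heap." }
//
// Chunks expected to be re-emitted by the overridden _streamChat:
//
//   out: { role: "thinking",  content: "compare the two " }
//   out: { role: "thinking",  content: "options" }
//   out: { role: "assistant", content: "Use a heap." }
```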
127 changes: 127 additions & 0 deletions core/llm/thinkingTagExtractor.ts
@@ -0,0 +1,127 @@
/**
* Helper class to extract thinking content from custom tags during streaming.
* This is used for providers like vLLM that support custom thinking output formats.
*/
export class ThinkingTagExtractor {
private buffer: string = "";
private inThinkingBlock: boolean = false;
private readonly openTag: string;
private readonly closeTag: string;

constructor(openTag: string, closeTag: string) {
this.openTag = openTag;
this.closeTag = closeTag;
}

/**
* Process a chunk of text and extract thinking/regular content.
* Returns an object with the thinking content and regular content that should be yielded.
*/
process(text: string): {
thinking: string;
content: string;
} {
this.buffer += text;

let thinking = "";
let content = "";

while (this.buffer.length > 0) {
if (this.inThinkingBlock) {
// Look for closing tag
const closeIndex = this.buffer.indexOf(this.closeTag);
if (closeIndex !== -1) {
// Found closing tag - extract thinking content up to it
thinking += this.buffer.substring(0, closeIndex);
this.buffer = this.buffer.substring(
closeIndex + this.closeTag.length,
);
this.inThinkingBlock = false;
} else {
// No closing tag yet - check if we might have a partial closing tag at the end
const partialMatchLength = this.getPartialMatchLength(
this.buffer,
this.closeTag,
);
if (partialMatchLength > 0) {
// Keep the potential partial match in the buffer
thinking += this.buffer.substring(
0,
this.buffer.length - partialMatchLength,
);
this.buffer = this.buffer.substring(
this.buffer.length - partialMatchLength,
);
} else {
// No partial match - all content is thinking
thinking += this.buffer;
this.buffer = "";
}
break;
}
} else {
// Not in thinking block - look for opening tag
const openIndex = this.buffer.indexOf(this.openTag);
if (openIndex !== -1) {
// Found opening tag
content += this.buffer.substring(0, openIndex);
this.buffer = this.buffer.substring(openIndex + this.openTag.length);
this.inThinkingBlock = true;
} else {
// No opening tag - check if we might have a partial opening tag at the end
const partialMatchLength = this.getPartialMatchLength(
this.buffer,
this.openTag,
);
if (partialMatchLength > 0) {
// Keep the potential partial match in the buffer
content += this.buffer.substring(
0,
this.buffer.length - partialMatchLength,
);
this.buffer = this.buffer.substring(
this.buffer.length - partialMatchLength,
);
} else {
// No partial match - all content is regular content
content += this.buffer;
this.buffer = "";
}
break;
}
}
}

return { thinking, content };
}

/**
* Flush any remaining content in the buffer.
* Call this when the stream ends.
*/
flush(): {
thinking: string;
content: string;
} {
const result = {
thinking: this.inThinkingBlock ? this.buffer : "",
content: this.inThinkingBlock ? "" : this.buffer,
};
this.buffer = "";
this.inThinkingBlock = false;
return result;
}

/**
* Check if the end of the text could be the start of the tag.
* Returns the length of the partial match, or 0 if no match.
*/
private getPartialMatchLength(text: string, tag: string): number {
// Scan from the longest candidate down to the shortest so that the longest
// suffix of `text` that is a prefix of `tag` wins. Scanning shortest-first
// could return a shorter match and prematurely release characters that
// still belong to a longer partial tag when the tag's first character
// recurs inside the tag.
for (let i = Math.min(tag.length - 1, text.length); i >= 1; i--) {
if (text.slice(-i) === tag.slice(0, i)) {
return i;
}
}
return 0;
}
}
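As a usage sketch (not part of this diff), the extractor can be exercised directly; the chunk boundaries below are hypothetical and deliberately split both tags across chunks:

```ts
import { ThinkingTagExtractor } from "./thinkingTagExtractor.js";

const extractor = new ThinkingTagExtractor("<think>", "</think>");

// "<thi" could be the start of "<think>", so it stays in the buffer.
console.log(extractor.process("Hello <thi"));
// -> { thinking: "", content: "Hello " }

// "nk>" completes the buffered opening tag; "</th" is held back as a
// possible partial closing tag.
console.log(extractor.process("nk>reasoning...</th"));
// -> { thinking: "reasoning...", content: "" }

// "ink>" completes the closing tag; the rest is regular content.
console.log(extractor.process("ink> final answer"));
// -> { thinking: "", content: " final answer" }

// Nothing is left over once the stream ends.
console.log(extractor.flush());
// -> { thinking: "", content: "" }
```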