1 change: 0 additions & 1 deletion core/llm/index.ts
@@ -996,7 +996,6 @@ export abstract class BaseLLM implements ILLM {
return completionOptions;
}

// Update the processChatChunk method:
private processChatChunk(
chunk: ChatMessage,
interaction: ILLMInteractionLog | undefined,
143 changes: 141 additions & 2 deletions core/llm/llms/Vllm.ts
@@ -1,5 +1,12 @@
import { Chunk, LLMOptions } from "../../index.js";
import {
ChatMessage,
Chunk,
CompletionOptions,
LLMOptions,
} from "../../index.js";

import { LlmApiRequestType } from "../openaiTypeConverters.js";
import { ThinkingTagExtractor } from "../thinkingTagExtractor.js";
import OpenAI from "./OpenAI.js";

// vLLM-specific rerank response types
@@ -20,16 +27,148 @@ interface VllmRerankResponse {
results: VllmRerankItem[];
}

/**
* vLLM-specific options for thinking output extraction.
* These options allow configuring custom tags to extract thinking content from the response.
*/
export interface VllmOptions extends LLMOptions {
/**
* Custom opening tag for extracting thinking/reasoning content from streamed responses.
* Used with models that output thinking content wrapped in custom tags (e.g., `<think>`, `<reasoning>`).
* Must be used together with `thinkingCloseTag`.
*/
thinkingOpenTag?: string;
/**
* Custom closing tag for extracting thinking/reasoning content from streamed responses.
* Must be used together with `thinkingOpenTag`.
*/
thinkingCloseTag?: string;
}

/**
* vLLM provider for Continue.
*
* vLLM supports thinking/reasoning outputs in two ways:
* 1. Via the `reasoning_content` field in the response (vLLM's default when a reasoning parser is enabled)
* 2. Via custom tags in the response content (configurable)
*
* For custom thinking tag formats, you can configure `thinkingOpenTag` and `thinkingCloseTag`
* in the model options. For example:
*
* ```yaml
* models:
* - provider: vllm
* model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
* apiBase: http://localhost:8000
* thinkingOpenTag: "<think>"
* thinkingCloseTag: "</think>"
* ```
*
* See vLLM documentation for more details:
* https://docs.vllm.ai/en/latest/features/reasoning_outputs.html
*/
class Vllm extends OpenAI {
static providerName = "vllm";
constructor(options: LLMOptions) {

// vLLM-specific options for thinking tag extraction
private _thinkingOpenTag?: string;
private _thinkingCloseTag?: string;

// Override useOpenAIAdapterFor to NOT include "streamChat".
// vLLM uses the reasoning_content field for thinking output (via vLLM's reasoning parser),
// which is not part of the standard OpenAI SDK types. By excluding "streamChat", we force
// the use of the parent class's _streamChat method which uses streamSse for direct SSE
// parsing. This ensures proper handling of reasoning_content in streaming responses,
// as streamSse parses JSON directly and preserves all fields including non-standard ones.
protected override useOpenAIAdapterFor: (LlmApiRequestType | "*")[] = [
"chat",
"embed",
"list",
"rerank",
"streamFim",
];

constructor(options: VllmOptions) {
super(options);

// Validate that thinking tags are provided together
if (
(options.thinkingOpenTag && !options.thinkingCloseTag) ||
(!options.thinkingOpenTag && options.thinkingCloseTag)
) {
throw new Error(
"vLLM: Both thinkingOpenTag and thinkingCloseTag must be provided together",
);
}

// Store vLLM-specific options
this._thinkingOpenTag = options.thinkingOpenTag;
this._thinkingCloseTag = options.thinkingCloseTag;

if (options.isFromAutoDetect) {
this._setupCompletionOptions();
}
}

/**
* Override _streamChat to handle thinking tag extraction if configured.
* This allows vLLM to support models that use custom tags (like <think>...</think>)
* instead of the standard reasoning_content field.
*/
protected async *_streamChat(
messages: ChatMessage[],
signal: AbortSignal,
options: CompletionOptions,
): AsyncGenerator<ChatMessage> {
// If no custom thinking tags configured, use parent implementation
if (!this._thinkingOpenTag || !this._thinkingCloseTag) {
for await (const chunk of super._streamChat(messages, signal, options)) {
yield chunk;
}
return;
}

// Use thinking tag extractor for custom tag formats
const extractor = new ThinkingTagExtractor(
this._thinkingOpenTag,
this._thinkingCloseTag,
);

for await (const chunk of super._streamChat(messages, signal, options)) {
if (chunk.role === "assistant" && typeof chunk.content === "string") {
const extracted = extractor.process(chunk.content);

// Yield thinking content first
if (extracted.thinking) {
yield {
role: "thinking",
content: extracted.thinking,
};
}

// Yield regular content if present
if (extracted.content) {
yield {
...chunk,
content: extracted.content,
};
}
} else {
// Pass through non-assistant chunks unchanged
yield chunk;
}
}

// Flush any remaining content from the extractor
const flushed = extractor.flush();
if (flushed.thinking) {
yield { role: "thinking", content: flushed.thinking };
}
if (flushed.content) {
yield { role: "assistant", content: flushed.content };
}
}

supportsFim(): boolean {
return false;
}
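For illustration, a minimal sketch (not part of this diff) of the chunk-level behavior the override above is expected to produce when custom tags are configured; the chunk contents and boundaries are hypothetical:

```ts
// Hypothetical SSE chunks from a model that wraps its reasoning in <think>
// tags, with thinkingOpenTag: "<think>" and thinkingCloseTag: "</think>":
//
//   in:  { role: "assistant", content: "<think>compare the two " }
//   in:  { role: "assistant", content: "options</think>Use a heap." }
//
// Chunks expected to be re-emitted by the overridden _streamChat:
//
//   out: { role: "thinking",  content: "compare the two " }
//   out: { role: "thinking",  content: "options" }
//   out: { role: "assistant", content: "Use a heap." }
```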
127 changes: 127 additions & 0 deletions core/llm/thinkingTagExtractor.ts
@@ -0,0 +1,127 @@
/**
* Helper class to extract thinking content from custom tags during streaming.
* This is used for providers like vLLM that support custom thinking output formats.
*/
export class ThinkingTagExtractor {
private buffer: string = "";
private inThinkingBlock: boolean = false;
private readonly openTag: string;
private readonly closeTag: string;

constructor(openTag: string, closeTag: string) {
this.openTag = openTag;
this.closeTag = closeTag;
}

/**
* Process a chunk of text and extract thinking/regular content.
* Returns an object with the thinking content and regular content that should be yielded.
*/
process(text: string): {
thinking: string;
content: string;
} {
this.buffer += text;

let thinking = "";
let content = "";

while (this.buffer.length > 0) {
if (this.inThinkingBlock) {
// Look for closing tag
const closeIndex = this.buffer.indexOf(this.closeTag);
if (closeIndex !== -1) {
// Found closing tag - extract thinking content up to it
thinking += this.buffer.substring(0, closeIndex);
this.buffer = this.buffer.substring(
closeIndex + this.closeTag.length,
);
this.inThinkingBlock = false;
} else {
// No closing tag yet - check if we might have a partial closing tag at the end
const partialMatchLength = this.getPartialMatchLength(
this.buffer,
this.closeTag,
);
if (partialMatchLength > 0) {
// Keep the potential partial match in the buffer
thinking += this.buffer.substring(
0,
this.buffer.length - partialMatchLength,
);
this.buffer = this.buffer.substring(
this.buffer.length - partialMatchLength,
);
} else {
// No partial match - all content is thinking
thinking += this.buffer;
this.buffer = "";
}
break;
}
} else {
// Not in thinking block - look for opening tag
const openIndex = this.buffer.indexOf(this.openTag);
if (openIndex !== -1) {
// Found opening tag
content += this.buffer.substring(0, openIndex);
this.buffer = this.buffer.substring(openIndex + this.openTag.length);
this.inThinkingBlock = true;
} else {
// No opening tag - check if we might have a partial opening tag at the end
const partialMatchLength = this.getPartialMatchLength(
this.buffer,
this.openTag,
);
if (partialMatchLength > 0) {
// Keep the potential partial match in the buffer
content += this.buffer.substring(
0,
this.buffer.length - partialMatchLength,
);
this.buffer = this.buffer.substring(
this.buffer.length - partialMatchLength,
);
} else {
// No partial match - all content is regular content
content += this.buffer;
this.buffer = "";
}
break;
}
}
}

return { thinking, content };
}

/**
* Flush any remaining content in the buffer.
* Call this when the stream ends.
*/
flush(): {
thinking: string;
content: string;
} {
const result = {
thinking: this.inThinkingBlock ? this.buffer : "",
content: this.inThinkingBlock ? "" : this.buffer,
};
this.buffer = "";
this.inThinkingBlock = false;
return result;
}

/**
* Check if the end of the text could be the start of the tag.
* Returns the length of the partial match, or 0 if no match.
*/
private getPartialMatchLength(text: string, tag: string): number {
// Scan from the longest candidate down to the shortest so that the longest
// suffix of `text` that is a prefix of `tag` wins. Scanning shortest-first
// could return a shorter match and prematurely release characters that
// still belong to a longer partial tag when the tag's first character
// recurs inside the tag.
for (let i = Math.min(tag.length - 1, text.length); i >= 1; i--) {
if (text.slice(-i) === tag.slice(0, i)) {
return i;
}
}
return 0;
}
}
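As a usage sketch (not part of this diff), the extractor can be exercised directly; the chunk boundaries below are hypothetical and deliberately split both tags across chunks:

```ts
import { ThinkingTagExtractor } from "./thinkingTagExtractor.js";

const extractor = new ThinkingTagExtractor("<think>", "</think>");

// "<thi" could be the start of "<think>", so it stays in the buffer.
console.log(extractor.process("Hello <thi"));
// -> { thinking: "", content: "Hello " }

// "nk>" completes the buffered opening tag; "</th" is held back as a
// possible partial closing tag.
console.log(extractor.process("nk>reasoning...</th"));
// -> { thinking: "reasoning...", content: "" }

// "ink>" completes the closing tag; the rest is regular content.
console.log(extractor.process("ink> final answer"));
// -> { thinking: "", content: " final answer" }

// Nothing is left over once the stream ends.
console.log(extractor.flush());
// -> { thinking: "", content: "" }
```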