guoqingbao · guoqingbao · Jan 31, 2026 · Jan 30, 2026 · Jan 31, 2026
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "vllm-rs"
-version = "0.8.10"
+version = "0.8.11"
 edition = "2021"
 default-run = "vllm-rs"
 
@@ -61,6 +61,8 @@ bytemuck = "1.24.0"
 regex = "1.12.2"
 local-ip-address = "0.6.5"
 url = "2.5.7"
+tool-parser = "1.0"
+openai-protocol = "1.0"
 
 [lib]
 name = "vllm_rs"

diff --git a/ReadMe-CN.md b/ReadMe-CN.md
@@ -77,6 +77,7 @@
 ## 📚 文档
 - [快速开始](docs/get_started.md)
 - [Docker构建](docs/docker.md)
+- [工具调用解析](docs/tool_parsing.md)
 - [MCP集成与工具调用](docs/mcp_tool_calling.md)
 - [Claude Code使用vLLM.rs后端](docs/claude_code.md)
 - [Goose AI Agent使用vLLM.rs后端](docs/goose.md)

diff --git a/ReadMe.md b/ReadMe.md
@@ -78,6 +78,7 @@ All models support hardware FP8 KV-cache acceleration (requires SM90+ and disabl
 ## 📚 Guides
 - [Get Started](docs/get_started.md)
 - [Docker Build](docs/docker.md)
+- [Tool Parsing](docs/tool_parsing.md)
 - [MCP Integration and Tool Calling](docs/mcp_tool_calling.md)
 - [Work with Claude Code](docs/claude_code.md)
 - [Work with Goose AI Agent](docs/goose.md)

diff --git a/docs/tool_parsing.md b/docs/tool_parsing.md
@@ -0,0 +1,74 @@
+## Tool Call Parsing
+
+This project uses model-specific parsers to parse tool calls in both
+streaming and non-streaming responses. The goal is to keep parsing logic
+consistent across models while remaining robust to partial output and format
+differences.
+
+### Parser selection
+
+Parser selection follows this order:
+
+1. `--enforce-parser` (if provided and valid).
+2. Model-based heuristics (model type + model id).
+3. Fallback to `passthrough`.
+
+If you pass an invalid name to `--enforce-parser`, the server fails at startup
+with an error that lists the valid parser names.
+
+Available parser names:
+
+- passthrough
+- json
+- mistral
+- qwen
+- qwen_coder
+- pythonic
+- llama
+- deepseek
+- glm45_moe
+- glm47_moe
+- step3
+- kimik2
+- minimax_m2
+
+### Streaming parsing
+
+Streaming requests use incremental parsing logic. Each incoming token is appended to an internal buffer and fed into the parser. The parser emits streaming tool call fragments, which are accumulated into full tool calls.
+
+When an end marker is detected (token id or `</tool_call>` tag), the stream parser:
+
+1. Flushes any unstreamed arguments from the external parser.
+2. Builds tool calls from the accumulated fragments.
+3. If no tool calls were produced, falls back to `parse_complete_with_fallback`
+   on the buffered content.
+
+If parsing still fails, the buffered content is emitted as normal text so the
+client does not lose output.
+
+### Non-streaming parsing
+
+Non-streaming requests reuse the same stream parser and call
+`parse_complete_with_fallback`. This keeps parser selection and fallback logic
+identical between streaming and non-streaming paths.
+
+### Enforcing a parser
+
+CLI (Rust server):
+
+```
+--enforce-parser qwen_coder
+```
+
+Python server example (`server.py` or `vllm_rs.server`):
+
+```
+--enforce-parser qwen_coder
+```
+
+### Environment Variables
+
+- `VLLM_RS_STRICT_TOOL_CALL`:
+  - `1` or `true`: Strict validation. Invalid tool calls (calls that do not match the schema) are dropped, which prevents them from being sent to the client. The server logs a warning for dropped calls.
+  - `0` or `false` (default): Lenient validation. Invalid tool calls are kept and sent to the client, but a warning is logged by the server. This allows models to output "hallucinated" or malformed calls if desired.
+
diff --git a/example/server.py b/example/server.py
@@ -37,12 +37,39 @@ def parse_args():
     parser.add_argument("--mcp_config", type=str, default=None)
     parser.add_argument("--mcp_command", type=str, default=None)
     parser.add_argument("--mcp_args", type=str, default=None)
+    parser.add_argument("--enforce-parser", type=str, default=None)
     parser.add_argument("--pd-server-prefix-cache-ratio", type=float, default=None)
     parser.add_argument("--pd-client-prefix-cache-ratio", type=float, default=None)
 
     args = parser.parse_args()
     if args.pd_server and args.ui_server:
         raise ValueError("PD Server cannot run with UI Server enabled!")
+    if args.enforce_parser is not None:
+        enforce_parser = args.enforce_parser.strip()
+        if enforce_parser == "":
+            args.enforce_parser = None
+        else:
+            valid_parsers = {
+                "passthrough",
+                "json",
+                "mistral",
+                "qwen",
+                "qwen_coder",
+                "pythonic",
+                "llama",
+                "deepseek",
+                "glm45_moe",
+                "glm47_moe",
+                "step3",
+                "kimik2",
+                "minimax_m2",
+            }
+            if enforce_parser not in valid_parsers:
+                valid_list = ", ".join(sorted(valid_parsers))
+                raise ValueError(
+                    f"Invalid --enforce-parser '{enforce_parser}'. Valid parsers: {valid_list}"
+                )
+            args.enforce_parser = enforce_parser
     return args
 
 def run_server(args):
@@ -73,6 +100,7 @@ def run_server(args):
         model_id=args.m,
         weight_path=args.w,
         weight_file=args.f,
+        enforce_parser=args.enforce_parser,
         max_num_seqs=max_num_seqs,
         max_model_len=args.max_model_len,
         max_tokens=args.max_tokens,

diff --git a/src/api.rs b/src/api.rs
@@ -124,6 +124,7 @@ impl EngineBuilder {
             None,
             None,
             None,
+            None,
             self.isq,
             Some(self.device_ids.clone().unwrap_or(vec![0]).len()),
             self.device_ids.clone(),

diff --git a/src/core/runner.rs b/src/core/runner.rs
@@ -576,7 +576,7 @@ impl ModelRunner {
 
                 // Log thinking parameter only from first rank to avoid duplicate logs in multi-GPU
                 if self.is_first_rank && seqs[0].num_cached_tokens == 0 {
-                    crate::log_warn!(
+                    crate::log_info!(
                         "User's thinking preference for reasoning models: {:?}",
                         user_params.thinking
                     );

diff --git a/src/core/scheduler.rs b/src/core/scheduler.rs
@@ -566,7 +566,7 @@ impl Scheduler {
                     let mut seq = seq.clone();
                     seq.num_cached_tokens += CHUNK_SIZE; //current prefilled CHUNK_SIZE
                     seq.status = SequenceStatus::Waiting;
-                    crate::log_warn!(
+                    crate::log_info!(
                         "Seq {} - chunk prefilled {} (remain {} tokens)",
                         seq.id,
                         seq.num_cached_tokens,

diff --git a/src/main.rs b/src/main.rs
@@ -4,6 +4,7 @@ use colored::Colorize;
 use reedline::{DefaultPrompt, DefaultPromptSegment, Reedline, Signal};
 use serde_json;
 use std::sync::Arc;
+use tool_parser::ParserFactory;
 use vllm_rs::core::engine::StreamItem;
 use vllm_rs::core::engine::GLOBAL_RT;
 use vllm_rs::core::{engine::LLMEngine, GenerationOutput};
@@ -27,6 +28,17 @@ async fn main() -> Result<()> {
         candle_core::bail!("Must provide model_id or weight_path or weight_file!");
     }
 
+    if let Some(ref enforced) = args.enforce_parser {
+        let parsers = ParserFactory::new().list_parsers();
+        if !parsers.contains(enforced) {
+            candle_core::bail!(
+                "Invalid enforce-parser '{}'. Valid parsers: {}",
+                enforced,
+                parsers.join(", ")
+            );
+        }
+    }
+
     let dtype = get_dtype(args.dtype);
 
     let (max_num_seqs, interactive) = if args.batch.is_some() {
@@ -175,6 +187,7 @@ async fn main() -> Result<()> {
         args.weight_file,
         args.hf_token,
         args.hf_token_path,
+        args.enforce_parser.clone(),
         Some(std::cmp::max(max_num_seqs, prompts.len())),
         None,
         max_model_len,

diff --git a/src/mcp/manager.rs b/src/mcp/manager.rs
@@ -392,9 +392,9 @@ fn map_mcp_tools(
             );
             Tool {
                 tool_type: "function".to_string(),
-                function: crate::tools::FunctionDefinition {
+                function: crate::tools::Function {
                     name: prefixed_name,
-                    description: tool.description.unwrap_or_default(),
+                    description: tool.description,
                     parameters: tool.input_schema,
                     strict: None,
                 },

diff --git a/src/py/mod.rs b/src/py/mod.rs
@@ -259,7 +259,7 @@ impl Message {
 impl EngineConfig {
     #[new]
     #[pyo3(signature = (model_id=None, weight_path=None, weight_file=None,
-        hf_token=None, hf_token_path=None,
+        hf_token=None, hf_token_path=None, enforce_parser=None,
         max_num_seqs=Some(32), config_model_len=None, max_model_len=Some(1024), max_tokens=None,
         isq=None, num_shards=Some(1), device_ids=None,
         generation_cfg=None, seed=None, prefix_cache=None, prefix_cache_max_tokens=None,
@@ -273,6 +273,7 @@ impl EngineConfig {
         weight_file: Option<String>,
         hf_token: Option<String>,
         hf_token_path: Option<String>,
+        enforce_parser: Option<String>,
         max_num_seqs: Option<usize>,
         config_model_len: Option<usize>,
         max_model_len: Option<usize>,
@@ -321,6 +322,7 @@ impl EngineConfig {
             weight_file,
             hf_token,
             hf_token_path,
+            enforce_parser,
             num_blocks: 128, //placeholder
             kv_fraction,
             cpu_mem_fold,
-Original file line number
+Diff line change
@@ Expand Up / @@ -124,6 +124,7 @@ impl EngineBuilder { @@
                 None,
                 None,
                 None,
+                None,
                 self.isq,
                 Some(self.device_ids.clone().unwrap_or(vec![0]).len()),
                 self.device_ids.clone(),
@@ Expand Down @@