Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "vllm-rs"
version = "0.8.10"
version = "0.8.11"
edition = "2021"
default-run = "vllm-rs"

Expand Down Expand Up @@ -61,6 +61,8 @@ bytemuck = "1.24.0"
regex = "1.12.2"
local-ip-address = "0.6.5"
url = "2.5.7"
tool-parser = "1.0"
openai-protocol = "1.0"

[lib]
name = "vllm_rs"
Expand Down
1 change: 1 addition & 0 deletions ReadMe-CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
## 📚 文档
- [快速开始](docs/get_started.md)
- [Docker构建](docs/docker.md)
- [工具调用解析](docs/tool_parsing.md)
- [MCP集成与工具调用](docs/mcp_tool_calling.md)
- [Claude Code使用vLLM.rs后端](docs/claude_code.md)
- [Goose AI Agent使用vLLM.rs后端](docs/goose.md)
Expand Down
1 change: 1 addition & 0 deletions ReadMe.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ All models support hardware FP8 KV-cache acceleration (requires SM90+ and disabl
## 📚 Guides
- [Get Started](docs/get_started.md)
- [Docker Build](docs/docker.md)
- [Tool Parsing](docs/tool_parsing.md)
- [MCP Integration and Tool Calling](docs/mcp_tool_calling.md)
- [Work with Claude Code](docs/claude_code.md)
- [Work with Goose AI Agent](docs/goose.md)
Expand Down
74 changes: 74 additions & 0 deletions docs/tool_parsing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
## Tool Call Parsing

This project uses model-specific parsers to parse tool calls in both
streaming and non-streaming responses. The goal is to keep parsing logic
consistent across models while remaining robust to partial output and format
differences.

### Parser selection

Parser selection follows this order:

1. `--enforce-parser` (if provided and valid).
2. Model-based heuristics (model type + model id).
3. Fallback to `passthrough`.

If you pass an invalid name to `--enforce-parser`, the server fails at startup
with an error message that lists the valid parser names.

Available parser names:

- passthrough
- json
- mistral
- qwen
- qwen_coder
- pythonic
- llama
- deepseek
- glm45_moe
- glm47_moe
- step3
- kimik2
- minimax_m2

### Streaming parsing

Streaming requests use incremental parsing logic. Each incoming token is appended to an internal buffer and fed into the parser. The parser emits streaming tool call fragments, which are accumulated into full tool calls.

When an end marker is detected (token id or `</tool_call>` tag), the stream parser:

1. Flushes any unstreamed arguments from the external parser.
2. Builds tool calls from the accumulated fragments.
3. If no tool calls were produced, falls back to `parse_complete_with_fallback`
on the buffered content.

If parsing still fails, the buffered content is emitted as normal text so the
client does not lose output.

### Non-streaming parsing

Non-streaming requests reuse the same stream parser and call
`parse_complete_with_fallback`. This keeps parser selection and fallback logic
identical between streaming and non-streaming paths.

### Enforcing a parser

CLI (Rust server):

```
--enforce-parser qwen_coder
```

Python server example (`server.py` or `vllm_rs.server`):

```
--enforce-parser qwen_coder
```

### Environment Variables

- `VLLM_RS_STRICT_TOOL_CALL`:
  - `1` or `true`: Strict validation. Invalid tool calls (calls that do not match the schema) are dropped, so they are never sent to the client. The server logs a warning for each dropped call.
  - `0` or `false` (default): Lenient validation. Invalid tool calls are kept and sent to the client, but the server logs a warning. This lets "hallucinated" or malformed calls from the model reach the client if desired.

28 changes: 28 additions & 0 deletions example/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,39 @@ def parse_args():
parser.add_argument("--mcp_config", type=str, default=None)
parser.add_argument("--mcp_command", type=str, default=None)
parser.add_argument("--mcp_args", type=str, default=None)
parser.add_argument("--enforce-parser", type=str, default=None)
parser.add_argument("--pd-server-prefix-cache-ratio", type=float, default=None)
parser.add_argument("--pd-client-prefix-cache-ratio", type=float, default=None)

args = parser.parse_args()
if args.pd_server and args.ui_server:
raise ValueError("PD Server cannot run with UI Server enabled!")
if args.enforce_parser is not None:
enforce_parser = args.enforce_parser.strip()
if enforce_parser == "":
args.enforce_parser = None
else:
valid_parsers = {
"passthrough",
"json",
"mistral",
"qwen",
"qwen_coder",
"pythonic",
"llama",
"deepseek",
"glm45_moe",
"glm47_moe",
"step3",
"kimik2",
"minimax_m2",
}
if enforce_parser not in valid_parsers:
valid_list = ", ".join(sorted(valid_parsers))
raise ValueError(
f"Invalid --enforce-parser '{enforce_parser}'. Valid parsers: {valid_list}"
)
args.enforce_parser = enforce_parser
return args

def run_server(args):
Expand Down Expand Up @@ -73,6 +100,7 @@ def run_server(args):
model_id=args.m,
weight_path=args.w,
weight_file=args.f,
enforce_parser=args.enforce_parser,
max_num_seqs=max_num_seqs,
max_model_len=args.max_model_len,
max_tokens=args.max_tokens,
Expand Down
1 change: 1 addition & 0 deletions src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ impl EngineBuilder {
None,
None,
None,
None,
self.isq,
Some(self.device_ids.clone().unwrap_or(vec![0]).len()),
self.device_ids.clone(),
Expand Down
2 changes: 1 addition & 1 deletion src/core/runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,7 @@ impl ModelRunner {

// Log thinking parameter only from first rank to avoid duplicate logs in multi-GPU
if self.is_first_rank && seqs[0].num_cached_tokens == 0 {
crate::log_warn!(
crate::log_info!(
"User's thinking preference for reasoning models: {:?}",
user_params.thinking
);
Expand Down
2 changes: 1 addition & 1 deletion src/core/scheduler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ impl Scheduler {
let mut seq = seq.clone();
seq.num_cached_tokens += CHUNK_SIZE; //current prefilled CHUNK_SIZE
seq.status = SequenceStatus::Waiting;
crate::log_warn!(
crate::log_info!(
"Seq {} - chunk prefilled {} (remain {} tokens)",
seq.id,
seq.num_cached_tokens,
Expand Down
13 changes: 13 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use colored::Colorize;
use reedline::{DefaultPrompt, DefaultPromptSegment, Reedline, Signal};
use serde_json;
use std::sync::Arc;
use tool_parser::ParserFactory;
use vllm_rs::core::engine::StreamItem;
use vllm_rs::core::engine::GLOBAL_RT;
use vllm_rs::core::{engine::LLMEngine, GenerationOutput};
Expand All @@ -27,6 +28,17 @@ async fn main() -> Result<()> {
candle_core::bail!("Must provide model_id or weight_path or weight_file!");
}

if let Some(ref enforced) = args.enforce_parser {
let parsers = ParserFactory::new().list_parsers();
if !parsers.contains(enforced) {
candle_core::bail!(
"Invalid enforce-parser '{}'. Valid parsers: {}",
enforced,
parsers.join(", ")
);
}
}

let dtype = get_dtype(args.dtype);

let (max_num_seqs, interactive) = if args.batch.is_some() {
Expand Down Expand Up @@ -175,6 +187,7 @@ async fn main() -> Result<()> {
args.weight_file,
args.hf_token,
args.hf_token_path,
args.enforce_parser.clone(),
Some(std::cmp::max(max_num_seqs, prompts.len())),
None,
max_model_len,
Expand Down
4 changes: 2 additions & 2 deletions src/mcp/manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -392,9 +392,9 @@ fn map_mcp_tools(
);
Tool {
tool_type: "function".to_string(),
function: crate::tools::FunctionDefinition {
function: crate::tools::Function {
name: prefixed_name,
description: tool.description.unwrap_or_default(),
description: tool.description,
parameters: tool.input_schema,
strict: None,
},
Expand Down
4 changes: 3 additions & 1 deletion src/py/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ impl Message {
impl EngineConfig {
#[new]
#[pyo3(signature = (model_id=None, weight_path=None, weight_file=None,
hf_token=None, hf_token_path=None,
hf_token=None, hf_token_path=None, enforce_parser=None,
max_num_seqs=Some(32), config_model_len=None, max_model_len=Some(1024), max_tokens=None,
isq=None, num_shards=Some(1), device_ids=None,
generation_cfg=None, seed=None, prefix_cache=None, prefix_cache_max_tokens=None,
Expand All @@ -273,6 +273,7 @@ impl EngineConfig {
weight_file: Option<String>,
hf_token: Option<String>,
hf_token_path: Option<String>,
enforce_parser: Option<String>,
max_num_seqs: Option<usize>,
config_model_len: Option<usize>,
max_model_len: Option<usize>,
Expand Down Expand Up @@ -321,6 +322,7 @@ impl EngineConfig {
weight_file,
hf_token,
hf_token_path,
enforce_parser,
num_blocks: 128, //placeholder
kv_fraction,
cpu_mem_fold,
Expand Down
Loading