guoqingbao · sempervictus · Mar 2, 2026 · Mar 8, 2026 · Mar 7, 2026 · Mar 9, 2026
diff --git a/Cargo.toml b/Cargo.toml
@@ -11,8 +11,8 @@ categories = ["algorithms", "hardware-support", "science"]
 license = "MIT"
 
 [dependencies]
-candle-core = { git = "https://github.com/guoqingbao/candle.git", version = "0.8.3", rev = "1e9d1a9" }
-candle-nn = { git = "https://github.com/guoqingbao/candle.git", version = "0.8.3", rev = "1e9d1a9" }
+candle-core = { git = "https://github.com/guoqingbao/candle.git", version = "0.8.3", rev = "157b048" }
+candle-nn = { git = "https://github.com/guoqingbao/candle.git", version = "0.8.3", rev = "157b048" }
 serde = { version = "1.0.190", features = ["serde_derive"] }
 tokenizers = {version = "0.21.2", features = ["http"] }
 hf-hub = "0.4.1"
@@ -21,7 +21,8 @@ itertools = "0.13.0"
 akin = "0.4.0"
 indicatif = "0.17.11"
 serde_json = "1.0.108"
-llguidance = "0.6"
+llguidance = { version = "1.6", default-features = false, features = ["lark"] }
+toktrie_hf_tokenizers = "1.6"
 toktrie = "1.4"
 half = { version = "2.5.0", features = ["num-traits", "use-intrinsics", "rand_distr"] }
 tokio = { version = "1.38.0", features = ["sync"] }
@@ -35,6 +36,7 @@ interprocess = "2.2.2"
 serde-big-array = "0.5.1"
 bincode = { version = "1.3.1" }
 twox-hash = "2.1.1"
+rmp-serde = "1.3.1"
 rand = "0.9.0"
 rayon="1.10.0"
 clap = { version = "4.4.7", features = ["derive"] }
@@ -44,7 +46,7 @@ ahash = "0.8.11"
 reedline = "0.40.0"
 pyo3 = { version = "0.25.1", features = ["extension-module", "abi3-py38"], optional = true }
 parking_lot = "0.12.4"
-attention-rs = { git = "https://github.com/guoqingbao/attention.rs.git", version="0.4.1", rev = "af0b475" }
+attention-rs = { git = "https://github.com/guoqingbao/attention.rs.git", version="0.4.1", rev = "29e4beb" }
 once_cell = "1.21.3"
 tqdm = "0.8.0"
 futures = "0.3.31"
@@ -60,7 +62,7 @@ utoipa = { version = "4.2", features = ["axum_extras"] }
 colored = { version = "3.0.0" }
 tower-http = { version = "0.6.6", features = ["cors"] }
 rustchatui = { git = "https://github.com/guoqingbao/rustchatui.git", rev = "68caad9" }
-sysinfo = "0.37.2"
+sysinfo = "0.38.3"
 image = { version = "0.25.6", default-features = false, features = ['bmp', 'gif', 'jpeg', 'png', 'tiff', 'webp'] }
 reqwest = { version = "0.12.24", features = ["blocking", "json", "rustls-tls"]}
 bytemuck = "1.24.0"
@@ -89,3 +91,7 @@ python = ["pyo3"]
 [[bin]]
 name = "runner"
 path = "src/runner/runner.rs"
+
+[[bin]]
+name = "special-tokens-extraction"
+path = "example/special-tokens-extraction/src/main.rs"
diff --git a/ReadMe-CN.md b/ReadMe-CN.md
@@ -319,8 +319,30 @@ cargo install --features metal
   </details>
 
 ---
-## 🔌 MCP集成 (工具调用)
 
+## 🔌 LLGuidance 支持（结构化输出与约束）
+
+vLLM.rs 现在支持通过 llguidance 库实现结构化输出和约束生成：
+
+- **工具调用优化**：使用 `--enable-tool-grammar` 启用工具调用语法，强制模型输出符合工具参数schema的JSON结构
+- **自定义约束**：使用 `--allow-constraint-api` 允许客户端通过 structured_outputs 或 response_format 提交 Lark/Regex/JSON Schema 约束
+- **正则表达式约束**：强制输出符合特定格式（如电话号码、日期等）
+- **JSON Schema 约束**：通过 OpenAI 兼容的 response_format 或 structured_outputs 提交自定义约束
+
+**使用示例：**
+```bash
+# 启用工具调用语法（自动从 MCP 工具构建 LLG 语法）
+vllm-rs --m Qwen/Qwen3-30B-A3B-Instruct --enable-tool-grammar --ui-server
+
+# 启用客户端约束API（允许OpenAI风格的structured_outputs/response_format）
+vllm-rs --m Qwen/Qwen3-30B-A3B-Instruct --allow-constraint-api --ui-server
+```
+
+查看 [**结构化输出文档 →**](docs/llguidance-integration.md)
+
+---
+
+## 🔌 MCP集成 (工具调用)
 通过Model Context Protocol让LLM调用外部工具。
 
 ```bash
@@ -455,6 +477,8 @@ pip install target/wheels/vllm_rs-*-cp38-abi3-*.whl --force-reinstall
 | `--kv-fraction`       |  用于控制KVCache使用量 (模型加载后剩余可用GPU显存的百分比) |
 | `--prefix-cache`   | 启用前缀缓存，用于多轮对话 |
 | `--prefix-cache-max-tokens`   | 限制前缀缓存大小（按 block size 向下取整） |
+| `--allow-constraint-api`      | 允许通过HTTP API提交客户端约束（默认：false） |
+| `--enable-tool-grammar`       | 自动从工具构建LLG语法（默认：false） |
 
 ### MCP配置参数
 

diff --git a/ReadMe.md b/ReadMe.md
@@ -95,9 +95,9 @@ All models support hardware FP8 KV-cache acceleration (requires SM90+ and disabl
 ## 📘 Usage in Python
 
 ### 📦 Install with pip
-- 💡 **CUDA compute capability < 8.0** (e.g., V100) requires a **manual build**  
+- 💡 **CUDA compute capability < 8.0** (e.g., V100) requires a **manual build**
   (no `flash-attn` support; alternatively use **Rust mode**).
-- 💡 The **prebuilt wheel** is built with the `flash-context` feature enabled.  
+- 💡 The **prebuilt wheel** is built with the `flash-context` feature enabled.
   To use **FP8 KV Cache**, you must **build manually** (remove the `flash-context` build flag).
 
 
@@ -284,7 +284,7 @@ Use `--i` to enable interactive mode 🤖, `--ui-server` or `--server` to enable
   # Metal/MacOS
   vllm-rs --m Qwen/Qwen3-4B-GGUF --f Qwen3-4B-Q4_K_M.gguf --ui-server --prefix-cache
   ```
-  
+
   <details open>
     <summary>Multi-GPU + Unquantized Model</summary>
 
@@ -332,6 +332,28 @@ vllm-rs --m Qwen/Qwen3-4B-Instruct-2507-FP8 --ui-server --prefix-cache
 
 ---
 
+## 🔌 LLGuidance Support (Structured Outputs & Constraints)
+
+vLLM.rs now supports structured output and constraint-based generation via llguidance:
+
+- **Tool Call Optimization**: Use `--enable-tool-grammar` to auto-build LLG grammar from tools, forcing the model to output JSON matching tool parameter schemas
+- **Custom Constraints**: Use `--allow-constraint-api` to allow clients to submit Lark/Regex/JSON Schema constraints via OpenAI-compatible structured_outputs/response_format
+- **Regex Constraints**: Enforce output formats like phone numbers (`^number\s\d{3}-\d{3}-\d{4}$`)
+- **JSON Schema Constraints**: Enforce structured output via response_format or structured_outputs
+
+**Usage Examples:**
+```bash
+# Enable tool grammar (auto-builds LLG grammar from MCP tools)
+vllm-rs --m Qwen/Qwen3-30B-A3B-Instruct --enable-tool-grammar --ui-server
+
+# Enable client constraints API (accepts structured_outputs/response_format)
+vllm-rs --m Qwen/Qwen3-30B-A3B-Instruct --allow-constraint-api --ui-server
+```
+
+See [**Structured Outputs Documentation →**](docs/llguidance-integration.md)
+
+---
+
 ## 🔌 MCP Integration (Tool Calling)
 
 Enable LLMs to call external tools via Model Context Protocol.
@@ -434,7 +456,7 @@ PD Disaggregation separates prefill (prompt processing) and decode (token genera
 
 ## 📽️ Demo Video
 
-Watch it in action 🎉 
+Watch it in action 🎉
 
 <video src="https://github.com/user-attachments/assets/7fc6aa0b-78ac-4323-923f-d761dd12857f" width="1000px"></video>
 
@@ -462,19 +484,19 @@ pip install maturin[patchelf]  # For Linux/Windows
 2. **Build the Python package**
 
 ```bash
-# Naive CUDA (single GPU only) 
+# Naive CUDA (single GPU only)
 maturin build --release --features cuda,python
 
 # Naive CUDA (+CUDA Graph, experimental)
 ./build.sh --release --features cuda,graph,python
 
-# CUDA (with prefix-cache and FP8 KV Cache, no Flash Attention, compatible with V100) 
+# CUDA (with prefix-cache and FP8 KV Cache, no Flash Attention, compatible with V100)
 ./build.sh --release --features cuda,nccl,python
 
-# CUDA (+Flash Attention, only used in prefill stage) 
+# CUDA (+Flash Attention, only used in prefill stage)
 ./build.sh --release --features cuda,nccl,flash-attn,python
 
-# CUDA (+cutlass (sm90+), +Flash Attention for decoding, +high prefill throughput, long time to build) 
+# CUDA (+cutlass (sm90+), +Flash Attention for decoding, +high prefill throughput, long time to build)
 ./build.sh --release --features cuda,nccl,flash-attn,flash-context,cutlass,python
 
 # macOS (Metal, single GPU only, with prefix-cache and FP8 kvcache)
@@ -518,6 +540,8 @@ pip install target/wheels/vllm_rs-*-cp38-abi3-*.whl --force-reinstall
 | `--kv-fraction`       |  control kvcache usage (percentage of remaining gpu memory after model loading) |
 | `--prefix-cache`   | Enable prefix caching for multi-turn conversations |
 | `--prefix-cache-max-tokens`   | Cap prefix cache size in tokens (rounded down to block size) |
+| `--allow-constraint-api`      | Allow client-submitted constraints via HTTP API (default: false) |
+| `--enable-tool-grammar`       | Automatically build LLG grammar from tools (default: false) |
 
 ### MCP Configuration
 
@@ -563,7 +587,7 @@ pip install target/wheels/vllm_rs-*-cp38-abi3-*.whl --force-reinstall
 * [x] **Claude/Anthropic-compatible API Server**
 * [x] **Support CUDA 13**
 * [x] **Support FlashInfer backend**
-* [ ] TentorRT-LLM 
+* [ ] TensorRT-LLM
 ---
 
 ## 📚 References

diff --git a/docs/goose.md b/docs/goose.md
@@ -17,35 +17,33 @@ python3 -m vllm_rs.server --m Qwen/Qwen3-30B-A3B-Instruct-2507 --d 0,1 --server
 
 ## 2) Configure Goose
 
-### Download and install Goose: https://block.github.io/goose/docs/getting-started/installation/
-
 ```shell
 # For non-UI system,
 export GOOSE_DISABLE_KEYRING=1
 ```
-
 Export empty API KEY
 
 ```shell
 export VLLM_API_KEY="empty"
 ```
 
+### Download and install Goose: https://block.github.io/goose/docs/getting-started/installation/
 
 ### Configure goose with `Custom Providers` and API key `empty`
 
 ```shell
 goose configure
 
-┌   goose-configure 
+┌   goose-configure
 │
 ◇  What would you like to configure?
-│  Custom Providers 
+│  Custom Providers
 │
 ◇  What would you like to do?
-│  Add A Custom Provider 
+│  Add A Custom Provider
 │
 ◇  What type of API is this?
-│  OpenAI Compatible 
+│  OpenAI Compatible
 │
 ◇  What should we call this provider?
 │  vllm-rs
@@ -60,10 +58,10 @@ goose configure
 │  default
 │
 ◇  Does this provider support streaming responses?
-│  Yes 
+│  Yes
 │
 ◇  Does this provider require custom headers?
-│  No 
+│  No
 │
 └  Custom provider added: vllm-rs
 └  Configuration saved successfully to /root/.config/goose/config.yaml