guoqingbao · guoqingbao · Mar 2, 2026 · Mar 8, 2026 · Mar 7, 2026 · Mar 9, 2026
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "vllm-rs"
-version = "0.9.9"
+version = "0.9.10"
 edition = "2021"
 default-run = "vllm-rs"
 description = "A minimal, high-performance large language model (LLM) inference engine implementing vLLM in Rust."
@@ -21,7 +21,8 @@ itertools = "0.13.0"
 akin = "0.4.0"
 indicatif = "0.17.11"
 serde_json = "1.0.108"
-llguidance = "0.6"
+llguidance = { version = "1.6", default-features = false, features = ["lark"] }
+toktrie_hf_tokenizers = "1.6"
 toktrie = "1.4"
 half = { version = "2.5.0", features = ["num-traits", "use-intrinsics", "rand_distr"] }
 tokio = { version = "1.38.0", features = ["sync"] }

diff --git a/ReadMe-CN.md b/ReadMe-CN.md
@@ -82,6 +82,7 @@
 - [Docker构建](docs/docker.md)
 - [工具调用解析](docs/tool_parsing.md)
 - [MCP集成与工具调用](docs/mcp_tool_calling.md)
+- [结构化输出](docs/llguidance-integration.md)
 - [Claude Code使用vLLM.rs后端](docs/claude_code.md)
 - [OpenCode使用vLLM.rs后端](docs/open_code.md)
 - [Goose AI Agent使用vLLM.rs后端](docs/goose.md)
@@ -307,8 +308,18 @@ cargo install --features metal
   </details>
 
 ---
-## 🔌 MCP集成 (工具调用)
 
+## 🔌 LLGuidance 支持（结构化输出与约束）
+
+vLLM.rs 现在支持通过 llguidance 库实现结构化输出和约束生成：
+
+- **自定义约束**：允许客户端通过 structured_outputs 或 response_format 提交 Lark/Regex/JSON Schema 约束
+
+查看 [**结构化输出文档 →**](docs/llguidance-integration.md)
+
+---
+
+## 🔌 MCP集成 (工具调用)
 通过Model Context Protocol让LLM调用外部工具。
 
 ```bash

diff --git a/ReadMe.md b/ReadMe.md
@@ -83,6 +83,7 @@ All models support hardware FP8 KV-cache acceleration (requires SM90+ and disabl
 - [Docker Build](docs/docker.md)
 - [Tool Parsing](docs/tool_parsing.md)
 - [MCP Integration and Tool Calling](docs/mcp_tool_calling.md)
+- [Structured Outputs](docs/llguidance-integration.md)
 - [Work with Claude Code](docs/claude_code.md)
 - [Work with OpenCode](docs/opencode.md)
 - [Embedding](docs/embeddings.md)
@@ -275,7 +276,7 @@ Use `--i` to enable interactive mode 🤖, `--ui-server` or `--server` to enable
   # Metal/MacOS
   vllm-rs --m Qwen/Qwen3-4B-GGUF --f Qwen3-4B-Q4_K_M.gguf --ui-server --prefix-cache
   ```
-  
+
   <details open>
     <summary>Multi-GPU + Unquantized Model</summary>
 
@@ -323,6 +324,15 @@ vllm-rs --m Qwen/Qwen3.5-4B-FP8 --ui-server --prefix-cache
 
 ---
 
+## 🔌 Guided Decoding (Structured Outputs & Constraints)
+vLLM.rs now supports structured output and constraint-based generation via llguidance:
+
+- **Custom Constraints**: Lets clients submit Lark/Regex/JSON Schema constraints via the OpenAI-compatible `structured_outputs` or `response_format` fields
+
+See [**Structured Outputs Documentation →**](docs/llguidance-integration.md)
+
+---
+
 ## 🔌 MCP Integration (Tool Calling)
 
 Enable LLMs to call external tools via Model Context Protocol.
@@ -425,7 +435,7 @@ PD Disaggregation separates prefill (prompt processing) and decode (token genera
 
 ## 📽️ Demo Video
 
-Watch it in action 🎉 
+Watch it in action 🎉
 
 <video src="https://github.com/user-attachments/assets/7fc6aa0b-78ac-4323-923f-d761dd12857f" width="1000px"></video>
 

diff --git a/docs/goose.md b/docs/goose.md
@@ -17,35 +17,33 @@ python3 -m vllm_rs.server --m Qwen/Qwen3-30B-A3B-Instruct-2507 --d 0,1 --server
 
 ## 2) Configure Goose
 
-### Download and install Goose: https://block.github.io/goose/docs/getting-started/installation/
-
 ```shell
 # For non-UI system,
 export GOOSE_DISABLE_KEYRING=1
 ```
-
 Export empty API KEY
 
 ```shell
 export VLLM_API_KEY="empty"
 ```
 
+### Download and install Goose: https://block.github.io/goose/docs/getting-started/installation/
 
 ### Configure goose with `Custom Providers` and API key `empty`
 
 ```shell
 goose configure
 
-┌   goose-configure 
+┌   goose-configure
 │
 ◇  What would you like to configure?
-│  Custom Providers 
+│  Custom Providers
 │
 ◇  What would you like to do?
-│  Add A Custom Provider 
+│  Add A Custom Provider
 │
 ◇  What type of API is this?
-│  OpenAI Compatible 
+│  OpenAI Compatible
 │
 ◇  What should we call this provider?
 │  vllm-rs
@@ -60,10 +58,10 @@ goose configure
 │  default
 │
 ◇  Does this provider support streaming responses?
-│  Yes 
+│  Yes
 │
 ◇  Does this provider require custom headers?
-│  No 
+│  No
 │
 └  Custom provider added: vllm-rs
 └  Configuration saved successfully to /root/.config/goose/config.yaml