diff --git a/Cargo.toml b/Cargo.toml
index 6a5c76cc..238fb8e1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,8 +22,8 @@ itertools = "0.13.0"
akin = "0.4.0"
indicatif = "0.17.11"
serde_json = "1.0.108"
-llguidance = { version = "1.6", default-features = false, features = ["lark"] }
-toktrie_hf_tokenizers = "1.6"
+llguidance = { version = "1.7", default-features = false, features = ["lark", "referencing", "jsonschema_validation"] }
+toktrie_hf_tokenizers = "1.7"
toktrie = "1.4"
half = { version = "2.5.0", features = ["num-traits", "use-intrinsics", "rand_distr"] }
tokio = { version = "1.38.0", features = ["sync"] }
diff --git a/ReadMe-CN.md b/ReadMe-CN.md
index 1ef3356e..a83943d0 100644
--- a/ReadMe-CN.md
+++ b/ReadMe-CN.md
@@ -177,6 +177,147 @@ xinfer --m unsloth/Qwen3.5-4B-GGUF --f Qwen3.5-4B-Q4_K_M.gguf
## 📘 使用方法
> **Python包安装后**请使用 `python3 -m xinfer.server` 方式运行
+### 安装
+
+
+CUDA(Linux)
+
+```bash
+# 前置依赖
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+sudo apt-get install -y git build-essential libssl-dev pkg-config
+
+# 可选:CUDA toolkit + NCCL
+sudo apt-get install -y cuda-nvcc-12-9 cuda-nvrtc-dev-12-9 libcublas-dev-12-9 libcurand-dev-12-9
+sudo apt-get install -y libnccl2 libnccl-dev
+
+# 编译安装
+cargo install --features cuda,nccl,flashinfer,cutlass
+# Flash Attention 后端:
+cargo install --features cuda,nccl,flashattn,cutlass
+# V100 / 较老硬件(无 flash 后端):
+cargo install --features cuda,nccl
+```
+
+
+
+
+Metal(macOS)
+
+```bash
+# 先安装 Xcode 命令行工具
+cargo install --features metal
+```
+
+
+
+默认启动 **API 服务模式**(端口 8000)。使用 `--i` 启用交互模式 🤖,`--ui-server` 启用带 Web UI 的服务模式 🌐,`--m` 指定Huggingface模型,或`--w` 指定本地Safetensors模型路径 或`--f` 指定GGUF模型文件:
+
+> 单卡/多卡推理
+
+ 单卡推理
+
+ ```bash
+ # CUDA (将 `--i`替换成 `--ui-server`则启用网页版本)
+ vllm-rs --i --m unsloth/Qwen3.5-27B-GGUF --f Qwen3.5-27B-Q4_K_M.gguf --kv-fraction 0.8
+ # Metal/MacOS (MacOS Tahoe之前的系统可能会存在生成过慢问题,使用更小的`--max-model-len` 或 `--kv-fraction`减少显存占用)
+ vllm-rs --i --m unsloth/Qwen3.5-4B-GGUF --f Qwen3.5-4B-Q3_K_M.gguf
+ ```
+
+
+
+ 多卡未量化模型
+
+ ```bash
+ vllm-rs --d 0,1 --w /path/Qwen3-30B-A3B-Instruct-2507 --ui-server --prefix-cache
+ ```
+
+
+
+ FP8/FP4模型
+
+ _FP8格式:_
+ ```bash
+ vllm-rs --d 0,1 --w /path/Qwen3-Coder-30B-A3B-Instruct-FP8/ --ui-server --prefix-cache
+ # Or Qwen3-Next 80B
+ vllm-rs --m Qwen/Qwen3-Coder-Next-FP8 --ui-server --d 0,1 --prefix-cache
+ ```
+
+ _MXFP4格式:_
+ ```bash
+ vllm-rs --m olka-fi/Qwen3.5-4B-MXFP4 --ui-server --prefix-cache
+ ```
+
+ _NVFP4格式:_
+ ```bash
+ vllm-rs --m AxionML/Qwen3.5-9B-NVFP4 --ui-server --prefix-cache
+ ```
+
+
+
+ 多卡量化模型
+
+ ```bash
+ vllm-rs --ui-server --d 0,1 --f /path/Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf --prefix-cache
+ ```
+
+
+
+ 未量化模型运行为Q4K量化模型,同时使用FP8 KVCache
+
+ ```bash
+ # 编译时去除`flashinfer` 或 `flashattn` 以使用fp8 kvcache
+ vllm-rs --d 0,1 --w /path/Qwen3-30B-A3B-Instruct-2507 --isq q4k --server --port 8000 --fp8-kvcache
+ ```
+
+
+---
+
+## 🔌 结构化输出与约束(Guided Decoding)
+
+vLLM.rs 现在支持通过 llguidance 库实现结构化输出和约束生成。
+
+### ⚠️ 安全说明
+
+**客户端提供的约束默认被阻止。**要启用它们,您必须显式设置 `--allow-constraint-api` 标志。
+
+#### 启用客户端约束
+```bash
+# 通过 HTTP API 启用客户端提交的约束
+vllm-rs --m Qwen/Qwen3.5-27B-FP8 --ui-server --prefix-cache --allow-constraint-api
+```
+
+#### 客户端约束的安全风险
+客户端提供的约束可能导致严重的安全漏洞:
+
+1. **Lark 语法注入**:恶意客户端可以提交精心设计的 Lark 语法,这些语法:
+ - 可以访问超出用户角色边界的特殊令牌
+ - 注入可能导致 ReDoS 攻击的任意正则表达式模式
+ - 绕过聊天模板的角色分离
+
+2. **JSON Schema 转义**:客户端可以指定:
+ - 引用系统不打算让用户控制的内部特殊令牌
+ - 创建模糊的令牌边界,导致系统指令泄露
+ - 注入匹配系统角色的禁止正则表达式模式
+
+3. **角色边界违规**:启用约束后,客户端可能:
+ - 逃逸聊天模板中的 `user:` 角色边界
+ - 注入 `system:` 或 `assistant:` 角色内容
+ - 操纵 tool_call 标记以注入伪造的工具响应
+ - 发明新的方法使设计不佳的系统超出预期范围运行
+
+#### 推荐用法
+- **生产环境**:仅当服务只面向可信的访问系统/客户端,或传入内容已经过可感知 tokenizer 并内联验证语法的 WAF 过滤时,才设置 `--enable-tool-grammar` 和/或 `--allow-constraint-api`。
+
+```bash
+# 启用自动工具语法生成
+vllm-rs --m Qwen/Qwen3.5-27B-FP8 --ui-server --prefix-cache --enable-tool-grammar
+```
+
+查看 [**结构化输出文档 →**](docs/llguidance-integration.md)
+
+---
+
> Docker 内构建请参考 [**在 Docker 中运行 xInfer →**](docs/docker.md)
### 运行模型
diff --git a/ReadMe.md b/ReadMe.md
index 171fcf7f..f4d43d7c 100644
--- a/ReadMe.md
+++ b/ReadMe.md
@@ -290,6 +290,48 @@ xinfer --m mistralai/Ministral-3-3B --ui-server
---
+## 🔌 Guided decoding (Structured Outputs & Constraints)
+vLLM.rs now supports structured output and constraint-based generation via llguidance.
+
+### ⚠️ Security Notice
+
+**Client-provided constraints are BLOCKED by default.** To enable them, you must explicitly set the `--allow-constraint-api` flag.
+
+#### Enabling Client Constraints
+```bash
+# Enable client-submitted constraints via HTTP API
+vllm-rs --m Qwen/Qwen3.5-27B-FP8 --ui-server --prefix-cache --allow-constraint-api
+```
+
+#### Security Risks of Client Constraints
+Client-provided constraints can introduce serious security vulnerabilities:
+
+1. **Lark Grammar Injection**: Malicious clients can submit crafted Lark grammars that:
+ - Access special tokens beyond the user role boundary
+ - Inject arbitrary regex patterns that could cause ReDoS attacks
+ - Bypass the chat template's role separation
+
+2. **JSON Schema Escapes**: Clients can specify schemas that:
+ - Reference internal special tokens not intended for user control
+ - Create ambiguous token boundaries that leak system instructions
+ - Inject forbidden regex patterns matching system roles
+
+3. **Role Boundary Violations**: When constraints are enabled, clients can potentially:
+ - Escape the `user:` role boundary in chat templates
+ - Inject `system:` or `assistant:` role content
+ - Manipulate tool_call markers to inject fake tool responses
+ - Invent new ways to make poorly designed systems behave beyond intended scope
+
+#### Recommended Usage
+- **Production**: Only set `--enable-tool-grammar` and/or `--allow-constraint-api` when the server is reached exclusively by trusted systems/clients, or when inbound content is filtered through a tokenizer-aware WAF that validates grammars inline.
+
+```bash
+# Enable automatic tool grammar generation
+vllm-rs --m Qwen/Qwen3.5-27B-FP8 --ui-server --prefix-cache --enable-tool-grammar
+```
+
+See [**Structured Outputs Documentation →**](docs/llguidance-integration.md)
+
## 📘 Build from source code
**Option 1 — Cargo**
diff --git a/docs/guided_decoding.md b/docs/guided_decoding.md
index d177bf8b..68f3fa5d 100644
--- a/docs/guided_decoding.md
+++ b/docs/guided_decoding.md
@@ -8,13 +8,23 @@ It focuses on:
- how reasoning effort is applied
- practical usage and validation commands
+## JSON Schema Reference
+
+For detailed JSON Schema constraint documentation with curl examples, see [`llguidance-json-schema.md`](llguidance-json-schema.md).
+
+This covers:
+- Schema type definitions (string, integer, number, boolean, object, array)
+- All supported API endpoints (OpenAI-compatible and Claude server)
+- Complete curl examples for each permutation
+- Schema sanitization behavior
+
## Current Model
Guided decoding is request-scoped.
The core engine does not invent grammars on its own. A request either:
- supplies a constraint grammar
-- gets that constraint grammar composed with a reasoning prefix
+- gets a composed grammar containing both a constraint grammar and a reasoning grammar
- or runs unconstrained when neither exists
The final grammar is stored in `SamplingParams.grammar` and consumed by the runner.
@@ -44,7 +54,7 @@ The server composes:
The result is a single `TopLevelGrammar` assigned to `SamplingParams.grammar`.
-If no client-supplied constraint grammar exists, `params.grammar` stays `None`.
+If no constraint grammar and no tool grammar exist, `params.grammar` stays `None`.
### 3. Sampling in runner
@@ -84,14 +94,17 @@ Legacy fields
- `constraint`
- `constraint_type = regex | lark | json_schema | json`
-If a request provides none of the above, guided decoding is not enabled.
+If a request provides none of the above, guided decoding is not enabled unless tool grammar synthesis adds one.
+
+The grammar composition logic is in `src/utils/guidance_grammar.rs`. The `GrammarRequestDispatcher` and `GrammarComposer` handle the composition of constraint grammars with reasoning grammars.
### Claude server
+Claude reuses the same tool-grammar builder path.
+
Current state:
- Claude does not expose the same client-supplied grammar request surface as the OpenAI endpoint
- Claude reasoning is still driven by explicit thinking behavior, not by `reasoning_effort` grammar composition
-- Claude requests therefore do not currently enable guided decoding
## Reasoning Effort
@@ -99,7 +112,7 @@ Reasoning effort is separate from ordinary structured outputs.
### Current state
-The OpenAI path maps `reasoning_effort` into grammar composition when a request constraint exists.
+The OpenAI path maps `reasoning_effort` into grammar composition.
Accepted values come from `ReasoningEffort::from_str`:
- `none`
@@ -115,7 +128,7 @@ Non-Python builds also support:
- `custom:`
Relevant code:
-- `src/utils/reasoning.rs`
+- `src/utils/guidance_grammar.rs`
- `src/server/server.rs`
- `src/utils/guidance.rs`
@@ -124,7 +137,6 @@ Relevant code:
Reasoning effort:
- does not enable chat-template thinking by itself
- only affects grammar composition
-- is ignored when no request constraint grammar is present
- only works when reasoning start/end tokens are available
If the tokenizer does not expose reasoning markers, the system logs a warning and falls back to the base grammar.
@@ -160,7 +172,7 @@ xinfer --m Qwen/Qwen3.5-35B-A3B-FP8/ --ui-server --d 0
| Enforce text pattern | `structured_outputs` or `constraint` | `regex` |
| Enforce full object schema | `structured_outputs` or `response_format` | `json` / `json_schema` |
| Enforce custom grammar | `structured_outputs` or `constraint` | `grammar` / `lark` |
-| Constrain tagged structured output payload | `structured_outputs` | `structural_tag` |
+| Constrain tool call payload | `structured_outputs` or automatic tool grammar | `structural_tag` / tool grammar |
| Add a reasoning prefix | `reasoning_effort` | `low`, `medium`, `high`, etc. |
### 1. Constrain the answer to a fixed set
@@ -470,5 +482,5 @@ Check:
- Guided decoding is only active when `SamplingParams.grammar` is present.
- OpenAI currently has the richest client-facing grammar surface.
-- Claude does not currently expose request-level guided decoding.
+- Claude currently reuses tool grammar, but not the same direct constraint request API.
- No request-level grammar means no guided decoding.
diff --git a/src/api.rs b/src/api.rs
index 49e6e047..2dc08e65 100644
--- a/src/api.rs
+++ b/src/api.rs
@@ -167,6 +167,8 @@ impl EngineBuilder {
false,
false,
None,
+ false,
+ false,
);
if let Some(kv_dtype) = self.kvcache_dtype {
diff --git a/src/core/engine.rs b/src/core/engine.rs
index 0008bf17..afbe9c76 100644
--- a/src/core/engine.rs
+++ b/src/core/engine.rs
@@ -21,6 +21,7 @@ use crate::transfer::Transfer;
use crate::utils::chat_template::Message;
use crate::utils::config::{EngineConfig, EosTokenId, ModelType, SamplingParams};
use crate::utils::guidance::{build_llg_factory, extract_guidance_tokens, GuidanceTokens};
+use crate::utils::guidance_grammar::{get_reasoning_token_strings, is_reasoning_grammar};
use crate::utils::heartbeat::heartbeat_worker;
use crate::utils::image::{get_image_config, ImageData, ImageProcessConfig};
use crate::utils::kvcache_allocator::KVCacheAllocator;
@@ -132,6 +133,13 @@ impl LLMEngine {
};
}
}
+ if config.bos_token_id.is_none() {
+ if let Some(bos) = &config_tokenizer.bos_token {
+ if let Some(token) = tokenizer.get_vocab(true).get(bos).copied() {
+ config.bos_token_id = Some(token as usize);
+ };
+ }
+ }
let guidance_tokens = extract_guidance_tokens(
&tokenizer,
config
@@ -139,6 +147,8 @@ impl LLMEngine {
.as_ref()
.map(EosTokenId::to_vec)
.unwrap_or_default(),
+ config.bos_token_id.map_or(Vec::new(), |bos| vec![bos as u32]),
+ &config_tokenizer,
);
assert!(
config.architectures.is_some() && config.architectures.as_ref().unwrap().len() == 1,
@@ -530,12 +540,17 @@ impl LLMEngine {
}
}
let mut params = params.clone();
- params.max_tokens = Some(
- params
- .max_tokens
- .unwrap_or(self.econfig.max_tokens.unwrap_or(16384)),
- );
- let mut max_tokens = params.max_tokens.unwrap();
+
+ let mut max_tokens = if let Some(max_t) = params.max_tokens {
+ max_t
+ } else {
+ params.max_tokens = Some(
+ params
+ .max_tokens
+ .unwrap_or(self.econfig.max_tokens.unwrap_or(16384)),
+ );
+ params.max_tokens.unwrap()
+ };
let requested_max_tokens = max_tokens;
let max_model_len = self.econfig.max_model_len.unwrap_or(max_tokens);
@@ -1239,8 +1254,15 @@ impl LLMEngine {
) -> (String, i32) {
// let mut collected_images = Vec::new();
let mut prompt_template = self.template.clone();
- prompt_template
- .set_enable_thinking(params.thinking.unwrap_or(!self.econfig.disable_reasoning));
+ if let Some(grammar) = &params.grammar {
+ if is_reasoning_grammar(&grammar) {
+ prompt_template.set_enable_thinking(true);
+ } else {
+ prompt_template.set_enable_thinking(false);
+ }
+ } else {
+ prompt_template.set_enable_thinking(params.thinking.unwrap_or(!self.econfig.disable_reasoning));
+ };
prompt_template.set_messages(messages);
let image_idx: i32 = 0;
let prompt_processed = prompt_template
@@ -1272,6 +1294,44 @@ impl LLMEngine {
prompt.replace("\n", "")
);
}
+ // Generation alignment and open/close parity enforcement
+ if let Some(grammar) = &params.grammar {
+ if self.guidance_tokens.add_bos_token {
+ // BOS-based trimming: trim at BOS token (for models with add_bos_token=true)
+ if let Ok(bos_string) = self.tokenizer.decode(&self.guidance_tokens.bos_token_ids, false) {
+ if let Some((prompt, _trimmed)) = prompt.rsplit_once(&bos_string) {
+ return (prompt.to_string(), image_idx)
+ }
+ }
+ for bos_token in self.guidance_tokens.bos_token_ids.iter() {
+ if let Ok(bos_string) = self.tokenizer.decode(&[bos_token.clone()], false) {
+ if let Some((prompt, _trimmed)) = prompt.rsplit_once(&bos_string) {
+ return (prompt.to_string(), image_idx)
+ }
+ }
+ }
+ } else {
+ // Reasoning tag-based trimming: check for reasoning start/end tokens
+ if let Some((start_str, end_str)) = get_reasoning_token_strings(&self.guidance_tokens, &self.tokenizer) {
+ if is_reasoning_grammar(&grammar) {
+ // Control entire reasoning block via guidance
+ if prompt.trim().ends_with(&start_str) || prompt.trim().ends_with(&end_str) {
+ if let Some((prompt, _trimmed)) = prompt.rsplit_once(&start_str) {
+ return (prompt.to_string(), image_idx)
+ }
+ }
+ } else {
+ // Ensure guided grammar which will not generate a think-stop token is not within reasoning envelope
+ // A completed inert \n\n block or even an injected think template are harmless
+ if prompt.trim().ends_with(&start_str) {
+ if let Some((prompt, _trimmed)) = prompt.rsplit_once(&start_str) {
+ return (prompt.to_string(), image_idx)
+ }
+ }
+ }
+ }
+ }
+ }
(prompt, image_idx)
}
@@ -1773,6 +1833,11 @@ impl LLMEngine {
pub fn get_chat_template(&self) -> ChatTemplate {
self.template.clone()
}
+
+ /// Returns an owned clone of `self.default_chat_template` (intended for grammar generation).
+ pub fn get_default_chat_template(&self) -> String {
+ self.default_chat_template.clone()
+ }
}
#[cfg(test)]
@@ -1783,9 +1848,13 @@ mod tests {
#[test]
fn trim_prompt_replay_prefix_accepts_single_reasoning_token() {
let guidance_tokens = GuidanceTokens {
+ bos_token_ids: Vec::new(),
eos_token_ids: Vec::new(),
reasoning_start_ids: vec![42, 99],
reasoning_end_ids: vec![100],
+ tool_call_start_ids: Vec::new(),
+ tool_call_end_ids: Vec::new(),
+ add_bos_token: false,
};
assert_eq!(
@@ -1797,9 +1866,13 @@ mod tests {
#[test]
fn trim_prompt_replay_prefix_accepts_multi_token_suffix_when_first_token_is_reasoning() {
let guidance_tokens = GuidanceTokens {
+ bos_token_ids: Vec::new(),
eos_token_ids: Vec::new(),
reasoning_start_ids: vec![42],
reasoning_end_ids: vec![100],
+ tool_call_start_ids: Vec::new(),
+ tool_call_end_ids: Vec::new(),
+ add_bos_token: false,
};
assert_eq!(
@@ -1811,9 +1884,13 @@ mod tests {
#[test]
fn trim_prompt_replay_prefix_trims_leading_non_reasoning_tokens() {
let guidance_tokens = GuidanceTokens {
+ bos_token_ids: Vec::new(),
eos_token_ids: Vec::new(),
reasoning_start_ids: vec![42],
reasoning_end_ids: vec![100],
+ tool_call_start_ids: Vec::new(),
+ tool_call_end_ids: Vec::new(),
+ add_bos_token: false,
};
assert_eq!(
@@ -1825,9 +1902,13 @@ mod tests {
#[test]
fn trim_prompt_replay_prefix_rejects_suffix_without_reasoning_token() {
let guidance_tokens = GuidanceTokens {
+ bos_token_ids: Vec::new(),
eos_token_ids: Vec::new(),
reasoning_start_ids: vec![42],
reasoning_end_ids: vec![100],
+ tool_call_start_ids: Vec::new(),
+ tool_call_end_ids: Vec::new(),
+ add_bos_token: false,
};
assert_eq!(
@@ -1839,9 +1920,13 @@ mod tests {
#[test]
fn trim_prompt_replay_prefix_rejects_empty_suffix() {
let guidance_tokens = GuidanceTokens {
+ bos_token_ids: Vec::new(),
eos_token_ids: Vec::new(),
reasoning_start_ids: vec![42],
reasoning_end_ids: vec![100],
+ tool_call_start_ids: Vec::new(),
+ tool_call_end_ids: Vec::new(),
+ add_bos_token: false,
};
assert_eq!(
diff --git a/src/core/runner.rs b/src/core/runner.rs
index 18458e52..aac3aa9c 100644
--- a/src/core/runner.rs
+++ b/src/core/runner.rs
@@ -14,6 +14,7 @@ use crate::utils::guidance::{GuidanceState, ParserFactory};
use crate::utils::image::compute_image_slice;
use crate::utils::logits_processor::{LogitsProcessor, Sampling};
use crate::utils::progress::ProgressLike;
+use crate::utils::env::soft_mask_disabled;
#[cfg(feature = "flashinfer")]
use crate::utils::FlashInferKvParams;
use crate::{
@@ -55,6 +56,30 @@ pub struct CachedSamplingParams {
pub presence_penalty: Option,
}
+/// Configuration for soft-masking the logits of tokens that the guidance mask disallows.
+/// Instead of hard masking to -inf, the logit is shifted down and clamped to a finite floor.
+#[derive(Clone, Debug)]
+pub struct SoftMaskConfig {
+ /// Additive logit shift applied to disallowed tokens (`Default` uses -1000.0).
+ /// Intended to be large enough in magnitude to make the softmax probability negligible
+ /// while keeping the shifted value finite.
+ pub mask_shift: f32,
+ /// Lower bound applied to a logit after `mask_shift` has been added (`Default` uses -1e9).
+ pub min_logit: f32,
+ /// Whether soft masking is used (`Default` uses `!soft_mask_disabled()`); when false, masked logits are set to -inf.
+ pub enabled: bool,
+}
+
+impl Default for SoftMaskConfig {
+ fn default() -> Self {
+ Self {
+ mask_shift: -1000.0,
+ min_logit: -1e9, // Finite floor so a shifted logit is never clamped down to -inf
+ enabled: !soft_mask_disabled(),
+ }
+ }
+}
+
pub enum Seqs<'a> {
SeqRefs(&'a [&'a Sequence]),
DecodeVec(&'a Vec),
@@ -316,14 +341,29 @@ impl ModelRunner {
}
let apply_len = std::cmp::min(vocab_size, mask_len);
+ // Soft masking configuration for gradient smoothing
+ let soft_mask = SoftMaskConfig::default();
for tok in 0..apply_len {
if !mask.is_allowed(tok as u32) {
- row[tok] = f32::NEG_INFINITY;
+ if soft_mask.enabled {
+ // Soft masking: shift logit by configured amount
+ // This maintains gradient flow while still suppressing disallowed tokens
+ row[tok] = (row[tok] + soft_mask.mask_shift).max(soft_mask.min_logit);
+ } else {
+ // Hard masking when soft mask is disabled
+ row[tok] = f32::NEG_INFINITY;
+ }
}
}
if mask_len < vocab_size {
for tok in mask_len..vocab_size {
- row[tok] = f32::NEG_INFINITY;
+ if soft_mask.enabled {
+ // Soft masking for out-of-range tokens
+ row[tok] = (row[tok] + soft_mask.mask_shift).max(soft_mask.min_logit);
+ } else {
+ // Hard masking when soft mask is disabled
+ row[tok] = f32::NEG_INFINITY;
+ }
}
}
}
@@ -1603,11 +1643,8 @@ impl ModelRunner {
}),
};
- let (guided_logits, guided_seq_ids) =
- self.apply_requested_guidance(logits, &seqs, &seq_ids)?;
-
// Apply penalties using cached values (same for all sequences in batch)
- // This is done AFTER LLG masking so penalties only affect tokens allowed by grammar
+ // This is done BEFORE LLG masking so as to avoid impacting masked logits
let has_any_penalty =
cached_params.frequency_penalty.is_some() || cached_params.presence_penalty.is_some();
@@ -1629,16 +1666,39 @@ impl ModelRunner {
.collect();
self.logit_processor.apply_batch_repeat_penalty(
- &guided_logits,
+ logits,
vec![cached_params.frequency_penalty.unwrap_or(0.0); batch_size],
vec![cached_params.presence_penalty.unwrap_or(0.0); batch_size],
reference_tokens,
)?
} else {
- guided_logits.to_owned()
+ logits.to_owned()
};
- let tokens = self.sample_processed_logits(&logits, &cached_params.sampling)?;
+ let (guided_logits, guided_seq_ids) =
+ self.apply_requested_guidance(&logits, &seqs, &seq_ids)?;
+
+ let mut tokens = self.sample_processed_logits(&guided_logits, &cached_params.sampling)?;
+
+ // For sequences with ff_tokens, use them instead of sampled tokens if mismatching
+ // TODO: use the fftokens as draft tokens
+ if let Some(factory) = &self.llg_factory {
+ let mut guidance_states = self.guidance_states.write();
+ for (i, seq_id) in seq_ids.iter().enumerate() {
+ if let Some(state) = guidance_states.get_mut(seq_id) {
+ let ff_tokens = state.compute_ff_tokens();
+ if !ff_tokens.is_empty() && ff_tokens[0] != tokens[i] {
+ crate::log_warn!(
+ "[Seq {}] Replacing sampled token {} with ff-token {}",
+ seq_id,
+ tokens[i],
+ ff_tokens[0]
+ );
+ tokens[i] = ff_tokens[0];
+ }
+ }
+ }
+ }
self.commit_guided_tokens(&seq_ids, &tokens, guided_seq_ids);
diff --git a/src/main.rs b/src/main.rs
index 7ae7cc6c..3aba2fe9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -218,6 +218,8 @@ async fn main() -> Result<()> {
args.disable_reasoning,
args.disable_cuda_graph,
Some(args.prefill_chunk_size),
+ args.allow_constraint_api,
+ args.enable_tool_grammar,
);
let server_port = if server {
diff --git a/src/py/mod.rs b/src/py/mod.rs
index c95e2946..224604fe 100644
--- a/src/py/mod.rs
+++ b/src/py/mod.rs
@@ -6,8 +6,8 @@ use crate::server::run_server;
use crate::transfer::{PdConfig, PdMethod, PdRole};
use crate::utils::chat_template::Message;
use crate::utils::config::{EngineConfig, GenerationConfig, SamplingParams};
+use crate::utils::config::ReasoningEffort;
use crate::utils::get_dtype;
-use crate::utils::reasoning::ReasoningEffort;
use llguidance::api::TopLevelGrammar;
use parking_lot::RwLock;
use pyo3::exceptions::PyStopIteration;
@@ -283,7 +283,8 @@ impl EngineConfig {
mcp_command=None, mcp_config=None, mcp_args=None,
tool_prompt_template=None,
pd_server_prefix_cache_ratio=None, pd_client_prefix_cache_ratio=None, yarn_scaling_factor=None,
- disable_reasoning=false, disable_cuda_graph=false, prefill_chunk_size=Some(8192),))]
+ disable_reasoning=false, disable_cuda_graph=false, prefill_chunk_size=Some(8192),
+ allow_constraint_api=false, enable_tool_grammar=false,))]
pub fn new(
model_id: Option,
weight_path: Option,
@@ -318,6 +319,8 @@ impl EngineConfig {
disable_reasoning: bool,
disable_cuda_graph: bool,
prefill_chunk_size: Option,
+ allow_constraint_api: bool,
+ enable_tool_grammar: bool,
) -> Self {
let mut device_ids = device_ids.unwrap_or_default();
if device_ids.is_empty() {
@@ -372,6 +375,8 @@ impl EngineConfig {
prefill_chunk_size: crate::utils::config::normalize_prefill_chunk_size(
prefill_chunk_size.unwrap_or(crate::utils::config::DEFAULT_PREFILL_CHUNK_SIZE),
),
+ allow_constraint_api,
+ enable_tool_grammar,
}
}
}
@@ -462,6 +467,7 @@ impl SamplingParams {
fn reasoning_effort(&self) -> Option {
self.reasoning_effort.as_ref().map(|effort| match effort {
ReasoningEffort::None => "none".to_string(),
+ ReasoningEffort::ModelDefault => "model_default".to_string(),
ReasoningEffort::Low => "low".to_string(),
ReasoningEffort::Medium => "medium".to_string(),
ReasoningEffort::High => "high".to_string(),
diff --git a/src/server/claude_server.rs b/src/server/claude_server.rs
index c5023bb5..cf055a1d 100644
--- a/src/server/claude_server.rs
+++ b/src/server/claude_server.rs
@@ -1,13 +1,15 @@
use super::{
- build_messages_and_images, ChatMessage, ImageUrlContent, MessageContent, MessageContentType,
- ServerData,
+ build_messages_and_images, ChatMessage, ImageUrlContent,
+ MessageContent, MessageContentType, ServerData,
};
+use crate::utils::guidance_grammar::build_guided_decoding_grammar;
use crate::core::engine::{LLMEngine, StreamItem};
use crate::server::logger::ChatCompletionLogger;
use crate::server::parser::{BufferedFinalizeResult, StreamResult, StreamToolParser};
use crate::tools::helpers::{
build_invalid_tool_call_feedback, build_tool_schema_map, filter_tool_calls,
- retain_tool_calls_forced_name, strict_tool_call_validation_enabled,
+ retain_tool_calls_forced_name,
+ strict_tool_call_validation_enabled,
};
use crate::tools::{Tool, ToolCall, ToolChoice};
use crate::utils::config::SamplingParams;
@@ -2142,12 +2144,48 @@ pub async fn messages(
let parser_model_id =
super::resolve_engine_model_id(&engine_config).unwrap_or_else(|| model_id.clone());
let enforce_parser = engine_config.enforce_parser.clone();
+ let tool_parser_name = if let Some(ref enforced) = enforce_parser {
+ enforced.clone()
+ } else {
+ StreamToolParser::parser_name_for_model(&model_type, &parser_model_id).to_string()
+ };
let img_cfg = {
let e = data.engine.read();
e.img_cfg.clone()
};
+ {
+ let engine = data.engine.read();
+ let model_type = engine.model_type.clone();
+ let model_id = model_id.clone();
+ let chat_template = Some(engine.get_chat_template());
+
+ params.grammar = build_guided_decoding_grammar(
+ &engine.guidance_tokens,
+ &tool_config,
+ &resolved_tools,
+ &tool_parser_name,
+ None,
+ tool_choice_required,
+ forced_tool_name.clone(),
+ max_tokens,
+ None,
+ engine_config.enable_tool_grammar,
+ engine_config.allow_constraint_api,
+ &engine.tokenizer,
+ &model_type,
+ &model_id,
+ chat_template,
+ engine_config.disable_reasoning,
+ );
+
+ if let Some(ref grammar) = params.grammar {
+ let lark = crate::utils::guidance_grammar::get_lark_from_top_level_grammar(grammar);
+ crate::log_info!("[llg] Final Claude grammar:\n{}", lark);
+ }
+ }
+
let (messages, image_data) = match build_messages_and_images(&chat_messages, img_cfg.as_ref()) {
Ok(output) => output,
Err(err) => {
diff --git a/src/server/mod.rs b/src/server/mod.rs
index 7b66b5f1..3e1ecd72 100644
--- a/src/server/mod.rs
+++ b/src/server/mod.rs
@@ -1,6 +1,5 @@
// src/server/mod.rs
use clap::Parser;
-use llguidance::api::TopLevelGrammar;
use serde::{Deserialize, Serialize};
pub mod claude_server;
pub mod logger;
@@ -9,16 +8,14 @@ pub mod server;
pub mod streaming;
use crate::core::engine::LLMEngine;
use crate::server::streaming::Streamer;
-use crate::tools::schema::{schema_to_tools, ToolGrammarBuilder};
use crate::transfer::PdRole;
use crate::utils::chat_template::Message;
-use crate::utils::config::{EngineConfig, SamplingParams};
-use crate::utils::guidance::{compose_grammars, GuidanceTokens, TopLevelGrammarExt};
+use crate::utils::config::{EngineConfig, ReasoningEffort, SamplingParams};
+use crate::utils::guidance::{GuidanceTokens};
use crate::utils::image::{
compute_tokens_per_image, get_tensor_raw_data, load_image_from_base64, load_image_from_url,
ImageData, ImageProcessConfig, ImageProcessTrait, IMAGE_PLACEHOLDER,
};
-use crate::utils::reasoning::ReasoningEffort;
use axum::http::{self, StatusCode};
use axum::response::{IntoResponse, Sse};
use axum::routing::{get, post};
@@ -61,7 +58,7 @@ where
}))
}
-#[derive(Debug, Deserialize, Serialize)]
+#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct ChatCompletionRequest {
pub messages: Vec,
pub model: Option,
@@ -111,6 +108,90 @@ pub struct ChatCompletionRequest {
pub reasoning_effort: Option,
}
+#[derive(Debug, Deserialize)]
+pub struct GrammarRequest {
+ /// Messages for the conversation (without system prompt)
+ pub messages: Vec,
+ /// LLGuidance grammar definition text; presumably interpreted according to `grammar_type` (TODO confirm)
+ pub grammar: String,
+ /// Type of grammar: "lark", "json_schema", "regex", "choice"; defaults to "lark" when omitted
+ #[serde(default = "default_grammar_type")]
+ pub grammar_type: String,
+ /// Optional system prompt override (plain text, not Jinja2)
+ #[serde(default)]
+ pub system_prompt_override: Option,
+ /// Whether to stream the response; defaults to false when omitted
+ #[serde(default)]
+ pub stream: bool,
+ /// Maximum tokens to generate
+ #[serde(default)]
+ pub max_tokens: Option,
+ /// Temperature for sampling
+ #[serde(default)]
+ pub temperature: Option,
+ /// Top-k for sampling
+ #[serde(default)]
+ pub top_k: Option,
+ /// Top-p for sampling
+ #[serde(default)]
+ pub top_p: Option,
+ /// Frequency penalty
+ #[serde(default)]
+ pub frequency_penalty: Option,
+ /// Presence penalty
+ #[serde(default)]
+ pub presence_penalty: Option,
+ /// Session ID for conversation persistence
+ #[serde(default)]
+ pub session_id: Option,
+ /// Thinking setting; also accepted under the key `enable_thinking`
+ #[serde(default, alias = "enable_thinking")]
+ pub thinking: Option,
+ /// Stop sequences; also accepted under the key `stop_sequences`
+ #[serde(
+ default,
+ alias = "stop_sequences",
+ deserialize_with = "deserialize_stop_sequences"
+ )]
+ pub stop: Option>,
+}
+
+fn default_grammar_type() -> String {
+ "lark".to_string()
+}
+
+#[derive(Debug, Serialize)]
+pub struct GrammarResponse {
+ pub id: String,
+ pub object: String,
+ pub created: u64,
+ pub model: String,
+ pub choices: Vec,
+ pub usage: Usage,
+ pub grammar_metadata: Option,
+}
+
+#[derive(Debug, Serialize)]
+pub struct GrammarChoice {
+ pub index: usize,
+ pub message: GrammarMessage,
+ pub finish_reason: Option,
+}
+
+#[derive(Debug, Serialize)]
+pub struct GrammarMessage {
+ pub role: String,
+ pub content: Option,
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub grammar_metadata: Option,
+}
+
+#[derive(Debug, Serialize)]
+pub struct GrammarMetadata {
+ pub grammar_type: String,
+ pub matched_cache: bool,
+}
+
pub fn resolve_engine_model_id(econfig: &EngineConfig) -> Option {
if let Some(model_id) = &econfig.model_id {
if !model_id.trim().is_empty() {
@@ -200,266 +281,55 @@ pub struct ExtraBody {
pub extra: HashMap,
}
-// TopLevelGrammar conversion functions
-// Client grammars are composed alongside TEXT and optional reasoning grammars.
-
-pub fn grammar_fragment_from_structured_outputs(
- structured: &StructuredOutputs,
-) -> Result