diff --git a/Cargo.toml b/Cargo.toml
index 6a5c76cc..238fb8e1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,8 +22,8 @@ itertools = "0.13.0"
 akin = "0.4.0"
 indicatif = "0.17.11"
 serde_json = "1.0.108"
-llguidance = { version = "1.6", default-features = false, features = ["lark"] }
-toktrie_hf_tokenizers = "1.6"
+llguidance = { version = "1.7", default-features = false, features = ["lark", "referencing", "jsonschema_validation"] }
+toktrie_hf_tokenizers = "1.7"
 toktrie = "1.4"
 half = { version = "2.5.0", features = ["num-traits", "use-intrinsics", "rand_distr"] }
 tokio = { version = "1.38.0", features = ["sync"] }
diff --git a/ReadMe-CN.md b/ReadMe-CN.md
index 1ef3356e..a83943d0 100644
--- a/ReadMe-CN.md
+++ b/ReadMe-CN.md
@@ -177,6 +177,147 @@ xinfer --m unsloth/Qwen3.5-4B-GGUF --f Qwen3.5-4B-Q4_K_M.gguf
 ## 📘 使用方法
 > **Python包安装后**请使用 `python3 -m xinfer.server` 方式运行
 
+### 安装
+
+<details>
+<summary><b>CUDA（Linux）</b></summary>
+
+```bash
+# 前置依赖
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+sudo apt-get install -y git build-essential libssl-dev pkg-config
+
+# 可选：CUDA toolkit + NCCL
+sudo apt-get install -y cuda-nvcc-12-9 cuda-nvrtc-dev-12-9 libcublas-dev-12-9 libcurand-dev-12-9
+sudo apt-get install -y libnccl2 libnccl-dev
+
+# 编译安装
+cargo install --features cuda,nccl,flashinfer,cutlass
+# Flash Attention 后端：
+cargo install --features cuda,nccl,flashattn,cutlass
+# V100 / 较老硬件（无 flash 后端）：
+cargo install --features cuda,nccl
+```
+
+</details>
+
+<details>
+<summary><b>Metal（macOS）</b></summary>
+
+```bash
+# 先安装 Xcode 命令行工具
+cargo install --features metal
+```
+
+</details>
+
+默认启动 **API 服务模式**（端口 8000）。使用 `--i` 启用交互模式 🤖，`--ui-server` 启用带 Web UI 的服务模式 🌐，`--m` 指定Huggingface模型，或`--w` 指定本地Safetensors模型路径 或`--f` 指定GGUF模型文件：
+
+> 单卡/多卡推理
+  <details open>
+    <summary>单卡推理</summary>
+
+   ```bash
+   # CUDA （将 `--i`替换成 `--ui-server`则启用网页版本）
+   vllm-rs --i --m unsloth/Qwen3.5-27B-GGUF --f Qwen3.5-27B-Q4_K_M.gguf --kv-fraction 0.8
+   # Metal/MacOS (MacOS Tahoe之前的系统可能会存在生成过慢问题，使用更小的`--max-model-len` 或 `--kv-fraction`减少显存占用)
+   vllm-rs --i --m unsloth/Qwen3.5-4B-GGUF --f Qwen3.5-4B-Q3_K_M.gguf
+   ```
+  </details>
+
+  <details open>
+    <summary>多卡未量化模型</summary>
+
+   ```bash
+   vllm-rs --d 0,1 --w /path/Qwen3-30B-A3B-Instruct-2507 --ui-server --prefix-cache
+   ```
+  </details>
+
+  <details open>
+    <summary>FP8/FP4模型</summary>
+
+  _FP8格式:_
+   ```bash
+   vllm-rs --d 0,1 --w /path/Qwen3-Coder-30B-A3B-Instruct-FP8/ --ui-server --prefix-cache
+    # Or Qwen3-Next 80B
+   vllm-rs --m Qwen/Qwen3-Coder-Next-FP8 --ui-server --d 0,1 --prefix-cache
+   ```
+
+  _MXFP4格式:_
+  ```bash
+  vllm-rs --m olka-fi/Qwen3.5-4B-MXFP4 --ui-server --prefix-cache
+  ```
+
+  _NVFP4格式:_
+  ```bash
+  vllm-rs --m AxionML/Qwen3.5-9B-NVFP4 --ui-server --prefix-cache
+  ```
+  </details>
+
+   <details open>
+    <summary>多卡量化模型</summary>
+
+   ```bash
+   vllm-rs --ui-server --d 0,1 --f /path/Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf --prefix-cache
+   ```
+  </details>
+
+   <details open>
+    <summary>未量化模型运行为Q4K量化模型，同时使用FP8 KVCache</summary>
+
+   ```bash
+   # 编译时去除`flashinfer` 或 `flashattn` 以使用fp8 kvcache
+   vllm-rs --d 0,1 --w /path/Qwen3-30B-A3B-Instruct-2507 --isq q4k --server --port 8000 --fp8-kvcache
+   ```
+  </details>
+
+---
+
+## 🔌 结构化输出与约束（Guided Decoding）
+
+vLLM.rs 现在支持通过 llguidance 库实现结构化输出和约束生成。
+
+### ⚠️ 安全说明
+
+**客户端提供的约束默认被阻止**。要启用它们，您必须显式设置 `--allow-constraint-api` 标志。
+
+#### 启用客户端约束
+```bash
+# 通过 HTTP API 启用客户端提交的约束
+vllm-rs --m Qwen/Qwen3.5-27B-FP8 --ui-server --prefix-cache --allow-constraint-api
+```
+
+#### 客户端约束的安全风险
+客户端提供的约束可能导致严重的安全漏洞：
+
+1. **Lark 语法注入**：恶意客户端可以提交精心设计的 Lark 语法，这些语法：
+   - 可以访问超出用户角色边界的特殊令牌
+   - 注入可能导致 ReDoS 攻击的任意正则表达式模式
+   - 绕过聊天模板的角色分离
+
+2. **JSON Schema 转义**：客户端可以指定具有以下行为的 schema：
+   - 引用系统不打算让用户控制的内部特殊令牌
+   - 创建模糊的令牌边界，导致系统指令泄露
+   - 注入匹配系统角色的禁止正则表达式模式
+
+3. **角色边界违规**：启用约束后，客户端可能：
+   - 逃逸聊天模板中的 `user:` 角色边界
+   - 注入 `system:` 或 `assistant:` 角色内容
+   - 操纵 tool_call 标记以注入伪造的工具响应
+   - 发明新的方法使设计不佳的系统超出预期范围运行
+
+#### 推荐用法
+- **生产环境**：仅当访问方是可信的系统/客户端，或传入内容已由支持 tokenizer 感知并内联校验语法的 WAF 过滤时，才设置 `--enable-tool-grammar` 和/或 `--allow-constraint-api`。
+
+```bash
+# 启用自动工具语法生成
+vllm-rs --m Qwen/Qwen3.5-27B-FP8 --ui-server --prefix-cache --enable-tool-grammar
+```
+
+查看 [**结构化输出文档 →**](docs/llguidance-integration.md)
+
+---
+
 > Docker 内构建请参考 [**在 Docker 中运行 xInfer →**](docs/docker.md)
 
 ### 运行模型
diff --git a/ReadMe.md b/ReadMe.md
index 171fcf7f..f4d43d7c 100644
--- a/ReadMe.md
+++ b/ReadMe.md
@@ -290,6 +290,48 @@ xinfer --m mistralai/Ministral-3-3B --ui-server
 
 ---
 
+## 🔌 Guided decoding (Structured Outputs & Constraints)
+vLLM.rs now supports structured output and constraint-based generation via llguidance.
+
+### ⚠️ Security Notice
+
+**Client-provided constraints are BLOCKED by default.** To enable them, you must explicitly set the `--allow-constraint-api` flag.
+
+#### Enabling Client Constraints
+```bash
+# Enable client-submitted constraints via HTTP API
+vllm-rs --m Qwen/Qwen3.5-27B-FP8 --ui-server --prefix-cache --allow-constraint-api
+```
+
+#### Security Risks of Client Constraints
+Client-provided constraints can introduce serious security vulnerabilities:
+
+1. **Lark Grammar Injection**: Malicious clients can submit crafted Lark grammars that:
+   - Access special tokens beyond the user role boundary
+   - Inject arbitrary regex patterns that could cause ReDoS attacks
+   - Bypass the chat template's role separation
+
+2. **JSON Schema Escapes**: Clients can specify schemas that:
+   - Reference internal special tokens not intended for user control
+   - Create ambiguous token boundaries that leak system instructions
+   - Inject forbidden regex patterns matching system roles
+
+3. **Role Boundary Violations**: When constraints are enabled, clients can potentially:
+   - Escape the `user:` role boundary in chat templates
+   - Inject `system:` or `assistant:` role content
+   - Manipulate tool_call markers to inject fake tool responses
+   - Invent new ways to make poorly designed systems behave beyond intended scope
+
+#### Recommended Usage
+- **Production**: Only set `--enable-tool-grammar` and/or `--allow-constraint-api` when the server is reached by trusted systems/clients, or when inbound content is filtered through a tokenizer-aware WAF that validates grammars inline.
+
+```bash
+# Enable automatic tool grammar generation
+vllm-rs --m Qwen/Qwen3.5-27B-FP8 --ui-server --prefix-cache --enable-tool-grammar
+```
+
+See [**Structured Outputs Documentation →**](docs/llguidance-integration.md)
+
 ## 📘 Build from source code
 
 **Option 1 — Cargo**
diff --git a/docs/guided_decoding.md b/docs/guided_decoding.md
index d177bf8b..68f3fa5d 100644
--- a/docs/guided_decoding.md
+++ b/docs/guided_decoding.md
@@ -8,13 +8,23 @@ It focuses on:
 - how reasoning effort is applied
 - practical usage and validation commands
 
+## JSON Schema Reference
+
+For detailed JSON Schema constraint documentation with curl examples, see [`llguidance-json-schema.md`](llguidance-json-schema.md).
+
+This covers:
+- Schema type definitions (string, integer, number, boolean, object, array)
+- All supported API endpoints (OpenAI-compatible and Claude server)
+- Complete curl examples for each permutation
+- Schema sanitization behavior
+
 ## Current Model
 
 Guided decoding is request-scoped.
 
 The core engine does not invent grammars on its own. A request either:
 - supplies a constraint grammar
-- gets that constraint grammar composed with a reasoning prefix
+- gets a composed grammar containing both
 - or runs unconstrained when neither exists
 
 The final grammar is stored in `SamplingParams.grammar` and consumed by the runner.
@@ -44,7 +54,7 @@ The server composes:
 
 The result is a single `TopLevelGrammar` assigned to `SamplingParams.grammar`.
 
-If no client-supplied constraint grammar exists, `params.grammar` stays `None`.
+If no constraint grammar and no tool grammar exist, `params.grammar` stays `None`.
 
 ### 3. Sampling in runner
 
@@ -84,14 +94,17 @@ Legacy fields
 - `constraint`
 - `constraint_type = regex | lark | json_schema | json`
 
-If a request provides none of the above, guided decoding is not enabled.
+If a request provides none of the above, guided decoding is not enabled unless tool grammar synthesis adds one.
+
+The grammar composition logic is in `src/utils/guidance_grammar.rs`. The `GrammarRequestDispatcher` and `GrammarComposer` handle the composition of constraint grammars with reasoning grammars.
 
 ### Claude server
 
+Claude reuses the same tool-grammar builder path.
+
 Current state:
 - Claude does not expose the same client-supplied grammar request surface as the OpenAI endpoint
 - Claude reasoning is still driven by explicit thinking behavior, not by `reasoning_effort` grammar composition
-- Claude requests therefore do not currently enable guided decoding
 
 ## Reasoning Effort
 
@@ -99,7 +112,7 @@ Reasoning effort is separate from ordinary structured outputs.
 
 ### Current state
 
-The OpenAI path maps `reasoning_effort` into grammar composition when a request constraint exists.
+The OpenAI path maps `reasoning_effort` into grammar composition.
 
 Accepted values come from `ReasoningEffort::from_str`:
 - `none`
@@ -115,7 +128,7 @@ Non-Python builds also support:
 - `custom:<template>`
 
 Relevant code:
-- `src/utils/reasoning.rs`
+- `src/utils/guidance_grammar.rs`
 - `src/server/server.rs`
 - `src/utils/guidance.rs`
 
@@ -124,7 +137,6 @@ Relevant code:
 Reasoning effort:
 - does not enable chat-template thinking by itself
 - only affects grammar composition
-- is ignored when no request constraint grammar is present
 - only works when reasoning start/end tokens are available
 
 If the tokenizer does not expose reasoning markers, the system logs a warning and falls back to the base grammar.
@@ -160,7 +172,7 @@ xinfer --m Qwen/Qwen3.5-35B-A3B-FP8/ --ui-server --d 0
 | Enforce text pattern | `structured_outputs` or `constraint` | `regex` |
 | Enforce full object schema | `structured_outputs` or `response_format` | `json` / `json_schema` |
 | Enforce custom grammar | `structured_outputs` or `constraint` | `grammar` / `lark` |
-| Constrain tagged structured output payload | `structured_outputs` | `structural_tag` |
+| Constrain tool call payload | `structured_outputs` or automatic tool grammar | `structural_tag` / tool grammar |
 | Add a reasoning prefix | `reasoning_effort` | `low`, `medium`, `high`, etc. |
 
 ### 1. Constrain the answer to a fixed set
@@ -470,5 +482,5 @@ Check:
 
 - Guided decoding is only active when `SamplingParams.grammar` is present.
 - OpenAI currently has the richest client-facing grammar surface.
-- Claude does not currently expose request-level guided decoding.
+- Claude currently reuses tool grammar, but not the same direct constraint request API.
 - No request-level grammar means no guided decoding.
diff --git a/src/api.rs b/src/api.rs
index 49e6e047..2dc08e65 100644
--- a/src/api.rs
+++ b/src/api.rs
@@ -167,6 +167,8 @@ impl EngineBuilder {
             false,
             false,
             None,
+            false,
+            false,
         );
 
         if let Some(kv_dtype) = self.kvcache_dtype {
diff --git a/src/core/engine.rs b/src/core/engine.rs
index 0008bf17..afbe9c76 100644
--- a/src/core/engine.rs
+++ b/src/core/engine.rs
@@ -21,6 +21,7 @@ use crate::transfer::Transfer;
 use crate::utils::chat_template::Message;
 use crate::utils::config::{EngineConfig, EosTokenId, ModelType, SamplingParams};
 use crate::utils::guidance::{build_llg_factory, extract_guidance_tokens, GuidanceTokens};
+use crate::utils::guidance_grammar::{get_reasoning_token_strings, is_reasoning_grammar};
 use crate::utils::heartbeat::heartbeat_worker;
 use crate::utils::image::{get_image_config, ImageData, ImageProcessConfig};
 use crate::utils::kvcache_allocator::KVCacheAllocator;
@@ -132,6 +133,13 @@ impl LLMEngine {
                 };
             }
         }
+        if config.bos_token_id.is_none() {
+            if let Some(bos) = &config_tokenizer.bos_token {
+                if let Some(token) = tokenizer.get_vocab(true).get(bos).copied() {
+                    config.bos_token_id = Some(token as usize);
+                };
+            }
+        }
         let guidance_tokens = extract_guidance_tokens(
             &tokenizer,
             config
@@ -139,6 +147,8 @@ impl LLMEngine {
                 .as_ref()
                 .map(EosTokenId::to_vec)
                 .unwrap_or_default(),
+            config.bos_token_id.map_or(Vec::new(), |bos| vec![bos as u32]),
+            &config_tokenizer,
         );
         assert!(
             config.architectures.is_some() && config.architectures.as_ref().unwrap().len() == 1,
@@ -530,12 +540,17 @@ impl LLMEngine {
             }
         }
         let mut params = params.clone();
-        params.max_tokens = Some(
-            params
-                .max_tokens
-                .unwrap_or(self.econfig.max_tokens.unwrap_or(16384)),
-        );
-        let mut max_tokens = params.max_tokens.unwrap();
+
+        let mut max_tokens = if let Some(max_t) = params.max_tokens {
+            max_t
+        } else {
+            params.max_tokens = Some(
+                params
+                    .max_tokens
+                    .unwrap_or(self.econfig.max_tokens.unwrap_or(16384)),
+            );
+            params.max_tokens.unwrap()
+        };
         let requested_max_tokens = max_tokens;
 
         let max_model_len = self.econfig.max_model_len.unwrap_or(max_tokens);
@@ -1239,8 +1254,15 @@ impl LLMEngine {
     ) -> (String, i32) {
         // let mut collected_images = Vec::new();
         let mut prompt_template = self.template.clone();
-        prompt_template
-            .set_enable_thinking(params.thinking.unwrap_or(!self.econfig.disable_reasoning));
+        if let Some(grammar) = &params.grammar {
+            if is_reasoning_grammar(&grammar) {
+                prompt_template.set_enable_thinking(true);
+            } else {
+                prompt_template.set_enable_thinking(false);
+            }
+        } else {
+            prompt_template.set_enable_thinking(params.thinking.unwrap_or(!self.econfig.disable_reasoning));
+        };
         prompt_template.set_messages(messages);
         let image_idx: i32 = 0;
         let prompt_processed = prompt_template
@@ -1272,6 +1294,44 @@ impl LLMEngine {
                 prompt.replace("\n", "")
             );
         }
+        // Generation alignment and open/close parity enforcement
+        if let Some(grammar) = &params.grammar {
+            if self.guidance_tokens.add_bos_token {
+                // BOS-based trimming: trim at BOS token (for models with add_bos_token=true)
+                if let Ok(bos_string) = self.tokenizer.decode(&self.guidance_tokens.bos_token_ids, false) {
+                    if let Some((prompt, _trimmed)) = prompt.rsplit_once(&bos_string) {
+                        return (prompt.to_string(), image_idx)
+                    }
+                }
+                for bos_token in self.guidance_tokens.bos_token_ids.iter() {
+                    if let Ok(bos_string) = self.tokenizer.decode(&[bos_token.clone()], false) {
+                        if let Some((prompt, _trimmed)) = prompt.rsplit_once(&bos_string) {
+                            return (prompt.to_string(), image_idx)
+                        }
+                    }
+                }
+            } else {
+                // Reasoning tag-based trimming: check for reasoning start/end tokens
+                if let Some((start_str, end_str)) = get_reasoning_token_strings(&self.guidance_tokens, &self.tokenizer) {
+                    if is_reasoning_grammar(&grammar) {
+                        // Control entire reasoning block via guidance
+                        if prompt.trim().ends_with(&start_str) || prompt.trim().ends_with(&end_str) {
+                            if let Some((prompt, _trimmed)) = prompt.rsplit_once(&start_str) {
+                                return (prompt.to_string(), image_idx)
+                            }
+                        }
+                    } else {
+                        // Ensure guided grammar which will not generate a think-stop token is not within reasoning envelope
+                        // A completed inert <think>\n\n</think> block or even an injected think template are harmless
+                        if prompt.trim().ends_with(&start_str) {
+                            if let Some((prompt, _trimmed)) = prompt.rsplit_once(&start_str) {
+                                return (prompt.to_string(), image_idx)
+                            }
+                        }
+                    }
+                }
+            }
+        }
         (prompt, image_idx)
     }
 
@@ -1773,6 +1833,11 @@ impl LLMEngine {
     pub fn get_chat_template(&self) -> ChatTemplate {
         self.template.clone()
     }
+
+    /// Returns an owned copy of `self.default_chat_template`, intended for grammar generation.
+    pub fn get_default_chat_template(&self) -> String {
+        self.default_chat_template.clone()
+    }
 }
 
 #[cfg(test)]
@@ -1783,9 +1848,13 @@ mod tests {
     #[test]
     fn trim_prompt_replay_prefix_accepts_single_reasoning_token() {
         let guidance_tokens = GuidanceTokens {
+            bos_token_ids: Vec::new(),
             eos_token_ids: Vec::new(),
             reasoning_start_ids: vec![42, 99],
             reasoning_end_ids: vec![100],
+            tool_call_start_ids: Vec::new(),
+            tool_call_end_ids: Vec::new(),
+            add_bos_token: false,
         };
 
         assert_eq!(
@@ -1797,9 +1866,13 @@ mod tests {
     #[test]
     fn trim_prompt_replay_prefix_accepts_multi_token_suffix_when_first_token_is_reasoning() {
         let guidance_tokens = GuidanceTokens {
+            bos_token_ids: Vec::new(),
             eos_token_ids: Vec::new(),
             reasoning_start_ids: vec![42],
             reasoning_end_ids: vec![100],
+            tool_call_start_ids: Vec::new(),
+            tool_call_end_ids: Vec::new(),
+            add_bos_token: false,
         };
 
         assert_eq!(
@@ -1811,9 +1884,13 @@ mod tests {
     #[test]
     fn trim_prompt_replay_prefix_trims_leading_non_reasoning_tokens() {
         let guidance_tokens = GuidanceTokens {
+            bos_token_ids: Vec::new(),
             eos_token_ids: Vec::new(),
             reasoning_start_ids: vec![42],
             reasoning_end_ids: vec![100],
+            tool_call_start_ids: Vec::new(),
+            tool_call_end_ids: Vec::new(),
+            add_bos_token: false,
         };
 
         assert_eq!(
@@ -1825,9 +1902,13 @@ mod tests {
     #[test]
     fn trim_prompt_replay_prefix_rejects_suffix_without_reasoning_token() {
         let guidance_tokens = GuidanceTokens {
+            bos_token_ids: Vec::new(),
             eos_token_ids: Vec::new(),
             reasoning_start_ids: vec![42],
             reasoning_end_ids: vec![100],
+            tool_call_start_ids: Vec::new(),
+            tool_call_end_ids: Vec::new(),
+            add_bos_token: false,
         };
 
         assert_eq!(
@@ -1839,9 +1920,13 @@ mod tests {
     #[test]
     fn trim_prompt_replay_prefix_rejects_empty_suffix() {
         let guidance_tokens = GuidanceTokens {
+            bos_token_ids: Vec::new(),
             eos_token_ids: Vec::new(),
             reasoning_start_ids: vec![42],
             reasoning_end_ids: vec![100],
+            tool_call_start_ids: Vec::new(),
+            tool_call_end_ids: Vec::new(),
+            add_bos_token: false,
         };
 
         assert_eq!(
diff --git a/src/core/runner.rs b/src/core/runner.rs
index 18458e52..aac3aa9c 100644
--- a/src/core/runner.rs
+++ b/src/core/runner.rs
@@ -14,6 +14,7 @@ use crate::utils::guidance::{GuidanceState, ParserFactory};
 use crate::utils::image::compute_image_slice;
 use crate::utils::logits_processor::{LogitsProcessor, Sampling};
 use crate::utils::progress::ProgressLike;
+use crate::utils::env::soft_mask_disabled;
 #[cfg(feature = "flashinfer")]
 use crate::utils::FlashInferKvParams;
 use crate::{
@@ -55,6 +56,30 @@ pub struct CachedSamplingParams {
     pub presence_penalty: Option<f32>,
 }
 
+/// Soft masking configuration for tokens disallowed by a guidance mask.
+/// Instead of hard masking to -inf, disallowed logits are shifted by `mask_shift` and floored at `min_logit`.
+#[derive(Clone, Debug)]
+pub struct SoftMaskConfig {
+    /// Additive logit shift for disallowed tokens (default: -1000.0).
+    /// Should be negative and large enough in magnitude that the softmax probability is negligible,
+    /// while still leaving the shifted logit finite.
+    pub mask_shift: f32,
+    /// Lower bound applied to a logit after adding `mask_shift` (default: -1e9).
+    pub min_logit: f32,
+    /// Whether soft masking is used; when false, hard -inf masking applies (default: `!soft_mask_disabled()`).
+    pub enabled: bool,
+}
+
+impl Default for SoftMaskConfig {
+    fn default() -> Self {
+        Self {
+            mask_shift: -1000.0,
+            min_logit: -1e9,  // Finite floor so a shifted logit never reaches -inf
+            enabled: !soft_mask_disabled(), // helper from crate::utils::env; presumably an opt-out switch — TODO confirm
+        }
+    }
+}
+
 pub enum Seqs<'a> {
     SeqRefs(&'a [&'a Sequence]),
     DecodeVec(&'a Vec<DecodeSequence>),
@@ -316,14 +341,29 @@ impl ModelRunner {
             }
 
             let apply_len = std::cmp::min(vocab_size, mask_len);
+            // Soft masking configuration: shift disallowed logits instead of forcing -inf
+            let soft_mask = SoftMaskConfig::default();
             for tok in 0..apply_len {
                 if !mask.is_allowed(tok as u32) {
-                    row[tok] = f32::NEG_INFINITY;
+                    if soft_mask.enabled {
+                        // Soft masking: shift logit by configured amount
+                        // This keeps the logit finite while still suppressing disallowed tokens
+                        row[tok] = (row[tok] + soft_mask.mask_shift).max(soft_mask.min_logit);
+                    } else {
+                        // Hard masking when soft mask is disabled
+                        row[tok] = f32::NEG_INFINITY;
+                    }
                 }
             }
             if mask_len < vocab_size {
                 for tok in mask_len..vocab_size {
-                    row[tok] = f32::NEG_INFINITY;
+                    if soft_mask.enabled {
+                        // Soft masking for out-of-range tokens
+                        row[tok] = (row[tok] + soft_mask.mask_shift).max(soft_mask.min_logit);
+                    } else {
+                        // Hard masking when soft mask is disabled
+                        row[tok] = f32::NEG_INFINITY;
+                    }
                 }
             }
         }
@@ -1603,11 +1643,8 @@ impl ModelRunner {
                 }),
         };
 
-        let (guided_logits, guided_seq_ids) =
-            self.apply_requested_guidance(logits, &seqs, &seq_ids)?;
-
         // Apply penalties using cached values (same for all sequences in batch)
-        // This is done AFTER LLG masking so penalties only affect tokens allowed by grammar
+        // This is done BEFORE LLG masking so as to avoid impacting masked logits
         let has_any_penalty =
             cached_params.frequency_penalty.is_some() || cached_params.presence_penalty.is_some();
 
@@ -1629,16 +1666,39 @@ impl ModelRunner {
                 .collect();
 
             self.logit_processor.apply_batch_repeat_penalty(
-                &guided_logits,
+                logits,
                 vec![cached_params.frequency_penalty.unwrap_or(0.0); batch_size],
                 vec![cached_params.presence_penalty.unwrap_or(0.0); batch_size],
                 reference_tokens,
             )?
         } else {
-            guided_logits.to_owned()
+            logits.to_owned()
         };
 
-        let tokens = self.sample_processed_logits(&logits, &cached_params.sampling)?;
+        let (guided_logits, guided_seq_ids) =
+            self.apply_requested_guidance(&logits, &seqs, &seq_ids)?;
+
+        let mut tokens = self.sample_processed_logits(&guided_logits, &cached_params.sampling)?;
+
+        // For sequences with ff_tokens, use them instead of sampled tokens if mismatching
+        // TODO: use the fftokens as draft tokens
+        if let Some(factory) = &self.llg_factory {
+            let mut guidance_states = self.guidance_states.write();
+            for (i, seq_id) in seq_ids.iter().enumerate() {
+                if let Some(state) = guidance_states.get_mut(seq_id) {
+                    let ff_tokens = state.compute_ff_tokens();
+                    if !ff_tokens.is_empty() && ff_tokens[0] != tokens[i] {
+                        crate::log_warn!(
+                            "[Seq {}] Replacing sampled token {} with ff-token {}",
+                            seq_id,
+                            tokens[i],
+                            ff_tokens[0]
+                        );
+                        tokens[i] = ff_tokens[0];
+                    }
+                }
+            }
+        }
 
         self.commit_guided_tokens(&seq_ids, &tokens, guided_seq_ids);
 
diff --git a/src/main.rs b/src/main.rs
index 7ae7cc6c..3aba2fe9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -218,6 +218,8 @@ async fn main() -> Result<()> {
         args.disable_reasoning,
         args.disable_cuda_graph,
         Some(args.prefill_chunk_size),
+        args.allow_constraint_api,
+        args.enable_tool_grammar,
     );
 
     let server_port = if server {
diff --git a/src/py/mod.rs b/src/py/mod.rs
index c95e2946..224604fe 100644
--- a/src/py/mod.rs
+++ b/src/py/mod.rs
@@ -6,8 +6,8 @@ use crate::server::run_server;
 use crate::transfer::{PdConfig, PdMethod, PdRole};
 use crate::utils::chat_template::Message;
 use crate::utils::config::{EngineConfig, GenerationConfig, SamplingParams};
+use crate::utils::config::ReasoningEffort;
 use crate::utils::get_dtype;
-use crate::utils::reasoning::ReasoningEffort;
 use llguidance::api::TopLevelGrammar;
 use parking_lot::RwLock;
 use pyo3::exceptions::PyStopIteration;
@@ -283,7 +283,8 @@ impl EngineConfig {
         mcp_command=None, mcp_config=None, mcp_args=None,
         tool_prompt_template=None,
         pd_server_prefix_cache_ratio=None, pd_client_prefix_cache_ratio=None, yarn_scaling_factor=None,
-        disable_reasoning=false, disable_cuda_graph=false, prefill_chunk_size=Some(8192),))]
+        disable_reasoning=false, disable_cuda_graph=false, prefill_chunk_size=Some(8192),
+        allow_constraint_api=false, enable_tool_grammar=false,))]
     pub fn new(
         model_id: Option<String>,
         weight_path: Option<String>,
@@ -318,6 +319,8 @@ impl EngineConfig {
         disable_reasoning: bool,
         disable_cuda_graph: bool,
         prefill_chunk_size: Option<usize>,
+        allow_constraint_api: bool,
+        enable_tool_grammar: bool,
     ) -> Self {
         let mut device_ids = device_ids.unwrap_or_default();
         if device_ids.is_empty() {
@@ -372,6 +375,8 @@ impl EngineConfig {
             prefill_chunk_size: crate::utils::config::normalize_prefill_chunk_size(
                 prefill_chunk_size.unwrap_or(crate::utils::config::DEFAULT_PREFILL_CHUNK_SIZE),
             ),
+            allow_constraint_api,
+            enable_tool_grammar,
         }
     }
 }
@@ -462,6 +467,7 @@ impl SamplingParams {
     fn reasoning_effort(&self) -> Option<String> {
         self.reasoning_effort.as_ref().map(|effort| match effort {
             ReasoningEffort::None => "none".to_string(),
+            ReasoningEffort::ModelDefault => "model_default".to_string(),
             ReasoningEffort::Low => "low".to_string(),
             ReasoningEffort::Medium => "medium".to_string(),
             ReasoningEffort::High => "high".to_string(),
diff --git a/src/server/claude_server.rs b/src/server/claude_server.rs
index c5023bb5..cf055a1d 100644
--- a/src/server/claude_server.rs
+++ b/src/server/claude_server.rs
@@ -1,13 +1,15 @@
 use super::{
-    build_messages_and_images, ChatMessage, ImageUrlContent, MessageContent, MessageContentType,
-    ServerData,
+    build_messages_and_images, ChatMessage, ImageUrlContent,
+    MessageContent, MessageContentType, ServerData,
 };
+use crate::utils::guidance_grammar::build_guided_decoding_grammar;
 use crate::core::engine::{LLMEngine, StreamItem};
 use crate::server::logger::ChatCompletionLogger;
 use crate::server::parser::{BufferedFinalizeResult, StreamResult, StreamToolParser};
 use crate::tools::helpers::{
     build_invalid_tool_call_feedback, build_tool_schema_map, filter_tool_calls,
-    retain_tool_calls_forced_name, strict_tool_call_validation_enabled,
+    retain_tool_calls_forced_name,
+    strict_tool_call_validation_enabled,
 };
 use crate::tools::{Tool, ToolCall, ToolChoice};
 use crate::utils::config::SamplingParams;
@@ -2142,12 +2144,48 @@ pub async fn messages(
     let parser_model_id =
         super::resolve_engine_model_id(&engine_config).unwrap_or_else(|| model_id.clone());
     let enforce_parser = engine_config.enforce_parser.clone();
+    let tool_parser_name = if let Some(ref enforced) = enforce_parser {
+        enforced.clone()
+    } else {
+        StreamToolParser::parser_name_for_model(&model_type, &parser_model_id).to_string()
+    };
 
     let img_cfg = {
         let e = data.engine.read();
         e.img_cfg.clone()
     };
 
+    {
+        let engine = data.engine.read();
+        let model_type = engine.model_type.clone();
+        let model_id = model_id.clone();
+        let chat_template = Some(engine.get_chat_template());
+
+        params.grammar = build_guided_decoding_grammar(
+            &engine.guidance_tokens,
+            &tool_config,
+            &resolved_tools,
+            &tool_parser_name,
+            None,
+            tool_choice_required,
+            forced_tool_name.clone(),
+            max_tokens,
+            None,
+            engine_config.enable_tool_grammar,
+            engine_config.allow_constraint_api,
+            &engine.tokenizer,
+            &model_type,
+            &model_id,
+            chat_template,
+            engine_config.disable_reasoning,
+        );
+
+        if let Some(ref grammar) = params.grammar {
+            let lark = crate::utils::guidance_grammar::get_lark_from_top_level_grammar(grammar);
+            crate::log_info!("[llg] Final Claude grammar:\n{}", lark);
+        }
+    }
+
     let (messages, image_data) = match build_messages_and_images(&chat_messages, img_cfg.as_ref()) {
         Ok(output) => output,
         Err(err) => {
diff --git a/src/server/mod.rs b/src/server/mod.rs
index 7b66b5f1..3e1ecd72 100644
--- a/src/server/mod.rs
+++ b/src/server/mod.rs
@@ -1,6 +1,5 @@
 // src/server/mod.rs
 use clap::Parser;
-use llguidance::api::TopLevelGrammar;
 use serde::{Deserialize, Serialize};
 pub mod claude_server;
 pub mod logger;
@@ -9,16 +8,14 @@ pub mod server;
 pub mod streaming;
 use crate::core::engine::LLMEngine;
 use crate::server::streaming::Streamer;
-use crate::tools::schema::{schema_to_tools, ToolGrammarBuilder};
 use crate::transfer::PdRole;
 use crate::utils::chat_template::Message;
-use crate::utils::config::{EngineConfig, SamplingParams};
-use crate::utils::guidance::{compose_grammars, GuidanceTokens, TopLevelGrammarExt};
+use crate::utils::config::{EngineConfig, ReasoningEffort, SamplingParams};
+use crate::utils::guidance::{GuidanceTokens};
 use crate::utils::image::{
     compute_tokens_per_image, get_tensor_raw_data, load_image_from_base64, load_image_from_url,
     ImageData, ImageProcessConfig, ImageProcessTrait, IMAGE_PLACEHOLDER,
 };
-use crate::utils::reasoning::ReasoningEffort;
 use axum::http::{self, StatusCode};
 use axum::response::{IntoResponse, Sse};
 use axum::routing::{get, post};
@@ -61,7 +58,7 @@ where
     }))
 }
 
-#[derive(Debug, Deserialize, Serialize)]
+#[derive(Debug, Deserialize, Serialize, Clone)]
 pub struct ChatCompletionRequest {
     pub messages: Vec<ChatMessage>,
     pub model: Option<String>,
@@ -111,6 +108,90 @@ pub struct ChatCompletionRequest {
     pub reasoning_effort: Option<String>,
 }
 
+#[derive(Debug, Deserialize)]
+pub struct GrammarRequest {
+    /// Messages for the conversation (without system prompt)
+    pub messages: Vec<ChatMessage>,
+    /// LLGuidance grammar definition; its format is named by `grammar_type` (Lark by default)
+    pub grammar: String,
+    /// Type of grammar: "lark", "json_schema", "regex", "choice"; defaults to "lark" when omitted
+    #[serde(default = "default_grammar_type")]
+    pub grammar_type: String,
+    /// Optional system prompt override (plain text, not Jinja2)
+    #[serde(default)]
+    pub system_prompt_override: Option<String>,
+    /// Whether to stream the response; `false` when the field is omitted
+    #[serde(default)]
+    pub stream: bool,
+    /// Maximum tokens to generate; `None` when the field is omitted
+    #[serde(default)]
+    pub max_tokens: Option<usize>,
+    /// Temperature for sampling
+    #[serde(default)]
+    pub temperature: Option<f32>,
+    /// Top-k for sampling
+    #[serde(default)]
+    pub top_k: Option<isize>,
+    /// Top-p for sampling
+    #[serde(default)]
+    pub top_p: Option<f32>,
+    /// Frequency penalty
+    #[serde(default)]
+    pub frequency_penalty: Option<f32>,
+    /// Presence penalty
+    #[serde(default)]
+    pub presence_penalty: Option<f32>,
+    /// Session ID for conversation persistence
+    #[serde(default)]
+    pub session_id: Option<String>,
+    /// Thinking toggle; the JSON key `enable_thinking` is accepted as an alias
+    #[serde(default, alias = "enable_thinking")]
+    pub thinking: Option<bool>,
+    /// Stop sequences; `stop_sequences` is accepted as an alias, parsed by `deserialize_stop_sequences`
+    #[serde(
+        default,
+        alias = "stop_sequences",
+        deserialize_with = "deserialize_stop_sequences"
+    )]
+    pub stop: Option<Vec<String>>,
+}
+
+fn default_grammar_type() -> String {
+    String::from("lark") // serde fallback for `GrammarRequest::grammar_type`
+}
+
+#[derive(Debug, Serialize)]
+pub struct GrammarResponse {
+    pub id: String,
+    pub object: String,
+    pub created: u64,
+    pub model: String,
+    pub choices: Vec<GrammarChoice>,
+    pub usage: Usage,
+    pub grammar_metadata: Option<GrammarMetadata>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct GrammarChoice {
+    pub index: usize,
+    pub message: GrammarMessage,
+    pub finish_reason: Option<String>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct GrammarMessage {
+    pub role: String,
+    pub content: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub grammar_metadata: Option<GrammarMetadata>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct GrammarMetadata {
+    pub grammar_type: String,
+    pub matched_cache: bool,
+}
+
 pub fn resolve_engine_model_id(econfig: &EngineConfig) -> Option<String> {
     if let Some(model_id) = &econfig.model_id {
         if !model_id.trim().is_empty() {
@@ -200,266 +281,55 @@ pub struct ExtraBody {
     pub extra: HashMap<String, serde_json::Value>,
 }
 
-// TopLevelGrammar conversion functions
-// Client grammars are composed alongside TEXT and optional reasoning grammars.
-
-pub fn grammar_fragment_from_structured_outputs(
-    structured: &StructuredOutputs,
-) -> Result<Option<llguidance::api::TopLevelGrammar>> {
-    let mut selected: Option<llguidance::api::TopLevelGrammar> = None;
-    let mut constraint_count = 0;
-
-    if let Some(choice) = &structured.choice {
-        if !choice.is_empty() {
-            constraint_count += 1;
-            if constraint_count > 1 {
-                crate::log_error!("[llg] Multiple constraints specified - structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag");
-                return Err(candle_core::Error::msg("structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag"));
-            }
-            let choice_gram = crate::tools::schema::build_choice_lark_grammar(choice)
-                .map_err(|e| candle_core::Error::msg(e))?;
-            selected = Some(choice_gram);
-        }
-    }
-
-    if let Some(regex) = &structured.regex {
-        constraint_count += 1;
-        if constraint_count > 1 {
-            crate::log_error!("[llg] Multiple constraints specified - structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag");
-            return Err(candle_core::Error::msg("structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag"));
-        }
-        let regex_gram = TopLevelGrammarExt::from_regex_ascii(regex);
-        selected = Some(regex_gram);
-    }
-
-    if let Some(schema) = &structured.json {
-        constraint_count += 1;
-        if constraint_count > 1 {
-            crate::log_error!("[llg] Multiple constraints specified - structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag");
-            return Err(candle_core::Error::msg("structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag"));
-        }
-        let schema = crate::tools::schema::sanitize_schema_for_llguidance(schema);
-        let json_gram = TopLevelGrammarExt::from_json_schema_utf8(schema)
-            .map_err(|e| candle_core::Error::msg(e.to_string()))?;
-        selected = Some(json_gram);
-    }
-
-    if let Some(grammar) = &structured.grammar {
-        constraint_count += 1;
-        if constraint_count > 1 {
-            crate::log_error!("[llg] Multiple constraints specified - structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag");
-            return Err(candle_core::Error::msg("structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag"));
+pub fn normalize_reasoning_controls(params: &mut SamplingParams, guidance_tokens: &GuidanceTokens) {
+    #[cfg(not(feature = "python"))]
+    {
+        let reasoning_enabled = params
+            .reasoning_effort
+            .as_ref()
+            .is_some_and(|effort| *effort != ReasoningEffort::None);
+        if !reasoning_enabled {
+            return;
         }
-        let lark_gram = TopLevelGrammarExt::from_lark_utf8(grammar);
-        selected = Some(lark_gram);
-    }
 
-    if let Some(tag) = &structured.structural_tag {
-        constraint_count += 1;
-        if constraint_count > 1 {
-            crate::log_error!("[llg] Multiple constraints specified - structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag");
-            return Err(candle_core::Error::msg("structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag"));
-        }
-        let (start, end, schema) = crate::tools::schema::parse_structural_tag(tag)
-            .map_err(|e| candle_core::Error::msg(e))?;
-        let schema = crate::tools::schema::sanitize_schema_for_llguidance(&schema);
-        // Convert schema Value to Vec<Tool> for build_json_tool_lark_grammar
-        let tools = schema_to_tools(&schema);
-        // structural_tag uses text-based matching, pass None for token IDs
-        let tool_gram = ToolGrammarBuilder::new()
-            .tools(&tools)
-            .start_tag(&start)
-            .end_tag(&end)
-            .start_is_special(false)
-            .end_is_special(false)
-            .build_json();
-        selected = Some(tool_gram);
-    }
-
-    if selected.is_none() {
-        crate::log_error!("[llg] No constraint specified in structured_outputs - must set exactly one of choice, regex, json, grammar, or structural_tag");
-        return Err(candle_core::Error::msg("structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag"));
-    }
-
-    Ok(selected)
-}
-
-pub fn grammar_fragment_from_response_format(
-    response_format: &ResponseFormat,
-) -> Result<Option<llguidance::api::TopLevelGrammar>> {
-    match response_format.format_type.as_str() {
-        "json_schema" => {
-            let Some(schema) = response_format.json_schema.as_ref() else {
-                crate::log_error!(
-                    "[llg] response_format.json_schema is required for type=json_schema"
-                );
-                return Err(candle_core::Error::msg(
-                    "response_format.json_schema is required",
-                ));
-            };
-            let schema = crate::tools::schema::sanitize_schema_for_llguidance(&schema.schema);
-            let json_gram = TopLevelGrammarExt::from_json_schema_utf8(schema)
-                .map_err(|e| candle_core::Error::msg(e.to_string()))?;
-            Ok(Some(json_gram))
-        }
-        "json_object" => {
-            let json_gram = TopLevelGrammarExt::from_json_schema_utf8(json!({
-                "type": "object"
-            }))
-            .map_err(|e| candle_core::Error::msg(e.to_string()))?;
-            Ok(Some(json_gram))
-        }
-        other => {
-            crate::log_error!(
-                "[llg] Unsupported response_format type '{}'; only 'json_schema' and 'json_object' are supported",
-                other
+        let has_reasoning_tokens = !guidance_tokens.reasoning_start_ids.is_empty()
+            && !guidance_tokens.reasoning_end_ids.is_empty();
+        if !has_reasoning_tokens {
+            crate::log_warn!(
+                "[llg] reasoning_effort requested but current model/tokenizer does not expose reasoning tokens; disabling reasoning grammar"
             );
-            Err(candle_core::Error::msg(format!(
-                "Unsupported response_format type '{}'; only 'json_schema' and 'json_object' are supported",
-                other
-            )))
+            params.reasoning_effort = None;
+            return;
         }
-    }
-}
 
-fn structured_outputs_kind(structured: &StructuredOutputs) -> &'static str {
-    if structured
-        .choice
-        .as_ref()
-        .is_some_and(|choice| !choice.is_empty())
-    {
-        "choice"
-    } else if structured.regex.is_some() {
-        "regex"
-    } else if structured.json.is_some() {
-        "json"
-    } else if structured.grammar.is_some() {
-        "grammar"
-    } else if structured.structural_tag.is_some() {
-        "structural_tag"
-    } else {
-        "unknown"
+        params.thinking = Some(true);
     }
-}
 
-pub fn collect_openai_constraint_grammar(
-    request: &ChatCompletionRequest,
-) -> Result<Option<TopLevelGrammar>> {
-    let mut selected: Option<TopLevelGrammar> = None;
-
-    let mut try_set = |grammar: TopLevelGrammar, source: &str, kind: &str| -> Result<()> {
-        if selected.is_some() {
-            return Err(candle_core::Error::msg(
-                "only one of structured_outputs, response_format, or constraint may be set",
-            ));
-        }
-        selected = Some(grammar);
-        crate::log_info!(
-            "[llg] Request constraint selected: source={} type={}",
-            source,
-            kind
-        );
-        Ok(())
-    };
-
-    if let Some(structured) = request.structured_outputs.as_ref().or_else(|| {
-        request
-            .extra_body
+    #[cfg(feature = "python")]
+    {
+        // Python builds: same check as the non-Python branch, but expressed via
+        // `is_enabled()` instead of comparing against `ReasoningEffort::None`
+        let reasoning_enabled = params
+            .reasoning_effort
             .as_ref()
-            .and_then(|body| body.structured_outputs.as_ref())
-    }) {
-        if let Some(grammar) = grammar_fragment_from_structured_outputs(structured)? {
-            try_set(
-                grammar,
-                "structured_outputs",
-                structured_outputs_kind(structured),
-            )?;
+            .map(|effort| effort.is_enabled())
+            .unwrap_or(false);
+        if !reasoning_enabled {
+            return;
         }
-    }
 
-    if let Some(response_format) = request.response_format.as_ref() {
-        if let Some(grammar) = grammar_fragment_from_response_format(response_format)? {
-            try_set(
-                grammar,
-                "response_format",
-                response_format.format_type.as_str(),
-            )?;
+        let has_reasoning_tokens = !guidance_tokens.reasoning_start_ids.is_empty()
+            && !guidance_tokens.reasoning_end_ids.is_empty();
+        if !has_reasoning_tokens {
+            crate::log_warn!(
+                "[llg] reasoning_effort requested but current model/tokenizer does not expose reasoning tokens; disabling reasoning grammar"
+            );
+            params.reasoning_effort = None;
+            return;
         }
-    }
-
-    if let Some(grammar_str) = request.constraint.as_ref() {
-        let constraint_type = request.constraint_type.as_deref().unwrap_or("regex");
-        let grammar = match constraint_type {
-            "regex" => TopLevelGrammarExt::from_regex_ascii(grammar_str),
-            "lark" => TopLevelGrammarExt::from_lark_utf8(grammar_str),
-            "json_schema" | "json" => {
-                let value: serde_json::Value =
-                    serde_json::from_str(grammar_str).map_err(candle_core::Error::wrap)?;
-                let value = crate::tools::schema::sanitize_schema_for_llguidance(&value);
-                TopLevelGrammarExt::from_json_schema_utf8(value)
-                    .map_err(candle_core::Error::wrap)?
-            }
-            other => {
-                return Err(candle_core::Error::msg(format!(
-                    "unknown constraint_type '{}'",
-                    other
-                )));
-            }
-        };
-        try_set(grammar, "constraint", constraint_type)?;
-    }
-
-    Ok(selected)
-}
-
-pub fn build_guided_decoding_grammar(
-    guidance_tokens: &GuidanceTokens,
-    constraint_grammar: Option<TopLevelGrammar>,
-    max_tokens: usize,
-    reasoning_effort: Option<ReasoningEffort>,
-) -> Option<TopLevelGrammar> {
-    if constraint_grammar.is_none() {
-        return None;
-    }
-
-    crate::log_info!(
-        "[llg] Guided decoding enabled: constraint={} max_tokens={} reasoning={}",
-        true,
-        max_tokens,
-        reasoning_effort
-            .as_ref()
-            .map(|effort| format!("{effort:?}"))
-            .unwrap_or_else(|| "none".to_string())
-    );
-
-    Some(compose_grammars(
-        constraint_grammar.into_iter().collect(),
-        Some(max_tokens),
-        guidance_tokens,
-        reasoning_effort,
-    ))
-}
-
-pub fn normalize_reasoning_controls(params: &mut SamplingParams, guidance_tokens: &GuidanceTokens) {
-    let reasoning_enabled = params
-        .reasoning_effort
-        .as_ref()
-        .is_some_and(|effort| *effort != ReasoningEffort::None);
-    if !reasoning_enabled {
-        return;
-    }
 
-    let has_reasoning_tokens = !guidance_tokens.reasoning_start_ids.is_empty()
-        && !guidance_tokens.reasoning_end_ids.is_empty();
-    if !has_reasoning_tokens {
-        crate::log_warn!(
-            "[llg] reasoning_effort requested but current model/tokenizer does not expose reasoning tokens; disabling reasoning grammar"
-        );
-        params.reasoning_effort = None;
-        return;
+        params.thinking = Some(true);
     }
-
-    params.thinking = Some(true);
 }
 
 #[derive(Serialize, Deserialize, Debug, Clone)]
@@ -1007,6 +877,14 @@ pub struct Args {
     /// Metal uses half of this value after rounding.
     #[arg(long, default_value_t = crate::utils::config::DEFAULT_PREFILL_CHUNK_SIZE)]
     pub prefill_chunk_size: usize,
+
+    /// Allow client-submitted constraints via HTTP API
+    #[arg(long, default_value = "false")]
+    pub allow_constraint_api: bool,
+
+    /// Whether to automatically build LLG grammar from tools
+    #[arg(long, default_value = "false")]
+    pub enable_tool_grammar: bool,
 }
 
 /// Result of executing tool calls via MCP
@@ -1451,6 +1329,7 @@ pub async fn run_server(
             }),
         )
         .route("/v1/chat/completions", post(server::chat_completion))
+        .route("/v1/grammar", post(server::grammar_completion))
         .route("/v1/messages", post(claude_server::messages))
         .route(
             "/v1/messages/count_tokens",
@@ -1705,269 +1584,18 @@ mod tests {
     }
 
     #[test]
-    fn test_collect_openai_constraint_grammar_rejects_multiple_sources() {
-        let request: ChatCompletionRequest = serde_json::from_str(
-            r#"{
-                "messages":[{"role":"user","content":"hi"}],
-                "structured_outputs":{"choice":["a"]},
-                "response_format":{"type":"json_schema","json_schema":{"schema":{"type":"object"}}}
-            }"#,
-        )
-        .unwrap();
-        let result = collect_openai_constraint_grammar(&request);
-        assert!(result.is_err());
-    }
-
-    #[test]
-    fn test_grammar_fragment_from_structured_outputs_choice() {
-        let so = StructuredOutputs {
-            choice: Some(vec!["option1".to_string(), "option2".to_string()]),
-            regex: None,
-            json: None,
-            grammar: None,
-            structural_tag: None,
-        };
-        let result = grammar_fragment_from_structured_outputs(&so);
-        assert!(result.is_ok());
-        assert!(result.unwrap().is_some());
-    }
-
-    #[test]
-    fn test_grammar_fragment_from_structured_outputs_json() {
-        let so = StructuredOutputs {
-            choice: None,
-            regex: None,
-            json: Some(serde_json::json!({"type": "object", "properties": {}})),
-            grammar: None,
-            structural_tag: None,
-        };
-        let result = grammar_fragment_from_structured_outputs(&so);
-        assert!(result.is_ok());
-        assert!(result.unwrap().is_some());
-    }
-
-    #[test]
-    fn test_grammar_fragment_from_structured_outputs_regex() {
-        let so = StructuredOutputs {
-            choice: None,
-            regex: Some("^[a-z]+$".to_string()),
-            json: None,
-            grammar: None,
-            structural_tag: None,
-        };
-        let result = grammar_fragment_from_structured_outputs(&so);
-        assert!(result.is_ok());
-        assert!(result.unwrap().is_some());
-    }
-
-    #[test]
-    fn test_grammar_fragment_from_structured_outputs_grammar() {
-        let so = StructuredOutputs {
-            choice: None,
-            regex: None,
-            json: None,
-            // Grammar without start: - that's managed by ComposedGrammar
-            grammar: Some("'hello' 'world'".to_string()),
-            structural_tag: None,
-        };
-        let result = grammar_fragment_from_structured_outputs(&so);
-        assert!(result.is_ok());
-        assert!(result.unwrap().is_some());
-    }
-
-    #[test]
-    fn test_grammar_fragment_from_structured_outputs_empty() {
-        let so = StructuredOutputs {
-            choice: None,
-            regex: None,
-            json: None,
-            grammar: None,
-            structural_tag: None,
-        };
-        let result = grammar_fragment_from_structured_outputs(&so);
-        assert!(result.is_err());
-    }
-
-    #[test]
-    fn test_grammar_fragment_from_structured_outputs_too_many() {
-        let so = StructuredOutputs {
-            choice: Some(vec!["a".to_string()]),
-            regex: Some("b".to_string()),
-            json: None,
-            grammar: None,
-            structural_tag: None,
-        };
-        let result = grammar_fragment_from_structured_outputs(&so);
-        assert!(result.is_err());
-    }
-
-    #[test]
-    fn test_grammar_fragment_from_response_format_json_schema() {
-        let rf = ResponseFormat {
-            format_type: "json_schema".to_string(),
-            json_schema: Some(ResponseFormatJsonSchema {
-                name: None,
-                schema: serde_json::json!({"type": "object", "properties": {}}),
-            }),
-        };
-        let result = grammar_fragment_from_response_format(&rf);
-        assert!(result.is_ok());
-        assert!(result.unwrap().is_some());
-    }
-
-    #[test]
-    fn test_grammar_fragment_from_response_format_json_object() {
-        let rf = ResponseFormat {
-            format_type: "json_object".to_string(),
-            json_schema: None,
-        };
-        let result = grammar_fragment_from_response_format(&rf);
-        assert!(result.is_ok());
-        assert!(result.unwrap().is_some());
-    }
-
-    #[test]
-    fn test_grammar_fragment_from_response_format_missing_json_schema() {
-        let rf = ResponseFormat {
-            format_type: "json_schema".to_string(),
-            json_schema: None,
-        };
-        let result = grammar_fragment_from_response_format(&rf);
-        assert!(result.is_err());
-    }
-
-    #[test]
-    fn test_grammar_fragment_from_response_format_unsupported_type() {
-        let rf = ResponseFormat {
-            format_type: "unsupported".to_string(),
-            json_schema: None,
-        };
-        let result = grammar_fragment_from_response_format(&rf);
-        assert!(result.is_err());
-    }
-
-    #[test]
-    fn test_grammar_fragment_from_response_format_json_schema_composed() {
-        // Test that json_schema grammars pass through ComposedGrammar
-        let rf = ResponseFormat {
-            format_type: "json_schema".to_string(),
-            json_schema: Some(ResponseFormatJsonSchema {
-                name: None,
-                schema: serde_json::json!({"type": "object", "properties": {"test": {"type": "string"}}}),
-            }),
-        };
-        let result = grammar_fragment_from_response_format(&rf);
-        assert!(result.is_ok());
-        // The grammar was created via ComposedGrammar - just verify it's Some
-        let grammar = result.unwrap();
-        assert!(grammar.is_some());
-    }
-
-    #[test]
-    fn test_build_guided_decoding_grammar_reasoning_only() {
-        let guidance_tokens = GuidanceTokens {
-            eos_token_ids: vec![2],
-            reasoning_start_ids: vec![101],
-            reasoning_end_ids: vec![102],
-        };
-
-        let grammar =
-            build_guided_decoding_grammar(&guidance_tokens, None, 64, Some(ReasoningEffort::Low));
-
-        assert!(
-            grammar.is_none(),
-            "reasoning-only requests must not build guided decoding without a constraint"
-        );
-    }
-
-    #[test]
-    fn test_build_guided_decoding_grammar_reasoning_with_choice_constraint() {
-        let guidance_tokens = GuidanceTokens {
-            eos_token_ids: vec![2],
-            reasoning_start_ids: vec![101],
-            reasoning_end_ids: vec![102],
-        };
-        let constraint =
-            TopLevelGrammar::from_lark_utf8(r#"start: "positive" | "negative" | "neutral""#);
-
-        let grammar = build_guided_decoding_grammar(
-            &guidance_tokens,
-            Some(constraint),
-            64,
-            Some(ReasoningEffort::Low),
-        )
-        .expect("reasoning + choice guided grammar should be built");
-
-        let lark = crate::utils::guidance::get_lark_from_top_level_grammar(&grammar);
-        assert!(
-            lark.contains("start: reasoning_block"),
-            "start rule should sequence reasoning first: {lark}"
-        );
-        assert!(
-            lark.contains(r#""positive" | "negative" | "neutral""#),
-            "choice constraint should remain grouped: {lark}"
-        );
-        assert!(
-            lark.contains("\nreasoning_block:"),
-            "reasoning rule definition should remain on its own line: {lark}"
-        );
-        assert!(
-            lark.contains("\nthinkgram:"),
-            "reasoning helper rules should remain intact: {lark}"
-        );
-    }
-
-    #[test]
-    fn test_build_guided_decoding_grammar_json_schema_constraint_with_reasoning() {
-        let guidance_tokens = GuidanceTokens {
-            eos_token_ids: vec![2],
-            reasoning_start_ids: vec![101],
-            reasoning_end_ids: vec![102],
-        };
-        let constraint = TopLevelGrammar::from_json_schema(serde_json::json!({
-            "type": "object",
-            "properties": {
-                "label": {"type": "string"}
-            },
-            "required": ["label"],
-            "additionalProperties": false
-        }));
-
-        let grammar = build_guided_decoding_grammar(
-            &guidance_tokens,
-            Some(constraint),
-            64,
-            Some(ReasoningEffort::Low),
-        )
-        .expect("reasoning + json-schema guided grammar should be built");
-
-        let lark = crate::utils::guidance::get_lark_from_top_level_grammar(&grammar);
-        assert!(
-            lark.contains("@reasoning @inner"),
-            "wrapper should reference reasoning and inner subgrammars: {lark}"
-        );
-        assert!(
-            !lark.contains("none have lark_grammar"),
-            "wrapper must not stringify non-lark grammars: {lark}"
-        );
-        assert!(
-            grammar
-                .grammars
-                .iter()
-                .any(|g| g.name.as_deref() == Some("inner") && g.json_schema.is_some()),
-            "json-schema constraint should be preserved as nested grammar"
-        );
-    }
-
-    #[test]
+    #[cfg(not(feature = "python"))]
     fn test_normalize_reasoning_controls_enables_thinking() {
         let mut params = SamplingParams::new_with_max_tokens(32);
         params.thinking = Some(false);
         params.reasoning_effort = Some(ReasoningEffort::Low);
         let guidance_tokens = GuidanceTokens {
+            bos_token_ids: Vec::new(),
             eos_token_ids: vec![2],
             reasoning_start_ids: vec![101],
             reasoning_end_ids: vec![102],
+            tool_call_start_ids: Vec::new(),
+            tool_call_end_ids: Vec::new(),
         };
 
         normalize_reasoning_controls(&mut params, &guidance_tokens);
@@ -1977,14 +1605,18 @@ mod tests {
     }
 
     #[test]
+    #[cfg(not(feature = "python"))]
     fn test_normalize_reasoning_controls_disables_unsupported_reasoning() {
         let mut params = SamplingParams::new_with_max_tokens(32);
         params.thinking = Some(false);
         params.reasoning_effort = Some(ReasoningEffort::High);
         let guidance_tokens = GuidanceTokens {
+            bos_token_ids: Vec::new(),
             eos_token_ids: vec![2],
             reasoning_start_ids: Vec::new(),
             reasoning_end_ids: Vec::new(),
+            tool_call_start_ids: Vec::new(),
+            tool_call_end_ids: Vec::new(),
         };
 
         normalize_reasoning_controls(&mut params, &guidance_tokens);
@@ -2064,3 +1696,5 @@ mod tests {
         );
     }
 }
+
+
diff --git a/src/server/parser.rs b/src/server/parser.rs
index 147e2218..aa059a05 100644
--- a/src/server/parser.rs
+++ b/src/server/parser.rs
@@ -503,6 +503,15 @@ impl ToolConfig {
         }
     }
 
+    /// Build a tool config for `model_type` and reconcile it with `tokenizer`.
+    /// Starts from `Self::for_model_type`, then runs `validate_with_tokenizer` on the
+    /// result (with the same tokenizer and model type) before handing it back.
+    pub fn from_tokenizer(tokenizer: &Tokenizer, model_type: &ModelType) -> Self {
+        let mut tool_config = Self::for_model_type(model_type);
+        tool_config.validate_with_tokenizer(tokenizer, model_type);
+        tool_config
+    }
+
     /// Resolve tool call end token IDs using tokenizer and the validated config.
     pub fn tool_call_end_ids(&self, tokenizer: &Tokenizer) -> Vec<u32> {
         let mut tool_call_end_ids: Vec<u32> = Vec::new();
diff --git a/src/server/server.rs b/src/server/server.rs
index 3024d91b..92c863fe 100644
--- a/src/server/server.rs
+++ b/src/server/server.rs
@@ -1,8 +1,8 @@
 // src/server/server.rs
 use super::logger::ChatCompletionLogger;
+use super::GrammarRequest;
 use super::{
-    build_guided_decoding_grammar, build_messages_and_images, collect_openai_constraint_grammar,
-    normalize_reasoning_controls,
+    build_messages_and_images, normalize_reasoning_controls,
     streaming::{ChatResponse, Streamer, StreamingStatus},
     ChatResponder, DetokenizeRequest, DetokenizeResponse, EmbeddingRequest, EmbeddingResponse,
     EncodingFormat, TokenizeInput, TokenizeRequest, TokenizeResponse,
@@ -20,12 +20,10 @@ use crate::tools::helpers::{
     resolve_tools, retain_tool_calls_forced_name, strict_tool_call_validation_enabled,
 };
 use crate::tools::{ToolChoice, ToolChoiceMode};
-use crate::utils::config::SamplingParams;
-use crate::utils::guidance::ReasoningEffort;
-use axum::{
-    extract::{Json, Query, State},
-    response::{sse::KeepAlive, Sse},
-};
+use crate::utils::config::{ReasoningEffort, SamplingParams};
+use crate::utils::guidance_grammar::{build_grammar_from_request, GrammarRequestDispatcher};
+use axum::extract::{Json, Query, State};
+use axum::response::{sse::KeepAlive, Sse};
 use base64::Engine;
 use std::collections::HashSet;
 use std::env;
@@ -376,20 +374,47 @@ pub async fn chat_completion(
         .max_tokens
         .unwrap_or(data.econfig.max_tokens.unwrap_or(16384));
 
+    // Get generation config from engine config for fallback
+    let generation_cfg = data.econfig.generation_cfg.as_ref();
+
     let mut params = SamplingParams::new_with_max_tokens(max_tokens);
-    params.temperature = request.temperature;
-    params.top_k = request.top_k;
-    params.top_p = request.top_p;
-    params.frequency_penalty = request.frequency_penalty;
-    params.presence_penalty = request.presence_penalty;
+    // Apply request values with fallback to generation config
+    params.temperature = request.temperature.or(generation_cfg.and_then(|gc| gc.temperature));
+    params.top_k = request.top_k.or(generation_cfg.and_then(|gc| gc.top_k));
+    params.top_p = request.top_p.or(generation_cfg.and_then(|gc| gc.top_p));
+    params.frequency_penalty = request.frequency_penalty.or(generation_cfg.and_then(|gc| gc.frequency_penalty));
+    params.presence_penalty = request.presence_penalty.or(generation_cfg.and_then(|gc| gc.presence_penalty));
+    // Set stop_token_ids from engine eos_token_id only (no request override)
+    params.stop_token_ids = generation_cfg
+        .and_then(|gc| gc.eos_token_id.clone())
+        .map(|eos| vec![eos.to_vec()]);
     params.session_id = request.session_id.clone();
     params.thinking = request.thinking.clone();
     params.stop_sequences = request.stop.clone();
-    params.reasoning_effort = request
-        .reasoning_effort
-        .clone()
-        .map(ReasoningEffort::from_str);
-    let (img_cfg, model_type, tool_config, engine_config, guidance_tokens) = {
+    #[cfg(not(feature = "python"))]
+    {
+        let effort_str = request
+            .reasoning_effort
+            .clone()
+            .unwrap_or_else(|| "none".to_string());
+        params.reasoning_effort = if effort_str != "none" {
+            Some(ReasoningEffort::from_str(effort_str))
+        } else {
+            None
+        };
+    }
+    #[cfg(feature = "python")]
+    {
+        // NOTE(review): this conversion matches the non-Python branch above
+        // (Option<String> -> Option<ReasoningEffort>; "none" or a missing value -> None)
+        let effort_str = request.reasoning_effort.clone().unwrap_or_else(|| "none".to_string());
+        params.reasoning_effort = if effort_str != "none" {
+            Some(ReasoningEffort::from_str(effort_str))
+        } else {
+            None
+        };
+    }
+    let (img_cfg, model_type, tool_config, engine_config, guidance_tokens, tokenizer) = {
         let e = data.engine.read();
         (
             e.img_cfg.clone(),
@@ -397,13 +422,49 @@ pub async fn chat_completion(
             e.tool_config.clone(),
             e.econfig.clone(),
             e.guidance_tokens.clone(),
+            e.tokenizer.clone(),
         )
     };
-    let constraint_grammar = match collect_openai_constraint_grammar(&request) {
-        Ok(grammar) => grammar,
-        Err(err) => return ChatResponder::ValidationError(err.to_string()),
+
+    // Generate complete grammar from request using unified single-call function
+    // This handles all permutations: tools, structured_outputs, response_format, constraint
+    let enforce_parser = engine_config.enforce_parser.clone();
+    let tool_parser_name = if let Some(ref enforced) = enforce_parser {
+        enforced.clone()
+    } else {
+        let parser_model_id =
+            super::resolve_engine_model_id(&engine_config).unwrap_or_else(|| model_id.clone());
+        StreamToolParser::parser_name_for_model(&model_type, &parser_model_id).to_string()
+    };
+
+    // Get the chat template from the engine
+    let chat_template = {
+        let e = data.engine.read();
+        e.get_chat_template()
+    };
+
+    let mut grammar_request = request.clone();
+    if grammar_request.max_tokens.is_none() {
+        grammar_request.max_tokens = Some(max_tokens);
+    }
+
+    // Use new GrammarRequestDispatcher for grammar composition
+    let grammar = {
+        let dispatcher = GrammarRequestDispatcher::new(
+            &grammar_request,
+            &guidance_tokens,
+            &tool_config,
+            engine_config.enable_tool_grammar,
+            engine_config.allow_constraint_api,
+            tool_parser_name,
+            &tokenizer,
+            Some(chat_template),
+            engine_config.disable_reasoning,
+        );
+        dispatcher.build_grammar()
     };
-    if constraint_grammar.is_some() {
+
+    if grammar.is_some() {
         normalize_reasoning_controls(&mut params, &guidance_tokens);
     }
 
@@ -487,14 +548,9 @@ pub async fn chat_completion(
         .unwrap()
         .as_millis() as u64;
 
-    {
-        let engine = data.engine.read();
-        params.grammar = build_guided_decoding_grammar(
-            &engine.guidance_tokens,
-            constraint_grammar,
-            max_tokens,
-            params.reasoning_effort.clone(),
-        );
+    // Set grammar from unified generation
+    if let Some(g) = grammar {
+        params.grammar = Some(g);
     }
 
     if use_stream {
@@ -1869,3 +1925,49 @@ mod tests {
         assert!(validate_openai_tool_messages(&messages).is_ok());
     }
 }
+
+#[utoipa::path(
+    post,
+    tag = "vllm-rs",
+    path = "/v1/grammar",
+    request_body = GrammarRequest,
+    responses((status = 200, description = "Grammar-based completion"))
+)]
+pub async fn grammar_completion(
+    State(data): State<Arc<ServerData>>,
+    request: Json<GrammarRequest>,
+) -> ChatResponder {
+    // Handler for POST /v1/grammar. Still a stub: it validates the request and builds
+    // the sampling params, but every path currently ends in a `ValidationError`.
+
+    // Only allow if constraint API is enabled via CLI
+    if !data.econfig.allow_constraint_api {
+        return ChatResponder::ValidationError(
+            "Grammar endpoint requires allow_constraint_api CLI flag".to_string(),
+        );
+    }
+
+    // Parse the grammar first so a malformed one is reported with the parser's message.
+    let grammar = match build_grammar_from_request(&request.grammar_type, &request.grammar) {
+        Ok(g) => g,
+        Err(e) => return ChatResponder::ValidationError(e.to_string()),
+    };
+
+    // Token budget: the request's value, else the engine config's, else 16384.
+    let max_tokens = request.max_tokens.or(data.econfig.max_tokens).unwrap_or(16384);
+    let mut params = SamplingParams::new_with_max_tokens(max_tokens);
+    params.temperature = request.temperature;
+    params.top_k = request.top_k;
+    params.top_p = request.top_p;
+    params.frequency_penalty = request.frequency_penalty;
+    params.presence_penalty = request.presence_penalty;
+    params.session_id = request.session_id.clone();
+    params.thinking = request.thinking;
+    params.stop_sequences = request.stop.clone();
+    params.grammar = Some(grammar);
+
+    // TODO: hand `params` to the generation path (streaming or not). Nothing consumes
+    // them yet, so the endpoint always reports that it is unimplemented, even for
+    // requests that passed every check above.
+    ChatResponder::ValidationError("Grammar endpoint not fully implemented".to_string())
+}
diff --git a/src/tools/schema.rs b/src/tools/schema.rs
index c951b217..ee83a25a 100644
--- a/src/tools/schema.rs
+++ b/src/tools/schema.rs
@@ -1,355 +1,11 @@
 // src/tools/schema.rs
 //! JSON Schema utilities for tool parameters
 //!
 //! Provides helpers for working with JSON Schema in tool definitions.
 
 use crate::tools::Tool;
-use crate::utils::guidance::{GrammarError, GrammarResult, TopLevelGrammarExt};
-use llguidance::api::TopLevelGrammar;
-use serde_json::{json, Map, Value};
-use std::collections::{HashMap, HashSet};
-
-/// Remove JSON Schema features that llguidance doesn't support.
-/// Currently strips all "format" fields recursively.
-fn sanitize_schema_for_llguidance_recursive(schema: &Value) -> Value {
-    match schema {
-        Value::Object(map) => {
-            let mut out = Map::new();
-            for (key, value) in map {
-                if key == "format" {
-                    continue;
-                }
-                out.insert(key.clone(), sanitize_schema_for_llguidance_recursive(value));
-            }
-            Value::Object(out)
-        }
-        Value::Array(items) => Value::Array(
-            items
-                .iter()
-                .map(sanitize_schema_for_llguidance_recursive)
-                .collect(),
-        ),
-        _ => schema.clone(),
-    }
-}
-
-/// Remove JSON Schema features that llguidance doesn't support.
-/// Currently strips all "format" fields recursively.
-pub fn sanitize_schema_for_llguidance(schema: &Value) -> Value {
-    sanitize_schema_for_llguidance_recursive(schema)
-}
-
-/// Lark grammar helper functions for llguidance constraint building
-fn lark_quote(value: &str) -> String {
-    serde_json::to_string(value).unwrap_or_else(|_| "\"\"".to_string())
-}
-
-/// Convert token IDs to Lark special token syntax <[token_id]>
-/// This is used when the tokenizer has canonical tokenization for the tag
-fn lark_special_token(token_ids: &HashSet<u32>) -> String {
-    if token_ids.is_empty() {
-        return String::new();
-    }
-    // Join multiple token IDs with |
-    let ids: Vec<String> = token_ids.iter().map(|id| format!("[{}]", id)).collect();
-    format!("<{}>", ids.join(","))
-}
-
-fn _lark_literal(value: &str, is_special: bool) -> String {
-    if is_special && value.starts_with('<') && value.ends_with('>') {
-        // Only allow ASCII special tags
-        let sanitized: String = value.chars().filter(|c| c.is_ascii()).collect();
-        sanitized
-    } else {
-        lark_quote(value)
-    }
-}
-
-/// Builder for constructing tool call grammars
-pub struct ToolGrammarBuilder {
-    tools: Vec<Tool>,
-    start_tag: String,
-    end_tag: String,
-    start_is_special: bool,
-    end_is_special: bool,
-    start_token_ids: Option<HashSet<u32>>,
-    end_token_ids: Option<HashSet<u32>>,
-}
-
-impl ToolGrammarBuilder {
-    pub fn new() -> Self {
-        Self {
-            tools: Vec::new(),
-            start_tag: String::new(),
-            end_tag: String::new(),
-            start_is_special: false,
-            end_is_special: false,
-            start_token_ids: None,
-            end_token_ids: None,
-        }
-    }
-
-    pub fn tools(mut self, tools: &[Tool]) -> Self {
-        self.tools.extend(tools.iter().cloned());
-        self
-    }
-
-    pub fn start_tag(mut self, tag: impl Into<String>) -> Self {
-        self.start_tag = tag.into();
-        self
-    }
-
-    pub fn end_tag(mut self, tag: impl Into<String>) -> Self {
-        self.end_tag = tag.into();
-        self
-    }
-
-    pub fn start_is_special(mut self, special: bool) -> Self {
-        self.start_is_special = special;
-        self
-    }
-
-    pub fn end_is_special(mut self, special: bool) -> Self {
-        self.end_is_special = special;
-        self
-    }
-
-    pub fn start_token_ids(mut self, ids: Option<HashSet<u32>>) -> Self {
-        self.start_token_ids = ids;
-        self
-    }
-
-    pub fn end_token_ids(mut self, ids: Option<HashSet<u32>>) -> Self {
-        self.end_token_ids = ids;
-        self
-    }
-
-    /// Build Lark expression for JSON tool schema content
-    pub fn build_json(self) -> TopLevelGrammar {
-        let start_tag = self.get_tag_or_token_id(
-            &self.start_tag,
-            &self.start_token_ids,
-            self.start_is_special,
-        );
-        let end_tag =
-            self.get_tag_or_token_id(&self.end_tag, &self.end_token_ids, self.end_is_special);
-        let payload_schema = if self.tools.is_empty() {
-            json!({ "type": "object" })
-        } else {
-            let variants: Vec<Value> = self
-                .tools
-                .iter()
-                .map(|tool| {
-                    let arguments_schema =
-                        sanitize_schema_for_llguidance(&tool.function.parameters);
-                    json!({
-                        "type": "object",
-                        "properties": {
-                            "name": {
-                                "type": "string",
-                                "enum": [tool.function.name.clone()],
-                            },
-                            "arguments": arguments_schema,
-                        },
-                        "required": ["name", "arguments"],
-                        "additionalProperties": false,
-                    })
-                })
-                .collect();
-            if variants.len() == 1 {
-                variants[0].clone()
-            } else {
-                json!({ "oneOf": variants })
-            }
-        };
-
-        let payload_schema = serde_json::to_string(&payload_schema).unwrap_or_default();
-        let lark = format!(
-            "start: tool_call\ntool_call: {start_tag} tool_payload {end_tag}\ntool_payload: %json {payload_schema}\n"
-        );
-        TopLevelGrammar::from_lark_utf8(&lark)
-    }
-
-    /// Build Lark expression for valid XML parameter content
-    fn build_xml_value_expression(schema: &serde_json::Value) -> String {
-        let param_type = schema
-            .get("type")
-            .and_then(|t| t.as_str())
-            .unwrap_or("string");
-
-        match param_type {
-            "string" => {
-                if let Ok(val) = std::env::var("XINFER_LLG_DEFAULT_XML_STR") {
-                    format!("{}", val)
-                } else {
-                    r#"/[ -~]*?/"#.to_string()
-                }
-            }
-            "integer" => r"/-?[0-9]+/".to_string(),
-            "number" => r"/-?[0-9]+(\.[0-9]+)?/".to_string(),
-            "boolean" => r"/^(true|false)$/".to_string(),
-            "array" => r"/\[[^\]]*\]/".to_string(),
-            "object" => r"/\{[^\}]*\}/".to_string(),
-            _ => r"/[ -~]*?/".to_string(),
-        }
-    }
-
-    /// Build Lark expression for XML tool schema content
-    /// Uses structured tag parsing with ASCII-restricted value patterns
-    pub fn build_xml(self) -> TopLevelGrammar {
-        let mut rules: Vec<String> = Vec::new();
-
-        // Build envelope tag using token IDs when available
-        let envelope_start_tag = self.get_envelope_tag(
-            &self.start_tag,
-            &self.start_token_ids,
-            self.start_is_special,
-        );
-        let envelope_end_tag =
-            self.get_envelope_tag(&self.end_tag, &self.end_token_ids, self.end_is_special);
-
-        let tool_rule_names: Vec<String> =
-            (0..self.tools.len()).map(|i| format!("tool_{i}")).collect();
-
-        // GUARD 1: Use string literals for XML inner structure (function/parameter tags)
-        // The stream parser detects these using text matching in the buffer
-        rules.push("start: tool_call".to_string());
-        rules.push(format!(
-            "tool_call: {} tool_content {}",
-            envelope_start_tag, envelope_end_tag
-        ));
-
-        for (tool_idx, tool) in self.tools.iter().enumerate() {
-            let tool_name_ascii: String = tool
-                .function
-                .name
-                .chars()
-                .filter(|c| c.is_ascii())
-                .collect();
-            let func_start = lark_quote(&format!("<function={}>", tool_name_ascii));
-            let func_end = lark_quote("</function>");
-            let params_schema = &tool.function.parameters;
-            let props = params_schema.get("properties").and_then(|p| p.as_object());
-            let required_params: Vec<String> = params_schema
-                .get("required")
-                .and_then(|r| r.as_array())
-                .map(|arr| {
-                    arr.iter()
-                        .filter_map(|v| v.as_str().map(|s| s.to_string()))
-                        .collect()
-                })
-                .unwrap_or_default();
-
-            if let Some(props) = props {
-                let mut param_rules_vec: Vec<String> = Vec::new();
-
-                for (param_idx, (param_name, schema)) in props.iter().enumerate() {
-                    let param_name_ascii: String =
-                        param_name.chars().filter(|c| c.is_ascii()).collect();
-                    let param_tag = lark_quote(&format!("<parameter={}>", param_name_ascii));
-                    let param_end = lark_quote("</parameter>");
-                    let value_rule = format!("value_{tool_idx}_{param_idx}");
-                    let param_rule = format!("param_{tool_idx}_{param_idx}");
-
-                    let value_expr = Self::build_xml_value_expression(schema);
-                    rules.push(format!("{value_rule}: {value_expr}"));
-                    rules.push(format!(
-                        "{param_rule}: {param_tag} {value_rule} {param_end}"
-                    ));
-
-                    if required_params.contains(param_name) {
-                        param_rules_vec.push(param_rule);
-                    } else {
-                        param_rules_vec.push(format!("({param_rule})?"));
-                    }
-                }
-
-                let params_expr = param_rules_vec.join(" ");
-                rules.push(format!(
-                    "tool_{tool_idx}: {func_start} {params_expr} {func_end}"
-                ));
-            } else {
-                rules.push(format!("tool_{tool_idx}: {func_start} {func_end}"));
-            }
-        }
-
-        // Build tool_content with alternation of all tools
-        let tool_variants = tool_rule_names.join(" | ");
-        rules.push(format!("tool_content: {tool_variants}"));
-
-        let lark = rules.join("\n") + "\n";
-        TopLevelGrammar::from_lark_utf8(&lark)
-    }
-
-    /// Get envelope tag (start/end) using token IDs when available, falling back to string literals
-    fn get_envelope_tag(
-        &self,
-        tag: &str,
-        token_ids: &Option<HashSet<u32>>,
-        is_special: bool,
-    ) -> String {
-        if let Some(ids) = token_ids {
-            if !ids.is_empty() {
-                return lark_special_token(ids);
-            }
-        }
-
-        if is_special && tag.starts_with('<') && tag.ends_with('>') {
-            // Only allow ASCII special tags
-            let sanitized: String = tag.chars().filter(|c| c.is_ascii()).collect();
-            sanitized
-        } else {
-            lark_quote(tag)
-        }
-    }
-
-    fn get_tag_or_token_id(
-        &self,
-        tag: &str,
-        token_ids: &Option<HashSet<u32>>,
-        is_special: bool,
-    ) -> String {
-        if let Some(ids) = token_ids {
-            if !ids.is_empty() {
-                return format!(
-                    "<{}>",
-                    ids.iter()
-                        .map(|id| format!("[{}]", id))
-                        .collect::<Vec<_>>()
-                        .join(",")
-                );
-            }
-        }
-
-        if is_special && tag.starts_with('<') && tag.ends_with('>') {
-            tag.to_string()
-        } else {
-            lark_quote(tag)
-        }
-    }
-}
-
-/// Build a Lark grammar for QwenCoder-style function/parameter tags with JSON values.
-/// Used for models like Qwen3-Coder that use XML-style tool call envelopes.
-pub fn build_xml_tool_lark_grammar(
-    tools: &[Tool],
-    start: &str,
-    end: &str,
-    start_is_special: bool,
-    end_is_special: bool,
-    start_token_ids: Option<&HashSet<u32>>,
-    end_token_ids: Option<&HashSet<u32>>,
-) -> TopLevelGrammar {
-    ToolGrammarBuilder::new()
-        .tools(tools)
-        .start_tag(start)
-        .end_tag(end)
-        .start_is_special(start_is_special)
-        .end_is_special(end_is_special)
-        .start_token_ids(start_token_ids.cloned())
-        .end_token_ids(end_token_ids.cloned())
-        .build_xml()
-}
+use serde_json::{json, Value};
+use std::collections::HashMap;
 
 /// Builder for creating JSON Schema objects
 #[derive(Debug, Clone, Default)]
@@ -597,98 +253,6 @@ pub mod common {
     }
 }
 
-/// Build a Lark grammar for choice constraints (structured outputs choice field)
-pub fn build_choice_lark_grammar(choices: &[String]) -> GrammarResult<TopLevelGrammar> {
-    if choices.is_empty() {
-        return Err(GrammarError::InvalidGrammar(
-            "structured_outputs.choice must include at least one option".to_string(),
-        ));
-    }
-
-    let mut parts = Vec::with_capacity(choices.len());
-    for choice in choices {
-        if choice.is_empty() {
-            return Err(GrammarError::InvalidGrammar(
-                "structured_outputs.choice cannot contain empty strings".to_string(),
-            ));
-        }
-        parts.push(lark_quote(choice));
-    }
-
-    let body = parts.join(" | ");
-    let lark_string = format!("start: {}\n", body);
-    Ok(TopLevelGrammar::from_lark_utf8(&lark_string))
-}
-
-/// Normalize a tag string for structural_tag parsing
-fn normalize_tag_pair(tag: &str) -> Result<(String, String), String> {
-    let trimmed = tag.trim();
-    if trimmed.is_empty() {
-        return Err("structured_outputs.structural_tag.tag cannot be empty".to_string());
-    }
-
-    if trimmed.starts_with('<') && trimmed.ends_with('>') {
-        let inner = trimmed
-            .trim_start_matches('<')
-            .trim_end_matches('>')
-            .trim_start_matches('/');
-        if inner.is_empty() {
-            return Err("structured_outputs.structural_tag.tag is invalid".to_string());
-        }
-        let start = if trimmed.starts_with("</") {
-            format!("<{}>", inner)
-        } else {
-            trimmed.to_string()
-        };
-        let end = format!("</{}>", inner);
-        Ok((start, end))
-    } else {
-        Ok((format!("<{}>", trimmed), format!("</{}>", trimmed)))
-    }
-}
-
-/// Parse structural_tag for structured outputs
-pub fn parse_structural_tag(value: &Value) -> Result<(String, String, Value), String> {
-    let obj = value
-        .as_object()
-        .ok_or_else(|| "structured_outputs.structural_tag must be an object".to_string())?;
-
-    let schema = obj
-        .get("schema")
-        .cloned()
-        .ok_or_else(|| "structured_outputs.structural_tag.schema is required".to_string())?;
-
-    let start = obj
-        .get("start_tag")
-        .or_else(|| obj.get("start"))
-        .or_else(|| obj.get("tag"));
-    let end = obj.get("end_tag").or_else(|| obj.get("end"));
-
-    let (start_tag, end_tag) = match (start, end) {
-        (Some(start_val), Some(end_val)) => {
-            let start = start_val.as_str().ok_or_else(|| {
-                "structured_outputs.structural_tag.start_tag must be a string".to_string()
-            })?;
-            let end = end_val.as_str().ok_or_else(|| {
-                "structured_outputs.structural_tag.end_tag must be a string".to_string()
-            })?;
-            (start.to_string(), end.to_string())
-        }
-        (Some(tag), None) if obj.contains_key("tag") => {
-            normalize_tag_pair(tag.as_str().ok_or_else(|| {
-                "structured_outputs.structural_tag.tag must be a string".to_string()
-            })?)?
-        }
-        _ => {
-            return Err(
-                "structured_outputs.structural_tag requires tag or start_tag/end_tag".to_string(),
-            );
-        }
-    };
-
-    Ok((start_tag, end_tag, schema))
-}
-
 /// Convert a Value schema to a Vec of Tool objects using ToolBuilder
 /// The schema should be an object where keys are tool names and values are tool schemas
 pub fn schema_to_tools(schema: &Value) -> Vec<Tool> {
@@ -706,311 +270,3 @@ pub fn schema_to_tools(schema: &Value) -> Vec<Tool> {
     }
     tools
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::utils::guidance::get_lark_from_top_level_grammar;
-
-    #[test]
-    fn test_sanitize_schema_for_llguidance_strips_format() {
-        let schema = json!({
-            "type": "object",
-            "properties": {
-                "url": {"type": "string", "format": "uri"},
-                "nested": {"type": "object", "properties": {"id": {"type": "string", "format": "uuid"}}}
-            }
-        });
-        let sanitized = sanitize_schema_for_llguidance(&schema);
-        assert!(sanitized["properties"]["url"].get("format").is_none());
-        assert!(sanitized["properties"]["nested"]["properties"]["id"]
-            .get("format")
-            .is_none());
-    }
-
-    #[test]
-    fn test_sanitize_schema_for_llguidance_preserves_nullable_types() {
-        let schema = json!({
-            "type": "object",
-            "properties": {
-                "cwd": {"type": ["string", "null"]}
-            },
-            "required": ["cwd"]
-        });
-        let sanitized = sanitize_schema_for_llguidance(&schema);
-        assert_eq!(
-            sanitized["properties"]["cwd"]["type"],
-            json!(["string", "null"])
-        );
-    }
-
-    #[test]
-    fn test_build_choice_lark_grammar_empty_string() {
-        let result = build_choice_lark_grammar(&["".to_string()]);
-        assert!(result.is_err());
-    }
-
-    #[test]
-    fn test_parse_structural_tag_missing_schema() {
-        let value = json!({});
-        let result = parse_structural_tag(&value);
-        assert!(result.is_err());
-    }
-
-    #[test]
-    fn test_parse_structural_tag_start_end() {
-        let value = json!({
-            "start_tag": "<tool>",
-            "end_tag": "</tool>",
-            "schema": {"type": "object"}
-        });
-        let result = parse_structural_tag(&value);
-        assert!(result.is_ok());
-        let (start, end, schema) = result.unwrap();
-        assert_eq!(start, "<tool>");
-        assert_eq!(end, "</tool>");
-        assert_eq!(schema, json!({"type": "object"}));
-    }
-
-    #[test]
-    fn test_parse_structural_tag_tag() {
-        let value = json!({
-            "tag": "<tool>",
-            "schema": {"type": "object"}
-        });
-        let result = parse_structural_tag(&value);
-        assert!(result.is_ok());
-        let (start, end, _) = result.unwrap();
-        assert_eq!(start, "<tool>");
-        assert_eq!(end, "</tool>");
-    }
-
-    #[test]
-    fn test_parse_structural_tag_invalid() {
-        let value = json!({
-            "schema": {"type": "object"}
-        });
-        let result = parse_structural_tag(&value);
-        assert!(result.is_err());
-    }
-
-    #[test]
-    fn test_lark_quote_escapes_special_chars() {
-        let result = lark_quote("test\"value");
-        assert!(result.contains("test\\\"value"));
-    }
-
-    #[test]
-    fn test_lark_literal_special_tags() {
-        let result = _lark_literal("<tool>", true);
-        assert_eq!(result, "<tool>");
-    }
-
-    #[test]
-    fn test_lark_literal_regular_string() {
-        let result = _lark_literal("regular", false);
-        assert!(result.contains("\"regular\""));
-    }
-
-    #[test]
-    fn test_lark_special_token_single_id() {
-        let mut ids = HashSet::new();
-        ids.insert(151657);
-        let result = lark_special_token(&ids);
-        assert_eq!(result, "<[151657]>");
-    }
-
-    #[test]
-    fn test_lark_special_token_multiple_ids() {
-        let mut ids = HashSet::new();
-        ids.insert(151657);
-        ids.insert(151658);
-        let result = lark_special_token(&ids);
-        assert!(result.contains("[151657]"));
-        assert!(result.contains("[151658]"));
-    }
-
-    #[test]
-    fn test_lark_special_token_empty() {
-        let ids = HashSet::new();
-        let result = lark_special_token(&ids);
-        assert_eq!(result, "");
-    }
-
-    #[test]
-    fn test_build_xml_tool_lark_grammar_qwen3_coder_required_only() {
-        // Test Qwen3-Coder XML tool format with required attributes only
-        let tools = vec![crate::tools::ToolBuilder::new(
-            "search".to_string(),
-            "Search the web".to_string(),
-        )
-        .param("query", "string", "Search query", true)
-        .build()];
-        let grammar = build_xml_tool_lark_grammar(&tools, "", "", false, false, None, None);
-        let lark_str = get_lark_from_top_level_grammar(&grammar);
-        println!("{}", &lark_str);
-
-        // Qwen3Coder uses XML format with start: tool_call
-        assert!(
-            lark_str.contains("start: tool_call"),
-            "Should have start: tool_call"
-        );
-        assert!(
-            lark_str.contains("<function=search>"),
-            "Should contain function tag"
-        );
-        assert!(lark_str.contains("tool_0:"), "Should contain tool_0 rule");
-    }
-
-    #[test]
-    fn test_build_xml_tool_lark_grammar_qwen3_coder_optional() {
-        // Test Qwen3-Coder XML tool format with optional attributes
-        let tools = vec![crate::tools::ToolBuilder::new(
-            "get_weather".to_string(),
-            "Get weather".to_string(),
-        )
-        .param("city", "string", "City name", true)
-        .param("units", "string", "Temperature units (optional)", false)
-        .build()];
-        let grammar = build_xml_tool_lark_grammar(&tools, "", "", false, false, None, None);
-        let lark_str = get_lark_from_top_level_grammar(&grammar);
-
-        assert!(
-            lark_str.contains("start: tool_call"),
-            "Should have start: tool_call"
-        );
-        assert!(
-            lark_str.contains("<function=get_weather>"),
-            "Should contain function tag"
-        );
-        assert!(lark_str.contains("city"), "Should contain city parameter");
-        assert!(
-            lark_str.contains("units"),
-            "Should contain optional units parameter"
-        );
-    }
-
-    #[test]
-    fn test_build_xml_tool_lark_grammar_qwen3_coder_deep_parameters() {
-        // Test Qwen3-Coder XML tool format with nested/complex parameters
-        let tools = vec![crate::tools::ToolBuilder::new(
-            "edit_file".to_string(),
-            "Edit a file with complex parameters".to_string(),
-        )
-        .param("file_path", "string", "Path to the file", true)
-        .param("old_string", "string", "String to replace", true)
-        .param("new_string", "string", "Replacement string", true)
-        .param("replace_all", "boolean", "Replace all occurrences", false)
-        .build()];
-        let grammar = build_xml_tool_lark_grammar(&tools, "", "", false, false, None, None);
-        let lark_str = get_lark_from_top_level_grammar(&grammar);
-        println!("XML Grammar:\n{}", &lark_str);
-
-        // Verify the grammar contains XML structure
-        assert!(
-            lark_str.contains("start: tool_call"),
-            "Should have start: tool_call"
-        );
-        // Note: <function=...> uses U+200C (zero-width non-joiner) which is invisible
-        assert!(
-            lark_str.contains("function="),
-            "Should contain function tag with attribute"
-        );
-
-        // Verify all parameter tags are present
-        // Note: <parameter=...> uses U+200C (zero-width non-joiner) which is invisible
-        assert!(
-            lark_str.contains("parameter=file_path"),
-            "Should contain file_path parameter tag"
-        );
-        assert!(
-            lark_str.contains("parameter=old_string"),
-            "Should contain old_string parameter tag"
-        );
-        assert!(
-            lark_str.contains("parameter=new_string"),
-            "Should contain new_string parameter tag"
-        );
-        assert!(
-            lark_str.contains("parameter=replace_all"),
-            "Should contain replace_all parameter tag"
-        );
-
-        // Verify parameter rules reference the correct types
-        assert!(
-            lark_str.contains("value_0_0:"),
-            "Should have value_0_0 rule for first param"
-        );
-        assert!(
-            lark_str.contains("value_0_1:"),
-            "Should have value_0_1 rule for second param"
-        );
-        assert!(
-            lark_str.contains("value_0_2:"),
-            "Should have value_0_2 rule for third param"
-        );
-        assert!(
-            lark_str.contains("value_0_3:"),
-            "Should have value_0_3 rule for fourth param"
-        );
-
-        // Verify tool rule has all parameters
-        assert!(lark_str.contains("tool_0:"), "Should have tool_0 rule");
-    }
-
-    #[test]
-    fn test_xml_grammar_required_params_no_wrapper() {
-        // Test that XML grammar puts required params directly without (...) * wrapper
-        let tools = vec![crate::tools::ToolBuilder::new(
-            "search_tool".to_string(),
-            "Search tool".to_string(),
-        )
-        .param("query", "string", "Search query", true) // REQUIRED - should appear as bare rule reference
-        .build()];
-
-        let grammar = build_xml_tool_lark_grammar(&tools, "", "", false, false, None, None);
-        let lark_str = get_lark_from_top_level_grammar(&grammar);
-
-        // Verify tool rule has all parameters
-        assert!(lark_str.contains("tool_0:"), "Should have tool_0 rule");
-        assert!(lark_str.contains("value_0"), "Should have value rules");
-
-        // Required params appear directly in tool rule without ()* wrapper
-    }
-
-    #[test]
-    fn test_xml_grammar_optional_params_wrapped() {
-        // Test that XML grammar wraps optional params with (...) * syntax
-        let tools = vec![crate::tools::ToolBuilder::new(
-            "mixed_tool".to_string(),
-            "Mixed params".to_string(),
-        )
-        .param("required_param", "string", "Required", true) // REQUIRED
-        .param("optional_param", "string", "Optional", false) // OPTIONAL
-        .build()];
-
-        let grammar = build_xml_tool_lark_grammar(&tools, "", "", false, false, None, None);
-        let lark_str = get_lark_from_top_level_grammar(&grammar);
-
-        println!("XML Grammar for mixed tool:\n{}", lark_str);
-
-        // Optional parameters should appear in a (...) * pattern when there are multiple options
-        assert!(lark_str.contains("tool_0:"), "Should have tool_0 rule");
-    }
-
-    #[test]
-    fn test_xml_tool_call_structure_validates() {
-        // Full end-to-end: verify XML grammar produces valid llguidance TopLevelGrammar structure
-        let tools =
-            vec![
-                crate::tools::ToolBuilder::new("formatter".to_string(), "Formatter".to_string())
-                    .param("text", "string", "Text to format", true)
-                    .build(),
-            ];
-
-        let grammar = build_xml_tool_lark_grammar(&tools, "", "", false, false, None, None);
-
-        // Grammar should have at least one sub-grammar (the tool rules)
-        assert!(grammar.grammars.len() > 0, "Should have generated grammars");
-    }
-}
diff --git a/src/utils/chat_template.rs b/src/utils/chat_template.rs
index 538255b5..5e72ac19 100644
--- a/src/utils/chat_template.rs
+++ b/src/utils/chat_template.rs
@@ -230,8 +230,8 @@ fn strip_generation_assistant_header(suffix_text: &str) -> &str {
         return suffix_text;
     };
 
-    // Standard Qwen/ChatML-style: `<|im_start|>assistant`
-    if first_line.ends_with("assistant") {
+    // Standard Qwen/ChatML-style: `<|im_start|>assistant` or minimax `ai` suffix:
+    if first_line.ends_with("assistant") || first_line.ends_with("ai") {
         return remainder;
     }
 
@@ -331,6 +331,10 @@ impl ChatTemplate {
         self.enable_thinking = enable;
     }
 
+    pub fn enable_thinking(&self) -> bool {
+        self.enable_thinking
+    }
+
     pub fn set_escape_tokens(&mut self, mut tokens: Vec<String>) {
         tokens.retain(|token| !token.is_empty());
         tokens.sort_by_key(|token| std::cmp::Reverse(token.len()));
@@ -382,7 +386,7 @@ impl ChatTemplate {
                 let mut escaped = message.clone();
                 match escaped.role.as_str() {
                     "system" | "developer" => {}
-                    "assistant" => {
+                    "assistant" | "ai" => {
                         if let Some((reasoning, remaining)) =
                             extract_reasoning_content(&escaped.content)
                         {
@@ -482,6 +486,11 @@ impl ChatTemplate {
         }
         Some(suffix_text)
     }
+    
+    /// Borrow the chat template text for external checks (e.g., validation); `None` if unset.
+    pub fn get_template_string(&self) -> Option<&str> {
+        self.chat_template.as_deref()
+    }
 }
 
 #[cfg(test)]
diff --git a/src/utils/config.rs b/src/utils/config.rs
index 6ae8749d..578335b1 100644
--- a/src/utils/config.rs
+++ b/src/utils/config.rs
@@ -1,6 +1,5 @@
 // src/utils/config.rs
 use crate::transfer::PdConfig;
-use crate::utils::reasoning::ReasoningEffort;
 use llguidance::api::TopLevelGrammar;
 #[cfg(feature = "python")]
 use pyo3::pyclass;
@@ -8,7 +7,7 @@ use serde::de::value::SeqAccessDeserializer;
 use serde::de::{Deserializer, Visitor};
 use serde::ser::Error as _;
 use serde::{Deserialize, Serialize, Serializer};
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::fmt;
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -161,7 +160,14 @@ impl EosTokenId {
     pub fn to_vec(&self) -> Vec<u32> {
         match self {
             EosTokenId::Single(x) => vec![*x],
-            EosTokenId::Multiple(v) => v.clone(),
+            EosTokenId::Multiple(v) => {
+                // `HashSet::insert` returns `false` for an id that was already
+                // recorded, so only the first occurrence of each id survives and
+                // the original ordering is kept.
+                let mut seen = HashSet::new();
+                let first_occurrences = v.iter().copied().filter(|id| seen.insert(*id));
+                first_occurrences.collect()
+            }
         }
     }
 
@@ -397,6 +403,8 @@ pub struct EngineConfig {
     pub disable_cuda_graph: bool,
     #[serde(default = "default_prefill_chunk_size")]
     pub prefill_chunk_size: usize,
+    pub allow_constraint_api: bool,
+    pub enable_tool_grammar: bool,
 }
 
 #[cfg(feature = "python")]
@@ -483,6 +491,10 @@ pub struct EngineConfig {
     #[pyo3(get, set)]
     #[serde(default = "default_prefill_chunk_size")]
     pub prefill_chunk_size: usize,
+    #[pyo3(get, set)]
+    pub allow_constraint_api: bool,
+    #[pyo3(get, set)]
+    pub enable_tool_grammar: bool,
 }
 
 impl EngineConfig {
@@ -534,6 +546,8 @@ impl EngineConfig {
         disable_reasoning: bool,
         disable_cuda_graph: bool,
         prefill_chunk_size: Option<usize>,
+        allow_constraint_api: bool,
+        enable_tool_grammar: bool,
     ) -> Self {
         let mut device_ids = device_ids.unwrap_or_default();
         if device_ids.is_empty() {
@@ -587,6 +601,8 @@ impl EngineConfig {
             prefill_chunk_size: normalize_prefill_chunk_size(
                 prefill_chunk_size.unwrap_or(DEFAULT_PREFILL_CHUNK_SIZE),
             ),
+            allow_constraint_api,
+            enable_tool_grammar,
         }
     }
 }
@@ -599,6 +615,7 @@ pub struct TokenizerConfig {
     pub chat_template: Option<String>,
     pub bos_token: Option<String>,
     pub eos_token: Option<String>,
+    pub pad_token: Option<String>,
 }
 
 #[cfg(not(feature = "python"))]
@@ -782,6 +799,33 @@ pub enum ModelType {
     LLaMa4,
     MiniMax,
 }
+impl ModelType {
+    /// Resolve the first listed architecture name to a `ModelType` (`None` if empty or unknown).
+    pub fn from_architectures(architectures: &[String]) -> Option<Self> {
+        let model_type = match architectures.first()?.as_str() {
+            "Qwen3ForCausalLM" => ModelType::Qwen3,
+            "Qwen3MoEForCausalLM" => ModelType::Qwen3MoE,
+            "Qwen3_5ForCausalLM" => ModelType::Qwen3_5,
+            "Qwen3_5MoEForCausalLM" => ModelType::Qwen3_5MoE,
+            "LlamaForCausalLM" => ModelType::LLaMa,
+            "GemmaForCausalLM" => ModelType::Gemma,
+            "Gemma3ForConditionalGeneration" => ModelType::Gemma3,
+            "Gemma4ForConditionalGeneration" | "Gemma4ForCausalLM" => ModelType::Gemma4,
+            "PhiForCausalLM" => ModelType::Phi,
+            "Phi4ForCausalLM" => ModelType::Phi4,
+            "MistralForCausalLM" => ModelType::Mistral,
+            "GLM4ForCausalLM" => ModelType::GLM4,
+            "GLM4MoEForCausalLM" => ModelType::GLM4MoE,
+            "YiForCausalLM" => ModelType::Yi,
+            "StableLmForCausalLM" => ModelType::StableLM,
+            "DeepSeekForCausalLM" => ModelType::DeepSeek,
+            "Mistral3VForConditionalGeneration" => ModelType::Mistral3VL,
+            "Qwen3VLMoEForConditionalGeneration" => ModelType::Qwen3VL,
+            _ => return None,
+        };
+        Some(model_type)
+    }
+}
 
 #[cfg_attr(feature = "python", pyclass)]
 #[derive(Clone, Debug, Serialize, Deserialize)]
@@ -1118,6 +1162,72 @@ impl fmt::Debug for QuantConfig {
     }
 }
 
+/// Reasoning effort level for grammar generation; defaults to `ModelDefault`.
+/// Parse user input with [`ReasoningEffort::from_str`]; `Display` writes a lowercase name.
+#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "lowercase")]
+pub enum ReasoningEffort {
+    /// No structured reasoning - direct output only
+    None,
+    /// Default model reasoning output as induced by opening a reasoning tag
+    #[default]
+    ModelDefault,
+    /// Constrained single-paragraph reasoning (~150 chars max)
+    Low,
+    /// Standard multi-step Chain-of-Thought (CoT)
+    Medium,
+    /// Adversarial analysis with self-correction phases
+    High,
+    /// Best-of-breed Chain-of-Verification (CoVe) + Self-Critique
+    ChainOfThought,
+    /// Custom user-provided grammar template (non-Python builds only)
+    #[cfg(all(not(feature = "python"), not(feature = "pyo3")))]
+    Custom(String),
+}
+
+impl ReasoningEffort {
+    /// Parse a level name, ignoring case; unrecognized input yields [`Self::None`].
+    ///
+    /// `model_default` (the `Display` form of the default level) is accepted, and in
+    /// non-Python builds `custom:<template>` keeps the template exactly as written.
+    pub fn from_str(s: String) -> Self {
+        match s.to_lowercase().as_str() {
+            "none" => Self::None,
+            "model_default" => Self::ModelDefault,
+            "low" => Self::Low,
+            "normal" | "medium" => Self::Medium,
+            "high" => Self::High,
+            "very_high" | "chain_of_thought" | "cot" | "cove" => Self::ChainOfThought,
+            // Slice the original string so the template keeps its case; the `custom:`
+            // prefix is 7 ASCII bytes, so the template starts at byte 7 of `s` too.
+            #[cfg(all(not(feature = "python"), not(feature = "pyo3")))]
+            lowered if lowered.starts_with("custom:") => Self::Custom(s[7..].to_string()),
+            _ => Self::None,
+        }
+    }
+
+    /// Check if reasoning effort is enabled (not None)
+    pub fn is_enabled(&self) -> bool {
+        *self != ReasoningEffort::None
+    }
+}
+
+/// Conversion to string for serialization
+impl std::fmt::Display for ReasoningEffort {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ReasoningEffort::None => write!(f, "none"),
+            ReasoningEffort::ModelDefault => write!(f, "model_default"),
+            ReasoningEffort::Low => write!(f, "low"),
+            ReasoningEffort::Medium => write!(f, "medium"),
+            ReasoningEffort::High => write!(f, "high"),
+            ReasoningEffort::ChainOfThought => write!(f, "chain_of_thought"),
+            #[cfg(all(not(feature = "python"), not(feature = "pyo3")))]
+            ReasoningEffort::Custom(_) => write!(f, "custom"),
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -1693,4 +1803,23 @@ mod tests {
         cfg.normalize_compressed_tensors();
         assert_eq!(cfg.quant_method, "fp8");
     }
+
+    #[test]
+    fn test_reasoning_effort_from_str() {
+        // Each canonical level name must parse to its matching variant.
+        assert_eq!(ReasoningEffort::from_str("none".to_string()), ReasoningEffort::None);
+        assert_eq!(ReasoningEffort::from_str("low".to_string()), ReasoningEffort::Low);
+        assert_eq!(ReasoningEffort::from_str("medium".to_string()), ReasoningEffort::Medium);
+        assert_eq!(ReasoningEffort::from_str("high".to_string()), ReasoningEffort::High);
+        assert_eq!(ReasoningEffort::from_str("chain_of_thought".to_string()), ReasoningEffort::ChainOfThought);
+    }
+
+    #[test]
+    fn test_reasoning_effort_is_enabled() {
+        assert!(!ReasoningEffort::None.is_enabled());
+        assert!(ReasoningEffort::Low.is_enabled());
+        assert!(ReasoningEffort::Medium.is_enabled());
+        assert!(ReasoningEffort::High.is_enabled());
+        assert!(ReasoningEffort::ChainOfThought.is_enabled());
+    }
 }
diff --git a/src/utils/env.rs b/src/utils/env.rs
index a878dd62..90c9e6fd 100644
--- a/src/utils/env.rs
+++ b/src/utils/env.rs
@@ -41,3 +41,35 @@ pub fn mamba_snapshot_block_stride_blocks(default: usize) -> usize {
         }
     }
 }
+
+pub const DEFAULT_REASONING_MAX_TOKENS_ENV: &str = "XINFER_DEFAULT_REASONING_MAX_TOKENS";
+pub const DEFAULT_REASONING_MAX_TOKENS_VALUE: usize = 512;
+
+static DEFAULT_REASONING_MAX_TOKENS: OnceLock<usize> = OnceLock::new();
+
+/// Reasoning token limit, resolved once from `XINFER_DEFAULT_REASONING_MAX_TOKENS`.
+pub fn default_reasoning_max_tokens() -> usize {
+    *DEFAULT_REASONING_MAX_TOKENS.get_or_init(|| {
+        // An unset, unparsable, or zero override falls back to the built-in value.
+        env::var(DEFAULT_REASONING_MAX_TOKENS_ENV)
+            .ok()
+            .and_then(|raw| raw.trim().parse::<usize>().ok())
+            .filter(|&limit| limit != 0)
+            .unwrap_or(DEFAULT_REASONING_MAX_TOKENS_VALUE)
+    })
+}
+
+/// Environment variable to disable soft masking for gradient smoothing
+/// When not set, or set to "0", "false", or "no" (any case), soft masking is enabled (default behavior)
+/// When set to any other value, soft masking is disabled (hard masking)
+pub const SOFT_MASK_DISABLED_ENV: &str = "XINFER_SOFT_MASK_DISABLED";
+
+static SOFT_MASK_DISABLED: OnceLock<bool> = OnceLock::new();
+
+pub fn soft_mask_disabled() -> bool {
+    *SOFT_MASK_DISABLED.get_or_init(|| match env::var(SOFT_MASK_DISABLED_ENV) {
+        // A readable value counts as "disabled" unless it is "0", "false" or "no".
+        Ok(raw) => !matches!(raw.trim().to_lowercase().as_str(), "0" | "false" | "no"),
+        Err(_) => false,
+    })
+}
diff --git a/src/utils/gguf_helper.rs b/src/utils/gguf_helper.rs
index e37136da..063d9e62 100644
--- a/src/utils/gguf_helper.rs
+++ b/src/utils/gguf_helper.rs
@@ -267,6 +267,7 @@ pub struct GGUFInfo {
     pub bos: Option<String>,
     pub eos: Option<String>,
     pub unk: Option<String>,
+    pub pad_token: Option<String>,
     pub context_length: Option<usize>,
     pub chat_template: Option<String>,
 }
@@ -281,6 +282,7 @@ struct PropsGGUF {
     unk: Option<u32>,
     eos: Option<u32>,
     bos: Option<u32>,
+    pad: Option<u32>,
 }
 
 impl TryFrom<ContentMetadata<'_>> for PropsGGUF {
@@ -300,6 +302,7 @@ impl TryFrom<ContentMetadata<'_>> for PropsGGUF {
             unk: c.get_value("unknown_token_id").ok(),
             eos: c.get_value("eos_token_id").ok(),
             bos: c.get_value("bos_token_id").ok(),
+            pad: c.get_value("padding_token_id").ok(), // GGUF names this key `padding_token_id`
         };
 
         Ok(props)
@@ -388,11 +391,17 @@ pub fn get_gguf_info<R: std::io::Seek + std::io::Read>(
         _ => None,
     };
 
+    // A pad id outside the vocabulary yields `None` instead of an indexing panic.
+    let pad_token = props
+        .pad
+        .and_then(|id| props.tokens.get(id as usize).cloned());
+
     Ok(GGUFInfo {
         tokenizer,
         bos,
         eos,
         unk,
+        pad_token,
         context_length: Some(context_length as usize),
         chat_template,
     })
diff --git a/src/utils/guidance.rs b/src/utils/guidance.rs
index 1be0c327..80b25ef2 100644
--- a/src/utils/guidance.rs
+++ b/src/utils/guidance.rs
@@ -1,5 +1,12 @@
 // src/utils/guidance.rs
+// This module contains non-grammar guidance utilities:
+// - GuidanceTokens: token ID collections
+// - ParserFactory: llguidance parser factory
+// - GuidanceState: matcher state for speculative decoding
+// - Mask operations: batch mask bias and early exit validation
+
 use crate::utils::special_tokens::SpecialTokens;
+use crate::utils::config::TokenizerConfig;
 use anyhow::Result;
 use candle_core::Tensor;
 use llguidance::{api::TopLevelGrammar, Matcher, ParserFactory as LlgParserFactory};
@@ -9,814 +16,76 @@ use tokenizers::Tokenizer;
 use toktrie::{SimpleVob, TokTrie};
 use toktrie_hf_tokenizers::{ByteTokenizer, ByteTokenizerEnv};
 
-use crate::tools::schema::ToolGrammarBuilder;
-use crate::tools::Tool;
 use crate::utils::logits_processor::{LogitsProcessor, Sampling};
-use serde_json::json;
 
-// Re-export reasoning types for convenience (without pyclass since it causes compilation issues)
-pub use crate::utils::reasoning::{
-    build_reasoning_grammar, thinking_grammar_with_reasoning_block, ReasoningEffort,
-    ThinkingGrammarBuilder,
+// Re-export from guidance_grammar for grammar-related types
+// Only export the two entrypoints: generate_grammar_from_request and build_grammar_from_request
+pub use crate::utils::guidance_grammar::{
+    build_grammar_from_request, generate_grammar_from_request,
 };
 
 #[derive(Clone, Debug, Default)]
 pub struct GuidanceTokens {
+    pub bos_token_ids: Vec<u32>,
     pub eos_token_ids: Vec<u32>,
     pub reasoning_start_ids: Vec<u32>,
     pub reasoning_end_ids: Vec<u32>,
+    pub tool_call_start_ids: Vec<u32>,
+    pub tool_call_end_ids: Vec<u32>,
+    pub add_bos_token: bool,
 }
 
-pub fn extract_guidance_tokens(tokenizer: &Tokenizer, eos_token_ids: Vec<u32>) -> GuidanceTokens {
+pub fn extract_guidance_tokens(
+    tokenizer: &Tokenizer,
+    eos_token_ids: Vec<u32>,
+    bos_token_ids: Vec<u32>,
+    tokenizer_config: &TokenizerConfig,
+) -> GuidanceTokens {
     let special_tokens = SpecialTokens::new(tokenizer);
-    GuidanceTokens {
-        eos_token_ids,
-        reasoning_start_ids: special_tokens.reasoning_start_ids(),
-        reasoning_end_ids: special_tokens.reasoning_end_ids(),
-    }
-}
-
-/// Error type for grammar-related errors
-#[derive(Debug, thiserror::Error)]
-pub enum GrammarError {
-    #[error("structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag")]
-    TooManyConstraints,
-
-    #[error("response_format.json_schema is required for type=json_schema")]
-    MissingJsonSchema,
-
-    #[error("unsupported response_format type: {0}")]
-    UnsupportedFormat(String),
-
-    #[error("invalid grammar: {0}")]
-    InvalidGrammar(String),
-}
-
-pub type GrammarResult<T> = Result<T, GrammarError>;
-
-/// Builder for structured output constraint grammars
-pub struct ConstraintBuilder {
-    choice: Option<Vec<String>>,
-    regex: Option<String>,
-    json: Option<serde_json::Value>,
-    grammar: Option<String>,
-    structural_tag: Option<serde_json::Value>,
-}
-
-impl ConstraintBuilder {
-    pub fn new() -> Self {
-        Self {
-            choice: None,
-            regex: None,
-            json: None,
-            grammar: None,
-            structural_tag: None,
-        }
-    }
-
-    pub fn choice(mut self, choice: Vec<String>) -> Self {
-        self.choice = Some(choice);
-        self
-    }
-
-    pub fn regex(mut self, regex: String) -> Self {
-        self.regex = Some(regex);
-        self
-    }
-
-    pub fn json(mut self, json: serde_json::Value) -> Self {
-        self.json = Some(json);
-        self
-    }
-
-    pub fn grammar(mut self, grammar: String) -> Self {
-        self.grammar = Some(grammar);
-        self
-    }
-
-    pub fn structural_tag(mut self, tag: serde_json::Value) -> Self {
-        self.structural_tag = Some(tag);
-        self
-    }
-
-    pub fn build(self) -> Result<Option<TopLevelGrammar>> {
-        let mut selected: Option<TopLevelGrammar> = None;
-        let mut constraint_count = 0;
-
-        if let Some(choice) = self.choice {
-            constraint_count += 1;
-            if constraint_count > 1 {
-                return Err(anyhow::Error::msg("structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag"));
-            }
-            let choice_gram = crate::tools::schema::build_choice_lark_grammar(&choice)
-                .map_err(|e| anyhow::Error::msg(e))?;
-            selected = Some(choice_gram);
-        }
-
-        if let Some(regex) = self.regex {
-            constraint_count += 1;
-            if constraint_count > 1 {
-                return Err(anyhow::Error::msg("structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag"));
-            }
-            let regex_gram = TopLevelGrammarExt::from_regex_ascii(&regex);
-            selected = Some(regex_gram);
-        }
-
-        if let Some(schema) = self.json {
-            constraint_count += 1;
-            if constraint_count > 1 {
-                return Err(anyhow::Error::msg("structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag"));
-            }
-            let schema = crate::tools::schema::sanitize_schema_for_llguidance(&schema);
-            let json_gram = TopLevelGrammarExt::from_json_schema_utf8(schema)
-                .map_err(|e| anyhow::Error::msg(e.to_string()))?;
-            selected = Some(json_gram);
-        }
-
-        if let Some(grammar) = self.grammar {
-            constraint_count += 1;
-            if constraint_count > 1 {
-                return Err(anyhow::Error::msg("structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag"));
-            }
-            let lark_gram = TopLevelGrammarExt::from_lark_utf8(&grammar);
-            selected = Some(lark_gram);
-        }
-
-        if let Some(tag) = self.structural_tag {
-            constraint_count += 1;
-            if constraint_count > 1 {
-                return Err(anyhow::Error::msg("structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag"));
-            }
-            let (start, end, schema) = crate::tools::schema::parse_structural_tag(&tag)
-                .map_err(|e| anyhow::Error::msg(e))?;
-            let schema = crate::tools::schema::sanitize_schema_for_llguidance(&schema);
-            let tools = crate::tools::schema::schema_to_tools(&schema);
-            let tool_gram = ToolGrammarBuilder::new()
-                .tools(&tools)
-                .start_tag(&start)
-                .end_tag(&end)
-                .start_is_special(false)
-                .end_is_special(false)
-                .build_json();
-            selected = Some(tool_gram);
-        }
-
-        if selected.is_none() {
-            return Err(anyhow::Error::msg("structured_outputs must set exactly one of choice, regex, json, grammar, or structural_tag"));
-        }
-
-        Ok(selected)
-    }
-}
-
-/// Builder for composing multiple grammars with alternation
-/// This provides a more readable, declarative way to build composed grammars
-pub struct GrammarBuilder {
-    alternatives: Vec<TopLevelGrammar>,
-    max_tokens: Option<usize>,
-}
-
-impl GrammarBuilder {
-    pub fn new() -> Self {
-        Self {
-            alternatives: Vec::new(),
-            max_tokens: None,
-        }
-    }
-
-    pub fn alternative(mut self, grammar: TopLevelGrammar) -> Self {
-        self.alternatives.push(grammar);
-        self
-    }
-
-    pub fn max_tokens(mut self, tokens: usize) -> Self {
-        self.max_tokens = Some(tokens);
-        self
-    }
-
-    pub fn build(self) -> TopLevelGrammar {
-        // Note: GrammarBuilder currently uses chat_text_expression() without EOS tokens
-        // EOS token support is provided through compose_grammars() directly
-        match self.alternatives.len() {
-            0 => {
-                let lark = chat_text_expression(false);
-                TopLevelGrammar::from_lark_utf8(&lark)
-            }
-            1 => {
-                let mut gram = self.alternatives.into_iter().next().unwrap();
-                gram.max_tokens = self.max_tokens;
-                gram
-            }
-            _ => {
-                let merged = merge_top_level_grammars(
-                    self.alternatives,
-                    self.max_tokens,
-                    Some("|".to_string()),
-                );
-                merged
-            }
-        }
-    }
-}
-
-/// Grammar composition variant - represents all possible grammar configurations
-#[derive(Clone, Debug)]
-pub enum GrammarComposers {
-    TextWithEos,
-    Constraint(TopLevelGrammar),
-    WithReasoning(TopLevelGrammar, TopLevelGrammar),
-}
-
-/// Builder for constructing GrammarComposers
-pub struct GrammarComposerBuilder {
-    constraint_grammars: Vec<TopLevelGrammar>,
-    reasoning_effort: Option<ReasoningEffort>,
-}
-
-impl GrammarComposerBuilder {
-    pub fn new() -> Self {
-        Self {
-            constraint_grammars: Vec::new(),
-            reasoning_effort: None,
-        }
-    }
-
-    pub fn constraints(mut self, grammars: Vec<TopLevelGrammar>) -> Self {
-        self.constraint_grammars = grammars;
-        self
-    }
-
-    pub fn reasoning_effort(mut self, effort: Option<ReasoningEffort>) -> Self {
-        self.reasoning_effort = effort;
-        self
-    }
-
-    pub fn into_composer(self, guidance_tokens: &GuidanceTokens) -> GrammarComposers {
-        let base = self.build_base_composer(guidance_tokens);
-        self.build_with_reasoning(base, guidance_tokens)
-    }
-
-    fn build_base_composer(&self, _guidance_tokens: &GuidanceTokens) -> GrammarComposers {
-        match self.constraint_grammars.is_empty() {
-            true => GrammarComposers::TextWithEos,
-            false => GrammarComposers::Constraint(self.constraint_grammars[0].clone()),
-        }
-    }
-
-    fn build_with_reasoning(
-        self,
-        base: GrammarComposers,
-        guidance_tokens: &GuidanceTokens,
-    ) -> GrammarComposers {
-        match self.reasoning_effort {
-            Some(ReasoningEffort::None) => base,
-            Some(effort) => {
-                let start_ids = &guidance_tokens.reasoning_start_ids;
-                let end_ids = &guidance_tokens.reasoning_end_ids;
-
-                if start_ids.is_empty() || end_ids.is_empty() {
-                    crate::log_warn!(
-                        "[llg] Reasoning effort {:?} set but no reasoning tokens found",
-                        effort
-                    );
-                    base
-                } else {
-                    let start_id = start_ids[0];
-                    let end_id = end_ids[0];
-                    let reasoning_lark =
-                        thinking_grammar_with_reasoning_block(start_id, end_id, Some(effort));
-                    let reasoning_gram = TopLevelGrammar::from_lark_utf8(&reasoning_lark);
-                    let base_gram = base.to_grammar(guidance_tokens);
-                    GrammarComposers::WithReasoning(reasoning_gram, base_gram)
-                }
-            }
-            None => base,
-        }
-    }
-
-    pub fn build(self, guidance_tokens: &GuidanceTokens) -> TopLevelGrammar {
-        let composer = self.into_composer(guidance_tokens);
-        composer.to_grammar(guidance_tokens)
-    }
-}
-
-impl GrammarComposers {
-    pub fn to_grammar(&self, guidance_tokens: &GuidanceTokens) -> TopLevelGrammar {
-        let base_grammar = self.build_base_grammar(guidance_tokens);
-
-        // Add eos? termination to ensure all grammars can terminate
-        add_eos_termination(&base_grammar, &guidance_tokens.eos_token_ids)
-    }
-
-    fn build_base_grammar(&self, guidance_tokens: &GuidanceTokens) -> TopLevelGrammar {
-        match self {
-            GrammarComposers::TextWithEos => {
-                let has_eos = !guidance_tokens.eos_token_ids.is_empty();
-                let lark = chat_text_expression(has_eos);
-                TopLevelGrammar::from_lark_utf8(&lark)
-            }
-            GrammarComposers::Constraint(c) => c.clone(),
-            GrammarComposers::WithReasoning(reasoning, inner) => wrap_with_subgrammars(
-                "@reasoning @inner",
-                None,
-                &[("reasoning", reasoning), ("inner", inner)],
-                None,
-            ),
-        }
-    }
-}
-
-/// Build text pattern for chat conversations (without EOS)
-/// have_eos: when true, generates simple text rule; when false, uses stop="" fallback
-pub fn chat_text_expression(have_eos: bool) -> String {
-    // First check environment variable override
-    if let Ok(val) = std::env::var("XINFER_LLG_DEFAULT_TEXT") {
-        return format!("{}", val);
-    }
-
-    if have_eos {
-        // Text pattern without EOS - just text rule
-        r#"start: text
-text: /(?s:.*)/"#
-            .to_string()
-    } else {
-        // Fallback to stop="" when no EOS tokens available
-        r#"start: text
-text[stop=""]: /((?s).*?)/"#
-            .to_string()
-    }
-}
-
-/// Build EOS pattern for text completion
-/// Returns just the EOS rule definition (not combined with text)
-pub fn eos_expression(eos_token_ids: &[u32]) -> String {
-    if eos_token_ids.is_empty() {
-        String::new()
-    } else if eos_token_ids.len() == 1 {
-        format!("eos: <[{}]>\n", eos_token_ids[0])
-    } else {
-        let ids: Vec<String> = eos_token_ids
-            .iter()
-            .map(|id| format!("<[{}]>", id))
-            .collect();
-        let alternation = ids.join(" | ");
-        format!("eos: ( {} )", alternation)
-    }
-}
-
-/// Add eos? termination to a grammar, ensuring all paths can end with EOS
-/// This function modifies the start: rule to append optional EOS token alternation
-fn add_eos_termination(grammar: &TopLevelGrammar, eos_token_ids: &[u32]) -> TopLevelGrammar {
-    if eos_token_ids.is_empty() {
-        return grammar.clone();
-    }
-
-    let is_simple_lark = grammar.grammars.len() == 1
-        && grammar
-            .grammars
-            .first()
-            .and_then(|g| g.lark_grammar.as_ref())
-            .is_some();
-
-    if !is_simple_lark {
-        let eos_line = if eos_token_ids.len() > 1 {
-            let ids: Vec<String> = eos_token_ids
-                .iter()
-                .map(|id| format!("<[{}]>", id))
-                .collect();
-            format!("eos: ( {} )", ids.join(" | "))
-        } else {
-            format!("eos: <[{}]>", eos_token_ids[0])
-        };
-        return wrap_with_subgrammars(
-            "@inner eos?",
-            Some(eos_line.as_str()),
-            &[("inner", grammar)],
-            grammar.max_tokens,
-        );
-    }
 
-    let lark = get_lark_from_top_level_grammar(grammar);
-
-    // Parse lines to find start: rule
-    let lines: Vec<&str> = lark.lines().collect();
-    if lines.is_empty() {
-        return grammar.clone();
-    }
-
-    let first_line = if lines[0].trim().contains("eos?") {
-        lines[0].trim().replace("eos?", "")
-    } else {
-        lines[0].trim().to_string()
-    };
-
-    // Extract the current start RHS (everything after "start:")
-    let current_start_rhs = if let Some(rhs) = first_line.strip_prefix("start:") {
-        rhs.trim()
-    } else {
-        return grammar.clone();
-    };
+    // Verify EOS token IDs are in added vocabulary if more than one provided
+    let added_tokens: HashMap<u32, String> = tokenizer
+        .get_added_tokens_decoder()
+        .iter()
+        .map(|(id, token)| (*id, token.content.clone()))
+        .collect();
 
-    // Build new start rule with eos? termination
-    // For multiple EOS tokens, use ( <[id1]> | <[id2]> )? format
-    let new_start_line = format!(
-        r#"start: {current_start_rhs} eos?
-"#
-    );
-    let eos_line = if eos_token_ids.len() > 1 {
-        let ids: Vec<String> = eos_token_ids
-            .iter()
-            .map(|id| format!("<[{}]>", id))
-            .collect();
-        let alternation = ids.join(" | ");
-        format!("eos:  ( {} )", alternation)
+    let validated_eos: Vec<u32> = if eos_token_ids.len() > 1 {
+        eos_token_ids
+            .into_iter()
+            .filter(|id| added_tokens.contains_key(id))
+            .collect()
     } else {
-        format!("eos: <[{}]>", eos_token_ids[0])
+        eos_token_ids
     };
 
-    // Get existing rules (everything after first line)
-    let other_rules = if lines.len() > 1 {
-        // Filter out any existing eos: rules to avoid duplication
-        let filtered: Vec<_> = lines[1..]
-            .iter()
-            .filter(|line| !line.trim().starts_with("eos:"))
-            .map(|s| *s)
+    let validated_bos: Vec<u32> = {
+        let retained: Vec<u32> = bos_token_ids
+            .into_iter()
+            .filter(|id| !validated_eos.contains(id))
             .collect();
-        filtered.join("\n")
-    } else {
-        String::new()
-    };
-
-    let final_grammar = format!("{}\n{}\n{}", new_start_line, other_rules, eos_line);
-
-    TopLevelGrammar::from_lark_utf8(&final_grammar)
-}
-
-fn wrap_with_subgrammars(
-    start_rhs: &str,
-    extra_rules: Option<&str>,
-    subgrammars: &[(&str, &TopLevelGrammar)],
-    max_tokens: Option<usize>,
-) -> TopLevelGrammar {
-    let mut wrapper_lark = format!("start: {}", start_rhs);
-    if let Some(extra_rules) = extra_rules {
-        let extra_rules = extra_rules.trim();
-        if !extra_rules.is_empty() {
-            wrapper_lark.push('\n');
-            wrapper_lark.push_str(extra_rules);
-        }
-    }
-
-    let mut wrapped = TopLevelGrammar::from_lark_utf8(&wrapper_lark);
-    wrapped.max_tokens = max_tokens;
-    for (name, grammar) in subgrammars {
-        let mut nested = (*grammar).clone();
-        if let Some(first) = nested.grammars.first_mut() {
-            first.name = Some((*name).to_string());
-        }
-        wrapped.grammars.extend(nested.grammars);
-    }
-    wrapped
-}
-
-/// Extension trait for TopLevelGrammar with built-in sanitization
-/// This ensures all grammar construction paths sanitize inputs consistently
-pub trait TopLevelGrammarExt: Sized {
-    /// Create TopLevelGrammar from regex with ASCII sanitization
-    fn from_regex_ascii(regex: &str) -> Self;
-
-    /// Create TopLevelGrammar from Lark string with UTF-8 sanitization
-    fn from_lark_utf8(lark: &str) -> Self;
-
-    /// Create TopLevelGrammar from JSON schema with UTF-8 sanitization
-    fn from_json_schema_utf8(schema: serde_json::Value) -> Result<Self, anyhow::Error>;
-}
-
-impl TopLevelGrammarExt for TopLevelGrammar {
-    fn from_regex_ascii(regex: &str) -> Self {
-        let sanitized = sanitize_utf8_valid(regex);
-        Self::from_regex(&sanitized)
-    }
-
-    fn from_lark_utf8(lark: &str) -> Self {
-        let sanitized = sanitize_utf8_valid(lark);
-        Self::from_lark(sanitized)
-    }
-
-    fn from_json_schema_utf8(schema: serde_json::Value) -> Result<Self, anyhow::Error> {
-        let schema_str = serde_json::to_string(&schema)?;
-        let sanitized = sanitize_utf8_valid(&schema_str);
-        let val = serde_json::from_str(&sanitized)?;
-        Ok(Self::from_json_schema(val))
-    }
-}
-
-/// Sanitize a string by removing non-ASCII bytes
-/// This is used for tool choice strings to ensure only safe ASCII characters reach llguidance lexer
-pub fn sanitize_to_ascii(s: &str) -> String {
-    s.bytes()
-        .filter(|&b| b.is_ascii())
-        .map(|b| b as char)
-        .collect::<String>()
-}
-
-/// Sanitize a string by removing invalid UTF-8 sequences and control characters
-pub fn sanitize_utf8_valid(s: &str) -> String {
-    let mut result = String::new();
-    for ch in s.chars() {
-        if ch.is_control() && !matches!(ch, '\n' | '\r' | '\t') {
-            continue;
-        }
-        result.push(ch);
-    }
-    result
-}
-
-/// Parse a Lark grammar string to extract the start rule RHS and other rules
-/// Returns (start_rhs, other_rules) where start_rhs is the RHS of the start: rule
-/// The RHS should be a list of rule names separated by | for alternation
-fn parse_lark_grammar(lark: &str) -> (String, Vec<String>) {
-    let lines: Vec<&str> = lark.lines().collect();
-    if lines.is_empty() {
-        return (String::new(), Vec::new());
-    }
-
-    let first_line = lines[0].trim();
-    if first_line.starts_with("start:") {
-        // Extract only the rule names after "start:", not the full rule definition
-        let rhs_part = first_line.strip_prefix("start:").unwrap_or("").trim();
-
-        // Parse the RHS to get individual rule names (separated by |)
-        // We only want the rule names, not their definitions
-        let rule_names: Vec<String> = rhs_part.split('|').map(|s| s.trim().to_string()).collect();
-
-        // The RHS for alternation should be just the rule names
-        let start_rhs = rule_names.join(" | ");
-
-        // Return all remaining lines as other rules
-        let other_rules: Vec<String> = lines[1..].iter().map(|s| s.to_string()).collect();
-
-        (start_rhs, other_rules)
-    } else {
-        // No start rule - treat entire grammar as the start rule
-        (lark.to_string(), Vec::new())
-    }
-}
-
-/// Combine grammar rules, handling duplicate rule names by merging them
-fn combine_rules(rules: Vec<String>) -> String {
-    if rules.is_empty() {
-        return String::new();
-    }
-
-    // Group rules by their name (the part before ":")
-    use std::collections::HashMap;
-    let mut rule_groups: HashMap<String, Vec<String>> = HashMap::new();
-
-    for rule in rules {
-        let rule = rule.trim();
-        if rule.is_empty() {
-            continue;
-        }
-
-        // Find the rule name (before the first ":")
-        if let Some(colon_pos) = rule.find(':') {
-            let name = rule[..colon_pos].trim().to_string();
-            let body = rule[colon_pos + 1..].trim().to_string();
-
-            rule_groups.entry(name).or_default().push(body);
+        if retained.is_empty() {
+            special_tokens.bos_token_ids()
         } else {
-            // Rule without colon - add as-is
-            rule_groups
-                .entry("anonymous".to_string())
-                .or_default()
-                .push(rule.to_string());
+            retained
         }
-    }
-
-    // Reconstruct rules, merging duplicates
-    let mut combined = Vec::new();
-    for (name, bodies) in rule_groups {
-        if bodies.len() == 1 {
-            combined.push((name.clone(), format!("{}: {}", name, bodies[0])));
-        } else {
-            // Multiple definitions for same rule - combine with alternation
-            combined.push((name.clone(), format!("{}: {}", name, bodies.join(" | "))));
-        }
-    }
-
-    // Sort rules: start first, then tool rules (tool_N), then alphabetically
-    combined.sort_by(|a, b| {
-        let name_a = a.0.as_str();
-        let name_b = b.0.as_str();
-
-        // "start" always comes first
-        if name_a == "start" {
-            return std::cmp::Ordering::Less;
-        }
-        if name_b == "start" {
-            return std::cmp::Ordering::Greater;
-        }
-
-        // Tool rules (tool_N) come next, sorted by their numeric index
-        if name_a.starts_with("tool_") && name_b.starts_with("tool_") {
-            // Extract the numeric part
-            let num_a: u32 = name_a[5..].parse().unwrap_or(0);
-            let num_b: u32 = name_b[5..].parse().unwrap_or(0);
-            return num_a.cmp(&num_b);
-        }
-        if name_a.starts_with("tool_") {
-            return std::cmp::Ordering::Less;
-        }
-        if name_b.starts_with("tool_") {
-            return std::cmp::Ordering::Greater;
-        }
-
-        // Other rules sorted alphabetically
-        name_a.cmp(name_b)
-    });
-
-    combined
-        .into_iter()
-        .map(|(_, rule)| rule)
-        .collect::<Vec<_>>()
-        .join("\n")
-}
-
-/// Merge multiple TopLevelGrammar objects into one
-/// This creates a single Lark grammar with alternation at the start rule level
-/// Each sub-grammar's rules are combined directly without rule_N indirection
-pub fn merge_top_level_grammars(
-    grammars: Vec<TopLevelGrammar>,
-    max_tokens: Option<usize>,
-    start_separator: Option<String>,
-) -> TopLevelGrammar {
-    // Extract all Lark grammar strings
-    let mut lark_parts = Vec::new();
-
-    let sep = match start_separator {
-        Some(s) => s,
-        None => "|".to_string(),
     };
 
-    for (_i, g) in grammars.iter().enumerate() {
-        for gw in &g.grammars {
-            if let Some(lark) = &gw.lark_grammar {
-                lark_parts.push(lark.clone());
-            }
-        }
-    }
-
-    if lark_parts.is_empty() {
-        let lark_start_exp = format!("start: text\ntext[stop=\"\"]: /((?s).*?)/");
-        let mut tlg = TopLevelGrammar::from_lark(lark_start_exp);
-        tlg.max_tokens = max_tokens;
-        return tlg;
-    }
-
-    // Parse each grammar and extract start RHS + other rules
-    let mut combined_start_rhs = Vec::new();
-    let mut all_other_rules = Vec::new();
-
-    for lark in lark_parts.iter() {
-        let (start_rhs, other_rules) = parse_lark_grammar(lark);
-        combined_start_rhs.push(start_rhs);
-        all_other_rules.extend(other_rules);
-    }
+    // Determine if BOS token should be added based on tokenizer config
+    // add_bos_token == Some(true) means the tokenizer adds BOS automatically
+    let add_bos_token = tokenizer_config.add_bos_token == Some(true);
 
-    // Combine all other rules, handling duplicates
-    let combined_rules = combine_rules(all_other_rules);
-
-    // Build new grammar with direct alternation at start
-    let start_separator = format!(" {} ", &sep);
-    let start_alternation = combined_start_rhs.join(&start_separator);
-    let final_grammar = format!("start: ( {} )\n{}", start_alternation, combined_rules);
-
-    let mut top_gram = TopLevelGrammar::from_lark(final_grammar);
-    top_gram.max_tokens = max_tokens;
-    top_gram
-}
-
-/// Extract the Lark grammar string from TopLevelGrammar for debugging
-pub fn get_lark_from_top_level_grammar(gram: &TopLevelGrammar) -> String {
-    if gram.grammars.is_empty() {
-        return "No grammars".to_string();
-    }
-    let larks: Vec<String> = gram
-        .grammars
-        .iter()
-        .filter_map(|g| g.lark_grammar.as_ref())
-        .map(|s| s.clone())
-        .collect();
-    if larks.is_empty() {
-        format!("{} grammars, none have lark_grammar", gram.grammars.len())
-    } else {
-        larks.join("\n---\n")
-    }
-}
-
-/// Lark grammar TEXT pattern for common UTF-8 printable characters
-/// Excludes control characters (0x00-0x1F), DEL (0x7F), and C1 controls (0x80-0x9F)
-/// This pattern allows:
-/// - ASCII printable: space (0x20) through tilde (0x7E)
-/// - Unicode text: 0x80 onwards (Latin extended, accented chars, CJK, emoji, etc.)
-/// - Common whitespace: newline, carriage return, tab
-///
-/// ## Binary Token Matching with llguidance Matcher
-///
-/// When working with Qwen-style tool tokens (e.g., ``), llguidance uses
-/// a **byte-level lexer approach** with the following key concepts:
-///
-/// ### 1. Token-Based, Not Byte-Based
-/// The `Matcher.compute_mask()` returns a [`SimpleVob`](toktrie::SimpleVob) - a bit vector
-/// where each bit represents whether a **token ID** is allowed. This is pre-computed
-/// against the tokenizer's trie.
-///
-/// ### 2. Special Token Marker (0xFF)
-/// llguidance uses byte `0xFF` (TokTrie::SPECIAL_TOKEN_MARKER) to prefix special tokens
-/// like `<|end_of_text|>`, `<|eot_id|>`, etc. This is because:
-/// - `0xFF` is not valid UTF-8, so it never appears in regular text
-/// - In Rust: `&[u8]` can contain 0xFF, but `&str` cannot
-/// - Tokenizers like Qwen may embed special tokens as bytes like `[\xFF, b'[', b'1', b'2', b']']`
-///
-/// ### 3. Qwen Tool Call Format Example
-/// For models like Qwen3 that use `` delimiters:
-///
-/// ```lark
-/// start: tool*
-/// tool: "" "\n" func "\n" "" ("\n")*
-/// func: %json {"type":"object","properties":{"name":...}}
-/// ```
-///
-/// ### 4. Current Implementation in xInfer
-/// The [`src/core/runner.rs`](src/core/runner.rs) uses logits-based sampling:
-/// ```ignore
-/// // Apply mask: set disallowed tokens to -inf
-/// for tok in 0..vocab_size {
-///     if !mask.is_allowed(tok as u32) {
-///         row[tok] = f32::NEG_INFINITY;
-///     }
-/// }
-/// ```
-/// This is compatible with llguidance's token-level SimpleVob mask because:
-/// - `mask.is_allowed(tok)` checks if token ID `tok` is in the allowed set
-/// - The logits are modified to give -inf to disallowed tokens
-/// - Sampling then only picks from allowed tokens
-fn lark_quote(value: &str) -> String {
-    serde_json::to_string(value).unwrap_or_else(|_| "\"\"".to_string())
-}
-
-/// Build special token syntax for Lark grammar using token IDs
-/// When token IDs are available, uses <[token_id]> syntax instead of string literals
-/// This ensures alignment with the outbound parser's token-based detection
-pub fn build_special_token_tag(
-    token_ids: &std::collections::HashSet<u32>,
-    fallback: &str,
-) -> String {
-    if token_ids.is_empty() {
-        // Fall back to string representation when token IDs are not available
-        return lark_quote(fallback);
+    GuidanceTokens {
+        bos_token_ids: validated_bos,
+        eos_token_ids: validated_eos,
+        reasoning_start_ids: special_tokens.reasoning_start_ids(),
+        reasoning_end_ids: special_tokens.reasoning_end_ids(),
+        tool_call_start_ids: special_tokens.tool_call_start_ids(),
+        tool_call_end_ids: special_tokens.tool_call_end_ids(),
+        add_bos_token,
     }
-    // Convert token IDs to Lark special token syntax <[id]>
-    // The format is: <[token_id]> which matches what the tokenizer expects
-    let ids: Vec<String> = token_ids.iter().map(|id| format!("[{}]", id)).collect();
-    format!("<{}>", ids.join(","))
-}
-
-/// Build tool call start tag using token IDs when available
-pub fn build_tool_call_tag(
-    start_token_ids: &std::collections::HashSet<u32>,
-    start_token_str: &str,
-) -> String {
-    build_special_token_tag(start_token_ids, start_token_str)
-}
-
-/// Build tool call end tag using token IDs when available
-pub fn build_tool_call_end_tag(
-    end_token_ids: &std::collections::HashSet<u32>,
-    end_token_str: &str,
-) -> String {
-    build_special_token_tag(end_token_ids, end_token_str)
 }
 
-/// Compose grammars based on request constraints and optional reasoning
-/// Returns a single TopLevelGrammar with proper precedence
-/// This function takes the grammar that was built externally (with appropriate model-specific format)
-/// and handles the alternation/composition logic
-pub fn compose_grammars(
-    constraint_grammars: Vec<TopLevelGrammar>,
-    max_tokens: Option<usize>,
-    guidance_tokens: &GuidanceTokens,
-    reasoning_effort: Option<ReasoningEffort>,
-) -> TopLevelGrammar {
-    let builder = GrammarComposerBuilder::new()
-        .constraints(constraint_grammars)
-        .reasoning_effort(reasoning_effort);
-
-    let mut grammar = builder.build(guidance_tokens);
-    grammar.max_tokens = max_tokens;
-    grammar
-}
 pub type ParserFactory = LlgParserFactory;
 
 pub fn build_llg_factory(
@@ -849,37 +118,7 @@ pub fn load_toktrie_from_path(path: impl AsRef<std::path::Path>) -> Result<TokTr
 
 /// WS regex pattern for Lark grammars - matches whitespace including spaces, tabs, newlines, carriage returns
 pub fn lark_ws_regex() -> &'static str {
-    "/[ \\\\t\\\
-\\\
-]+/"
-}
-
-/// Build Lark grammar string for tool calls
-pub fn build_tool_call_lark(
-    tools: &[Tool],
-    schema_map: &std::sync::Arc<std::collections::HashMap<String, serde_json::Value>>,
-    start: &str,
-    end: &str,
-) -> String {
-    let mut obj_rules = String::new();
-    for tool in tools {
-        let name = &tool.function.name;
-        let schema_str =
-            serde_json::to_string(schema_map.get(name).unwrap_or(&json!({}))).unwrap_or_default();
-        obj_rules.push_str(&format!(
-            "obj_{}: %json {}\n",
-            name.replace("-", "_"),
-            schema_str
-        ));
-    }
-
-    format!(
-        "{} _WS? json_array _WS? {}\njson_array: \"[\" obj (\",\" obj)* \"]\"\nobj:\n_WS: {}\n{}",
-        start,
-        end,
-        lark_ws_regex(),
-        obj_rules.trim_end()
-    )
+    "/[ \\t\\n\\r]+/"
 }
 
 /// Cache for precomputed mask slices to avoid expensive re-computation
@@ -924,7 +163,17 @@ impl GuidanceState {
         factory: Arc<ParserFactory>,
         grammar: &TopLevelGrammar,
     ) -> Result<Self> {
-        let parser = factory.create_parser(grammar.clone())?;
+        use crate::utils::guidance_grammar::get_lark_from_top_level_grammar;
+        let lark = get_lark_from_top_level_grammar(grammar);
+        crate::log_info!("[llg] Composed Grammar Constraint:\n{}\n", lark);
+        let mut grammar = grammar.clone();
+        // Add generation space for EOS token to prevent overrun if max_tokens reached
+        if let Some(max_tokens) = grammar.max_tokens {
+            let bos_len = 1; // Placeholders which get compiled-out in case this ever changes
+            let eos_len = 1;
+            grammar.max_tokens = Some(max_tokens + bos_len + eos_len);
+        };
+        let parser = factory.create_parser(grammar)?;
         let matcher = Matcher::new(Ok(parser));
 
         Ok(Self {
@@ -1166,3 +415,16 @@ pub fn _early_exit_validate(
 
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_guidance_tokens() {
+        // Smoke test: a default-constructed GuidanceTokens has no BOS token IDs.
+        // `extract_guidance_tokens` itself is not exercised: no tokenizer is available in this test.
+        let tokens = GuidanceTokens::default();
+        assert!(tokens.bos_token_ids.is_empty());
+    }
+}
diff --git a/src/utils/guidance_grammar.rs b/src/utils/guidance_grammar.rs
new file mode 100644
index 00000000..22ea01f5
--- /dev/null
+++ b/src/utils/guidance_grammar.rs
@@ -0,0 +1,3574 @@
+// src/utils/guidance_grammar.rs
+//! Clean-sheet grammar generation for llguidance
+//! Handles constraints, tools, and reasoning in a simple, idiomatic way
+
+use llguidance::api::TopLevelGrammar;
+use serde_json::{json, Value};
+use std::collections::{HashMap, HashSet};
+
+use crate::server::ChatCompletionRequest;
+use crate::tools::Tool;
+use crate::server::parser::{StreamToolParser, ToolConfig};
+use crate::utils::chat_template::ChatTemplate;
+use crate::utils::config::ModelType;
+use crate::utils::config::ReasoningEffort;
+use crate::utils::guidance::GuidanceTokens;
+use tokenizers::Tokenizer;
+use crate::utils::env::default_reasoning_max_tokens;
+
+// COMMON TRAITS
+
+/// Shared interface for the grammar builders that feed llguidance.
+///
+/// Implementors only have to supply `build_lark()`; every other method has a default
+/// body that can be overridden when a grammar type needs real composition logic.
+pub trait GrammarBuilder: Clone + Default + std::fmt::Debug + Sized {
+    /// Produce this grammar's Lark source text (the one required method).
+    fn build_lark(&mut self) -> String;
+
+    /// Alternation (OR) of `self` and `other`.
+    /// The default ignores `self` and returns a clone of `other`.
+    fn compose_alternate(&mut self, other: &mut Self) -> Self {
+        Self::clone(other)
+    }
+
+    /// Sequence (AND) of `self` followed by `other`.
+    /// The default ignores `self` and returns a clone of `other`.
+    fn compose_sequence(&mut self, other: &mut Self) -> Self {
+        Self::clone(other)
+    }
+
+    /// Wrap the `build_lark()` output in a `TopLevelGrammar` via `from_lark_ascii`.
+    fn format(&mut self) -> TopLevelGrammar {
+        TopLevelGrammar::from_lark_ascii(self.build_lark().as_str())
+    }
+
+    /// Apply a token-ID substitution map.
+    /// The default ignores the map and returns a clone of `self`.
+    fn substitute_tokens(&mut self, _token_map: &TokenSubstitutionMap) -> Self {
+        Self::clone(self)
+    }
+
+    /// Look up rules by name.
+    /// The default finds nothing and returns an empty vector.
+    fn extract_rules(&mut self, _rule_names: &[&str]) -> Vec<String> {
+        Vec::default()
+    }
+}
+
+pub type TokenSubstitutionMap = HashMap<String, Vec<u32>>; // key (e.g. "reasoning_start") -> token IDs
+
+/// Shorthand for a `Result` whose error type is [`GrammarError`].
+pub type GrammarResult<T> = Result<T, GrammarError>;
+
+/// Errors reported by grammar-related operations; the `#[error]` strings are the `Display` text.
+#[derive(Debug, thiserror::Error)]
+pub enum GrammarError {
+    #[error("Invalid grammar: {0}")]
+    InvalidGrammar(String),
+    #[error("Missing constraint: {0}")]
+    MissingConstraint(String),
+    #[error("Unsupported format: {0}")]
+    UnsupportedFormat(String),
+}
+
+/// Extension trait adding sanitizing constructors to `TopLevelGrammar`.
+/// Routing construction through these keeps input sanitization uniform across call sites.
+pub trait TopLevelGrammarExt: Sized {
+    /// Build a grammar from a regex after ASCII sanitization of the pattern.
+    fn from_regex_ascii(regex: &str) -> Self;
+
+    /// Build a grammar from Lark source after ASCII sanitization of the text.
+    fn from_lark_ascii(lark: &str) -> Self;
+
+    /// Build a grammar from a JSON schema after ASCII sanitization; fallible (returns `Result`).
+    fn from_json_schema_ascii(schema: serde_json::Value) -> Result<Self, anyhow::Error>;
+
+    /// Deprecated: despite the name this applies ASCII sanitization; forwards to `from_lark_ascii`.
+    fn from_lark_utf8(lark: &str) -> Self {
+        Self::from_lark_ascii(lark)
+    }
+
+    /// Deprecated: despite the name this applies ASCII sanitization; forwards to `from_json_schema_ascii`.
+    fn from_json_schema_utf8(schema: serde_json::Value) -> Result<Self, anyhow::Error> {
+        Self::from_json_schema_ascii(schema)
+    }
+}
+
+impl TopLevelGrammarExt for TopLevelGrammar {
+    /// Drops non-ASCII characters from `regex`, then delegates to `from_regex`.
+    fn from_regex_ascii(regex: &str) -> Self {
+        Self::from_regex(&sanitize_ascii_only(regex))
+    }
+
+    /// Drops non-ASCII characters from `lark`, then delegates to `from_lark`.
+    fn from_lark_ascii(lark: &str) -> Self {
+        Self::from_lark(sanitize_ascii_only(lark))
+    }
+
+    /// Serializes `schema`, drops non-ASCII characters from the JSON text, re-parses it
+    /// and delegates to `from_json_schema`; serialization or parse errors are returned.
+    fn from_json_schema_ascii(schema: serde_json::Value) -> Result<Self, anyhow::Error> {
+        let ascii_json = sanitize_ascii_only(&serde_json::to_string(&schema)?);
+        Ok(Self::from_json_schema(serde_json::from_str(&ascii_json)?))
+    }
+}
+
+// UTILITY FUNCTIONS
+
+/// Prepare a JSON schema for llguidance: inline `$ref`s and strip metadata.
+///
+/// Top-level `$defs`/`definitions` are pulled out, `$ref`s that point at them are
+/// replaced by the referenced definition, and the result is filtered down to the
+/// keywords kept by `sanitize_sanitized_schema_recursive`.
+fn sanitize_schema_for_llguidance_recursive(schema: &Value) -> Value {
+    // Step 1: separate the definitions from the rest of the schema.
+    let (body, definitions) = extract_defs(schema);
+    // Step 2: inline every resolvable `$ref` using those definitions.
+    let inlined = resolve_schema_refs(&body, &definitions);
+    // Step 3: drop annotation-only members from the inlined schema.
+    sanitize_sanitized_schema_recursive(&inlined)
+}
+
+/// Strip annotation/metadata members (`description`, `title`, `default`, ...) from a
+/// schema whose `$ref`s have already been resolved, keeping only validation keywords.
+///
+/// Keys are filtered as keywords only where a schema is expected: the members of
+/// `properties`, `patternProperties`, `$defs` and `definitions` are user-chosen names,
+/// so all are kept and each value is sanitized as a sub-schema, while `const` and `enum`
+/// hold literal instance data and are copied verbatim.
+fn sanitize_sanitized_schema_recursive(schema: &Value) -> Value {
+    // JSON Schema validation keywords that should be KEPT
+    // Based on llguidance parser/src/json/schema.rs IMPLEMENTED and META_AND_ANNOTATIONS
+    const VALIDATION_KEYWORDS: &[&str] = &[
+        // Core
+        "anyOf", "oneOf", "allOf", "$ref", "const", "enum", "type",
+        // Array
+        "items", "additionalItems", "prefixItems", "minItems", "maxItems",
+        // Object
+        "properties", "additionalProperties", "patternProperties", "required", "minProperties", "maxProperties",
+        // String
+        "minLength", "maxLength", "pattern", "format",
+        // Number
+        "minimum", "maximum", "exclusiveMinimum", "exclusiveMaximum", "multipleOf",
+        // Schema definitions (for $ref resolution)
+        "$defs", "definitions", "$anchor",
+    ];
+    // Keywords whose value maps arbitrary names to sub-schemas.
+    const SCHEMA_MAPS: &[&str] = &["properties", "patternProperties", "$defs", "definitions"];
+    // Keywords whose value is literal data rather than a schema.
+    const LITERALS: &[&str] = &["const", "enum"];
+
+    match schema {
+        Value::Object(map) => {
+            let mut out = serde_json::Map::new();
+            for (key, value) in map {
+                let kept = match (key.as_str(), value) {
+                    (k, Value::Object(named)) if SCHEMA_MAPS.contains(&k) => Value::Object(
+                        named
+                            .iter()
+                            .map(|(n, s)| (n.clone(), sanitize_sanitized_schema_recursive(s)))
+                            .collect(),
+                    ),
+                    (k, _) if LITERALS.contains(&k) => value.clone(),
+                    (k, _) if VALIDATION_KEYWORDS.contains(&k) => sanitize_sanitized_schema_recursive(value),
+                    // Anything else is metadata/annotation and is dropped.
+                    _ => continue,
+                };
+                out.insert(key.clone(), kept);
+            }
+            Value::Object(out)
+        }
+        Value::Array(items) => Value::Array(items.iter().map(sanitize_sanitized_schema_recursive).collect()),
+        _ => schema.clone(),
+    }
+}
+
+/// Public entry point for schema sanitization.
+/// Forwards to `sanitize_schema_for_llguidance_recursive` without further processing.
+pub fn sanitize_schema_for_llguidance(schema: &Value) -> Value {
+    sanitize_schema_for_llguidance_recursive(schema)
+}
+
+/// Return `s` with every non-ASCII character removed.
+///
+/// ASCII characters, control characters included, are kept in their original order;
+/// nothing is substituted for the characters that are dropped.
+pub fn sanitize_ascii_only(s: &str) -> String {
+    s.chars().filter(char::is_ascii).collect()
+}
+
+/// Resolve `$ref` references by inlining the matching definitions from `defs`.
+///
+/// Only an object whose single key is `$ref` is replaced; the pointer may be written as
+/// `#/$defs/X`, `$defs/X`, `#/definitions/X` or `definitions/X`. `$defs`/`definitions`
+/// members are dropped from the output. A reference with no matching definition, or to a
+/// definition that is already being expanded (a recursive schema), is left in place, so
+/// self-referential definitions terminate instead of recursing until the stack overflows.
+fn resolve_schema_refs(schema: &Value, defs: &HashMap<String, Value>) -> Value {
+    /// Strip a supported pointer prefix, yielding the bare definition name.
+    fn definition_name(ref_path: &str) -> Option<&str> {
+        ["#/$defs/", "$defs/", "#/definitions/", "definitions/"]
+            .iter()
+            .find_map(|prefix| ref_path.strip_prefix(*prefix))
+    }
+
+    /// `active` holds the names of the definitions currently being expanded.
+    fn resolve_recursive<'a>(
+        schema: &'a Value,
+        defs: &'a HashMap<String, Value>,
+        active: &mut Vec<&'a str>,
+    ) -> Value {
+        match schema {
+            Value::Object(map) => {
+                // A lone `$ref` is swapped for its definition unless that would loop.
+                if map.len() == 1 {
+                    let target = map.get("$ref").and_then(Value::as_str).and_then(definition_name);
+                    if let Some(name) = target.filter(|name| !active.contains(name)) {
+                        if let Some(def) = defs.get(name) {
+                            active.push(name);
+                            let inlined = resolve_recursive(def, defs, active);
+                            active.pop();
+                            return inlined;
+                        }
+                    }
+                }
+                // Otherwise copy every member except the definition containers.
+                let mut out = serde_json::Map::new();
+                for (key, value) in map {
+                    if key != "$defs" && key != "definitions" {
+                        out.insert(key.clone(), resolve_recursive(value, defs, active));
+                    }
+                }
+                Value::Object(out)
+            }
+            Value::Array(items) => Value::Array(
+                items.iter().map(|item| resolve_recursive(item, defs, active)).collect(),
+            ),
+            _ => schema.clone(),
+        }
+    }
+
+    resolve_recursive(schema, defs, &mut Vec::new())
+}
+
+/// Split the top-level `$defs` / `definitions` members off a schema.
+/// Returns `(schema_without_defs, defs_map)`; a non-object schema comes back unchanged
+/// with an empty map. Nested definition containers are not inspected.
+fn extract_defs(schema: &Value) -> (Value, HashMap<String, Value>) {
+    let members = match schema {
+        Value::Object(members) => members,
+        other => return (other.clone(), HashMap::new()),
+    };
+    let mut definitions = HashMap::new();
+    let mut remainder = serde_json::Map::new();
+    for (key, value) in members {
+        match (key.as_str(), value) {
+            ("$defs" | "definitions", Value::Object(entries)) => {
+                definitions.extend(entries.iter().map(|(n, d)| (n.clone(), d.clone())));
+            }
+            // A definition container that is not an object is discarded.
+            ("$defs" | "definitions", _) => {}
+            _ => {
+                remainder.insert(key.clone(), value.clone());
+            }
+        }
+    }
+    (Value::Object(remainder), definitions)
+}
+
+/// Quote `value` as a Lark literal: non-ASCII characters are dropped, then JSON string escaping is applied.
+pub fn lark_quote(value: &str) -> String {
+    let literal = serde_json::to_string(&sanitize_ascii_only(value));
+    literal.unwrap_or_else(|_| "\"\"".to_string())
+}
+
+/// Build the Lark special-token reference for a set of token IDs: `<[7]>` for one ID,
+/// `<[3,7,9]>` for several (one bracket pair, comma-separated, ascending so the grammar
+/// text is deterministic despite `HashSet` ordering). An empty set yields an empty string.
+pub fn lark_special_token(token_ids: &HashSet<u32>) -> String {
+    if token_ids.is_empty() {
+        return String::new();
+    }
+    let mut ids: Vec<u32> = token_ids.iter().copied().collect();
+    ids.sort_unstable();
+    format!("<[{}]>", ids.iter().map(u32::to_string).collect::<Vec<_>>().join(","))
+}
+
+// REASONING GRAMMAR - ReasoningEffort is defined in utils/config.rs
+
+/// Grammar for a delimited reasoning block: the start/end special-token IDs plus the
+/// `ReasoningEffort` that selects which template is generated.
+#[derive(Clone, Debug)]
+pub struct ReasoningGrammar {
+    pub start_token_id: u32,
+    pub end_token_id: u32,
+    pub effort: ReasoningEffort,
+}
+
+impl Default for ReasoningGrammar {
+    /// Both token IDs are 0 and the effort is `ReasoningEffort::None`.
+    fn default() -> Self {
+        let (start_token_id, end_token_id, effort) = (0, 0, ReasoningEffort::None);
+        Self { start_token_id, end_token_id, effort }
+    }
+}
+
+impl ReasoningGrammar {
+    pub fn new(start_id: u32, end_id: u32, effort: ReasoningEffort) -> Self {
+        Self {
+            start_token_id: start_id,
+            end_token_id: end_id,
+            effort,
+        }
+    }
+
+    /// Render the Lark grammar template for `self.effort`, splicing `start_id` and `end_id`
+    /// into the `<[id]>` special-token references that open and close the reasoning block.
+    fn generate_grammar(&self, start_id: u32, end_id: u32) -> String {
+        match &self.effort {
+            ReasoningEffort::None => {
+                // Emits an empty reasoning block: start token, "\n\n", end token, "\n".
+                // No think_text rule is defined, so no reasoning content can be generated.
+                format!(
+                    r#"start: reasoning_block
+reasoning_block: <[{start_id}]> "\n\n" <[{end_id}]> "\n"
+"#
+                )
+            }
+            ReasoningEffort::ModelDefault => {
+                let rmt = default_reasoning_max_tokens();
+                format!(
+                    r#"start: reasoning_block
+reasoning_block: <[{start_id}]> "\n" think_text? "\n" <[{end_id}]> "\n"
+think_text[temperature=0, max_tokens={rmt}]: /(?s:.+?)/
+"#
+                )
+            }
+            ReasoningEffort::Low => {
+                format!(
+                    r#"start: reasoning_block
+reasoning_block: <[{start_id}]> "\n" think_text "\n" (think_text+ "\n")? <[{end_id}]> "\n"
+think_text[temperature=0.3, max_tokens=256]: /(?s:.+?)/
+"#
+                )
+            }
+            ReasoningEffort::Medium => {
+                format!(
+                    r#"start: reasoning_block
+reasoning_block: <[{start_id}]> "\n" think_text "\n" <[{end_id}]> "\n"
+think_text[temperature=0.5, max_tokens=768]: /(?s:.+?)/
+"#
+                )
+            }
+            ReasoningEffort::High => {
+                format!(
+                    r#"start: reasoning_block
+reasoning_block: <[{start_id}]> analysis_block critique_block structure_block "\n" <[{end_id}]> "\n"
+analysis_block: "\n<analysis>\n" analysis_text
+analysis_text[suffix="\n</analysis>\n", temperature=0.8, max_tokens=512]: /(?s:.+?)/
+critique_block: "\n<critique>\n" critique_text
+critique_text[suffix="\n</critique>\n", temperature=0, max_tokens=512]: /(?s:.+?)/
+structure_block: "\n<structure_response>\n" structure_text
+structure_text[suffix="\n</structure_response>\n", temperature=0.8, max_tokens=512]: /(?s:.+?)/
+"#
+                )
+            }
+            ReasoningEffort::ChainOfThought => {
+                format!(
+                    r#"start: reasoning_block
+reasoning_block: <[{start_id}]> draft_block verification_block critique_block structure_block "\n" <[{end_id}]> "\n"
+draft_block: "\n<draft>\nCardinalities of concern, intended outcomes, and structures of consideration:\n" draft_text
+draft_text[suffix="\n</draft>\n", temperature=0.8, max_tokens=768]: /(?s:.+?)/
+verification_block: "\n<verify>\nQuestions, assumptions, and suppositions:\n" verification_text
+verification_text[suffix="\n</verify>\n", temperature=0, max_tokens=768]: /(?s:.+?)/
+critique_block: "\n<critique>\nAdversarial assessment of evaluation:\n" critique_text
+critique_text[suffix="\n</critique>\n", temperature=0.6, max_tokens=768]: /(?s:.+?)/
+structure_block: "\n<structure_response>\n" structure_text
+structure_text[suffix="\n</structure_response>\n", temperature=0.8, max_tokens=768]: /(?s:.+?)/
+"#
+                )
+            }
+            #[cfg(all(not(feature = "python"), not(feature = "pyo3")))]
+            ReasoningEffort::Custom(template) => {
+                // Caller-supplied template, used as-is apart from placeholder substitution:
+                // every "$START_ID" / "$END_ID" occurrence is replaced by the decimal token ID.
+                template
+                    .replace("$START_ID", &start_id.to_string())
+                    .replace("$END_ID", &end_id.to_string())
+            }
+        }
+    }
+}
+
+impl GrammarBuilder for ReasoningGrammar {
+    /// Render the reasoning grammar; yields an empty string when the effort is `None`.
+    fn build_lark(&mut self) -> String {
+        if self.effort == ReasoningEffort::None {
+            return String::new();
+        }
+        self.generate_grammar(self.start_token_id, self.end_token_id)
+    }
+
+    /// Prefer `other` when it enables reasoning, otherwise keep `self`.
+    fn compose_sequence(&mut self, other: &mut Self) -> Self {
+        if other.effort != ReasoningEffort::None {
+            return other.clone();
+        }
+        self.clone()
+    }
+
+    /// Convert to a `TopLevelGrammar`, using a free-text grammar when `build_lark()` is empty.
+    fn format(&mut self) -> TopLevelGrammar {
+        let lark = self.build_lark();
+        if lark.is_empty() {
+            // The fallback needs a real `start` rule that references `text`.
+            TopLevelGrammar::from_lark_ascii("start: text\ntext: /(?s:.+?)/")
+        } else {
+            TopLevelGrammar::from_lark_ascii(&lark)
+        }
+    }
+
+    /// Copy `self`, taking the first "reasoning_start"/"reasoning_end" IDs from `token_map` if present.
+    fn substitute_tokens(&mut self, token_map: &TokenSubstitutionMap) -> Self {
+        let first_id = |key: &str| token_map.get(key).and_then(|ids| ids.first().copied());
+        let mut new = self.clone();
+        if let Some(id) = first_id("reasoning_start") {
+            new.start_token_id = id;
+        }
+        if let Some(id) = first_id("reasoning_end") {
+            new.end_token_id = id;
+        }
+        new
+    }
+
+    /// Collect, trimmed, each built-grammar line that starts with one of `rule_names` (prefix match).
+    fn extract_rules(&mut self, rule_names: &[&str]) -> Vec<String> {
+        let lark = self.build_lark();
+        let mut rules = Vec::new();
+        for line in lark.lines() {
+            // A line is pushed once per matching name, as a plain prefix comparison.
+            let hits = rule_names.iter().filter(|name| line.starts_with(**name)).count();
+            rules.extend(vec![line.trim().to_string(); hits]);
+        }
+        rules
+    }
+}
+
+// CHAT RESPONSE GRAMMAR
+
+#[derive(Clone, Debug)]
+pub struct ChatResponseGrammar { // Options for the free-form chat text grammar.
+    pub eos_termination: bool, // when true, the grammar is the plain `text` rule with no attributes
+    pub max_tokens: Option<usize>, // emitted as `max_tokens=` only when `eos_termination` is false
+}
+
+impl Default for ChatResponseGrammar {
+    fn default() -> Self { // Defaults: no EOS termination, no token cap.
+        Self {
+            eos_termination: false,
+            max_tokens: None,
+        }
+    }
+}
+
+impl ChatResponseGrammar {
+    pub fn new() -> Self { // Same as `Default::default()`.
+        Self::default()
+    }
+    pub fn with_eos(self, eos: bool) -> Self { // Builder step: replaces `eos_termination`, keeps the rest.
+        Self {
+            eos_termination: eos,
+            ..self
+        }
+    }
+    pub fn with_max_tokens(self, max: usize) -> Self { // Builder step: sets `max_tokens` to `Some(max)`.
+        Self {
+            max_tokens: Some(max),
+            ..self
+        }
+    }
+}
+
+impl GrammarBuilder for ChatResponseGrammar { // Lark generation for unconstrained chat text.
+    fn build_lark(&mut self) -> String { // Three shapes: plain, `stop=""`, or `stop=""` plus `max_tokens`.
+        if self.eos_termination { // NOTE(review): `max_tokens` is ignored on this branch - confirm intended
+            r#"start: text
+text: /(?s:.+?)/"#
+                .to_string()
+        } else {
+            if let Some(max_tokens) = self.max_tokens {
+                format!(r#"start: text
+text[stop="", max_tokens={}]: /(?s:.+?)/"#, max_tokens)
+            } else {
+                r#"start: text
+text[stop=""]: /(?s:.+?)/"#
+                .to_string()
+            }
+        }
+    }
+    fn compose_alternate(&mut self, _other: &mut Self) -> Self { // `_other` is ignored; returns a copy of `self`.
+        self.clone()
+    }
+    fn compose_sequence(&mut self, other: &mut Self) -> Self { // Merges both option sets.
+        Self {
+            eos_termination: self.eos_termination || other.eos_termination, // enabled if either side enables it
+            max_tokens: self.max_tokens.or(other.max_tokens), // `self`'s cap wins; `other`'s is the fallback
+        }
+    }
+    fn format(&mut self) -> TopLevelGrammar { // Wraps `build_lark()` output as a top-level grammar.
+        TopLevelGrammar::from_lark_ascii(&self.build_lark())
+    }
+    fn substitute_tokens(&mut self, _token_map: &TokenSubstitutionMap) -> Self { // No token ids here: plain copy.
+        self.clone()
+    }
+    fn extract_rules(&mut self, _rule_names: &[&str]) -> Vec<String> { // Always empty for this grammar.
+        Vec::new()
+    }
+}
+
+// STRUCTURED CONSTRAINTS
+
+#[derive(Clone, Debug)]
+pub enum StructuredConstraint { // One structured-output constraint; `build_lark` renders each variant.
+    Choice(Vec<String>), // alternation of literal strings
+    Regex(String), // pattern placed between `/.../` as-is
+    Json(Value), // JSON schema rendered through `%json`
+    Lark(String), // Lark source used verbatim
+    StructuralTag(StructuralTagConfig), // content wrapped in a literal start and end tag
+}
+
+#[derive(Clone, Debug)]
+pub struct StructuralTagConfig { // Settings for `StructuredConstraint::StructuralTag`.
+    pub start_tag: String, // literal emitted before the content
+    pub end_tag: String, // literal emitted after the content
+    pub schema: Value, // NOTE(review): not read by `StructuredConstraint::build_lark`
+}
+
+impl StructuredConstraint {
+    pub fn build_lark(&mut self) -> String { // Renders the constraint as Lark source with a `start` rule.
+        match self {
+            StructuredConstraint::Choice(choices) => {
+                let mut parts = Vec::with_capacity(choices.len());
+                for choice in choices {
+                    if !choice.is_empty() { // empty choices are dropped
+                        parts.push(lark_quote(choice));
+                    }
+                }
+                format!("start: {}\n", parts.join(" | ")) // NOTE(review): right-hand side is empty when no choice survives
+            }
+            StructuredConstraint::Regex(pattern) => format!( // NOTE(review): `pattern` is inserted unescaped, including any `/`
+                r#"start: text
+text: /{}/"#,
+                pattern
+            ),
+            StructuredConstraint::Json(schema) => {
+                let sanitized = sanitize_schema_for_llguidance(schema);
+                let schema_str = serde_json::to_string(&sanitized).unwrap_or_default(); // serialization failure yields ""
+                format!(
+                    r#"start: text
+text: %json {}"#,
+                    schema_str
+                )
+            }
+            StructuredConstraint::Lark(grammar) => grammar.clone(),
+            StructuredConstraint::StructuralTag(config) => { // content is limited to printable ASCII, LF and CR
+                let start_tag = lark_quote(&config.start_tag);
+                let end_tag = lark_quote(&config.end_tag);
+                format!(
+                    r#"start: text
+text: {} content {}
+content: /[\x20-\x7E\x0A\x0D]+?/"#,
+                    start_tag, end_tag
+                )
+            }
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct StructuredOutputsGrammar { // Grammar builder backed by a single `StructuredConstraint`.
+    pub constraint: StructuredConstraint, // the constraint rendered by `build_lark`
+}
+
+impl Default for StructuredOutputsGrammar {
+    fn default() -> Self { // Default is an empty Lark source.
+        Self {
+            constraint: StructuredConstraint::Lark(String::new()),
+        }
+    }
+}
+
+impl StructuredOutputsGrammar {
+    pub fn new(constraint: StructuredConstraint) -> Self { // Wraps the given constraint.
+        Self { constraint }
+    }
+}
+
+impl GrammarBuilder for StructuredOutputsGrammar { // Composition works line-by-line on the rendered Lark text.
+    fn build_lark(&mut self) -> String { // Delegates to the wrapped constraint.
+        self.constraint.build_lark()
+    }
+    fn compose_alternate(&mut self, other: &mut Self) -> Self { // Result is a `Lark` constraint offering either start expression.
+        // Extract the constraint Lark strings
+        let this_lark = self.build_lark();
+        let other_lark = other.build_lark();
+
+        // Extract start RHS from both (the part after "start: "); only the first line is inspected
+        let this_start = this_lark
+            .lines()
+            .next()
+            .and_then(|l| l.strip_prefix("start: "))
+            .unwrap_or("text"); // first line is not a `start: ` rule: fall back to `text`
+        let other_start = other_lark
+            .lines()
+            .next()
+            .and_then(|l| l.strip_prefix("start: "))
+            .unwrap_or("text");
+
+        // Combine the start alternatives
+        let combined_start = format!("( {} | {} )+", this_start, other_start); // NOTE(review): the `+` allows repetition - confirm intended
+
+        // Extract non-start rules from both grammars, deduplicate and filter empty lines
+        let this_rules: Vec<String> = this_lark
+            .lines()
+            .skip(1)
+            .filter(|l| !l.trim().is_empty() && l.contains(':') && !l.trim().starts_with("start:")) // lines without ':' are dropped
+            .map(|s| s.trim().to_string())
+            .collect();
+
+        let other_rules: Vec<String> = other_lark
+            .lines()
+            .skip(1)
+            .filter(|l| !l.trim().is_empty() && l.contains(':') && !l.trim().starts_with("start:"))
+            .map(|s| s.trim().to_string())
+            .collect();
+
+        // Combine all rules and deduplicate by exact line text, keeping first occurrences in order.
+        let all_rules: Vec<String> = [this_rules, other_rules].concat();
+        let mut seen = std::collections::HashSet::new();
+        let unique_rules: Vec<String> = all_rules
+            .into_iter()
+            .filter(|l| { // NOTE(review): same rule name with a different body is kept twice
+                if seen.contains(l) {
+                    false
+                } else {
+                    seen.insert(l.clone());
+                    true
+                }
+            })
+            .collect();
+
+        let combined_rules = unique_rules.join("\n");
+
+        Self {
+            constraint: StructuredConstraint::Lark(format!(
+                "start: {}\n{}",
+                combined_start, combined_rules
+            )),
+        }
+    }
+
+    fn compose_sequence(&mut self, other: &mut Self) -> Self { // Result is a `Lark` constraint: `self`'s start then `other`'s.
+        let this_lark = self.build_lark();
+        let other_lark = other.build_lark();
+
+        // Parse both grammars and combine rules, deduplicating
+        let this_lines: Vec<&str> = this_lark.lines().collect();
+        let other_lines: Vec<&str> = other_lark.lines().collect();
+
+        // Extract start rules and other rules from both
+        let this_start = this_lines
+            .first()
+            .and_then(|l| l.strip_prefix("start: "))
+            .unwrap_or(""); // unlike compose_alternate, a missing start contributes nothing
+        let other_start = other_lines
+            .first()
+            .and_then(|l| l.strip_prefix("start: "))
+            .unwrap_or("");
+
+        // Combine start rules
+        let combined_start = format!("{} {}", this_start, other_start).trim().to_string(); // trim covers an empty side
+
+        // Collect all non-start rules from both, deduplicating
+        let mut seen = std::collections::HashSet::new();
+        let mut all_rules: Vec<String> = Vec::new();
+
+        for line in this_lines.iter().skip(1) { // every non-empty line after the first is kept, whatever it contains
+            let trimmed = line.trim();
+            if !trimmed.is_empty() && !seen.contains(trimmed) {
+                seen.insert(trimmed.to_string());
+                all_rules.push(trimmed.to_string());
+            }
+        }
+
+        for line in other_lines.iter().skip(1) {
+            let trimmed = line.trim();
+            if !trimmed.is_empty() && !seen.contains(trimmed) {
+                seen.insert(trimmed.to_string());
+                all_rules.push(trimmed.to_string());
+            }
+        }
+
+        let combined_rules = all_rules.join("\n");
+
+        Self {
+            constraint: StructuredConstraint::Lark(format!(
+                "start: {}\n{}",
+                combined_start, combined_rules
+            )),
+        }
+    }
+    fn format(&mut self) -> TopLevelGrammar { // Wraps `build_lark()` output as a top-level grammar.
+        TopLevelGrammar::from_lark_ascii(&self.build_lark())
+    }
+    fn substitute_tokens(&mut self, _token_map: &TokenSubstitutionMap) -> Self { // No token ids here: plain copy.
+        self.clone()
+    }
+    fn extract_rules(&mut self, _rule_names: &[&str]) -> Vec<String> { // Always empty for this grammar.
+        Vec::new()
+    }
+}
+
+impl StructuredOutputsGrammar {} // NOTE(review): empty inherent impl; it adds no items.
+
+// RESPONSE FORMAT GRAMMAR
+
+#[derive(Clone, Debug)]
+pub struct ResponseFormatGrammar { // Response-format request rendered as a `%json` grammar.
+    pub format_type: String, // "json_schema" or "json_object"; any other value renders an empty grammar
+    pub schema: Option<Value>, // read only when `format_type` is "json_schema"
+}
+
+impl Default for ResponseFormatGrammar {
+    fn default() -> Self { // Default is "json_object" with no schema.
+        Self {
+            format_type: "json_object".to_string(),
+            schema: None,
+        }
+    }
+}
+
+impl ResponseFormatGrammar {
+    pub fn new_json_schema(schema: Value) -> Self { // "json_schema" format constrained by `schema`.
+        Self {
+            format_type: "json_schema".to_string(),
+            schema: Some(schema),
+        }
+    }
+    pub fn new_json_object() -> Self { // "json_object" format; same value as `Default`.
+        Self {
+            format_type: "json_object".to_string(),
+            schema: None,
+        }
+    }
+}
+
+impl GrammarBuilder for ResponseFormatGrammar { // Lark generation for JSON response formats.
+    fn build_lark(&mut self) -> String { // Returns "" for an unknown format or a missing schema.
+        match self.format_type.as_str() {
+            "json_schema" => {
+                if let Some(schema) = &self.schema {
+                    let sanitized = sanitize_schema_for_llguidance(schema);
+                    let schema_str = serde_json::to_string(&sanitized).unwrap_or_default(); // serialization failure yields ""
+                    format!(
+                        r#"start: text
+text: %json {}"#,
+                        schema_str
+                    )
+                } else {
+                    String::new()
+                }
+            }
+            "json_object" => r#"start: text
+text: %json {"type":"object"}"#
+                .to_string(),
+            _ => String::new(),
+        }
+    }
+    fn compose_alternate(&mut self, _other: &mut Self) -> Self { // `_other` is ignored; returns a copy of `self`.
+        self.clone()
+    }
+    fn compose_sequence(&mut self, other: &mut Self) -> Self { // The later grammar replaces `self`.
+        other.clone()
+    }
+    fn format(&mut self) -> TopLevelGrammar { // Wraps `build_lark()` output, including the empty case.
+        TopLevelGrammar::from_lark_ascii(&self.build_lark())
+    }
+    fn substitute_tokens(&mut self, _token_map: &TokenSubstitutionMap) -> Self { // No token ids here: plain copy.
+        self.clone()
+    }
+    fn extract_rules(&mut self, _rule_names: &[&str]) -> Vec<String> { // Always empty for this grammar.
+        Vec::new()
+    }
+}
+
+// CONSTRAINT GRAMMAR
+
+#[derive(Clone, Debug)]
+pub struct ConstraintGrammar { // A constraint given as text plus a tag saying how to read it.
+    pub constraint_type: String, // "regex", "lark" or "json_schema"; any other value renders an empty grammar
+    pub content: String, // the regex pattern, Lark source, or JSON schema text
+}
+
+impl Default for ConstraintGrammar {
+    fn default() -> Self { // Default is a "regex" constraint with empty content.
+        Self {
+            constraint_type: "regex".to_string(),
+            content: String::new(),
+        }
+    }
+}
+
+impl ConstraintGrammar {
+    pub fn new_regex(content: String) -> Self { // `content` is a regex pattern.
+        Self {
+            constraint_type: "regex".to_string(),
+            content,
+        }
+    }
+    pub fn new_lark(content: String) -> Self { // `content` is Lark source used verbatim.
+        Self {
+            constraint_type: "lark".to_string(),
+            content,
+        }
+    }
+    pub fn new_json_schema(content: String) -> Self { // `content` is JSON schema text, parsed in `build_lark`.
+        Self {
+            constraint_type: "json_schema".to_string(),
+            content,
+        }
+    }
+}
+
+impl GrammarBuilder for ConstraintGrammar { // Lark generation driven by `constraint_type`.
+    fn build_lark(&mut self) -> String { // Returns "" for an unknown type or unparsable JSON schema.
+        match self.constraint_type.as_str() {
+            "regex" => format!( // NOTE(review): `content` is inserted unescaped, including any `/`
+                r#"start: text
+text: /{}/"#,
+                self.content
+            ),
+            "lark" => self.content.clone(),
+            "json_schema" => {
+                if let Ok(schema) = serde_json::from_str(&self.content) { // parse errors are swallowed
+                    let sanitized = sanitize_schema_for_llguidance(&schema);
+                    let schema_str = serde_json::to_string(&sanitized).unwrap_or_default();
+                    format!(
+                        r#"start: text
+text: %json {}"#,
+                        schema_str
+                    )
+                } else {
+                    String::new()
+                }
+            }
+            _ => String::new(),
+        }
+    }
+    fn compose_alternate(&mut self, _other: &mut Self) -> Self { // `_other` is ignored; returns a copy of `self`.
+        self.clone()
+    }
+    fn compose_sequence(&mut self, other: &mut Self) -> Self { // The later grammar replaces `self`.
+        other.clone()
+    }
+    fn format(&mut self) -> TopLevelGrammar { // Wraps `build_lark()` output, including the empty case.
+        TopLevelGrammar::from_lark_ascii(&self.build_lark())
+    }
+    fn substitute_tokens(&mut self, _token_map: &TokenSubstitutionMap) -> Self { // No token ids here: plain copy.
+        self.clone()
+    }
+    fn extract_rules(&mut self, _rule_names: &[&str]) -> Vec<String> { // Always empty for this grammar.
+        Vec::new()
+    }
+}
+
+// TOOL CALL GRAMMAR
+
+#[derive(Clone, Debug)]
+pub enum ToolFormat { // Selects which tool-call grammar builder `ToolCallGrammar::build_lark` runs.
+    QwenCoder, // `build_qwen_coder_lark`
+    MiniMax, // `build_minimax_lark`
+    Json, // `build_json_lark`
+    Generic, // `build_generic_lark`
+    Gemma4, // `build_gemma4_lark`
+}
+
+#[derive(Clone, Debug)]
+pub struct ToolCallGrammar { // Grammar builder for a tool call wrapped in start/end special tokens.
+    pub tools: Vec<Tool>, // tools whose names and parameter schemas shape the grammar
+    pub start_token_id: u32, // emitted as `<[id]>` before the call
+    pub end_token_id: u32, // emitted as `<[id]>` after the call
+    pub format: ToolFormat, // which builder `build_lark` dispatches to
+    value_rules: HashMap<String, String>, // rule name -> rule body, rendered by `build_value_rules`
+}
+
+impl Default for ToolCallGrammar {
+    fn default() -> Self { // No tools, token ids 0, JSON format.
+        Self {
+            tools: Vec::new(),
+            start_token_id: 0,
+            end_token_id: 0,
+            format: ToolFormat::Json,
+            value_rules: HashMap::new(),
+        }
+    }
+}
+
+impl ToolCallGrammar { // Constructors: identical except for the `format` they select.
+    pub fn new_generic(tools: Vec<Tool>, start_token_id: u32, end_token_id: u32) -> Self { // `ToolFormat::Generic`
+        Self {
+            tools,
+            start_token_id,
+            end_token_id,
+            format: ToolFormat::Generic,
+            value_rules: HashMap::new(),
+        }
+    }
+    pub fn new_qwen_coder(tools: Vec<Tool>, start_token_id: u32, end_token_id: u32) -> Self { // `ToolFormat::QwenCoder`
+        Self {
+            tools,
+            start_token_id,
+            end_token_id,
+            format: ToolFormat::QwenCoder,
+            value_rules: HashMap::new(),
+        }
+    }
+    pub fn new_minimax(tools: Vec<Tool>, start_token_id: u32, end_token_id: u32) -> Self { // `ToolFormat::MiniMax`
+        Self {
+            tools,
+            start_token_id,
+            end_token_id,
+            format: ToolFormat::MiniMax,
+            value_rules: HashMap::new(),
+        }
+    }
+    pub fn new_json(tools: Vec<Tool>, start_token_id: u32, end_token_id: u32) -> Self { // `ToolFormat::Json`
+        Self {
+            tools,
+            start_token_id,
+            end_token_id,
+            format: ToolFormat::Json,
+            value_rules: HashMap::new(),
+        }
+    }
+    pub fn _new_gemma4(tools: Vec<Tool>, start_token_id: u32, end_token_id: u32) -> Self { // `ToolFormat::Gemma4`
+        Self {
+            tools,
+            start_token_id,
+            end_token_id,
+            format: ToolFormat::Gemma4,
+            value_rules: HashMap::new(),
+        }
+    }
+
+    /// Build ToolCallGrammar with model-aware selection using StreamToolParser for consistent parser name determination
+    /// This ensures grammar generation aligns with the streaming parser's format selection
+    pub fn for_model_type(
+        tools: Vec<Tool>,
+        start_token_id: u32,
+        end_token_id: u32,
+        model_type: &ModelType,
+        model_id: &str,
+    ) -> ToolCallGrammar {
+        // Use StreamToolParser::parser_name_for_model() to determine the parser name
+        let parser_name = StreamToolParser::parser_name_for_model(model_type, model_id);
+
+        // Map parser name to ToolFormat
+        let format = Self::tool_format_for_parser_name(parser_name);
+
+        ToolCallGrammar {
+            tools,
+            start_token_id,
+            end_token_id,
+            format,
+            value_rules: HashMap::new(),
+        }
+    }
+
+    /// Map parser name to ToolFormat variant
+    /// Parsers without explicit grammars default to Generic
+    fn tool_format_for_parser_name(parser_name: &str) -> ToolFormat { // NOTE(review): no name maps to `ToolFormat::Gemma4`
+        match parser_name {
+            "qwen_coder" => ToolFormat::QwenCoder,
+            "minimax_m2" => ToolFormat::MiniMax,
+            "json" => ToolFormat::Json,
+            _ => ToolFormat::Generic,
+        }
+    }
+}
+
+impl GrammarBuilder for ToolCallGrammar { // Lark generation for tool calls.
+    fn build_lark(&mut self) -> String { // Dispatches on `self.format`; every variant is handled.
+        match self.format {
+            ToolFormat::QwenCoder => self.build_qwen_coder_lark(),
+            ToolFormat::MiniMax   => self.build_minimax_lark(),
+            ToolFormat::Gemma4    => self.build_gemma4_lark(),
+            ToolFormat::Json      => self.build_json_lark(),
+            ToolFormat::Generic   => self.build_generic_lark(),
+        }
+    }
+    fn compose_alternate(&mut self, _other: &mut Self) -> Self { // `_other` is ignored; returns a copy of `self`.
+        self.clone()
+    }
+    fn compose_sequence(&mut self, other: &mut Self) -> Self { // The later grammar replaces `self`.
+        other.clone()
+    }
+    fn format(&mut self) -> TopLevelGrammar { // Wraps `build_lark()` output as a top-level grammar.
+        TopLevelGrammar::from_lark_ascii(&self.build_lark())
+    }
+    fn substitute_tokens(&mut self, token_map: &TokenSubstitutionMap) -> Self { // Copy with ids from "tool_start"/"tool_end".
+        let mut new = self.clone();
+        if let Some(ids) = token_map.get("tool_start") {
+            if let Some(&id) = ids.first() { // only the first id of the entry is used
+                new.start_token_id = id;
+            }
+        }
+        if let Some(ids) = token_map.get("tool_end") {
+            if let Some(&id) = ids.first() {
+                new.end_token_id = id;
+            }
+        }
+        new
+    }
+    fn extract_rules(&mut self, rule_names: &[&str]) -> Vec<String> { // Trimmed grammar lines starting with any given name.
+        let lark = self.build_lark();
+        let mut rules = Vec::new();
+        for line in lark.lines() {
+            for rule_name in rule_names { // prefix match: a line is pushed once per matching name
+                if line.starts_with(*rule_name) {
+                    rules.push(line.trim().to_string());
+                }
+            }
+        }
+        rules
+    }
+}
+
+impl ToolCallGrammar {
+    pub fn build_generic_lark(&mut self) -> String { // Generic format: free text, wrapped in the tool tokens when tools exist.
+        if self.tools.is_empty() { // no tools: plain text without the token envelope
+            r#"start: text
+ text: /(?s:.+?)/
+"#
+            .to_string()
+        } else { // with tools: start token, any text, end token
+            format!(
+                r#"start: tool_call
+tool_call: <[{}]> text <[{}]>
+text: /(?s:.+?)/
+"#,
+                self.start_token_id, self.end_token_id
+            )
+        }
+    }
+
+    fn build_json_lark(&mut self) -> String { // JSON format: `%json` payload between the start and end tokens.
+        let start_tag = format!("<[{}]>", self.start_token_id);
+        let end_tag = format!("<[{}]>", self.end_token_id);
+        let payload_schema = if self.tools.is_empty() { // no tools: any JSON object is accepted
+            serde_json::json!({ "type": "object" })
+        } else {
+            let variants: Vec<Value> = self.tools.iter().map(|tool| { // one `{name, arguments}` object schema per tool
+                let arguments_schema = sanitize_schema_for_llguidance(&tool.function.parameters);
+                serde_json::json!({
+                    "type": "object",
+                    "properties": { "name": { "type": "string", "enum": [tool.function.name.clone()] }, "arguments": arguments_schema },
+                    "required": ["name", "arguments"], "additionalProperties": false,
+                })
+            }).collect();
+            if variants.len() == 1 { // a single tool needs no `anyOf` wrapper
+                variants[0].clone()
+            } else {
+                serde_json::json!({ "anyOf": variants })
+            }
+        };
+        let payload_schema_str = serde_json::to_string(&payload_schema).unwrap_or_default(); // serialization failure yields ""
+        format!(
+            r#"start: tool_call
+ tool_call: {} tool_content {}
+ tool_content: %json {}"#,
+            start_tag, end_tag, payload_schema_str
+        )
+    }
+
+    fn build_gemma4_lark(&mut self) -> String { // Gemma4 format: `call:<name>{<args>}` between the start and end tokens.
+        let start_tag = format!("<[{}]>", self.start_token_id);
+        let end_tag = format!("<[{}]>", self.end_token_id);
+
+        if self.tools.is_empty() { // no tools: any text (possibly empty) inside the envelope
+            return format!(
+                r#"start: tool_call
+    tool_call: {} call_text {}
+    call_text: /(?s:.*)/
+    "#,
+                start_tag, end_tag
+            );
+        }
+
+        let mut rules: Vec<String> = Vec::new();
+        let tool_rule_names: Vec<String> = (0..self.tools.len()).map(|i| format!("tool_{}", i)).collect();
+
+        rules.push("start: tool_call".to_string());
+        rules.push(format!("tool_call: {} tool_content {}", start_tag, end_tag));
+        rules.push(format!("tool_content: {}", tool_rule_names.join(" | "))); // exactly one of the per-tool rules
+
+        // Clone tools to avoid borrow conflict
+        let tools_clone = self.tools.clone();
+
+        for (tool_idx, tool) in tools_clone.iter().enumerate() {
+            let tool_name = &tool.function.name;
+            let (args_expr, param_value_rules) = self.build_gemma4_args_pattern(&tool.function.parameters, &tool_idx); // also fills `self.value_rules`
+
+            let tool_rule = format!("tool_{}: \"call:{}{{\" {} \"}}\"", tool_idx, tool_name, args_expr); // NOTE(review): `tool_name` is not passed through `lark_quote`
+            rules.push(tool_rule);
+
+            for param_value in param_value_rules {
+                rules.push(param_value);
+            }
+        }
+
+        let value_definitions = self.build_value_rules(); // value rules gathered while building the args patterns
+        rules.extend(value_definitions);
+
+        rules.join("\n")
+    }
+
+    fn build_value_rules(&self) -> Vec<String> {
+        // Renders `self.value_rules` as Lark rule lines, ordered by rule name.
+        let mut entries: Vec<(&String, &String)> = self.value_rules.iter().collect();
+        entries.sort_by(|lhs, rhs| lhs.0.cmp(rhs.0));
+        entries
+            .into_iter()
+            .map(|(name, body)| {
+                // A body that already begins with its own `name: ` is emitted verbatim.
+                let lhs = format!("{}: ", name);
+                if body.starts_with(&lhs) {
+                    body.clone()
+                } else {
+                    format!("{}{}", lhs, body)
+                }
+            })
+            .collect()
+    }
+
+    fn _build_gemma4_array_definition() -> String { // Lark text for the array rules; NOTE(review): no caller in the visible code.
+        r#"gemma4_array: "[" gemma4_array_items? "]"
+gemma4_array_items: gemma4_value ("," gemma4_value)*
+gemma4_value: gemma4_string | gemma4_number | gemma4_boolean | gemma4_array | gemma4_object
+"#
+            .to_string()
+    }
+
+    fn _build_gemma4_object_definition() -> String { // Lark text for the object rules; NOTE(review): no caller in the visible code.
+        r#"gemma4_object: "{" gemma4_object_items? "}"
+gemma4_object_items: gemma4_key_value ("," gemma4_key_value)*
+gemma4_key_value: gemma4_key ":" gemma4_value
+gemma4_key: /[^:]+/
+gemma4_value: gemma4_string | gemma4_number | gemma4_boolean | gemma4_array | gemma4_object
+"#
+            .to_string()
+    }
+
+    fn build_gemma4_pattern_definition(_rule_name: &str, value: &serde_json::Value) -> String {
+        // Right-hand side of the value rule for a property schema, chosen by its `type`.
+        // `_rule_name` is accepted but not used.
+        let declared = value
+            .as_object()
+            .and_then(|schema| schema.get("type"))
+            .and_then(|ty| ty.as_str());
+        match declared {
+            Some("string") => r#""<|\"|>" /[\x20-\x7E\x0A\x0D]+?/ "<|\"|>""#.to_string(),
+            Some("integer") | Some("number") => r#"/-?\d+(\.\d+)?/"#.to_string(),
+            Some("boolean") => r#""true" | "false""#.to_string(),
+            Some("array") => Self::build_gemma4_array_rhs(),
+            Some("object") => Self::build_gemma4_object_rhs(),
+            // Non-object schema, missing or non-string `type`, or an unlisted type:
+            // fall back to the catch-all `/.*/` pattern.
+            _ => r#"/.*/"#.to_string(),
+        }
+    }
+
+    fn build_gemma4_array_rhs() -> String { // Body of the `gemma4_array` rule: bracketed, optional items.
+        r#""[" gemma4_array_items? "]""#.to_string()
+    }
+
+    fn build_gemma4_object_rhs() -> String { // Body of the `gemma4_object` rule: braced, optional items.
+        r#""{" gemma4_object_items? "}""#.to_string()
+    }
+
+    fn build_gemma4_args_pattern(&mut self, params: &serde_json::Value, tool_idx: &usize) -> (String, Vec<String>) {
+        // Builds one `param_<tool>_<idx>` rule per property of `params["properties"]`.
+        // Returns (args expression for the tool rule, the `param_*` rule definitions);
+        // the value rules those parameters reference are recorded in `self.value_rules`.
+        let mut param_value_rules: Vec<String> = Vec::new();
+        let mut param_names: Vec<String> = Vec::new();
+
+        if let Some(obj) = params.get("properties").and_then(|p| p.as_object()) {
+            for (param_idx, (key, value)) in obj.iter().enumerate() {
+                let param_rule = format!("param_{}_{}", tool_idx, param_idx);
+                let type_pattern = Self::build_gemma4_type_pattern(value);
+
+                // Store pattern definition in value_rules
+                let pattern_def = Self::build_gemma4_pattern_definition(&type_pattern, value);
+                self.value_rules.insert(type_pattern.clone(), pattern_def);
+
+                // Arrays and objects recurse through `gemma4_value`, which names every
+                // scalar and container rule. Define the whole family (without replacing
+                // rules already recorded) so the grammar never references an undefined rule.
+                if matches!(type_pattern.as_str(), "gemma4_array" | "gemma4_object") {
+                    for ty in ["string", "number", "boolean", "array", "object"] {
+                        let schema = serde_json::json!({ "type": ty });
+                        let name = Self::build_gemma4_type_pattern(&schema);
+                        let def = Self::build_gemma4_pattern_definition(&name, &schema);
+                        self.value_rules.entry(name).or_insert(def);
+                    }
+                    for (name, rhs) in [
+                        ("gemma4_array_items", "gemma4_value (\",\" gemma4_value)*"),
+                        ("gemma4_object_items", "gemma4_key_value (\",\" gemma4_key_value)*"),
+                        ("gemma4_key_value", "gemma4_key \":\" gemma4_value"),
+                        ("gemma4_key", "/[^:]+/"),
+                        ("gemma4_value", "gemma4_string | gemma4_number | gemma4_boolean | gemma4_array | gemma4_object"),
+                    ] {
+                        self.value_rules.entry(name.to_string()).or_insert_with(|| rhs.to_string());
+                    }
+                }
+
+                // Arguments are written `key:value`, the same shape as `gemma4_key_value`.
+                let key_literal = lark_quote(&format!("{}:", key));
+                param_value_rules.push(format!("{}: {} {}", param_rule, key_literal, type_pattern));
+                param_names.push(param_rule);
+            }
+        }
+
+        // Build the args expression: param_0_0 ("," param_0_1)? ("," param_0_2)?
+        let parts: Vec<String> = param_names.iter().enumerate()
+            .map(|(i, rule)| if i == 0 { rule.clone() } else { format!("( \",\" {})?", rule) })
+            .collect();
+        (parts.join(" "), param_value_rules)
+    }
+
+
+    fn build_gemma4_type_pattern(value: &serde_json::Value) -> String {
+        // Name of the value rule for a property schema, chosen by its `type`.
+        // Only an object schema with a string `type` can select a specific rule.
+        let declared = value
+            .as_object()
+            .and_then(|schema| schema.get("type"))
+            .and_then(|ty| ty.as_str());
+        let rule_name = match declared {
+            Some("string") => "gemma4_string",
+            Some("integer") | Some("number") => "gemma4_number",
+            Some("boolean") => "gemma4_boolean",
+            Some("array") => "gemma4_array",
+            Some("object") => "gemma4_object",
+            // Everything else shares the generic value rule.
+            _ => "gemma4_value",
+        };
+        rule_name.to_string()
+    }
+
+
+
+    // Qwen-Coder grammar: a `tool_<i>` rule per tool in the token envelope, a `param_<i>_<j>` rule per property.
+    fn build_qwen_coder_lark(&mut self) -> String {
+        let mut rules: Vec<String> = Vec::new();
+        let envelope_start_tag = format!("<[{}]>", self.start_token_id);
+        let envelope_end_tag = format!("<[{}]>", self.end_token_id);
+        let tool_rule_names: Vec<String> = (0..self.tools.len())
+            .map(|i| format!("tool_{}", i))
+            .collect();
+        rules.push("start: tool_call".to_string());
+        rules.push(format!(
+            r#"tool_call: {} tool_content {} "#,
+            envelope_start_tag, envelope_end_tag
+        ));
+        let tools = self.tools.clone();
+        for (tool_idx, tool) in tools.iter().enumerate() {
+            let tool_name_ascii: String = tool
+                .function
+                .name
+                .chars()
+                .filter(|c| c.is_ascii())
+                .collect();
+            let func_end = lark_quote("</function>\n");
+            if let Some(props) = tool
+                .function
+                .parameters
+                .get("properties")
+                .and_then(|p| p.as_object())
+            {
+                // `required` belongs to the tool schema: read it once per tool, not once per parameter.
+                let required_params: Vec<String> = tool
+                    .function
+                    .parameters
+                    .get("required")
+                    .and_then(|r| r.as_array())
+                    .map(|arr| {
+                        arr.iter()
+                            .filter_map(|v| v.as_str().map(|s| s.to_string()))
+                            .collect()
+                    })
+                    .unwrap_or_default();
+                let mut param_rules_vec: Vec<String> = Vec::new();
+                for (param_idx, (param_name, schema)) in props.iter().enumerate() {
+                    let param_name_ascii: String =
+                        param_name.chars().filter(|c| c.is_ascii()).collect();
+                    let param_tag = lark_quote(&format!("\n<parameter={}>\n", param_name_ascii));
+                    let param_end = lark_quote("\n</parameter>\n");
+                    let param_rule = format!("param_{}_{}", tool_idx, param_idx);
+                    let param_type = schema
+                        .get("type")
+                        .and_then(|t| t.as_str())
+                        .unwrap_or("string")
+                        .to_string();
+                    let value_rule =
+                        self.get_value_rule_name(tool_idx, param_idx, &param_type, schema);
+                    // NOTE(review): string params omit `param_end`; presumably the value rule consumes it - confirm.
+                    if param_type == "string" {
+                        rules.push(format!(r#"{}: {} {} "#, param_rule, param_tag, value_rule));
+                    } else {
+                        rules.push(format!(
+                            r#"{}: {} {} {} "#,
+                            param_rule, param_tag, value_rule, param_end
+                        ));
+                    }
+                    // Required parameters must appear; the rest are wrapped in `( ... )?`.
+                    if required_params.contains(param_name) {
+                        param_rules_vec.push(format!(" {}", param_rule));
+                    } else {
+                        param_rules_vec.push(format!("({})?", param_rule));
+                    }
+                }
+                let params_expr = param_rules_vec.join(" ");
+                // Each parameter tag starts with "\n"; with no parameters the opening literal carries it.
+                let func_start = if param_rules_vec.is_empty() {
+                    lark_quote(&format!("\n<function={}>\n", tool_name_ascii))
+                } else {
+                    lark_quote(&format!("\n<function={}>", tool_name_ascii))
+                };
+                rules.push(format!(
+                    r#"tool_{}: {}{} {}"#,
+                    tool_idx, func_start, params_expr, func_end
+                ));
+            } else {
+                let func_start = lark_quote(&format!("\n<function={}>\n", tool_name_ascii));
+                rules.push(format!(r#"tool_{}: {} {}"#, tool_idx, func_start, func_end));
+            }
+        }
+        let tool_variants = tool_rule_names.join(" | ");
+        rules.push(format!("tool_content: {}", tool_variants));
+        let value_rules = self.build_value_rules();
+        rules.extend(value_rules);
+        rules.join("\n") + "\n"
+    }
+
+
+    /// Builds the Lark grammar text for MiniMax-style tool calls.
+    ///
+    /// The grammar is `start -> tool_call`, where `tool_call` is the start token, a
+    /// `tool_content` alternation of one `tool_N` rule per configured tool, and the end
+    /// token. Each tool is rendered as an `<invoke name="...">` element whose parameters
+    /// are `<parameter name="...">` elements. Parameters listed under the schema's
+    /// `required` key are mandatory; all others are wrapped in `( ... )?`.
+    ///
+    /// Tool and parameter names are reduced to their ASCII characters before being
+    /// embedded. Value rules are registered through `get_value_rule_name` and appended
+    /// at the end via `build_value_rules`.
+    fn build_minimax_lark(&mut self) -> String {
+        let envelope_start_tag = format!("<[{}]>", self.start_token_id);
+        let envelope_end_tag = format!("<[{}]>", self.end_token_id);
+        let tool_rule_names: Vec<String> = (0..self.tools.len())
+            .map(|i| format!("tool_{}", i))
+            .collect();
+
+        let mut rules: Vec<String> = Vec::new();
+        rules.push("start: tool_call".to_string());
+        rules.push(format!(
+            r#"tool_call: {} tool_content {} "#,
+            envelope_start_tag, envelope_end_tag
+        ));
+
+        // These literals are the same for every tool and parameter, so quote them once.
+        let func_end = lark_quote("</invoke>\n");
+        let param_end = lark_quote("\n</parameter>\n");
+
+        // Cloned because `get_value_rule_name` needs `&mut self` while we iterate.
+        let tools = self.tools.clone();
+        for (tool_idx, tool) in tools.iter().enumerate() {
+            let tool_name_ascii: String = tool
+                .function
+                .name
+                .chars()
+                .filter(|c| c.is_ascii())
+                .collect();
+
+            // The `required` list belongs to the tool, not to a single parameter, so it
+            // is resolved once per tool instead of once per property.
+            let required_params: Vec<&str> = tool
+                .function
+                .parameters
+                .get("required")
+                .and_then(|r| r.as_array())
+                .map(|arr| arr.iter().filter_map(|v| v.as_str()).collect())
+                .unwrap_or_default();
+
+            let mut param_exprs: Vec<String> = Vec::new();
+            if let Some(props) = tool
+                .function
+                .parameters
+                .get("properties")
+                .and_then(|p| p.as_object())
+            {
+                for (param_idx, (param_name, schema)) in props.iter().enumerate() {
+                    let param_name_ascii: String =
+                        param_name.chars().filter(|c| c.is_ascii()).collect();
+                    let param_tag = format!(r#""\n<parameter name=\"{}\">""#, param_name_ascii);
+                    let param_rule = format!("param_{}_{}", tool_idx, param_idx);
+                    // A missing or non-string `type` is treated as "string".
+                    let param_type = schema
+                        .get("type")
+                        .and_then(|t| t.as_str())
+                        .unwrap_or("string")
+                        .to_string();
+                    let value_rule =
+                        self.get_value_rule_name(tool_idx, param_idx, &param_type, schema);
+                    if param_type == "string" {
+                        // String values carry the closing tag as the value rule's suffix,
+                        // so no explicit end literal is appended here.
+                        rules.push(format!(r#"{}: {} {} "#, param_rule, param_tag, value_rule));
+                    } else {
+                        rules.push(format!(
+                            r#"{}: {} {} {} "#,
+                            param_rule, param_tag, value_rule, param_end
+                        ));
+                    }
+                    if required_params.contains(&param_name.as_str()) {
+                        param_exprs.push(format!(" {}", param_rule));
+                    } else {
+                        param_exprs.push(format!(" ({})?", param_rule));
+                    }
+                }
+            }
+
+            // Without parameters the opening tag itself supplies the newline that would
+            // otherwise come from the first parameter tag.
+            let func_start = if param_exprs.is_empty() {
+                format!(r#""\n<invoke name=\"{}\">\n""#, tool_name_ascii)
+            } else {
+                format!(r#""\n<invoke name=\"{}\">""#, tool_name_ascii)
+            };
+            rules.push(format!(
+                r#"tool_{}: {}{} {}"#,
+                tool_idx,
+                func_start,
+                param_exprs.join(" "),
+                func_end
+            ));
+        }
+
+        rules.push(format!("tool_content: {}", tool_rule_names.join(" | ")));
+        rules.extend(self.build_value_rules());
+        rules.join("\n") + "\n"
+    }
+
+    /// Registers the value rule for one tool parameter and returns the rule name to
+    /// reference from the parameter rule.
+    ///
+    /// * `"string"` parameters share a single `value_string` rule: a lazy regex over
+    ///   printable ASCII plus LF/CR, terminated by a `suffix` that is the closing
+    ///   `</parameter>` tag (without the leading newline for `ToolFormat::MiniMax`).
+    /// * Every other type gets its own `value_<tool>_<param>_<type>` rule defined as
+    ///   `%json <sanitized schema>`.
+    ///
+    /// The definition is stored in `self.value_rules`, keyed by the rule's left-hand side.
+    fn get_value_rule_name(
+        &mut self,
+        tool_idx: usize,
+        param_idx: usize,
+        param_type: &str,
+        param_schema: &Value,
+    ) -> String {
+        if param_type == "string" {
+            let rule_name = "value_string".to_string();
+            let suffix = match self.format {
+                ToolFormat::MiniMax => r#"</parameter>\n"#,
+                _ => r#"\n</parameter>\n"#,
+            };
+            let lhs = format!(r#"{}[suffix="{}"]"#, rule_name, suffix);
+            self.value_rules
+                .insert(lhs, r#"/[\x20-\x7E\x0A\x0D]+?/"#.to_string());
+            return rule_name;
+        }
+
+        // `param_type` comes from the request's tool schema. Keep only characters that
+        // are safe in a lowercase Lark rule name so an unusual type string cannot break
+        // (or inject text into) the generated grammar. Standard JSON-schema type names
+        // pass through unchanged.
+        let type_tag: String = param_type
+            .chars()
+            .map(|c| {
+                if c.is_ascii_alphanumeric() {
+                    c.to_ascii_lowercase()
+                } else {
+                    '_'
+                }
+            })
+            .collect();
+        let rule_name = format!("value_{}_{}_{}", tool_idx, param_idx, type_tag);
+
+        let sanitized = sanitize_schema_for_llguidance(param_schema);
+        let schema_json = serde_json::to_string(&sanitized).unwrap_or_default();
+        self.value_rules
+            .insert(rule_name.clone(), format!("%json {}", schema_json));
+        rule_name
+    }
+}
+
+/// Collects everything needed to turn one chat-completion request into a composed
+/// `TopLevelGrammar` (see `build_grammar`).
+pub struct GrammarRequestDispatcher<'a> {
+    /// Request whose `structured_outputs`, `response_format`, `constraint`, `tools`,
+    /// `reasoning_effort` and `max_tokens` fields drive grammar construction.
+    pub request: &'a ChatCompletionRequest,
+    /// Special-token ids (tool-call start/end here; also forwarded to the composer).
+    pub guidance_tokens: &'a GuidanceTokens,
+    /// Tool-call token configuration.
+    /// NOTE(review): not read by any method visible in this impl — confirm it is still needed.
+    pub tool_config: &'a crate::server::parser::ToolConfig,
+    /// When `false`, tools (if any) fall back to the generic tool grammar instead of a
+    /// parser-specific one.
+    pub enable_tool_grammar: bool,
+    /// Together with `enable_tool_grammar`, gates whether any grammar is built at all.
+    pub allow_constraint_api: bool,
+    /// Tool parser name used to pick the tool-call grammar format.
+    pub parser_name: String,
+    /// Tokenizer forwarded to the composer.
+    pub tokenizer: &'a Tokenizer,
+    /// Optional chat template forwarded to the composer.
+    pub chat_template: Option<crate::utils::chat_template::ChatTemplate>,
+    /// When `true`, no reasoning block is placed in front of the grammar.
+    pub disable_reasoning: bool,
+}
+
+impl<'a> GrammarRequestDispatcher<'a> {
+    /// Bundles the borrowed request context and the owned settings into a dispatcher.
+    ///
+    /// No work happens here: every argument is stored unchanged in the field of the
+    /// same name.
+    pub fn new(
+        request: &'a ChatCompletionRequest,
+        guidance_tokens: &'a GuidanceTokens,
+        tool_config: &'a crate::server::parser::ToolConfig,
+        enable_tool_grammar: bool,
+        allow_constraint_api: bool,
+        parser_name: String,
+        tokenizer: &'a Tokenizer,
+        chat_template: Option<crate::utils::chat_template::ChatTemplate>,
+        disable_reasoning: bool,
+    ) -> Self {
+        GrammarRequestDispatcher {
+            // Borrowed context.
+            request,
+            guidance_tokens,
+            tool_config,
+            tokenizer,
+            // Owned settings.
+            parser_name,
+            chat_template,
+            enable_tool_grammar,
+            allow_constraint_api,
+            disable_reasoning,
+        }
+    }
+
+    /// Composes the final grammar for this request.
+    ///
+    /// Returns `None` when both `allow_constraint_api` and `enable_tool_grammar` are
+    /// off. Otherwise the request's constraint (or a permissive "any text" grammar when
+    /// there is none), the optional tool grammar and the optional reasoning effort are
+    /// handed to `GrammarComposer::compose_all_grammars`. A request without
+    /// `max_tokens` passes `0` to the composer.
+    pub fn build_grammar(self) -> Option<TopLevelGrammar> {
+        if !(self.allow_constraint_api || self.enable_tool_grammar) {
+            return None;
+        }
+
+        // Fall back to "any text" so a tool grammar can still be composed when the
+        // request carries no explicit constraint.
+        let base_grammar = match self.build_constraint_grammar() {
+            Some(explicit) => explicit,
+            None => StructuredOutputsGrammar::new(StructuredConstraint::Lark(
+                "start: text\ntext: /(?s:.+?)/".to_string(),
+            )),
+        };
+        let tool_grammar = self.build_tool_grammar();
+        let reasoning_effort = self.build_reasoning_effort();
+        let token_budget = self.request.max_tokens.unwrap_or(0);
+
+        let composed = GrammarComposer::compose_all_grammars(
+            vec![base_grammar],
+            tool_grammar,
+            reasoning_effort,
+            self.guidance_tokens,
+            token_budget,
+            self.chat_template,
+            self.tokenizer,
+            self.disable_reasoning,
+        );
+        Some(composed)
+    }
+
+    /// Picks the single output constraint requested by the client, if any.
+    ///
+    /// Sources are consulted in this order and the first match wins:
+    /// 1. `structured_outputs`: non-empty `choice`, then `regex`, `json`, `grammar`,
+    ///    `structural_tag`.
+    /// 2. `response_format`: `json_schema` (when a schema is attached) or `json_object`.
+    /// 3. `constraint` with `constraint_type` (`regex` by default, `lark`, or
+    ///    `json_schema`/`json`).
+    ///
+    /// Every JSON schema is passed through `sanitize_schema_for_llguidance` before use.
+    /// A `constraint` that is not valid JSON, or an unknown type, yields `None`.
+    fn build_constraint_grammar(&self) -> Option<StructuredOutputsGrammar> {
+        if let Some(ref so) = self.request.structured_outputs {
+            match &so.choice {
+                Some(choice) if !choice.is_empty() => {
+                    return Some(StructuredOutputsGrammar::new(StructuredConstraint::Choice(
+                        choice.clone(),
+                    )));
+                }
+                _ => {}
+            }
+            if let Some(ref regex) = so.regex {
+                return Some(StructuredOutputsGrammar::new(StructuredConstraint::Regex(
+                    regex.clone(),
+                )));
+            }
+            if let Some(ref json) = so.json {
+                // Sanitize like the `response_format` and `constraint` paths below so
+                // all three JSON-schema entry points behave the same.
+                let schema = sanitize_schema_for_llguidance(json);
+                return Some(StructuredOutputsGrammar::new(StructuredConstraint::Json(
+                    schema,
+                )));
+            }
+            if let Some(ref grammar) = so.grammar {
+                return Some(StructuredOutputsGrammar::new(StructuredConstraint::Lark(
+                    grammar.clone(),
+                )));
+            }
+            if let Some(ref structural_tag) = so.structural_tag {
+                return Some(self.build_structural_tag_grammar(structural_tag));
+            }
+        }
+
+        if let Some(ref rf) = self.request.response_format {
+            match rf.format_type.as_str() {
+                "json_schema" => {
+                    // Without an attached schema, fall through to `constraint`.
+                    if let Some(ref schema) = rf.json_schema {
+                        let schema = sanitize_schema_for_llguidance(&schema.schema);
+                        return Some(StructuredOutputsGrammar::new(StructuredConstraint::Json(
+                            schema,
+                        )));
+                    }
+                }
+                "json_object" => {
+                    return Some(StructuredOutputsGrammar::new(StructuredConstraint::Lark(
+                        "start: text\ntext: %json {\"type\":\"object\"}".to_string(),
+                    )));
+                }
+                _ => {}
+            }
+        }
+
+        if let Some(ref constraint) = self.request.constraint {
+            let constraint_type = self.request.constraint_type.as_deref().unwrap_or("regex");
+            match constraint_type {
+                "regex" => {
+                    return Some(StructuredOutputsGrammar::new(StructuredConstraint::Regex(
+                        constraint.clone(),
+                    )));
+                }
+                "lark" => {
+                    return Some(StructuredOutputsGrammar::new(StructuredConstraint::Lark(
+                        constraint.clone(),
+                    )));
+                }
+                "json_schema" | "json" => {
+                    // NOTE: an unparsable schema is ignored and no constraint is applied.
+                    if let Ok(schema) = serde_json::from_str(constraint) {
+                        let schema = sanitize_schema_for_llguidance(&schema);
+                        return Some(StructuredOutputsGrammar::new(StructuredConstraint::Json(
+                            schema,
+                        )));
+                    }
+                }
+                _ => {}
+            }
+        }
+        None
+    }
+
+    /// Converts a `structural_tag` JSON object into a `StructuralTag` constraint.
+    ///
+    /// Reads `start_tag` (falling back to `tag`, then `"<tool>"`), `end_tag`
+    /// (default `"</tool>"`) and `schema` (default `{"type": "object"}`). Values that
+    /// are present but not strings are treated as absent.
+    fn build_structural_tag_grammar(&self, structural_tag: &Value) -> StructuredOutputsGrammar {
+        let start_tag = structural_tag
+            .get("start_tag")
+            .or_else(|| structural_tag.get("tag"))
+            .and_then(|v| v.as_str())
+            .unwrap_or("<tool>")
+            .to_string();
+        let end_tag = structural_tag
+            .get("end_tag")
+            .and_then(|v| v.as_str())
+            .unwrap_or("</tool>")
+            .to_string();
+        // Build the default schema lazily; it is only needed when `schema` is missing.
+        let schema = structural_tag
+            .get("schema")
+            .cloned()
+            .unwrap_or_else(|| serde_json::json!({"type": "object"}));
+        StructuredOutputsGrammar::new(StructuredConstraint::StructuralTag(StructuralTagConfig {
+            start_tag,
+            end_tag,
+            schema,
+        }))
+    }
+
+    /// Builds the tool-call grammar for the request's tools.
+    ///
+    /// Returns `None` when the request has no `tools`. With `enable_tool_grammar` off a
+    /// generic grammar is produced; otherwise the format follows `parser_name`:
+    /// `qwen_coder` and `minimax_m2` have dedicated grammars and every other parser
+    /// (including `gemma4`, `qwen` and `json`) uses the JSON tool-call grammar.
+    ///
+    /// The envelope tokens are the first tool-call start/end ids from `guidance_tokens`;
+    /// a missing id falls back to `0`.
+    fn build_tool_grammar(&self) -> Option<ToolCallGrammar> {
+        let tools = self.request.tools.as_ref()?.clone();
+
+        // Token ids come from GuidanceTokens (u32), not from tool_config (String).
+        let start_token_id = self
+            .guidance_tokens
+            .tool_call_start_ids
+            .first()
+            .copied()
+            .unwrap_or(0);
+        let end_token_id = self
+            .guidance_tokens
+            .tool_call_end_ids
+            .first()
+            .copied()
+            .unwrap_or(0);
+
+        if !self.enable_tool_grammar {
+            return Some(ToolCallGrammar::new_generic(
+                tools,
+                start_token_id,
+                end_token_id,
+            ));
+        }
+
+        // TODO align 1:1 with parser selection
+        let grammar = match self.parser_name.as_str() {
+            "qwen_coder" => ToolCallGrammar::new_qwen_coder(tools, start_token_id, end_token_id),
+            "minimax_m2" => ToolCallGrammar::new_minimax(tools, start_token_id, end_token_id),
+            // "gemma4", "qwen", "json" and any unrecognized parser share the JSON format.
+            _ => ToolCallGrammar::new_json(tools, start_token_id, end_token_id),
+        };
+        Some(grammar)
+    }
+
+    /// Parses the request's `reasoning_effort` string, or returns `None` when the
+    /// request does not set one.
+    fn build_reasoning_effort(&self) -> Option<ReasoningEffort> {
+        self.request
+            .reasoning_effort
+            .as_ref()
+            .map(|requested| ReasoningEffort::from_str(requested.clone()))
+    }
+}
+
+// GRAMMAR COMPOSER
+
+/// Stateless namespace for the associated functions that combine constraint, tool-call
+/// and reasoning grammars into a single `TopLevelGrammar`.
+pub struct GrammarComposer;
+
+impl GrammarComposer {
+    /// Runs the full composition pipeline and returns the grammar used for decoding.
+    ///
+    /// Steps, in order:
+    /// 1. merge `constraint_grammars` into one (alternation),
+    /// 2. add the tool-call grammar as a further alternative, if present,
+    /// 3. put a reasoning block in front unless `disable_reasoning` is set or the
+    ///    effort is `None`,
+    /// 4. append the EOS rule,
+    /// 5. prepend the BOS/role prefix when `guidance_tokens.add_bos_token` is set,
+    /// 6. apply the thinking fallback (token ids become string literals),
+    /// 7. apply the token limit.
+    ///
+    /// `max_tokens == 0` means "no limit requested": the grammar's own `max_tokens` is
+    /// left untouched rather than being set to a zero-token budget.
+    pub fn compose_all_grammars(
+        constraint_grammars: Vec<StructuredOutputsGrammar>,
+        tool_grammar: Option<ToolCallGrammar>,
+        reasoning_effort: Option<ReasoningEffort>,
+        guidance_tokens: &GuidanceTokens,
+        max_tokens: usize,
+        chat_template: Option<crate::utils::chat_template::ChatTemplate>,
+        tokenizer: &Tokenizer,
+        disable_reasoning: bool,
+    ) -> TopLevelGrammar {
+        // An empty constraint list falls back to the default text grammar.
+        let merged_constraints = Self::merge_constraints(constraint_grammars);
+        let composed_with_tools =
+            Self::compose_constraint_with_tools(merged_constraints, tool_grammar);
+        let final_grammar = Self::wrap_with_reasoning(
+            composed_with_tools,
+            reasoning_effort,
+            guidance_tokens,
+            disable_reasoning,
+        );
+        let mut grammar = Self::finalize_with_eos(final_grammar, guidance_tokens);
+
+        // Models like MiniMax allow us to change the model's role in responding to permit
+        // "multi-agent" conversations. Need a helper to extract the capability from the
+        // chat template and role name from request.
+        let role = "assistant".to_string();
+
+        // Only prefix with BOS if add_bos_token is true.
+        if guidance_tokens.add_bos_token {
+            grammar = Self::prefix_with_bos(grammar, guidance_tokens, role);
+        }
+
+        // Applied after all composition so every `<[token_id]>` occurrence is visible.
+        grammar = apply_thinking_fallback(grammar, guidance_tokens, chat_template, tokenizer);
+
+        // Callers pass 0 when the request has no `max_tokens`; storing `Some(0)` would
+        // ask the constrained decoder for zero tokens, so only a positive limit is set.
+        if max_tokens > 0 {
+            grammar.max_tokens = Some(max_tokens);
+        }
+        grammar
+    }
+
+    /// Folds several constraint grammars into one by alternation.
+    ///
+    /// An empty list yields the default "any text" grammar, a single grammar is
+    /// returned as-is, and further grammars are combined left to right with
+    /// `compose_alternate`.
+    fn merge_constraints(grammars: Vec<StructuredOutputsGrammar>) -> StructuredOutputsGrammar {
+        // The vector is owned, so it can be consumed directly; no clone is needed.
+        let mut remaining = grammars.into_iter();
+        let first = match remaining.next() {
+            Some(grammar) => grammar,
+            None => {
+                // Default text grammar when no constraints are specified.
+                return StructuredOutputsGrammar::new(StructuredConstraint::Lark(
+                    "start: text\ntext: /(?s:.+?)/".to_string(),
+                ));
+            }
+        };
+        remaining.fold(first, |mut merged, mut next| {
+            merged.compose_alternate(&mut next)
+        })
+    }
+
+    /// Adds the tool-call grammar as an alternative to `base`.
+    ///
+    /// Without a tool grammar `base` is returned unchanged. Otherwise the tool grammar
+    /// is rendered to Lark, wrapped as a constraint, and combined with `base` through
+    /// `compose_alternate`.
+    fn compose_constraint_with_tools(
+        base: StructuredOutputsGrammar,
+        tool: Option<ToolCallGrammar>,
+    ) -> StructuredOutputsGrammar {
+        let mut tool_gram = match tool {
+            Some(present) => present,
+            None => return base,
+        };
+
+        let tool_lark = tool_gram.build_lark();
+        let mut tool_side = StructuredOutputsGrammar::new(StructuredConstraint::Lark(tool_lark));
+        let mut constraint_side = base;
+        constraint_side.compose_alternate(&mut tool_side)
+    }
+
+    /// Places a reasoning block in front of `base`.
+    ///
+    /// `base` is returned untouched when `disable_reasoning` is set or the effort is
+    /// `ReasoningEffort::None`. A missing effort means `ReasoningEffort::ModelDefault`.
+    /// The reasoning delimiters are the first start/end ids from `guidance_tokens`,
+    /// with `0` standing in for a missing id.
+    fn wrap_with_reasoning(
+        base: StructuredOutputsGrammar,
+        effort: Option<ReasoningEffort>,
+        guidance_tokens: &GuidanceTokens,
+        disable_reasoning: bool,
+    ) -> StructuredOutputsGrammar {
+        let effort = effort.unwrap_or(ReasoningEffort::ModelDefault);
+        if disable_reasoning || effort == ReasoningEffort::None {
+            return base;
+        }
+
+        let start_id = guidance_tokens
+            .reasoning_start_ids
+            .first()
+            .copied()
+            .unwrap_or(0);
+        let end_id = guidance_tokens
+            .reasoning_end_ids
+            .first()
+            .copied()
+            .unwrap_or(0);
+
+        let mut reasoning = ReasoningGrammar::new(start_id, end_id, effort);
+        let reasoning_lark = reasoning.build_lark();
+        let mut reasoning_grammar =
+            StructuredOutputsGrammar::new(StructuredConstraint::Lark(reasoning_lark));
+
+        // Sequence: the reasoning block first, then the wrapped grammar.
+        let mut wrapped = base;
+        reasoning_grammar.compose_sequence(&mut wrapped)
+    }
+
+    /// Rewrites the grammar so generation starts with a BOS token followed by
+    /// `"<role>:"` and a newline.
+    ///
+    /// The grammar is returned unchanged when no BOS ids are configured or when it
+    /// already defines a `bos` rule. Otherwise the first line is treated as the start
+    /// rule (falling back to `text` if it does not begin with `start: `), `bos` is
+    /// put in front of its right-hand side, and a `bos` rule covering every configured
+    /// BOS id is appended. The result is re-parsed with `TopLevelGrammar::from_lark_ascii`.
+    fn prefix_with_bos(
+        grammar: TopLevelGrammar,
+        guidance_tokens: &GuidanceTokens,
+        role: String,
+    ) -> TopLevelGrammar {
+        if guidance_tokens.bos_token_ids.is_empty() {
+            return grammar;
+        }
+
+        // Avoid duplication, but only when a `bos` rule is actually defined. A plain
+        // substring search would also match unrelated text such as a tool name or a
+        // string literal that merely contains the letters "bos".
+        let lark = get_lark_from_top_level_grammar(&grammar);
+        let already_has_bos_rule = lark.lines().any(|line| {
+            line.split_once(':')
+                .map_or(false, |(name, _)| name.trim() == "bos")
+        });
+        if already_has_bos_rule {
+            return grammar;
+        }
+
+        // Extract the current start rule's right-hand side.
+        let first_line = lark.lines().next().unwrap_or("");
+        let current_start_rhs = match first_line.strip_prefix("start: ") {
+            Some(rhs) => rhs.trim(),
+            None => "text",
+        };
+
+        // Build the BOS rule; several BOS tokens become an alternation.
+        let bos_rule = if guidance_tokens.bos_token_ids.len() == 1 {
+            format!(r#"bos: <[{}]> "{}:" "\n" "#, guidance_tokens.bos_token_ids[0], &role)
+        } else {
+            let ids: Vec<String> = guidance_tokens
+                .bos_token_ids
+                .iter()
+                .map(|id| format!("<[{}]>", id))
+                .collect();
+            format!(r#"bos: ( {} ) "{}:" "\n" "#, ids.join(" | "), &role)
+        };
+
+        // Drop the old start line, keep the other (non-blank, trimmed) rules, then add
+        // the new start line in front and the bos rule at the end.
+        let remaining_rules: Vec<String> = lark
+            .lines()
+            .skip(1)
+            .filter(|l| !l.trim().is_empty())
+            .map(|s| s.trim().to_string())
+            .collect();
+
+        let new_lark = format!(
+            "start: bos {}\n{}\n{}",
+            current_start_rhs,
+            remaining_rules.join("\n"),
+            bos_rule
+        );
+
+        TopLevelGrammar::from_lark_ascii(&new_lark)
+    }
+
+    /// Converts the composed grammar into a `TopLevelGrammar` that ends with an EOS token.
+    ///
+    /// When no EOS ids are configured, or the grammar already defines an `eos` rule,
+    /// the grammar is formatted as-is. Otherwise ` eos` is appended to the start rule
+    /// (taken from the first line, falling back to `text`) and an `eos` rule covering
+    /// every configured EOS id is added at the end.
+    fn finalize_with_eos(
+        mut grammar: StructuredOutputsGrammar,
+        guidance_tokens: &GuidanceTokens,
+    ) -> TopLevelGrammar {
+        if guidance_tokens.eos_token_ids.is_empty() {
+            return grammar.format();
+        }
+
+        // Skip only when an `eos` rule is really defined. A substring search would
+        // also match unrelated text (for example a choice such as "videos") and
+        // silently drop the EOS terminator.
+        let lark = grammar.build_lark();
+        let already_has_eos_rule = lark.lines().any(|line| {
+            line.split_once(':')
+                .map_or(false, |(name, _)| name.trim() == "eos")
+        });
+        if already_has_eos_rule {
+            return grammar.format();
+        }
+
+        let first_line = lark.lines().next().unwrap_or("");
+        let current_start_rhs = match first_line.strip_prefix("start: ") {
+            Some(rhs) => rhs.trim(),
+            None => "text",
+        };
+        let new_start = format!("start: {} eos", current_start_rhs);
+
+        // Several EOS tokens become an alternation.
+        let eos_rule = if guidance_tokens.eos_token_ids.len() == 1 {
+            format!("eos: <[{}]>", guidance_tokens.eos_token_ids[0])
+        } else {
+            let ids: Vec<String> = guidance_tokens
+                .eos_token_ids
+                .iter()
+                .map(|id| format!("<[{}]>", id))
+                .collect();
+            format!("eos: ( {} )", ids.join(" | "))
+        };
+
+        let final_lark = format!(
+            "{}\n{}\n{}",
+            new_start,
+            lark.lines().skip(1).collect::<Vec<_>>().join("\n"),
+            eos_rule
+        );
+        TopLevelGrammar::from_lark_ascii(&final_lark)
+    }
+}
+
+// HELPER FUNCTIONS
+
+/// Renders a `TopLevelGrammar` as Lark text.
+///
+/// All `lark_grammar` entries come first, followed by one synthetic
+/// `start: text` / `text: %json <schema>` grammar per `json_schema` entry; sections
+/// are separated by `\n---\n`. When there is nothing to render, a short diagnostic
+/// string is returned instead of Lark text.
+pub fn get_lark_from_top_level_grammar(grammar: &TopLevelGrammar) -> String {
+    let total = grammar.grammars.len();
+    if total == 0 {
+        return "No grammars".to_string();
+    }
+
+    let lark_sections = grammar
+        .grammars
+        .iter()
+        .filter_map(|entry| entry.lark_grammar.clone());
+    let json_sections = grammar.grammars.iter().filter_map(|entry| {
+        entry.json_schema.as_ref().map(|schema| {
+            let schema_text = serde_json::to_string(schema).unwrap_or_default();
+            format!("start: text\ntext: %json {}", schema_text)
+        })
+    });
+    let sections: Vec<String> = lark_sections.chain(json_sections).collect();
+
+    if !sections.is_empty() {
+        return sections.join("\n---\n");
+    }
+    format!("{} grammars, none have lark_grammar", total)
+}
+
+/// Apply the thinking fallback to a Lark grammar string.
+///
+/// Replaces the reasoning start/end `<[token_id]>` references with the string
+/// literals `"<thinking>"` and `"</thinking>"`, for models that were not trained on
+/// dedicated reasoning tokens.
+///
+/// The fallback is controlled by the `VLLM_RS_PROVIDE_THINKING_FALLBACK` environment
+/// variable and is enabled when it is `"true"` (case-insensitive) or `"1"`.
+///
+/// Returns `Some(transformed_lark)` when the fallback is applied, and `None` when:
+/// - the environment variable is not set to an enabling value,
+/// - `guidance_tokens` has no reasoning start or end id (nothing to replace), or
+/// - the chat template already contains both decoded reasoning tokens.
+///
+/// If the reasoning tokens cannot be decoded, or no template text is available, the
+/// fallback is applied.
+pub fn apply_thinking_fallback_lark(
+    lark: String,
+    guidance_tokens: &GuidanceTokens,
+    chat_template: Option<crate::utils::chat_template::ChatTemplate>,
+    tokenizer: &Tokenizer,
+) -> Option<String> {
+    let provide_thinking_fallback = std::env::var("VLLM_RS_PROVIDE_THINKING_FALLBACK")
+        .ok()
+        .map(|v| v.eq_ignore_ascii_case("true") || v == "1")
+        .unwrap_or(false);
+    if !provide_thinking_fallback {
+        return None; // Fallback not enabled via environment variable
+    }
+
+    // Without ids there is no `<[id]>` pattern to search for. Indexing `[0]` here
+    // would panic on an empty list, so bail out instead.
+    let (start_id, end_id) = match (
+        guidance_tokens.reasoning_start_ids.first(),
+        guidance_tokens.reasoning_end_ids.first(),
+    ) {
+        (Some(start), Some(end)) => (*start, *end),
+        _ => {
+            crate::log_info!(
+                "[llg] No reasoning token IDs configured, skipping thinking fallback"
+            );
+            return None;
+        }
+    };
+
+    match get_reasoning_token_strings(guidance_tokens, tokenizer) {
+        Some((start_str, end_str)) => {
+            if let Some(template_str) = chat_template
+                .as_ref()
+                .and_then(|t| t.get_template_string().map(|s| s.to_string()))
+            {
+                // Normalize to ASCII-only for robust comparison
+                let normalized_template: String =
+                    template_str.chars().filter(|c| c.is_ascii()).collect();
+                let normalized_start: String = start_str.chars().filter(|c| c.is_ascii()).collect();
+                let normalized_end: String = end_str.chars().filter(|c| c.is_ascii()).collect();
+
+                if normalized_template.contains(&normalized_start)
+                    && normalized_template.contains(&normalized_end)
+                {
+                    crate::log_info!(
+                        "[llg] Chat template contains reasoning tokens, no fallback needed"
+                    );
+                    return None;
+                }
+            }
+            crate::log_info!(
+                "[llg] Chat template does not contain reasoning tokens, applying fallback"
+            );
+        }
+        None => {
+            // The ids exist (checked above), so this means decoding them failed.
+            crate::log_info!(
+                "[llg] Could not decode reasoning tokens, applying fallback"
+            );
+        }
+    }
+
+    // Transform <[token_id]> syntax to string literals in common vocabulary
+    let reason_start = format!("<[{}]>", start_id);
+    let reason_end = format!("<[{}]>", end_id);
+    Some(
+        lark.replace(&reason_start, "\"<thinking>\"")
+            .replace(&reason_end, "\"</thinking>\""),
+    )
+}
+
+/// `TopLevelGrammar` front-end for `apply_thinking_fallback_lark`.
+///
+/// The grammar is rendered to Lark text and handed to the Lark-level fallback. If
+/// that returns a transformed string, a new grammar is parsed from it with
+/// `TopLevelGrammar::from_lark_ascii`; otherwise the input grammar is returned unchanged.
+pub fn apply_thinking_fallback(
+    grammar: TopLevelGrammar,
+    guidance_tokens: &GuidanceTokens,
+    chat_template: Option<crate::utils::chat_template::ChatTemplate>,
+    tokenizer: &Tokenizer,
+) -> TopLevelGrammar {
+    let rendered = get_lark_from_top_level_grammar(&grammar);
+    let rewritten =
+        apply_thinking_fallback_lark(rendered, guidance_tokens, chat_template, tokenizer);
+
+    match rewritten {
+        Some(transformed_lark) => TopLevelGrammar::from_lark_ascii(&transformed_lark),
+        None => grammar,
+    }
+}
+
+// REASONING TOKEN FUNCTIONS
+
+/// Decodes the reasoning start and end token ids into their text form.
+///
+/// Returns `Some((start_text, end_text))`, or `None` when either id list is empty or
+/// the tokenizer fails to decode one of them. Decoding is done with the `false` flag.
+pub fn get_reasoning_token_strings(
+    guidance_tokens: &GuidanceTokens,
+    tokenizer: &Tokenizer,
+) -> Option<(String, String)> {
+    let start_ids = &guidance_tokens.reasoning_start_ids;
+    let end_ids = &guidance_tokens.reasoning_end_ids;
+    if start_ids.is_empty() || end_ids.is_empty() {
+        return None;
+    }
+
+    let start_text = match tokenizer.decode(start_ids, false) {
+        Ok(text) => text,
+        Err(_) => return None,
+    };
+    let end_text = match tokenizer.decode(end_ids, false) {
+        Ok(text) => text,
+        Err(_) => return None,
+    };
+    Some((start_text, end_text))
+}
+
+/// Reports whether the grammar's Lark text has a line that mentions `reasoning_block`
+/// together with `<[` and `]>` (token-id syntax).
+pub fn is_reasoning_grammar(grammar: &TopLevelGrammar) -> bool {
+    let rendered = get_lark_from_top_level_grammar(grammar);
+    for line in rendered.lines() {
+        let mentions_block = line.contains("reasoning_block");
+        let has_token_syntax = line.contains("<[") && line.contains("]>");
+        if mentions_block && has_token_syntax {
+            return true;
+        }
+    }
+    false
+}
+
+/// Parses a grammar given as a type tag plus its textual content.
+///
+/// Supported `grammar_type` values:
+/// - `"lark"`: `grammar_content` is Lark text.
+/// - `"regex"`: `grammar_content` is a regular expression.
+/// - `"json_schema"`: `grammar_content` is a JSON schema, sanitized before use.
+/// - `"choice"`: `grammar_content` is a JSON array of strings.
+///
+/// # Errors
+/// `GrammarError::InvalidGrammar` for unparsable JSON or a rejected schema, and
+/// `GrammarError::UnsupportedFormat` for any other `grammar_type`.
+pub fn build_grammar_from_request(
+    grammar_type: &str,
+    grammar_content: &str,
+) -> GrammarResult<TopLevelGrammar> {
+    if grammar_type == "lark" {
+        return Ok(TopLevelGrammar::from_lark_ascii(grammar_content));
+    }
+    if grammar_type == "regex" {
+        return Ok(TopLevelGrammar::from_regex_ascii(grammar_content));
+    }
+    if grammar_type == "json_schema" {
+        let parsed: serde_json::Value = match serde_json::from_str(grammar_content) {
+            Ok(value) => value,
+            Err(e) => {
+                return Err(GrammarError::InvalidGrammar(format!(
+                    "Invalid JSON schema: {}",
+                    e
+                )))
+            }
+        };
+        let sanitized = sanitize_schema_for_llguidance(&parsed);
+        return TopLevelGrammar::from_json_schema_ascii(sanitized)
+            .map_err(|e| GrammarError::InvalidGrammar(format!("Invalid schema: {}", e)));
+    }
+    if grammar_type == "choice" {
+        // The content is a JSON array of strings.
+        let choices: Vec<String> = match serde_json::from_str(grammar_content) {
+            Ok(list) => list,
+            Err(e) => {
+                return Err(GrammarError::InvalidGrammar(format!(
+                    "Invalid choice JSON: {}",
+                    e
+                )))
+            }
+        };
+        return build_choice_lark_grammar(&choices);
+    }
+    Err(GrammarError::UnsupportedFormat(format!(
+        "Unknown grammar_type: {}",
+        grammar_type
+    )))
+}
+
+/// Build a Lark grammar that accepts exactly one of `choices`.
+///
+/// Each choice is quoted with `lark_quote` and the alternatives become the `start` rule.
+///
+/// # Errors
+/// `GrammarError::InvalidGrammar` when `choices` is empty (the start rule would have
+/// no alternatives) or when any choice is an empty string.
+pub fn build_choice_lark_grammar(choices: &[String]) -> GrammarResult<TopLevelGrammar> {
+    // An empty list would otherwise produce the invalid grammar `start: `.
+    if choices.is_empty() {
+        return Err(GrammarError::InvalidGrammar(
+            "Choice grammar requires at least one choice".to_string(),
+        ));
+    }
+    if choices.iter().any(|choice| choice.is_empty()) {
+        return Err(GrammarError::InvalidGrammar(
+            "Choice grammar cannot contain empty strings".to_string(),
+        ));
+    }
+
+    let choice_grammar = choices
+        .iter()
+        .map(|choice| lark_quote(choice))
+        .collect::<Vec<_>>()
+        .join(" | ");
+
+    let lark = format!("start: {}\n", choice_grammar);
+    Ok(TopLevelGrammar::from_lark_ascii(&lark))
+}
+
+/// Generate the guided-decoding `TopLevelGrammar` for a `ChatCompletionRequest`.
+///
+/// Derives a `ToolConfig` from `tokenizer` and `model_type`, passes every option
+/// to a `GrammarRequestDispatcher`, and returns whatever its `build_grammar()`
+/// produces. All composition logic lives in the dispatcher.
+///
+/// Parameters:
+/// - request: The incoming chat completion request
+/// - guidance_tokens: Forwarded to the dispatcher
+/// - enable_tool_grammar: Forwarded to the dispatcher
+/// - allow_constraint_api: Forwarded to the dispatcher
+/// - model_type: Used only to build the `ToolConfig`
+/// - _model_id: Accepted but not read by this function
+/// - parser_name: Forwarded to the dispatcher
+/// - tokenizer: Used to build the `ToolConfig` and forwarded to the dispatcher
+/// - chat_template: Forwarded to the dispatcher
+/// - disable_reasoning: Forwarded to the dispatcher
+///
+/// Returns the `Option<TopLevelGrammar>` produced by the dispatcher; when it is
+/// `None` is decided inside `GrammarRequestDispatcher::build_grammar`.
+pub fn generate_grammar_from_request(
+    request: &crate::server::ChatCompletionRequest,
+    guidance_tokens: &crate::utils::guidance::GuidanceTokens,
+    enable_tool_grammar: bool,
+    allow_constraint_api: bool,
+    model_type: &crate::utils::config::ModelType,
+    _model_id: &str,
+    parser_name: String,
+    tokenizer: &Tokenizer,
+    chat_template: Option<ChatTemplate>,
+    disable_reasoning: bool,
+) -> Option<TopLevelGrammar> {
+    // Use new GrammarRequestDispatcher for grammar composition
+    let tool_config = ToolConfig::from_tokenizer(tokenizer, model_type);
+
+    // The dispatcher borrows `tool_config`, so it is kept in a named local.
+    let dispatcher = GrammarRequestDispatcher::new(
+        request,
+        guidance_tokens,
+        &tool_config,
+        enable_tool_grammar,
+        allow_constraint_api,
+        parser_name,
+        tokenizer,
+        chat_template,
+        disable_reasoning,
+    );
+
+    dispatcher.build_grammar()
+}
+
+/// Build the guided-decoding grammar for claude_server.rs.
+///
+/// Packs the claude-style arguments into a synthetic `ChatCompletionRequest`
+/// and runs it through a `GrammarRequestDispatcher`, returning the result of
+/// its `build_grammar()`.
+///
+/// Parameters:
+/// - guidance_tokens: Forwarded to the dispatcher
+/// - _tool_config: Ignored; a fresh `ToolConfig` is derived from `tokenizer` and `model_type`
+/// - tools: Copied into the synthetic request (`None` when the slice is empty)
+/// - tool_parser_name: Forwarded to the dispatcher as the parser name
+/// - constraint_grammar: Converted to Lark text with `get_lark_from_top_level_grammar`
+///   and stored in the request's `constraint` field
+/// - _tool_choice_required: Ignored
+/// - forced_tool_name: When set, becomes the request's `tool_choice` via `ToolChoice::function`
+/// - max_tokens: Stored in the request's `max_tokens` field
+/// - reasoning_effort: Stringified into the request's `reasoning_effort` field
+/// - enable_tool_grammar: Forwarded to the dispatcher
+/// - allow_constraint_api: Forwarded to the dispatcher
+/// - tokenizer: Used to build the `ToolConfig` and forwarded to the dispatcher
+/// - model_type: Used to build the `ToolConfig`
+/// - model_id: Not read by this function
+/// - chat_template: Forwarded to the dispatcher
+/// - disable_reasoning: Forwarded to the dispatcher
+pub fn build_guided_decoding_grammar(
+    guidance_tokens: &crate::utils::guidance::GuidanceTokens,
+    _tool_config: &crate::server::parser::ToolConfig,
+    tools: &[crate::tools::Tool],
+    tool_parser_name: &str,
+    constraint_grammar: Option<TopLevelGrammar>,
+    _tool_choice_required: bool,
+    forced_tool_name: Option<String>,
+    max_tokens: usize,
+    reasoning_effort: Option<crate::utils::config::ReasoningEffort>,
+    enable_tool_grammar: bool,
+    allow_constraint_api: bool,
+    tokenizer: &Tokenizer,
+    model_type: &crate::utils::config::ModelType,
+    model_id: &str,
+    chat_template: Option<ChatTemplate>,
+    disable_reasoning: bool,
+) -> Option<TopLevelGrammar> {
+    // Lark text of the caller-supplied constraint, if there is one; the
+    // dispatcher picks it up from the request's `constraint` field.
+    let constraint = constraint_grammar
+        .as_ref()
+        .map(|grammar| get_lark_from_top_level_grammar(grammar));
+
+    // Request fields that are derived from the arguments.
+    let offered_tools = (!tools.is_empty()).then(|| tools.to_vec());
+    let forced_choice = forced_tool_name.map(|name| crate::tools::ToolChoice::function(name));
+    let effort_label = reasoning_effort.map(|effort| effort.to_string());
+
+    // Everything not derived above is left unset.
+    let synthetic_request = crate::server::ChatCompletionRequest {
+        messages: Vec::new(),
+        model: None,
+        temperature: None,
+        max_tokens: Some(max_tokens),
+        top_k: None,
+        top_p: None,
+        frequency_penalty: None,
+        presence_penalty: None,
+        thinking: None,
+        stop: None,
+        stream: None,
+        stream_options: None,
+        session_id: None,
+        tools: offered_tools,
+        tool_choice: forced_choice,
+        response_format: None,
+        extra_body: None,
+        // The constraint travels through `constraint`, not `structured_outputs`.
+        structured_outputs: None,
+        constraint,
+        // The constraint text extracted above is always Lark.
+        constraint_type: Some(String::from("lark")),
+        reasoning_effort: effort_label,
+    };
+
+    let tool_config = ToolConfig::from_tokenizer(tokenizer, model_type);
+
+    let dispatcher = GrammarRequestDispatcher::new(
+        &synthetic_request,
+        guidance_tokens,
+        &tool_config,
+        enable_tool_grammar,
+        allow_constraint_api,
+        tool_parser_name.to_string(),
+        tokenizer,
+        chat_template,
+        disable_reasoning,
+    );
+
+    dispatcher.build_grammar()
+}
+
+/// Render `s` as a Lark terminal.
+///
+/// When `special` is true, `s` is returned unchanged (it is already in a form
+/// that must not be quoted). Otherwise `s` is wrapped in double quotes, with
+/// backslashes, double quotes and common control characters escaped so the
+/// result is a single well-formed string literal.
+pub fn _lark_literal(s: &str, special: bool) -> String {
+    if special {
+        return s.to_string();
+    }
+
+    // +2 for the surrounding quotes; escapes may grow it further.
+    let mut quoted = String::with_capacity(s.len() + 2);
+    quoted.push('"');
+    for ch in s.chars() {
+        match ch {
+            // An unescaped backslash or quote would end or corrupt the literal.
+            '\\' => quoted.push_str("\\\\"),
+            '"' => quoted.push_str("\\\""),
+            '\n' => quoted.push_str("\\n"),
+            '\r' => quoted.push_str("\\r"),
+            '\t' => quoted.push_str("\\t"),
+            other => quoted.push(other),
+        }
+    }
+    quoted.push('"');
+    quoted
+}
+
+/// Parse structural tag configuration from JSON value
+///
+/// Reads three keys from `value`:
+/// - `start_tag` (falling back to `tag`): required, must be a string.
+/// - `end_tag`: optional string. When absent it is derived from the start
+///   tag's element name, e.g. `<tool>` -> `</tool>` and
+///   `<tool id="1">` -> `</tool>`.
+/// - `schema`: optional; defaults to `{"type": "object"}`.
+///
+/// Returns `(start_tag, end_tag, schema)`.
+///
+/// # Errors
+/// Returns `"Missing start_tag or tag"` when neither key holds a string.
+pub fn parse_structural_tag(value: &Value) -> Result<(String, String, Value), String> {
+    let start_tag = value
+        .get("start_tag")
+        .or_else(|| value.get("tag"))
+        .and_then(|v| v.as_str())
+        .ok_or_else(|| "Missing start_tag or tag".to_string())?
+        .to_string();
+
+    let end_tag = match value.get("end_tag").and_then(|v| v.as_str()) {
+        Some(explicit) => explicit.to_string(),
+        None => {
+            // Strip the angle brackets, then keep only the element name: a
+            // closing tag must not repeat attributes from the opening tag.
+            let inner = start_tag.trim_start_matches('<').trim_end_matches('>');
+            let tag_name = inner.split_whitespace().next().unwrap_or(inner);
+            format!("</{}>", tag_name)
+        }
+    };
+
+    let schema = value
+        .get("schema")
+        .cloned()
+        .unwrap_or_else(|| json!({"type": "object"}));
+
+    Ok((start_tag, end_tag, schema))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Helper function to create mock GuidanceTokens for testing
+    ///
+    /// The ids are fixed placeholder values. Note that the reasoning start/end
+    /// ids and the tool-call start/end ids are deliberately or incidentally the
+    /// same pair (151657 / 151658) in this fixture.
+    fn create_mock_guidance_tokens() -> GuidanceTokens {
+        GuidanceTokens {
+            bos_token_ids: vec![151647],
+            eos_token_ids: vec![151648, 151649],
+            reasoning_start_ids: vec![151657],
+            reasoning_end_ids: vec![151658],
+            tool_call_start_ids: vec![151657],
+            tool_call_end_ids: vec![151658],
+            add_bos_token: false,
+        }
+    }
+
+    /// Helper function to create mock ChatCompletionRequest with tools for testing
+    ///
+    /// Only `tools` is populated (with the given list); `messages` is empty and
+    /// every other field is `None`.
+    fn create_mock_request_with_tools(tools: Vec<Tool>) -> crate::server::ChatCompletionRequest {
+        crate::server::ChatCompletionRequest {
+            messages: vec![],
+            model: None,
+            temperature: None,
+            max_tokens: None,
+            top_k: None,
+            top_p: None,
+            frequency_penalty: None,
+            presence_penalty: None,
+            thinking: None,
+            stop: None,
+            stream: None,
+            stream_options: None,
+            session_id: None,
+            tools: Some(tools),
+            tool_choice: None,
+            response_format: None,
+            extra_body: None,
+            structured_outputs: None,
+            constraint: None,
+            constraint_type: None,
+            reasoning_effort: None,
+        }
+    }
+
+    /// A low-effort reasoning grammar still emits a `reasoning_block` rule
+    /// delimited by both token ids in `<[id]>` form.
+    #[test]
+    fn test_reasoning_grammar_low_effort() {
+        let rendered = ReasoningGrammar::new(151657, 151658, ReasoningEffort::Low).build_lark();
+        for needle in ["reasoning_block", "<[151657]>", "<[151658]>"] {
+            assert!(rendered.contains(needle));
+        }
+    }
+
+    /// A JSON tool-call grammar with no tools still mentions `tool_call`.
+    #[test]
+    fn test_tool_call_grammar_json() {
+        let rendered = ToolCallGrammar::new_json(Vec::new(), 151657, 151658).build_lark();
+        assert!(rendered.contains("tool_call"));
+    }
+
+    /// A choice constraint renders to Lark text containing a `start:` rule.
+    #[test]
+    fn test_structured_outputs_grammar_choice() {
+        let options = vec!["option1".to_string(), "option2".to_string()];
+        let rendered =
+            StructuredOutputsGrammar::new(StructuredConstraint::Choice(options)).build_lark();
+        assert!(rendered.contains("start:"));
+    }
+
+    /// A chat-response grammar built with EOS enabled starts from the `text` rule.
+    #[test]
+    fn test_chat_response_grammar() {
+        let rendered = ChatResponseGrammar::new().with_eos(true).build_lark();
+        assert!(rendered.contains("start: text"));
+    }
+
+    /// A high-effort reasoning grammar contains all four block rules and both
+    /// token ids.
+    #[test]
+    fn test_reasoning_grammar_high_effort_output() {
+        let rendered = ReasoningGrammar::new(151667, 151668, ReasoningEffort::High).build_lark();
+
+        // Block structure first, then the bracketed token ids.
+        let expected_fragments = [
+            "reasoning_block",
+            "analysis_block",
+            "critique_block",
+            "structure_block",
+            "[151667]",
+            "[151668]",
+        ];
+        for fragment in expected_fragments {
+            assert!(rendered.contains(fragment));
+        }
+    }
+
+    /// A yes/no choice constraint renders a `start:` rule that mentions both options.
+    #[test]
+    fn test_choice_constraint_output() {
+        let options = vec!["yes".to_string(), "no".to_string()];
+        let rendered =
+            StructuredOutputsGrammar::new(StructuredConstraint::Choice(options)).build_lark();
+
+        // The rule header and each alternative must appear in the output.
+        for fragment in ["start:", "yes", "no"] {
+            assert!(rendered.contains(fragment));
+        }
+    }
+
+    /// A JSON-schema constraint is rendered through the `%json` directive on the
+    /// `text` rule.
+    #[test]
+    fn test_json_constraint_output() {
+        let object_schema =
+            serde_json::json!({"type": "object", "properties": {"name": {"type": "string"}}});
+        let rendered =
+            StructuredOutputsGrammar::new(StructuredConstraint::Json(object_schema)).build_lark();
+
+        assert!(rendered.contains("text: %json"));
+    }
+
+    /// The JSON tool-call grammar starts from `tool_call` and defines that rule.
+    #[test]
+    fn test_tool_call_grammar_output() {
+        let rendered = ToolCallGrammar::new_json(Vec::new(), 151657, 151658).build_lark();
+
+        for fragment in ["start: tool_call", "tool_call:"] {
+            assert!(rendered.contains(fragment));
+        }
+    }
+
+    /// Alternating a free-text constraint with a tool-call grammar yields Lark
+    /// text that offers `text` as one alternative and still mentions `tool_call`.
+    #[test]
+    fn test_compose_alternate_constraint_and_tool() {
+        // Left side: free-text constraint.
+        let mut text_side = StructuredOutputsGrammar::new(StructuredConstraint::Lark(
+            String::from("start: text\ntext: /(?s:.+?)/"),
+        ));
+
+        // Right side: the tool-call grammar, wrapped as a Lark constraint.
+        let tool_lark = ToolCallGrammar::new_json(Vec::new(), 151657, 151658).build_lark();
+        let mut tool_side = StructuredOutputsGrammar::new(StructuredConstraint::Lark(tool_lark));
+
+        let mut combined = text_side.compose_alternate(&mut tool_side);
+        let rendered = combined.build_lark();
+
+        assert!(rendered.contains("text |"));
+        assert!(rendered.contains("tool_call"));
+    }
+
+    /// Sequencing a reasoning grammar before a free-text constraint keeps both
+    /// the `reasoning_block` rule and the `text` rule in the output.
+    #[test]
+    fn test_compose_sequence_reasoning_and_constraint() {
+        // First part: high-effort reasoning, wrapped as a Lark constraint.
+        let reasoning_lark =
+            ReasoningGrammar::new(151667, 151668, ReasoningEffort::High).build_lark();
+        let mut reasoning_side =
+            StructuredOutputsGrammar::new(StructuredConstraint::Lark(reasoning_lark));
+
+        // Second part: free-text constraint.
+        let mut text_side = StructuredOutputsGrammar::new(StructuredConstraint::Lark(
+            String::from("start: text\ntext: /(?s:.+?)/"),
+        ));
+
+        let mut combined = reasoning_side.compose_sequence(&mut text_side);
+        let rendered = combined.build_lark();
+
+        assert!(rendered.contains("reasoning_block"));
+        assert!(rendered.contains("text"));
+    }
+
+    /// Alternating three constraints in a chain keeps every rule name in the output.
+    #[test]
+    fn test_compose_alternate_multiple_constraints() {
+        // Builds a two-line grammar: `start: <rule>` followed by the rule body.
+        // A normal (non-raw) string is used so `\n` is a real newline; inside a
+        // raw string it would be a literal backslash-n and the grammar would
+        // collapse onto one line. The regex escapes are doubled to stay literal.
+        let make_constraint = |rule: &str| {
+            StructuredOutputsGrammar::new(StructuredConstraint::Lark(format!(
+                "start: {0}\n{0}: /[\\x20-\\x7E\\x0A\\x0D]+?/",
+                rule
+            )))
+        };
+
+        // Create multiple constraint grammars
+        let constraint1 = make_constraint("text1");
+        let mut constraint2 = make_constraint("text2");
+        let mut constraint3 = make_constraint("text3");
+
+        // Compose all with alternation
+        let mut result = constraint1;
+        result = result.compose_alternate(&mut constraint2);
+        result = result.compose_alternate(&mut constraint3);
+
+        let lark = result.build_lark();
+
+        // Should contain all three alternatives
+        // The alternation format is: start: ( (text1 | text2)+ | text3 )+
+        assert!(lark.contains("text1"));
+        assert!(lark.contains("text2"));
+        assert!(lark.contains("text3"));
+    }
+
+    /// An empty-string choice is rejected by `build_choice_lark_grammar`.
+    #[test]
+    fn test_build_choice_lark_grammar_empty_string() {
+        let only_empty = [String::new()];
+        assert!(build_choice_lark_grammar(&only_empty).is_err());
+    }
+
+    /// `sanitize_schema_for_llguidance` must drop annotation-only keywords
+    /// (`default`, `description`, `title`) from property schemas while leaving
+    /// validation keywords (`type`, `minimum`, `maximum`) intact.
+    #[test]
+    fn test_sanitize_schema_for_llguidance_strips_metadata_fields() {
+        // Test that metadata fields like default, description, title are stripped
+        // while validation fields like minimum, maximum, type are preserved
+        let schema = json!({
+            "type": "object",
+            "properties": {
+                "count": {
+                    "type": "integer",
+                    "minimum": 1,
+                    "maximum": 200,
+                    "default": 50,
+                    "description": "A count parameter",
+                    "title": "Count"
+                },
+                "enabled": {
+                    "type": "boolean",
+                    "default": true
+                }
+            },
+            "required": ["count"]
+        });
+        let sanitized = sanitize_schema_for_llguidance(&schema);
+
+        // Metadata fields should be stripped
+        assert!(sanitized["properties"]["count"].get("default").is_none(), "default should be stripped");
+        assert!(sanitized["properties"]["count"].get("description").is_none(), "description should be stripped");
+        assert!(sanitized["properties"]["count"].get("title").is_none(), "title should be stripped");
+
+        // Validation fields should be preserved
+        assert_eq!(sanitized["properties"]["count"]["type"], "integer");
+        assert_eq!(sanitized["properties"]["count"]["minimum"], 1);
+        assert_eq!(sanitized["properties"]["count"]["maximum"], 200);
+
+        // Boolean with default
+        assert!(sanitized["properties"]["enabled"].get("default").is_none());
+        assert_eq!(sanitized["properties"]["enabled"]["type"], "boolean");
+    }
+
+    /// `sanitize_schema_for_llguidance` must inline `#/$defs/...` references:
+    /// the top-level `$defs` section disappears and each `$ref` is replaced by
+    /// the definition it pointed at.
+    #[test]
+    fn test_sanitize_schema_for_llguidance_resolves_refs() {
+        // Test that $defs section is resolved and $ref is replaced with actual definition
+        // This is critical for tool schemas that use JSON Schema definitions
+        let schema = json!({
+            "$defs": {
+                "AskUserQuestion": {
+                    "type": "object",
+                    "properties": {
+                        "label": {"type": "string"},
+                        "question": {"type": "string"},
+                        "options": {"type": "array", "items": {"type": "string"}}
+                    },
+                    "required": ["label", "question", "options"]
+                },
+                "AskUserOption": {
+                    "type": "object",
+                    "properties": {
+                        "value": {"type": "string"},
+                        "label": {"type": "string"}
+                    },
+                    "required": ["value", "label"]
+                }
+            },
+            "type": "object",
+            "properties": {
+                "questions": {
+                    "type": "array",
+                    "items": {"$ref": "#/$defs/AskUserQuestion"}
+                }
+            },
+            "required": ["questions"]
+        });
+
+        let sanitized = sanitize_schema_for_llguidance(&schema);
+
+        // Debug: print the sanitized schema
+        // (stderr output is only shown for failing tests unless run with --nocapture)
+        eprintln!("Original schema: {}", serde_json::to_string_pretty(&schema).unwrap());
+        eprintln!("Sanitized schema: {}", serde_json::to_string_pretty(&sanitized).unwrap());
+
+        // $defs should be REMOVED (not preserved) - refs are resolved
+        assert!(sanitized.get("$defs").is_none(), "$defs should be removed after resolution");
+
+        // $ref should be replaced with the actual definition
+        let items_schema = &sanitized["properties"]["questions"]["items"];
+        assert!(items_schema.get("$ref").is_none(), "$ref should be replaced");
+
+        // The items should now have the resolved definition (type: object with properties)
+        assert_eq!(items_schema["type"], "object", "items should be resolved to object type");
+        assert!(items_schema.get("properties").is_some(), "resolved items should have properties");
+    }
+
+    /// Regression test using a full `ask_user` tool schema.
+    ///
+    /// Unlike the other fixtures, the references here are written as
+    /// `$defs/Name` (no leading `#/`), and one definition refers to another, so
+    /// this exercises both that reference spelling and nested resolution.
+    #[test]
+    fn test_full_tool_offer_schema_with_defs_resolved() {
+        // Test with the exact tool offer schema from the error log
+        // This verifies that $defs are resolved and $refs are replaced with actual definitions
+        let schema = json!({
+            "$defs": {
+                "AskUserOption": {
+                    "description": "A predefined answer option for a question.",
+                    "properties": {
+                        "description": {
+                            "description": "Optional description shown below the label",
+                            "nullable": true,
+                            "type": "string"
+                        },
+                        "label": {
+                            "description": "Display label for the option",
+                            "type": "string"
+                        },
+                        "selected": {
+                            "default": false,
+                            "description": "Default selection state when multi_select is true.",
+                            "type": "boolean"
+                        },
+                        "value": {
+                            "description": "Value to return to LLM when selected",
+                            "type": "string"
+                        }
+                    },
+                    "required": ["value", "label"],
+                    "type": "object"
+                },
+                "AskUserQuestion": {
+                    "description": "A single question presented to the user.",
+                    "properties": {
+                        "allow_custom": {
+                            "default": true,
+                            "description": "Whether to allow custom text input (default: true)",
+                            "type": "boolean"
+                        },
+                        "label": {
+                            "description": "Short unique label for tab display (max ~15 chars recommended)",
+                            "type": "string"
+                        },
+                        "multi_select": {
+                            "default": false,
+                            "description": "When true, user can select/deselect multiple options.",
+                            "type": "boolean"
+                        },
+                        "options": {
+                            "description": "Predefined answer options",
+                            "items": {"$ref": "$defs/AskUserOption"},
+                            "type": "array"
+                        },
+                        "question": {
+                            "description": "Full question text to display",
+                            "type": "string"
+                        }
+                    },
+                    "required": ["label", "question", "options"],
+                    "type": "object"
+                }
+            },
+            "$schema": "https://json-schema.org/draft/2020-12/schema",
+            "description": "Request payload for the `ask_user` tool.",
+            "properties": {
+                "questions": {
+                    "description": "List of questions to ask the user.",
+                    "items": {"$ref": "$defs/AskUserQuestion"},
+                    "type": "array"
+                }
+            },
+            "required": ["questions"],
+            "title": "AskUserRequest",
+            "type": "object"
+        });
+
+        let sanitized = sanitize_schema_for_llguidance(&schema);
+
+        // Debug: print the sanitized schema
+        eprintln!("Full tool offer sanitized schema: {}", serde_json::to_string_pretty(&sanitized).unwrap());
+
+        // $defs should be REMOVED (not preserved) - refs are resolved
+        assert!(sanitized.get("$defs").is_none(), "$defs should be removed after resolution");
+
+        // $ref in properties should be replaced with actual definition
+        let questions_items = &sanitized["properties"]["questions"]["items"];
+        assert!(questions_items.get("$ref").is_none(), "$ref should be replaced with resolved definition");
+
+        // The resolved items should have the actual definition (type: object with properties)
+        assert_eq!(questions_items["type"], "object", "items should be resolved to object type");
+        assert!(questions_items.get("properties").is_some(), "resolved items should have properties");
+        assert!(questions_items.get("required").is_some(), "resolved items should have required");
+
+        // $ref inside $defs should also be resolved
+        // (walks questions -> items -> options -> items, i.e. the inlined AskUserOption)
+        let ask_user_question = sanitized["properties"]["questions"]["items"].clone();
+        let options = ask_user_question["properties"]["options"].clone();
+        let items = options["items"].clone();
+        assert!(items.get("$ref").is_none(), "$ref inside options should be resolved");
+        assert_eq!(items["type"], "object", "resolved items should be object type");
+    }
+
+    /// `extract_defs` splits a schema into (schema without `$defs`, map of
+    /// definitions keyed by name) and leaves the rest of the schema in place.
+    #[test]
+    fn test_extract_defs_simple_schema() {
+        // Test that extract_defs correctly extracts definitions from $defs section
+        let schema = json!({
+            "$defs": {
+                "Question": {
+                    "type": "object",
+                    "properties": {
+                        "text": {"type": "string"}
+                    }
+                },
+                "Option": {
+                    "type": "object",
+                    "properties": {
+                        "value": {"type": "string"}
+                    }
+                }
+            },
+            "type": "object",
+            "properties": {
+                "question": {"$ref": "#/$defs/Question"}
+            }
+        });
+
+        let (schema_without_defs, defs) = extract_defs(&schema);
+
+        // $defs should be removed from schema
+        assert!(schema_without_defs.get("$defs").is_none(), "$defs should be removed from schema");
+
+        // Definitions should be extracted
+        assert_eq!(defs.len(), 2, "Should extract 2 definitions");
+        assert!(defs.contains_key("Question"), "Should contain Question definition");
+        assert!(defs.contains_key("Option"), "Should contain Option definition");
+
+        // Schema structure should be preserved
+        assert_eq!(schema_without_defs["type"], "object", "Schema type should be preserved");
+        assert!(schema_without_defs.get("properties").is_some(), "Properties should be preserved");
+    }
+
+    /// `extract_defs` still returns every definition when one definition
+    /// references another. Only extraction is checked here; whether the nested
+    /// `$ref` is rewritten is not asserted.
+    #[test]
+    fn test_extract_defs_nested_refs() {
+        // Test that extract_defs handles nested $ref within definitions
+        let schema = json!({
+            "$defs": {
+                "Option": {
+                    "type": "object",
+                    "properties": {
+                        "value": {"type": "string"},
+                        "label": {"type": "string"}
+                    }
+                },
+                "Question": {
+                    "type": "object",
+                    "properties": {
+                        "text": {"type": "string"},
+                        "options": {
+                            "type": "array",
+                            "items": {"$ref": "#/$defs/Option"}
+                        }
+                    }
+                }
+            },
+            "type": "object",
+            "properties": {
+                "questions": {
+                    "type": "array",
+                    "items": {"$ref": "#/$defs/Question"}
+                }
+            }
+        });
+
+        let (schema_without_defs, defs) = extract_defs(&schema);
+
+        // All definitions should be extracted
+        assert_eq!(defs.len(), 2, "Should extract 2 definitions");
+        assert!(defs.contains_key("Option"), "Should contain Option definition");
+        assert!(defs.contains_key("Question"), "Should contain Question definition");
+
+        // $defs should be removed from schema
+        assert!(schema_without_defs.get("$defs").is_none(), "$defs should be removed");
+    }
+
+    /// `resolve_schema_refs` swaps a `$ref` for the matching entry of the
+    /// definitions map.
+    #[test]
+    fn test_resolve_schema_refs_simple() {
+        // One known definition, registered under the name the schema refers to.
+        let mut defs: HashMap<String, Value> = HashMap::new();
+        defs.insert(
+            "Question".to_string(),
+            json!({
+                "type": "object",
+                "properties": {
+                    "text": {"type": "string"}
+                }
+            }),
+        );
+
+        let schema = json!({
+            "type": "object",
+            "properties": {
+                "question": {"$ref": "#/$defs/Question"}
+            }
+        });
+
+        let resolved = resolve_schema_refs(&schema, &defs);
+        let question = &resolved["properties"]["question"];
+
+        // The reference is gone and the definition's content is in its place.
+        assert!(question.get("$ref").is_none(), "$ref should be replaced");
+        assert_eq!(question["type"], "object", "Should have resolved type");
+        assert!(question.get("properties").is_some(), "Should have resolved properties");
+    }
+
+    /// `resolve_schema_refs` also resolves a `$ref` that sits inside a
+    /// definition which was itself pulled in through a `$ref`.
+    #[test]
+    fn test_resolve_schema_refs_nested() {
+        let mut defs: HashMap<String, Value> = HashMap::new();
+        defs.insert(
+            "Option".to_string(),
+            json!({
+                "type": "object",
+                "properties": {
+                    "value": {"type": "string"}
+                }
+            }),
+        );
+        // `Question` refers to `Option`, giving a two-level reference chain.
+        defs.insert(
+            "Question".to_string(),
+            json!({
+                "type": "object",
+                "properties": {
+                    "text": {"type": "string"},
+                    "options": {
+                        "type": "array",
+                        "items": {"$ref": "#/$defs/Option"}
+                    }
+                }
+            }),
+        );
+
+        let schema = json!({
+            "type": "object",
+            "properties": {
+                "questions": {
+                    "type": "array",
+                    "items": {"$ref": "#/$defs/Question"}
+                }
+            }
+        });
+
+        let resolved = resolve_schema_refs(&schema, &defs);
+
+        // First level: questions.items was a reference to Question.
+        let question = &resolved["properties"]["questions"]["items"];
+        assert!(question.get("$ref").is_none(), "Top-level $ref should be resolved");
+        assert_eq!(question["type"], "object", "Should have resolved type");
+
+        // Second level: Question.options.items was a reference to Option.
+        let option = &question["properties"]["options"]["items"];
+        assert!(option.get("$ref").is_none(), "Nested $ref should be resolved");
+        assert_eq!(option["type"], "object", "Nested items should have resolved type");
+    }
+
+    /// With no matching definition, `resolve_schema_refs` leaves the `$ref` in
+    /// place rather than panicking.
+    #[test]
+    fn test_resolve_schema_refs_missing_def() {
+        let no_defs: HashMap<String, Value> = HashMap::new();
+
+        let schema = json!({
+            "type": "object",
+            "properties": {
+                "question": {"$ref": "#/$defs/MissingType"}
+            }
+        });
+
+        let resolved = resolve_schema_refs(&schema, &no_defs);
+        let question = &resolved["properties"]["question"];
+
+        assert!(question.get("$ref").is_some(), "$ref should remain when def is missing");
+    }
+
+    /// End-to-end smoke test: sanitize a schema whose definitions reference each
+    /// other (using the `$defs/Name` spelling), check that `$defs` and the
+    /// top-level `$ref` are gone, then hand the result to llguidance.
+    #[test]
+    fn test_llguidance_json_schema_with_defs_compiles() {
+        // Test that llguidance can compile a JSON schema with resolved $defs
+        // This verifies the fix works end-to-end with llguidance
+        use llguidance::api::TopLevelGrammar;
+
+        let schema = json!({
+            "$defs": {
+                "AskUserQuestion": {
+                    "type": "object",
+                    "properties": {
+                        "label": {"type": "string"},
+                        "question": {"type": "string"},
+                        "options": {
+                            "type": "array",
+                            "items": {"$ref": "$defs/AskUserOption"}
+                        }
+                    },
+                    "required": ["label", "question", "options"]
+                },
+                "AskUserOption": {
+                    "type": "object",
+                    "properties": {
+                        "value": {"type": "string"},
+                        "label": {"type": "string"}
+                    },
+                    "required": ["value", "label"]
+                }
+            },
+            "type": "object",
+            "properties": {
+                "questions": {
+                    "type": "array",
+                    "items": {"$ref": "$defs/AskUserQuestion"}
+                }
+            },
+            "required": ["questions"]
+        });
+
+        let sanitized = sanitize_schema_for_llguidance(&schema);
+
+        // Debug: print the sanitized schema
+        eprintln!("Sanitized schema for llguidance: {}", serde_json::to_string_pretty(&sanitized).unwrap());
+
+        // $defs should be removed (refs are resolved)
+        assert!(sanitized.get("$defs").is_none(), "$defs should be removed after resolution");
+
+        // $ref should be replaced with actual definition
+        let questions_items = &sanitized["properties"]["questions"]["items"];
+        assert!(questions_items.get("$ref").is_none(), "$ref should be replaced");
+
+        // Try to compile the sanitized schema with llguidance
+        // This should not fail if $defs is properly resolved
+        // NOTE(review): the value is bound to `_grammar` and never inspected, so this
+        // only fails on a panic. `from_json_schema` presumably just wraps the schema
+        // without compiling it - confirm whether this line really validates anything.
+        let _grammar = TopLevelGrammar::from_json_schema(sanitized);
+        // If we get here without panicking, the schema compiled successfully
+    }
+
+    /// Sanitizing must keep every key under `properties` (they are field names,
+    /// not annotations) while still stripping `description` from inside each
+    /// property schema.
+    #[test]
+    fn test_sanitize_schema_for_llguidance_preserves_property_names() {
+        // Test that property names (field names) are preserved
+        let schema = json!({
+            "type": "object",
+            "properties": {
+                "city": {"type": "string", "description": "City name"},
+                "mode": {"type": "string", "description": "Mode of transport"},
+                "count": {"type": "integer", "minimum": 1, "maximum": 200}
+            },
+            "required": ["city", "mode"]
+        });
+        let sanitized = sanitize_schema_for_llguidance(&schema);
+
+        // Property names should be preserved
+        assert!(sanitized["properties"].get("city").is_some(), "city property should be preserved");
+        assert!(sanitized["properties"].get("mode").is_some(), "mode property should be preserved");
+        assert!(sanitized["properties"].get("count").is_some(), "count property should be preserved");
+
+        // Metadata should be stripped from properties
+        assert!(sanitized["properties"]["city"].get("description").is_none(), "description should be stripped from city");
+        assert!(sanitized["properties"]["mode"].get("description").is_none(), "description should be stripped from mode");
+    }
+
+    #[test]
+    fn test_sanitize_schema_for_llguidance_preserves_nested_properties() {
+        // Test that nested properties are correctly processed
+        let schema = json!({
+            "type": "object",
+            "properties": {
+                "outer": {
+                    "type": "object",
+                    "properties": {
+                        "inner": {
+                            "type": "string",
+                            "description": "Inner value"
+                        }
+                    },
+                    "required": ["inner"]
+                }
+            },
+            "required": ["outer"]
+        });
+        let sanitized = sanitize_schema_for_llguidance(&schema);
+
+        // Nested property names should be preserved
+        assert!(sanitized["properties"]["outer"]["properties"].get("inner").is_some(), "inner property should be preserved");
+
+        // Metadata should be stripped from nested properties
+        assert!(sanitized["properties"]["outer"]["properties"]["inner"].get("description").is_none(), "description should be stripped");
+    }
+
+    #[test]
+    fn test_sanitize_schema_for_llguidance_strips_format() {
+        // Despite the test name, `format` must be preserved: it is a validation keyword in llguidance.
+        // Only metadata fields like description, default, title are stripped
+        let schema = json!({
+            "type": "object",
+            "properties": {
+                "url": {"type": "string", "format": "uri"},
+                "email": {"type": "string", "format": "email"}
+            }
+        });
+        let sanitized = sanitize_schema_for_llguidance(&schema);
+
+        // Format should be preserved (it's a validation keyword)
+        assert_eq!(sanitized["properties"]["url"]["format"], "uri");
+        assert_eq!(sanitized["properties"]["email"]["format"], "email");
+
+        // Type should be preserved
+        assert_eq!(sanitized["properties"]["url"]["type"], "string");
+        assert_eq!(sanitized["properties"]["email"]["type"], "string");
+    }
+
+    #[test]
+    fn test_sanitize_schema_for_llguidance_preserves_nullable_types() {
+        // Test that nullable types (array of types) are preserved
+        let schema = json!({
+            "type": "object",
+            "properties": {
+                "cwd": {"type": ["string", "null"]}
+            },
+            "required": ["cwd"]
+        });
+        let sanitized = sanitize_schema_for_llguidance(&schema);
+
+        // Nullable types should be preserved
+        assert_eq!(
+            sanitized["properties"]["cwd"]["type"],
+            json!(["string", "null"])
+        );
+    }
+
+    #[test]
+    fn test_sanitize_schema_for_llguidance_strips_examples() {
+        // Test that examples field is stripped (it's metadata)
+        let schema = json!({
+            "type": "string",
+            "examples": ["option1", "option2", "option3"]
+        });
+        let sanitized = sanitize_schema_for_llguidance(&schema);
+
+        // Examples should be stripped
+        assert!(sanitized.get("examples").is_none(), "examples should be stripped");
+
+        // Type should be preserved
+        assert_eq!(sanitized["type"], "string");
+    }
+
+    #[test]
+    fn test_sanitize_schema_for_llguidance_strips_default_in_array() {
+        // Test that default is stripped even when nested in arrays
+        let schema = json!({
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "name": {"type": "string", "default": "unknown"}
+                }
+            }
+        });
+        let sanitized = sanitize_schema_for_llguidance(&schema);
+
+        // Default should be stripped from nested properties
+        assert!(sanitized["items"]["properties"]["name"].get("default").is_none(), "default should be stripped");
+
+        // Type should be preserved
+        assert_eq!(sanitized["items"]["properties"]["name"]["type"], "string");
+    }
+
+    #[test]
+    fn test_parse_structural_tag_missing_schema() {
+        let value = json!({});
+        let result = parse_structural_tag(&value);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_parse_structural_tag_start_end() {
+        let value = json!({
+            "start_tag": "<tool>",
+            "end_tag": "</tool>",
+            "schema": {"type": "object"}
+        });
+        let result = parse_structural_tag(&value);
+        assert!(result.is_ok());
+        let (start, end, schema) = result.unwrap();
+        assert_eq!(start, "<tool>");
+        assert_eq!(end, "</tool>");
+        assert_eq!(schema, json!({"type": "object"}));
+    }
+
+    #[test]
+    fn test_parse_structural_tag_tag() {
+        let value = json!({
+            "tag": "<tool>",
+            "schema": {"type": "object"}
+        });
+        let result = parse_structural_tag(&value);
+        assert!(result.is_ok());
+        let (start, end, _) = result.unwrap();
+        assert_eq!(start, "<tool>");
+        assert_eq!(end, "</tool>");
+    }
+
+    #[test]
+    fn test_parse_structural_tag_invalid() {
+        let value = json!({
+            "schema": {"type": "object"}
+        });
+        let result = parse_structural_tag(&value);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_lark_quote_escapes_special_chars() {
+        let result = lark_quote("test\"value");
+        assert!(result.contains("test\\\"value"));
+    }
+
+    #[test]
+    fn test_lark_literal_special_tags() {
+        let result = _lark_literal("<tool>", true);
+        assert_eq!(result, "<tool>");
+    }
+
+    #[test]
+    fn test_lark_literal_regular_string() {
+        let result = _lark_literal("regular", false);
+        assert!(result.contains("\"regular\""));
+    }
+
+    #[test]
+    fn test_lark_special_token_single_id() {
+        let mut ids = HashSet::new();
+        ids.insert(151657);
+        let result = lark_special_token(&ids);
+        assert_eq!(result, "<[151657]>");
+    }
+
+    #[test]
+    fn test_lark_special_token_multiple_ids() {
+        let mut ids = HashSet::new();
+        ids.insert(151657);
+        ids.insert(151658);
+        let result = lark_special_token(&ids);
+        assert!(result.contains("[151657]"));
+        assert!(result.contains("[151658]"));
+    }
+
+    #[test]
+    fn test_lark_special_token_empty() {
+        let ids = HashSet::new();
+        let result = lark_special_token(&ids);
+        assert_eq!(result, "");
+    }
+
+    #[test]
+    fn test_build_xml_tool_lark_grammar_qwen3_coder_required_only() {
+        // Test Qwen3-Coder XML tool format with required attributes only
+        let tools = vec![crate::tools::ToolBuilder::new(
+            "search".to_string(),
+            "Search the web".to_string(),
+        )
+        .param("query", "string", "Search query", true)
+        .build()];
+        let request = create_mock_request_with_tools(tools);
+        let guidance_tokens = create_mock_guidance_tokens();
+        let tool_config = crate::server::parser::ToolConfig::for_model_type(
+            &crate::utils::config::ModelType::Qwen3,
+        );
+        let tokenizer = Tokenizer::from_pretrained("bert-base-uncased".to_string(), None).unwrap();
+        let dispatcher = GrammarRequestDispatcher::new(
+            &request,
+            &guidance_tokens,
+            &tool_config,
+            true,
+            false,
+            "qwen_coder".to_string(),
+            &tokenizer,
+            None,
+            false, // disable_reasoning
+        );
+        let mut grammar = dispatcher
+            .build_tool_grammar()
+            .expect("Should have tool grammar");
+        let lark_str = grammar.build_lark();
+        println!("{}", &lark_str);
+
+        // Qwen3Coder uses XML format with start: tool_call
+        assert!(
+            lark_str.contains("start: tool_call"),
+            "Should have start: tool_call"
+        );
+        assert!(
+            lark_str.contains("<function=search>"),
+            "Should contain function tag"
+        );
+        assert!(lark_str.contains("tool_0:"), "Should contain tool_0 rule");
+    }
+
+    #[test]
+    fn test_build_xml_tool_lark_grammar_qwen3_coder_optional() {
+        // Test Qwen3-Coder XML tool format with optional attributes
+        let tools = vec![crate::tools::ToolBuilder::new(
+            "get_weather".to_string(),
+            "Get weather".to_string(),
+        )
+        .param("city", "string", "City name", true)
+        .param("units", "string", "Temperature units (optional)", false)
+        .build()];
+        let request = create_mock_request_with_tools(tools);
+        let guidance_tokens = create_mock_guidance_tokens();
+        let tool_config = crate::server::parser::ToolConfig::for_model_type(
+            &crate::utils::config::ModelType::Qwen3,
+        );
+        let tokenizer = Tokenizer::from_pretrained("bert-base-uncased".to_string(), None).unwrap();
+        let dispatcher = GrammarRequestDispatcher::new(
+            &request,
+            &guidance_tokens,
+            &tool_config,
+            true,
+            false,
+            "qwen_coder".to_string(),
+            &tokenizer,
+            None,
+            false, // disable_reasoning
+        );
+        let mut grammar = dispatcher
+            .build_tool_grammar()
+            .expect("Should have tool grammar");
+        let lark_str = grammar.build_lark();
+
+        assert!(
+            lark_str.contains("start: tool_call"),
+            "Should have start: tool_call"
+        );
+        assert!(
+            lark_str.contains("<function=get_weather>"),
+            "Should contain function tag"
+        );
+        assert!(lark_str.contains("city"), "Should contain city parameter");
+        assert!(
+            lark_str.contains("units"),
+            "Should contain optional units parameter"
+        );
+    }
+
+    #[test]
+    fn test_build_xml_tool_lark_grammar_qwen3_coder_deep_parameters() {
+        // Test Qwen3-Coder XML tool format with nested/complex parameters
+        let tools = vec![crate::tools::ToolBuilder::new(
+            "edit_file".to_string(),
+            "Edit a file with complex parameters".to_string(),
+        )
+        .param("file_path", "string", "Path to the file", true)
+        .param("old_string", "string", "String to replace", true)
+        .param("new_string", "string", "Replacement string", true)
+        .param("replace_all", "boolean", "Replace all occurrences", false)
+        .build()];
+        let request = create_mock_request_with_tools(tools);
+        let guidance_tokens = create_mock_guidance_tokens();
+        let tool_config = crate::server::parser::ToolConfig::for_model_type(
+            &crate::utils::config::ModelType::Qwen3,
+        );
+        let tokenizer = Tokenizer::from_pretrained("bert-base-uncased".to_string(), None).unwrap();
+        let dispatcher = GrammarRequestDispatcher::new(
+            &request,
+            &guidance_tokens,
+            &tool_config,
+            true,
+            false,
+            "qwen_coder".to_string(),
+            &tokenizer,
+            None,
+            false, // disable_reasoning
+        );
+        let mut grammar = dispatcher
+            .build_tool_grammar()
+            .expect("Should have tool grammar");
+        let lark_str = grammar.build_lark();
+        println!("XML Grammar:\n{}", &lark_str);
+
+        // Verify the grammar contains XML structure
+        assert!(
+            lark_str.contains("start: tool_call"),
+            "Should have start: tool_call"
+        );
+        // Note: <function=...> uses U+200C (zero-width non-joiner) which is invisible
+        assert!(
+            lark_str.contains("function="),
+            "Should contain function tag with attribute"
+        );
+
+        // Verify all parameter tags are present
+        // Note: <parameter=...> uses U+200C (zero-width non-joiner) which is invisible
+        assert!(
+            lark_str.contains("parameter=file_path"),
+            "Should contain file_path parameter tag"
+        );
+        assert!(
+            lark_str.contains("parameter=old_string"),
+            "Should contain old_string parameter tag"
+        );
+        assert!(
+            lark_str.contains("parameter=new_string"),
+            "Should contain new_string parameter tag"
+        );
+        assert!(
+            lark_str.contains("parameter=replace_all"),
+            "Should contain replace_all parameter tag"
+        );
+
+        // Verify all string params share the same consolidated value_string rule
+        assert!(
+            lark_str.contains("value_string"),
+            "All string params should use the consolidated value_string rule"
+        );
+
+        // Verify non-string types still have unique rules
+        assert!(
+            lark_str.contains("value_0_3_boolean"),
+            "Boolean param should have its own unique rule"
+        );
+        assert!(
+            lark_str.contains("param_0_0:"),
+            "Should have param_0_0 rule for first param"
+        );
+        assert!(
+            lark_str.contains("param_0_1:"),
+            "Should have param_0_1 rule for second param"
+        );
+        assert!(
+            lark_str.contains("param_0_2:"),
+            "Should have param_0_2 rule for third param"
+        );
+        assert!(
+            lark_str.contains("param_0_3:"),
+            "Should have param_0_3 rule for fourth param"
+        );
+
+        // Verify tool rule has all parameters
+        assert!(lark_str.contains("tool_0:"), "Should have tool_0 rule");
+
+        // Sanity check on rule naming: count every occurrence of the `value_` prefix, which
+        // covers the shared value_string rule and per-type rules named value_{tool_idx}_{param_idx}_{type}
+        let value_string_count = lark_str.matches("value_").count();
+        assert!(
+            value_string_count >= 4,
+            "Each param should have its own value rule (no deduplication), found {}",
+            value_string_count
+        );
+    }
+
+    #[test]
+    fn test_xml_grammar_required_params_no_wrapper() {
+        // Test that XML grammar puts required params directly without (...) * wrapper
+        let tools = vec![crate::tools::ToolBuilder::new(
+            "search_tool".to_string(),
+            "Search tool".to_string(),
+        )
+        .param("query", "string", "Search query", true) // REQUIRED - should appear as bare rule reference
+        .build()];
+
+        let request = create_mock_request_with_tools(tools);
+        let guidance_tokens = create_mock_guidance_tokens();
+        let tool_config = crate::server::parser::ToolConfig::for_model_type(
+            &crate::utils::config::ModelType::Qwen3,
+        );
+        let tokenizer = Tokenizer::from_pretrained("bert-base-uncased".to_string(), None).unwrap();
+        let dispatcher = GrammarRequestDispatcher::new(
+            &request,
+            &guidance_tokens,
+            &tool_config,
+            true,
+            false,
+            "qwen_coder".to_string(),
+            &tokenizer,
+            None,
+            false, // disable_reasoning
+        );
+        let mut grammar = dispatcher
+            .build_tool_grammar()
+            .expect("Should have tool grammar");
+        let lark_str = grammar.build_lark();
+
+        // Verify tool rule has all parameters
+        assert!(lark_str.contains("tool_0:"), "Should have tool_0 rule");
+        assert!(
+            lark_str.contains("value_string"),
+            "Should have the consolidated value_string rule for string params"
+        );
+
+        // NOTE: the "no ()* wrapper around required params" property is not asserted here yet.
+    }
+
+    #[test]
+    fn test_xml_grammar_optional_params_wrapped() {
+        // Test that XML grammar wraps optional params with (...) * syntax
+        let tools = vec![crate::tools::ToolBuilder::new(
+            "mixed_tool".to_string(),
+            "Mixed params".to_string(),
+        )
+        .param("required_param", "string", "Required", true) // REQUIRED
+        .param("optional_param", "string", "Optional", false) // OPTIONAL
+        .build()];
+
+        let request = create_mock_request_with_tools(tools);
+        let guidance_tokens = create_mock_guidance_tokens();
+        let tool_config = crate::server::parser::ToolConfig::for_model_type(
+            &crate::utils::config::ModelType::Qwen3,
+        );
+        let tokenizer = Tokenizer::from_pretrained("bert-base-uncased".to_string(), None).unwrap();
+        let dispatcher = GrammarRequestDispatcher::new(
+            &request,
+            &guidance_tokens,
+            &tool_config,
+            true,
+            false,
+            "qwen_coder".to_string(),
+            &tokenizer,
+            None,
+            false, // disable_reasoning
+        );
+        let mut grammar = dispatcher
+            .build_tool_grammar()
+            .expect("Should have tool grammar");
+        let lark_str = grammar.build_lark();
+
+        println!("XML Grammar for mixed tool:\n{}", lark_str);
+
+        // Optional params are expected in a (...)* group; only the tool_0 rule's presence is asserted here.
+        assert!(lark_str.contains("tool_0:"), "Should have tool_0 rule");
+    }
+
+    #[test]
+    fn test_xml_tool_call_structure_validates() {
+        // Full end-to-end: verify XML grammar produces valid llguidance TopLevelGrammar structure
+        let tools =
+            vec![
+                crate::tools::ToolBuilder::new("formatter".to_string(), "Formatter".to_string())
+                    .param("text", "string", "Text to format", true)
+                    .build(),
+            ];
+
+        let request = create_mock_request_with_tools(tools);
+        let guidance_tokens = create_mock_guidance_tokens();
+        let tool_config = crate::server::parser::ToolConfig::for_model_type(
+            &crate::utils::config::ModelType::Qwen3,
+        );
+        let tokenizer = Tokenizer::from_pretrained("bert-base-uncased".to_string(), None).unwrap();
+        let dispatcher = GrammarRequestDispatcher::new(
+            &request,
+            &guidance_tokens,
+            &tool_config,
+            true,
+            false,
+            "qwen_coder".to_string(),
+            &tokenizer,
+            None,
+            false, // disable_reasoning
+        );
+        let grammar = dispatcher
+            .build_tool_grammar()
+            .expect("Should have tool grammar");
+
+        // Verify the grammar has at least one tool (Lark output is not built in this test)
+        assert!(
+            grammar.tools.len() > 0,
+            "Should have generated tool grammar"
+        );
+    }
+/*
+    #[test]
+    fn test_gemma4_tool_call_format_matches_template() {
+        // Test that Gemma4 format matches chat_template.jinja specification
+        // Template: <|tool_call>call:function_name{key: value}<tool_call|>
+        // Generated Lark should match: <[start_token]> "call:function_name{...}" <[end_token]>
+
+        let tools = vec![crate::tools::ToolBuilder::new(
+            "search".to_string(),
+            "Search the web".to_string(),
+        )
+        .param("query", "string", "Search query", true)
+        .param("limit", "integer", "Result limit", false)
+        .build()];
+
+        // Create Gemma4 grammar with mock token IDs
+        let start_token_id = 151657u32;
+        let end_token_id = 151658u32;
+        let mut grammar = ToolCallGrammar::new_gemma4(tools, start_token_id, end_token_id);
+
+        let lark_str = grammar.build_lark();
+
+        // Verify structure matches template format
+        assert!(lark_str.contains(&format!("<[{}]>", start_token_id)), "Should contain start token");
+        assert!(lark_str.contains(&format!("<[{}]>", end_token_id)), "Should contain end token");
+        assert!(lark_str.contains("call:search"), "Should contain call:function_name format");
+
+        // Verify the arguments pattern is present
+        assert!(lark_str.contains("query"), "Should contain query parameter");
+        assert!(lark_str.contains("limit"), "Should contain limit parameter");
+
+        // Print for debugging
+        println!("Gemma4 Lark grammar:\n{}", lark_str);
+    }
+
+    #[test]
+    fn test_for_model_type_with_override() {
+        // Test that parser_name override takes precedence over model_type
+        let tools = vec![crate::tools::ToolBuilder::new(
+            "search".to_string(),
+            "Search the web".to_string(),
+        )
+        .param("query", "string", "Search query", true)
+        .build()];
+
+        let start_token_id = 151657u32;
+        let end_token_id = 151658u32;
+
+        // Override with "gemma4" should use Gemma4 format even for Qwen3 model
+        let grammar = ToolCallGrammar::for_model_type(
+            tools.clone(),
+            start_token_id,
+            end_token_id,
+            Some("gemma4"),
+            &crate::utils::config::ModelType::Qwen3,
+        );
+        assert!(matches!(grammar.format, ToolFormat::Gemma4));
+
+        // Override with "qwen_coder" should use XML format even for Gemma4 model
+        let grammar = ToolCallGrammar::for_model_type(
+            tools.clone(),
+            start_token_id,
+            end_token_id,
+            Some("qwen_coder"),
+            &crate::utils::config::ModelType::Gemma4,
+        );
+        assert!(matches!(grammar.format, ToolFormat::QwenCoder));
+
+        // No override - Qwen3 should use XML
+        let grammar = ToolCallGrammar::for_model_type(
+            tools.clone(),
+            start_token_id,
+            end_token_id,
+            None,
+            &crate::utils::config::ModelType::Qwen3,
+        );
+        assert!(matches!(grammar.format, ToolFormat::QwenCoder));
+
+        // No override - Gemma4 should use Gemma4
+        let grammar = ToolCallGrammar::for_model_type(
+            tools,
+            start_token_id,
+            end_token_id,
+            None,
+            &crate::utils::config::ModelType::Gemma4,
+        );
+        assert!(matches!(grammar.format, ToolFormat::Gemma4));
+    }
+    */
+}
diff --git a/src/utils/mod.rs b/src/utils/mod.rs
index fd30bb1c..1c44c309 100644
--- a/src/utils/mod.rs
+++ b/src/utils/mod.rs
@@ -9,12 +9,12 @@ pub mod gptq;
 #[cfg(all(feature = "cuda", feature = "graph"))]
 pub mod graph;
 pub mod guidance;
+pub mod guidance_grammar;
 pub mod heartbeat;
 pub mod image;
 pub mod kvcache_allocator;
 pub mod logits_processor;
 pub mod progress;
-pub mod reasoning;
 pub mod special_tokens;
 use crate::core::GenerationOutput;
 use crate::models::gemma3::config::Gemma3Config;
@@ -30,10 +30,6 @@ use crate::utils::gguf_helper::{load_gguf_info_from_files, GGUFInfo};
 use candle_core::utils::{cuda_is_available, metal_is_available};
 use candle_core::{DType, Device, Result};
 use config::{Config, EngineConfig, EosTokenId, GenerationConfig, TokenizerConfig};
-pub use reasoning::{
-    build_reasoning_grammar, thinking_grammar_with_reasoning_block, ReasoningEffort,
-    ThinkingGrammarBuilder,
-};
 use std::collections::HashMap;
 use std::path::{Path, PathBuf};
 use tokenizers::Tokenizer;
@@ -1330,6 +1326,7 @@ pub fn init_config_tokenizer(
                 config.architectures = cfg.architectures.clone();
                 config.is_multi_model = Some(true);
                 merge_multimodal_top_level_config(&mut config, &raw_config_json)?;
+
                 config.extra_config_json =
                     Some(String::from_utf8(raw_config).map_err(candle_core::Error::wrap)?);
                 // Remap rope_theta in rope_scaling to config file
@@ -1454,6 +1451,7 @@ pub fn init_config_tokenizer(
                         chat_template: None,
                         bos_token: None,
                         eos_token: None,
+                        pad_token: None,
                     }
                 }
             }
@@ -1465,6 +1463,31 @@ pub fn init_config_tokenizer(
         let _ = tokenizer.with_truncation(None);
         let _ = tokenizer.with_padding(None);
 
+        // For multimodal models, resolve the tokenizer's eos_token string to a token id
+        // and merge it into config.eos_token_id, so that the final EOS set covers the
+        // ids declared by both the tokenizer config and the model config.
+        if config.is_multi_model == Some(true) {
+            // `token_to_id` performs a direct lookup; `get_vocab(true)` would first
+            // materialise the whole vocabulary (including added tokens) as a map
+            // just to read a single entry.
+            let tokenizer_eos_id: Option<u32> = config_tokenizer
+                .eos_token
+                .as_ref()
+                .and_then(|eos_str| tokenizer.token_to_id(eos_str));
+
+            // At most one id can come from the single eos_token string, so it is
+            // always wrapped as `EosTokenId::Single`.
+            if let Some(eos_id) = tokenizer_eos_id {
+                let tokenizer_eos = EosTokenId::Single(eos_id);
+
+                // Keep ids already present in the model config and add the new one.
+                config.eos_token_id = Some(match config.eos_token_id.take() {
+                    Some(existing_eos) => existing_eos.merge_dedup(tokenizer_eos),
+                    None => tokenizer_eos,
+                });
+            }
+        }
+
         let generation_config_path = model_pathes.get_generation_config_filename();
         let generation_cfg = if generation_config_path.display().to_string() != ""
             && Path::new(&generation_config_path).exists()
@@ -1528,6 +1551,7 @@ pub fn init_config_tokenizer(
             bos,
             eos,
             unk: _,
+            pad_token,
             context_length,
             chat_template,
         } = load_gguf_info_from_files(&model_pathes.get_weight_filenames()).map_err(|e| {
@@ -1600,6 +1624,7 @@ pub fn init_config_tokenizer(
             chat_template,
             bos_token: bos,
             eos_token: eos,
+            pad_token,
         };
         let archs = config.architectures.as_ref().unwrap();
 
diff --git a/src/utils/reasoning.rs b/src/utils/reasoning.rs
deleted file mode 100644
index cb705290..00000000
--- a/src/utils/reasoning.rs
+++ /dev/null
@@ -1,309 +0,0 @@
-// src/utils/reasoning.rs
-//! Reasoning grammar builders and utilities
-//!
-//! This module provides utilities for building reasoning block grammars
-//! used in structured output generation.
-
-use crate::utils::special_tokens::SpecialTokens;
-use llguidance::api::TopLevelGrammar;
-
-/// Reasoning effort level for grammar generation
-/// Optimized for specific reasoning strategies based on current research (2024-2025)
-/// Note: For Python builds, this enum is passed via serde serialization, not pyo3
-#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, PartialEq, Eq)]
-#[serde(rename_all = "lowercase")]
-pub enum ReasoningEffort {
-    /// No structured reasoning - direct output only
-    None,
-
-    /// Constrained single-paragraph reasoning (~150 chars max)
-    /// Implements "Fast Thinking" with tight length constraints
-    /// Reduces hallucination risk by limiting generation space
-    Low,
-
-    /// Standard multi-step Chain-of-Thought (CoT)
-    /// Implements Wei et al. (2022) baseline with sentence-based termination
-    /// Balances reasoning depth and efficiency
-    Medium,
-
-    /// Adversarial analysis with self-correction phases
-    /// Implements Cheng & Su (2025) adversarial critique pattern
-    /// Forces explicit error checking before final output
-    High,
-
-    /// Best-of-breed Chain-of-Verification (CoVe) + Self-Critique
-    /// Combines Madaan et al. (2024) CoVe with adversarial self-correction
-    /// Maximum accuracy for complex/fact-sensitive tasks
-    ChainOfThought,
-
-    /// Custom user-provided grammar template
-    /// For non-Python builds, allows users to submit their own reasoning patterns
-    #[cfg(all(not(feature = "python"), not(feature = "pyo3")))]
-    Custom(String),
-}
-
-impl ReasoningEffort {
-    pub fn from_str(s: String) -> Self {
-        match s.to_lowercase().as_str() {
-            "none" => Self::None,
-            "low" => Self::Low,
-            "normal" | "medium" => Self::Medium, // Backward compatibility
-            "high" => Self::High,
-            "chain_of_thought" | "cot" | "cove" => Self::ChainOfThought,
-            #[cfg(all(not(feature = "python"), not(feature = "pyo3")))]
-            s if s.starts_with("custom:") => Self::Custom(s[7..].to_string()),
-            #[cfg(feature = "python")]
-            _ => Self::None,
-            #[cfg(all(not(feature = "python"), not(feature = "pyo3")))]
-            _ => Self::None,
-        }
-    }
-
-    /// Generate the appropriate grammar template for this reasoning level
-    pub fn generate_grammar(&self, start_id: u32, end_id: u32) -> String {
-        match self {
-            Self::None => {
-                // No reasoning block - direct output only
-                // Minimal latency, no structured thinking
-                format!(
-                    r#"start: reasoning_block text
-text: /[\x09\x0A\x0D\x20-\x7E]*?/
-reasoning_block: <[{}]> "\n" text "\n" <[{}]>
-"#,
-                    start_id, end_id
-                )
-            }
-            Self::Low => {
-                // Fast Thinking: Single paragraph constraint (max ~150 chars)
-                // Limits generation space to reduce hallucination risk
-                // Uses non-greedy matching to prevent runaway generation
-                format!(
-                    r#"start: reasoning_block
-reasoning_block: <[{start_id}]> "\n" thinkgram "\n" <[{end_id}]> "\n"
-thinkgram: /[\x09\x0A\x0D\x20-\x7E]+?{{1,300}}/
-"#
-                )
-            }
-            Self::Medium => {
-                // Standard CoT: Multi-step reasoning with natural sentence termination
-                // Implements Wei et al. (2022) baseline pattern
-                // Allows multiple steps but enforces sentence boundaries
-                format!(
-                    r#"start: reasoning_block
-reasoning_block: <[{start_id}]> "\n" thinkgram "\n" <[{end_id}]> "\n"
-thinkgram: /[\x09\x0A\x0D\x20-\x7E]+?{{1,1200}}/
-"#
-                )
-            }
-            Self::High => {
-                // Adversarial Analysis: Explicit self-correction phases
-                // Implements Cheng & Su (2025) adversarial critique pattern
-                // Forces model to challenge its own reasoning before finalizing
-                format!(
-                    r#"start: reasoning_block* analysis_block*
-reasoning_block: <[{start_id}]> "\n" : analysis_block analysis_content critique_phase critique_content thinkgram "\n" <[{end_id}]> "\n"
-analysis_block: "<ANALYZE>" "\n" analysis_content "\n" "</ANALYZE>" "\n"
-analysis_content: /[\x09\x0A\x0D\x20-\x7E]*?{{1,2400}}/
-critique_phase: "<CRITIQUE>" "\n" critique_content "\n" "</CRITIQUE>" "\n"
-critique_content: /[\x09\x0A\x0D\x20-\x7E]*?{{1,1200}}/
-thinkgram: "<STRUCTUREDANSWER>" "\n" /[\x09\x0A\x0D\x20-\x7E]*?{{1,3600}}/ "\n" "</STRUCTUREDANSWER>" "\n"
-"#
-                )
-            }
-            Self::ChainOfThought => {
-                // Best-of-breed: CoVe + Adversarial Critique + Final Consolidation
-                // Combines Madaan et al. (2024) Chain-of-Verification with self-correction
-                // Maximum accuracy for complex/fact-sensitive tasks
-                format!(
-                    r#"start: reasoning_block+
-reasoning_block: <[{start_id}]> "\n" draft_phase verification_phase critique_phase final_phase "\n" <[{end_id}]> "\n"
-draft_phase: /(?s:[^.!?]+[.!?])+/
-verification_phase: "<VERIFY>" "\n" verification_questions "\n" verification_answers "\n" "</VERIFY>" "\n"
-verification_questions: /(?s:[^.!?]+[.!?])+/
-verification_answers: /[\x09\x0A\x0D\x20-\x7E]*?/
-critique_phase: "<CRITIQUE>" "\n" self_critique "\n" "</CRITIQUE>" "\n"
-self_critique: /[\x09\x0A\x0D\x20-\x7E]*?/
-final_phase: "<FINAL_ANSWER>" "\n" final_content "\n"
-final_content: /[\x09\x0A\x0D\x20-\x7E]*?/
-"#
-                )
-            }
-            #[cfg(all(not(feature = "python"), not(feature = "pyo3")))]
-            Self::Custom(template) => {
-                // User-provided template with token ID injection
-                // Supports $START_ID and $END_ID placeholders for dynamic token ID substitution
-                template
-                    .replace("$START_ID", &start_id.to_string())
-                    .replace("$END_ID", &end_id.to_string())
-            }
-        }
-    }
-}
-
-/// Updated grammar builder function that respects reasoning effort levels
-pub fn thinking_grammar_with_reasoning_block(
-    start_id: u32,
-    end_id: u32,
-    effort: Option<ReasoningEffort>,
-) -> String {
-    match effort {
-        Some(level) => level.generate_grammar(start_id, end_id),
-        None => {
-            // Default to Medium if not specified (balanced approach)
-            ReasoningEffort::Medium.generate_grammar(start_id, end_id)
-        }
-    }
-}
-
-/// Builder for thinking grammar with reasoning block
-/// This ensures reasoning_block has finite termination to prevent run-on generation
-/// Note: For Python builds, this struct is not exposed via pyo3 since ReasoningEffort can't be a pyclass
-pub struct ThinkingGrammarBuilder {
-    start_id: u32,
-    end_id: u32,
-    effort: Option<ReasoningEffort>,
-}
-
-impl ThinkingGrammarBuilder {
-    pub fn new(start_id: u32, end_id: u32, effort: Option<ReasoningEffort>) -> Self {
-        Self {
-            start_id,
-            end_id,
-            effort,
-        }
-    }
-
-    /// Create thinking grammar from string token IDs
-    pub fn from_string(start_id: u32, end_id: u32) -> Self {
-        Self {
-            start_id,
-            end_id,
-            effort: None,
-        }
-    }
-
-    /// Build the thinking grammar Lark string
-    pub fn build(&self) -> String {
-        thinking_grammar_with_reasoning_block(self.start_id, self.end_id, self.effort.clone())
-    }
-
-    /// Build as TopLevelGrammar
-    pub fn build_grammar(&self) -> TopLevelGrammar {
-        let lark = self.build();
-        TopLevelGrammar::from_lark(lark)
-    }
-}
-
-/// Build a reasoning-aware grammar composer
-/// Wraps a base composer with reasoning blocks when reasoning effort is enabled
-pub fn build_reasoning_grammar(
-    base_grammar: TopLevelGrammar,
-    reasoning_effort: ReasoningEffort,
-    special_tokens: &SpecialTokens,
-) -> TopLevelGrammar {
-    if reasoning_effort == ReasoningEffort::None {
-        return base_grammar;
-    }
-
-    let reasoning_start_ids = special_tokens.reasoning_start_ids();
-    let reasoning_end_ids = special_tokens.reasoning_end_ids();
-
-    if reasoning_start_ids.is_empty() || reasoning_end_ids.is_empty() {
-        crate::log_warn!(
-            "[llg] Reasoning effort {:?} set but no reasoning tokens found in special_tokens",
-            reasoning_effort
-        );
-        return base_grammar;
-    }
-
-    let start_id = reasoning_start_ids[0];
-    let end_id = reasoning_end_ids[0];
-    let reasoning_lark = thinking_grammar_with_reasoning_block(start_id, end_id, None);
-    let reasoning_gram = TopLevelGrammar::from_lark(reasoning_lark);
-
-    // Merge reasoning block with base grammar
-    // The reasoning block comes first, then the base grammar
-    crate::utils::guidance::merge_top_level_grammars(vec![reasoning_gram, base_grammar], None, None)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_reasoning_effort_from_str() {
-        assert_eq!(
-            ReasoningEffort::from_str("none".to_string()),
-            ReasoningEffort::None
-        );
-        assert_eq!(
-            ReasoningEffort::from_str("low".to_string()),
-            ReasoningEffort::Low
-        );
-        assert_eq!(
-            ReasoningEffort::from_str("medium".to_string()),
-            ReasoningEffort::Medium
-        );
-        assert_eq!(
-            ReasoningEffort::from_str("high".to_string()),
-            ReasoningEffort::High
-        );
-        assert_eq!(
-            ReasoningEffort::from_str("invalid".to_string()),
-            ReasoningEffort::None
-        );
-    }
-
-    #[test]
-    fn test_thinking_grammar_builder() {
-        let builder = ThinkingGrammarBuilder::new(151657, 151658, None);
-        let lark = builder.build();
-        assert!(
-            lark.contains("reasoning_block"),
-            "Should contain reasoning_block"
-        );
-        assert!(lark.contains("<[151657]"), "Should contain start token ID");
-        assert!(lark.contains("<[151658]"), "Should contain end token ID");
-    }
-
-    #[test]
-    fn test_thinking_grammar_builder_from_string() {
-        let builder = ThinkingGrammarBuilder::from_string(151657, 151658);
-        let lark = builder.build();
-        assert!(lark.contains("<[151657]>"), "Should contain start token ID");
-        assert!(lark.contains("<[151658]>"), "Should contain end token ID");
-    }
-
-    #[test]
-    fn test_thinking_grammar_builder_build_grammar() {
-        let builder = ThinkingGrammarBuilder::new(151657, 151658, None);
-        let grammar = builder.build_grammar();
-        assert!(grammar.grammars.len() > 0, "Should have grammars");
-    }
-
-    #[test]
-    #[cfg(not(feature = "python"))]
-    fn test_reasoning_effort_custom_from_str() {
-        let template = "custom:\nstart: reasoning_block\nreasoning_block: <[$START_ID]> thinkgram <[$END_ID]>\nthinkgram: /(?s:[^.!?]+[.!?])+/\n";
-        let effort = ReasoningEffort::from_str(template.to_string());
-        assert!(matches!(effort, ReasoningEffort::Custom(_)));
-    }
-
-    #[test]
-    #[cfg(all(not(feature = "python"), not(feature = "pyo3")))]
-    fn test_reasoning_effort_custom_generate_grammar() {
-        let template = "custom:\nstart: reasoning_block\nreasoning_block: <$START_ID> thinkgram <$END_ID>\nthinkgram: /(?s:[^.!?]+[.!?])+/\n";
-        let effort = ReasoningEffort::Custom(template.to_string());
-        let grammar = effort.generate_grammar(151660, 151661);
-        assert!(
-            grammar.contains("reasoning_block"),
-            "Should contain reasoning_block"
-        );
-        assert!(grammar.contains("<151660>"), "Should contain start_id");
-        assert!(grammar.contains("<151661>"), "Should contain end_id");
-        assert!(
-            grammar.contains("custom:"),
-            "Should contain custom template"
-        );
-    }
-}
diff --git a/src/utils/special_tokens.rs b/src/utils/special_tokens.rs
index a33232ba..19ccdf8e 100644
--- a/src/utils/special_tokens.rs
+++ b/src/utils/special_tokens.rs
@@ -1,49 +1,143 @@
 // src/utils/special_tokens.rs
+use std::collections::HashMap;
 use tokenizers::Tokenizer;
 
-const REASONING_START_TOKENS: &[&str] = &[
-    "<thinking>",
-    "<reasoning>",
-    "<internal>",
-    "<reflection>",
-    "<think>",
-    "<|think|>",
-    "[THINK]",
-    "<thought>",
-    "<|channel>",
+/// Marker strings whose added-token IDs `SpecialTokens::new` collects as BOS token IDs.
+const BOS_TOKEN_STRINGS: &[&str] = &[
+    "<s>", "<|im_start|>",
+    "<start_of_turn>", "<|beginning_of_sentence|>",
+    "<bos>",
+    // Llama4 encapsulates the role between header IDs:
+    // <|start_header_id|>user<|end_header_id|>\n\n
+    "<|start_header_id|>", "<|end_header_id|>",
+    "<|turn>"
+];
+
+const REASONING_TOKEN_PAIRS: &[(&str, &str)] = &[
+    ("<thinking>", "</thinking>"),
+    ("[THINK]", "[/THINK]"),
+    ("<|thinking|>", "<|/thinking|>"),
+    ("<reasoning>", "</reasoning>"),
+    ("<internal>", "</internal>"),
+    ("<reflection>", "</reflection>"),
+    ("<|think|>", "<|/think|>"),
+    ("<thought>", "</thought>"),
+    ("<|channel>", "<channel|>"),
+    // Leave the most common selection for last - fallthrough
+    ("<think>", "</think>"),
 ];
 
-const REASONING_END_TOKENS: &[&str] = &[
-    "</thinking>",
-    "</internal>",
-    "</think>",
-    "<|/think|>",
-    "[/THINK]",
-    "</thought>",
-    "<channel|>",
+const TOOL_CALL_TOKEN_PAIRS: &[(&str, &str)] = &[
+    ("<tool_call>", "</tool_call>"),
+    ("<start_function_call>", "<end_function_call>"),
+    ("<|tool_call>", "<tool_call|>"),
+    ("[TOOL_CALLS]", "]"),
+    ("<minimax:tool_call>", "</minimax:tool_call>")
 ];
 
 #[derive(Debug, Clone, Default)]
 pub struct SpecialTokens {
     reasoning_start_ids: Vec<u32>,
     reasoning_end_ids: Vec<u32>,
+    tool_call_start_ids: Vec<u32>,
+    tool_call_end_ids: Vec<u32>,
+    bos_token_ids: Vec<u32>
 }
 
 impl SpecialTokens {
     pub fn new(tokenizer: &Tokenizer) -> Self {
-        let mut reasoning_start_ids =
-            collect_candidate_token_ids(tokenizer, REASONING_START_TOKENS);
-        let mut reasoning_end_ids = collect_candidate_token_ids(tokenizer, REASONING_END_TOKENS);
+        // Scan the added tokens once and record the ID of every known marker string.
+        // Start and end markers of the reasoning and tool-call pairs share the two maps
+        // below, keyed by the marker text; BOS markers go straight into their own list.
+        let mut bos_token_ids = Vec::new();
+        let mut added_start_map: HashMap<String, Vec<u32>> = HashMap::new();
+        let mut added_end_map: HashMap<String, Vec<u32>> = HashMap::new();
+
+        for (id, token) in tokenizer.get_added_tokens_decoder().iter() {
+            let content = token.content.as_str();
+
+            // BOS / turn-opening markers.
+            if BOS_TOKEN_STRINGS.contains(&content) {
+                bos_token_ids.push(*id);
+            }
+
+            // Pair markers: reasoning pairs first, then tool-call pairs.
+            for &(start, end) in REASONING_TOKEN_PAIRS.iter().chain(TOOL_CALL_TOKEN_PAIRS) {
+                if content == start {
+                    added_start_map.entry(start.to_string()).or_default().push(*id);
+                }
+                if content == end {
+                    added_end_map.entry(end.to_string()).or_default().push(*id);
+                }
+            }
+        }
+
+        // BOS IDs were pushed in the iteration order of the added-tokens decoder, which is a
+        // hash map in the tokenizers crate and therefore unordered. Sort them so the list is
+        // deterministic and normalised like the four pair lists sorted further down.
+        sort_and_dedup(&mut bos_token_ids);
+
+        // Fallback for a pair that is not fully present among the added tokens: resolve both
+        // strings against the full vocabulary and accept the pair only if both are found.
+        // `token_to_id` answers each lookup directly instead of materialising the whole
+        // vocabulary with `get_vocab(true)` for every pair.
+        let lookup = |start: &str, end: &str| -> Option<(u32, u32)> {
+            tokenizer.token_to_id(start).zip(tokenizer.token_to_id(end))
+        };
+
+        // Reasoning markers. `process_pair` prefers IDs found among the added tokens and
+        // otherwise uses `lookup`; a pair is recorded only when both halves resolve, so the
+        // start and end lists never receive an unmatched marker.
+        let mut reasoning_start_ids = Vec::new();
+        let mut reasoning_end_ids = Vec::new();
+        for &(start, end) in REASONING_TOKEN_PAIRS.iter() {
+            process_pair(
+                start,
+                end,
+                &added_start_map,
+                &added_end_map,
+                &mut reasoning_start_ids,
+                &mut reasoning_end_ids,
+                lookup,
+                "reasoning",
+            );
+        }
+
+        // Tool-call markers go through the same resolution, collected into their own
+        // start/end lists.
+        let mut tool_call_start_ids = Vec::new();
+        let mut tool_call_end_ids = Vec::new();
+        for &(start, end) in TOOL_CALL_TOKEN_PAIRS.iter() {
+            process_pair(
+                start,
+                end,
+                &added_start_map,
+                &added_end_map,
+                &mut tool_call_start_ids,
+                &mut tool_call_end_ids,
+                lookup,
+                "tool_call",
+            );
+        }
 
         sort_and_dedup(&mut reasoning_start_ids);
         sort_and_dedup(&mut reasoning_end_ids);
+        sort_and_dedup(&mut tool_call_start_ids);
+        sort_and_dedup(&mut tool_call_end_ids);
 
         Self {
             reasoning_start_ids,
             reasoning_end_ids,
+            tool_call_start_ids,
+            tool_call_end_ids,
+            bos_token_ids
         }
     }
 
+    pub fn bos_token_ids(&self) -> Vec<u32> {
+        self.bos_token_ids.clone()
+    }
+
     pub fn reasoning_start_ids(&self) -> Vec<u32> {
         self.reasoning_start_ids.clone()
     }
@@ -51,32 +145,66 @@ impl SpecialTokens {
     pub fn reasoning_end_ids(&self) -> Vec<u32> {
         self.reasoning_end_ids.clone()
     }
-}
 
-fn collect_candidate_token_ids(tokenizer: &Tokenizer, candidates: &[&str]) -> Vec<u32> {
-    candidates
-        .iter()
-        .filter_map(|candidate| candidate_token_id(tokenizer, candidate))
-        .collect()
-}
-
-fn candidate_token_id(tokenizer: &Tokenizer, candidate: &str) -> Option<u32> {
-    let encoding = tokenizer.encode(candidate, false).ok()?;
-    let ids = encoding.get_ids();
-    let tokens = encoding.get_tokens();
+    pub fn tool_call_start_ids(&self) -> Vec<u32> {
+        self.tool_call_start_ids.clone()
+    }
 
-    if ids.len() == 1 && tokens.len() == 1 && tokens[0] == candidate {
-        Some(ids[0])
-    } else {
-        None
+    pub fn tool_call_end_ids(&self) -> Vec<u32> {
+        self.tool_call_end_ids.clone()
     }
 }
 
+/// Helper function to sort and deduplicate token IDs
 fn sort_and_dedup(ids: &mut Vec<u32>) {
     ids.sort_unstable();
     ids.dedup();
 }
 
+/// Resolve one start/end marker pair to token IDs, keeping the two lists in lockstep.
+///
+/// The added-token maps are consulted first. When either half is missing there, the
+/// pair is resolved through `fallback_fn` instead and a warning is logged. A pair is
+/// recorded only when both halves resolve, so neither list gains an unmatched marker.
+///
+/// `start_ids` / `end_ids` are appended to, never cleared; `pair_type` is only used as
+/// the bracketed prefix of the fallback warning.
+fn process_pair<F>(
+    start_str: &str,
+    end_str: &str,
+    added_start_map: &HashMap<String, Vec<u32>>,
+    added_end_map: &HashMap<String, Vec<u32>>,
+    start_ids: &mut Vec<u32>,
+    end_ids: &mut Vec<u32>,
+    fallback_fn: F,
+    pair_type: &str,
+) where
+    F: FnOnce(&str, &str) -> Option<(u32, u32)>,
+{
+    // Preferred source: both halves are registered as added tokens.
+    if let (Some(starts), Some(ends)) =
+        (added_start_map.get(start_str), added_end_map.get(end_str))
+    {
+        start_ids.extend(starts);
+        end_ids.extend(ends);
+        return;
+    }
+
+    // Otherwise accept the pair only if the fallback resolves both halves.
+    if let Some((start_id, end_id)) = fallback_fn(start_str, end_str) {
+        crate::log_warn!(
+            "[{}] Pair '{}' + '{}' not found in added vocabulary, falling back to common vocab with IDs: {} + {}",
+            pair_type,
+            start_str,
+            end_str,
+            start_id,
+            end_id
+        );
+        start_ids.push(start_id);
+        end_ids.push(end_id);
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::SpecialTokens;
@@ -96,4 +224,60 @@ mod tests {
         assert_eq!(special_tokens.reasoning_start_ids().len(), 1);
         assert_eq!(special_tokens.reasoning_end_ids().len(), 1);
     }
+
+    #[test]
+    fn special_tokens_pair_not_found_in_added_fallback_to_common_vocab() {
+        let mut tokenizer = Tokenizer::new(BPE::default());
+        // Register only the start marker; `</thinking>` is never added to this tokenizer
+        tokenizer.add_special_tokens(&[AddedToken::from("<thinking>", true)]);
+
+        let special_tokens = SpecialTokens::new(&tokenizer);
+
+        // The pair is incomplete and the fallback is expected to miss the end marker too
+        assert_eq!(special_tokens.reasoning_start_ids().len(), 0);
+        assert_eq!(special_tokens.reasoning_end_ids().len(), 0);
+    }
+
+    #[test]
+    fn special_tokens_pair_fallback_with_both_tokens_in_common_vocab() {
+        let mut tokenizer = Tokenizer::new(BPE::default());
+        tokenizer.add_special_tokens(&[
+            AddedToken::from("<thinking>", true),
+            AddedToken::from("</thinking>", true),
+        ]);
+
+        let special_tokens = SpecialTokens::new(&tokenizer);
+
+        // Despite the test name, both markers are added tokens here: no fallback is needed
+        assert_eq!(special_tokens.reasoning_start_ids().len(), 1);
+        assert_eq!(special_tokens.reasoning_end_ids().len(), 1);
+    }
+
+    #[test]
+    fn special_tokens_skips_partial_pair() {
+        let mut tokenizer = Tokenizer::new(BPE::default());
+        // Register the start tag only; its end tag is deliberately left out
+        tokenizer.add_special_tokens(&[AddedToken::from("<thinking>", true)]);
+
+        let special_tokens = SpecialTokens::new(&tokenizer);
+
+        // An incomplete pair must contribute to neither list - no stragglers
+        assert_eq!(special_tokens.reasoning_start_ids().len(), 0);
+        assert_eq!(special_tokens.reasoning_end_ids().len(), 0);
+    }
+
+    #[test]
+    fn special_tokens_tool_call_pair_handling() {
+        let mut tokenizer = Tokenizer::new(BPE::default());
+        tokenizer.add_special_tokens(&[
+            AddedToken::from("<|tool_call>", true),
+            AddedToken::from("<tool_call|>", true),
+        ]);
+
+        let special_tokens = SpecialTokens::new(&tokenizer);
+
+        // Both tool-call markers are added tokens, so each list must hold exactly one ID
+        assert_eq!(special_tokens.tool_call_start_ids().len(), 1);
+        assert_eq!(special_tokens.tool_call_end_ids().len(), 1);
+    }
 }