guoqingbao · guoqingbao · May 27, 2026 · May 27, 2026 · May 28, 2026 · May 28, 2026
diff --git a/Cargo.toml b/Cargo.toml
@@ -46,7 +46,7 @@ ahash = "0.8.11"
 reedline = "0.40.0"
 pyo3 = { version = "0.25.1", features = ["extension-module", "abi3-py38"], optional = true }
 parking_lot = "0.12.4"
-attention-rs = { git = "https://github.com/guoqingbao/attention.rs.git", version="0.6.0", rev = "ab3fd74" }
+attention-rs = { git = "https://github.com/guoqingbao/attention.rs.git", version="0.6.0", rev = "0ed5cd5" }
 once_cell = "1.21.3"
 tqdm = "0.8.0"
 futures = "0.3.31"

diff --git a/ReadMe-CN.md b/ReadMe-CN.md
@@ -14,7 +14,7 @@
 | **⚡** | 极致性能 | 原生 Flash Attention、FlashInfer、CUDA Graphs、持续批处理、前缀缓存、PD 分离。消费级 GPU 上 `30B+` 模型解码速度高达 **197 tok/s** |
 | **🪶** | 极简内核 | 核心调度 + 注意力逻辑仅 **< 5000 行** Rust 代码 |
 | **🌍** | 跨平台 | CUDA（Linux/Windows）、Metal（macOS），统一二进制，统一 API |
-| **🏭** | 生产就绪 | OpenAI/Anthropic 兼容 API、内置 ChatGPT 风格 Web UI、MCP 工具调用、结构化输出、Embedding + 分词器端点 |
+| **🏭** | 生产就绪 | OpenAI/Anthropic 兼容 API、内置 ChatGPT 风格 Web UI、MCP 工具调用、结构化输出、Embedding + 分词器端点、MTP |
 | **🗜️** | 极致 KV 压缩 | TurboQuant（`2–4 位` KV 缓存）以极小的质量损失将上下文扩展至 **4.3 倍**。单卡 24/32 GB GPU 即可运行 `30B+` MoE 模型并支持**百万级上下文** |
 | **🔥** | V100 + NVFP4 | 业界首创：V100 上运行 NVFP4 + 低位 KV 缓存推理 — 无需硬件 FP4，旧 GPU 重获新生 |
 | **🐍** | 轻量 Python 绑定 | 需要 Python 入口时可选 PyO3 wheel 包 |
@@ -55,6 +55,11 @@ xinfer --w /home/Qwen3.6-35B-A3B --d 0,1 --ui-server
 python3 -m xinfer.server --m Qwen/Qwen3.6-27B-FP8 --kvcache-dtype turbo4 --ui-server
 ```
 
+**MTP**
+```bash
+xinfer --w /home/Qwen3.6-35B-A3B --d 0,1 --ui-server --mtp 2
+```
+
 > **提示：** 浏览器打开 `http://IP:8001` 即可使用内置对话界面，或使用 `http://IP:8000/v1/` 作为 API 服务 `Base URL`。
 
 ---
@@ -77,7 +82,7 @@ python3 -m xinfer.server --m Qwen/Qwen3.6-27B-FP8 --kvcache-dtype turbo4 --ui-se
 
 > 测试平台：**V100-32G**、**A100-40G**、**Hopper-80G** 及 **RTX 5090**
 
-| 模型 | 格式 | 大小 | 输出速度 |
+| 模型 | 格式 | 大小 | 输出速度 (非MTP) |
 |---|---|---|---|
 | Ministral-3-3B (**多模态**) | ISQ (BF16→Q4K) | 3B | **193.67** tokens/s |
 | Qwen3-VL-8B-Instruct (**多模态**) | Q8_0 | 8B | **112.51** tokens/s |
@@ -164,6 +169,9 @@ xinfer --m Qwen/Qwen3.6-35B-A3B-FP8 --kvcache-dtype fp8
 # 27B Dense + turbo4
 xinfer --m Qwen/Qwen3.6-27B-FP8 --kvcache-dtype turbo4
 
+# 26B Gemma4 (本地模型, 使用`--kv-fraction`选项增加kvcache占用)
+xinfer --w /data/gemma-4-26B-A4B-it --ui-server --port 9000 --kv-fraction 0.8
+
 # 30B MoE GGUF + turbo4
 xinfer --m unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF \
   --f Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf --kvcache-dtype turbo4
@@ -454,6 +462,7 @@ xinfer --m unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF \
 | `--frequency-penalty` | 高频惩罚（−2 到 2） |
 | `--mcp-config` | MCP 服务器 JSON 配置 |
 | `--mcp-command` / `--mcp-args` | 单个 MCP 服务器命令及参数 |
+| `--mtp` | 启用 MTP（仅适用于包含 MTP 层的模型），例如 `--mtp 2` 表示单次推理预测 2 个 token |
 
 ---
 
@@ -499,7 +508,7 @@ xinfer --m unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF \
 * [x] **MXFP4/NVFP4 模型支持**
 * [x] **支持 Turboquant（4 位、3 位）KvCache**
 * [ ] TentorRT-LLM
-
+* [x] Multi-token Prediction (MTP)
 ---
 
 ## 📚 参考项目

diff --git a/ReadMe.md b/ReadMe.md
@@ -14,7 +14,7 @@
 | **⚡** | Fast | Native Flash Attention, FlashInfer, CUDA Graphs, continuous batching, prefix caching, PD disaggregation. Up to **197 tok/s** decode for `30B+` models on consumer GPUs |
 | **🪶** | Tiny footprint | Core scheduling + attention logic in **< 5 000 lines** of Rust |
 | **🌍** | Cross-platform | CUDA (Linux/Windows), Metal (macOS). Same binary, same API |
-| **🏭** | Production-ready | OpenAI/Anthropic-compatible APIs, built-in ChatGPT-style Web UI, MCP tool calling, structured outputs, embedding + tokenizer endpoints |
+| **🏭** | Production-ready | OpenAI/Anthropic-compatible APIs, built-in ChatGPT-style Web UI, MCP tool calling, structured outputs, embedding + tokenizer endpoints, multi-token prediction (MTP) |
 | **🗜️** | Aggressive KV compression | TurboQuant (`2–4 bit` KV cache) extends context up to **4.3×** with minimal quality loss. Run `30B+` MoE models with **millions of context** on single 24/32 GB GPUs |
 | **🔥** | V100 + NVFP4 | First-ever NVFP4 + low-bit KV cache on V100 — no hardware FP4 needed, coherent output on legacy GPUs |
 | **🐍** | Lightweight Python bindings | Optional PyO3 wheel when you need a Python entry point |
@@ -55,6 +55,11 @@ xinfer --w /home/Qwen3.6-35B-A3B --d 0,1 --ui-server
 python3 -m xinfer.server --m Qwen/Qwen3.6-27B-FP8 --kvcache-dtype turbo4 --ui-server
 ```
 
+**MTP**
+```bash
+xinfer --w /home/Qwen3.6-35B-A3B --d 0,1 --ui-server --mtp 2
+```
+
 > **Tip:** Open `http://IP:8001` for the built-in chat UI, or use `http://IP:8000/v1/` as your API `Base URL`.
 
 ---
@@ -77,7 +82,7 @@ Add `--kvcache-dtype` to compress KV cache and extend context length:
 
 > Tested on **V100-32G**, **A100-40G**, **Hopper-80G** and **RTX 5090**
 
-| Model | Format | Size | Decoding Speed |
+| Model | Format | Size | Decoding Speed (without MTP) |
 |---|---|---|---|
 | Ministral-3-3B (**Multimodal**) | ISQ (BF16→Q4K) | 3B | **193.67** tokens/s |
 | Qwen3-VL-8B-Instruct (**Multimodal**) | Q8_0 | 8B | **112.51** tokens/s |
@@ -164,6 +169,9 @@ xinfer --m Qwen/Qwen3.6-35B-A3B-FP8 --kvcache-dtype fp8
 # 27B Dense + turbo4
 xinfer --m Qwen/Qwen3.6-27B-FP8 --kvcache-dtype turbo4
 
+# 26B Gemma4 (local model; use --kv-fraction to give the KV cache a larger share of memory)
+xinfer --w /data/gemma-4-26B-A4B-it --ui-server --port 9000 --kv-fraction 0.8
+
 # 30B MoE GGUF + turbo4
 xinfer --m unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF \
   --f Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf --kvcache-dtype turbo4
@@ -459,6 +467,7 @@ Constraint-based generation via llguidance — Lark grammars, regex, JSON Schema
 | `--frequency-penalty` | Penalize frequent tokens (−2 to 2) |
 | `--mcp-config` | MCP servers JSON config |
 | `--mcp-command` / `--mcp-args` | Single MCP server command + args |
+| `--mtp` | Enable multi-token prediction (only for models that ship MTP layers), e.g. `--mtp 2` predicts two tokens per forward pass |
 
 ---
 
@@ -504,6 +513,7 @@ Constraint-based generation via llguidance — Lark grammars, regex, JSON Schema
 * [x] **MXFP4/NVFP4 Model Support**
 * [x] **Support Turboquant (4-bit, 3-bit) KvCache**
 * [ ] TentorRT-LLM
+* [x] **Multi-token Prediction (MTP)**
 
 ---
 

diff --git a/src/core/block_manager.rs b/src/core/block_manager.rs
@@ -145,6 +145,13 @@ impl BlockManager {
         }
     }
 
+    /// Pop the front ID from `free_block_ids`, pass it to `allocate_block`, and return it; yields `None` when no free ID is left.
+    pub fn alloc_free_block(&mut self) -> Option<usize> {
+        let block_id = self.free_block_ids.pop_front()?; // `?` returns `None` early when the free list is empty
+        self.allocate_block(block_id); // presumably marks the block as in use — confirm against `allocate_block`
+        Some(block_id)
+    }
+
     fn image_prefix_seed(images: &ImageData) -> u64 {
         let mut hasher = std::collections::hash_map::DefaultHasher::new();
         images.raw.hash(&mut hasher);