Merged
94 commits
ddced67
fix: Use streaming detokenizer for UTF-8-safe incremental decode
janhilgard Feb 24, 2026
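The commit above replaces naive per-chunk decoding with a streaming detokenizer. A minimal sketch of the underlying problem, using only the standard library (the function name `stream_decode` is illustrative, not the project's API): a multi-byte UTF-8 character whose bytes straddle a chunk boundary breaks `bytes.decode()` per chunk, while an incremental decoder buffers the incomplete tail until the next chunk arrives.

```python
import codecs

def stream_decode(byte_chunks):
    """Yield text incrementally without splitting multi-byte UTF-8 sequences.

    A per-chunk bytes.decode() raises (or emits U+FFFD) when a character's
    bytes are split across chunks; the incremental decoder holds the
    incomplete tail bytes and completes the character on the next chunk.
    """
    decoder = codecs.getincrementaldecoder("utf-8")()
    for chunk in byte_chunks:
        text = decoder.decode(chunk)
        if text:
            yield text
    tail = decoder.decode(b"", final=True)
    if tail:
        yield tail

# "é" is two bytes (0xC3 0xA9); split it across chunk boundaries:
pieces = list(stream_decode([b"caf", b"\xc3", b"\xa9!"]))
assert "".join(pieces) == "café!"
```

The same principle applies token-by-token: a detokenizer that holds partial byte sequences between steps never emits replacement characters mid-stream.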
85bae64
Add --served-model-name CLI parameter
otarkhan Feb 28, 2026
41b4e76
Fix prefix cache dir using served name instead of model path
otarkhan Feb 28, 2026
7ca702d
Add Qwen3.5 model support with text-only loading and fix reasoning+to…
otarkhan Feb 28, 2026
e765db8
fix: check trim method existence before calling
Mar 11, 2026
a445b23
fix(batched): add exclude_none=True to model_dump in image extraction
kol22 Mar 11, 2026
295d690
fix: filter None values from dict() fallback and api/utils.py seriali…
kol22 Mar 12, 2026
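The two commits above fix `None` values leaking into serialized API payloads. A minimal sketch of the fallback-path fix, assuming nothing about the project's internals (the helper name `drop_none` is hypothetical): it mirrors the effect of Pydantic's `model_dump(exclude_none=True)` for plain-dict fallbacks.

```python
def drop_none(obj):
    """Recursively remove None values from dicts and lists, mirroring
    Pydantic's model_dump(exclude_none=True) for plain-dict fallbacks."""
    if isinstance(obj, dict):
        return {k: drop_none(v) for k, v in obj.items() if v is not None}
    if isinstance(obj, list):
        return [drop_none(v) for v in obj if v is not None]
    return obj

msg = {
    "role": "user",
    "content": [{"type": "text", "text": "hi", "image_url": None}],
    "name": None,
}
assert drop_none(msg) == {
    "role": "user",
    "content": [{"type": "text", "text": "hi"}],
}
```

Downstream parsers that iterate message content by key (e.g. image extraction) then never trip over explicit `None` entries.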
8670c38
fix: pass size to ArraysCache in BatchMambaCache for Qwen3.5 hybrid m…
neomody77 Mar 14, 2026
31a3cc5
fix: compatibility with mlx-lm 0.31.x (prompt_checkpoints tuple)
hkstrongside Mar 20, 2026
80c6849
fix(mllm_scheduler): add adaptive periodic cache clearing (#157)
kol22 Mar 20, 2026
b353aab
fix: rename platform.py to vllm_platform.py to avoid stdlib shadowing
dan-j-cooper Mar 20, 2026
0e8ac18
fix: handle video_url content type and fix video frame token counting
patanet7 Mar 10, 2026
cf9a753
feat: native Qwen3-VL video pipeline with temporal 3D conv + M-RoPE
patanet7 Mar 10, 2026
eb56c7d
style: ruff format + lint fixes for new code
patanet7 Mar 10, 2026
92b3556
Fix video native init, import guard, empty source and has_media detec…
waybarrios Mar 12, 2026
f518c07
feat: SpecPrefill — attention-based sparse prefill for TTFT reduction…
Thump604 Mar 21, 2026
d90486e
remove streaming tool fix (covered by #148) and fix eos_token_ids in …
waybarrios Mar 21, 2026
90eac21
Add Qwen3.5 text-only loading and dynamic memory threshold (#127)
waybarrios Mar 21, 2026
7b3f875
fix: address PR #150 review — tool forwarding, kwargs safety, video_g…
patanet7 Mar 21, 2026
913bfd0
fix lint CI to use python 3.13 for black compatibility
waybarrios Mar 21, 2026
0b07872
format engine_core.py long line
waybarrios Mar 21, 2026
6e413f6
resolve merge conflicts with main
waybarrios Mar 21, 2026
c609b59
Merge pull request #125 from otarkhan/feature/served-model-name
waybarrios Mar 21, 2026
c70b80b
fix: Disable MambaCache monkey-patch for hybrid models, add MTP auto-…
janhilgard Feb 18, 2026
35c77ec
resolve merge conflicts with main
waybarrios Mar 21, 2026
ede4e30
format test_video.py
waybarrios Mar 21, 2026
2a79216
Merge pull request #150 from patanet7/feat/native-video-support
waybarrios Mar 21, 2026
74c2f02
remove dead code in _load_strict_false
waybarrios Mar 22, 2026
d235c37
Merge pull request #97 from janhilgard/fix/hybrid-model-batching-mtp-…
waybarrios Mar 22, 2026
8dd33e7
Don’t truncate base64 images before hashing.
BelieveDiffusion Mar 22, 2026
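The commit above is about cache-key correctness. A sketch of why truncating base64 payloads before hashing is dangerous (the function name `image_cache_key` is illustrative): two distinct images often share a long common prefix (same format header, same dimensions), so a prefix hash silently collides.

```python
import hashlib

def image_cache_key(b64_data: str) -> str:
    """Hash the FULL base64 payload; hashing a truncated prefix makes
    distinct images with a shared header collide in the cache."""
    return hashlib.sha256(b64_data.encode("ascii")).hexdigest()

# Two payloads sharing a long common prefix but differing at the tail:
a = "iVBORw0KGgo" * 1000 + "AAAA"
b = "iVBORw0KGgo" * 1000 + "BBBB"
assert image_cache_key(a) != image_cache_key(b)
# A truncated-prefix hash would wrongly treat them as the same image:
assert hashlib.sha256(a[:256].encode()).hexdigest() == \
       hashlib.sha256(b[:256].encode()).hexdigest()
```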
d8601d7
fix: bump mlx-lm minimum to 0.31.0 for hybrid model batching
krystophny Mar 25, 2026
5b4042d
merge: sync upstream origin/main — SpecPrefill, native video, MTP inj…
Mar 26, 2026
140958e
fix: alias validation, Hub model MTP routing, non-streaming text path…
Mar 26, 2026
1328d7f
fix: non-streaming text-only MTP deadlock and accumulation bug
Mar 26, 2026
cd08bb2
fix: forward stop sequences to text-only MTP generation path
Mar 26, 2026
4ce9f23
fix: truncate new_text on stop hit so SSE streams omit stop sequence
Mar 26, 2026
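The two commits above deal with stop-sequence handling in the streaming path. A minimal sketch of the truncation logic, under the assumption (not confirmed by the source) that the engine tracks already-streamed text so a stop sequence straddling two chunks is still caught:

```python
def truncate_at_stop(streamed: str, new_text: str, stop_sequences):
    """Return (text_to_emit, hit): the slice of new_text safe to stream,
    cut at the first stop sequence so SSE output never leaks it.
    `streamed` is the text already sent, needed when a stop sequence
    spans the boundary between the previous chunk and this one."""
    full = streamed + new_text
    cut = len(full)
    for stop in stop_sequences:
        idx = full.find(stop)
        if idx != -1:
            cut = min(cut, idx)
    # emit only what lies past the already-streamed prefix, up to the cut
    return full[len(streamed):cut], cut < len(full)

text, hit = truncate_at_stop("Hello, wor", "ld\n\nUser:", ["\n\nUser:"])
assert text == "ld" and hit
```

On `hit`, the caller stops generation and closes the stream without emitting the stop sequence itself.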
63b999a
fix: use self.max_kv_size instead of None in _make_cache call
Mar 27, 2026
38479fe
fix: report prompt_tokens correctly for LLM models in SimpleEngine
sjswerdloff Mar 30, 2026
9c92428
Merge pull request #153 from kol22/fix/batched-engine-exclude-none
waybarrios Mar 31, 2026
f6fb594
Merge pull request #152 from Jah-yee/fix/arrayscache-trim-attributeerror
waybarrios Mar 31, 2026
053f270
format scheduler.py trim checks from PR #152
waybarrios Mar 31, 2026
54b4d65
cleanup: remove redundant fallback tokenization and defensive hasattr…
waybarrios Mar 31, 2026
b64f12c
Merge pull request #236 from sjswerdloff/fix/prompt-token-counting
waybarrios Mar 31, 2026
5f4593b
bump version to 0.2.7
waybarrios Mar 31, 2026
a301766
Merge pull request #206 from BelieveDiffusion/fix/dont-truncate-base6…
waybarrios Mar 31, 2026
4d8c21b
Merge pull request #183 from hkstrongside/fix/mlx-lm-031-scheduler-co…
waybarrios Mar 31, 2026
7b0fc7f
Merge pull request #227 from computor-org/fix/bump-mlx-lm-for-hybrid-…
waybarrios Mar 31, 2026
ecfa8be
format scheduler.py _make_cache call from PR #183
waybarrios Mar 31, 2026
6b22f32
remove unused HAS_MAMBA_CACHE flag
waybarrios Mar 31, 2026
80d1cbf
Merge pull request #160 from neomody77/fix/qwen35-arrayscache-batching
waybarrios Mar 31, 2026
682ec4a
Merge pull request #185 from dan-j-cooper/fix/platform-rename
waybarrios Mar 31, 2026
a19cbac
fix: clean up detokenizer pool in abort, reset, and error recovery paths
waybarrios Mar 31, 2026
0197873
fix: skip stop tokens in mllm_scheduler detokenizer to match schedule…
waybarrios Mar 31, 2026
4ede902
Merge pull request #109 from janhilgard/fix/streaming-utf8-detokenizer
waybarrios Mar 31, 2026
951b8b7
fix: suppress tool call XML from streaming text content (#129)
sjswerdloff Mar 29, 2026
55c61b9
fix: also filter Qwen3 bracket-style tool calls from streaming
sjswerdloff Mar 29, 2026
af632ad
fix: filter all tool call format variants from streaming
sjswerdloff Mar 29, 2026
7c64416
fix: add Llama function format to streaming filter
sjswerdloff Mar 29, 2026
03b81b1
feat: route <think> blocks to Anthropic thinking content blocks
sjswerdloff Mar 29, 2026
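The commit above routes `<think>` spans into Anthropic-style thinking content blocks. A non-streaming sketch of the split (the streaming version must additionally buffer partial tags across chunk boundaries; block shapes follow the Anthropic Messages content-block convention):

```python
import re

def split_think_blocks(text: str):
    """Split model output into content blocks: <think>...</think> spans
    become 'thinking' blocks, everything else becomes 'text' blocks."""
    blocks = []
    pos = 0
    for m in re.finditer(r"<think>(.*?)</think>", text, flags=re.DOTALL):
        if m.start() > pos:
            blocks.append({"type": "text", "text": text[pos:m.start()]})
        blocks.append({"type": "thinking", "thinking": m.group(1)})
        pos = m.end()
    if pos < len(text):
        blocks.append({"type": "text", "text": text[pos:]})
    return blocks

out = split_think_blocks("<think>plan the answer</think>The answer is 4.")
assert out == [
    {"type": "thinking", "thinking": "plan the answer"},
    {"type": "text", "text": "The answer is 4."},
]
```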
030721c
chore: remove uv.lock from PR
sjswerdloff Mar 30, 2026
c516a7d
fix: track prompt_tokens in Anthropic streaming endpoint
sjswerdloff Mar 30, 2026
1cad029
address review: add ThinkRouter tests, integration tests, refactor bl…
sjswerdloff Mar 31, 2026
32dcecd
style: apply black formatting to pass CI lint
sjswerdloff Mar 31, 2026
23222f0
fix: address 3 IMPORTANT items from medical-grade review
sjswerdloff Mar 31, 2026
b4fa030
Merge pull request #232 from sjswerdloff/fix/streaming-tool-call-cont…
Thump604 Apr 1, 2026
d23e393
merge: sync upstream origin/main — streaming filters, Anthropic think…
Apr 3, 2026
2e339b5
fix: missing return in load_model_with_fallback success path
Apr 3, 2026
11b660b
bench: re-run benchmarks post-merge on 4 models
Apr 3, 2026
03b51f8
bench: comprehensive 14-model benchmark + agent integration tests
Apr 3, 2026
3fae3b3
feat: add Gemma 4 sanitize monkey-patch for mlx-vlm weight loading
Apr 4, 2026
d2f7c05
revert: remove Gemma 4 monkey-patch, use mlx-vlm 0.4.3 as-is
Apr 4, 2026
64da00d
fix: MLLM streaming crash — build_prompt not supported for VLM models
Apr 4, 2026
6b0d1c8
fix: graceful handling of MLLM stream_chat cleanup errors
Apr 4, 2026
2ecb9a0
deps: bump mlx-vlm minimum to 0.4.4 for Gemma 4 support
Apr 5, 2026
874d1b9
feat: add Gemma 4 tool call parser
Apr 5, 2026
12fc35e
fix: add global exception handler + MLLM error logging for prod resil…
Apr 5, 2026
e0c0ecf
feat: Gemma 4 text-only LLM path — prompt cache + all optimizations
Apr 5, 2026
c1a1713
bench: Gemma 4 31B — LLM path 5.2x faster than MLLM path
Apr 5, 2026
83483ab
fix: mixed quantization support for Gemma 4 E4B/E2B models
Apr 5, 2026
ac36c3f
fix: strip Gemma 4 thinking tags and turn markers from output
Apr 5, 2026
b4a0d5b
fix: mixed quant path matching + filter override keys
Apr 5, 2026
8944fe3
docs: add Gemma 4 benchmarks to README
Apr 6, 2026
176b866
feat: Gemma 4 reasoning parser — streaming thought/content separation
Apr 6, 2026
372d6f8
feat: token-level OutputRouter — config-driven output channel routing
Apr 6, 2026
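The commit above introduces a token-level OutputRouter. A minimal sketch of config-driven channel routing, with illustrative marker tokens and channel names (the real router's configuration format is not shown in this PR): marker tokens switch the active channel and are suppressed; all other tokens accumulate into the current channel.

```python
class OutputRouter:
    """Token-level channel routing sketch: marker tokens switch the active
    output channel and are never emitted; ordinary tokens accumulate into
    whichever channel is currently active."""

    def __init__(self, markers=None):
        # marker token -> channel it opens (illustrative defaults)
        self.markers = markers or {"<think>": "thinking", "</think>": "content"}
        self.channel = "content"
        self.out = {"thinking": [], "content": []}

    def feed(self, token: str):
        if token in self.markers:
            self.channel = self.markers[token]  # switch; suppress the marker
        else:
            self.out[self.channel].append(token)

router = OutputRouter()
for tok in ["<think>", "step ", "1", "</think>", "Answer: ", "4"]:
    router.feed(tok)
assert "".join(router.out["thinking"]) == "step 1"
assert "".join(router.out["content"]) == "Answer: 4"
```

Operating on tokens rather than accumulated text is what lets the router run inside the streaming pipeline without re-scanning the whole output on every step.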
c0eec6e
feat: integrate OutputRouter into streaming pipeline
Apr 6, 2026
ac02bbc
fix: 6 issues from self-review on OutputRouter integration
Apr 6, 2026
c719eb0
docs: update Gemma 4 benchmarks — full lineup with OutputRouter
Apr 6, 2026
46800dc
release: v0.4.0 — Gemma 4 day-0 support + token-level OutputRouter
Apr 6, 2026
4490b79
fix: suppress orphan tool/response tokens in OutputRouter
Apr 6, 2026
a714411
fix: recover text-format tool calls from degraded Gemma 4 output
Apr 6, 2026
54ac43e
feat: final sanitizer — last-mile catch-all against markup leakage
Apr 6, 2026
5fb31c4
fix: 3 bugs from self-review on final sanitizer
Apr 6, 2026
14e75d2
bench: final Gemma 4 verification — 0% leak, 6/6 agent tests, all models
Apr 6, 2026
6cb0f38
Merge remote-tracking branch 'raullenchai/main' into feat/upstream-sy…
Apr 6, 2026
15 changes: 12 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -299,11 +299,14 @@ All 17 parsers include automatic recovery — if a quantized model outputs broke
| **GPT-OSS 20B** | **127** tok/s · 100% tools | 79 (mlx-lm serve) | **1.6x** |
| **Qwen3.5-9B** | **108** tok/s | 46 (Ollama) | **2.3x** |
| **Kimi-Linear-48B** | **94** tok/s · 100% tools | — (only engine) | — |
| 🆕 **Gemma 4 26B-A4B** | **94** tok/s · 100% tools | — (day-0, only engine) | — |
| 🆕 **Gemma 4 E4B** | **83** tok/s · 100% tools | — (day-0, only engine) | — |
| **Qwen3.5-35B-A3B** | **83** tok/s · 100% tools | 75 (oMLX) | **1.1x** |
| **Qwen3-Coder 80B** | **74** tok/s · 100% tools | 69 (mlx-lm serve) | **1.1x** |
| **Qwen3.5-122B** | **44** tok/s · 100% tools | 43 (mlx-lm serve) | ~1.0x |
| 🆕 **Gemma 4 31B** | **31** tok/s · 100% tools | 10.9 (mlx-vlm bf16) | **2.8x** |

*Full benchmark data with all 18 models, TTFT tables, DeltaNet snapshots, and engine comparison below.*
*Full benchmark data with all models, TTFT tables, DeltaNet snapshots, and engine comparison below.*

<details>
<summary><strong>TTFT — Prompt Cache Advantage</strong></summary>
@@ -325,6 +328,9 @@ Prompt cache keeps multi-turn conversations fast. For standard transformers, KV
| Qwen3-Coder-Next 80B | **0.16s** | 0.27s | 1.7x |
| GPT-OSS 20B | **0.16s** | 0.27s | 1.7x |
| Qwen3.5-9B | **0.22s** | 0.26s | 1.2x |
| 🆕 Gemma 4 E4B | **0.25s** | — (day-0) | — |
| 🆕 Gemma 4 26B-A4B | **0.25s** | — (day-0) | — |
| 🆕 Gemma 4 31B | **0.34s** | 0.57s (mlx-vlm bf16) | **1.7x** |

**DeltaNet state snapshots (hybrid RNN + attention):**

@@ -368,7 +374,7 @@ Qwen3.5 uses Gated DeltaNet (75% RNN) + full attention (25% KV). Other engines r
| **DeltaNet state snapshots** | Deep-copy RNN state at prefix boundary, restore in ~0.1ms | Qwen3.5 (4B, 9B, 27B, 35B, 122B), Qwen3-Coder-Next |
| **Hybrid cache sync** | Keep trimmable KV + non-trimmable RNN layers in sync | Qwen3.5 (Gated DeltaNet + attention) |
| **Tool logits bias** | Jump-forward decoding — bias logits toward structured tokens | All models with `--enable-tool-logits-bias` |
| **Auto tool recovery** | Detect broken text-format tool calls, convert to structured | All 17 parser formats |
| **Auto tool recovery** | Detect broken text-format tool calls, convert to structured | All 18 parser formats (incl. Gemma 4) |
| **Speculative decoding** | Draft model generates candidates, main model verifies | Any model + `--draft-model` |
| **KV quantization** | 4/8-bit KV cache for longer contexts in less memory | All models with `--kv-bits` |
| **Prefill chunking** | Configurable step size for large-prompt throughput | All models |
@@ -379,10 +385,13 @@ Qwen3.5 uses Gated DeltaNet (75% RNN) + full attention (25% KV). Other engines r
<details>
<summary><strong>Eval benchmarks (17 models, 4 suites)</strong></summary>

17 models across tool calling (30 scenarios), coding (HumanEval+), reasoning (MATH-500), and general knowledge (MMLU-Pro). All with `enable_thinking: false` on M3 Ultra.
19 models across tool calling (30 scenarios), coding (HumanEval+), reasoning (MATH-500), and general knowledge (MMLU-Pro). All with `enable_thinking: false` on M3 Ultra. 🆕 = Gemma 4 (day-0 support).

| Model | Quant | RAM | Decode | Tools | Code | Reason | General | Avg |
|-------|-------|-----|--------|-------|------|--------|---------|-----|
| 🆕 Gemma 4 26B-A4B | 4bit | 14.4 GB | 94 t/s | **100%** | — | — | — | — |
| 🆕 Gemma 4 E4B | 4bit | 6.4 GB | 83 t/s | **100%** | — | — | — | — |
| 🆕 Gemma 4 31B | 4bit | 17.0 GB | 31 t/s | **100%** | — | — | — | — |
| Qwen3.5-122B-A10B | 8bit | 129.8 GB | 44 t/s | 87% | **90%** | **90%** | **90%** | **89%** |
| Qwen3.5-122B-A10B | mxfp4 | 65.0 GB | 57 t/s | **90%** | **90%** | 80% | **90%** | 88% |
| Qwen3.5-35B-A3B | 8bit | 36.9 GB | 83 t/s | **90%** | **90%** | 80% | 80% | 85% |
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "rapid-mlx"
version = "0.2.7"
version = "0.4.0"
description = "Rapid-MLX — AI inference for Apple Silicon. Drop-in OpenAI API, 2-4x faster than Ollama."
readme = "README.md"
license = {text = "Apache-2.0"}
@@ -31,7 +31,7 @@ dependencies = [
# Core — these are all you need for `rapid-mlx serve <text-model>`
"mlx>=0.29.0",
"mlx-lm>=0.31.0", # 0.31+ required for ArraysCache native batching (hybrid models)
"mlx-vlm>=0.1.0", # VLM support
"mlx-vlm>=0.4.4", # 0.4.4+ required for Gemma 4 support
"transformers>=5.0.0", # mlx-lm 0.30.5+ requires transformers 5.0 (rc3 bug fixed in stable)
"tokenizers>=0.19.0",
"huggingface-hub>=0.23.0",
34 changes: 34 additions & 0 deletions reports/benchmarks/gemma4-26b-a4b-4bit.json
@@ -0,0 +1,34 @@
[
{
"engine": "Rapid-MLX",
"model": "/Volumes/Extreme SSD/mlx-models/gemma-4-26b-a4b-it-4bit",
"short_decode_tps": {
"mean": 91.99284441095028,
"median": 91.9997184259893,
"min": 91.90088449538918,
"max": 92.07793031147239
},
"short_prefill_tps": {
"median": 104.6567741055699
},
"long_decode_tps": {
"mean": 90.20737344249109,
"median": 90.15377328174793,
"min": 90.10046743781253,
"max": 90.36787960791281
},
"long_prefill_tps": {
"median": 404.29261432785694
},
"ttft_cold_s": 0.6939874159870669,
"ttft_cached_s": 0.25729072900139727,
"multi_turn_ttft_cold_s": 0.3698056659777649,
"multi_turn_ttft_cached_s": 0.257844791514799,
"peak_ram_mb": 14697.5,
"tool_call_rate": 1.0,
"recovery_rate": 1.0,
"leak_rate": 0.0,
"vision": true,
"audio": false
}
]
34 changes: 34 additions & 0 deletions reports/benchmarks/gemma4-31b-4bit.json
@@ -0,0 +1,34 @@
[
{
"engine": "Rapid-MLX",
"model": "/Volumes/Extreme SSD/mlx-models/gemma-4-31b-it-4bit-local",
"short_decode_tps": {
"mean": 30.658969382957626,
"median": 30.650768843603235,
"min": 30.636982911649486,
"max": 30.68915639362016
},
"short_prefill_tps": {
"median": 77.82051641509116
},
"long_decode_tps": {
"mean": 29.81875147521854,
"median": 29.834616923274258,
"min": 29.772306347596366,
"max": 29.849331154785002
},
"long_prefill_tps": {
"median": 318.28190673479276
},
"ttft_cold_s": 9.772502000036184,
"ttft_cached_s": 0.34381089551607147,
"multi_turn_ttft_cold_s": 0.7450880000251345,
"multi_turn_ttft_cached_s": 0.34492891700938344,
"peak_ram_mb": 17363.453125,
"tool_call_rate": 1.0,
"recovery_rate": 1.0,
"leak_rate": 0.0,
"vision": true,
"audio": false
}
]
34 changes: 34 additions & 0 deletions reports/benchmarks/gemma4-31b-bf16-mllm.json
@@ -0,0 +1,34 @@
[
{
"engine": "Rapid-MLX",
"model": "/Volumes/Extreme SSD/mlx-models/gemma-4-31b-it-bf16",
"short_decode_tps": {
"mean": 7.684495219859486,
"median": 7.685015108337882,
"min": 7.683350416504045,
"max": 7.685120134736532
},
"short_prefill_tps": {
"median": 49.61073354493354
},
"long_decode_tps": {
"mean": 6.150148014216069,
"median": 6.149420465554755,
"min": 6.148029410337342,
"max": 6.152994166756111
},
"long_prefill_tps": {
"median": 130.33556741428563
},
"ttft_cold_s": 0.8671420829778071,
"ttft_cached_s": 0.503123354021227,
"multi_turn_ttft_cold_s": 0.878063625015784,
"multi_turn_ttft_cached_s": 0.8742528125003446,
"peak_ram_mb": 60796.328125,
"tool_call_rate": 1.0,
"recovery_rate": 1.0,
"leak_rate": 0.0,
"vision": false,
"audio": false
}
]
34 changes: 34 additions & 0 deletions reports/benchmarks/gemma4-31b-bf16.json
@@ -0,0 +1,34 @@
[
{
"engine": "Rapid-MLX",
"model": "/Volumes/Extreme SSD/mlx-models/gemma-4-31b-it-bf16",
"short_decode_tps": {
"mean": 10.877661903575952,
"median": 10.881409537294747,
"min": 10.8682413908779,
"max": 10.883334782555206
},
"short_prefill_tps": {
"median": 46.99511568078357
},
"long_decode_tps": {
"mean": 10.730247908489643,
"median": 10.733271737703564,
"min": 10.722421680460178,
"max": 10.735050307305189
},
"long_prefill_tps": {
"median": 186.58741680810584
},
"ttft_cold_s": 76.44581962499069,
"ttft_cached_s": 0.5739832909894176,
"multi_turn_ttft_cold_s": 1.105832208006177,
"multi_turn_ttft_cached_s": 0.5784412914945278,
"peak_ram_mb": 59444.0625,
"tool_call_rate": 1.0,
"recovery_rate": 1.0,
"leak_rate": 0.0,
"vision": true,
"audio": false
}
]
34 changes: 34 additions & 0 deletions reports/benchmarks/gemma4-e4b-4bit.json
@@ -0,0 +1,34 @@
[
{
"engine": "Rapid-MLX",
"model": "/Volumes/Extreme SSD/mlx-models/gemma-4-e4b-it-4bit-local",
"short_decode_tps": {
"mean": 82.22621304400961,
"median": 82.2157561516956,
"min": 82.17578086740563,
"max": 82.28710211292758
},
"short_prefill_tps": {
"median": 101.84400488869173
},
"long_decode_tps": {
"mean": 79.74346950172897,
"median": 80.09642988741999,
"min": 78.86214671758383,
"max": 80.27183190018309
},
"long_prefill_tps": {
"median": 349.3339133508353
},
"ttft_cold_s": 2.396504874981474,
"ttft_cached_s": 0.2615705000353046,
"multi_turn_ttft_cold_s": 0.3181427090312354,
"multi_turn_ttft_cached_s": 0.25800837500719354,
"peak_ram_mb": 6519.265625,
"tool_call_rate": 1.0,
"recovery_rate": 1.0,
"leak_rate": 0.0,
"vision": true,
"audio": false
}
]