30 changes: 30 additions & 0 deletions docs/configuration/optimization.md
@@ -270,6 +270,36 @@ Known supported models (with corresponding benchmarks):

## Input Processing

### fastokens Tokenizer Mode

By default, vLLM uses the standard Hugging Face `tokenizers` library to power
the fast tokenizer (`--tokenizer-mode hf`). For BPE tokenizers (Qwen, Llama,
DeepSeek, GPT-OSS, etc.), you can switch to the
[fastokens](https://github.com/crusoecloud/fastokens) Rust backend, a drop-in
replacement that is substantially faster for encode/decode and for streaming
detokenization:

```console
vllm serve Qwen/Qwen3-8B --tokenizer-mode fastokens
```

Equivalent in the offline API:

```python
from vllm import LLM
llm = LLM(model="Qwen/Qwen3-8B", tokenizer_mode="fastokens")
```

The `fastokens` Python package must be installed; if it isn't, vLLM raises a
clear `ImportError` when the tokenizer is loaded. `fastokens` loads a Hugging
Face fast tokenizer with its inner Rust tokenizer replaced by the fastokens
shim, so it is mutually exclusive with non-HF modes such as `mistral` or
`deepseek_v32`.

Tokenizer-bound workloads — long shared prefixes, bursty short prompts,
batch detokenization — see the largest wins. If your bottleneck is GPU
prefill/decode, the tokenizer change is unlikely to be visible end-to-end.
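If you want to check whether your workload is tokenizer-bound before switching,
a rough micro-benchmark like the sketch below can help. It is a sketch only: it
assumes the `fastokens` package is installed and that
`fastokens.patch_transformers()` patches the Hugging Face fast tokenizer in
place, as described above; the model name and prompt mix are placeholders.

```python
# Rough micro-benchmark sketch: compare raw encode throughput with the stock
# Hugging Face `tokenizers` backend vs. the fastokens-patched tokenizer.
# Assumes `pip install fastokens transformers` and access to the model repo.
import time

from transformers import AutoTokenizer

# Placeholder workload: a long shared prefix plus bursty short suffixes.
prompts = ["You are a helpful assistant. " * 200 + f"User message {i}" for i in range(500)]


def bench(tok) -> float:
    start = time.perf_counter()
    for p in prompts:
        tok.encode(p)
    return time.perf_counter() - start


baseline = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
print(f"hf tokenizers backend: {bench(baseline):.2f}s")

import fastokens  # assumption: exposes patch_transformers(), per the docs above

fastokens.patch_transformers()
patched = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
print(f"fastokens backend:     {bench(patched):.2f}s")
```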

### Parallel Processing

You can run input processing in parallel via [API server scale-out](../serving/data_parallel_deployment.md#internal-load-balancing).
2 changes: 1 addition & 1 deletion docs/design/huggingface_integration.md
@@ -21,7 +21,7 @@ Let's say we want to serve the popular Qwen model by running `vllm serve Qwen/Qw

Beyond that, there are two more things vLLM depends on Hugging Face for.

1. **Tokenizer**: vLLM uses the tokenizer from Hugging Face to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check Hugging Face's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [vllm.tokenizers.hf.get_cached_tokenizer][].
1. **Tokenizer**: vLLM uses the tokenizer from Hugging Face to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Setting `--tokenizer-mode fastokens` swaps in a drop-in Rust BPE backend for the HF fast tokenizer (see [fastokens Tokenizer Mode](../configuration/optimization.md#fastokens-tokenizer-mode)). Please check Hugging Face's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [vllm.tokenizers.hf.get_cached_tokenizer][].

2. **Model weight**: vLLM downloads the model weight from the Hugging Face model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights.
- It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that:
7 changes: 6 additions & 1 deletion vllm/config/model.py
@@ -83,7 +83,9 @@
RunnerOption = Literal["auto", RunnerType]
ConvertType = Literal["none", "embed", "classify"]
ConvertOption = Literal["auto", ConvertType]
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32", "deepseek_v4"]
TokenizerMode = Literal[
"auto", "hf", "slow", "mistral", "deepseek_v32", "deepseek_v4", "fastokens"
]
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
LogprobsMode = Literal[
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
@@ -136,6 +138,9 @@ class ModelConfig:
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.
- "deepseek_v4" will always use the tokenizer from `deepseek_v4`.
- "qwen_vl" will always use the tokenizer from `qwen_vl`.
- "fastokens" loads a Hugging Face fast tokenizer powered by the
[fastokens](https://github.com/crusoecloud/fastokens) Rust BPE backend
(requires the `fastokens` package to be installed).
- Other custom values can be supported via plugins."""
trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
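For reference, a minimal sketch of how the new literal value reaches
`ModelConfig.tokenizer_mode` through the engine-args path; the `EngineArgs`
import path is an assumption and the model name is a placeholder.

```python
# Hedged sketch: "fastokens" must be one of the TokenizerMode literals above,
# otherwise config validation rejects it. The import path is an assumption.
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(model="Qwen/Qwen3-8B", tokenizer_mode="fastokens")
# engine_args.tokenizer_mode is carried onto ModelConfig.tokenizer_mode when
# the engine config is created.
```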
47 changes: 47 additions & 0 deletions vllm/tokenizers/fastokens.py
@@ -0,0 +1,47 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""``fastokens`` tokenizer mode.

Loads a Hugging Face fast tokenizer whose internal Rust tokenizer is replaced
by the fastokens shim. fastokens also rebinds
``tokenizers.decoders.DecodeStream`` so the streaming detokenizer accepts the
shim. Both patches are installed for the lifetime of the process —
``patch_transformers()`` is idempotent.
"""

from pathlib import Path

from .hf import CachedHfTokenizer, HfTokenizer
from .protocol import TokenizerLike


def _apply_fastokens_patch() -> None:
try:
import fastokens
except ImportError as e:
raise ImportError(
"The 'fastokens' package is required for tokenizer_mode='fastokens'."
) from e
fastokens.patch_transformers()


class FastokensTokenizer(TokenizerLike):
@classmethod
def from_pretrained(
cls,
path_or_repo_id: str | Path,
*args,
trust_remote_code: bool = False,
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
) -> HfTokenizer:
_apply_fastokens_patch()
return CachedHfTokenizer.from_pretrained(
path_or_repo_id,
*args,
trust_remote_code=trust_remote_code,
revision=revision,
download_dir=download_dir,
**kwargs,
)
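A minimal usage sketch for the class above; the `encode`/`decode` method names
are assumed from the standard Hugging Face tokenizer interface that
`CachedHfTokenizer` wraps.

```python
# Minimal usage sketch (not part of the PR). from_pretrained returns a
# CachedHfTokenizer whose inner Rust tokenizer has been swapped for the
# fastokens shim; encode/decode names assume the usual HF interface.
from vllm.tokenizers.fastokens import FastokensTokenizer

tok = FastokensTokenizer.from_pretrained("Qwen/Qwen3-8B")
ids = tok.encode("hello from the fastokens backend")
print(ids, tok.decode(ids))

# Loading a second tokenizer is safe: patch_transformers() is idempotent, so
# the process-wide patch is installed only once.
tok2 = FastokensTokenizer.from_pretrained("Qwen/Qwen3-8B")
```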
1 change: 1 addition & 0 deletions vllm/tokenizers/registry.py
@@ -43,6 +43,7 @@
_VLLM_TOKENIZERS = {
"deepseek_v32": ("deepseek_v32", "DeepseekV32Tokenizer"),
"deepseek_v4": ("deepseek_v4", "DeepseekV4Tokenizer"),
"fastokens": ("fastokens", "FastokensTokenizer"),
"grok2": ("grok2", "Grok2Tokenizer"),
"hf": ("hf", "CachedHfTokenizer"),
"kimi_audio": ("kimi_audio", "KimiAudioTokenizer"),
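The registry entry maps the mode name to a `(module, class)` pair. As an
illustration of how such a table is typically resolved lazily, importing the
backend module only when that mode is requested, here is a sketch; it is not
the actual `registry.py` code.

```python
# Illustrative sketch of lazy resolution for a mode -> (module, class) table;
# the real registry code may differ.
import importlib

_VLLM_TOKENIZERS = {
    "fastokens": ("fastokens", "FastokensTokenizer"),
    "hf": ("hf", "CachedHfTokenizer"),
}


def resolve_tokenizer_class(mode: str) -> type:
    module_name, class_name = _VLLM_TOKENIZERS[mode]
    # The fastokens backend (and its ImportError check) is only touched when
    # tokenizer_mode="fastokens" is actually selected.
    module = importlib.import_module(f"vllm.tokenizers.{module_name}")
    return getattr(module, class_name)
```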
11 changes: 8 additions & 3 deletions vllm/v1/engine/detokenizer.py
@@ -3,9 +3,9 @@
from abc import ABC, abstractmethod

import tokenizers
import tokenizers.decoders
from packaging import version
from tokenizers import Tokenizer
from tokenizers.decoders import DecodeStream
from transformers import PreTrainedTokenizerFast

from vllm.logger import init_logger
@@ -177,7 +177,10 @@ def __init__(self, tokenizer: PreTrainedTokenizerFast, request: EngineCoreReques
self.tokenizer: Tokenizer = tokenizer._tokenizer

# Use native prefill to prime the decode stream with prompt tokens.
self.stream = DecodeStream(
# Look up DecodeStream on the module so backend patches (e.g. the
# fastokens shim that replaces ``tokenizers.decoders.DecodeStream``)
# are honored regardless of import order.
self.stream = tokenizers.decoders.DecodeStream(
ids=request.prompt_token_ids,
skip_special_tokens=self.skip_special_tokens,
)
@@ -237,7 +240,9 @@ def _protected_step(self, next_token_id: int) -> str | None:
" for request %s, resetting decode stream.",
self.request_id,
)
self.stream = DecodeStream(skip_special_tokens=self.skip_special_tokens)
self.stream = tokenizers.decoders.DecodeStream(
skip_special_tokens=self.skip_special_tokens
)
token = self.stream.step(self.tokenizer, next_token_id)
return token

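The reason for switching from `from tokenizers.decoders import DecodeStream` to
the module-attribute lookup is standard Python monkeypatching behavior, sketched
below with a hypothetical shim class standing in for the fastokens replacement.

```python
# Why the module-attribute lookup matters: a from-import freezes the binding,
# while tokenizers.decoders.DecodeStream is re-read on every access.
import tokenizers.decoders
from tokenizers.decoders import DecodeStream  # bound once, at import time


class _ShimStream:  # hypothetical stand-in for the fastokens replacement
    pass


tokenizers.decoders.DecodeStream = _ShimStream  # what a backend patch does

print(DecodeStream is _ShimStream)                      # False: stale binding
print(tokenizers.decoders.DecodeStream is _ShimStream)  # True: patch honored
```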