From b52d121ddad6d6621f7ea1765f31a0926999baf2 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 22 Dec 2025 11:43:29 +0100 Subject: [PATCH 1/6] WIP Signed-off-by: Patrick von Platen --- vllm/tokenizers/mistral.py | 87 ++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py index 534b0da484a5..1ff3d6d7d588 100644 --- a/vllm/tokenizers/mistral.py +++ b/vllm/tokenizers/mistral.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from functools import cached_property from pathlib import Path from typing import TYPE_CHECKING, Any, cast @@ -9,11 +10,40 @@ from .protocol import TokenizerLike + +from mistral_common.tokens.tokenizers.tekken import Tekkenizer +from mistral_common.protocol.instruct.validator import ValidationMode +from mistral_common.protocol.instruct.tool_calls import Function, Tool +from mistral_common.tokens.tokenizers.sentencepiece import ( + SentencePieceTokenizer, +) +from mistral_common.tokens.tokenizers.tekken import Tekkenizer +from mistral_common.protocol.instruct.request import ( + ChatCompletionRequest as MistralChatCompletionRequest, +) +from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy +from mistral_common.protocol.instruct.validator import ValidationMode +from mistral_common.tokens.tokenizers.sentencepiece import ( + SentencePieceTokenizer, +) +from mistral_common.tokens.tokenizers.tekken import Tekkenizer + +from mistral_common.tokens.tokenizers.base import ( + SpecialTokenPolicy, + SpecialTokens, +) +from mistral_common.tokens.tokenizers.sentencepiece import ( + SentencePieceTokenizer, +) +from mistral_common.tokens.tokenizers.tekken import Tekkenizer +from mistral_common.tokens.tokenizers.base import ( + SpecialTokenPolicy, + SpecialTokens, +) +from mistral_common.tokens.tokenizers.instruct import InstructTokenizerV13 + + if TYPE_CHECKING: - from mistral_common.protocol.instruct.request import ( - ChatCompletionRequest as MistralChatCompletionRequest, - ) - from mistral_common.tokens.tokenizers.tekken import Tekkenizer from transformers import BatchEncoding try: @@ -101,8 +131,6 @@ def _prepare_apply_chat_template_tools_and_messages( continue_final_message: bool = False, add_generation_prompt: bool = False, ) -> tuple[list["ChatCompletionMessageParam"], list[dict[str, Any]] | None]: - from mistral_common.protocol.instruct.tool_calls import Function, Tool - if add_generation_prompt and continue_final_message: raise ValueError( "Cannot set both `add_generation_prompt` and " @@ -181,8 +209,6 @@ def validate_request_params(request: "ChatCompletionRequest"): def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int: - from mistral_common.tokens.tokenizers.tekken import Tekkenizer - assert isinstance(tokenizer, Tekkenizer), type(tokenizer) t_bytes = t.encode("utf-8") if not isinstance(t, bytes) else t @@ -210,8 +236,6 @@ def from_pretrained( download_dir: str | None = None, **kwargs, ) -> "MistralTokenizer": - from mistral_common.protocol.instruct.validator import ValidationMode - try: # Transformers v5 from transformers.tokenization_mistral_common import MistralCommonBackend @@ -235,12 +259,6 @@ def from_pretrained( def __init__(self, tokenizer: "MistralCommonBackend") -> None: super().__init__() - from mistral_common.protocol.instruct.validator import ValidationMode - from mistral_common.tokens.tokenizers.sentencepiece import ( - SentencePieceTokenizer, - ) - from mistral_common.tokens.tokenizers.tekken import Tekkenizer - self.transformers_tokenizer = tokenizer self.mistral = tokenizer.tokenizer self.instruct = self.mistral.instruct_tokenizer @@ -271,7 +289,6 @@ def __init__(self, tokenizer: "MistralCommonBackend") -> None: self._vocab_dict = dict(sorted(self._vocab_dict.items(), key=lambda x: x[1])) # Cache special tokens for faster access. - self._special_token_ids = self._get_special_token_ids() self._special_token_ids_set = set(self._special_token_ids) self._special_tokens = self._get_special_tokens(self._special_token_ids) self._special_tokens_set = set(self._special_tokens) @@ -280,27 +297,11 @@ def __init__(self, tokenizer: "MistralCommonBackend") -> None: self._vocab = self.tokenizer._vocab self._max_token_id = self.vocab_size - 1 - def _get_special_token_ids(self) -> list[int]: - from mistral_common.tokens.tokenizers.sentencepiece import ( - SentencePieceTokenizer, - ) - from mistral_common.tokens.tokenizers.tekken import Tekkenizer - - if self.is_tekken: - assert isinstance(self.tokenizer, Tekkenizer), type(self.tokenizer) - special_ids = {t["rank"] for t in self.tokenizer._all_special_tokens} - elif self.is_spm: - assert isinstance(self.tokenizer, SentencePieceTokenizer), type( - self.tokenizer - ) - special_ids = self.tokenizer._control_tokens - else: - raise ValueError(f"Unknown tokenizer type: {type(self.tokenizer)}") - return sorted(special_ids) + @cached_property + def _special_token_ids(self) -> list[int]: + return sorted(self.tokenizer.is_special(i) for i in len(self.tokenizer.vocab())) def _get_special_tokens(self, all_special_ids: list[int]) -> list[str]: - from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy - return [ self.tokenizer.decode([i], special_token_policy=SpecialTokenPolicy.KEEP) for i in all_special_ids @@ -460,14 +461,6 @@ def batch_decode( ) def convert_tokens_to_string(self, tokens: list[str]) -> str: - from mistral_common.tokens.tokenizers.base import ( - SpecialTokenPolicy, - SpecialTokens, - ) - from mistral_common.tokens.tokenizers.sentencepiece import ( - SentencePieceTokenizer, - ) - from mistral_common.tokens.tokenizers.tekken import Tekkenizer to_decode_special_tokens = {SpecialTokens.tool_calls} if self.is_tekken: @@ -523,12 +516,6 @@ def convert_ids_to_tokens( ids: list[int], skip_special_tokens: bool = False, ) -> list[str]: - from mistral_common.tokens.tokenizers.base import ( - SpecialTokenPolicy, - SpecialTokens, - ) - from mistral_common.tokens.tokenizers.instruct import InstructTokenizerV13 - if not skip_special_tokens: return [self.tokenizer.id_to_piece(token_id) for token_id in ids] From 53c0455136f5de5f0065af6d22f8df7667b378fd Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 22 Dec 2025 11:47:57 +0100 Subject: [PATCH 2/6] WIP Signed-off-by: Patrick von Platen --- vllm/tokenizers/mistral.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py index 1ff3d6d7d588..9c6b40caac4e 100644 --- a/vllm/tokenizers/mistral.py +++ b/vllm/tokenizers/mistral.py @@ -289,17 +289,18 @@ def __init__(self, tokenizer: "MistralCommonBackend") -> None: self._vocab_dict = dict(sorted(self._vocab_dict.items(), key=lambda x: x[1])) # Cache special tokens for faster access. + self._vocab = self.tokenizer.vocab() + # Vocab sorted by token id. + self._max_token_id = self.vocab_size - 1 + + self._special_token_ids = self._get_special_token_ids() self._special_token_ids_set = set(self._special_token_ids) self._special_tokens = self._get_special_tokens(self._special_token_ids) self._special_tokens_set = set(self._special_tokens) - # Vocab sorted by token id. - self._vocab = self.tokenizer._vocab - self._max_token_id = self.vocab_size - 1 - @cached_property - def _special_token_ids(self) -> list[int]: - return sorted(self.tokenizer.is_special(i) for i in len(self.tokenizer.vocab())) + def _get_special_token_ids(self) -> list[int]: + return sorted(self.tokenizer.is_special(i) for i in len(self._vocab)) def _get_special_tokens(self, all_special_ids: list[int]) -> list[str]: return [ From 669d8e8004398d1b13af9663f919b1930edf311b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 22 Dec 2025 11:50:30 +0100 Subject: [PATCH 3/6] up Signed-off-by: Patrick von Platen --- requirements/common.txt | 2 +- requirements/nightly_torch_test.txt | 2 +- requirements/test.in | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements/common.txt b/requirements/common.txt index 7c89385da6ba..ce7cb29a6972 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -31,7 +31,7 @@ partial-json-parser # used for parsing partial JSON outputs pyzmq >= 25.0.0 msgspec gguf >= 0.17.0 -mistral_common[image] >= 1.8.5 +mistral_common[image] >= 1.8.8 opencv-python-headless >= 4.11.0 # required for video IO pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 7b2c665448a3..d6a9e87ead58 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -23,7 +23,7 @@ jiwer # required for audio tests timm # required for internvl test transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test -mistral_common[image,audio] >= 1.8.5 # required for voxtral test +mistral_common[image,audio] >= 1.8.8 # required for voxtral test num2words # required for smolvlm test opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test diff --git a/requirements/test.in b/requirements/test.in index 55452ce83f23..3bf2516e875c 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -29,7 +29,7 @@ torchaudio==2.9.1 torchvision==0.24.1 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test -mistral_common[image,audio] >= 1.8.5 # required for voxtral test +mistral_common[image,audio] >= 1.8.8 # required for voxtral test num2words # required for smolvlm test open_clip_torch==2.32.0 # Required for nemotron_vl test opencv-python-headless >= 4.11.0 # required for video test From 44110d1f131d9fe9786394dabedbf4fe51849b50 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 22 Dec 2025 11:54:13 +0100 Subject: [PATCH 4/6] up Signed-off-by: Patrick von Platen --- requirements/test.txt | 2 +- vllm/tokenizers/mistral.py | 37 ++++++++----------------------------- 2 files changed, 9 insertions(+), 30 deletions(-) diff --git a/requirements/test.txt b/requirements/test.txt index ea2093e4347f..5df05e79678d 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -474,7 +474,7 @@ mbstrdecoder==1.1.3 # typepy mdurl==0.1.2 # via markdown-it-py -mistral-common==1.8.5 +mistral-common==1.8.8 # via -r requirements/test.in mlflow==2.22.0 # via terratorch diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py index 9c6b40caac4e..06e3f168dd12 100644 --- a/vllm/tokenizers/mistral.py +++ b/vllm/tokenizers/mistral.py @@ -1,47 +1,28 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from functools import cached_property from pathlib import Path from typing import TYPE_CHECKING, Any, cast -from vllm.entrypoints.chat_utils import ChatCompletionMessageParam -from vllm.entrypoints.openai.protocol import ChatCompletionRequest -from vllm.logger import init_logger - -from .protocol import TokenizerLike - - -from mistral_common.tokens.tokenizers.tekken import Tekkenizer -from mistral_common.protocol.instruct.validator import ValidationMode -from mistral_common.protocol.instruct.tool_calls import Function, Tool -from mistral_common.tokens.tokenizers.sentencepiece import ( - SentencePieceTokenizer, -) -from mistral_common.tokens.tokenizers.tekken import Tekkenizer from mistral_common.protocol.instruct.request import ( ChatCompletionRequest as MistralChatCompletionRequest, ) -from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy +from mistral_common.protocol.instruct.tool_calls import Function, Tool from mistral_common.protocol.instruct.validator import ValidationMode -from mistral_common.tokens.tokenizers.sentencepiece import ( - SentencePieceTokenizer, -) -from mistral_common.tokens.tokenizers.tekken import Tekkenizer - from mistral_common.tokens.tokenizers.base import ( SpecialTokenPolicy, SpecialTokens, ) +from mistral_common.tokens.tokenizers.instruct import InstructTokenizerV13 from mistral_common.tokens.tokenizers.sentencepiece import ( SentencePieceTokenizer, ) from mistral_common.tokens.tokenizers.tekken import Tekkenizer -from mistral_common.tokens.tokenizers.base import ( - SpecialTokenPolicy, - SpecialTokens, -) -from mistral_common.tokens.tokenizers.instruct import InstructTokenizerV13 +from vllm.entrypoints.chat_utils import ChatCompletionMessageParam +from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.logger import init_logger + +from .protocol import TokenizerLike if TYPE_CHECKING: from transformers import BatchEncoding @@ -298,9 +279,8 @@ def __init__(self, tokenizer: "MistralCommonBackend") -> None: self._special_tokens = self._get_special_tokens(self._special_token_ids) self._special_tokens_set = set(self._special_tokens) - def _get_special_token_ids(self) -> list[int]: - return sorted(self.tokenizer.is_special(i) for i in len(self._vocab)) + return sorted(self.tokenizer.is_special(i) for i in range(len(self._vocab))) def _get_special_tokens(self, all_special_ids: list[int]) -> list[str]: return [ @@ -462,7 +442,6 @@ def batch_decode( ) def convert_tokens_to_string(self, tokens: list[str]) -> str: - to_decode_special_tokens = {SpecialTokens.tool_calls} if self.is_tekken: assert isinstance(self.tokenizer, Tekkenizer), type(self.tokenizer) From 1f8ac26564b67876ef40b766c8cded89e2a19b02 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 22 Dec 2025 12:01:49 +0100 Subject: [PATCH 5/6] Update vllm/tokenizers/mistral.py Co-authored-by: Julien Denize <40604584+juliendenize@users.noreply.github.com> Signed-off-by: Patrick von Platen --- vllm/tokenizers/mistral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py index 06e3f168dd12..38e8be6f1e4f 100644 --- a/vllm/tokenizers/mistral.py +++ b/vllm/tokenizers/mistral.py @@ -269,11 +269,11 @@ def __init__(self, tokenizer: "MistralCommonBackend") -> None: # Sort the dict for convenience self._vocab_dict = dict(sorted(self._vocab_dict.items(), key=lambda x: x[1])) - # Cache special tokens for faster access. - self._vocab = self.tokenizer.vocab() # Vocab sorted by token id. + self._vocab = self.tokenizer.vocab() self._max_token_id = self.vocab_size - 1 + # Cache special tokens for faster access. self._special_token_ids = self._get_special_token_ids() self._special_token_ids_set = set(self._special_token_ids) self._special_tokens = self._get_special_tokens(self._special_token_ids) From 1762062ac588bc265a939fb739e0a1ebf5e92a57 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 22 Dec 2025 12:02:22 +0100 Subject: [PATCH 6/6] Update vllm/tokenizers/mistral.py Co-authored-by: Julien Denize <40604584+juliendenize@users.noreply.github.com> Signed-off-by: Patrick von Platen --- vllm/tokenizers/mistral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py index 38e8be6f1e4f..090286228dda 100644 --- a/vllm/tokenizers/mistral.py +++ b/vllm/tokenizers/mistral.py @@ -280,7 +280,7 @@ def __init__(self, tokenizer: "MistralCommonBackend") -> None: self._special_tokens_set = set(self._special_tokens) def _get_special_token_ids(self) -> list[int]: - return sorted(self.tokenizer.is_special(i) for i in range(len(self._vocab))) + return [i for i in range(len(self._vocab)) if self.tokenizer.is_special(i)] def _get_special_tokens(self, all_special_ids: list[int]) -> list[str]: return [