Merged

48 commits
7db5290
update deepseek v2 for tokenizers v5
Feb 24, 2026
5ef0061
adding remote code fix
Feb 24, 2026
7be0c57
fix deepseek name
Feb 24, 2026
1d22701
handle spm conversion from proto only when overriding bad_models
itazap Feb 26, 2026
43a07e1
add script to compare xlni and code_search_net output of 2 tokenizers
itazap Feb 27, 2026
ffb5f09
tiktoken models support
itazap Feb 27, 2026
c1a3a0d
fix tests
itazap Feb 27, 2026
4307831
testssss
itazap Feb 27, 2026
1bbe257
fix gemma
itazap Mar 2, 2026
b5d0bad
apply some feedback
itazap Mar 2, 2026
b7547e4
paligemma processor tests fix
itazap Mar 2, 2026
f5fd840
add relevant changes from #44298
itazap Mar 2, 2026
2b9efdf
json serializable fix
itazap Mar 2, 2026
e141166
add more xlni cases
itazap Mar 2, 2026
ae25381
t5 fix
itazap Mar 2, 2026
77120a2
ruff check code quality
itazap Mar 2, 2026
3b053b0
missed file for t5 test fix
itazap Mar 2, 2026
a5542cc
modular failures
itazap Mar 2, 2026
95bba6c
other modular fixes
itazap Mar 2, 2026
7d46f77
tiktoken.model test
itazap Mar 2, 2026
be29c60
more feedback updates!
itazap Mar 3, 2026
53753c3
fixing models so AutoTokenizer == TokenizersBackend - aligning with c…
itazap Mar 3, 2026
4745745
seamless m4t
itazap Mar 3, 2026
cbda0ca
missed the most important files
itazap Mar 3, 2026
e5c8a2f
Revert "missed the most important files"
itazap Mar 3, 2026
8bf6df0
undo changes to big bird , bert, seamless
itazap Mar 3, 2026
df12cc4
setup and qual
itazap Mar 3, 2026
08b91c6
lasr
itazap Mar 4, 2026
a7c2435
t5
itazap Mar 4, 2026
d5e9aba
dpr bert
itazap Mar 4, 2026
ceeb319
xlmroberta
itazap Mar 4, 2026
b512fc7
reformer
itazap Mar 4, 2026
0c95842
nllb
itazap Mar 4, 2026
34d83ed
style and shit
ArthurZucker Mar 4, 2026
5c8af86
update
ArthurZucker Mar 4, 2026
4d06871
fix
ArthurZucker Mar 4, 2026
2159e92
extract the charsmap
ArthurZucker Mar 4, 2026
db0c5b5
fix mbart?
ArthurZucker Mar 4, 2026
31ff32d
style
ArthurZucker Mar 4, 2026
083ec50
nllb and test tok common read spm precompiled charsmap
itazap Mar 4, 2026
a4fc098
fix whisper?
ArthurZucker Mar 4, 2026
2710fad
Merge branch 'bad_models_update' of github.com:huggingface/transforme…
ArthurZucker Mar 4, 2026
969c0fc
nllb
itazap Mar 4, 2026
47a772a
checked on v4!
ArthurZucker Mar 4, 2026
8039c2b
Merge branch 'bad_models_update' of github.com:huggingface/transforme…
ArthurZucker Mar 4, 2026
e3d3025
fix repo
ArthurZucker Mar 4, 2026
6edc1d3
fix lasr
ArthurZucker Mar 4, 2026
dade5e6
style
ArthurZucker Mar 4, 2026
49 changes: 49 additions & 0 deletions src/transformers/convert_slow_tokenizer.py
@@ -192,6 +192,7 @@ def extract(self, model_type, **kwargs) -> tuple[dict[str, int], list[tuple]]:
AddedToken(token, normalized=False, special=special)
for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
]
kwargs["_spm_precompiled_charsmap"] = getattr(self.proto.normalizer_spec, "precompiled_charsmap", None)
return kwargs


@@ -635,6 +636,54 @@ class SpmConverter(Converter):
SpmExtractor = SentencePieceExtractor
special_tokens = {}

@staticmethod
def build_tokenizer_from_spm_proto(proto, vocab, merges=None):
"""
Similar to the `convert_from_spm` method, but used only when there is no `model_type` class, i.e. no matching class in `TOKENIZERS_MAPPING`; in that case we build a tokenizer directly instead of extracting one from the sentencepiece file.
"""
byte_fallback = proto.trainer_spec.byte_fallback
unk_piece = proto.trainer_spec.unk_piece
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap

# model
if isinstance(vocab, dict):
tokenizer = Tokenizer(
BPE(
vocab=vocab,
merges=merges or [],
unk_token=unk_piece,
fuse_unk=True,
byte_fallback=byte_fallback,
dropout=None,
)
)
elif isinstance(vocab, list) and vocab and isinstance(vocab[0], (tuple, list)):
tokenizer = Tokenizer(
Unigram(
vocab=vocab,
unk_id=proto.trainer_spec.unk_id,
byte_fallback=byte_fallback,
)
)
else:
return None

# normalizer
_normalizers = [normalizers.Replace(" ", "▁")]
Collaborator: that's only for some models, not all of them (e.g. gpt2 uses Ġ)

Collaborator: ah, my bad, sentencepiece never used Ġ! So ignore this comment, probably.
if precompiled_charsmap:
_normalizers.insert(0, normalizers.Precompiled(precompiled_charsmap))
tokenizer.normalizer = normalizers.Sequence(_normalizers)

# decoder
if byte_fallback:
tokenizer.decoder = decoders.Sequence(
[decoders.Replace("▁", " "), decoders.ByteFallback(), decoders.Fuse()]
)
else:
tokenizer.decoder = decoders.Sequence([decoders.Replace("▁", " ")])

return tokenizer

@classmethod
def convert_from_spm(cls, vocab=None, **kwargs):
"""
Expand Down
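For context, here is a minimal sketch (not part of the diff) of how the new `build_tokenizer_from_spm_proto` helper could be driven from a raw sentencepiece model. The file name, the protobuf import path, and the Unigram-style vocab extraction are assumptions; the real converters extract the vocab in model-specific ways.

```python
# Hypothetical usage of SpmConverter.build_tokenizer_from_spm_proto; the
# "tokenizer.model" path and the (piece, score) vocab extraction are assumptions.
from sentencepiece import sentencepiece_model_pb2 as model_pb2

from transformers.convert_slow_tokenizer import SpmConverter

proto = model_pb2.ModelProto()
with open("tokenizer.model", "rb") as f:
    proto.ParseFromString(f.read())

# For a Unigram-style model the vocab is a list of (piece, score) pairs
vocab = [(p.piece, p.score) for p in proto.pieces]

tokenizer = SpmConverter.build_tokenizer_from_spm_proto(proto, vocab)
if tokenizer is not None:  # None means neither the BPE nor the Unigram branch matched
    print(tokenizer.encode("Hello world").tokens)
```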
9 changes: 8 additions & 1 deletion src/transformers/models/auto/tokenization_auto.py
@@ -171,7 +171,6 @@
("lighton_ocr", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
("lilt", "RobertaTokenizer" if is_tokenizers_available() else None),
("longformer", "RobertaTokenizer" if is_tokenizers_available() else None),
("longt5", "T5Tokenizer" if is_tokenizers_available() else None),
("luke", "LukeTokenizer"),
("lxmert", "LxmertTokenizer" if is_tokenizers_available() else None),
("m2m_100", "M2M100Tokenizer" if is_sentencepiece_available() else None),
@@ -342,9 +341,11 @@
MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS: set[str] = {
"arctic",
"deepseek_vl",
"deepseek_vl_v2",
"deepseek_vl_hybrid",
"fuyu",
"hyperclovax_vlm",
"internlm2",
"janus",
"jamba",
"llava",
@@ -706,6 +707,12 @@ def from_pretrained(
or tokenizer_class_from_name(tokenizer_config_class + "Fast") is not None
)
)

# V5: Skip remote tokenizer for custom models with incorrect hub tokenizer class
if has_remote_code and config_model_type in MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS:
has_remote_code = False
tokenizer_auto_map = None

if has_remote_code:
# V5: Always prefer fast tokenizer (index 1), fallback to slow (index 0)
if tokenizer_auto_map[1] is not None:
Expand Down
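The effect of the new guard, roughly: for model types listed in `MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS`, `AutoTokenizer` ignores the tokenizer class advertised by the Hub repo and resolves the class from the local mapping instead. A hedged sketch; the repo id below is hypothetical.

```python
# Illustrative only: the checkpoint name is made up. For a model_type in
# MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS, the remote tokenizer code path is
# skipped and the class comes from the built-in TOKENIZER_MAPPING instead.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("some-org/internlm2-style-checkpoint", trust_remote_code=True)
print(type(tok).__name__)  # a transformers-native tokenizer class, not the Hub-defined one
```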
4 changes: 2 additions & 2 deletions src/transformers/models/bert/tokenization_bert.py
@@ -48,7 +48,7 @@ class BertTokenizer(TokenizersBackend):
Args:
vocab (`str` or `dict[str, int]`, *optional*):
Custom vocabulary dictionary. If not provided, vocabulary is loaded from `vocab_file`.
do_lower_case (`bool`, *optional*, defaults to `False`):
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
@@ -79,7 +79,7 @@ class BertTokenizer(TokenizersBackend):
def __init__(
self,
vocab: str | dict[str, int] | None = None,
do_lower_case: bool = False,
do_lower_case: bool = True,
unk_token: str = "[UNK]",
sep_token: str = "[SEP]",
pad_token: str = "[PAD]",
Expand Down
src/transformers/models/blenderbot/tokenization_blenderbot.py
@@ -13,7 +13,7 @@
# limitations under the License.
"""Tokenization class for Blenderbot."""

from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE

from ...tokenization_utils_base import AddedToken
@@ -170,12 +170,6 @@ def __init__(
add_prefix_space=add_prefix_space,
**kwargs,
)
self._tokenizer.post_processor = processors.RobertaProcessing(
sep=(str(eos_token), self.eos_token_id),
cls=(str(bos_token), self.bos_token_id),
add_prefix_space=add_prefix_space,
trim_offsets=True,
)


__all__ = ["BlenderbotTokenizer"]
9 changes: 4 additions & 5 deletions src/transformers/models/gemma/tokenization_gemma.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from tokenizers import Tokenizer, decoders, normalizers
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
from tokenizers.models import BPE

from ...tokenization_utils_tokenizers import TokenizersBackend
@@ -88,6 +88,9 @@ def __init__(
byte_fallback=True,
)
)
self._tokenizer.pre_tokenizer = pre_tokenizers.Split(
pattern=" ", behavior="merged_with_previous", invert=False
)

self._tokenizer.decoder = decoders.Sequence(
[decoders.Replace("▁", " "), decoders.ByteFallback(), decoders.Fuse()]
@@ -102,9 +105,5 @@ def __init__(
**kwargs,
)

def _unk_id(self) -> int:
# Align with historical Gemma convention: pad, eos, bos, unk
return 3


__all__ = ["GemmaTokenizer"]
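The new Gemma pre-tokenizer splits on spaces while keeping each space attached to the piece before it. A small sketch of that behavior in isolation (the sample sentence is arbitrary):

```python
# Sketch of the added pre-tokenization step on its own: with
# behavior="merged_with_previous", the space stays glued to the preceding piece.
from tokenizers import pre_tokenizers

pre = pre_tokenizers.Split(pattern=" ", behavior="merged_with_previous", invert=False)
print(pre.pre_tokenize_str("Hello world again"))
# [('Hello ', (0, 6)), ('world ', (6, 12)), ('again', (12, 17))]
```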
2 changes: 1 addition & 1 deletion src/transformers/models/gpt_neox/tokenization_gpt_neox.py
@@ -127,7 +127,7 @@ def __init__(
self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
add_prefix_space=add_prefix_space, trim_offsets=trim_offsets
)
self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True)
self._tokenizer.decoder = decoders.ByteLevel()

super().__init__(
errors=errors,
Expand Down
68 changes: 58 additions & 10 deletions src/transformers/models/lasr/modular_lasr.py
@@ -16,7 +16,7 @@
from collections.abc import Callable

import torch
from tokenizers import Tokenizer
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import Unigram
from torch import nn

@@ -46,28 +46,76 @@ def __init__(
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
_spm_precompiled_charsmap=None,
extra_ids=100,
additional_special_tokens=None,
vocab=None,
vocab_file=None,
**kwargs,
):
super().__init__(
self._extra_ids = extra_ids

# Handle extra_ids and additional_special_tokens
if additional_special_tokens is not None:
extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
if len(extra_tokens) < 1:
additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
elif extra_ids > 0 and extra_ids != len(extra_tokens):
raise ValueError(
f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
" provided to LasrTokenizer. In this case the additional_special_tokens must include the extra_ids"
" tokens"
)
else:
extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
additional_special_tokens = extra_tokens

# LASR vocab structure: <pad>=0, </s>=1, <unk>=2, then regular vocab, then extra_ids in reverse
if vocab is not None:
self._vocab_scores = vocab
else:
self._vocab_scores = [
(str(pad_token), 0.0),
(str(eos_token), 0.0),
(str(unk_token), 0.0),
("▁", -2.0), # Space token
]
for i in range(extra_ids - 1, -1, -1):
self._vocab_scores.append((f"<extra_id_{i}>", 0.0))
self._tokenizer = Tokenizer(
Unigram(
self._vocab_scores,
unk_id=3,
byte_fallback=False,
)
)

if _spm_precompiled_charsmap is not None:
self._tokenizer.normalizer = normalizers.Precompiled(_spm_precompiled_charsmap)

self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
[
pre_tokenizers.WhitespaceSplit(),
pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True),
]
)
self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)

TokenizersBackend.__init__(
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
extra_ids=extra_ids,
additional_special_tokens=additional_special_tokens,
vocab=vocab,
vocab_file=vocab_file,
**kwargs,
)
self._tokenizer = Tokenizer(
Unigram(
self._vocab_scores,
unk_id=3,
byte_fallback=False,
)

self._tokenizer.post_processor = processors.TemplateProcessing(
single=["$A", "</s>"],
pair=["$A", "</s>", "$B", "</s>"],
special_tokens=[
("</s>", self.eos_token_id),
],
)

def _decode(
Expand Down
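The `TemplateProcessing` post-processor added above appends the eos token to every encoded sequence (and to each segment of a pair). A toy sketch with a made-up `WordLevel` vocab rather than LASR's Unigram model, just to show the template's effect:

```python
# Toy example (not the LASR vocab): TemplateProcessing appends </s> to singles
# and to both segments of a pair, mirroring the template used above.
from tokenizers import Tokenizer, pre_tokenizers, processors
from tokenizers.models import WordLevel

tok = Tokenizer(WordLevel({"hello": 0, "world": 1, "</s>": 2}, unk_token="</s>"))
tok.pre_tokenizer = pre_tokenizers.Whitespace()
tok.post_processor = processors.TemplateProcessing(
    single=["$A", "</s>"],
    pair=["$A", "</s>", "$B", "</s>"],
    special_tokens=[("</s>", 2)],
)
print(tok.encode("hello world").tokens)     # ['hello', 'world', '</s>']
print(tok.encode("hello", "world").tokens)  # ['hello', '</s>', 'world', '</s>']
```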
7 changes: 4 additions & 3 deletions src/transformers/models/lasr/tokenization_lasr.py
@@ -21,7 +21,7 @@
import itertools
import re

from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import Unigram

from ...tokenization_utils_tokenizers import TokenizersBackend
@@ -76,6 +76,7 @@ def __init__(
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
_spm_precompiled_charsmap=None,
extra_ids=100,
additional_special_tokens=None,
vocab=None,
@@ -119,15 +120,15 @@
)
)

self._tokenizer.normalizer = None
if _spm_precompiled_charsmap is not None:
self._tokenizer.normalizer = normalizers.Precompiled(_spm_precompiled_charsmap)

self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
[
pre_tokenizers.WhitespaceSplit(),
pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True),
]
)

self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)

super().__init__(
Expand Down
19 changes: 6 additions & 13 deletions src/transformers/models/mbart50/tokenization_mbart50.py
@@ -85,6 +85,7 @@ class MBart50Tokenizer(TokenizersBackend):
def __init__(
self,
vocab: str | dict | list | None = None,
_spm_precompiled_charsmap: str | None = None,
src_lang=None,
tgt_lang=None,
eos_token="</s>",
@@ -158,19 +159,11 @@ def __init__(
)
)

# Set normalizer equivalent to Precompiled + Strip + Replace from tokenizer.json
# When loading from pretrained, this will be overridden by the tokenizer.json config
# When creating from extractor (vocab), this provides equivalent behavior
self._tokenizer.normalizer = normalizers.Sequence(
[
normalizers.Replace(Regex(r"[\n\r\t]"), " "), # Precompiled converts newlines/tabs to spaces
normalizers.NFKC(), # Precompiled does NFKC normalization
normalizers.Strip(left=False, right=True), # Strip trailing whitespace (matches tokenizer.json)
normalizers.Replace(
Regex(r" {2,}"), "▁"
), # Replace multiple spaces with underscore (matches tokenizer.json)
]
)
normalizers_ = [normalizers.Replace(Regex(r" {2,}"), " ")]
if _spm_precompiled_charsmap is not None:
normalizers_ = [normalizers.Precompiled(_spm_precompiled_charsmap)] + normalizers_

self._tokenizer.normalizer = normalizers.Sequence(normalizers_)
self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)

self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
Expand Down
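When no `_spm_precompiled_charsmap` is passed, the simplified mbart50 normalizer reduces to a single rule: collapse runs of spaces. A quick sketch of that fallback chain on its own:

```python
# Sketch of the fallback normalizer (no precompiled charsmap available):
# two or more consecutive spaces collapse into one.
from tokenizers import Regex, normalizers

norm = normalizers.Sequence([normalizers.Replace(Regex(r" {2,}"), " ")])
print(norm.normalize_str("hello   world"))  # 'hello world'
```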