From d7631f551ffb6765ec049d5787df608529a0adac Mon Sep 17 00:00:00 2001 From: Arthur Date: Mon, 5 Jan 2026 13:11:59 +0100 Subject: [PATCH 1/4] make sure the warning is just a warning --- tokenizers/src/models/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tokenizers/src/models/mod.rs b/tokenizers/src/models/mod.rs index 6f5fafca9..041e3b629 100644 --- a/tokenizers/src/models/mod.rs +++ b/tokenizers/src/models/mod.rs @@ -51,8 +51,7 @@ impl Serialize for OrderedVocabIter<'_> { }; if !holes.is_empty() { - warn!("The OrderedVocab you are attempting to save contains holes for indices {holes:?}, your vocabulary could be corrupted!"); - println!("The OrderedVocab you are attempting to save contains holes for indices {holes:?}, your vocabulary could be corrupted!"); + warn!("The OrderedVocab you are attempting to serialize contains holes for indices {holes:?}, your vocabulary could be corrupted!"); } result } From b94ff2eb5d7774ae857d9ed5e14c7268f435b619 Mon Sep 17 00:00:00 2001 From: Arthur Date: Mon, 5 Jan 2026 13:20:31 +0100 Subject: [PATCH 2/4] update --- bindings/python/src/models.rs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs index 67339c3a1..371489640 100644 --- a/bindings/python/src/models.rs +++ b/bindings/python/src/models.rs @@ -443,11 +443,6 @@ impl PyBPE { builder = builder.vocab_and_merges(vocab, merges); } (PyVocab::Filename(vocab_filename), PyMerges::Filename(merges_filename)) => { - deprecation_warning( - py, - "0.9.0", - "BPE.__init__ will not create from files anymore, try `BPE.from_file` instead", - )?; builder = builder.files(vocab_filename.to_string(), merges_filename.to_string()); } @@ -662,11 +657,6 @@ impl PyWordPiece { builder = builder.vocab(vocab); } PyVocab::Filename(vocab_filename) => { - deprecation_warning( - py, - "0.9.0", - "WordPiece.__init__ will not create from files anymore, try `WordPiece.from_file` instead", - )?; builder = 
builder.files(vocab_filename.to_string()); } } @@ -778,12 +768,6 @@ impl PyWordLevel { builder = builder.vocab(vocab); } PyVocab::Filename(vocab_filename) => { - deprecation_warning( - py, - "0.9.0", - "WordLevel.__init__ will not create from files anymore, \ - try `WordLevel.from_file` instead", - )?; builder = builder.files(vocab_filename.to_string()); } }; From ae9cb1cbd57f1b952577baf15e35b73572a940d5 Mon Sep 17 00:00:00 2001 From: Arthur Date: Mon, 5 Jan 2026 13:35:37 +0100 Subject: [PATCH 3/4] nits --- bindings/python/src/models.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs index 371489640..d093fa142 100644 --- a/bindings/python/src/models.rs +++ b/bindings/python/src/models.rs @@ -424,7 +424,7 @@ impl PyBPE { signature = (vocab=None, merges=None, **kwargs), text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None, byte_fallback=False, ignore_merges=False)")] fn new( - py: Python<'_>, + _py: Python<'_>, vocab: Option, merges: Option, kwargs: Option<&Bound<'_, PyDict>>, @@ -644,7 +644,7 @@ impl PyWordPiece { text_signature = "(self, vocab=None, unk_token='[UNK]', max_input_chars_per_word=100, continuing_subword_prefix='##')" )] fn new( - py: Python<'_>, + _py: Python<'_>, vocab: Option, kwargs: Option<&Bound<'_, PyDict>>, ) -> PyResult<(Self, PyModel)> { @@ -755,7 +755,7 @@ impl PyWordLevel { text_signature = "(self, vocab=None, unk_token=None)" )] fn new( - py: Python<'_>, + _py: Python<'_>, vocab: Option, unk_token: Option, ) -> PyResult<(Self, PyModel)> { From 0baa990729ba3ff79e8fefa2190e7288f0fe6f01 Mon Sep 17 00:00:00 2001 From: Arthur Date: Mon, 5 Jan 2026 13:45:27 +0100 Subject: [PATCH 4/4] fix tests --- bindings/python/tests/bindings/test_models.py | 45 ++----------------- .../python/tests/bindings/test_processors.py | 4 +- 
.../python/tests/bindings/test_tokenizer.py | 6 +-- 3 files changed, 7 insertions(+), 48 deletions(-) diff --git a/bindings/python/tests/bindings/test_models.py b/bindings/python/tests/bindings/test_models.py index 063698384..34fba4fb1 100644 --- a/bindings/python/tests/bindings/test_models.py +++ b/bindings/python/tests/bindings/test_models.py @@ -7,38 +7,6 @@ class TestBPE: - def test_instantiate(self, roberta_files): - assert isinstance(BPE(), Model) - assert isinstance(BPE(), BPE) - - vocab = {"a": 0, "b": 1, "ab": 2} - merges = [("a", "b")] - assert isinstance(BPE(vocab, merges), Model) - assert isinstance(BPE.from_file(roberta_files["vocab"], roberta_files["merges"]), BPE) - with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"): - BPE(vocab=vocab) - with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"): - BPE(merges=merges) - - assert isinstance( - pickle.loads(pickle.dumps(BPE(vocab, merges))), - BPE, - ) - - # Deprecated calls in 0.9 - with pytest.deprecated_call(): - assert isinstance(BPE(roberta_files["vocab"], roberta_files["merges"]), Model) - - with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"): - BPE(vocab=roberta_files["vocab"]) - with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"): - BPE(merges=roberta_files["merges"]) - with pytest.deprecated_call(): - assert isinstance( - pickle.loads(pickle.dumps(BPE(roberta_files["vocab"], roberta_files["merges"]))), - BPE, - ) - def test_can_modify(self): model = BPE( dropout=0.5, @@ -85,11 +53,8 @@ def test_instantiate(self, bert_files): assert isinstance(WordPiece.from_file(bert_files["vocab"]), WordPiece) assert isinstance(pickle.loads(pickle.dumps(WordPiece(vocab))), WordPiece) - # Deprecated calls in 0.9 - with pytest.deprecated_call(): - assert isinstance(WordPiece(bert_files["vocab"]), Model) - with pytest.deprecated_call(): - assert 
isinstance(pickle.loads(pickle.dumps(WordPiece(bert_files["vocab"]))), WordPiece) + assert isinstance(WordPiece(bert_files["vocab"]), Model) + assert isinstance(pickle.loads(pickle.dumps(WordPiece(bert_files["vocab"]))), WordPiece) def test_can_modify(self): model = WordPiece( @@ -123,10 +88,8 @@ def test_instantiate(self, roberta_files): # The WordLevel model expects a vocab.json using the same format as roberta # so we can just try to load with this file - with pytest.deprecated_call(): - assert isinstance(WordLevel(roberta_files["vocab"]), Model) - with pytest.deprecated_call(): - assert isinstance(WordLevel(roberta_files["vocab"]), WordLevel) + assert isinstance(WordLevel(roberta_files["vocab"]), Model) + assert isinstance(WordLevel(roberta_files["vocab"]), WordLevel) def test_can_modify(self): model = WordLevel(unk_token="") diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py index 50cf770f0..a7e0ae13e 100644 --- a/bindings/python/tests/bindings/test_processors.py +++ b/bindings/python/tests/bindings/test_processors.py @@ -72,9 +72,7 @@ def test_instantiate(self): assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel) def test_processing(self, roberta_files): - # Deprecated in 0.9 - with pytest.deprecated_call(): - tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"])) + tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"])) tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True) # Keeps original offsets diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py index 8780531c3..b28104e91 100644 --- a/bindings/python/tests/bindings/test_tokenizer.py +++ b/bindings/python/tests/bindings/test_tokenizer.py @@ -155,8 +155,7 @@ def test_encode(self): assert len(output) == 2 def test_encode_formats(self, bert_files): - with pytest.deprecated_call(): - tokenizer = 
BertWordPieceTokenizer(bert_files["vocab"]) + tokenizer = BertWordPieceTokenizer(bert_files["vocab"]) # Encode output = tokenizer.encode("my name is john") @@ -287,8 +286,7 @@ def test_pair(input, is_pretokenized=False): tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True) def test_encode_add_special_tokens(self, roberta_files): - with pytest.deprecated_call(): - tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"])) + tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"])) tokenizer.add_special_tokens(["", ""]) tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)