Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 3 additions & 19 deletions bindings/python/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ impl PyBPE {
signature = (vocab=None, merges=None, **kwargs),
text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None, byte_fallback=False, ignore_merges=False)")]
fn new(
py: Python<'_>,
_py: Python<'_>,
vocab: Option<PyVocab>,
merges: Option<PyMerges>,
kwargs: Option<&Bound<'_, PyDict>>,
Expand All @@ -443,11 +443,6 @@ impl PyBPE {
builder = builder.vocab_and_merges(vocab, merges);
}
(PyVocab::Filename(vocab_filename), PyMerges::Filename(merges_filename)) => {
deprecation_warning(
py,
"0.9.0",
"BPE.__init__ will not create from files anymore, try `BPE.from_file` instead",
)?;
builder =
builder.files(vocab_filename.to_string(), merges_filename.to_string());
}
Expand Down Expand Up @@ -649,7 +644,7 @@ impl PyWordPiece {
text_signature = "(self, vocab=None, unk_token='[UNK]', max_input_chars_per_word=100, continuing_subword_prefix='##')"
)]
fn new(
py: Python<'_>,
_py: Python<'_>,
vocab: Option<PyVocab>,
kwargs: Option<&Bound<'_, PyDict>>,
) -> PyResult<(Self, PyModel)> {
Expand All @@ -662,11 +657,6 @@ impl PyWordPiece {
builder = builder.vocab(vocab);
}
PyVocab::Filename(vocab_filename) => {
deprecation_warning(
py,
"0.9.0",
"WordPiece.__init__ will not create from files anymore, try `WordPiece.from_file` instead",
)?;
builder = builder.files(vocab_filename.to_string());
}
}
Expand Down Expand Up @@ -765,7 +755,7 @@ impl PyWordLevel {
text_signature = "(self, vocab=None, unk_token=None)"
)]
fn new(
py: Python<'_>,
_py: Python<'_>,
vocab: Option<PyVocab>,
unk_token: Option<String>,
) -> PyResult<(Self, PyModel)> {
Expand All @@ -778,12 +768,6 @@ impl PyWordLevel {
builder = builder.vocab(vocab);
}
PyVocab::Filename(vocab_filename) => {
deprecation_warning(
py,
"0.9.0",
"WordLevel.__init__ will not create from files anymore, \
try `WordLevel.from_file` instead",
)?;
builder = builder.files(vocab_filename.to_string());
}
};
Expand Down
45 changes: 4 additions & 41 deletions bindings/python/tests/bindings/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,38 +7,6 @@


class TestBPE:
def test_instantiate(self, roberta_files):
assert isinstance(BPE(), Model)
assert isinstance(BPE(), BPE)

vocab = {"a": 0, "b": 1, "ab": 2}
merges = [("a", "b")]
assert isinstance(BPE(vocab, merges), Model)
assert isinstance(BPE.from_file(roberta_files["vocab"], roberta_files["merges"]), BPE)
with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
BPE(vocab=vocab)
with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
BPE(merges=merges)

assert isinstance(
pickle.loads(pickle.dumps(BPE(vocab, merges))),
BPE,
)

# Deprecated calls in 0.9
with pytest.deprecated_call():
assert isinstance(BPE(roberta_files["vocab"], roberta_files["merges"]), Model)

with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
BPE(vocab=roberta_files["vocab"])
with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
BPE(merges=roberta_files["merges"])
with pytest.deprecated_call():
assert isinstance(
pickle.loads(pickle.dumps(BPE(roberta_files["vocab"], roberta_files["merges"]))),
BPE,
)

def test_can_modify(self):
model = BPE(
dropout=0.5,
Expand Down Expand Up @@ -85,11 +53,8 @@ def test_instantiate(self, bert_files):
assert isinstance(WordPiece.from_file(bert_files["vocab"]), WordPiece)
assert isinstance(pickle.loads(pickle.dumps(WordPiece(vocab))), WordPiece)

# Deprecated calls in 0.9
with pytest.deprecated_call():
assert isinstance(WordPiece(bert_files["vocab"]), Model)
with pytest.deprecated_call():
assert isinstance(pickle.loads(pickle.dumps(WordPiece(bert_files["vocab"]))), WordPiece)
assert isinstance(WordPiece(bert_files["vocab"]), Model)
assert isinstance(pickle.loads(pickle.dumps(WordPiece(bert_files["vocab"]))), WordPiece)

def test_can_modify(self):
model = WordPiece(
Expand Down Expand Up @@ -123,10 +88,8 @@ def test_instantiate(self, roberta_files):

# The WordLevel model expects a vocab.json using the same format as roberta
# so we can just try to load with this file
with pytest.deprecated_call():
assert isinstance(WordLevel(roberta_files["vocab"]), Model)
with pytest.deprecated_call():
assert isinstance(WordLevel(roberta_files["vocab"]), WordLevel)
assert isinstance(WordLevel(roberta_files["vocab"]), Model)
assert isinstance(WordLevel(roberta_files["vocab"]), WordLevel)

def test_can_modify(self):
model = WordLevel(unk_token="<oov>")
Expand Down
4 changes: 1 addition & 3 deletions bindings/python/tests/bindings/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,7 @@ def test_instantiate(self):
assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)

def test_processing(self, roberta_files):
# Deprecated in 0.9
with pytest.deprecated_call():
tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)

# Keeps original offsets
Expand Down
6 changes: 2 additions & 4 deletions bindings/python/tests/bindings/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,7 @@ def test_encode(self):
assert len(output) == 2

def test_encode_formats(self, bert_files):
with pytest.deprecated_call():
tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
tokenizer = BertWordPieceTokenizer(bert_files["vocab"])

# Encode
output = tokenizer.encode("my name is john")
Expand Down Expand Up @@ -287,8 +286,7 @@ def test_pair(input, is_pretokenized=False):
tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True)

def test_encode_add_special_tokens(self, roberta_files):
with pytest.deprecated_call():
tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
tokenizer.add_special_tokens(["<s>", "</s>"])

tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
Expand Down
3 changes: 1 addition & 2 deletions tokenizers/src/models/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,7 @@ impl Serialize for OrderedVocabIter<'_> {
};

if !holes.is_empty() {
warn!("The OrderedVocab you are attempting to save contains holes for indices {holes:?}, your vocabulary could be corrupted!");
println!("The OrderedVocab you are attempting to save contains holes for indices {holes:?}, your vocabulary could be corrupted!");
warn!("The OrderedVocab you are attempting to serialize contains holes for indices {holes:?}, your vocabulary could be corrupted!");
}
result
}
Expand Down
Loading