74 changes: 74 additions & 0 deletions scripts/fsmt/fsmt-make-super-tiny-model.py
@@ -0,0 +1,74 @@
#!/usr/bin/env python
# coding: utf-8

# This script creates a super tiny model that is useful inside tests, when we just want to test that
# the machinery works, without needing to check the quality of the outcomes.
#
# This version creates a tiny vocab first, and then a tiny model - so the outcome is truly tiny -
# all files ~60KB. Compare this to taking a full-size model, reducing its layers and emb
# dimensions to the minimum, but keeping the full vocab + merges files, which leads to ~3MB in
# total for all files. The latter is done by `fsmt-make-tiny-model.py`.
#
# It will then be used as "stas/tiny-wmt19-en-ru"

from pathlib import Path
import json
import tempfile

from transformers import FSMTTokenizer, FSMTConfig, FSMTForConditionalGeneration
from transformers.tokenization_fsmt import VOCAB_FILES_NAMES

mname_tiny = "tiny-wmt19-en-ru"

# Build

# borrowed from a test
vocab = [ "l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>", "lo", "low", "er</w>", "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>", ]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]

with tempfile.TemporaryDirectory() as tmpdirname:
    build_dir = Path(tmpdirname)
    src_vocab_file = build_dir / VOCAB_FILES_NAMES["src_vocab_file"]
    tgt_vocab_file = build_dir / VOCAB_FILES_NAMES["tgt_vocab_file"]
    merges_file = build_dir / VOCAB_FILES_NAMES["merges_file"]
    with open(src_vocab_file, "w") as fp:
        fp.write(json.dumps(vocab_tokens))
    with open(tgt_vocab_file, "w") as fp:
        fp.write(json.dumps(vocab_tokens))
    with open(merges_file, "w") as fp:
        fp.write("\n".join(merges))

    tokenizer = FSMTTokenizer(
        langs=["en", "ru"],
        src_vocab_size=len(vocab),
        tgt_vocab_size=len(vocab),
        src_vocab_file=src_vocab_file,
        tgt_vocab_file=tgt_vocab_file,
        merges_file=merges_file,
    )
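
    # Quick sanity check (an illustrative addition, not in the original script): with
    # the toy merges above, standard BPE segments "lower" into ["low", "er</w>"].
    assert tokenizer.tokenize("lower") == ["low", "er</w>"]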

config = FSMTConfig(
    langs=["en", "ru"],
    src_vocab_size=1000,
    tgt_vocab_size=1000,
    d_model=4,
    encoder_layers=1,
    decoder_layers=1,
    encoder_ffn_dim=4,
    decoder_ffn_dim=4,
    encoder_attention_heads=1,
    decoder_attention_heads=1,
)

tiny_model = FSMTForConditionalGeneration(config)
print(f"num of params {tiny_model.num_parameters()}")

# Test
batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"])
outputs = tiny_model(**batch, return_dict=True)

print("test output:", len(outputs.logits[0]))

# Save
tiny_model.half() # makes it smaller
tiny_model.save_pretrained(mname_tiny)
tokenizer.save_pretrained(mname_tiny)

print(f"Generated {mname_tiny}")

# Upload
# transformers-cli upload tiny-wmt19-en-ru
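
A minimal consumption sketch (illustrative, not part of the PR), assuming the upload above has been run and the model is therefore available on the hub as "stas/tiny-wmt19-en-ru":

from transformers import FSMTForConditionalGeneration, FSMTTokenizer

tokenizer = FSMTTokenizer.from_pretrained("stas/tiny-wmt19-en-ru")
model = FSMTForConditionalGeneration.from_pretrained("stas/tiny-wmt19-en-ru")
batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"])
outputs = model(**batch, return_dict=True)
print(outputs.logits.shape)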
23 changes: 18 additions & 5 deletions scripts/fsmt/fsmt-make-tiny-model.py
@@ -1,10 +1,19 @@
#!/usr/bin/env python
# coding: utf-8

# This script creates a tiny model that is useful inside tests, when we just want to test that
# the machinery works, without needing to check the quality of the outcomes.
#
# This version creates a tiny model through reduction of a normal pre-trained model, but keeping
# the full vocab and merges files, thus also resulting in a larger model due to the large vocab
# size. This gives ~3MB in total for all files.
#
# If you want a model ~50 times smaller than this one, see `fsmt-make-super-tiny-model.py`, which
# is slightly more complicated.
#
# It will then be used as "stas/tiny-wmt19-en-de"

# Build
from transformers import FSMTTokenizer, FSMTConfig, FSMTForConditionalGeneration
mname = "facebook/wmt19-en-de"
tokenizer = FSMTTokenizer.from_pretrained(mname)
@@ -18,16 +27,20 @@

tiny_model = FSMTForConditionalGeneration(config)
print(f"num of params {tiny_model.num_parameters()}")

# Test
batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"])
outputs = tiny_model(**batch, return_dict=True)

print("test output:", len(outputs.logits[0]))

# Save
mname_tiny = "tiny-wmt19-en-de"
tiny_model.half() # makes it smaller
tiny_model.save_pretrained(mname_tiny)
tokenizer.save_pretrained(mname_tiny)

print(f"Generated {mname_tiny}")

# Upload
# transformers-cli upload tiny-wmt19-en-de
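
For reference, a hypothetical sketch of the reduction step elided above (the hidden lines may differ; the config attribute names mirror the FSMTConfig kwargs used in fsmt-make-super-tiny-model.py):

from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTTokenizer

mname = "facebook/wmt19-en-de"
tokenizer = FSMTTokenizer.from_pretrained(mname)  # keeps the full vocab + merges
config = FSMTConfig.from_pretrained(mname)
# shrink the architecture while leaving the vocab sizes intact
config.d_model = 4
config.encoder_layers = config.decoder_layers = 1
config.encoder_ffn_dim = config.decoder_ffn_dim = 4
config.encoder_attention_heads = config.decoder_attention_heads = 1
tiny_model = FSMTForConditionalGeneration(config)  # fresh, randomly initialized tiny weights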
13 changes: 13 additions & 0 deletions tests/test_tokenization_fsmt.py
@@ -25,6 +25,10 @@
from .test_tokenization_common import TokenizerTesterMixin


# using a tiny model different from the one used for the default params defined in init, to ensure proper testing
FSMT_TINY2 = "stas/tiny-wmt19-en-ru"


class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = FSMTTokenizer

@@ -86,6 +90,15 @@ def tokenizer_ru_en(self):
    def tokenizer_en_ru(self):
        return FSMTTokenizer.from_pretrained("facebook/wmt19-en-ru")

    def test_online_tokenizer_config(self):
        """This just tests that the online tokenizer files get correctly fetched and
        loaded via the tokenizer_config.json, and it's not slow, so it's run by the normal CI.
        """
        tokenizer = FSMTTokenizer.from_pretrained(FSMT_TINY2)
        self.assertListEqual([tokenizer.src_lang, tokenizer.tgt_lang], ["en", "ru"])
        self.assertEqual(tokenizer.src_vocab_size, 21)
        self.assertEqual(tokenizer.tgt_vocab_size, 21)
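
    # (illustrative note, not part of the PR) the tokenizer_config.json fetched above
    # is the file that save_pretrained() wrote in fsmt-make-super-tiny-model.py; it is
    # expected to carry at least the language pair, e.g. {"langs": ["en", "ru"], ...}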

    def test_full_tokenizer(self):
        """Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt"""
        tokenizer = FSMTTokenizer(self.langs, self.src_vocab_file, self.tgt_vocab_file, self.merges_file)