Integrate fast tokenizers library inside transformers #2674
Changes from all commits
`tokenization_bert.py`:

```diff
@@ -20,7 +20,7 @@
 import os
 import unicodedata
 
-import tokenizers as tk
+from tokenizers import BertWordPieceTokenizer
 
 from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast
```

```diff
@@ -550,14 +550,19 @@ def __init__(
         cls_token="[CLS]",
         mask_token="[MASK]",
         tokenize_chinese_chars=True,
-        max_length=None,
-        pad_to_max_length=False,
-        stride=0,
-        truncation_strategy="longest_first",
         add_special_tokens=True,
         **kwargs
     ):
         super().__init__(
+            BertWordPieceTokenizer(
+                vocab_file=vocab_file,
+                add_special_tokens=add_special_tokens,
+                unk_token=unk_token,
+                sep_token=sep_token,
+                cls_token=cls_token,
+                handle_chinese_chars=tokenize_chinese_chars,
+                lowercase=do_lower_case,
+            ),
             unk_token=unk_token,
             sep_token=sep_token,
             pad_token=pad_token,
```

```diff
@@ -566,32 +571,4 @@ def __init__(
             **kwargs,
         )
-        self._tokenizer = tk.Tokenizer(tk.models.WordPiece.from_files(vocab_file, unk_token=unk_token))
-        self._update_special_tokens()
-        self._tokenizer.with_pre_tokenizer(
-            tk.pre_tokenizers.BertPreTokenizer.new(
-                do_basic_tokenize=do_basic_tokenize,
-                do_lower_case=do_lower_case,
-                tokenize_chinese_chars=tokenize_chinese_chars,
-                never_split=never_split if never_split is not None else [],
-            )
-        )
-        self._tokenizer.with_decoder(tk.decoders.WordPiece.new())
-
-        if add_special_tokens:
-            self._tokenizer.with_post_processor(
-                tk.processors.BertProcessing.new(
-                    (sep_token, self._tokenizer.token_to_id(sep_token)),
-                    (cls_token, self._tokenizer.token_to_id(cls_token)),
-                )
-            )
-        if max_length is not None:
-            self._tokenizer.with_truncation(max_length, stride=stride, strategy=truncation_strategy)
-        self._tokenizer.with_padding(
-            max_length=max_length if pad_to_max_length else None,
-            direction=self.padding_side,
-            pad_id=self.pad_token_id,
-            pad_type_id=self.pad_token_type_id,
-            pad_token=self.pad_token,
-        )
-        self._decoder = tk.decoders.WordPiece.new()
+
         self.do_lower_case = do_lower_case
```
**Member**, commenting on the removed lines -569 to -597: This is very satisfying.
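For context on the refactor above, here is a minimal usage sketch of the `BertWordPieceTokenizer` backend that `BertTokenizerFast.__init__` now delegates to. The vocab path and printed output are illustrative only; the keyword arguments are the ones passed in the diff (tokenizers 0.x API):

```python
# Sketch only: exercises the `tokenizers` backend that the refactored __init__ wraps.
# "bert-base-uncased-vocab.txt" is a placeholder path to a local WordPiece vocab.
from tokenizers import BertWordPieceTokenizer

backend = BertWordPieceTokenizer(
    vocab_file="bert-base-uncased-vocab.txt",
    lowercase=True,                 # maps to do_lower_case
    handle_chinese_chars=True,      # maps to tokenize_chinese_chars
)

encoding = backend.encode("Hello, how are you?")
print(encoding.tokens)  # WordPiece tokens, with [CLS]/[SEP] added by the post-processor
print(encoding.ids)     # the corresponding vocabulary ids
```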
`tokenization_openai.py`:

```diff
@@ -19,9 +19,18 @@
 import logging
 import os
 import re
+from typing import List, Optional, Union
 
+from tokenizers import Tokenizer
+from tokenizers.decoders import BPEDecoder
+from tokenizers.implementations import BaseTokenizer
+from tokenizers.models import BPE
+from tokenizers.normalizers import BertNormalizer, Sequence, unicode_normalizer_from_str
+from tokenizers.pre_tokenizers import BertPreTokenizer
+from tokenizers.trainers import BpeTrainer
+
 from .tokenization_bert import BasicTokenizer
-from .tokenization_utils import PreTrainedTokenizer
+from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast
 
 
 logger = logging.getLogger(__name__)
```

```diff
@@ -213,3 +222,93 @@ def save_vocabulary(self, save_directory):
                 index += 1
 
         return vocab_file, merge_file
+
+
+class _OpenAIGPTCharBPETokenizer(BaseTokenizer):
```
An inline review thread is attached to the `class _OpenAIGPTCharBPETokenizer(BaseTokenizer):` line:

**Member:** Why do we have to have this class here? Don't we have an implementation of char-level BPE in `tokenizers`?

**Member (author):** We do need a special OpenAI GPT implementation because it slightly differs from the char-level BPE we have in `tokenizers`: …

**Member (author):** If we put TransformerXL into `tokenizers.implementations`, maybe this one can make its way to `tokenizers` too. cc @n1t0

**Contributor:** Honestly, I'm not too sure about this. I think …
**Member:** You mean have all the things that made the success of …? Joking. Well, ok for me to keep these in `transformers`.
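To make the question above concrete, this is roughly what the stock char-level BPE shipped with `tokenizers` looks like, for comparison with the class added in this hunk, which swaps in `BertNormalizer`/`BertPreTokenizer`. The class name and defaults are quoted from memory of the tokenizers 0.x releases, so treat this as an assumption rather than the PR's code:

```python
# Comparison sketch (not part of the diff): the generic char-level BPE from `tokenizers`,
# which pre-tokenizes on whitespace and only optionally lowercases, unlike the Bert-style
# normalization and pre-tokenization used by _OpenAIGPTCharBPETokenizer below.
from tokenizers import CharBPETokenizer

generic_bpe = CharBPETokenizer("vocab.json", "merges.txt")  # placeholder local files
print(generic_bpe.encode("Hello world").tokens)
```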
The `@@ -213,3 +222,93 @@` hunk continues:

```diff
+    """
+    OpenAI character-level BPE Tokenizer
+    """
+
+    def __init__(
+        self,
+        vocab_file: Optional[str] = None,
+        merges_file: Optional[str] = None,
+        unk_token: Optional[str] = "<unk>",
+        suffix: Optional[str] = "</w>",
+        dropout: Optional[float] = None,
+        unicode_normalizer: Optional[str] = None,
+    ):
+        if vocab_file is not None and merges_file is not None:
+            tokenizer = Tokenizer(
+                BPE.from_files(
+                    vocab_file, merges_file, dropout=dropout, unk_token=unk_token, end_of_word_suffix=suffix
+                )
+            )
+        else:
+            tokenizer = Tokenizer(BPE.empty())
+
+        # Check for Unicode normalization first (before everything else)
+        normalizers = []
+
+        if unicode_normalizer:
+            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
+
+        # OpenAI normalization is the same as Bert
+        normalizers += [BertNormalizer()]
+
+        # Create the normalizer structure
+        if len(normalizers) > 0:
+            if len(normalizers) > 1:
+                tokenizer.normalizer = Sequence(normalizers)
+            else:
+                tokenizer.normalizer = normalizers[0]
+
+        tokenizer.pre_tokenizer = BertPreTokenizer()
+        tokenizer.decoder = BPEDecoder(suffix=suffix)
+
+        parameters = {
+            "model": "BPE",
+            "unk_token": unk_token,
+            "suffix": suffix,
+            "dropout": dropout,
+        }
+
+        super().__init__(tokenizer, parameters)
+
+    def train(
+        self,
+        files: Union[str, List[str]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        special_tokens: List[str] = ["<unk>"],
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        suffix: Optional[str] = "</w>",
+        show_progress: bool = True,
+    ):
+        """ Train the model using the given files """
+
+        trainer = BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            end_of_word_suffix=suffix,
+            show_progress=show_progress,
+        )
+        if isinstance(files, str):
+            files = [files]
+        self._tokenizer.train(trainer, files)
+
+
+class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
+        kwargs.setdefault("unk_token", unk_token)
+        super().__init__(
+            _OpenAIGPTCharBPETokenizer(vocab_file=vocab_file, merges_file=merges_file, unk_token=unk_token), **kwargs
+        )
```
Review comment: Nice to have this upstream now!
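Finally, a rough sketch of the user-facing API this enables. It assumes the usual pretrained vocab/merges files resolve for the `openai-gpt` shortcut and that the fast tokenizer mirrors the slow tokenizer's `encode_plus` interface, which is the point of `PreTrainedTokenizerFast`:

```python
# Sketch only: using the new fast tokenizer through the transformers API.
from transformers import OpenAIGPTTokenizerFast

tokenizer = OpenAIGPTTokenizerFast.from_pretrained("openai-gpt")

encoded = tokenizer.encode_plus("Hello, my dog is cute")
print(encoded["input_ids"])                    # token ids produced by the Rust-backed tokenizer
print(tokenizer.decode(encoded["input_ids"]))  # round-trip back to text via the BPE decoder
```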