diff --git a/.gitignore b/.gitignore index 9a6b945c..d8c8629e 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,4 @@ datasets qqp glm_large_qqp_pytorch wandb -clip_benchmark_datasets/ \ No newline at end of file +clip_benchmark_datasets diff --git a/README.md b/README.md index 54ce31f3..d48f1be3 100644 --- a/README.md +++ b/README.md @@ -260,6 +260,6 @@ The majority of FlagAI is licensed under the [Apache 2.0 license](LICENSE), howe ### ↳ Star History
-[![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)](https://star-history.com/#baaivision/EVA&Date) +![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)]
diff --git a/examples/bert_title_generation_english/generate.py b/examples/bert_title_generation_english/generate.py index 1124d16d..fdfa2f41 100644 --- a/examples/bert_title_generation_english/generate.py +++ b/examples/bert_title_generation_english/generate.py @@ -7,7 +7,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -model_dir = "../state_dict/" +model_dir = "./checkpoints/" # Note "./checkpoints_seq2seq/{}/mp_rank_00_model_states.pt", {} is a directory in the checkpoints_seq2seq. model_save_path = "./checkpoints_seq2seq/7079/mp_rank_00_model_states.pt" diff --git a/examples/roberta_semantic_matching/train.py b/examples/roberta_semantic_matching/train.py index e0648063..30e9821f 100644 --- a/examples/roberta_semantic_matching/train.py +++ b/examples/roberta_semantic_matching/train.py @@ -27,7 +27,7 @@ cur_dir = os.path.dirname(os.path.abspath(__file__)) train_path = cur_dir + "/data/train.tsv" -model_dir = "./state_dict/" +model_dir = "./checkpoints/" maxlen = 256 auto_loader = AutoLoader("semantic-matching", diff --git a/flagai/data/dataset/block/blocklm_utils.py b/flagai/data/dataset/block/blocklm_utils.py index 4687305f..44fda3d2 100644 --- a/flagai/data/dataset/block/blocklm_utils.py +++ b/flagai/data/dataset/block/blocklm_utils.py @@ -86,10 +86,10 @@ def __init__(self, self.encoder_decoder = encoder_decoder self.shuffle_blocks = shuffle_blocks self.sentinel_token = sentinel_token - self.generation_mask = 'gMASK' if task_mask else 'MASK' + self.generation_mask = 'gMASK' if task_mask else 'mask' self.generation_mask = self.tokenizer.get_command_id( self.generation_mask) - self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK' + self.gap_sentence_mask = 'sMASK' if task_mask else 'mask' self.gap_sentence_mask = self.tokenizer.get_command_id( self.gap_sentence_mask) self.random_position = random_position @@ -205,7 +205,7 @@ def make_masked_data(self, # position_ids = np.arange(len(tokens), dtype=np.int64) targets = copy.deepcopy(tokens) - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') mlm_masks = np.zeros(len(tokens), dtype=np.int64) for start, end in block_spans: for idx in range(start, end): @@ -273,7 +273,7 @@ def make_block_data(self, elif task == 'gap_sentence': mask_id = self.gap_sentence_mask else: - mask_token = 'MASK' if idx == 0 else f'MASK{idx}' + mask_token = 'mask' if idx == 0 else f'MASK{idx}' mask_id = self.tokenizer.get_command_id(mask_token) local_spans.append((current_length, current_length + start - last)) source_tokens.append(tokens[last:start]) diff --git a/flagai/data/dataset/data_collator/collate_fn.py b/flagai/data/dataset/data_collator/collate_fn.py index 73b2f8e5..6eb629d5 100644 --- a/flagai/data/dataset/data_collator/collate_fn.py +++ b/flagai/data/dataset/data_collator/collate_fn.py @@ -126,7 +126,7 @@ def __init__(self, args, tokenizer, task_name): def encode(self, example): cls_id = self.tokenizer.get_command_id('cls') - mask_token = 'sMASK' if self.args.task_mask else 'MASK' + mask_token = 'sMASK' if self.args.task_mask else 'mask' mask_id = self.tokenizer.get_command_id(mask_token) pad_id = self.tokenizer.get_command_id('pad') sop_id = self.tokenizer.get_command_id('sop') @@ -175,7 +175,7 @@ def sub_finder(mylist, pattern): source_tokens = [cls_id] + source_tokens + [mask_id ] + answer_tokens elif self.task_name in ["cmrc"]: - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') source_text = example.text_a target_text = example.meta["answer"].strip() question = example.meta["question"].strip() @@ -191,7 +191,7 @@ def sub_finder(mylist, pattern): mask_id ] + source_tokens[:max_src_length] elif self.task_name in ["wsc"]: - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') source_text = example.text_a target_text = example.meta["answer"].strip() question = example.meta["question"].strip() @@ -307,10 +307,10 @@ def __init__(self, self.encoder_decoder = encoder_decoder self.shuffle_blocks = shuffle_blocks self.sentinel_token = sentinel_token - self.generation_mask = 'gMASK' if task_mask else 'MASK' + self.generation_mask = 'gMASK' if task_mask else 'mask' self.generation_mask = self.tokenizer.get_command_id( self.generation_mask) - self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK' + self.gap_sentence_mask = 'sMASK' if task_mask else 'mask' self.gap_sentence_mask = self.tokenizer.get_command_id( self.gap_sentence_mask) self.random_position = random_position @@ -426,7 +426,7 @@ def make_masked_data(self, position_ids = np.arange(len(tokens), dtype=np.int64) targets = copy.deepcopy(tokens) - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') mlm_masks = np.zeros(len(tokens), dtype=np.int64) for start, end in block_spans: for idx in range(start, end): @@ -494,7 +494,7 @@ def make_block_data(self, elif task == 'gap_sentence': mask_id = self.gap_sentence_mask else: - mask_token = 'MASK' if idx == 0 else f'MASK{idx}' + mask_token = 'mask' if idx == 0 else f'MASK{idx}' mask_id = self.tokenizer.get_command_id(mask_token) local_spans.append((current_length, current_length + start - last)) source_tokens.append(tokens[last:start]) diff --git a/flagai/data/dataset/data_utils.py b/flagai/data/dataset/data_utils.py index 4f0ee38d..1efee372 100644 --- a/flagai/data/dataset/data_utils.py +++ b/flagai/data/dataset/data_utils.py @@ -134,7 +134,7 @@ def build_input_from_ids(text_a_ids, # Prepare ids for special tokens if mask_id is None: - mask_id = tokenizer.get_command_id('MASK') + mask_id = tokenizer.get_command_id('mask') eos_id = tokenizer.get_command_id('eos') # end of sentence token cls_id = tokenizer.get_command_id('cls') # start of sentence token sep_id = tokenizer.get_command_id('sep') # seperator of two texts token @@ -235,7 +235,7 @@ def build_input_from_ids(text_a_ids, # def build_decoder_input(enc_ids, answer_ids, max_seq_length, max_dec_seq_length, tokenizer): - mask_id = tokenizer.get_command_id('MASK') + mask_id = tokenizer.get_command_id('mask') eos_id = tokenizer.get_command_id('eos') sop_id = tokenizer.get_command_id('sop') masks = [] diff --git a/flagai/data/dataset/language_model/dataset.py b/flagai/data/dataset/language_model/dataset.py index b291251b..a911df81 100644 --- a/flagai/data/dataset/language_model/dataset.py +++ b/flagai/data/dataset/language_model/dataset.py @@ -38,7 +38,7 @@ def __init__(self, args, documents, tokenizer, num_original_tokens, self.left_weights = [0] + self.weights[:-1] self.unidirectional = args.unidirectional self.block_lm = args.block_lm - mask_token = "gMASK" if args.task_mask else 'MASK' + mask_token = "gMASK" if args.task_mask else 'mask' self.mask_id = self.tokenizer.get_command_id(mask_token) def __len__(self): @@ -115,7 +115,7 @@ def __init__(self, args, tokenizer, strict=True): self.strict = strict self.block_lm = args.block_lm self.unidirectional = args.unidirectional - mask_token = "gMASK" if args.task_mask else 'MASK' + mask_token = "gMASK" if args.task_mask else 'mask' self.mask_id = self.tokenizer.get_command_id(mask_token) self.tokens = [] diff --git a/flagai/data/dataset/seq2seq/dataset.py b/flagai/data/dataset/seq2seq/dataset.py index adc28149..b0bc4148 100644 --- a/flagai/data/dataset/seq2seq/dataset.py +++ b/flagai/data/dataset/seq2seq/dataset.py @@ -477,7 +477,7 @@ def __len__(self): def __getitem__(self, idx): example = self.example_list[idx] source_text, target_text = example.text_a, example.text_b - mask_token = 'MASK' + mask_token = 'mask' mask_id = self.tokenizer.get_command_id(mask_token) sop_id = self.tokenizer.get_command_id('sop') eop_id = self.tokenizer.get_command_id('eop') @@ -612,7 +612,7 @@ def __len__(self): def __getitem__(self, idx): example = self.example_list[idx] source_text = example.text_a - mask_token = 'gMASK' if self.args.task_mask else 'MASK' + mask_token = 'gMASK' if self.args.task_mask else 'mask' mask_id = self.tokenizer.get_command_id(mask_token) sop_id = self.tokenizer.get_command_id('sop') eop_id = self.tokenizer.get_command_id('eop') diff --git a/flagai/data/dataset/superglue/pvp.py b/flagai/data/dataset/superglue/pvp.py index d4d07b39..8a4d6ee3 100644 --- a/flagai/data/dataset/superglue/pvp.py +++ b/flagai/data/dataset/superglue/pvp.py @@ -97,12 +97,12 @@ def spell_length(self): @property def mask(self) -> str: """Return the underlying LM's mask token""" - return self.tokenizer.get_command_id('MASK') + return self.tokenizer.get_command_id('mask') @property def mask_id(self) -> int: """Return the underlying LM's mask id""" - return self.tokenizer.get_command_id('MASK') + return self.tokenizer.get_command_id('mask') @property def max_num_verbalizers(self) -> int: @@ -574,13 +574,13 @@ def spell_length(self): @property def mask(self) -> str: """Return the underlying LM's mask token""" - mask_token = 'MASK' + mask_token = 'mask' return self.tokenizer.get_command_id(mask_token) @property def mask_id(self) -> int: """Return the underlying LM's mask id""" - mask_token = 'MASK' + mask_token = 'mask' return self.tokenizer.get_command_id(mask_token) def get_answers(self, example: InputExample): diff --git a/flagai/data/tokenizer/bert/bert_tokenizer.py b/flagai/data/tokenizer/bert/bert_tokenizer.py index eec168ea..3c935713 100644 --- a/flagai/data/tokenizer/bert/bert_tokenizer.py +++ b/flagai/data/tokenizer/bert/bert_tokenizer.py @@ -75,7 +75,7 @@ def __init__(self, tokenizer_model_type=None, cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git a/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py b/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py index 7bae0deb..24275e1f 100644 --- a/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py +++ b/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py @@ -37,7 +37,7 @@ def __init__(self, vocab_file, model_file, max_length=None): self.encoder = json.load(open(vocab_file)) self.decoder = {v: k for k, v in self.encoder.items()} - self.sp = spm.SentencePieceProcessor(model_file=model_file) + self.sp_model = spm.SentencePieceProcessor(model_file=model_file) self.translator = str.maketrans(" \n", "\u2582\u2583") self.token_start_id = 0 self.token_end_id = 3 @@ -48,6 +48,13 @@ def __init__(self, vocab_file, model_file, max_length=None): def vocab_size(self): return len(self.encoder) + def get_vocab(self): + vocab = { + self.convert_id_to_token(i): i + for i in range(self.vocab_size) + } + return vocab + def __len__(self): return len(self.encoder) + len(self.special_tokens) @@ -57,19 +64,28 @@ def eod(self): def tokenize(self, text): """ Tokenize a string. """ - seg_list = [ - x.translate(self.translator) - for x in jieba.cut(text, cut_all=False) - ] - new_seg = " ".join(seg_list) - return self.sp.encode(new_seg) + seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)] + new_seg = "".join(seg_list) + return self.sp_model.encode(new_seg) def encode(self, text): res = self.tokenize(text) return res + + def convert_tokens_to_ids(self, tokens): + return [self.sp_model.PieceToId(token) for token in tokens] + + def convert_token_to_id(self, token): + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, idx): + return self.sp_model.IdToPiece(int(idx)) + + def convert_ids_to_tokens(self, idxs): + return [self.sp_model.IdToPiece(int(idx)) for idx in idxs] def decode(self, tokens): - text = self.sp.decode(tokens) + text = self.sp_model.decode(tokens) text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n') return text @@ -78,3 +94,18 @@ def encode_plus(self, text, max_length=None): res = self.encode(text) return {"input_ids": res} + + def convert_tokens_to_string(self, tokens, all_command_token={}): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in all_command_token: + out_string += self.sp_model.decode_pieces( + current_sub_tokens) + token + " " + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + out_string += self.sp_model.decode_pieces(current_sub_tokens) + return out_string.strip() diff --git a/flagai/data/tokenizer/galactica/galactica_tokenizer.py b/flagai/data/tokenizer/galactica/galactica_tokenizer.py index fdaf5be6..f028d0f0 100644 --- a/flagai/data/tokenizer/galactica/galactica_tokenizer.py +++ b/flagai/data/tokenizer/galactica/galactica_tokenizer.py @@ -15,7 +15,7 @@ def __init__(self, download_dir) -> None: self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git a/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py b/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py index b762b66b..e592d33d 100644 --- a/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py +++ b/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py @@ -60,7 +60,7 @@ def __init__(self, self.text_tokenizer.encoder['']), CommandToken('cls', '[CLS]', self.text_tokenizer.encoder['']), - CommandToken('MASK', + CommandToken('mask', '[MASK]', self.text_tokenizer.encoder[''], lstrip=True), @@ -88,7 +88,7 @@ def __init__(self, CommandToken('sop', '<|startofpiece|>', self.num_tokens), CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1), CommandToken('cls', '[CLS]', self.num_tokens + 2), - CommandToken('MASK', + CommandToken('mask', '[MASK]', self.num_tokens + 3, lstrip=True), diff --git a/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py b/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py index 69048d3a..b91797f6 100644 --- a/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py +++ b/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py @@ -55,7 +55,7 @@ def __init__(self, CommandToken('eos', '<|endoftext|>', self.num_text_tokens), CommandToken('sep', '[SEP]', self.num_text_tokens + 1), CommandToken('cls', '[CLS]', self.num_text_tokens + 2), - CommandToken('MASK', + CommandToken('mask', '[MASK]', self.num_text_tokens + 3, lstrip=True), diff --git a/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py b/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py index ff4e1e4a..db4c726f 100644 --- a/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py +++ b/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py @@ -59,7 +59,7 @@ def __init__(self, self._command_tokens = [ CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']), CommandToken('cls', '[CLS]', self.text_tokenizer.vocab['[CLS]']), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.text_tokenizer.vocab['[MASK]']), CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']), CommandToken('sep', '[SEP]', self.text_tokenizer.vocab['[SEP]']), diff --git a/flagai/data/tokenizer/opt/opt_en_tokenizer.py b/flagai/data/tokenizer/opt/opt_en_tokenizer.py index 5c1c0de8..9e8e528c 100644 --- a/flagai/data/tokenizer/opt/opt_en_tokenizer.py +++ b/flagai/data/tokenizer/opt/opt_en_tokenizer.py @@ -35,7 +35,7 @@ def __init__(self, tokenizer_model_type="facebook/opt-125m", cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git a/flagai/data/tokenizer/roberta/roberta_tokenizer.py b/flagai/data/tokenizer/roberta/roberta_tokenizer.py index 553a8a83..f1b270e4 100644 --- a/flagai/data/tokenizer/roberta/roberta_tokenizer.py +++ b/flagai/data/tokenizer/roberta/roberta_tokenizer.py @@ -38,7 +38,7 @@ def __init__(self, tokenizer_model_type="roberta-base", cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git a/flagai/data/tokenizer/t5/t5_tokenizer.py b/flagai/data/tokenizer/t5/t5_tokenizer.py index ef793b67..499aa83e 100644 --- a/flagai/data/tokenizer/t5/t5_tokenizer.py +++ b/flagai/data/tokenizer/t5/t5_tokenizer.py @@ -45,7 +45,7 @@ def __init__(self, tokenizer_model_type="t5-base", cache_dir=None): CommandToken('pad', '[PAD]', self.num_tokens + 1), CommandToken('cls', '[CLS]', self.num_tokens + 2), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.num_tokens + 3), ] self._command_tokens.extend([ diff --git a/flagai/data/tokenizer/tokenizer.py b/flagai/data/tokenizer/tokenizer.py index c3ba085f..43585688 100644 --- a/flagai/data/tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/tokenizer.py @@ -54,7 +54,7 @@ def __str__(self): ('sep', 4), ('L2R', 5), ('cls', 6), - ('MASK', 7), + ('mask', 7), ] DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS) """define some default type tokens for bert training""" diff --git a/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py index f3583437..37629623 100644 --- a/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py @@ -83,7 +83,8 @@ def from_pretrained(cls, *inputs, **kwargs) elif tokenizer_class == "sp": - return cls(sp_model_file=resolved_sp_file, + return cls(vocab_file=resolved_vocab_json_file, + sp_model_file=resolved_sp_file, tokenizer_class=tokenizer_class, tokenizer_model_name=tokenizer_model_name, tokenizer_json_file=resolved_tokenizer_json_file, diff --git a/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py index a873c54e..1f175e16 100644 --- a/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py @@ -152,7 +152,7 @@ def tokenize(self, text): def convert_token_to_id(self, token): """ Converts a sequence of tokens into ids using the vocab. """ - return self.encoder.get(token, 0) + return self.encoder[token] def convert_tokens_to_ids(self, tokens): """ Converts a sequence of tokens into ids using the vocab. """ diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index 457c6412..4f0c0cde 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -31,6 +31,7 @@ from flagai.data.tokenizer.uni_tokenizer.diffusion_bert_tokenizer import FullTokenizer from typing import List, Union, Optional import unicodedata +import json def is_control(ch): @@ -49,7 +50,9 @@ def __init__(self, add_sentinel_token=0, add_task_mask=True, add_decoder_mask=False, - fix_command_token=True, + fix_command_token=False, + pre_tokenizer=None, + special_tokens=['cls','pad','unk','eos','sep','mask'], **kwargs): super().__init__(**kwargs) @@ -69,277 +72,88 @@ def __init__(self, self.text_tokenizer = BPETokenizer(self.vocab_file, self.merges_file) elif self.tokenizer_class == "sp": - self.text_tokenizer = SentencePieceTokenizer(self.sp_model_file) + if self.tokenizer_model_name.lower().startswith('cpm'): + from flagai.data.tokenizer.cpm_1.cpm1_tokenizer import CPMTokenizer + self.text_tokenizer = CPMTokenizer(self.vocab_file, self.sp_model_file) + elif self.tokenizer_model_name.lower().startswith('cpm3'): + from flagai.data.tokenizer.cpm_3.cpm3_tokenizer import CPMTokenizer + self.text_tokenizer = CPMTokenizer(self.tokenizer_json_file, self.sp_model_file) + else: + self.text_tokenizer = SentencePieceTokenizer(self.sp_model_file) else: raise NotImplementedError("cannot assign a tokenize class") self.is_glm = self.tokenizer_model_name.lower().startswith('glm') # self.is_clip = self.tokenizer_model_name.startswith('clip') self.num_tokens = self.text_tokenizer.vocab_size - - if self.tokenizer_class == "wp": - # set command tokens from wordpiece tokenizer values - self.num_command_tokens = 6 - self.num_text_tokens = self.num_tokens - 5 - self.num_type_tokens = 2 - self.token_start_id = None - self.token_end_id = None - self.token_pad_id = None - try: - self._command_tokens = [ - CommandToken( - 'pad', '[PAD]', - self.text_tokenizer.convert_token_to_id('[PAD]')), - CommandToken( - 'cls', '[CLS]', - self.text_tokenizer.convert_token_to_id('[CLS]')), - CommandToken( - 'MASK', '[MASK]', - self.text_tokenizer.convert_token_to_id('[MASK]')), - CommandToken( - 'unk', '[UNK]', - self.text_tokenizer.convert_token_to_id('[UNK]')), - CommandToken( - 'sep', '[SEP]', - self.text_tokenizer.convert_token_to_id('[SEP]')), - CommandToken( - 'eos', '[PAD]', - self.text_tokenizer.convert_token_to_id('[PAD]')), - ] - self.token_start_id = self.text_tokenizer.convert_token_to_id( - '[CLS]') - self.token_end_id = self.text_tokenizer.convert_token_to_id( - '[SEP]') - self.token_pad_id = self.text_tokenizer.convert_token_to_id( - '[PAD]') + if self.tokenizer_model_name.startswith('cpm'): + special_tokens.append('eod') + if self.tokenizer_model_name.startswith('opt'): + special_tokens.append('bos') + + try: + with open(self.special_tokens_map, encoding='utf8') as file: dct=json.load(file) + sp_tokens = [(k.replace("_token",""),v['content']) for k,v in dct.items()] + except FileNotFoundError: + dct = None + sp_tokens = [] + for tk in special_tokens: + res = self.search_special(tk) + if res: + sp_tokens += [(tk, res)] + self._command_tokens = [CommandToken(e[0], e[1], self.text_tokenizer.convert_token_to_id(e[1])) for e in sp_tokens] + if self.tokenizer_model_name.lower().startswith("glm"): + if self.tokenizer_class == "wp": self.text_tokenizer._token_cls = "[CLS]" self.text_tokenizer._token_sep = "[SEP]" - - except KeyError: + fix_command_token = False + elif self.tokenizer_class == "sp": + fix_command_token = True self._command_tokens = [ - CommandToken( - 'pad', '[PAD]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'cls', '[CLS]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'MASK', '[MASK]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'unk', '[UNK]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'sep', '[SEP]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'eos', '[PAD]', - self.text_tokenizer.convert_token_to_id('')), + CommandToken('pad', '<|endoftext|>', self.num_tokens), + CommandToken('eos', '<|endoftext|>', self.num_tokens), + CommandToken('sep', '[SEP]', self.num_tokens + 1), + CommandToken('cls', '[CLS]', self.num_tokens + 2), + CommandToken('mask', '[MASK]', self.num_tokens + 3, lstrip=True), + CommandToken('unk', '[UNK]', self.num_tokens + 4) ] - self.token_start_id = self.text_tokenizer.convert_token_to_id( - '') - self.token_end_id = self.text_tokenizer.convert_token_to_id( - '') - self.token_pad_id = self.text_tokenizer.convert_token_to_id( - '') - self.text_tokenizer._token_cls = "" - self.text_tokenizer._token_sep = "" - if add_block_symbols: - self.add_command_token('sop', '<|startofpiece|>') - self.add_command_token('eop', '<|endofpiece|>',) - if add_task_mask: - self.add_command_token('gMASK', '[gMASK]') - self.add_command_token('sMASK', '[sMASK]') - if add_decoder_mask: - self.add_command_token('dBLOCK', '[dBLOCK]') - if add_sentinel_token > 0: - for i in range(1, add_sentinel_token): - self.add_command_token(f'MASK{i}', f'[MASK{i}]') - self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>') - elif self.tokenizer_class == "bpe": - if self.tokenizer_model_name.lower().startswith('roberta'): - self.num_command_tokens = 6 - self.num_text_tokens = self.num_tokens - 3 - self._command_tokens = [ - CommandToken( - 'pad', '<|endoftext|>', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'eos', '<|endoftext|>', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'sep', '[SEP]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'cls', '[CLS]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'MASK', - '[MASK]', - self.text_tokenizer.convert_token_to_id(''), - lstrip=True), - CommandToken( - 'unk', '[UNK]', - self.text_tokenizer.convert_token_to_id('')) - ] - if add_block_symbols: - self._command_tokens.extend([ - CommandToken('sop', '<|startofpiece|>', - self.num_tokens), - CommandToken('eop', '<|endofpiece|>', - self.num_tokens + 1) - ]) - self.num_tokens += 2 - self.num_command_tokens += 2 - self.token_end_id = self.text_tokenizer.convert_token_to_id( - '') - elif self.tokenizer_model_name.lower().startswith('clip'): - self.num_command_tokens = 2 + self.num_tokens += 6 + elif self.tokenizer_class == "bpe": self._command_tokens = [ - CommandToken( - 'sot', '', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'eot', '', - self.text_tokenizer.convert_token_to_id('')), + CommandToken('pad', '<|endoftext|>', + self.text_tokenizer.encoder['<|endoftext|>']), + CommandToken('eos', '<|endoftext|>', + self.text_tokenizer.encoder['<|endoftext|>']) ] - self.num_tokens += self.num_command_tokens - self.token_end_id = self.text_tokenizer.convert_token_to_id( - '') - else: - self.num_command_tokens = 2 - self.num_text_tokens = self.num_tokens - 1 - self._command_tokens = [ - CommandToken( - 'pad', '<|endoftext|>', - self.text_tokenizer.convert_token_to_id( - '<|endoftext|>')), - CommandToken( - 'eos', '<|endoftext|>', - self.text_tokenizer.convert_token_to_id( - '<|endoftext|>')) - ] - self.token_end_id = self.text_tokenizer.convert_token_to_id( - '<|endoftext|>') - if add_block_symbols: - if self.tokenizer_model_name.lower().startswith('glm'): - unk_token_id = self.num_tokens + 5 - cls_token_id = self.num_tokens + 2 - num_tokens_to_add = 5 - else: - unk_token_id = self.text_tokenizer.convert_token_to_id( - '<|endoftext|>') - cls_token_id = self.text_tokenizer.convert_token_to_id( - '<|endoftext|>') - num_tokens_to_add = 4 - self._command_tokens.extend([ - CommandToken('sop', '<|startofpiece|>', - self.num_tokens), - CommandToken('eop', '<|endofpiece|>', - self.num_tokens + 1), - CommandToken('cls', '[CLS]', cls_token_id), - CommandToken('MASK', - '[MASK]', - self.num_tokens + 3, - lstrip=True), - CommandToken('sep', '[SEP]', self.num_tokens + 4), - CommandToken('unk', '[UNK]', unk_token_id) - ]) - self.num_tokens += num_tokens_to_add - self.num_command_tokens += 6 - - if add_block_symbols: - if add_task_mask: - self._command_tokens.extend([ - CommandToken('gMASK', - '[gMASK]', - self.num_tokens, - lstrip=True), - CommandToken('sMASK', - '[sMASK]', - self.num_tokens + 1, - lstrip=True) - ]) - self.num_tokens += 2 - self.num_command_tokens += 2 - if add_decoder_mask: - self._command_tokens.extend( - [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) - self.num_tokens += 1 - self.num_command_tokens += 1 - - elif self.tokenizer_class == "sp": - self.num_command_tokens = 0 - self.num_text_tokens = self.text_tokenizer.vocab_size - self.num_tokens = self.num_text_tokens - - if self.tokenizer_model_name.lower().startswith('glm'): - pad_token_id = self.num_tokens - eos_token_id = self.num_tokens - unk_token_id = self.num_tokens + 4 - else: - pad_token_id = self.text_tokenizer.convert_token_to_id('') - eos_token_id = self.text_tokenizer.convert_token_to_id('') - unk_token_id = self.text_tokenizer.convert_token_to_id('') - self._command_tokens = [ - CommandToken('pad', '<|endoftext|>', self.num_text_tokens), - CommandToken('eos', '<|endoftext|>', self.num_text_tokens), - CommandToken('sep', '[SEP]', self.num_text_tokens + 1), - CommandToken('cls', '[CLS]', self.num_text_tokens + 2), - CommandToken('MASK', - '[MASK]', - self.num_text_tokens + 3, - lstrip=True), - CommandToken('unk', '[UNK]', self.num_text_tokens + 4) - ] - - self.num_tokens += 5 - self.num_command_tokens += 6 - self.token_end_id = self.text_tokenizer.convert_token_to_id( - '') - if add_block_symbols: - sop_id = self.text_tokenizer.convert_token_to_id('<|startofpiece|>') - eop_id = self.text_tokenizer.convert_token_to_id('<|endofpiece|>') self._command_tokens.extend([ - CommandToken('sop', '<|startofpiece|>', - self.num_tokens + 1), - CommandToken('eop', '<|endofpiece|>', self.num_tokens + 2) + CommandToken('sop', '<|startofpiece|>', self.num_tokens), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1), + CommandToken('cls', '[CLS]', self.num_tokens + 2), + CommandToken('mask', + '[MASK]', + self.num_tokens + 3, + lstrip=True), + CommandToken('sep', '[SEP]', self.num_tokens + 4), + CommandToken('unk', '[UNK]', self.num_tokens + 5) ]) - if fix_command_token: - self.num_tokens += 3 - else: - self.num_tokens += 2 - self.num_command_tokens += 2 + self.num_tokens += 6 + if add_block_symbols: + if not self.tokenizer_class == "bpe": + self.add_command_token('sop', '<|startofpiece|>',self.tokenizer_class) + self.add_command_token('eop', '<|endofpiece|>',self.tokenizer_class) if add_task_mask: if fix_command_token: - self._command_tokens.extend([ - CommandToken('sMASK', - '[sMASK]', - self.num_tokens, - lstrip=True), - CommandToken('gMASK', - '[gMASK]', - self.num_tokens + 1, - lstrip=True) - ]) + self.add_command_token('sMASK', '[sMASK]',self.tokenizer_class) + self.add_command_token('gMASK', '[gMASK]',self.tokenizer_class) else: - self._command_tokens.extend([ - CommandToken('gMASK', - '[gMASK]', - self.num_tokens, - lstrip=True), - CommandToken('sMASK', - '[sMASK]', - self.num_tokens + 1, - lstrip=True) - ]) - self.num_tokens += 2 - self.num_command_tokens += 2 + self.add_command_token('gMASK', '[gMASK]',self.tokenizer_class) + self.add_command_token('sMASK', '[sMASK]',self.tokenizer_class) if add_decoder_mask: - self._command_tokens.extend( - [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) - self.num_tokens += 1 - self.num_command_tokens += 1 + self.add_command_token('dBLOCK', '[dBLOCK]',self.tokenizer_class) + if add_sentinel_token > 0: + for i in range(1, add_sentinel_token): + self.add_command_token(f'MASK{i}', f'[MASK{i}]',self.tokenizer_class) + self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>',self.tokenizer_class) self.command_name_map = {tok.name: tok for tok in self._command_tokens} self.command_token_map = { tok.token: tok @@ -347,7 +161,17 @@ def __init__(self, } self.command_id_map = {tok.Id: tok for tok in self._command_tokens} self._command_token_tokens = list(self.command_token_map.keys()) - logger.info("All special tokens: %s", str([(k,v.Id) for k,v in self.command_name_map.items()])) + vocab = self.text_tokenizer.get_vocab() + self.token_start_id = vocab.get('', None) + if not self.token_start_id: + self.token_start_id = vocab.get('[CLS]', None) + + self.token_end_id = vocab.get('', None) + if not self.token_end_id: + self.token_end_id = vocab.get('<|endoftext|>', None) + if not self.token_end_id: + self.token_end_id = vocab.get('[SEP]', None) + print("All special tokens: ", str([(k, v.token, v.Id) for k,v in self.command_name_map.items()])) def get_vocab(self): return self.text_tokenizer.get_vocab() @@ -356,9 +180,12 @@ def get_command_id(self, name): """get command token corresponding to `name`""" return self.command_name_map[name].Id - def add_command_token(self, name, token): + def add_command_token(self, name, token, tokenizer_class="wp"): try: - id = self.text_tokenizer.convert_token_to_id(token) + if tokenizer_class == "sp": + id = self.text_tokenizer.get_vocab()[token] + else: + id = self.text_tokenizer.convert_token_to_id(token) except KeyError: id = self.num_tokens self.num_tokens += 1 @@ -458,7 +285,7 @@ def TokenToId(self, token): def DecodeIds(self, ids): """converts ids to wordpiece tokens and joins them as a text string""" - tokens = [] + tokens = [] for id in ids: if id in self.command_id_map: tokens.append(self.command_id_map[id].token) @@ -472,10 +299,14 @@ def DecodeIds(self, ids): tokens, self.command_token_map) def encode(self, text): + if hasattr(self.text_tokenizer, "encode"): + return self.text_tokenizer.encode(text) return self.convert_tokens_to_ids( self.text_tokenizer.tokenize(text)) def decode(self, ids): + if hasattr(self.text_tokenizer, "decode"): + return self.text_tokenizer.decode(ids) return self.DecodeIds(ids) def DecodeTokens(self, tokens): @@ -566,8 +397,8 @@ def encode_plus_non_glm( ): def get_input_ids(text): - tokens = self.text_tokenizer.tokenize(text) - return self.text_tokenizer.convert_tokens_to_ids(tokens) + tokens = self.tokenize(text) + return self.convert_tokens_to_ids(tokens) first_ids = get_input_ids(text) second_ids = get_input_ids( @@ -635,10 +466,16 @@ def encode_plus( # for Seq2seq max_length=None, padding=True, ): - if not self.tokenizer_model_name.lower().startswith("glm") and not self.tokenizer_model_name.lower().startswith( + if hasattr(self.text_tokenizer, "encode_plus"): + return self.text_tokenizer.encode_plus(source_text) + elif not self.tokenizer_model_name.lower().startswith("glm") and not self.tokenizer_model_name.lower().startswith( "alm"): return self.encode_plus_non_glm(source_text, second_text, truncation, max_length) + + + # elif self.tokenizer_model_name.lower().startswith("opt"): + # return None sop_id = self.get_command_id('sop') # start of piece eop_id = self.get_command_id('eop') # end of piece sep_id = self.get_command_id('sep') # seperation @@ -734,4 +571,42 @@ def tokenize(self, text, maxlen=None, add_spatial_tokens=False): if maxlen is not None: index = int(self.get_command_id('sep') is not None) + 1 self.truncate_sequence(maxlen, tokens, pop_index=-index) - return tokens \ No newline at end of file + return tokens + + def search_special(self, name): + if name == "cls": + if self.check_special(''): return '' + elif self.check_special('[CLS]'): return '[CLS]' + elif name == "pad": + if self.check_special(''): return '' + elif self.check_special('[PAD]'): return '[PAD]' + elif self.check_special('<|endoftext|>'): return '<|endoftext|>' + elif name == "eos": + if self.check_special(''): return '' + elif self.check_special('|endoftext|'): return '|endoftext|' + elif self.check_special('[PAD]'): return '[PAD]' + elif name == "sep": + if self.check_special(''): return '' + elif self.check_special('[SEP]'): return '[SEP]' + elif name == "unk": + if self.check_special(''): return '' + elif self.check_special('[UNK]'): return '[UNK]' + elif name == "bos": + if self.check_special(''): return '' + elif name == "mask": + if self.check_special('[MASK]'): return '[MASK]' + elif self.check_special(''): return '' + elif name == "eod": + if self.check_special(''): return '' + return None + + def check_special(self, tk): + + try: + if self.tokenizer_class == 'sp': + self.text_tokenizer.get_vocab()[tk] + else: + self.text_tokenizer.convert_token_to_id(tk) + return True + except KeyError: + return False diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py index 72077041..61d91ce6 100644 --- a/flagai/model/predictor/utils.py +++ b/flagai/model/predictor/utils.py @@ -1133,7 +1133,7 @@ def alm_beamsearch(model, tokenizer, text, out_max_length, beam_size, eod_token= dtype=torch.long) position_ids = torch.stack((position_ids, block_position_ids), dim=0) position_ids = position_ids.unsqueeze(0) - mask_tokens = ['MASK', 'sMASK', 'gMASK'] + mask_tokens = ['mask', 'sMASK', 'gMASK'] mask_tokens = [tokenizer.get_command_id(token) for token in mask_tokens] end_tokens = [tokenizer.get_command_id('eop'), eod_token] mask_positions = [] @@ -1434,7 +1434,7 @@ def glm_generate_sample( dtype=torch.long) position_ids = torch.stack((position_ids, block_position_ids), dim=0) position_ids = position_ids.unsqueeze(0) - mask_tokens = ['MASK', 'sMASK', 'gMASK'] + mask_tokens = ['mask', 'sMASK', 'gMASK'] mask_tokens = [tokenizer.get_command_id(token) for token in mask_tokens] end_tokens = [tokenizer.get_command_id('eop'), eod_token] mask_positions = [] diff --git a/flagai/test_utils.py b/flagai/test_utils.py index 83dacde3..5faa0aec 100644 --- a/flagai/test_utils.py +++ b/flagai/test_utils.py @@ -14,7 +14,7 @@ def build_input_from_ids(text_a_ids=None, mask_id=None, masked_lm=False): if mask_id is None: - mask_id = tokenizer.get_command_id('MASK') + mask_id = tokenizer.get_command_id('mask') eos_id = tokenizer.get_command_id('eos') cls_id = tokenizer.get_command_id('cls') sep_id = tokenizer.get_command_id('sep') diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 87b7fa63..c72d34a7 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -14,6 +14,14 @@ def test_tokenizer_GLM_large_ch(self): [3378, 1567, 2613, 20282], 'EncodeAsIds Error') self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), '今天吃饭吃了肯德基', 'DecodeIds Error') + self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + ['▁今天', '吃饭', '吃了', '肯德基'], 'tokenize Error') + self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + [50006, 3378, 1567, 2613, 20282, 50001], 'encode_plus Error') + self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), + {('pad', '<|endoftext|>', 50000), ('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), + ('cls', '[CLS]', 50002), ('mask', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), + ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)}, 'SpecialTokens error') def test_tokenizer_GLM_large_en(self): tokenizer = Tokenizer.from_pretrained("GLM-large-en") @@ -22,6 +30,10 @@ def test_tokenizer_GLM_large_en(self): [13017, 7975, 3084, 2033, 3407], '') self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), 'fried chicken makes me happy', 'DecodeIds Error') + self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), + {('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), + ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), + ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)}) # def test_tokenizer_glm_10b_en(self): # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") @@ -30,23 +42,43 @@ def test_tokenizer_GLM_large_en(self): # [25520, 9015, 1838, 502, 3772], '') # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), # 'fried chicken makes me happy', 'DecodeIds Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), + # ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), + # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) + def test_tokenizer_t5(self): - tokenizer = Tokenizer.from_pretrained('t5-base-en') - self.assertEqual(tokenizer.TokenToId("day"), 1135, '') - self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - [3, 7704, 3832, 656, 140, 1095], '') - self.assertEqual(tokenizer.DecodeIds([3, 7704, 3832, 656, 140, 1095]), - 'fried chicken makes me happy', 'DecodeIds Error') + tokenizer = Tokenizer.from_pretrained('T5-base-ch') + self.assertEqual(tokenizer.TokenToId("人"), 297, '') + self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + [306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166], '') + self.assertEqual(tokenizer.DecodeIds([306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166]), + '今天吃饭吃了肯德基', 'DecodeIds Error') + encode_plus_result = tokenizer.encode_plus("今天吃饭吃了肯德基") + self.assertEqual(list(encode_plus_result.keys()), + ['input_ids', 'token_type_ids'], 'encode_plus Error') + self.assertEqual(encode_plus_result['input_ids'], + [101, 306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166, 102], 'encode_plus Error') + self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), + {('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), + ('sep', '[SEP]', 102), ('pad', '[PAD]', 0)}, 'SpecialTokens error') + def test_tokenizer_roberta(self): tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') - # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) self.assertEqual(tokenizer.TokenToId("人"), 782, '') self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), '今天吃饭吃了肯德基', 'DecodeIds Error') + self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + ['今', '天', '吃', '饭', '吃', '了', '肯', '德', '基'], 'tokenize Error') + self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + [101, 791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825, 102], 'encode_plus Error') + self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), + {('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), ('mask', '[MASK]', 103), + ('eos', '[PAD]', 0), ('pad', '[PAD]', 0)}, 'SpecialTokens error') def test_tokenizer_bert(self): tokenizer = Tokenizer.from_pretrained('BERT-base-en') @@ -55,26 +87,48 @@ def test_tokenizer_bert(self): [13017, 7975, 3084, 2033, 3407], '') self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), 'fried chicken makes me happy', 'DecodeIds Error') + self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), + ['fried', 'chicken', 'makes', 'me', 'happy'], 'tokenize Error') + self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], + [101, 13017, 7975, 3084, 2033, 3407, 102], 'encode_plus Error') + self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), + {('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), + ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)}, 'SpecialTokens error') - def test_tokenizer_cpm1(self): - loader = AutoLoader(task_name="lm", - model_name="CPM-large-ch", - model_dir="./checkpoints/", - only_download_config=True) - tokenizer = loader.get_tokenizer() - self.assertEqual(tokenizer.encode("day"), [8, 8275], '') - self.assertEqual(tokenizer.encode("fried chicken makes me happy"), - [2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239], '') - self.assertEqual(tokenizer.decode([2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239]), - 'fried chicken makes me happy', 'DecodeIds Error') + # def test_tokenizer_cpm1(self): + # loader = AutoLoader(task_name="lm", + # model_name="CPM-large-ch", + # model_dir="./checkpoints/", + # only_download_config=True) + + # tokenizer = loader.get_tokenizer() + # self.assertEqual(tokenizer.TokenToId("人"), 62, '') + # self.assertEqual(tokenizer.encode("今天吃饭吃了肯德基"), + # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], '') + # self.assertEqual(tokenizer.DecodeIds([837, 3079, 1777, 3079, 139, 3687, 513, 1463]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'encode_plus Error') + # self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), + # {('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), + # ('mask', '', 6), ('pad', '', 5),('eod', '', 7)}, 'SpecialTokens error') def test_tokenizer_opt(self): - tokenizer = Tokenizer.from_pretrained('opt-125m-en') + tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') self.assertEqual(tokenizer.encode("day"), [1208], '') self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], - [50260, 21209, 5884, 817, 162, 1372, 50260], '') + [0, 21209, 5884, 817, 162, 1372, 2], '') self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), 'fried chicken makes me happy', 'DecodeIds Error') + self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), + ['fried', 'Ġchicken', 'Ġmakes', 'Ġme', 'Ġhappy'], 'tokenize Error') + self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], + [0, 21209, 5884, 817, 162, 1372, 2], 'encode_plus Error') + self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), + {('cls', '', 0), ('pad', '', 1), ('bos', '', 2), ('eos', '', 2), ('unk', '', 3), + ('mask', '', 50264)}, 'SpecialTokens error') def test_tokenizer_clip(self): loader = AutoLoader(task_name="txt_img_matching", @@ -89,6 +143,7 @@ def test_tokenizer_evaclip(self): self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') + def suite(): suite = unittest.TestSuite() suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) @@ -97,7 +152,7 @@ def suite(): suite.addTest(TokenizerTestCase('test_tokenizer_t5')) suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) suite.addTest(TokenizerTestCase('test_tokenizer_bert')) - suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) + # suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) suite.addTest(TokenizerTestCase('test_tokenizer_opt')) suite.addTest(TokenizerTestCase('test_tokenizer_clip')) suite.addTest(TokenizerTestCase('test_tokenizer_evaclip'))