From 257da7cb6b54c18c8644aa7659d03791b6b43572 Mon Sep 17 00:00:00 2001 From: Anhforth Date: Fri, 17 Feb 2023 14:57:34 +0800 Subject: [PATCH 01/54] updated Signed-off-by: Anhforth --- examples/gpt2_title_generation/generate.py | 2 + examples/roberta_ner/generate.py | 12 +- examples/roberta_title_generation/generate.py | 2 +- .../tokenizer/uni_tokenizer/bpe_tokenizer.py | 1 + .../data/tokenizer/uni_tokenizer/tokenizer.py | 137 +++++++++--------- test.py | 92 ++++++++++-- tests/test_tokenizer.py | 20 ++- 7 files changed, 181 insertions(+), 85 deletions(-) diff --git a/examples/gpt2_title_generation/generate.py b/examples/gpt2_title_generation/generate.py index a76d6f81..79023e03 100644 --- a/examples/gpt2_title_generation/generate.py +++ b/examples/gpt2_title_generation/generate.py @@ -1,6 +1,8 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor diff --git a/examples/roberta_ner/generate.py b/examples/roberta_ner/generate.py index 4a7e4850..303a4d93 100644 --- a/examples/roberta_ner/generate.py +++ b/examples/roberta_ner/generate.py @@ -1,17 +1,21 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") +# import sys +# sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + task_name = "ner" -model_dir = "./state_dict/" +model_dir = "./checkpoints" # Note "./checkpoints_ner/{}/mp_rank_00_model_states.pt", {} is a directory in the checkpoints_ner. 
-model_save_path = "./checkpoints_ner/3913/mp_rank_00_model_states.pt" +# model_save_path = "./checkpoints_ner/3913/mp_rank_00_model_states.pt" target = ["O", "B-LOC", "I-LOC", "B-ORG", "I-ORG", "B-PER", "I-PER"] @@ -25,8 +29,8 @@ tokenizer = auto_loader.get_tokenizer() predictor = Predictor(model, tokenizer) -model.load_state_dict( - torch.load(model_save_path, map_location=device)["module"]) +# model.load_state_dict( +# torch.load(model_save_path, map_location=device)["module"]) model.to(device) model.eval() diff --git a/examples/roberta_title_generation/generate.py b/examples/roberta_title_generation/generate.py index c28960be..00dbdf8e 100644 --- a/examples/roberta_title_generation/generate.py +++ b/examples/roberta_title_generation/generate.py @@ -7,7 +7,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -model_dir = "./state_dict" +model_dir = "./checkpoints" # Note "./checkpoints_seq2seq/{}/mp_rank_00_model_states.pt", {} is a directory in the checkpoints_seq2seq. 
model_save_path = "./checkpoints_seq2seq/10/mp_rank_00_model_states.pt" diff --git a/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py index 25883388..a873c54e 100644 --- a/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py @@ -42,6 +42,7 @@ def lru_cache(): return lambda func: func + class BPETokenizer(object): def __init__(self, diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index b27bb24d..4f8e6951 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -46,7 +46,7 @@ def is_control(ch): class Tokenizer(BaseTokenizer): def __init__(self, - add_block_symbols=True, + add_block_symbols=False, add_sentinel_token=0, add_task_mask=True, add_decoder_mask=False, @@ -55,13 +55,11 @@ def __init__(self, super().__init__(**kwargs) if self.tokenizer_class == "wp": - if self.tokenizer_model_name.lower().endswith("ch"): - self.text_tokenizer = WordpieceTokenizer(self.vocab_file, - is_ch=True) - elif self.tokenizer_model_name.lower().startswith('clip-cn'): + if self.tokenizer_model_name.lower().startswith('clip-cn'): self.text_tokenizer = FullTokenizer(self.vocab_file) else: - self.text_tokenizer = WordpieceTokenizer(self.vocab_file) + self.text_tokenizer = WordpieceTokenizer(self.vocab_file, + is_ch=self.tokenizer_model_name.lower().endswith("ch")) elif self.tokenizer_class == "bpe": if self.tokenizer_model_name.lower().startswith('clip'): self.text_tokenizer = MMBPETokenizer(self.vocab_file, @@ -74,10 +72,12 @@ def __init__(self, else: raise NotImplementedError("cannot assign a tokenize class") - self.is_glm = self.tokenizer_model_name.lower().startswith('glm') + if self.tokenizer_model_name.lower().startswith('glm') or self.tokenizer_model_name.lower().startswith('alm'): + add_block_symbols=False + add_block_symbols = True # 
self.is_clip = self.tokenizer_model_name.startswith('clip') self.num_tokens = self.text_tokenizer.vocab_size - + if self.tokenizer_class == "wp": # set command tokens from wordpiece tokenizer values self.num_command_tokens = 6 @@ -86,65 +86,65 @@ def __init__(self, self.token_start_id = None self.token_end_id = None self.token_pad_id = None - try: - self._command_tokens = [ - CommandToken( - 'pad', '[PAD]', - self.text_tokenizer.convert_token_to_id('[PAD]')), - CommandToken( - 'cls', '[CLS]', - self.text_tokenizer.convert_token_to_id('[CLS]')), - CommandToken( - 'MASK', '[MASK]', - self.text_tokenizer.convert_token_to_id('[MASK]')), - CommandToken( - 'unk', '[UNK]', - self.text_tokenizer.convert_token_to_id('[UNK]')), - CommandToken( - 'sep', '[SEP]', - self.text_tokenizer.convert_token_to_id('[SEP]')), - CommandToken( - 'eos', '[PAD]', - self.text_tokenizer.convert_token_to_id('[PAD]')), - ] - self.token_start_id = self.text_tokenizer.convert_token_to_id( - '[CLS]') - self.token_end_id = self.text_tokenizer.convert_token_to_id( - '[SEP]') - self.token_pad_id = self.text_tokenizer.convert_token_to_id( - '[PAD]') - self.text_tokenizer._token_cls = "[CLS]" - self.text_tokenizer._token_sep = "[SEP]" - - except KeyError: - self._command_tokens = [ - CommandToken( - 'pad', '[PAD]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'cls', '[CLS]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'MASK', '[MASK]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'unk', '[UNK]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'sep', '[SEP]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'eos', '[PAD]', - self.text_tokenizer.convert_token_to_id('')), - ] - self.token_start_id = self.text_tokenizer.convert_token_to_id( - '') - self.token_end_id = self.text_tokenizer.convert_token_to_id( - '') - self.token_pad_id = self.text_tokenizer.convert_token_to_id( - '') - 
self.text_tokenizer._token_cls = "" - self.text_tokenizer._token_sep = "" + # try: + self._command_tokens = [ + CommandToken( + 'pad', '[PAD]', + self.text_tokenizer.convert_token_to_id('[PAD]')), + CommandToken( + 'cls', '[CLS]', + self.text_tokenizer.convert_token_to_id('[CLS]')), + CommandToken( + 'MASK', '[MASK]', + self.text_tokenizer.convert_token_to_id('[MASK]')), + CommandToken( + 'unk', '[UNK]', + self.text_tokenizer.convert_token_to_id('[UNK]')), + CommandToken( + 'sep', '[SEP]', + self.text_tokenizer.convert_token_to_id('[SEP]')), + CommandToken( + 'eos', '[PAD]', + self.text_tokenizer.convert_token_to_id('[PAD]')), + ] + self.token_start_id = self.text_tokenizer.convert_token_to_id( + '[CLS]') + self.token_end_id = self.text_tokenizer.convert_token_to_id( + '[SEP]') + self.token_pad_id = self.text_tokenizer.convert_token_to_id( + '[PAD]') + self.text_tokenizer._token_cls = "[CLS]" + self.text_tokenizer._token_sep = "[SEP]" + + # except KeyError: + # self._command_tokens = [ + # CommandToken( + # 'pad', '[PAD]', + # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'cls', '[CLS]', + # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'MASK', '[MASK]', + # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'unk', '[UNK]', + # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'sep', '[SEP]', + # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'eos', '[PAD]', + # self.text_tokenizer.convert_token_to_id('')), + # ] + # self.token_start_id = self.text_tokenizer.convert_token_to_id( + # '') + # self.token_end_id = self.text_tokenizer.convert_token_to_id( + # '') + # self.token_pad_id = self.text_tokenizer.convert_token_to_id( + # '') + # self.text_tokenizer._token_cls = "" + # self.text_tokenizer._token_sep = "" if add_block_symbols: self.add_command_token('sop', '<|startofpiece|>') self.add_command_token('eop', '<|endofpiece|>',) @@ -348,7 +348,8 @@ def __init__(self, } 
self.command_id_map = {tok.Id: tok for tok in self._command_tokens} self._command_token_tokens = list(self.command_token_map.keys()) - logger.info("All special tokens: %s", str([(k,v.Id) for k,v in self.command_name_map.items()])) + print("All special tokens: ", str([(v.name, k, v.Id) for k,v in self.command_token_map.items()])) + # logger.info("All special tokens: %s", str([(k,v.Id) for k,v in self.command_name_map.items()])) def get_vocab(self): return self.text_tokenizer.get_vocab() diff --git a/test.py b/test.py index fce506e0..2cccd4fc 100644 --- a/test.py +++ b/test.py @@ -1,13 +1,85 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") -import unittest - -print('test syn') -test_dir = './tests' -test_report_path = './test_report' -discover = unittest.defaultTestLoader.discover(test_dir, pattern='test_*.py') -with open(test_report_path, "w") as report_file: - runner = unittest.TextTestRunner(stream=report_file, verbosity=2) - #runner=unittest.TextTestRunner() - runner.run(discover) \ No newline at end of file +# import unittest + +# print('test syn') +# test_dir = './tests' +# test_report_path = './test_report' +# discover = unittest.defaultTestLoader.discover(test_dir, pattern='test_*.py') +# with open(test_report_path, "w") as report_file: +# runner = unittest.TextTestRunner(stream=report_file, verbosity=2) +# #runner=unittest.TextTestRunner() +# runner.run(discover) +from dataclasses import dataclass, field +@dataclass(frozen=True, eq=True) +class AddedToken: + """ + AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the + way it should behave. 
+ """ + + content: str = field(default_factory=str) + single_word: bool = False + lstrip: bool = False + rstrip: bool = False + normalized: bool = True + + def __getstate__(self): + return self.__dict__ + +class SpecialTokensMixin: + SPECIAL_TOKENS_ATTRIBUTES = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] + def __init__(self, **kwargs): + print(kwargs) + self._bos_token = None + self._eos_token = None + self._unk_token = None + self._sep_token = None + self._pad_token = None + self._cls_token = None + self._mask_token = None + self._pad_token_type_id = 0 + self._additional_special_tokens = [] + # self.verbose = verbose + for key, value in kwargs.items(): + if value is None: + continue + if key in self.SPECIAL_TOKENS_ATTRIBUTES: + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple" + assert all( + isinstance(t, (str, AddedToken)) for t in value + ), "One of the tokens is not a string or an AddedToken" + setattr(self, key, value) + elif isinstance(value, (str, AddedToken)): + setattr(self, key, value) + else: + raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}") + +class Tokenizer(SpecialTokensMixin): + def __init__(self,eos_token="", + unk_token="", + pad_token="", + additional_special_tokens=None, + **kwargs): + super().__init__( + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + # extra_ids=extra_ids, + additional_special_tokens=additional_special_tokens, + # sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) +tokenizer = Tokenizer() +import pdb;pdb.set_trace() \ No newline at end of file diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index ffc6de0e..35757d5d 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -14,6 +14,10 @@ def test_tokenizer_GLM_large_ch(self): [3378, 1567, 2613, 20282], 
'EncodeAsIds Error') self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), '今天吃饭吃了肯德基', 'DecodeIds Error') + self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], + [('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), ('cls', '[CLS]', 50002), + ('MASK', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), + ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)]) def test_tokenizer_GLM_large_en(self): tokenizer = Tokenizer.from_pretrained("GLM-large-en") @@ -22,6 +26,10 @@ def test_tokenizer_GLM_large_en(self): [13017, 7975, 3084, 2033, 3407], '') self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), 'fried chicken makes me happy', 'DecodeIds Error') + self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], + [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), ('unk', '[UNK]', 100), + ('sep', '[SEP]', 102), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), + ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) # def test_tokenizer_glm_10b_en(self): # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") @@ -38,9 +46,12 @@ def test_tokenizer_t5(self): [3, 7704, 3832, 656, 140, 1095], '') self.assertEqual(tokenizer.DecodeIds([3, 7704, 3832, 656, 140, 1095]), 'fried chicken makes me happy', 'DecodeIds Error') + self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], + [('eos', '<|endoftext|>', 32000), ('sep', '[SEP]', 32001), ('cls', '[CLS]', 32002), + ('MASK', '[MASK]', 32003), ('unk', '[UNK]', 32004)]) def test_tokenizer_roberta(self): - tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') + tokenizer = Tokenizer.from_pretrained('roberta-base-ch') # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) self.assertEqual(tokenizer.TokenToId("人"), 782, '') self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), @@ -48,13 
+59,18 @@ def test_tokenizer_roberta(self): self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), '今天吃饭吃了肯德基', 'DecodeIds Error') + self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], + [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), ('unk', '[UNK]', 100), ('sep', '[SEP]', 102)]) + def test_tokenizer_bert(self): - tokenizer = Tokenizer.from_pretrained('BERT-base-en') + tokenizer = Tokenizer.from_pretrained('bert-base-en') self.assertEqual(tokenizer.TokenToId("day"), 2154, '') self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), [13017, 7975, 3084, 2033, 3407], '') self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), 'fried chicken makes me happy', 'DecodeIds Error') + self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], + [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), ('unk', '[UNK]', 100), ('sep', '[SEP]', 102)]) def test_tokenizer_cpm1(self): loader = AutoLoader(task_name="lm", From cd13fc5a99bd309cc2a7bfde410d759c600bad78 Mon Sep 17 00:00:00 2001 From: Anhforth Date: Mon, 20 Feb 2023 10:48:32 +0800 Subject: [PATCH 02/54] updated Signed-off-by: Anhforth --- examples/opt/generate_opt_1.3b.py | 2 + examples/t5_title_generation/generate.py | 4 +- flagai/auto_model/auto_loader.py | 1 + flagai/data/file_utils.py | 1 - .../tokenizer/uni_tokenizer/base_tokenizer.py | 8 +- ...kenizer.py => diffusion_bert_tokenizer.py} | 0 .../tokenizer/uni_tokenizer/properties.py | 3 +- .../data/tokenizer/uni_tokenizer/tokenizer.py | 36 +++- tests/test_tokenizer.py | 171 +++++++++--------- 9 files changed, 134 insertions(+), 92 deletions(-) rename flagai/data/tokenizer/uni_tokenizer/{difffusion_bert_tokenizer.py => diffusion_bert_tokenizer.py} (100%) diff --git a/examples/opt/generate_opt_1.3b.py b/examples/opt/generate_opt_1.3b.py index 8311a9f1..7d928300 100644 --- a/examples/opt/generate_opt_1.3b.py +++ 
b/examples/opt/generate_opt_1.3b.py @@ -1,3 +1,5 @@ +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.model.predictor.predictor import Predictor from flagai.auto_model.auto_loader import AutoLoader diff --git a/examples/t5_title_generation/generate.py b/examples/t5_title_generation/generate.py index f6e58391..8e1b630c 100644 --- a/examples/t5_title_generation/generate.py +++ b/examples/t5_title_generation/generate.py @@ -1,11 +1,13 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor if __name__ == '__main__': - loader = AutoLoader("title-generation", "T5-base-ch", model_dir="./state_dict/") + loader = AutoLoader("title-generation", "T5-base-ch", model_dir="./checkpoints") model = loader.get_model() tokenizer = loader.get_tokenizer() predictor = Predictor(model, tokenizer) diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py index a7a480f5..3f3caa96 100644 --- a/flagai/auto_model/auto_loader.py +++ b/flagai/auto_model/auto_loader.py @@ -46,6 +46,7 @@ def __getattr__(self, name): "cpm_lm": ("flagai.model.gpt2_model", "GPT2Model"), "t5_seq2seq": ["flagai.model.t5_model", "T5Model"], "t5_lm": ["flagai.model.t5_model", "T5Model"], + "t5_title-generation": ["flagai.model.t5_model", "T5Model"], "alm_lm": ["flagai.model.alm_model", "ALMModel"], "glm_lm": ["flagai.model.glm_model", "GLMModel"], "glm_seq2seq": ["flagai.model.glm_model", "GLMForSeq2Seq"], diff --git a/flagai/data/file_utils.py b/flagai/data/file_utils.py index 15ebf0e9..40af5d47 100644 --- a/flagai/data/file_utils.py +++ b/flagai/data/file_utils.py @@ -20,7 +20,6 @@ from hashlib import sha256 import sys from io import open - import boto3 import requests from botocore.exceptions import ClientError diff --git 
a/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py index dca6bb82..4768e7a9 100644 --- a/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py @@ -1,6 +1,6 @@ import os from flagai.model.file_utils import _get_model_files, _get_model_id, _get_vocab_path -from flagai.data.tokenizer.uni_tokenizer.properties import VOCAB_FILE, MERGES_FILE, SP_MODEL_FILE, VOCAB_JSON_FILE +from flagai.data.tokenizer.uni_tokenizer.properties import VOCAB_FILE, MERGES_FILE, SP_MODEL_FILE, VOCAB_JSON_FILE, SPECIAL_TOKENS_MAP import warnings @@ -63,10 +63,12 @@ def from_pretrained(cls, resolved_vocab_file = os.path.join(cache_dir, VOCAB_FILE) resolved_merges_file = os.path.join(cache_dir, MERGES_FILE) resolved_sp_file = os.path.join(cache_dir, SP_MODEL_FILE) + special_tokens_map = os.path.join(cache_dir, SPECIAL_TOKENS_MAP) if tokenizer_class == "wp": return cls(vocab_file=resolved_vocab_file, tokenizer_class=tokenizer_class, tokenizer_model_name=tokenizer_model_name, + special_tokens_map=special_tokens_map, cache_dir=cache_dir, *inputs, **kwargs) @@ -75,6 +77,7 @@ def from_pretrained(cls, merges_file=resolved_merges_file, tokenizer_class=tokenizer_class, tokenizer_model_name=tokenizer_model_name, + special_tokens_map=special_tokens_map, cache_dir=cache_dir, *inputs, **kwargs) @@ -82,6 +85,7 @@ def from_pretrained(cls, return cls(sp_model_file=resolved_sp_file, tokenizer_class=tokenizer_class, tokenizer_model_name=tokenizer_model_name, + special_tokens_map=special_tokens_map, cache_dir=cache_dir, *inputs, **kwargs) @@ -96,6 +100,7 @@ def __init__(self, sp_model_file=None, tokenizer_class=None, tokenizer_model_name=None, + special_tokens_map=None, cache_dir=None, *inputs, **kwargs): @@ -105,5 +110,6 @@ def __init__(self, self.sp_model_file = sp_model_file self.tokenizer_class = tokenizer_class self.tokenizer_model_name = tokenizer_model_name + self.special_tokens_map = 
special_tokens_map self.cache_dir = cache_dir self.deprecation_warnings = ({}) diff --git a/flagai/data/tokenizer/uni_tokenizer/difffusion_bert_tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/diffusion_bert_tokenizer.py similarity index 100% rename from flagai/data/tokenizer/uni_tokenizer/difffusion_bert_tokenizer.py rename to flagai/data/tokenizer/uni_tokenizer/diffusion_bert_tokenizer.py diff --git a/flagai/data/tokenizer/uni_tokenizer/properties.py b/flagai/data/tokenizer/uni_tokenizer/properties.py index 78499629..d604730e 100644 --- a/flagai/data/tokenizer/uni_tokenizer/properties.py +++ b/flagai/data/tokenizer/uni_tokenizer/properties.py @@ -2,4 +2,5 @@ VOCAB_JSON_FILE = 'vocab.json' MERGES_FILE = 'merges.txt' SP_MODEL_FILE = 'spiece.model' -SPECIAL_TOKENS_NAME = 'special_tokens.txt' \ No newline at end of file +SPECIAL_TOKENS_NAME = 'special_tokens.txt' +SPECIAL_TOKENS_MAP = 'special_tokens_map.json' \ No newline at end of file diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index 4f8e6951..876c0989 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -22,13 +22,14 @@ import itertools import logging import torch +import json logger = logging.getLogger(__name__) from flagai.data.tokenizer.tokenizer import CommandToken from flagai.data.tokenizer.uni_tokenizer.wp_tokenizer import WordpieceTokenizer from flagai.data.tokenizer.uni_tokenizer.bpe_tokenizer import BPETokenizer, MMBPETokenizer from flagai.data.tokenizer.uni_tokenizer.sp_tokenizer import SentencePieceTokenizer from flagai.data.tokenizer.uni_tokenizer.base_tokenizer import BaseTokenizer -from flagai.data.tokenizer.uni_tokenizer.difffusion_bert_tokenizer import FullTokenizer +from flagai.data.tokenizer.uni_tokenizer.diffusion_bert_tokenizer import FullTokenizer from typing import List, Union, Optional import unicodedata @@ -53,7 +54,6 @@ def __init__(self, 
fix_command_token=True, **kwargs): super().__init__(**kwargs) - if self.tokenizer_class == "wp": if self.tokenizer_model_name.lower().startswith('clip-cn'): self.text_tokenizer = FullTokenizer(self.vocab_file) @@ -73,11 +73,12 @@ def __init__(self, raise NotImplementedError("cannot assign a tokenize class") if self.tokenizer_model_name.lower().startswith('glm') or self.tokenizer_model_name.lower().startswith('alm'): - add_block_symbols=False - add_block_symbols = True + add_block_symbols=True # self.is_clip = self.tokenizer_model_name.startswith('clip') self.num_tokens = self.text_tokenizer.vocab_size - + with open(self.special_tokens_map, encoding='utf8') as file: dct=json.load(file) + sp_tokens = [(k.replace("_token",""),v['content']) for k,v in dct.items()] + # self._command_tokens = [CommandToken(e[0], e[1], self.text_tokenizer.convert_token_to_id(e[1])) for e in sp_tokens ] if self.tokenizer_class == "wp": # set command tokens from wordpiece tokenizer values self.num_command_tokens = 6 @@ -207,6 +208,29 @@ def __init__(self, self.num_tokens += self.num_command_tokens self.token_end_id = self.text_tokenizer.convert_token_to_id( '') + elif self.tokenizer_model_name.lower().startswith('opt'): + self._command_tokens = [ + CommandToken( + 'pad', '<|endoftext|>', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'eos', '<|endoftext|>', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'sep', '[SEP]', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'cls', '[CLS]', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'MASK', + '[MASK]', + self.text_tokenizer.convert_token_to_id(''), + lstrip=True), + CommandToken( + 'unk', '[UNK]', + self.text_tokenizer.convert_token_to_id('')) + ] else: self.num_command_tokens = 2 self.num_text_tokens = self.num_tokens - 1 @@ -273,7 +297,7 @@ def __init__(self, self.num_command_tokens = 0 self.num_text_tokens = self.text_tokenizer.vocab_size self.num_tokens = 
self.num_text_tokens - + import pdb; if self.tokenizer_model_name.lower().startswith('glm'): pad_token_id = self.num_tokens eos_token_id = self.num_tokens diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 35757d5d..686d3fa0 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,35 +1,37 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import unittest from flagai.data.tokenizer import Tokenizer from flagai.auto_model.auto_loader import AutoLoader class TokenizerTestCase(unittest.TestCase): - def test_tokenizer_GLM_large_ch(self): - tokenizer = Tokenizer.from_pretrained("GLM-large-ch") - self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') - self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - [3378, 1567, 2613, 20282], 'EncodeAsIds Error') - self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), - '今天吃饭吃了肯德基', 'DecodeIds Error') - self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], - [('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), ('cls', '[CLS]', 50002), - ('MASK', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), - ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)]) - - def test_tokenizer_GLM_large_en(self): - tokenizer = Tokenizer.from_pretrained("GLM-large-en") - self.assertEqual(tokenizer.TokenToId("day"), 2154, '') - self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - [13017, 7975, 3084, 2033, 3407], '') - self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), - 'fried chicken makes me happy', 'DecodeIds Error') - self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], - [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), ('unk', '[UNK]', 100), - ('sep', '[SEP]', 102), 
('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), - ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) + # def test_tokenizer_GLM_large_ch(self): + # tokenizer = Tokenizer.from_pretrained("GLM-large-ch") + # self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') + # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + # [3378, 1567, 2613, 20282], 'EncodeAsIds Error') + # self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + # self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], + # [('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), ('cls', '[CLS]', 50002), + # ('MASK', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), + # ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)]) + + # def test_tokenizer_GLM_large_en(self): + # tokenizer = Tokenizer.from_pretrained("GLM-large-en") + # self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + # [13017, 7975, 3084, 2033, 3407], '') + # self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + # 'fried chicken makes me happy', 'DecodeIds Error') + # self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], + # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), ('unk', '[UNK]', 100), + # ('sep', '[SEP]', 102), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), + # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) # def test_tokenizer_glm_10b_en(self): # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") @@ -50,73 +52,78 @@ def test_tokenizer_t5(self): [('eos', '<|endoftext|>', 32000), ('sep', '[SEP]', 32001), ('cls', '[CLS]', 32002), ('MASK', '[MASK]', 32003), ('unk', '[UNK]', 32004)]) - def test_tokenizer_roberta(self): - tokenizer = 
Tokenizer.from_pretrained('roberta-base-ch') - # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) - self.assertEqual(tokenizer.TokenToId("人"), 782, '') - self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') - self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), - '今天吃饭吃了肯德基', 'DecodeIds Error') - - self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], - [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), ('unk', '[UNK]', 100), ('sep', '[SEP]', 102)]) - - def test_tokenizer_bert(self): - tokenizer = Tokenizer.from_pretrained('bert-base-en') - self.assertEqual(tokenizer.TokenToId("day"), 2154, '') - self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - [13017, 7975, 3084, 2033, 3407], '') - self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), - 'fried chicken makes me happy', 'DecodeIds Error') - self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], - [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), ('unk', '[UNK]', 100), ('sep', '[SEP]', 102)]) - - def test_tokenizer_cpm1(self): - loader = AutoLoader(task_name="lm", - model_name="CPM-large-ch", - model_dir="./checkpoints/", - only_download_config=True) - tokenizer = loader.get_tokenizer() - self.assertEqual(tokenizer.encode("day"), [8, 8275], '') - self.assertEqual(tokenizer.encode("fried chicken makes me happy"), - [2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239], '') - self.assertEqual(tokenizer.decode([2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239]), - 'fried chicken makes me happy', 'DecodeIds Error') - - def test_tokenizer_opt(self): - tokenizer = Tokenizer.from_pretrained('opt-125m-en') - self.assertEqual(tokenizer.encode("day"), [1208], '') - self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], - 
[50260, 21209, 5884, 817, 162, 1372, 50260], '') - self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), - 'fried chicken makes me happy', 'DecodeIds Error') + # def test_tokenizer_roberta(self): + # tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') + # # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) + # self.assertEqual(tokenizer.TokenToId("人"), 782, '') + # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + # [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') + # self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + + # self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], + # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), ('unk', '[UNK]', 100), ('sep', '[SEP]', 102)]) + + # def test_tokenizer_bert(self): + # tokenizer = Tokenizer.from_pretrained('BERT-base-en') + # self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + # [13017, 7975, 3084, 2033, 3407], '') + # self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + # 'fried chicken makes me happy', 'DecodeIds Error') + # self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], + # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), ('unk', '[UNK]', 100), ('sep', '[SEP]', 102)]) + + # def test_tokenizer_cpm1(self): + # loader = AutoLoader(task_name="lm", + # model_name="CPM-large-ch", + # model_dir="./checkpoints/", + # only_download_config=True) + # tokenizer = loader.get_tokenizer() + # self.assertEqual(tokenizer.encode("day"), [8, 8275], '') + # self.assertEqual(tokenizer.encode("fried chicken makes me happy"), + # [2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239], '') + # self.assertEqual(tokenizer.decode([2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239]), + # 'fried 
chicken makes me happy', 'DecodeIds Error') + # self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], + # [('eos', '<|endoftext|>', 30000), ('sep', '[SEP]', 30001), ('cls', '[CLS]', 30002), ('MASK', '[MASK]', 30003), ('unk', '[UNK]', 30004)]) + + # def test_tokenizer_opt(self): + # tokenizer = Tokenizer.from_pretrained('opt-125m-en') + # self.assertEqual(tokenizer.encode("day"), [1208], '') + # self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], + # [50260, 21209, 5884, 817, 162, 1372, 50260], '') + # self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), + # 'fried chicken makes me happy', 'DecodeIds Error') + # self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], + # [('eos', '<|endoftext|>', 50260)]) + - def test_tokenizer_clip(self): - loader = AutoLoader(task_name="txt_img_matching", - model_name="clip-base-p32-224") - tokenizer = loader.get_tokenizer() - self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') + # def test_tokenizer_clip(self): + # loader = AutoLoader(task_name="txt_img_matching", + # model_name="clip-base-p32-224") + # tokenizer = loader.get_tokenizer() + # self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') - def test_tokenizer_evaclip(self): - loader = AutoLoader(task_name="txt_img_matching", - model_name="eva-clip") - tokenizer = loader.get_tokenizer() - self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') + # def test_tokenizer_evaclip(self): + # loader = AutoLoader(task_name="txt_img_matching", + # model_name="eva-clip") + # tokenizer = loader.get_tokenizer() + # self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') def suite(): suite = unittest.TestSuite() - suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) - 
suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) + # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) + # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) # suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) suite.addTest(TokenizerTestCase('test_tokenizer_t5')) - suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) - suite.addTest(TokenizerTestCase('test_tokenizer_bert')) - suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) - suite.addTest(TokenizerTestCase('test_tokenizer_opt')) - suite.addTest(TokenizerTestCase('test_tokenizer_clip')) - suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) + # suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) + # suite.addTest(TokenizerTestCase('test_tokenizer_bert')) + # suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) + # suite.addTest(TokenizerTestCase('test_tokenizer_opt')) + # suite.addTest(TokenizerTestCase('test_tokenizer_clip')) + # suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) return suite From 9dc6e10ffe45e4d97aca72a54834ebde61f072be Mon Sep 17 00:00:00 2001 From: Anhforth Date: Mon, 20 Feb 2023 16:10:47 +0800 Subject: [PATCH 03/54] upadted Signed-off-by: Anhforth --- examples/glm_blank_filling/glm_generate_samples.py | 3 ++- .../glm_blank_filling/glm_generate_samples_en.py | 3 ++- examples/t5_title_generation/generate.py | 14 +++++++------- flagai/data/tokenizer/uni_tokenizer/tokenizer.py | 11 ++++++++++- flagai/model/predictor/utils.py | 3 ++- tests/test_tokenizer.py | 4 ++-- 6 files changed, 25 insertions(+), 13 deletions(-) diff --git a/examples/glm_blank_filling/glm_generate_samples.py b/examples/glm_blank_filling/glm_generate_samples.py index 40e385a9..f290f662 100644 --- a/examples/glm_blank_filling/glm_generate_samples.py +++ b/examples/glm_blank_filling/glm_generate_samples.py @@ -1,7 +1,8 @@ # Copyright © 2022 BAAI. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License") - +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer diff --git a/examples/glm_blank_filling/glm_generate_samples_en.py b/examples/glm_blank_filling/glm_generate_samples_en.py index 009b4ed1..96347ec8 100644 --- a/examples/glm_blank_filling/glm_generate_samples_en.py +++ b/examples/glm_blank_filling/glm_generate_samples_en.py @@ -1,7 +1,8 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") - +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer diff --git a/examples/t5_title_generation/generate.py b/examples/t5_title_generation/generate.py index 8e1b630c..5a4910e8 100644 --- a/examples/t5_title_generation/generate.py +++ b/examples/t5_title_generation/generate.py @@ -19,12 +19,12 @@ beam_size=3, input_max_length=512, out_max_length=100) - out_2 = predictor.predict_generate_randomsample(text, - input_max_length=512, - out_max_length=100, - repetition_penalty=1.5, - top_k=20, - top_p=0.8) +# out_2 = predictor.predict_generate_randomsample(text, +# input_max_length=512, +# out_max_length=100, +# repetition_penalty=1.5, +# top_k=20, +# top_p=0.8) print(f"out_1 is {out_1}") - print(f"out_2 is {out_2}") +# print(f"out_2 is {out_2}") diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index 876c0989..30db5574 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -52,6 +52,7 @@ def __init__(self, add_task_mask=True, add_decoder_mask=False, fix_command_token=True, + pre_tokenizer=None, **kwargs): super().__init__(**kwargs) if self.tokenizer_class == "wp": @@ -75,6 +76,9 @@ def __init__(self, if 
self.tokenizer_model_name.lower().startswith('glm') or self.tokenizer_model_name.lower().startswith('alm'): add_block_symbols=True # self.is_clip = self.tokenizer_model_name.startswith('clip') + # if self.tokenizer_model_name.startswith('t5'): + # import jieba + # self.pre_tokenizer = lambda x: jieba.cut(x, HMM=False) self.num_tokens = self.text_tokenizer.vocab_size with open(self.special_tokens_map, encoding='utf8') as file: dct=json.load(file) sp_tokens = [(k.replace("_token",""),v['content']) for k,v in dct.items()] @@ -590,7 +594,8 @@ def encode_plus_non_glm( truncation=True, max_length=None, ): - + if self.tokenizer_model_name.startswith('t5'): + assert second_text is None, "t5 does not support multi-sentence encoding" def get_input_ids(text): tokens = self.text_tokenizer.tokenize(text) return self.text_tokenizer.convert_tokens_to_ids(tokens) @@ -753,6 +758,10 @@ def tokenize_as_tensor(self, texts): eot_token=eot_token) def tokenize(self, text, maxlen=None, add_spatial_tokens=False): + """ + add_spatial_token: (bool) Add cls at the front and sep at the end + max_len: Truncate the length to max_len + """ tokens = self.text_tokenizer.tokenize(text) if add_spatial_tokens: diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py index 22eab1da..6d78c842 100644 --- a/flagai/model/predictor/utils.py +++ b/flagai/model/predictor/utils.py @@ -928,8 +928,9 @@ def t5_random_sample(model, tokenizer, text, input_max_length, out_max_length, TopPLogitsProcessor(top_p=top_p), ] list_processor = ListProcessor(lp) + from tqdm import trange with torch.no_grad(): - for step in range(out_max_length): + for step in trange(out_max_length): scores = model(**{ "input_ids": token_ids, "decoder_input_ids": input_decoder_ids diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 686d3fa0..666bda4f 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -49,8 +49,8 @@ def test_tokenizer_t5(self): self.assertEqual(tokenizer.DecodeIds([3, 
7704, 3832, 656, 140, 1095]), 'fried chicken makes me happy', 'DecodeIds Error') self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], - [('eos', '<|endoftext|>', 32000), ('sep', '[SEP]', 32001), ('cls', '[CLS]', 32002), - ('MASK', '[MASK]', 32003), ('unk', '[UNK]', 32004)]) + [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), + ('unk', '[UNK]', 100), ('sep', '[SEP]', 102)]) # def test_tokenizer_roberta(self): # tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') From c6ac834579ce9e37f822c7982a1c3d9ee8d1b315 Mon Sep 17 00:00:00 2001 From: Anhforth Date: Wed, 22 Feb 2023 17:23:58 +0800 Subject: [PATCH 04/54] changed special token fule Signed-off-by: Anhforth --- ...TORIAL_15_BERT_EXAMPLE_TITLE_GENERATION.md | 2 +- doc_zh/TUTORIAL_3_MODEL.md | 2 +- docs/TUTORIAL_3_MODEL.md | 2 +- examples/cpm3_generation/generation.py | 2 +- examples/cpm3_pretrain/data_analyze.py | 2 +- examples/cpm_1/generate.py | 4 +- .../glm_blank_filling/glm_generate_samples.py | 30 +- examples/roberta_ner/train.py | 2 +- examples/roberta_ner/train_crf.py | 2 +- examples/roberta_ner/train_global_pointer.py | 2 +- flagai/auto_model/auto_loader.py | 2 +- flagai/data/dataset/block/blocklm_utils.py | 2 +- .../data/dataset/data_collator/collate_fn.py | 6 +- flagai/data/dataset/data_utils.py | 4 +- flagai/data/dataset/superglue/pvp.py | 4 +- flagai/data/tokenizer/__init__.py | 1 + flagai/data/tokenizer/bert/bert_tokenizer.py | 4 +- flagai/data/tokenizer/opt/opt_en_tokenizer.py | 4 +- .../tokenizer/uni_tokenizer/base_tokenizer.py | 6 +- .../tokenizer/uni_tokenizer/properties.py | 1 + .../tokenizer/uni_tokenizer/sp_tokenizer.py | 2 - .../data/tokenizer/uni_tokenizer/tokenizer.py | 734 +++++++++++------- flagai/model/predictor/utils.py | 2 +- flagai/test_utils.py | 2 +- tests/test_tokenizer.py | 116 +-- 25 files changed, 558 insertions(+), 382 deletions(-) diff --git a/doc_zh/TUTORIAL_15_BERT_EXAMPLE_TITLE_GENERATION.md 
b/doc_zh/TUTORIAL_15_BERT_EXAMPLE_TITLE_GENERATION.md index 9be5a623..763c345c 100644 --- a/doc_zh/TUTORIAL_15_BERT_EXAMPLE_TITLE_GENERATION.md +++ b/doc_zh/TUTORIAL_15_BERT_EXAMPLE_TITLE_GENERATION.md @@ -24,7 +24,7 @@ ### 1. 数据加载 样例数据位于 /examples/bert_title_generation/data/ -需要在 ```trianer.py```文件中定义数据读取过程,例如: +需要在 ```trainer.py```文件中定义数据读取过程,例如: ```python def read_file(): src = [] diff --git a/doc_zh/TUTORIAL_3_MODEL.md b/doc_zh/TUTORIAL_3_MODEL.md index 407379d3..e5ca27de 100644 --- a/doc_zh/TUTORIAL_3_MODEL.md +++ b/doc_zh/TUTORIAL_3_MODEL.md @@ -20,7 +20,7 @@ ## From_pretrain `From_pretrain` 函数用于加载模型。同一个模型结构的模型可以用同一个class进行加载,比如`BERT-base-ch` 和`Roberta-base-ch`模型都能用`BertModel`这个`Class`进行加载。`From_pretrain`为了数据/模型并行的模型加载进行了特定优化,避免重复下载导致的资源浪费。 -通过调用`ClassName.from_pretrian()`来进行加载. +通过调用`ClassName.from_pretrain()`来进行加载. ### 从modelhub加载 现在我们支持从modelhub中下载[常用模型](#所有支持模型),可以直接通过`from_pretrain`下载模型配置文件`config.json`,模型权重`pytorch_model.bin`,以及字典文件`vocab.txt`。例子: ```python diff --git a/docs/TUTORIAL_3_MODEL.md b/docs/TUTORIAL_3_MODEL.md index 60d3c3a7..adb2b88a 100644 --- a/docs/TUTORIAL_3_MODEL.md +++ b/docs/TUTORIAL_3_MODEL.md @@ -25,7 +25,7 @@ All supported models now support the three most common model types [encoder, dec ### load model from modelhub -By calling `ClassName.from_pretrian()` to load following [supported models](#all-supported-models), it will automatically download the model configuration file `config.json`, model weights `pytorch_model.bin`, and dictionary files `vocab .txt`. +By calling `ClassName.from_pretrain()` to load following [supported models](#all-supported-models), it will automatically download the model configuration file `config.json`, model weights `pytorch_model.bin`, and dictionary files `vocab .txt`. 
```python >>> # Downloading GLM-large-ch from modelhub diff --git a/examples/cpm3_generation/generation.py b/examples/cpm3_generation/generation.py index 809e7bae..61b3f670 100644 --- a/examples/cpm3_generation/generation.py +++ b/examples/cpm3_generation/generation.py @@ -103,7 +103,7 @@ def calc_banned_ngram_tokens( return banned_tokens -# min_length_constriant +# min_length_constraint def min_length_constraint(logits, cur_len, min_len, tokenizer): # This enforcing a min-length by setting EOS probability to 0. if cur_len <= min_len: diff --git a/examples/cpm3_pretrain/data_analyze.py b/examples/cpm3_pretrain/data_analyze.py index d295eb60..b5fb3a3f 100644 --- a/examples/cpm3_pretrain/data_analyze.py +++ b/examples/cpm3_pretrain/data_analyze.py @@ -1,6 +1,6 @@ import json -fout = open('{}'.format('/sharefs/baai-mrnd/xw/cpm3_trian_data/cpm3_train_data.jsonl'), "w", encoding='utf-8') +fout = open('{}'.format('/sharefs/baai-mrnd/xw/cpm3_train_data/cpm3_train_data.jsonl'), "w", encoding='utf-8') fin = open('{}'.format('/sharefs/webbrain-lijijie/data/CEPSUM/test_public.jsonl'), 'r', encoding='utf-8') def random_mask(source: str): diff --git a/examples/cpm_1/generate.py b/examples/cpm_1/generate.py index 9482c006..24f4fe79 100644 --- a/examples/cpm_1/generate.py +++ b/examples/cpm_1/generate.py @@ -1,3 +1,5 @@ +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor @@ -9,7 +11,7 @@ loader = AutoLoader(task_name="lm", model_name="CPM-large-ch", - model_dir="./state_dict/") + model_dir="./checkpoints") model = loader.get_model() tokenizer = loader.get_tokenizer() diff --git a/examples/glm_blank_filling/glm_generate_samples.py b/examples/glm_blank_filling/glm_generate_samples.py index f290f662..61b66290 100644 --- a/examples/glm_blank_filling/glm_generate_samples.py +++ b/examples/glm_blank_filling/glm_generate_samples.py @@ -1,7 +1,7 @@ # Copyright © 2022 
BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") -import sys +import sys sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.model.glm_model import GLMModel @@ -22,7 +22,7 @@ model.cuda(torch.cuda.current_device()) predictor = Predictor(model, tokenizer) - # generate samples + text = [ '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]" ] @@ -31,17 +31,17 @@ t, top_k=50, repetition_penalty=4.0, top_p=1.0) print(t, '\n', output) - text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"] - for t in text: - output = predictor.predict_generate_randomsample( - t, top_k=50, repetition_penalty=4.0, top_p=1.0) - print(t, '\n', output) + # text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[mask]大都市。", "天津大学是[mask]现代大学。"] + # for t in text: + # output = predictor.predict_generate_randomsample( + # t, top_k=50, repetition_penalty=4.0, top_p=1.0) + # print(t, '\n', output) # - text = [ - "人工智能是一个以计算机科学为基础,由计算机、数学、哲学等多学科交叉融合的交叉学科,[sMASK],具有非常巨大的前景。", - "最近十多年来,人工神经网络的研究工作不断深入,已经取得了很大的进展,[sMASK],表现出了良好的智能特性。" - ] - for t in text: - output = predictor.predict_generate_randomsample( - t, top_k=50, repetition_penalty=4.0, top_p=1.0) - print(t, '\n', output) + # text = [ + # "人工智能是一个以计算机科学为基础,由计算机、数学、哲学等多学科交叉融合的交叉学科,[sMASK],具有非常巨大的前景。", + # "最近十多年来,人工神经网络的研究工作不断深入,已经取得了很大的进展,[sMASK],表现出了良好的智能特性。" + # ] + # for t in text: + # output = predictor.predict_generate_randomsample( + # t, top_k=50, repetition_penalty=4.0, top_p=1.0) + # print(t, '\n', output) diff --git a/examples/roberta_ner/train.py b/examples/roberta_ner/train.py index 74d279ad..d795793b 100644 --- a/examples/roberta_ner/train.py +++ b/examples/roberta_ner/train.py @@ -69,7 +69,7 @@ def load_data(filename): val_data = load_data(valid_path) test_data = load_data(test_path) -print(f"trian_data is {len(train_data)}") +print(f"train_data is {len(train_data)}") print(f"val_data is {len(val_data)}") print(f"test_data is 
{len(test_data)}") print(f"target is {target}") diff --git a/examples/roberta_ner/train_crf.py b/examples/roberta_ner/train_crf.py index 882745c0..36bfaa09 100644 --- a/examples/roberta_ner/train_crf.py +++ b/examples/roberta_ner/train_crf.py @@ -65,7 +65,7 @@ def load_data(filename): val_data = load_data(valid_path) test_data = load_data(test_path) -print(f"trian_data is {len(train_data)}") +print(f"train_data is {len(train_data)}") print(f"val_data is {len(val_data)}") print(f"test_data is {len(test_data)}") print(f"target is {target}") diff --git a/examples/roberta_ner/train_global_pointer.py b/examples/roberta_ner/train_global_pointer.py index 26bfcd02..43fc627b 100644 --- a/examples/roberta_ner/train_global_pointer.py +++ b/examples/roberta_ner/train_global_pointer.py @@ -61,7 +61,7 @@ def load_data(filename): val_data = load_data(valid_path) test_data = load_data(test_path) -print(f"trian_data is {len(train_data)}") +print(f"train_data is {len(train_data)}") print(f"val_data is {len(val_data)}") print(f"test_data is {len(test_data)}") print(f"target is {target}") diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py index 3f3caa96..359562d8 100644 --- a/flagai/auto_model/auto_loader.py +++ b/flagai/auto_model/auto_loader.py @@ -62,7 +62,7 @@ def __getattr__(self, name): "swinv2_classification": ("flagai.model.vision.swinv2", "SwinTransformerV2"), "cpm3_lm": ("flagai.model.cpm3_model", "CPM3"), - "cpm3_trian": ("flagai.model.cpm3_trian_model", "CPM3"), + "cpm3_train": ("flagai.model.cpm3_train_model", "CPM3"), "diffusion_text2img": ("flagai.model.mm.AltDiffusion", "LatentDiffusion"), "altclip_txt_img_matching": ("flagai.model.mm.AltCLIP", "AltCLIP"), "evaclip_txt_img_matching": ("flagai.model.mm.eva_clip_model", "EVA_CLIP"), diff --git a/flagai/data/dataset/block/blocklm_utils.py b/flagai/data/dataset/block/blocklm_utils.py index 4687305f..0103fcba 100644 --- a/flagai/data/dataset/block/blocklm_utils.py +++ 
b/flagai/data/dataset/block/blocklm_utils.py @@ -205,7 +205,7 @@ def make_masked_data(self, # position_ids = np.arange(len(tokens), dtype=np.int64) targets = copy.deepcopy(tokens) - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') mlm_masks = np.zeros(len(tokens), dtype=np.int64) for start, end in block_spans: for idx in range(start, end): diff --git a/flagai/data/dataset/data_collator/collate_fn.py b/flagai/data/dataset/data_collator/collate_fn.py index 73b2f8e5..d6783990 100644 --- a/flagai/data/dataset/data_collator/collate_fn.py +++ b/flagai/data/dataset/data_collator/collate_fn.py @@ -175,7 +175,7 @@ def sub_finder(mylist, pattern): source_tokens = [cls_id] + source_tokens + [mask_id ] + answer_tokens elif self.task_name in ["cmrc"]: - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') source_text = example.text_a target_text = example.meta["answer"].strip() question = example.meta["question"].strip() @@ -191,7 +191,7 @@ def sub_finder(mylist, pattern): mask_id ] + source_tokens[:max_src_length] elif self.task_name in ["wsc"]: - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') source_text = example.text_a target_text = example.meta["answer"].strip() question = example.meta["question"].strip() @@ -426,7 +426,7 @@ def make_masked_data(self, position_ids = np.arange(len(tokens), dtype=np.int64) targets = copy.deepcopy(tokens) - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') mlm_masks = np.zeros(len(tokens), dtype=np.int64) for start, end in block_spans: for idx in range(start, end): diff --git a/flagai/data/dataset/data_utils.py b/flagai/data/dataset/data_utils.py index 4f0ee38d..1efee372 100644 --- a/flagai/data/dataset/data_utils.py +++ b/flagai/data/dataset/data_utils.py @@ -134,7 +134,7 @@ def build_input_from_ids(text_a_ids, # Prepare ids for special tokens if 
mask_id is None: - mask_id = tokenizer.get_command_id('MASK') + mask_id = tokenizer.get_command_id('mask') eos_id = tokenizer.get_command_id('eos') # end of sentence token cls_id = tokenizer.get_command_id('cls') # start of sentence token sep_id = tokenizer.get_command_id('sep') # seperator of two texts token @@ -235,7 +235,7 @@ def build_input_from_ids(text_a_ids, # def build_decoder_input(enc_ids, answer_ids, max_seq_length, max_dec_seq_length, tokenizer): - mask_id = tokenizer.get_command_id('MASK') + mask_id = tokenizer.get_command_id('mask') eos_id = tokenizer.get_command_id('eos') sop_id = tokenizer.get_command_id('sop') masks = [] diff --git a/flagai/data/dataset/superglue/pvp.py b/flagai/data/dataset/superglue/pvp.py index d4d07b39..a1b76b59 100644 --- a/flagai/data/dataset/superglue/pvp.py +++ b/flagai/data/dataset/superglue/pvp.py @@ -97,12 +97,12 @@ def spell_length(self): @property def mask(self) -> str: """Return the underlying LM's mask token""" - return self.tokenizer.get_command_id('MASK') + return self.tokenizer.get_command_id('mask') @property def mask_id(self) -> int: """Return the underlying LM's mask id""" - return self.tokenizer.get_command_id('MASK') + return self.tokenizer.get_command_id('mask') @property def max_num_verbalizers(self) -> int: diff --git a/flagai/data/tokenizer/__init__.py b/flagai/data/tokenizer/__init__.py index e07653af..c780a6b8 100644 --- a/flagai/data/tokenizer/__init__.py +++ b/flagai/data/tokenizer/__init__.py @@ -6,5 +6,6 @@ from .bert.bert_tokenizer import BertWordPieceTokenizer from .cpm_1.cpm1_tokenizer import CPMTokenizer from .opt.opt_en_tokenizer import OPTTokenizer +from .t5.t5_pegasus_tokenizer import T5PegasusTokenizer from .uni_tokenizer.tokenizer import Tokenizer # from .uni_tokenizer.base_tokenizer import BaseTokenizer diff --git a/flagai/data/tokenizer/bert/bert_tokenizer.py b/flagai/data/tokenizer/bert/bert_tokenizer.py index 0ba3fdf6..3c935713 100644 --- a/flagai/data/tokenizer/bert/bert_tokenizer.py 
+++ b/flagai/data/tokenizer/bert/bert_tokenizer.py @@ -74,8 +74,8 @@ def __init__(self, tokenizer_model_type=None, cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), - CommandToken('ENC', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), + CommandToken('mask', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git a/flagai/data/tokenizer/opt/opt_en_tokenizer.py b/flagai/data/tokenizer/opt/opt_en_tokenizer.py index 8501601a..9e8e528c 100644 --- a/flagai/data/tokenizer/opt/opt_en_tokenizer.py +++ b/flagai/data/tokenizer/opt/opt_en_tokenizer.py @@ -34,8 +34,8 @@ def __init__(self, tokenizer_model_type="facebook/opt-125m", cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), - CommandToken('ENC', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), + CommandToken('mask', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git a/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py index 4768e7a9..f3583437 100644 --- a/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py @@ -1,6 +1,6 @@ import os from flagai.model.file_utils import _get_model_files, _get_model_id, _get_vocab_path -from flagai.data.tokenizer.uni_tokenizer.properties import VOCAB_FILE, MERGES_FILE, 
SP_MODEL_FILE, VOCAB_JSON_FILE, SPECIAL_TOKENS_MAP +from flagai.data.tokenizer.uni_tokenizer.properties import VOCAB_FILE, MERGES_FILE, SP_MODEL_FILE, VOCAB_JSON_FILE, TOKENIZER_JSON_FILE, SPECIAL_TOKENS_MAP import warnings @@ -64,6 +64,7 @@ def from_pretrained(cls, resolved_merges_file = os.path.join(cache_dir, MERGES_FILE) resolved_sp_file = os.path.join(cache_dir, SP_MODEL_FILE) special_tokens_map = os.path.join(cache_dir, SPECIAL_TOKENS_MAP) + resolved_tokenizer_json_file = os.path.join(cache_dir, TOKENIZER_JSON_FILE) if tokenizer_class == "wp": return cls(vocab_file=resolved_vocab_file, tokenizer_class=tokenizer_class, @@ -85,6 +86,7 @@ def from_pretrained(cls, return cls(sp_model_file=resolved_sp_file, tokenizer_class=tokenizer_class, tokenizer_model_name=tokenizer_model_name, + tokenizer_json_file=resolved_tokenizer_json_file, special_tokens_map=special_tokens_map, cache_dir=cache_dir, *inputs, @@ -98,6 +100,7 @@ def __init__(self, vocab_file=None, merges_file=None, sp_model_file=None, + tokenizer_json_file=None, tokenizer_class=None, tokenizer_model_name=None, special_tokens_map=None, @@ -110,6 +113,7 @@ def __init__(self, self.sp_model_file = sp_model_file self.tokenizer_class = tokenizer_class self.tokenizer_model_name = tokenizer_model_name + self.tokenizer_json_file = tokenizer_json_file self.special_tokens_map = special_tokens_map self.cache_dir = cache_dir self.deprecation_warnings = ({}) diff --git a/flagai/data/tokenizer/uni_tokenizer/properties.py b/flagai/data/tokenizer/uni_tokenizer/properties.py index d604730e..aa841bfa 100644 --- a/flagai/data/tokenizer/uni_tokenizer/properties.py +++ b/flagai/data/tokenizer/uni_tokenizer/properties.py @@ -2,5 +2,6 @@ VOCAB_JSON_FILE = 'vocab.json' MERGES_FILE = 'merges.txt' SP_MODEL_FILE = 'spiece.model' +TOKENIZER_JSON_FILE = 'tokenizer.json' SPECIAL_TOKENS_NAME = 'special_tokens.txt' SPECIAL_TOKENS_MAP = 'special_tokens_map.json' \ No newline at end of file diff --git 
a/flagai/data/tokenizer/uni_tokenizer/sp_tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/sp_tokenizer.py index 8d817142..e38e7ec5 100644 --- a/flagai/data/tokenizer/uni_tokenizer/sp_tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/sp_tokenizer.py @@ -29,8 +29,6 @@ def __init__(self, model_path): self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(model_path) # vocab = self.get_vocab() - # print(vocab["<|endoftext|>"]) - # print(vocab["<|endofpiece|>"]) @property def vocab_size(self): diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index 30db5574..54326b36 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -51,13 +51,13 @@ def __init__(self, add_sentinel_token=0, add_task_mask=True, add_decoder_mask=False, - fix_command_token=True, + fix_command_token=False, pre_tokenizer=None, **kwargs): super().__init__(**kwargs) if self.tokenizer_class == "wp": if self.tokenizer_model_name.lower().startswith('clip-cn'): - self.text_tokenizer = FullTokenizer(self.vocab_file) + self.text_tokenizer = FullTokenizer(self.vocab_file) else: self.text_tokenizer = WordpieceTokenizer(self.vocab_file, is_ch=self.tokenizer_model_name.lower().endswith("ch")) @@ -69,306 +69,425 @@ def __init__(self, self.text_tokenizer = BPETokenizer(self.vocab_file, self.merges_file) elif self.tokenizer_class == "sp": - self.text_tokenizer = SentencePieceTokenizer(self.sp_model_file) + if self.tokenizer_model_name.lower().startswith('cpm1'): + from flagai.data.tokenizer.cpm_1.cpm1_tokenizer import CPMTokenizer + self.text_tokenizer = CPMTokenizer(self.tokenizer_json_file, self.sp_model_file) + elif self.tokenizer_model_name.lower().startswith('cpm3'): + from flagai.data.tokenizer.cpm_3.cpm3_tokenizer import CPMTokenizer + self.text_tokenizer = CPMTokenizer(self.tokenizer_json_file, self.sp_model_file) + else: + self.text_tokenizer = 
SentencePieceTokenizer(self.sp_model_file) else: raise NotImplementedError("cannot assign a tokenize class") if self.tokenizer_model_name.lower().startswith('glm') or self.tokenizer_model_name.lower().startswith('alm'): add_block_symbols=True # self.is_clip = self.tokenizer_model_name.startswith('clip') - # if self.tokenizer_model_name.startswith('t5'): - # import jieba - # self.pre_tokenizer = lambda x: jieba.cut(x, HMM=False) + self.num_tokens = self.text_tokenizer.vocab_size - with open(self.special_tokens_map, encoding='utf8') as file: dct=json.load(file) - sp_tokens = [(k.replace("_token",""),v['content']) for k,v in dct.items()] - # self._command_tokens = [CommandToken(e[0], e[1], self.text_tokenizer.convert_token_to_id(e[1])) for e in sp_tokens ] - if self.tokenizer_class == "wp": - # set command tokens from wordpiece tokenizer values - self.num_command_tokens = 6 - self.num_text_tokens = self.num_tokens - 5 - self.num_type_tokens = 2 - self.token_start_id = None - self.token_end_id = None - self.token_pad_id = None - # try: - self._command_tokens = [ - CommandToken( - 'pad', '[PAD]', - self.text_tokenizer.convert_token_to_id('[PAD]')), - CommandToken( - 'cls', '[CLS]', - self.text_tokenizer.convert_token_to_id('[CLS]')), - CommandToken( - 'MASK', '[MASK]', - self.text_tokenizer.convert_token_to_id('[MASK]')), - CommandToken( - 'unk', '[UNK]', - self.text_tokenizer.convert_token_to_id('[UNK]')), - CommandToken( - 'sep', '[SEP]', - self.text_tokenizer.convert_token_to_id('[SEP]')), - CommandToken( - 'eos', '[PAD]', - self.text_tokenizer.convert_token_to_id('[PAD]')), - ] - self.token_start_id = self.text_tokenizer.convert_token_to_id( - '[CLS]') - self.token_end_id = self.text_tokenizer.convert_token_to_id( - '[SEP]') - self.token_pad_id = self.text_tokenizer.convert_token_to_id( - '[PAD]') - self.text_tokenizer._token_cls = "[CLS]" - self.text_tokenizer._token_sep = "[SEP]" - - # except KeyError: - # self._command_tokens = [ - # CommandToken( - # 'pad', 
'[PAD]', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'cls', '[CLS]', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'MASK', '[MASK]', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'unk', '[UNK]', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'sep', '[SEP]', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'eos', '[PAD]', - # self.text_tokenizer.convert_token_to_id('')), - # ] - # self.token_start_id = self.text_tokenizer.convert_token_to_id( - # '') - # self.token_end_id = self.text_tokenizer.convert_token_to_id( - # '') - # self.token_pad_id = self.text_tokenizer.convert_token_to_id( - # '') - # self.text_tokenizer._token_cls = "" - # self.text_tokenizer._token_sep = "" - if add_block_symbols: - self.add_command_token('sop', '<|startofpiece|>') - self.add_command_token('eop', '<|endofpiece|>',) - if add_task_mask: - self.add_command_token('gMASK', '[gMASK]') - self.add_command_token('sMASK', '[sMASK]') - if add_decoder_mask: - self.add_command_token('dBLOCK', '[dBLOCK]') - if add_sentinel_token > 0: - for i in range(1, add_sentinel_token): - self.add_command_token(f'MASK{i}', f'[MASK{i}]') - self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>') - elif self.tokenizer_class == "bpe": - if self.tokenizer_model_name.lower().startswith('roberta'): - self.num_command_tokens = 6 - self.num_text_tokens = self.num_tokens - 3 - self._command_tokens = [ - CommandToken( - 'pad', '<|endoftext|>', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'eos', '<|endoftext|>', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'sep', '[SEP]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'cls', '[CLS]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'MASK', - '[MASK]', - self.text_tokenizer.convert_token_to_id(''), - lstrip=True), - CommandToken( - 'unk', '[UNK]', - 
self.text_tokenizer.convert_token_to_id('')) - ] - if add_block_symbols: - self._command_tokens.extend([ - CommandToken('sop', '<|startofpiece|>', - self.num_tokens), - CommandToken('eop', '<|endofpiece|>', - self.num_tokens + 1) - ]) - self.num_tokens += 2 - self.num_command_tokens += 2 - self.token_end_id = self.text_tokenizer.convert_token_to_id( - '') - elif self.tokenizer_model_name.lower().startswith('clip'): - self.num_command_tokens = 2 - self._command_tokens = [ - CommandToken( - 'sot', '', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'eot', '', - self.text_tokenizer.convert_token_to_id('')), - ] - self.num_tokens += self.num_command_tokens - self.token_end_id = self.text_tokenizer.convert_token_to_id( - '') - elif self.tokenizer_model_name.lower().startswith('opt'): + try: + with open(self.special_tokens_map, encoding='utf8') as file: dct=json.load(file) + sp_tokens = [(k.replace("_token",""),v['content']) for k,v in dct.items()] + except FileNotFoundError: + dct = None + sp_tokens = [] + self._command_tokens = [CommandToken(e[0], e[1], self.text_tokenizer.convert_token_to_id(e[1])) for e in sp_tokens] + + if self.tokenizer_model_name.lower().startswith("glm"): + if self.tokenizer_class == "wp": + self.text_tokenizer._token_cls = "[CLS]" + self.text_tokenizer._token_sep = "[SEP]" + fix_command_token = False + elif self.tokenizer_class == "sp": + self._command_tokens = [ - CommandToken( - 'pad', '<|endoftext|>', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'eos', '<|endoftext|>', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'sep', '[SEP]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'cls', '[CLS]', - self.text_tokenizer.convert_token_to_id('')), - CommandToken( - 'MASK', - '[MASK]', - self.text_tokenizer.convert_token_to_id(''), - lstrip=True), - CommandToken( - 'unk', '[UNK]', - self.text_tokenizer.convert_token_to_id('')) + CommandToken('pad', '<|endoftext|>', 
self.num_tokens), + CommandToken('eos', '<|endoftext|>', self.num_tokens), + CommandToken('sep', '[SEP]', self.num_tokens + 1), + CommandToken('cls', '[CLS]', self.num_tokens + 2), + CommandToken('mask', '[MASK]', self.num_tokens + 3, lstrip=True), + CommandToken('unk', '[UNK]', self.num_tokens + 4) ] - else: - self.num_command_tokens = 2 - self.num_text_tokens = self.num_tokens - 1 + self.num_tokens += 6 + elif self.tokenizer_class == "bpe": self._command_tokens = [ - CommandToken( - 'pad', '<|endoftext|>', - self.text_tokenizer.convert_token_to_id( - '<|endoftext|>')), - CommandToken( - 'eos', '<|endoftext|>', - self.text_tokenizer.convert_token_to_id( - '<|endoftext|>')) + CommandToken('pad', '<|endoftext|>', + self.text_tokenizer.encoder['<|endoftext|>']), + CommandToken('eos', '<|endoftext|>', + self.text_tokenizer.encoder['<|endoftext|>']) ] - self.token_end_id = self.text_tokenizer.convert_token_to_id( - '<|endoftext|>') - if add_block_symbols: - if self.tokenizer_model_name.lower().startswith('glm'): - unk_token_id = self.num_tokens + 5 - cls_token_id = self.num_tokens + 2 - num_tokens_to_add = 5 - else: - unk_token_id = self.text_tokenizer.convert_token_to_id( - '<|endoftext|>') - cls_token_id = self.text_tokenizer.convert_token_to_id( - '<|endoftext|>') - num_tokens_to_add = 4 - self._command_tokens.extend([ - CommandToken('sop', '<|startofpiece|>', - self.num_tokens), - CommandToken('eop', '<|endofpiece|>', - self.num_tokens + 1), - CommandToken('cls', '[CLS]', cls_token_id), - CommandToken('MASK', - '[MASK]', - self.num_tokens + 3, - lstrip=True), - CommandToken('sep', '[SEP]', self.num_tokens + 4), - CommandToken('unk', '[UNK]', unk_token_id) - ]) - self.num_tokens += num_tokens_to_add - self.num_command_tokens += 6 - - if add_block_symbols: - if add_task_mask: - self._command_tokens.extend([ - CommandToken('gMASK', - '[gMASK]', - self.num_tokens, - lstrip=True), - CommandToken('sMASK', - '[sMASK]', - self.num_tokens + 1, - lstrip=True) - ]) - 
self.num_tokens += 2 - self.num_command_tokens += 2 - if add_decoder_mask: - self._command_tokens.extend( - [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) - self.num_tokens += 1 - self.num_command_tokens += 1 - - elif self.tokenizer_class == "sp": - self.num_command_tokens = 0 - self.num_text_tokens = self.text_tokenizer.vocab_size - self.num_tokens = self.num_text_tokens - import pdb; - if self.tokenizer_model_name.lower().startswith('glm'): - pad_token_id = self.num_tokens - eos_token_id = self.num_tokens - unk_token_id = self.num_tokens + 4 - else: - pad_token_id = self.text_tokenizer.convert_token_to_id('') - eos_token_id = self.text_tokenizer.convert_token_to_id('') - unk_token_id = self.text_tokenizer.convert_token_to_id('') - self._command_tokens = [ - CommandToken('pad', '<|endoftext|>', self.num_text_tokens), - CommandToken('eos', '<|endoftext|>', self.num_text_tokens), - CommandToken('sep', '[SEP]', self.num_text_tokens + 1), - CommandToken('cls', '[CLS]', self.num_text_tokens + 2), - CommandToken('MASK', - '[MASK]', - self.num_text_tokens + 3, - lstrip=True), - CommandToken('unk', '[UNK]', self.num_text_tokens + 4) - ] - - self.num_tokens += 5 - self.num_command_tokens += 6 - self.token_end_id = self.text_tokenizer.convert_token_to_id( - '') - if add_block_symbols: - sop_id = self.text_tokenizer.convert_token_to_id('<|startofpiece|>') - eop_id = self.text_tokenizer.convert_token_to_id('<|endofpiece|>') self._command_tokens.extend([ - CommandToken('sop', '<|startofpiece|>', - self.num_tokens + 1), - CommandToken('eop', '<|endofpiece|>', self.num_tokens + 2) + CommandToken('sop', '<|startofpiece|>', self.num_tokens), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1), + CommandToken('cls', '[CLS]', self.num_tokens + 2), + CommandToken('mask', + '[MASK]', + self.num_tokens + 3, + lstrip=True), + CommandToken('sep', '[SEP]', self.num_tokens + 4), + CommandToken('unk', '[UNK]', self.num_tokens + 5) ]) + self.num_tokens += 6 + # elif 
self.tokenizer_model_name.lower().startswith("glm-large-ch"): + # self._command_tokens = [ + # CommandToken('pad', '<|endoftext|>', self.num_tokens), + # CommandToken('eos', '<|endoftext|>', self.num_tokens), + # CommandToken('sep', '[SEP]', self.num_tokens + 1), + # CommandToken('cls', '[CLS]', self.num_tokens + 2), + # CommandToken('mask', '[MASK]', self.num_tokens + 3, lstrip=True), + # CommandToken('unk', '[UNK]', self.num_tokens + 4) + # ] + # self.num_tokens += 5 + if add_block_symbols: + # sop_id = self.text_tokenizer.convert_token_to_id('<|startofpiece|>') + # eop_id = self.text_tokenizer.convert_token_to_id('<|endofpiece|>') + if not self.tokenizer_class == "bpe": + self.add_command_token('sop', '<|startofpiece|>') + self.add_command_token('eop', '<|endofpiece|>') + if add_task_mask: if fix_command_token: - self.num_tokens += 3 + self.add_command_token('sMASK', '[sMASK]') + self.add_command_token('gMASK', '[gMASK]') else: - self.num_tokens += 2 - self.num_command_tokens += 2 - if add_task_mask: - if fix_command_token: - self._command_tokens.extend([ - CommandToken('sMASK', - '[sMASK]', - self.num_tokens, - lstrip=True), - CommandToken('gMASK', - '[gMASK]', - self.num_tokens + 1, - lstrip=True) - ]) - else: - self._command_tokens.extend([ - CommandToken('gMASK', - '[gMASK]', - self.num_tokens, - lstrip=True), - CommandToken('sMASK', - '[sMASK]', - self.num_tokens + 1, - lstrip=True) - ]) - self.num_tokens += 2 - self.num_command_tokens += 2 - if add_decoder_mask: - self._command_tokens.extend( - [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) - self.num_tokens += 1 - self.num_command_tokens += 1 + self.add_command_token('gMASK', '[gMASK]') + self.add_command_token('sMASK', '[sMASK]') + if add_decoder_mask: + self.add_command_token('dBLOCK', '[dBLOCK]') + if add_sentinel_token > 0: + for i in range(1, add_sentinel_token): + self.add_command_token(f'MASK{i}', f'[MASK{i}]') + self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>') + # 
self._command_tokens.extend([ + # CommandToken('sop', '<|startofpiece|>', + # self.num_tokens + 1), + # CommandToken('eop', '<|endofpiece|>', self.num_tokens + 2) + # ]) + # if fix_command_token: + # self.num_tokens += 3 + # else: + # self.num_tokens += 2 + # if add_task_mask: + # if fix_command_token: + # self._command_tokens.extend([ + # CommandToken('sMASK', + # '[sMASK]', + # self.num_tokens, + # lstrip=True), + # CommandToken('gMASK', + # '[gMASK]', + # self.num_tokens + 1, + # lstrip=True) + # ]) + # else: + # self._command_tokens.extend([ + # CommandToken('gMASK', + # '[gMASK]', + # self.num_tokens, + # lstrip=True), + # CommandToken('sMASK', + # '[sMASK]', + # self.num_tokens + 1, + # lstrip=True) + # ]) + # if add_decoder_mask: + # self._command_tokens.extend( + # [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) + # import pdb;pdb.set_trace() + # + # if self.tokenizer_class == "wp": + # # set command tokens from wordpiece tokenizer values + # self.num_command_tokens = 6 + # self.num_text_tokens = self.num_tokens - 5 + # self.num_type_tokens = 2 + # self.token_start_id = None + # self.token_end_id = None + # self.token_pad_id = None + # # try: + # self._command_tokens = [ + # CommandToken( + # 'pad', '[PAD]', + # self.text_tokenizer.convert_token_to_id('[PAD]')), + # CommandToken( + # 'cls', '[CLS]', + # self.text_tokenizer.convert_token_to_id('[CLS]')), + # CommandToken( + # 'MASK', '[MASK]', + # self.text_tokenizer.convert_token_to_id('[MASK]')), + # CommandToken( + # 'unk', '[UNK]', + # self.text_tokenizer.convert_token_to_id('[UNK]')), + # CommandToken( + # 'sep', '[SEP]', + # self.text_tokenizer.convert_token_to_id('[SEP]')), + # CommandToken( + # 'eos', '[PAD]', + # self.text_tokenizer.convert_token_to_id('[PAD]')), + # ] + # self.token_start_id = self.text_tokenizer.convert_token_to_id( + # '[CLS]') + # self.token_end_id = self.text_tokenizer.convert_token_to_id( + # '[SEP]') + # self.token_pad_id = self.text_tokenizer.convert_token_to_id( + # 
'[PAD]') + # self.text_tokenizer._token_cls = "[CLS]" + # self.text_tokenizer._token_sep = "[SEP]" + + # # except KeyError: + # # self._command_tokens = [ + # # CommandToken( + # # 'pad', '[PAD]', + # # self.text_tokenizer.convert_token_to_id('')), + # # CommandToken( + # # 'cls', '[CLS]', + # # self.text_tokenizer.convert_token_to_id('')), + # # CommandToken( + # # 'MASK', '[MASK]', + # # self.text_tokenizer.convert_token_to_id('')), + # # CommandToken( + # # 'unk', '[UNK]', + # # self.text_tokenizer.convert_token_to_id('')), + # # CommandToken( + # # 'sep', '[SEP]', + # # self.text_tokenizer.convert_token_to_id('')), + # # CommandToken( + # # 'eos', '[PAD]', + # # self.text_tokenizer.convert_token_to_id('')), + # # ] + # # self.token_start_id = self.text_tokenizer.convert_token_to_id( + # # '') + # # self.token_end_id = self.text_tokenizer.convert_token_to_id( + # # '') + # # self.token_pad_id = self.text_tokenizer.convert_token_to_id( + # # '') + # # self.text_tokenizer._token_cls = "" + # # self.text_tokenizer._token_sep = "" + # if add_block_symbols: + # self.add_command_token('sop', '<|startofpiece|>') + # self.add_command_token('eop', '<|endofpiece|>',) + # if add_task_mask: + # self.add_command_token('gMASK', '[gMASK]') + # self.add_command_token('sMASK', '[sMASK]') + # if add_decoder_mask: + # self.add_command_token('dBLOCK', '[dBLOCK]') + # if add_sentinel_token > 0: + # for i in range(1, add_sentinel_token): + # self.add_command_token(f'MASK{i}', f'[MASK{i}]') + # self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>') + # elif self.tokenizer_class == "bpe": + # if self.tokenizer_model_name.lower().startswith('roberta'): + # self.num_command_tokens = 6 + # self.num_text_tokens = self.num_tokens - 3 + # self._command_tokens = [ + # CommandToken( + # 'pad', '<|endoftext|>', + # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'eos', '<|endoftext|>', + # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'sep', '[SEP]', 
+ # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'cls', '[CLS]', + # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'MASK', + # '[MASK]', + # self.text_tokenizer.convert_token_to_id(''), + # lstrip=True), + # CommandToken( + # 'unk', '[UNK]', + # self.text_tokenizer.convert_token_to_id('')) + # ] + # if add_block_symbols: + # self._command_tokens.extend([ + # CommandToken('sop', '<|startofpiece|>', + # self.num_tokens), + # CommandToken('eop', '<|endofpiece|>', + # self.num_tokens + 1) + # ]) + # self.num_tokens += 2 + # self.num_command_tokens += 2 + # self.token_end_id = self.text_tokenizer.convert_token_to_id( + # '') + # elif self.tokenizer_model_name.lower().startswith('clip'): + # self.num_command_tokens = 2 + # self._command_tokens = [ + # CommandToken( + # 'sot', '', + # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'eot', '', + # self.text_tokenizer.convert_token_to_id('')), + # ] + # self.num_tokens += self.num_command_tokens + # self.token_end_id = self.text_tokenizer.convert_token_to_id( + # '') + # elif self.tokenizer_model_name.lower().startswith('opt'): + # self._command_tokens = [ + # CommandToken( + # 'pad', '<|endoftext|>', + # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'eos', '<|endoftext|>', + # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'sep', '[SEP]', + # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'cls', '[CLS]', + # self.text_tokenizer.convert_token_to_id('')), + # CommandToken( + # 'MASK', + # '[MASK]', + # self.text_tokenizer.convert_token_to_id(''), + # lstrip=True), + # CommandToken( + # 'unk', '[UNK]', + # self.text_tokenizer.convert_token_to_id('')) + # ] + # else: + # self.num_command_tokens = 2 + # self.num_text_tokens = self.num_tokens - 1 + # self._command_tokens = [ + # CommandToken( + # 'pad', '<|endoftext|>', + # self.text_tokenizer.convert_token_to_id( + # '<|endoftext|>')), + # 
CommandToken( + # 'eos', '<|endoftext|>', + # self.text_tokenizer.convert_token_to_id( + # '<|endoftext|>')) + # ] + # self.token_end_id = self.text_tokenizer.convert_token_to_id( + # '<|endoftext|>') + # if add_block_symbols: + # if self.tokenizer_model_name.lower().startswith('glm'): + # unk_token_id = self.num_tokens + 5 + # cls_token_id = self.num_tokens + 2 + # num_tokens_to_add = 5 + # else: + # unk_token_id = self.text_tokenizer.convert_token_to_id( + # '<|endoftext|>') + # cls_token_id = self.text_tokenizer.convert_token_to_id( + # '<|endoftext|>') + # num_tokens_to_add = 4 + # self._command_tokens.extend([ + # CommandToken('sop', '<|startofpiece|>', + # self.num_tokens), + # CommandToken('eop', '<|endofpiece|>', + # self.num_tokens + 1), + # CommandToken('cls', '[CLS]', cls_token_id), + # CommandToken('MASK', + # '[MASK]', + # self.num_tokens + 3, + # lstrip=True), + # CommandToken('sep', '[SEP]', self.num_tokens + 4), + # CommandToken('unk', '[UNK]', unk_token_id) + # ]) + # self.num_tokens += num_tokens_to_add + # self.num_command_tokens += 6 + + # if add_block_symbols: + # if add_task_mask: + # self._command_tokens.extend([ + # CommandToken('gMASK', + # '[gMASK]', + # self.num_tokens, + # lstrip=True), + # CommandToken('sMASK', + # '[sMASK]', + # self.num_tokens + 1, + # lstrip=True) + # ]) + # self.num_tokens += 2 + # self.num_command_tokens += 2 + # if add_decoder_mask: + # self._command_tokens.extend( + # [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) + # self.num_tokens += 1 + # self.num_command_tokens += 1 + + # elif self.tokenizer_class == "sp": + # import pdb;pdb.set_trace() + # self.num_command_tokens = 0 + # self.num_text_tokens = self.text_tokenizer.vocab_size + # self.num_tokens = self.num_text_tokens + # if self.tokenizer_model_name.lower().startswith('glm'): + # self._command_tokens = [ + # CommandToken('pad', '<|endoftext|>', self.num_text_tokens), + # CommandToken('eos', '<|endoftext|>', self.num_text_tokens), + # 
CommandToken('sep', '[SEP]', self.num_text_tokens + 1), + # CommandToken('cls', '[CLS]', self.num_text_tokens + 2), + # CommandToken('MASK', + # '[MASK]', + # self.num_text_tokens + 3, + # lstrip=True), + # CommandToken('unk', '[UNK]', self.num_text_tokens + 4) + # ] + # self.num_tokens += 5 + # self.num_command_tokens += 6 + # else: + # # try: + # # self.text_tokenizer.convert_token_to_id('') + # # self.text_tokenizer.convert_token_to_id('') + # # self.text_tokenizer.convert_token_to_id('') + # # self.text_tokenizer.convert_token_to_id('') + # # self.text_tokenizer.convert_token_to_id('') + # # self.text_tokenizer.convert_token_to_id('') + # # self.text_tokenizer.convert_token_to_id('') + # # self._command_tokens = [ + # # CommandToken('pad', '', self.text_tokenizer.convert_token_to_id('')), + # # CommandToken('eos', '', self.text_tokenizer.convert_token_to_id('')), + # # CommandToken('unk', '', self.text_tokenizer.convert_token_to_id('')) + # # ] + # self.num_tokens += 3 + # self.num_command_tokens += 3 + # self.token_end_id = self.text_tokenizer.convert_token_to_id( + # '') + # if add_block_symbols: + # sop_id = self.text_tokenizer.convert_token_to_id('<|startofpiece|>') + # eop_id = self.text_tokenizer.convert_token_to_id('<|endofpiece|>') + # self._command_tokens.extend([ + # CommandToken('sop', '<|startofpiece|>', + # self.num_tokens + 1), + # CommandToken('eop', '<|endofpiece|>', self.num_tokens + 2) + # ]) + # if fix_command_token: + # self.num_tokens += 3 + # else: + # self.num_tokens += 2 + # self.num_command_tokens += 2 + # if add_task_mask: + # if fix_command_token: + # self._command_tokens.extend([ + # CommandToken('sMASK', + # '[sMASK]', + # self.num_tokens, + # lstrip=True), + # CommandToken('gMASK', + # '[gMASK]', + # self.num_tokens + 1, + # lstrip=True) + # ]) + # else: + # self._command_tokens.extend([ + # CommandToken('gMASK', + # '[gMASK]', + # self.num_tokens, + # lstrip=True), + # CommandToken('sMASK', + # '[sMASK]', + # self.num_tokens + 1, 
+ # lstrip=True) + # ]) + # self.num_tokens += 2 + # self.num_command_tokens += 2 + # if add_decoder_mask: + # self._command_tokens.extend( + # [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) + # self.num_tokens += 1 + # self.num_command_tokens += 1 self.command_name_map = {tok.name: tok for tok in self._command_tokens} self.command_token_map = { tok.token: tok @@ -376,7 +495,19 @@ def __init__(self, } self.command_id_map = {tok.Id: tok for tok in self._command_tokens} self._command_token_tokens = list(self.command_token_map.keys()) - print("All special tokens: ", str([(v.name, k, v.Id) for k,v in self.command_token_map.items()])) + # import pdb;pdb.set_trace() + vocab = self.text_tokenizer.get_vocab() + self.token_start_id = vocab.get('', None) + if not self.token_start_id: + self.token_start_id = vocab.get('[CLS]', None) + + self.token_end_id = vocab.get('', None) + if not self.token_end_id: + self.token_end_id = vocab.get('<|endoftext|>', None) + if not self.token_end_id: + self.token_end_id = vocab.get('[SEP]', None) + # import pdb;pdb.set_trace() + print("All special tokens: ", str([(k, v.token, v.Id) for k,v in self.command_name_map.items()])) # logger.info("All special tokens: %s", str([(k,v.Id) for k,v in self.command_name_map.items()])) def get_vocab(self): @@ -387,13 +518,10 @@ def get_command_id(self, name): return self.command_name_map[name].Id def add_command_token(self, name, token): - try: - id = self.text_tokenizer.convert_token_to_id(token) - except KeyError: - id = self.num_tokens - self.num_tokens += 1 - self._command_tokens.append(CommandToken(name, token, id)) + self._command_tokens.append(CommandToken(name, token, self.num_tokens)) + self.num_tokens += 1 return + def rematch(self, text, tokens): """output the mapping relation between raw text and tokenizezd text """ @@ -594,7 +722,7 @@ def encode_plus_non_glm( truncation=True, max_length=None, ): - if self.tokenizer_model_name.startswith('t5'): + if 
self.tokenizer_model_name.lower().startswith('t5'): assert second_text is None, "t5 does not support multi-sentence encoding" def get_input_ids(text): tokens = self.text_tokenizer.tokenize(text) @@ -670,6 +798,8 @@ def encode_plus( # for Seq2seq "alm"): return self.encode_plus_non_glm(source_text, second_text, truncation, max_length) + elif self.tokenizer_model_name.lower().startswith("opt"): + return None sop_id = self.get_command_id('sop') # start of piece eop_id = self.get_command_id('eop') # end of piece sep_id = self.get_command_id('sep') # seperation @@ -757,11 +887,25 @@ def tokenize_as_tensor(self, texts): sot_token=sot_token, eot_token=eot_token) + def tokenize_t5(self, text, *arg, **kwargs): + split_tokens = [] + for token in self.pre_tokenizer(text): + if token in self.vocab: + split_tokens.append(token) + else: + split_tokens.extend(self.text_tokenizer.tokenize(token)) + return split_tokens + def tokenize(self, text, maxlen=None, add_spatial_tokens=False): """ add_spatial_token: (bool) Add cls at the front and sep at the end max_len: Truncate the length to max_len """ + if self.tokenizer_model_name.lower().startswith('t5'): + import jieba + self.pre_tokenizer = lambda x: jieba.cut(x, HMM=False) + return self.tokenize_t5(text) + tokens = self.text_tokenizer.tokenize(text) if add_spatial_tokens: diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py index 6d78c842..b3ebc49d 100644 --- a/flagai/model/predictor/utils.py +++ b/flagai/model/predictor/utils.py @@ -1436,7 +1436,7 @@ def glm_generate_sample( dtype=torch.long) position_ids = torch.stack((position_ids, block_position_ids), dim=0) position_ids = position_ids.unsqueeze(0) - mask_tokens = ['MASK', 'sMASK', 'gMASK'] + mask_tokens = ['mask', 'sMASK', 'gMASK'] mask_tokens = [tokenizer.get_command_id(token) for token in mask_tokens] end_tokens = [tokenizer.get_command_id('eop'), eod_token] mask_positions = [] diff --git a/flagai/test_utils.py b/flagai/test_utils.py index 
83dacde3..5faa0aec 100644 --- a/flagai/test_utils.py +++ b/flagai/test_utils.py @@ -14,7 +14,7 @@ def build_input_from_ids(text_a_ids=None, mask_id=None, masked_lm=False): if mask_id is None: - mask_id = tokenizer.get_command_id('MASK') + mask_id = tokenizer.get_command_id('mask') eos_id = tokenizer.get_command_id('eos') cls_id = tokenizer.get_command_id('cls') sep_id = tokenizer.get_command_id('sep') diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 666bda4f..43dfdc0a 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -9,17 +9,21 @@ class TokenizerTestCase(unittest.TestCase): - # def test_tokenizer_GLM_large_ch(self): - # tokenizer = Tokenizer.from_pretrained("GLM-large-ch") - # self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') - # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - # [3378, 1567, 2613, 20282], 'EncodeAsIds Error') - # self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), - # '今天吃饭吃了肯德基', 'DecodeIds Error') - # self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], - # [('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), ('cls', '[CLS]', 50002), - # ('MASK', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), - # ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)]) + def test_tokenizer_GLM_large_ch(self): + tokenizer = Tokenizer.from_pretrained("GLM-large-ch") + self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') + self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + [3378, 1567, 2613, 20282], 'EncodeAsIds Error') + self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), + '今天吃饭吃了肯德基', 'DecodeIds Error') + self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + ['▁今天', '吃饭', '吃了', '肯德基'], 'tokenize Error') + self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + [50006, 3378, 1567, 2613, 20282, 50001], 'encode_plus Error') + 
self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + [('pad', '<|endoftext|>', 50000), ('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), + ('cls', '[CLS]', 50002), ('mask', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), + ('eop', '<|endofpiece|>', 50007), ('gMASK', '[gMASK]', 50007), ('sMASK', '[sMASK]', 50008)], 'SpecialTokens error') # def test_tokenizer_GLM_large_en(self): # tokenizer = Tokenizer.from_pretrained("GLM-large-en") @@ -28,30 +32,33 @@ class TokenizerTestCase(unittest.TestCase): # [13017, 7975, 3084, 2033, 3407], '') # self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), # 'fried chicken makes me happy', 'DecodeIds Error') - # self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], - # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), ('unk', '[UNK]', 100), - # ('sep', '[SEP]', 102), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), + # ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) - # def test_tokenizer_glm_10b_en(self): - # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") - # self.assertEqual(tokenizer.TokenToId("day"), 820, '') - # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - # [25520, 9015, 1838, 502, 3772], '') - # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), - # 'fried chicken makes me happy', 'DecodeIds Error') + # # def test_tokenizer_glm_10b_en(self): + # # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") + # # self.assertEqual(tokenizer.TokenToId("day"), 820, '') + # # self.assertEqual(tokenizer.EncodeAsIds("fried 
chicken makes me happy"), + # # [25520, 9015, 1838, 502, 3772], '') + # # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), + # # 'fried chicken makes me happy', 'DecodeIds Error') - def test_tokenizer_t5(self): - tokenizer = Tokenizer.from_pretrained('t5-base-en') - self.assertEqual(tokenizer.TokenToId("day"), 1135, '') - self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - [3, 7704, 3832, 656, 140, 1095], '') - self.assertEqual(tokenizer.DecodeIds([3, 7704, 3832, 656, 140, 1095]), - 'fried chicken makes me happy', 'DecodeIds Error') - self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], - [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), - ('unk', '[UNK]', 100), ('sep', '[SEP]', 102)]) - + # def test_tokenizer_t5(self): + # tokenizer = Tokenizer.from_pretrained('T5-base-ch') + # # import pdb;pdb.set_trace() + # self.assertEqual(tokenizer.TokenToId("人"), 297, '') + # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + # [306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166], '') + # self.assertEqual(tokenizer.DecodeIds([306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + # encode_plus_result = tokenizer.encode_plus("今天吃饭吃了肯德基") + # self.assertEqual(list(encode_plus_result.keys()), + # ['input_ids', 'token_type_ids'], 'encode_plus Error') + # self.assertEqual(encode_plus_result['input_ids'], + # [101, 306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166, 102], 'encode_plus Error') + # def test_tokenizer_roberta(self): # tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') # # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) @@ -60,9 +67,13 @@ def test_tokenizer_t5(self): # [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') # self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), # '今天吃饭吃了肯德基', 'DecodeIds Error') - - # self.assertEqual([(v.name, k,v.Id) for k,v in 
tokenizer.command_token_map.items()], - # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), ('unk', '[UNK]', 100), ('sep', '[SEP]', 102)]) + # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + # ['今', '天', '吃', '饭', '吃', '了', '肯', '德', '基'], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + # [101, 791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825, 102], 'encode_plus Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), + # ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)], 'SpecialTokens error') # def test_tokenizer_bert(self): # tokenizer = Tokenizer.from_pretrained('BERT-base-en') @@ -71,8 +82,13 @@ def test_tokenizer_t5(self): # [13017, 7975, 3084, 2033, 3407], '') # self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), # 'fried chicken makes me happy', 'DecodeIds Error') - # self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], - # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103), ('unk', '[UNK]', 100), ('sep', '[SEP]', 102)]) + # self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), + # ['fried', 'chicken', 'makes', 'me', 'happy'], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], + # [101, 13017, 7975, 3084, 2033, 3407, 102], 'encode_plus Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), + # ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)], 'SpecialTokens error') # def test_tokenizer_cpm1(self): # loader = AutoLoader(task_name="lm", @@ -85,39 +101,49 @@ def test_tokenizer_t5(self): # [2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239], '') # self.assertEqual(tokenizer.decode([2487, 
27385, 9291, 9412, 3531, 14588, 289, 4406, 25239]), # 'fried chicken makes me happy', 'DecodeIds Error') + # self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), + # ['▁f', 'ried', '▁ch', 'ick', 'en', '▁make', 's', '▁me', '▁happy'], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], + # [1, 2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239, 2], 'encode_plus Error') # self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], - # [('eos', '<|endoftext|>', 30000), ('sep', '[SEP]', 30001), ('cls', '[CLS]', 30002), ('MASK', '[MASK]', 30003), ('unk', '[UNK]', 30004)]) + # [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), ('mask', '', 6), ('eod', '', 7), ('eop', '', 0)]) # def test_tokenizer_opt(self): - # tokenizer = Tokenizer.from_pretrained('opt-125m-en') + # tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') # self.assertEqual(tokenizer.encode("day"), [1208], '') # self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], - # [50260, 21209, 5884, 817, 162, 1372, 50260], '') + # [0, 21209, 5884, 817, 162, 1372, 2], '') # self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), # 'fried chicken makes me happy', 'DecodeIds Error') + # self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), + # ['fried', 'Ġchicken', 'Ġmakes', 'Ġme', 'Ġhappy'], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], + # [0, 21209, 5884, 817, 162, 1372, 2], 'encode_plus Error') # self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], - # [('eos', '<|endoftext|>', 50260)]) + # [('cls', '', 0), ('pad', '', 1), ('eos', '', 2), ('unk', '', 3)]) # def test_tokenizer_clip(self): # loader = AutoLoader(task_name="txt_img_matching", - # model_name="clip-base-p32-224") + # model_name="clip-base-p32-224", + # only_download_config=True) # tokenizer = 
loader.get_tokenizer() # self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') # def test_tokenizer_evaclip(self): # loader = AutoLoader(task_name="txt_img_matching", - # model_name="eva-clip") + # model_name="eva-clip", + # only_download_config=True) # tokenizer = loader.get_tokenizer() # self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') def suite(): suite = unittest.TestSuite() - # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) + suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) # suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) - suite.addTest(TokenizerTestCase('test_tokenizer_t5')) + # suite.addTest(TokenizerTestCase('test_tokenizer_t5')) # suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) # suite.addTest(TokenizerTestCase('test_tokenizer_bert')) # suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) From 62dbded9e255dd1e62524e53c9f7174871be9b54 Mon Sep 17 00:00:00 2001 From: Anhforth Date: Thu, 23 Feb 2023 14:52:06 +0800 Subject: [PATCH 05/54] optimize tokenizer Signed-off-by: Anhforth --- .../bert_title_generation_english/train.py | 1 - .../glm_blank_filling/glm_generate_samples.py | 20 ++++++++-------- .../glm_generate_samples_en.py | 2 +- examples/gpt2_text_writting/generate.py | 2 ++ .../data/tokenizer/uni_tokenizer/tokenizer.py | 2 +- flagai/model/predictor/gpt.py | 2 +- flagai/model/predictor/utils.py | 3 ++- tests/test_tokenizer.py | 24 +++++++++---------- 8 files changed, 29 insertions(+), 27 deletions(-) diff --git a/examples/bert_title_generation_english/train.py b/examples/bert_title_generation_english/train.py index a7f3423e..f22c6609 100644 --- a/examples/bert_title_generation_english/train.py +++ b/examples/bert_title_generation_english/train.py @@ -1,7 +1,6 @@ # Copyright © 2022 BAAI. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License") -import sys import os import torch from torch.utils.data import Dataset diff --git a/examples/glm_blank_filling/glm_generate_samples.py b/examples/glm_blank_filling/glm_generate_samples.py index 61b66290..700407a3 100644 --- a/examples/glm_blank_filling/glm_generate_samples.py +++ b/examples/glm_blank_filling/glm_generate_samples.py @@ -23,20 +23,20 @@ predictor = Predictor(model, tokenizer) - text = [ - '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]" - ] - for t in text: - output = predictor.predict_generate_randomsample( - t, top_k=50, repetition_penalty=4.0, top_p=1.0) - print(t, '\n', output) - - # text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[mask]大都市。", "天津大学是[mask]现代大学。"] + # text = [ + # '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]" + # ] # for t in text: # output = predictor.predict_generate_randomsample( # t, top_k=50, repetition_penalty=4.0, top_p=1.0) # print(t, '\n', output) - # + + text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"] + for t in text: + output = predictor.predict_generate_randomsample( + t, top_k=50, repetition_penalty=4.0, top_p=1.0) + print(t, '\n', output) + # text = [ # "人工智能是一个以计算机科学为基础,由计算机、数学、哲学等多学科交叉融合的交叉学科,[sMASK],具有非常巨大的前景。", # "最近十多年来,人工神经网络的研究工作不断深入,已经取得了很大的进展,[sMASK],表现出了良好的智能特性。" diff --git a/examples/glm_blank_filling/glm_generate_samples_en.py b/examples/glm_blank_filling/glm_generate_samples_en.py index 96347ec8..f4633c50 100644 --- a/examples/glm_blank_filling/glm_generate_samples_en.py +++ b/examples/glm_blank_filling/glm_generate_samples_en.py @@ -14,7 +14,7 @@ print('Generate Samples') loader = AutoLoader(task_name='lm', - model_name='GLM-large-en-generation', + model_name='GLM-large-en', only_download_config=False) model = loader.get_model() tokenizer = loader.get_tokenizer() diff --git a/examples/gpt2_text_writting/generate.py b/examples/gpt2_text_writting/generate.py index 
42edd9b6..70352c90 100644 --- a/examples/gpt2_text_writting/generate.py +++ b/examples/gpt2_text_writting/generate.py @@ -1,6 +1,8 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index 54326b36..557df627 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -99,7 +99,7 @@ def __init__(self, self.text_tokenizer._token_sep = "[SEP]" fix_command_token = False elif self.tokenizer_class == "sp": - + fix_command_token = True self._command_tokens = [ CommandToken('pad', '<|endoftext|>', self.num_tokens), CommandToken('eos', '<|endoftext|>', self.num_tokens), diff --git a/flagai/model/predictor/gpt.py b/flagai/model/predictor/gpt.py index d4027c32..d8497a20 100644 --- a/flagai/model/predictor/gpt.py +++ b/flagai/model/predictor/gpt.py @@ -7,7 +7,7 @@ def gpt_random_sample_use_cache(model, tokenizer, text, input_max_length, out_ma top_k, top_p, repetition_penalty, temperature, device): tokenizer_out = tokenizer.encode_plus(text, max_length=input_max_length) token_ids = tokenizer_out["input_ids"] - token_end_id = tokenizer.get_command_id('sep') + token_end_id = tokenizer.token_end_id if token_ids[-1] == token_end_id: token_ids = token_ids[:-1] diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py index b3ebc49d..0b4f92e8 100644 --- a/flagai/model/predictor/utils.py +++ b/flagai/model/predictor/utils.py @@ -1419,7 +1419,7 @@ def glm_generate_sample( context_length = context_length_tensor[0].item() context_tokens_tensor = torch.LongTensor(context_tokens) text = tokenizer.DecodeIds(context_tokens_tensor.tolist()) - + start_time = 
time.time() mems = [] tokens = context_tokens_tensor @@ -1446,6 +1446,7 @@ def glm_generate_sample( mask_positions.sort() output_ = model(tokens, position_ids, attention_mask, return_memory=True) mems = output_['hidden_states'] + import pdb;pdb.set_trace() for mask_position in mask_positions: position = mask_position tokens, mems = glm_sample_sequence(model, diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 43dfdc0a..49b6e95b 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -23,19 +23,19 @@ def test_tokenizer_GLM_large_ch(self): self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], [('pad', '<|endoftext|>', 50000), ('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), ('cls', '[CLS]', 50002), ('mask', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), - ('eop', '<|endofpiece|>', 50007), ('gMASK', '[gMASK]', 50007), ('sMASK', '[sMASK]', 50008)], 'SpecialTokens error') + ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)], 'SpecialTokens error') - # def test_tokenizer_GLM_large_en(self): - # tokenizer = Tokenizer.from_pretrained("GLM-large-en") - # self.assertEqual(tokenizer.TokenToId("day"), 2154, '') - # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - # [13017, 7975, 3084, 2033, 3407], '') - # self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), - # 'fried chicken makes me happy', 'DecodeIds Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), - # ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), - # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) + def test_tokenizer_GLM_large_en(self): + tokenizer = Tokenizer.from_pretrained("GLM-large-en") + 
self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + [13017, 7975, 3084, 2033, 3407], '') + self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + 'fried chicken makes me happy', 'DecodeIds Error') + self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), + ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), + ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) # # def test_tokenizer_glm_10b_en(self): # # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") From d09f1d5e5ad640aed0acc378470e7496bff84b3f Mon Sep 17 00:00:00 2001 From: Anhforth Date: Mon, 27 Feb 2023 10:07:35 +0800 Subject: [PATCH 06/54] updated Signed-off-by: Anhforth --- examples/AltCLIP/altclip_inference.py | 4 +- .../data/tokenizer/uni_tokenizer/tokenizer.py | 348 +----------------- tests/test_tokenizer.py | 80 ++-- 3 files changed, 56 insertions(+), 376 deletions(-) diff --git a/examples/AltCLIP/altclip_inference.py b/examples/AltCLIP/altclip_inference.py index 6ba53e41..0e62947f 100644 --- a/examples/AltCLIP/altclip_inference.py +++ b/examples/AltCLIP/altclip_inference.py @@ -1,5 +1,7 @@ import torch from PIL import Image +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.auto_model.auto_loader import AutoLoader device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -19,7 +21,7 @@ tokenizer = loader.get_tokenizer() def inference(): - image = Image.open("./dog.jpeg") + image = Image.open("./examples/AltCLIP/dog.jpeg") image = transform(image) image = torch.tensor(image["pixel_values"]).to(device) tokenizer_out = tokenizer(["a rat", "a dog", "a cat"], diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index 
557df627..7ea53166 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -57,7 +57,7 @@ def __init__(self, super().__init__(**kwargs) if self.tokenizer_class == "wp": if self.tokenizer_model_name.lower().startswith('clip-cn'): - self.text_tokenizer = FullTokenizer(self.vocab_file) + self.text_tokenizer = FullTokenizer(self.vocab_file) else: self.text_tokenizer = WordpieceTokenizer(self.vocab_file, is_ch=self.tokenizer_model_name.lower().endswith("ch")) @@ -128,19 +128,7 @@ def __init__(self, CommandToken('unk', '[UNK]', self.num_tokens + 5) ]) self.num_tokens += 6 - # elif self.tokenizer_model_name.lower().startswith("glm-large-ch"): - # self._command_tokens = [ - # CommandToken('pad', '<|endoftext|>', self.num_tokens), - # CommandToken('eos', '<|endoftext|>', self.num_tokens), - # CommandToken('sep', '[SEP]', self.num_tokens + 1), - # CommandToken('cls', '[CLS]', self.num_tokens + 2), - # CommandToken('mask', '[MASK]', self.num_tokens + 3, lstrip=True), - # CommandToken('unk', '[UNK]', self.num_tokens + 4) - # ] - # self.num_tokens += 5 if add_block_symbols: - # sop_id = self.text_tokenizer.convert_token_to_id('<|startofpiece|>') - # eop_id = self.text_tokenizer.convert_token_to_id('<|endofpiece|>') if not self.tokenizer_class == "bpe": self.add_command_token('sop', '<|startofpiece|>') self.add_command_token('eop', '<|endofpiece|>') @@ -157,337 +145,6 @@ def __init__(self, for i in range(1, add_sentinel_token): self.add_command_token(f'MASK{i}', f'[MASK{i}]') self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>') - # self._command_tokens.extend([ - # CommandToken('sop', '<|startofpiece|>', - # self.num_tokens + 1), - # CommandToken('eop', '<|endofpiece|>', self.num_tokens + 2) - # ]) - # if fix_command_token: - # self.num_tokens += 3 - # else: - # self.num_tokens += 2 - # if add_task_mask: - # if fix_command_token: - # self._command_tokens.extend([ - # CommandToken('sMASK', - # '[sMASK]', - # 
self.num_tokens, - # lstrip=True), - # CommandToken('gMASK', - # '[gMASK]', - # self.num_tokens + 1, - # lstrip=True) - # ]) - # else: - # self._command_tokens.extend([ - # CommandToken('gMASK', - # '[gMASK]', - # self.num_tokens, - # lstrip=True), - # CommandToken('sMASK', - # '[sMASK]', - # self.num_tokens + 1, - # lstrip=True) - # ]) - # if add_decoder_mask: - # self._command_tokens.extend( - # [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) - # import pdb;pdb.set_trace() - # - # if self.tokenizer_class == "wp": - # # set command tokens from wordpiece tokenizer values - # self.num_command_tokens = 6 - # self.num_text_tokens = self.num_tokens - 5 - # self.num_type_tokens = 2 - # self.token_start_id = None - # self.token_end_id = None - # self.token_pad_id = None - # # try: - # self._command_tokens = [ - # CommandToken( - # 'pad', '[PAD]', - # self.text_tokenizer.convert_token_to_id('[PAD]')), - # CommandToken( - # 'cls', '[CLS]', - # self.text_tokenizer.convert_token_to_id('[CLS]')), - # CommandToken( - # 'MASK', '[MASK]', - # self.text_tokenizer.convert_token_to_id('[MASK]')), - # CommandToken( - # 'unk', '[UNK]', - # self.text_tokenizer.convert_token_to_id('[UNK]')), - # CommandToken( - # 'sep', '[SEP]', - # self.text_tokenizer.convert_token_to_id('[SEP]')), - # CommandToken( - # 'eos', '[PAD]', - # self.text_tokenizer.convert_token_to_id('[PAD]')), - # ] - # self.token_start_id = self.text_tokenizer.convert_token_to_id( - # '[CLS]') - # self.token_end_id = self.text_tokenizer.convert_token_to_id( - # '[SEP]') - # self.token_pad_id = self.text_tokenizer.convert_token_to_id( - # '[PAD]') - # self.text_tokenizer._token_cls = "[CLS]" - # self.text_tokenizer._token_sep = "[SEP]" - - # # except KeyError: - # # self._command_tokens = [ - # # CommandToken( - # # 'pad', '[PAD]', - # # self.text_tokenizer.convert_token_to_id('')), - # # CommandToken( - # # 'cls', '[CLS]', - # # self.text_tokenizer.convert_token_to_id('')), - # # CommandToken( - # # 'MASK', 
'[MASK]', - # # self.text_tokenizer.convert_token_to_id('')), - # # CommandToken( - # # 'unk', '[UNK]', - # # self.text_tokenizer.convert_token_to_id('')), - # # CommandToken( - # # 'sep', '[SEP]', - # # self.text_tokenizer.convert_token_to_id('')), - # # CommandToken( - # # 'eos', '[PAD]', - # # self.text_tokenizer.convert_token_to_id('')), - # # ] - # # self.token_start_id = self.text_tokenizer.convert_token_to_id( - # # '') - # # self.token_end_id = self.text_tokenizer.convert_token_to_id( - # # '') - # # self.token_pad_id = self.text_tokenizer.convert_token_to_id( - # # '') - # # self.text_tokenizer._token_cls = "" - # # self.text_tokenizer._token_sep = "" - # if add_block_symbols: - # self.add_command_token('sop', '<|startofpiece|>') - # self.add_command_token('eop', '<|endofpiece|>',) - # if add_task_mask: - # self.add_command_token('gMASK', '[gMASK]') - # self.add_command_token('sMASK', '[sMASK]') - # if add_decoder_mask: - # self.add_command_token('dBLOCK', '[dBLOCK]') - # if add_sentinel_token > 0: - # for i in range(1, add_sentinel_token): - # self.add_command_token(f'MASK{i}', f'[MASK{i}]') - # self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>') - # elif self.tokenizer_class == "bpe": - # if self.tokenizer_model_name.lower().startswith('roberta'): - # self.num_command_tokens = 6 - # self.num_text_tokens = self.num_tokens - 3 - # self._command_tokens = [ - # CommandToken( - # 'pad', '<|endoftext|>', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'eos', '<|endoftext|>', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'sep', '[SEP]', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'cls', '[CLS]', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'MASK', - # '[MASK]', - # self.text_tokenizer.convert_token_to_id(''), - # lstrip=True), - # CommandToken( - # 'unk', '[UNK]', - # self.text_tokenizer.convert_token_to_id('')) - # ] - # if add_block_symbols: - # 
self._command_tokens.extend([ - # CommandToken('sop', '<|startofpiece|>', - # self.num_tokens), - # CommandToken('eop', '<|endofpiece|>', - # self.num_tokens + 1) - # ]) - # self.num_tokens += 2 - # self.num_command_tokens += 2 - # self.token_end_id = self.text_tokenizer.convert_token_to_id( - # '') - # elif self.tokenizer_model_name.lower().startswith('clip'): - # self.num_command_tokens = 2 - # self._command_tokens = [ - # CommandToken( - # 'sot', '', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'eot', '', - # self.text_tokenizer.convert_token_to_id('')), - # ] - # self.num_tokens += self.num_command_tokens - # self.token_end_id = self.text_tokenizer.convert_token_to_id( - # '') - # elif self.tokenizer_model_name.lower().startswith('opt'): - # self._command_tokens = [ - # CommandToken( - # 'pad', '<|endoftext|>', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'eos', '<|endoftext|>', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'sep', '[SEP]', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'cls', '[CLS]', - # self.text_tokenizer.convert_token_to_id('')), - # CommandToken( - # 'MASK', - # '[MASK]', - # self.text_tokenizer.convert_token_to_id(''), - # lstrip=True), - # CommandToken( - # 'unk', '[UNK]', - # self.text_tokenizer.convert_token_to_id('')) - # ] - # else: - # self.num_command_tokens = 2 - # self.num_text_tokens = self.num_tokens - 1 - # self._command_tokens = [ - # CommandToken( - # 'pad', '<|endoftext|>', - # self.text_tokenizer.convert_token_to_id( - # '<|endoftext|>')), - # CommandToken( - # 'eos', '<|endoftext|>', - # self.text_tokenizer.convert_token_to_id( - # '<|endoftext|>')) - # ] - # self.token_end_id = self.text_tokenizer.convert_token_to_id( - # '<|endoftext|>') - # if add_block_symbols: - # if self.tokenizer_model_name.lower().startswith('glm'): - # unk_token_id = self.num_tokens + 5 - # cls_token_id = self.num_tokens + 2 - # 
num_tokens_to_add = 5 - # else: - # unk_token_id = self.text_tokenizer.convert_token_to_id( - # '<|endoftext|>') - # cls_token_id = self.text_tokenizer.convert_token_to_id( - # '<|endoftext|>') - # num_tokens_to_add = 4 - # self._command_tokens.extend([ - # CommandToken('sop', '<|startofpiece|>', - # self.num_tokens), - # CommandToken('eop', '<|endofpiece|>', - # self.num_tokens + 1), - # CommandToken('cls', '[CLS]', cls_token_id), - # CommandToken('MASK', - # '[MASK]', - # self.num_tokens + 3, - # lstrip=True), - # CommandToken('sep', '[SEP]', self.num_tokens + 4), - # CommandToken('unk', '[UNK]', unk_token_id) - # ]) - # self.num_tokens += num_tokens_to_add - # self.num_command_tokens += 6 - - # if add_block_symbols: - # if add_task_mask: - # self._command_tokens.extend([ - # CommandToken('gMASK', - # '[gMASK]', - # self.num_tokens, - # lstrip=True), - # CommandToken('sMASK', - # '[sMASK]', - # self.num_tokens + 1, - # lstrip=True) - # ]) - # self.num_tokens += 2 - # self.num_command_tokens += 2 - # if add_decoder_mask: - # self._command_tokens.extend( - # [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) - # self.num_tokens += 1 - # self.num_command_tokens += 1 - - # elif self.tokenizer_class == "sp": - # import pdb;pdb.set_trace() - # self.num_command_tokens = 0 - # self.num_text_tokens = self.text_tokenizer.vocab_size - # self.num_tokens = self.num_text_tokens - # if self.tokenizer_model_name.lower().startswith('glm'): - # self._command_tokens = [ - # CommandToken('pad', '<|endoftext|>', self.num_text_tokens), - # CommandToken('eos', '<|endoftext|>', self.num_text_tokens), - # CommandToken('sep', '[SEP]', self.num_text_tokens + 1), - # CommandToken('cls', '[CLS]', self.num_text_tokens + 2), - # CommandToken('MASK', - # '[MASK]', - # self.num_text_tokens + 3, - # lstrip=True), - # CommandToken('unk', '[UNK]', self.num_text_tokens + 4) - # ] - # self.num_tokens += 5 - # self.num_command_tokens += 6 - # else: - # # try: - # # 
self.text_tokenizer.convert_token_to_id('') - # # self.text_tokenizer.convert_token_to_id('') - # # self.text_tokenizer.convert_token_to_id('') - # # self.text_tokenizer.convert_token_to_id('') - # # self.text_tokenizer.convert_token_to_id('') - # # self.text_tokenizer.convert_token_to_id('') - # # self.text_tokenizer.convert_token_to_id('') - # # self._command_tokens = [ - # # CommandToken('pad', '', self.text_tokenizer.convert_token_to_id('')), - # # CommandToken('eos', '', self.text_tokenizer.convert_token_to_id('')), - # # CommandToken('unk', '', self.text_tokenizer.convert_token_to_id('')) - # # ] - # self.num_tokens += 3 - # self.num_command_tokens += 3 - # self.token_end_id = self.text_tokenizer.convert_token_to_id( - # '') - # if add_block_symbols: - # sop_id = self.text_tokenizer.convert_token_to_id('<|startofpiece|>') - # eop_id = self.text_tokenizer.convert_token_to_id('<|endofpiece|>') - # self._command_tokens.extend([ - # CommandToken('sop', '<|startofpiece|>', - # self.num_tokens + 1), - # CommandToken('eop', '<|endofpiece|>', self.num_tokens + 2) - # ]) - # if fix_command_token: - # self.num_tokens += 3 - # else: - # self.num_tokens += 2 - # self.num_command_tokens += 2 - # if add_task_mask: - # if fix_command_token: - # self._command_tokens.extend([ - # CommandToken('sMASK', - # '[sMASK]', - # self.num_tokens, - # lstrip=True), - # CommandToken('gMASK', - # '[gMASK]', - # self.num_tokens + 1, - # lstrip=True) - # ]) - # else: - # self._command_tokens.extend([ - # CommandToken('gMASK', - # '[gMASK]', - # self.num_tokens, - # lstrip=True), - # CommandToken('sMASK', - # '[sMASK]', - # self.num_tokens + 1, - # lstrip=True) - # ]) - # self.num_tokens += 2 - # self.num_command_tokens += 2 - # if add_decoder_mask: - # self._command_tokens.extend( - # [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) - # self.num_tokens += 1 - # self.num_command_tokens += 1 self.command_name_map = {tok.name: tok for tok in self._command_tokens} self.command_token_map = 
{ tok.token: tok @@ -509,7 +166,7 @@ def __init__(self, # import pdb;pdb.set_trace() print("All special tokens: ", str([(k, v.token, v.Id) for k,v in self.command_name_map.items()])) # logger.info("All special tokens: %s", str([(k,v.Id) for k,v in self.command_name_map.items()])) - + def get_vocab(self): return self.text_tokenizer.get_vocab() @@ -569,6 +226,7 @@ def _encode(self, text): return ids def convert_tokens_to_ids(self, tokens): + import pdb;pdb.set_trace() res = [] for token in tokens: if token in self.command_token_map: diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 49b6e95b..40d276ae 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -6,36 +6,37 @@ import unittest from flagai.data.tokenizer import Tokenizer from flagai.auto_model.auto_loader import AutoLoader +from flagai.data.tokenizer import Tokenizer class TokenizerTestCase(unittest.TestCase): - def test_tokenizer_GLM_large_ch(self): - tokenizer = Tokenizer.from_pretrained("GLM-large-ch") - self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') - self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - [3378, 1567, 2613, 20282], 'EncodeAsIds Error') - self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), - '今天吃饭吃了肯德基', 'DecodeIds Error') - self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), - ['▁今天', '吃饭', '吃了', '肯德基'], 'tokenize Error') - self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], - [50006, 3378, 1567, 2613, 20282, 50001], 'encode_plus Error') - self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - [('pad', '<|endoftext|>', 50000), ('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), - ('cls', '[CLS]', 50002), ('mask', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), - ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)], 'SpecialTokens error') + # def test_tokenizer_GLM_large_ch(self): + # tokenizer = 
Tokenizer.from_pretrained("GLM-large-ch") + # self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') + # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + # [3378, 1567, 2613, 20282], 'EncodeAsIds Error') + # self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + # ['▁今天', '吃饭', '吃了', '肯德基'], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + # [50006, 3378, 1567, 2613, 20282, 50001], 'encode_plus Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('pad', '<|endoftext|>', 50000), ('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), + # ('cls', '[CLS]', 50002), ('mask', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), + # ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)], 'SpecialTokens error') - def test_tokenizer_GLM_large_en(self): - tokenizer = Tokenizer.from_pretrained("GLM-large-en") - self.assertEqual(tokenizer.TokenToId("day"), 2154, '') - self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - [13017, 7975, 3084, 2033, 3407], '') - self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), - 'fried chicken makes me happy', 'DecodeIds Error') - self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), - ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), - ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) + # def test_tokenizer_GLM_large_en(self): + # tokenizer = Tokenizer.from_pretrained("GLM-large-en") + # self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + # [13017, 7975, 
3084, 2033, 3407], '') + # self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + # 'fried chicken makes me happy', 'DecodeIds Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), + # ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), + # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) # # def test_tokenizer_glm_10b_en(self): # # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") @@ -105,9 +106,27 @@ def test_tokenizer_GLM_large_en(self): # ['▁f', 'ried', '▁ch', 'ick', 'en', '▁make', 's', '▁me', '▁happy'], 'tokenize Error') # self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], # [1, 2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239, 2], 'encode_plus Error') - # self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], # [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), ('mask', '', 6), ('eod', '', 7), ('eop', '', 0)]) + def test_tokenizer_cpm2_large(self): + loader = AutoLoader(task_name="lm", + model_name="CPM2-Xlarge-ch", + model_dir="./checkpoints/", + only_download_config=True) + tokenizer = Tokenizer.from_pretrained("CPM2-Xlarge-ch") + self.assertEqual(tokenizer.TokenToId("人"), 38, '') + self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + [1540, 243, 225, 1511, 225, 21, 3041, 467, 995], '') + self.assertEqual(tokenizer.DecodeIds([1540, 243, 225, 1511, 225, 21, 3041, 467, 995]), + '今天吃饭吃了肯德基', 'DecodeIds Error') + self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + ['今', '天', '吃', '饭', '吃', '了', '肯', '德', '基'], 'tokenize Error') + self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + [1, 1540, 243, 225, 1511, 225, 21, 3041, 467, 995, 2], 'encode_plus 
Error') + self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), ('mask', '', 6), ('eod', '', 7)]) + # def test_tokenizer_opt(self): # tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') # self.assertEqual(tokenizer.encode("day"), [1208], '') @@ -119,8 +138,8 @@ def test_tokenizer_GLM_large_en(self): # ['fried', 'Ġchicken', 'Ġmakes', 'Ġme', 'Ġhappy'], 'tokenize Error') # self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], # [0, 21209, 5884, 817, 162, 1372, 2], 'encode_plus Error') - # self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()], - # [('cls', '', 0), ('pad', '', 1), ('eos', '', 2), ('unk', '', 3)]) + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('cls', '', 0), ('pad', '', 1), ('bos', '', 2), ('eos', '', 2), ('unk', '', 3)], 'SpecialTokens error') # def test_tokenizer_clip(self): @@ -140,13 +159,14 @@ def test_tokenizer_GLM_large_en(self): def suite(): suite = unittest.TestSuite() - suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) + # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) # suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) # suite.addTest(TokenizerTestCase('test_tokenizer_t5')) # suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) # suite.addTest(TokenizerTestCase('test_tokenizer_bert')) # suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) + suite.addTest(TokenizerTestCase('test_tokenizer_cpm2_large')) # suite.addTest(TokenizerTestCase('test_tokenizer_opt')) # suite.addTest(TokenizerTestCase('test_tokenizer_clip')) # suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) From c830a02987fcd3d2286c44a283f8cb69ba4b5929 Mon Sep 17 00:00:00 2001 From: ldwang Date: Tue, 28 Feb 2023 15:20:29 +0800 Subject: [PATCH 07/54] Update 
setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index be63eb7d..f4a690c5 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flagai", - version="v1.5.1", + version="v1.6.0", description="FlagAI aims to help researchers and developers to freely train and test large-scale models for NLP/CV/VL tasks.", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", From 02ded3bbd99d3a50f9c1e387fab50e190b6e9053 Mon Sep 17 00:00:00 2001 From: xuan Date: Tue, 28 Feb 2023 15:43:52 +0800 Subject: [PATCH 08/54] Create __init__.py --- flagai/data/tokenizer/galactica/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 flagai/data/tokenizer/galactica/__init__.py diff --git a/flagai/data/tokenizer/galactica/__init__.py b/flagai/data/tokenizer/galactica/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/flagai/data/tokenizer/galactica/__init__.py @@ -0,0 +1 @@ + From 21a20902a400121068729bd4590837d84a4e10dd Mon Sep 17 00:00:00 2001 From: ftgreat Date: Tue, 28 Feb 2023 17:18:18 +0800 Subject: [PATCH 09/54] bminf added Signed-off-by: ftgreat --- examples/cpm_1/generate_bminf.py | 32 ++++++++++++ examples/gpt2_text_writting/generate_bminf.py | 37 +++++++++++++ examples/gpt2_title_generation/deepspeed.json | 2 +- .../gpt2_title_generation/train_multi_gpu.py | 7 +-- .../data/tokenizer/uni_tokenizer/tokenizer.py | 6 +-- flagai/model/gpt2_model.py | 17 +++--- flagai/mp_tools.py | 5 +- tests/test_tokenizer.py | 52 ++++++++++++------- 8 files changed, 125 insertions(+), 33 deletions(-) create mode 100644 examples/cpm_1/generate_bminf.py create mode 100644 examples/gpt2_text_writting/generate_bminf.py diff --git a/examples/cpm_1/generate_bminf.py b/examples/cpm_1/generate_bminf.py new file mode 100644 index 00000000..7ff390cc --- /dev/null +++ b/examples/cpm_1/generate_bminf.py @@ -0,0 +1,32 @@ +import sys 
+sys.path.append("/home/yanzhaodong/anhforth/FlagAI") +import torch +from flagai.auto_model.auto_loader import AutoLoader +from flagai.model.predictor.predictor import Predictor +import bminf + +if __name__ == '__main__': + + text = '''默写古诗: + 白日依山尽,黄河入海流。 + 床前明月光,''' + + loader = AutoLoader(task_name="lm", + model_name="CPM-large-ch", + model_dir="./checkpoints", + device="cpu") + + model = loader.get_model() + # with torch.cuda.device(0): + # model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30) + tokenizer = loader.get_tokenizer() + + predictor = Predictor(model=model, + tokenizer=tokenizer, + ) + + out = predictor.predict_generate_randomsample(text, + top_p=0.9, + out_max_length=50) + + print(out) diff --git a/examples/gpt2_text_writting/generate_bminf.py b/examples/gpt2_text_writting/generate_bminf.py new file mode 100644 index 00000000..2656d12a --- /dev/null +++ b/examples/gpt2_text_writting/generate_bminf.py @@ -0,0 +1,37 @@ +# Copyright © 2022 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") +import torch +from flagai.auto_model.auto_loader import AutoLoader +from flagai.model.predictor.predictor import Predictor +import bminf +import time + +if __name__ == '__main__': + + loader = AutoLoader("seq2seq", + "GPT2-base-ch", + model_dir="./checkpoints/") + model = loader.get_model() + model = model.to('cpu') + tokenizer = loader.get_tokenizer() + time_start=time.time() + with torch.cuda.device(0): + model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30) + predictor = Predictor(model, tokenizer) + + text = "今天天气不错" + + out_2 = predictor.predict_generate_randomsample(text, + input_max_length=512, + out_max_length=100, + repetition_penalty=1.5, + top_k=20, + top_p=0.8) + + time_end=time.time() + print('time cost',time_end-time_start,'s') + # print(f"out_1 is {out_1}") + print(f"out_2 is {out_2}") diff --git a/examples/gpt2_title_generation/deepspeed.json b/examples/gpt2_title_generation/deepspeed.json index c18e41e6..36810822 100644 --- a/examples/gpt2_title_generation/deepspeed.json +++ b/examples/gpt2_title_generation/deepspeed.json @@ -4,7 +4,7 @@ "steps_per_print": 50, "gradient_clipping": 1.0, "zero_optimization": { - "stage": 1, + "stage": 2, "contiguous_gradients": false, "overlap_comm": true, "reduce_scatter": true, diff --git a/examples/gpt2_title_generation/train_multi_gpu.py b/examples/gpt2_title_generation/train_multi_gpu.py index 7ad121db..ffa0a969 100644 --- a/examples/gpt2_title_generation/train_multi_gpu.py +++ b/examples/gpt2_title_generation/train_multi_gpu.py @@ -2,6 +2,7 @@ # # Licensed under the Apache License, Version 2.0 (the "License") import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import os import torch from torch.utils.data import Dataset @@ -12,7 +13,7 @@ # device = torch.device("cpu") # single gpu trainer = Trainer( - env_type="pytorchDDP", + 
env_type="deepspeed+mpu", experiment_name="roberta_seq2seq", batch_size=1, gradient_accumulation_steps=1, @@ -31,7 +32,7 @@ num_nodes=1, num_gpus=2, checkpoint_activations=False, - model_parallel_size=1, + model_parallel_size=2, hostfile='./hostfile', deepspeed_config='./deepspeed.json', training_script=__file__, @@ -39,7 +40,7 @@ cur_dir = os.path.dirname(os.path.abspath(__file__)) src_dir = cur_dir + '/data/train.src' tgt_dir = cur_dir + '/data/train.tgt' -model_dir = "./state_dict/" +model_dir = "./checkpoints/" os.makedirs(model_dir, exist_ok=True) maxlen = 256 diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index 7ea53166..f8786741 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -59,8 +59,8 @@ def __init__(self, if self.tokenizer_model_name.lower().startswith('clip-cn'): self.text_tokenizer = FullTokenizer(self.vocab_file) else: - self.text_tokenizer = WordpieceTokenizer(self.vocab_file, - is_ch=self.tokenizer_model_name.lower().endswith("ch")) + self.text_tokenizer = WordpieceTokenizer(self.vocab_file, is_ch=False) + # is_ch=self.tokenizer_model_name.lower().endswith("ch")) elif self.tokenizer_class == "bpe": if self.tokenizer_model_name.lower().startswith('clip'): self.text_tokenizer = MMBPETokenizer(self.vocab_file, @@ -91,6 +91,7 @@ def __init__(self, except FileNotFoundError: dct = None sp_tokens = [] + self._command_tokens = [CommandToken(e[0], e[1], self.text_tokenizer.convert_token_to_id(e[1])) for e in sp_tokens] if self.tokenizer_model_name.lower().startswith("glm"): @@ -166,7 +167,6 @@ def __init__(self, # import pdb;pdb.set_trace() print("All special tokens: ", str([(k, v.token, v.Id) for k,v in self.command_name_map.items()])) # logger.info("All special tokens: %s", str([(k,v.Id) for k,v in self.command_name_map.items()])) - def get_vocab(self): return self.text_tokenizer.get_vocab() diff --git 
a/flagai/model/gpt2_model.py b/flagai/model/gpt2_model.py index 2e90cc1f..c65cb68b 100644 --- a/flagai/model/gpt2_model.py +++ b/flagai/model/gpt2_model.py @@ -9,7 +9,7 @@ from flagai.model.utils import normal_init_method from flagai.model.base_model import BaseModel import torch.nn.functional as F - +import bminf if os.getenv('ENV_TYPE') == 'deepspeed+mpu': from flagai.mpu.utils import divide from flagai.mpu.random import checkpoint @@ -94,7 +94,7 @@ def __getitem__(self, index): def __setitem__(self, key, value): if hasattr(self, key): setattr(self, key, value) - +import bminf class GPT2Stack(nn.Module): def __init__(self, config): @@ -112,10 +112,15 @@ def __init__(self, config): self.drop = nn.Dropout(config.embd_pdrop) self.project_in = None self.project_out = None - self.h = nn.ModuleList([ + self.h = bminf.TransformerBlockList([ GPT2Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer) - ]) + ],[0]) + # self.h = nn.ModuleList([ + # GPT2Block(config.n_ctx, config, scale=True) + # for _ in range(config.n_layer) + # ]) + self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.device_map = None @@ -274,9 +279,9 @@ def __init__(self, config, **kwargs): self.parallel_output = True self.transformer = GPT2Stack(config_gpt) - self.lm_head = nn.Linear(config_gpt.n_embd, + self.lm_head = bminf.QuantizedLinear(nn.Linear(config_gpt.n_embd, config_gpt.vocab_size, - bias=False) + bias=False)) def _make_causal_mask(self, input_ids): device = input_ids.device diff --git a/flagai/mp_tools.py b/flagai/mp_tools.py index b4e9a108..774159c5 100644 --- a/flagai/mp_tools.py +++ b/flagai/mp_tools.py @@ -7,7 +7,7 @@ import copy from_1_to_n_models = { - "gpt": { + "gpt2": { "wte.weight": 0, "attn.c_attn.weight": 30, "attn.c_attn.bias": 30, @@ -238,7 +238,8 @@ def change_pytorch_model_mp_from_1_to_n_new(model_name_brief, checkpoint: str, t d = d["module"] for k, v in d.items(): - assert len(v.shape) < 3 + if len(v.shape)>2: + continue flag = 0 for 
keys in trans_keys: if keys in k: diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 40d276ae..5c612b06 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -59,7 +59,21 @@ class TokenizerTestCase(unittest.TestCase): # ['input_ids', 'token_type_ids'], 'encode_plus Error') # self.assertEqual(encode_plus_result['input_ids'], # [101, 306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166, 102], 'encode_plus Error') - + + # def test_tokenizer_t5(self): + # tokenizer = Tokenizer.from_pretrained('T5-base-ch') + # # import pdb;pdb.set_trace() + # self.assertEqual(tokenizer.TokenToId("人"), 297, '') + # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + # [306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166], '') + # self.assertEqual(tokenizer.DecodeIds([306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + # encode_plus_result = tokenizer.encode_plus("今天吃饭吃了肯德基") + # self.assertEqual(list(encode_plus_result.keys()), + # ['input_ids', 'token_type_ids'], 'encode_plus Error') + # self.assertEqual(encode_plus_result['input_ids'], + # [101, 306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166, 102], 'encode_plus Error') + # def test_tokenizer_roberta(self): # tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') # # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) @@ -109,23 +123,24 @@ class TokenizerTestCase(unittest.TestCase): # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], # [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), ('mask', '', 6), ('eod', '', 7), ('eop', '', 0)]) - def test_tokenizer_cpm2_large(self): - loader = AutoLoader(task_name="lm", - model_name="CPM2-Xlarge-ch", - model_dir="./checkpoints/", - only_download_config=True) - tokenizer = Tokenizer.from_pretrained("CPM2-Xlarge-ch") - self.assertEqual(tokenizer.TokenToId("人"), 38, '') - self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - [1540, 243, 225, 1511, 
225, 21, 3041, 467, 995], '') - self.assertEqual(tokenizer.DecodeIds([1540, 243, 225, 1511, 225, 21, 3041, 467, 995]), - '今天吃饭吃了肯德基', 'DecodeIds Error') - self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), - ['今', '天', '吃', '饭', '吃', '了', '肯', '德', '基'], 'tokenize Error') - self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], - [1, 1540, 243, 225, 1511, 225, 21, 3041, 467, 995, 2], 'encode_plus Error') + def test_tokenizer_gpt(self): + # loader = AutoLoader(task_name="lm", + # model_name="GPT2-base-ch", + # model_dir="./checkpoints/", + # only_download_config=True) + tokenizer = Tokenizer.from_pretrained("GPT2-base-ch") + import pdb;pdb.set_trace() + self.assertEqual(tokenizer.encode("day"), [8, 8275], '') + self.assertEqual(tokenizer.encode("fried chicken makes me happy"), + [2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239], '') + self.assertEqual(tokenizer.decode([2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239]), + 'fried chicken makes me happy', 'DecodeIds Error') + self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), + ['▁f', 'ried', '▁ch', 'ick', 'en', '▁make', 's', '▁me', '▁happy'], 'tokenize Error') + self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], + [1, 2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239, 2], 'encode_plus Error') self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), ('mask', '', 6), ('eod', '', 7)]) + [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), ('mask', '', 6), ('eod', '', 7), ('eop', '', 0)]) # def test_tokenizer_opt(self): # tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') @@ -166,7 +181,8 @@ def suite(): # suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) # suite.addTest(TokenizerTestCase('test_tokenizer_bert')) # suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) - 
suite.addTest(TokenizerTestCase('test_tokenizer_cpm2_large')) + # suite.addTest(TokenizerTestCase('test_tokenizer_cpm2_large')) + suite.addTest(TokenizerTestCase('test_tokenizer_gpt')) # suite.addTest(TokenizerTestCase('test_tokenizer_opt')) # suite.addTest(TokenizerTestCase('test_tokenizer_clip')) # suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) From bcdf9b1a86f7e0395f61fed1a3f7235d65839e36 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Wed, 1 Mar 2023 14:18:03 +0800 Subject: [PATCH 10/54] updated Signed-off-by: ftgreat --- examples/galactica/generate_galactica_1.3b.py | 48 ++++++++++++------- .../data/tokenizer/uni_tokenizer/tokenizer.py | 3 ++ flagai/model/galactica_model.py | 1 + flagai/model/gpt2_model.py | 10 ++-- 4 files changed, 42 insertions(+), 20 deletions(-) diff --git a/examples/galactica/generate_galactica_1.3b.py b/examples/galactica/generate_galactica_1.3b.py index fe404336..b2997423 100644 --- a/examples/galactica/generate_galactica_1.3b.py +++ b/examples/galactica/generate_galactica_1.3b.py @@ -1,27 +1,41 @@ +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.model.predictor.predictor import Predictor from flagai.auto_model.auto_loader import AutoLoader import torch -device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu") +import bminf +import time +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -loader = AutoLoader(task_name="lm", - model_name="galactica-1.3b-en", - model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/") -model = loader.get_model() -model.to(device) -model.eval() +from flagai.data.tokenizer import Tokenizer +tokenizer = Tokenizer.from_pretrained("galactica-1.3b-en",cache_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/") -tokenizer = loader.get_tokenizer() -predictor = Predictor(model, tokenizer) +# loader = AutoLoader(task_name="lm", +# model_name="galactica-1.3b-en", +# model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/") + +# model = 
loader.get_model() +# model.to(device) +# model.eval() +# time_start=time.time() +# # with torch.cuda.device(0): +# # model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30) +# tokenizer = loader.get_tokenizer() + +# predictor = Predictor(model, tokenizer) + +# text = "Please write a abstract about the computer vision. \n" +# out = predictor.predict_generate_randomsample(text, +# out_max_length=700, +# top_k=50, +# repetition_penalty=1.2, +# temperature=0.7 +# ) +# time_end=time.time() +# print('time cost',time_end-time_start,'s') +# print(out) -text = "Please write a abstract about the computer vision. \n" -out = predictor.predict_generate_randomsample(text, - out_max_length=700, - top_k=50, - repetition_penalty=1.2, - temperature=0.7 - ) -print(out) diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index f8786741..6d1d1bdb 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -167,6 +167,9 @@ def __init__(self, # import pdb;pdb.set_trace() print("All special tokens: ", str([(k, v.token, v.Id) for k,v in self.command_name_map.items()])) # logger.info("All special tokens: %s", str([(k,v.Id) for k,v in self.command_name_map.items()])) + import pdb;pbb.set_trace() + + def get_vocab(self): return self.text_tokenizer.get_vocab() diff --git a/flagai/model/galactica_model.py b/flagai/model/galactica_model.py index 98e1f9cb..f94e62f4 100644 --- a/flagai/model/galactica_model.py +++ b/flagai/model/galactica_model.py @@ -22,6 +22,7 @@ from torch.nn import CrossEntropyLoss from flagai.model.layers.activations import ACT2FN from flagai.model.gpt2_model import GPT2Model, GPT2Stack, GPT2Config +import bminf class OPTLearnedPositionalEmbedding(nn.Embedding): diff --git a/flagai/model/gpt2_model.py b/flagai/model/gpt2_model.py index c65cb68b..a3ab1054 100644 --- a/flagai/model/gpt2_model.py +++ b/flagai/model/gpt2_model.py @@ -94,7 +94,7 
@@ def __getitem__(self, index): def __setitem__(self, key, value): if hasattr(self, key): setattr(self, key, value) -import bminf + class GPT2Stack(nn.Module): def __init__(self, config): @@ -279,9 +279,13 @@ def __init__(self, config, **kwargs): self.parallel_output = True self.transformer = GPT2Stack(config_gpt) - self.lm_head = bminf.QuantizedLinear(nn.Linear(config_gpt.n_embd, + # self.lm_head = bminf.QuantizedLinear(nn.Linear(config_gpt.n_embd, + # config_gpt.vocab_size, + # bias=False)) + self.lm_head = nn.Linear(config_gpt.n_embd, config_gpt.vocab_size, - bias=False)) + bias=False) + def _make_causal_mask(self, input_ids): device = input_ids.device From 08115308e911d3a2eba12e310317deaa9dfe4558 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 14:35:49 +0800 Subject: [PATCH 11/54] added bminf Signed-off-by: ftgreat --- examples/bminf_generate/README.md | 45 +++++++++++++++++ examples/bminf_generate/cpm1_generate.py | 37 ++++++++++++++ .../bminf_generate/galactica_6.7b_generate.py | 36 +++++++++++++ .../gpt2_generate.py} | 0 examples/cpm_1/generate_bminf.py | 9 +++- examples/galactica/generate_galactica_1.3b.py | 50 +++++++------------ examples/gpt2_text_writting/generate.py | 2 - flagai/data/tokenizer/tokenizer.py | 6 ++- .../data/tokenizer/uni_tokenizer/tokenizer.py | 6 --- 9 files changed, 147 insertions(+), 44 deletions(-) create mode 100644 examples/bminf_generate/README.md create mode 100644 examples/bminf_generate/cpm1_generate.py create mode 100644 examples/bminf_generate/galactica_6.7b_generate.py rename examples/{gpt2_text_writting/generate_bminf.py => bminf_generate/gpt2_generate.py} (100%) diff --git a/examples/bminf_generate/README.md b/examples/bminf_generate/README.md new file mode 100644 index 00000000..4240b9df --- /dev/null +++ b/examples/bminf_generate/README.md @@ -0,0 +1,45 @@ + +# BMInf + +## 简介/Overview + +BMInf is a low-resource inference package for large-scale pretrained language models. 
+ +BMInf supports running models with more than 10 billion parameters on a single NVIDIA GTX 1060 GPU as its minimum requirement. Running with better GPUs leads to better performance. Even when the GPU memory is large enough for large-model inference (such as on a V100 or A100), BMInf still has a significant performance improvement over the existing PyTorch implementation. + +BMInf GitHub repository: https://github.com/OpenBMB/BMInf + +BMInf (Big Model Inference) 是一个用于大规模预训练语言模型(pretrained language models, PLM)推理阶段的低资源工具包。 + +BMInf最低支持在NVIDIA GTX 1060单卡运行百亿大模型。在此基础上,使用更好的gpu运行会有更好的性能。在显存支持进行大模型推理的情况下(如V100或A100显卡),BMInf的实现较现有PyTorch版本仍有较大性能提升。 + +BMInf 仓库地址:https://github.com/OpenBMB/BMInf + +## 应用/Application + +在模型加载参数之后,使用如下代码来用BMInf转换模型 + +```Python +with torch.cuda.device(0): + model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30) +``` +The `quantization` parameter controls whether model quantization is used; for generative models it must be set to `False`. + +You can use the `memory_limit` parameter to set the maximum available memory, in bytes (for example, `20 << 30` is 20 GiB). + +`quantization`参数代表是否使用了模型量化的技巧,但如果是生成类模型,则需要设置成`False` + +可以用`memory_limit`参数设置最大的可用存储,单位为字节(例如 `20 << 30` 即 20 GiB) + +如果`bminf.wrapper`不能很好的适配你的模型,你可以用以下的方法来进行手动适配。 + +* 将 `torch.nn.ModuleList` 替换为 `bminf.TransformerBlockList`. +```python +module_list = bminf.TransformerBlockList([ +], [CUDA_DEVICE_INDEX]) +``` + +* 将 `torch.nn.Linear` 替换为 `bminf.QuantizedLinear`. 
+```python +linear = bminf.QuantizedLinear(torch.nn.Linear(...)) +``` \ No newline at end of file diff --git a/examples/bminf_generate/cpm1_generate.py b/examples/bminf_generate/cpm1_generate.py new file mode 100644 index 00000000..982eb45a --- /dev/null +++ b/examples/bminf_generate/cpm1_generate.py @@ -0,0 +1,37 @@ +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") +import torch +from flagai.auto_model.auto_loader import AutoLoader +from flagai.model.predictor.predictor import Predictor +import bminf +import time + + +if __name__ == '__main__': + + text = '''默写古诗: + 白日依山尽,黄河入海流。 + 床前明月光,''' + + loader = AutoLoader(task_name="lm", + model_name="CPM-large-ch", + model_dir="./checkpoints", + device="cpu") + + model = loader.get_model() + time_start=time.time() + with torch.cuda.device(0): + model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30) + tokenizer = loader.get_tokenizer() + + predictor = Predictor(model=model, + tokenizer=tokenizer, + ) + + out = predictor.predict_generate_randomsample(text, + top_p=0.9, + out_max_length=50) + time_end=time.time() + print('time cost',time_end-time_start,'s') + + print(out) diff --git a/examples/bminf_generate/galactica_6.7b_generate.py b/examples/bminf_generate/galactica_6.7b_generate.py new file mode 100644 index 00000000..964f319b --- /dev/null +++ b/examples/bminf_generate/galactica_6.7b_generate.py @@ -0,0 +1,36 @@ +from flagai.model.predictor.predictor import Predictor +from flagai.auto_model.auto_loader import AutoLoader +import torch +import bminf +import time +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + +loader = AutoLoader(task_name="lm", + model_name="galactica-6.7b-en", + model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/") + +model = loader.get_model() +with torch.cuda.device(0): + model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30) +model.to(device) +model.eval() +tokenizer = loader.get_tokenizer() +predictor = 
Predictor(model, tokenizer) +print("model loaded") +time_start=time.time() + +text = "Please write a abstract about the computer vision. \n" +out = predictor.predict_generate_randomsample(text, + out_max_length=700, + top_k=50, + repetition_penalty=1.2, + temperature=0.7 + ) + +time_end=time.time() +print('time cost',time_end-time_start,'s') +print(out) + + + diff --git a/examples/gpt2_text_writting/generate_bminf.py b/examples/bminf_generate/gpt2_generate.py similarity index 100% rename from examples/gpt2_text_writting/generate_bminf.py rename to examples/bminf_generate/gpt2_generate.py diff --git a/examples/cpm_1/generate_bminf.py b/examples/cpm_1/generate_bminf.py index 7ff390cc..982eb45a 100644 --- a/examples/cpm_1/generate_bminf.py +++ b/examples/cpm_1/generate_bminf.py @@ -4,6 +4,8 @@ from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor import bminf +import time + if __name__ == '__main__': @@ -17,8 +19,9 @@ device="cpu") model = loader.get_model() - # with torch.cuda.device(0): - # model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30) + time_start=time.time() + with torch.cuda.device(0): + model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30) tokenizer = loader.get_tokenizer() predictor = Predictor(model=model, @@ -28,5 +31,7 @@ out = predictor.predict_generate_randomsample(text, top_p=0.9, out_max_length=50) + time_end=time.time() + print('time cost',time_end-time_start,'s') print(out) diff --git a/examples/galactica/generate_galactica_1.3b.py b/examples/galactica/generate_galactica_1.3b.py index b2997423..91920f45 100644 --- a/examples/galactica/generate_galactica_1.3b.py +++ b/examples/galactica/generate_galactica_1.3b.py @@ -1,41 +1,25 @@ -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.model.predictor.predictor import Predictor from flagai.auto_model.auto_loader import AutoLoader import torch -import bminf -import time -device = 
torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu") +loader = AutoLoader(task_name="lm", + model_name="galactica-1.3b-en", + model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/") -from flagai.data.tokenizer import Tokenizer -tokenizer = Tokenizer.from_pretrained("galactica-1.3b-en",cache_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/") - - -# loader = AutoLoader(task_name="lm", -# model_name="galactica-1.3b-en", -# model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/") - -# model = loader.get_model() -# model.to(device) -# model.eval() -# time_start=time.time() -# # with torch.cuda.device(0): -# # model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30) -# tokenizer = loader.get_tokenizer() - -# predictor = Predictor(model, tokenizer) - -# text = "Please write a abstract about the computer vision. \n" -# out = predictor.predict_generate_randomsample(text, -# out_max_length=700, -# top_k=50, -# repetition_penalty=1.2, -# temperature=0.7 -# ) -# time_end=time.time() -# print('time cost',time_end-time_start,'s') -# print(out) +model = loader.get_model() +model.to(device) +model.eval() +tokenizer = loader.get_tokenizer() +predictor = Predictor(model, tokenizer) +text = "Please write a abstract about the computer vision. \n" +out = predictor.predict_generate_randomsample(text, + out_max_length=700, + top_k=50, + repetition_penalty=1.2, + temperature=0.7 + ) +print(out) \ No newline at end of file diff --git a/examples/gpt2_text_writting/generate.py b/examples/gpt2_text_writting/generate.py index 70352c90..42edd9b6 100644 --- a/examples/gpt2_text_writting/generate.py +++ b/examples/gpt2_text_writting/generate.py @@ -1,8 +1,6 @@ # Copyright © 2022 BAAI. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License") -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor diff --git a/flagai/data/tokenizer/tokenizer.py b/flagai/data/tokenizer/tokenizer.py index 3f82e7f5..c3ba085f 100644 --- a/flagai/data/tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/tokenizer.py @@ -457,8 +457,12 @@ def DecodeTokens(self, tokens): """A list of tokens => recovered text string""" return self.text_tokenizer.convert_tokens_to_string(tokens) + def convert_tokens_to_ids(self, tokens): + return self.text_tokenizer.convert_tokens_to_ids(tokens) + + def convert_ids_to_tokens(self, ids): + return self.text_tokenizer.convert_ids_to_tokens(ids) -# class BaseTokenizer(object): class TextTokenizer(object): """ diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index 6d1d1bdb..a1e66de0 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -153,7 +153,6 @@ def __init__(self, } self.command_id_map = {tok.Id: tok for tok in self._command_tokens} self._command_token_tokens = list(self.command_token_map.keys()) - # import pdb;pdb.set_trace() vocab = self.text_tokenizer.get_vocab() self.token_start_id = vocab.get('', None) if not self.token_start_id: @@ -164,10 +163,8 @@ def __init__(self, self.token_end_id = vocab.get('<|endoftext|>', None) if not self.token_end_id: self.token_end_id = vocab.get('[SEP]', None) - # import pdb;pdb.set_trace() print("All special tokens: ", str([(k, v.token, v.Id) for k,v in self.command_name_map.items()])) # logger.info("All special tokens: %s", str([(k,v.Id) for k,v in self.command_name_map.items()])) - import pdb;pbb.set_trace() def get_vocab(self): @@ -229,7 +226,6 @@ def _encode(self, text): return ids def convert_tokens_to_ids(self, tokens): - import 
pdb;pdb.set_trace() res = [] for token in tokens: if token in self.command_token_map: @@ -239,8 +235,6 @@ def convert_tokens_to_ids(self, tokens): return res def convert_ids_to_tokens(self, ids): - # if torch.is_tensor(ids): - # ids = ids.tolist() res = [] for id in ids: if id in self.command_id_map: From 66aa41aa77fd61f12dac4586bc206a7ad46e786b Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 15:01:02 +0800 Subject: [PATCH 12/54] fixed conflicts Signed-off-by: ftgreat --- examples/AltCLIP/altclip_inference.py | 4 +--- flagai/data/dataset/block/blocklm_utils.py | 2 +- flagai/data/dataset/data_collator/collate_fn.py | 6 +++--- flagai/data/dataset/data_utils.py | 4 ++-- flagai/data/dataset/superglue/pvp.py | 4 ++-- flagai/data/tokenizer/bert/bert_tokenizer.py | 4 ++-- flagai/data/tokenizer/bert/wordpiece.py | 2 +- flagai/data/tokenizer/galactica/galactica_tokenizer.py | 2 +- flagai/data/tokenizer/opt/opt_en_tokenizer.py | 4 ++-- flagai/data/tokenizer/roberta/roberta_tokenizer.py | 2 +- flagai/data/tokenizer/t5/t5_tokenizer.py | 2 +- flagai/data/tokenizer/uni_tokenizer/tokenizer.py | 4 ++-- flagai/model/predictor/utils.py | 6 ++---- flagai/test_utils.py | 2 +- 14 files changed, 22 insertions(+), 26 deletions(-) diff --git a/examples/AltCLIP/altclip_inference.py b/examples/AltCLIP/altclip_inference.py index 0e62947f..6ba53e41 100644 --- a/examples/AltCLIP/altclip_inference.py +++ b/examples/AltCLIP/altclip_inference.py @@ -1,7 +1,5 @@ import torch from PIL import Image -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.auto_model.auto_loader import AutoLoader device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -21,7 +19,7 @@ tokenizer = loader.get_tokenizer() def inference(): - image = Image.open("./examples/AltCLIP/dog.jpeg") + image = Image.open("./dog.jpeg") image = transform(image) image = torch.tensor(image["pixel_values"]).to(device) tokenizer_out = tokenizer(["a rat", "a dog", "a cat"], diff --git 
a/flagai/data/dataset/block/blocklm_utils.py b/flagai/data/dataset/block/blocklm_utils.py index 0103fcba..4687305f 100644 --- a/flagai/data/dataset/block/blocklm_utils.py +++ b/flagai/data/dataset/block/blocklm_utils.py @@ -205,7 +205,7 @@ def make_masked_data(self, # position_ids = np.arange(len(tokens), dtype=np.int64) targets = copy.deepcopy(tokens) - mask_id = self.tokenizer.get_command_id('mask') + mask_id = self.tokenizer.get_command_id('MASK') mlm_masks = np.zeros(len(tokens), dtype=np.int64) for start, end in block_spans: for idx in range(start, end): diff --git a/flagai/data/dataset/data_collator/collate_fn.py b/flagai/data/dataset/data_collator/collate_fn.py index d6783990..73b2f8e5 100644 --- a/flagai/data/dataset/data_collator/collate_fn.py +++ b/flagai/data/dataset/data_collator/collate_fn.py @@ -175,7 +175,7 @@ def sub_finder(mylist, pattern): source_tokens = [cls_id] + source_tokens + [mask_id ] + answer_tokens elif self.task_name in ["cmrc"]: - mask_id = self.tokenizer.get_command_id('mask') + mask_id = self.tokenizer.get_command_id('MASK') source_text = example.text_a target_text = example.meta["answer"].strip() question = example.meta["question"].strip() @@ -191,7 +191,7 @@ def sub_finder(mylist, pattern): mask_id ] + source_tokens[:max_src_length] elif self.task_name in ["wsc"]: - mask_id = self.tokenizer.get_command_id('mask') + mask_id = self.tokenizer.get_command_id('MASK') source_text = example.text_a target_text = example.meta["answer"].strip() question = example.meta["question"].strip() @@ -426,7 +426,7 @@ def make_masked_data(self, position_ids = np.arange(len(tokens), dtype=np.int64) targets = copy.deepcopy(tokens) - mask_id = self.tokenizer.get_command_id('mask') + mask_id = self.tokenizer.get_command_id('MASK') mlm_masks = np.zeros(len(tokens), dtype=np.int64) for start, end in block_spans: for idx in range(start, end): diff --git a/flagai/data/dataset/data_utils.py b/flagai/data/dataset/data_utils.py index 1efee372..4f0ee38d 100644 --- 
a/flagai/data/dataset/data_utils.py +++ b/flagai/data/dataset/data_utils.py @@ -134,7 +134,7 @@ def build_input_from_ids(text_a_ids, # Prepare ids for special tokens if mask_id is None: - mask_id = tokenizer.get_command_id('mask') + mask_id = tokenizer.get_command_id('MASK') eos_id = tokenizer.get_command_id('eos') # end of sentence token cls_id = tokenizer.get_command_id('cls') # start of sentence token sep_id = tokenizer.get_command_id('sep') # seperator of two texts token @@ -235,7 +235,7 @@ def build_input_from_ids(text_a_ids, # def build_decoder_input(enc_ids, answer_ids, max_seq_length, max_dec_seq_length, tokenizer): - mask_id = tokenizer.get_command_id('mask') + mask_id = tokenizer.get_command_id('MASK') eos_id = tokenizer.get_command_id('eos') sop_id = tokenizer.get_command_id('sop') masks = [] diff --git a/flagai/data/dataset/superglue/pvp.py b/flagai/data/dataset/superglue/pvp.py index a1b76b59..d4d07b39 100644 --- a/flagai/data/dataset/superglue/pvp.py +++ b/flagai/data/dataset/superglue/pvp.py @@ -97,12 +97,12 @@ def spell_length(self): @property def mask(self) -> str: """Return the underlying LM's mask token""" - return self.tokenizer.get_command_id('mask') + return self.tokenizer.get_command_id('MASK') @property def mask_id(self) -> int: """Return the underlying LM's mask id""" - return self.tokenizer.get_command_id('mask') + return self.tokenizer.get_command_id('MASK') @property def max_num_verbalizers(self) -> int: diff --git a/flagai/data/tokenizer/bert/bert_tokenizer.py b/flagai/data/tokenizer/bert/bert_tokenizer.py index 3c935713..8488bc88 100644 --- a/flagai/data/tokenizer/bert/bert_tokenizer.py +++ b/flagai/data/tokenizer/bert/bert_tokenizer.py @@ -75,8 +75,8 @@ def __init__(self, tokenizer_model_type=None, cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('mask', '[MASK]', - 
self.get_specialid_from_text_tokenizer('mask')), + CommandToken('MASK', '[MASK]', + self.get_specialid_from_text_tokenizer('MASK')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), CommandToken('eos', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), diff --git a/flagai/data/tokenizer/bert/wordpiece.py b/flagai/data/tokenizer/bert/wordpiece.py index eb636f7a..566969a5 100644 --- a/flagai/data/tokenizer/bert/wordpiece.py +++ b/flagai/data/tokenizer/bert/wordpiece.py @@ -118,7 +118,7 @@ def __init__(self, self._token_unk = '[UNK]' self._token_mask = '[MASK]' - for token in ['pad', 'cls', 'sep', 'unk', 'mask']: + for token in ['pad', 'cls', 'sep', 'unk', 'MASK']: _token_id = self.vocab[getattr(self, "_token_" + str(token))] setattr(self, "_token_" + str(token) + "_id", _token_id) diff --git a/flagai/data/tokenizer/galactica/galactica_tokenizer.py b/flagai/data/tokenizer/galactica/galactica_tokenizer.py index 87a28412..9a2ce280 100644 --- a/flagai/data/tokenizer/galactica/galactica_tokenizer.py +++ b/flagai/data/tokenizer/galactica/galactica_tokenizer.py @@ -16,7 +16,7 @@ def __init__(self, download_dir) -> None: CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('ENC', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), CommandToken('MASK', '[MASK]', - self.get_specialid_from_text_tokenizer('mask')), + self.get_specialid_from_text_tokenizer('MASK')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), CommandToken('eos', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), diff --git a/flagai/data/tokenizer/opt/opt_en_tokenizer.py b/flagai/data/tokenizer/opt/opt_en_tokenizer.py index 9e8e528c..0129af7b 100644 --- a/flagai/data/tokenizer/opt/opt_en_tokenizer.py +++ 
b/flagai/data/tokenizer/opt/opt_en_tokenizer.py @@ -35,8 +35,8 @@ def __init__(self, tokenizer_model_type="facebook/opt-125m", cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('mask', '[MASK]', - self.get_specialid_from_text_tokenizer('mask')), + CommandToken('MASK', '[MASK]', + self.get_specialid_from_text_tokenizer('MASK')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), CommandToken('eos', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), diff --git a/flagai/data/tokenizer/roberta/roberta_tokenizer.py b/flagai/data/tokenizer/roberta/roberta_tokenizer.py index a525f2a6..7447f7f9 100644 --- a/flagai/data/tokenizer/roberta/roberta_tokenizer.py +++ b/flagai/data/tokenizer/roberta/roberta_tokenizer.py @@ -39,7 +39,7 @@ def __init__(self, tokenizer_model_type="roberta-base", cache_dir=None): CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('ENC', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), CommandToken('MASK', '[MASK]', - self.get_specialid_from_text_tokenizer('mask')), + self.get_specialid_from_text_tokenizer('MASK')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), CommandToken('eos', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), diff --git a/flagai/data/tokenizer/t5/t5_tokenizer.py b/flagai/data/tokenizer/t5/t5_tokenizer.py index 8774b3af..5197706b 100644 --- a/flagai/data/tokenizer/t5/t5_tokenizer.py +++ b/flagai/data/tokenizer/t5/t5_tokenizer.py @@ -268,7 +268,7 @@ def __init__(self, self._token_dict = token_dict self._token_dict_inv = {v: k for k, v in token_dict.items()} - for token in ['pad', 'cls', 'sep', 'unk', 'mask']: 
+ for token in ['pad', 'cls', 'sep', 'unk', 'MASK']: try: _token_id = token_dict[getattr(self, "_token_" + str(token))] # print(_token_id) diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index a1e66de0..f94b9123 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -106,7 +106,7 @@ def __init__(self, CommandToken('eos', '<|endoftext|>', self.num_tokens), CommandToken('sep', '[SEP]', self.num_tokens + 1), CommandToken('cls', '[CLS]', self.num_tokens + 2), - CommandToken('mask', '[MASK]', self.num_tokens + 3, lstrip=True), + CommandToken('MASK', '[MASK]', self.num_tokens + 3, lstrip=True), CommandToken('unk', '[UNK]', self.num_tokens + 4) ] self.num_tokens += 6 @@ -121,7 +121,7 @@ def __init__(self, CommandToken('sop', '<|startofpiece|>', self.num_tokens), CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1), CommandToken('cls', '[CLS]', self.num_tokens + 2), - CommandToken('mask', + CommandToken('MASK', '[MASK]', self.num_tokens + 3, lstrip=True), diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py index 0b4f92e8..efef509a 100644 --- a/flagai/model/predictor/utils.py +++ b/flagai/model/predictor/utils.py @@ -928,9 +928,8 @@ def t5_random_sample(model, tokenizer, text, input_max_length, out_max_length, TopPLogitsProcessor(top_p=top_p), ] list_processor = ListProcessor(lp) - from tqdm import trange with torch.no_grad(): - for step in trange(out_max_length): + for step in range(out_max_length): scores = model(**{ "input_ids": token_ids, "decoder_input_ids": input_decoder_ids @@ -1436,7 +1435,7 @@ def glm_generate_sample( dtype=torch.long) position_ids = torch.stack((position_ids, block_position_ids), dim=0) position_ids = position_ids.unsqueeze(0) - mask_tokens = ['mask', 'sMASK', 'gMASK'] + mask_tokens = ['MASK', 'sMASK', 'gMASK'] mask_tokens = [tokenizer.get_command_id(token) for token in mask_tokens] 
end_tokens = [tokenizer.get_command_id('eop'), eod_token] mask_positions = [] @@ -1446,7 +1445,6 @@ def glm_generate_sample( mask_positions.sort() output_ = model(tokens, position_ids, attention_mask, return_memory=True) mems = output_['hidden_states'] - import pdb;pdb.set_trace() for mask_position in mask_positions: position = mask_position tokens, mems = glm_sample_sequence(model, diff --git a/flagai/test_utils.py b/flagai/test_utils.py index 5faa0aec..83dacde3 100644 --- a/flagai/test_utils.py +++ b/flagai/test_utils.py @@ -14,7 +14,7 @@ def build_input_from_ids(text_a_ids=None, mask_id=None, masked_lm=False): if mask_id is None: - mask_id = tokenizer.get_command_id('mask') + mask_id = tokenizer.get_command_id('MASK') eos_id = tokenizer.get_command_id('eos') cls_id = tokenizer.get_command_id('cls') sep_id = tokenizer.get_command_id('sep') From 0fe927fe2a0e947c0fb8b28604cd1673cbdda7a8 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 15:04:38 +0800 Subject: [PATCH 13/54] test Signed-off-by: ftgreat --- examples/cpm_1/generate_bminf.py | 37 ------------------- .../glm_blank_filling/glm_generate_samples.py | 32 ++++++++-------- 2 files changed, 15 insertions(+), 54 deletions(-) delete mode 100644 examples/cpm_1/generate_bminf.py diff --git a/examples/cpm_1/generate_bminf.py b/examples/cpm_1/generate_bminf.py deleted file mode 100644 index 982eb45a..00000000 --- a/examples/cpm_1/generate_bminf.py +++ /dev/null @@ -1,37 +0,0 @@ -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") -import torch -from flagai.auto_model.auto_loader import AutoLoader -from flagai.model.predictor.predictor import Predictor -import bminf -import time - - -if __name__ == '__main__': - - text = '''默写古诗: - 白日依山尽,黄河入海流。 - 床前明月光,''' - - loader = AutoLoader(task_name="lm", - model_name="CPM-large-ch", - model_dir="./checkpoints", - device="cpu") - - model = loader.get_model() - time_start=time.time() - with torch.cuda.device(0): - model = bminf.wrapper(model, 
quantization=False, memory_limit=20 << 30) - tokenizer = loader.get_tokenizer() - - predictor = Predictor(model=model, - tokenizer=tokenizer, - ) - - out = predictor.predict_generate_randomsample(text, - top_p=0.9, - out_max_length=50) - time_end=time.time() - print('time cost',time_end-time_start,'s') - - print(out) diff --git a/examples/glm_blank_filling/glm_generate_samples.py b/examples/glm_blank_filling/glm_generate_samples.py index 700407a3..603c1127 100644 --- a/examples/glm_blank_filling/glm_generate_samples.py +++ b/examples/glm_blank_filling/glm_generate_samples.py @@ -1,8 +1,6 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer @@ -23,13 +21,13 @@ predictor = Predictor(model, tokenizer) - # text = [ - # '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]" - # ] - # for t in text: - # output = predictor.predict_generate_randomsample( - # t, top_k=50, repetition_penalty=4.0, top_p=1.0) - # print(t, '\n', output) + text = [ + '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]" + ] + for t in text: + output = predictor.predict_generate_randomsample( + t, top_k=50, repetition_penalty=4.0, top_p=1.0) + print(t, '\n', output) text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"] for t in text: @@ -37,11 +35,11 @@ t, top_k=50, repetition_penalty=4.0, top_p=1.0) print(t, '\n', output) - # text = [ - # "人工智能是一个以计算机科学为基础,由计算机、数学、哲学等多学科交叉融合的交叉学科,[sMASK],具有非常巨大的前景。", - # "最近十多年来,人工神经网络的研究工作不断深入,已经取得了很大的进展,[sMASK],表现出了良好的智能特性。" - # ] - # for t in text: - # output = predictor.predict_generate_randomsample( - # t, top_k=50, repetition_penalty=4.0, top_p=1.0) - # print(t, '\n', output) + text = [ + "人工智能是一个以计算机科学为基础,由计算机、数学、哲学等多学科交叉融合的交叉学科,[sMASK],具有非常巨大的前景。", + 
"最近十多年来,人工神经网络的研究工作不断深入,已经取得了很大的进展,[sMASK],表现出了良好的智能特性。" + ] + for t in text: + output = predictor.predict_generate_randomsample( + t, top_k=50, repetition_penalty=4.0, top_p=1.0) + print(t, '\n', output) From 95dfe19e1cea3f80ea0394a263cb1ef3bafc34f8 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 15:06:27 +0800 Subject: [PATCH 14/54] test Signed-off-by: ftgreat --- examples/cpm_1/generate.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/cpm_1/generate.py b/examples/cpm_1/generate.py index 24f4fe79..e42116a6 100644 --- a/examples/cpm_1/generate.py +++ b/examples/cpm_1/generate.py @@ -1,5 +1,4 @@ -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") + from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor From 2093bb91d89b04778c3d1c347e7155fc0c4f625f Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 15:13:18 +0800 Subject: [PATCH 15/54] updated Signed-off-by: ftgreat --- examples/bminf_generate/cpm1_generate.py | 2 - examples/bminf_generate/gpt2_generate.py | 2 - examples/cpm_1/generate.py | 1 - .../glm_generate_samples_en.py | 2 - examples/gpt2_title_generation/generate.py | 2 - .../gpt2_title_generation/train_multi_gpu.py | 2 - examples/opt/generate_opt_1.3b.py | 2 - examples/roberta_ner/generate.py | 2 - examples/t5_title_generation/generate.py | 2 - test.py | 85 ------------------- 10 files changed, 102 deletions(-) delete mode 100644 test.py diff --git a/examples/bminf_generate/cpm1_generate.py b/examples/bminf_generate/cpm1_generate.py index 982eb45a..81bb74e4 100644 --- a/examples/bminf_generate/cpm1_generate.py +++ b/examples/bminf_generate/cpm1_generate.py @@ -1,5 +1,3 @@ -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor diff --git a/examples/bminf_generate/gpt2_generate.py 
b/examples/bminf_generate/gpt2_generate.py index 2656d12a..f05c7ba9 100644 --- a/examples/bminf_generate/gpt2_generate.py +++ b/examples/bminf_generate/gpt2_generate.py @@ -1,8 +1,6 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor diff --git a/examples/cpm_1/generate.py b/examples/cpm_1/generate.py index e42116a6..186864f9 100644 --- a/examples/cpm_1/generate.py +++ b/examples/cpm_1/generate.py @@ -1,4 +1,3 @@ - from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor diff --git a/examples/glm_blank_filling/glm_generate_samples_en.py b/examples/glm_blank_filling/glm_generate_samples_en.py index f4633c50..9fae6140 100644 --- a/examples/glm_blank_filling/glm_generate_samples_en.py +++ b/examples/glm_blank_filling/glm_generate_samples_en.py @@ -1,8 +1,6 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer diff --git a/examples/gpt2_title_generation/generate.py b/examples/gpt2_title_generation/generate.py index 79023e03..a76d6f81 100644 --- a/examples/gpt2_title_generation/generate.py +++ b/examples/gpt2_title_generation/generate.py @@ -1,8 +1,6 @@ # Copyright © 2022 BAAI. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License") -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor diff --git a/examples/gpt2_title_generation/train_multi_gpu.py b/examples/gpt2_title_generation/train_multi_gpu.py index ffa0a969..d6e8d563 100644 --- a/examples/gpt2_title_generation/train_multi_gpu.py +++ b/examples/gpt2_title_generation/train_multi_gpu.py @@ -1,8 +1,6 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import os import torch from torch.utils.data import Dataset diff --git a/examples/opt/generate_opt_1.3b.py b/examples/opt/generate_opt_1.3b.py index 7d928300..8311a9f1 100644 --- a/examples/opt/generate_opt_1.3b.py +++ b/examples/opt/generate_opt_1.3b.py @@ -1,5 +1,3 @@ -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.model.predictor.predictor import Predictor from flagai.auto_model.auto_loader import AutoLoader diff --git a/examples/roberta_ner/generate.py b/examples/roberta_ner/generate.py index 303a4d93..79b2dbb1 100644 --- a/examples/roberta_ner/generate.py +++ b/examples/roberta_ner/generate.py @@ -1,8 +1,6 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") -# import sys -# sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor diff --git a/examples/t5_title_generation/generate.py b/examples/t5_title_generation/generate.py index 5a4910e8..ea20a285 100644 --- a/examples/t5_title_generation/generate.py +++ b/examples/t5_title_generation/generate.py @@ -1,8 +1,6 @@ # Copyright © 2022 BAAI. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License") -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor diff --git a/test.py b/test.py deleted file mode 100644 index 2cccd4fc..00000000 --- a/test.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright © 2022 BAAI. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License") -# import unittest - -# print('test syn') -# test_dir = './tests' -# test_report_path = './test_report' -# discover = unittest.defaultTestLoader.discover(test_dir, pattern='test_*.py') -# with open(test_report_path, "w") as report_file: -# runner = unittest.TextTestRunner(stream=report_file, verbosity=2) -# #runner=unittest.TextTestRunner() -# runner.run(discover) -from dataclasses import dataclass, field -@dataclass(frozen=True, eq=True) -class AddedToken: - """ - AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the - way it should behave. 
- """ - - content: str = field(default_factory=str) - single_word: bool = False - lstrip: bool = False - rstrip: bool = False - normalized: bool = True - - def __getstate__(self): - return self.__dict__ - -class SpecialTokensMixin: - SPECIAL_TOKENS_ATTRIBUTES = [ - "bos_token", - "eos_token", - "unk_token", - "sep_token", - "pad_token", - "cls_token", - "mask_token", - "additional_special_tokens", - ] - def __init__(self, **kwargs): - print(kwargs) - self._bos_token = None - self._eos_token = None - self._unk_token = None - self._sep_token = None - self._pad_token = None - self._cls_token = None - self._mask_token = None - self._pad_token_type_id = 0 - self._additional_special_tokens = [] - # self.verbose = verbose - for key, value in kwargs.items(): - if value is None: - continue - if key in self.SPECIAL_TOKENS_ATTRIBUTES: - if key == "additional_special_tokens": - assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple" - assert all( - isinstance(t, (str, AddedToken)) for t in value - ), "One of the tokens is not a string or an AddedToken" - setattr(self, key, value) - elif isinstance(value, (str, AddedToken)): - setattr(self, key, value) - else: - raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}") - -class Tokenizer(SpecialTokensMixin): - def __init__(self,eos_token="", - unk_token="", - pad_token="", - additional_special_tokens=None, - **kwargs): - super().__init__( - eos_token=eos_token, - unk_token=unk_token, - pad_token=pad_token, - # extra_ids=extra_ids, - additional_special_tokens=additional_special_tokens, - # sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) -tokenizer = Tokenizer() -import pdb;pdb.set_trace() \ No newline at end of file From 8d4e86aeb1cc8954bc8a39d5d5ac73a06adb530f Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 15:28:58 +0800 Subject: [PATCH 16/54] fixed error Signed-off-by: ftgreat --- .../bminf_generate/galactica_6.7b_generate.py | 1 + 
flagai/data/tokenizer/bert/bert_tokenizer.py | 2 +- flagai/data/tokenizer/bert/wordpiece.py | 2 +- .../galactica/galactica_tokenizer.py | 2 +- flagai/data/tokenizer/opt/opt_en_tokenizer.py | 2 +- .../tokenizer/roberta/roberta_tokenizer.py | 2 +- flagai/data/tokenizer/t5/t5_tokenizer.py | 2 +- flagai/model/gpt2_model.py | 12 +- flagai/model/predictor/utils.py | 1 - tests/test_tokenizer.py | 237 ++++++------------ 10 files changed, 89 insertions(+), 174 deletions(-) diff --git a/examples/bminf_generate/galactica_6.7b_generate.py b/examples/bminf_generate/galactica_6.7b_generate.py index 964f319b..15b1b068 100644 --- a/examples/bminf_generate/galactica_6.7b_generate.py +++ b/examples/bminf_generate/galactica_6.7b_generate.py @@ -1,3 +1,4 @@ + from flagai.model.predictor.predictor import Predictor from flagai.auto_model.auto_loader import AutoLoader import torch diff --git a/flagai/data/tokenizer/bert/bert_tokenizer.py b/flagai/data/tokenizer/bert/bert_tokenizer.py index 8488bc88..eec168ea 100644 --- a/flagai/data/tokenizer/bert/bert_tokenizer.py +++ b/flagai/data/tokenizer/bert/bert_tokenizer.py @@ -76,7 +76,7 @@ def __init__(self, tokenizer_model_type=None, cache_dir=None): CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), CommandToken('MASK', '[MASK]', - self.get_specialid_from_text_tokenizer('MASK')), + self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), CommandToken('eos', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), diff --git a/flagai/data/tokenizer/bert/wordpiece.py b/flagai/data/tokenizer/bert/wordpiece.py index 566969a5..eb636f7a 100644 --- a/flagai/data/tokenizer/bert/wordpiece.py +++ b/flagai/data/tokenizer/bert/wordpiece.py @@ -118,7 +118,7 @@ def __init__(self, self._token_unk = '[UNK]' 
self._token_mask = '[MASK]' - for token in ['pad', 'cls', 'sep', 'unk', 'MASK']: + for token in ['pad', 'cls', 'sep', 'unk', 'mask']: _token_id = self.vocab[getattr(self, "_token_" + str(token))] setattr(self, "_token_" + str(token) + "_id", _token_id) diff --git a/flagai/data/tokenizer/galactica/galactica_tokenizer.py b/flagai/data/tokenizer/galactica/galactica_tokenizer.py index 9a2ce280..87a28412 100644 --- a/flagai/data/tokenizer/galactica/galactica_tokenizer.py +++ b/flagai/data/tokenizer/galactica/galactica_tokenizer.py @@ -16,7 +16,7 @@ def __init__(self, download_dir) -> None: CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('ENC', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), CommandToken('MASK', '[MASK]', - self.get_specialid_from_text_tokenizer('MASK')), + self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), CommandToken('eos', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), diff --git a/flagai/data/tokenizer/opt/opt_en_tokenizer.py b/flagai/data/tokenizer/opt/opt_en_tokenizer.py index 0129af7b..5c1c0de8 100644 --- a/flagai/data/tokenizer/opt/opt_en_tokenizer.py +++ b/flagai/data/tokenizer/opt/opt_en_tokenizer.py @@ -36,7 +36,7 @@ def __init__(self, tokenizer_model_type="facebook/opt-125m", cache_dir=None): CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), CommandToken('MASK', '[MASK]', - self.get_specialid_from_text_tokenizer('MASK')), + self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), CommandToken('eos', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), diff --git 
a/flagai/data/tokenizer/roberta/roberta_tokenizer.py b/flagai/data/tokenizer/roberta/roberta_tokenizer.py index 7447f7f9..a525f2a6 100644 --- a/flagai/data/tokenizer/roberta/roberta_tokenizer.py +++ b/flagai/data/tokenizer/roberta/roberta_tokenizer.py @@ -39,7 +39,7 @@ def __init__(self, tokenizer_model_type="roberta-base", cache_dir=None): CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('ENC', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), CommandToken('MASK', '[MASK]', - self.get_specialid_from_text_tokenizer('MASK')), + self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), CommandToken('eos', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), diff --git a/flagai/data/tokenizer/t5/t5_tokenizer.py b/flagai/data/tokenizer/t5/t5_tokenizer.py index 5197706b..8774b3af 100644 --- a/flagai/data/tokenizer/t5/t5_tokenizer.py +++ b/flagai/data/tokenizer/t5/t5_tokenizer.py @@ -268,7 +268,7 @@ def __init__(self, self._token_dict = token_dict self._token_dict_inv = {v: k for k, v in token_dict.items()} - for token in ['pad', 'cls', 'sep', 'unk', 'MASK']: + for token in ['pad', 'cls', 'sep', 'unk', 'mask']: try: _token_id = token_dict[getattr(self, "_token_" + str(token))] # print(_token_id) diff --git a/flagai/model/gpt2_model.py b/flagai/model/gpt2_model.py index a3ab1054..15197888 100644 --- a/flagai/model/gpt2_model.py +++ b/flagai/model/gpt2_model.py @@ -112,14 +112,14 @@ def __init__(self, config): self.drop = nn.Dropout(config.embd_pdrop) self.project_in = None self.project_out = None - self.h = bminf.TransformerBlockList([ - GPT2Block(config.n_ctx, config, scale=True) - for _ in range(config.n_layer) - ],[0]) - # self.h = nn.ModuleList([ + # self.h = bminf.TransformerBlockList([ # GPT2Block(config.n_ctx, config, scale=True) # for _ in range(config.n_layer) - # ]) + 
# ],[0]) + self.h = nn.ModuleList([ + GPT2Block(config.n_ctx, config, scale=True) + for _ in range(config.n_layer) + ]) self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py index efef509a..72077041 100644 --- a/flagai/model/predictor/utils.py +++ b/flagai/model/predictor/utils.py @@ -1117,7 +1117,6 @@ def alm_beamsearch(model, tokenizer, text, out_max_length, beam_size, eod_token= context_length = context_length_tensor[0].item() context_tokens_tensor = torch.LongTensor(context_tokens) text = tokenizer.DecodeIds(context_tokens_tensor.tolist()) - start_time = time.time() mems = [] tokens = context_tokens_tensor diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 5c612b06..87b7fa63 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,195 +1,110 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import unittest from flagai.data.tokenizer import Tokenizer from flagai.auto_model.auto_loader import AutoLoader -from flagai.data.tokenizer import Tokenizer class TokenizerTestCase(unittest.TestCase): - # def test_tokenizer_GLM_large_ch(self): - # tokenizer = Tokenizer.from_pretrained("GLM-large-ch") - # self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') - # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - # [3378, 1567, 2613, 20282], 'EncodeAsIds Error') - # self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), - # '今天吃饭吃了肯德基', 'DecodeIds Error') - # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), - # ['▁今天', '吃饭', '吃了', '肯德基'], 'tokenize Error') - # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], - # [50006, 3378, 1567, 2613, 20282, 50001], 'encode_plus Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # 
[('pad', '<|endoftext|>', 50000), ('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), - # ('cls', '[CLS]', 50002), ('mask', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), - # ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)], 'SpecialTokens error') + def test_tokenizer_GLM_large_ch(self): + tokenizer = Tokenizer.from_pretrained("GLM-large-ch") + self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') + self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + [3378, 1567, 2613, 20282], 'EncodeAsIds Error') + self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), + '今天吃饭吃了肯德基', 'DecodeIds Error') + + def test_tokenizer_GLM_large_en(self): + tokenizer = Tokenizer.from_pretrained("GLM-large-en") + self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + [13017, 7975, 3084, 2033, 3407], '') + self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + 'fried chicken makes me happy', 'DecodeIds Error') - # def test_tokenizer_GLM_large_en(self): - # tokenizer = Tokenizer.from_pretrained("GLM-large-en") - # self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + # def test_tokenizer_glm_10b_en(self): + # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") + # self.assertEqual(tokenizer.TokenToId("day"), 820, '') # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - # [13017, 7975, 3084, 2033, 3407], '') - # self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + # [25520, 9015, 1838, 502, 3772], '') + # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), # 'fried chicken makes me happy', 'DecodeIds Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), - # ('sep', '[SEP]', 102), ('pad', 
'[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), - # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) - - # # def test_tokenizer_glm_10b_en(self): - # # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") - # # self.assertEqual(tokenizer.TokenToId("day"), 820, '') - # # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - # # [25520, 9015, 1838, 502, 3772], '') - # # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), - # # 'fried chicken makes me happy', 'DecodeIds Error') - # def test_tokenizer_t5(self): - # tokenizer = Tokenizer.from_pretrained('T5-base-ch') - # # import pdb;pdb.set_trace() - # self.assertEqual(tokenizer.TokenToId("人"), 297, '') - # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - # [306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166], '') - # self.assertEqual(tokenizer.DecodeIds([306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166]), - # '今天吃饭吃了肯德基', 'DecodeIds Error') - # encode_plus_result = tokenizer.encode_plus("今天吃饭吃了肯德基") - # self.assertEqual(list(encode_plus_result.keys()), - # ['input_ids', 'token_type_ids'], 'encode_plus Error') - # self.assertEqual(encode_plus_result['input_ids'], - # [101, 306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166, 102], 'encode_plus Error') - - # def test_tokenizer_t5(self): - # tokenizer = Tokenizer.from_pretrained('T5-base-ch') - # # import pdb;pdb.set_trace() - # self.assertEqual(tokenizer.TokenToId("人"), 297, '') - # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - # [306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166], '') - # self.assertEqual(tokenizer.DecodeIds([306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166]), - # '今天吃饭吃了肯德基', 'DecodeIds Error') - # encode_plus_result = tokenizer.encode_plus("今天吃饭吃了肯德基") - # self.assertEqual(list(encode_plus_result.keys()), - # ['input_ids', 'token_type_ids'], 'encode_plus Error') - # self.assertEqual(encode_plus_result['input_ids'], - # [101, 306, 1231, 798, 5447, 798, 266, 
4017, 1738, 1166, 102], 'encode_plus Error') - - # def test_tokenizer_roberta(self): - # tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') - # # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) - # self.assertEqual(tokenizer.TokenToId("人"), 782, '') - # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - # [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') - # self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), - # '今天吃饭吃了肯德基', 'DecodeIds Error') - # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), - # ['今', '天', '吃', '饭', '吃', '了', '肯', '德', '基'], 'tokenize Error') - # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], - # [101, 791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825, 102], 'encode_plus Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # [('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), - # ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)], 'SpecialTokens error') - - # def test_tokenizer_bert(self): - # tokenizer = Tokenizer.from_pretrained('BERT-base-en') - # self.assertEqual(tokenizer.TokenToId("day"), 2154, '') - # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - # [13017, 7975, 3084, 2033, 3407], '') - # self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), - # 'fried chicken makes me happy', 'DecodeIds Error') - # self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), - # ['fried', 'chicken', 'makes', 'me', 'happy'], 'tokenize Error') - # self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], - # [101, 13017, 7975, 3084, 2033, 3407, 102], 'encode_plus Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # [('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), - # ('mask', '[MASK]', 103), ('pad', 
'[PAD]', 0)], 'SpecialTokens error') + def test_tokenizer_t5(self): + tokenizer = Tokenizer.from_pretrained('t5-base-en') + self.assertEqual(tokenizer.TokenToId("day"), 1135, '') + self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + [3, 7704, 3832, 656, 140, 1095], '') + self.assertEqual(tokenizer.DecodeIds([3, 7704, 3832, 656, 140, 1095]), + 'fried chicken makes me happy', 'DecodeIds Error') - # def test_tokenizer_cpm1(self): - # loader = AutoLoader(task_name="lm", - # model_name="CPM-large-ch", - # model_dir="./checkpoints/", - # only_download_config=True) - # tokenizer = loader.get_tokenizer() - # self.assertEqual(tokenizer.encode("day"), [8, 8275], '') - # self.assertEqual(tokenizer.encode("fried chicken makes me happy"), - # [2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239], '') - # self.assertEqual(tokenizer.decode([2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239]), - # 'fried chicken makes me happy', 'DecodeIds Error') - # self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), - # ['▁f', 'ried', '▁ch', 'ick', 'en', '▁make', 's', '▁me', '▁happy'], 'tokenize Error') - # self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], - # [1, 2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239, 2], 'encode_plus Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), ('mask', '', 6), ('eod', '', 7), ('eop', '', 0)]) + def test_tokenizer_roberta(self): + tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') + # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) + self.assertEqual(tokenizer.TokenToId("人"), 782, '') + self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') + self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), + '今天吃饭吃了肯德基', 'DecodeIds 
Error') + + def test_tokenizer_bert(self): + tokenizer = Tokenizer.from_pretrained('BERT-base-en') + self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + [13017, 7975, 3084, 2033, 3407], '') + self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + 'fried chicken makes me happy', 'DecodeIds Error') - def test_tokenizer_gpt(self): - # loader = AutoLoader(task_name="lm", - # model_name="GPT2-base-ch", - # model_dir="./checkpoints/", - # only_download_config=True) - tokenizer = Tokenizer.from_pretrained("GPT2-base-ch") - import pdb;pdb.set_trace() + def test_tokenizer_cpm1(self): + loader = AutoLoader(task_name="lm", + model_name="CPM-large-ch", + model_dir="./checkpoints/", + only_download_config=True) + tokenizer = loader.get_tokenizer() self.assertEqual(tokenizer.encode("day"), [8, 8275], '') self.assertEqual(tokenizer.encode("fried chicken makes me happy"), [2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239], '') self.assertEqual(tokenizer.decode([2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239]), 'fried chicken makes me happy', 'DecodeIds Error') - self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), - ['▁f', 'ried', '▁ch', 'ick', 'en', '▁make', 's', '▁me', '▁happy'], 'tokenize Error') - self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], - [1, 2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239, 2], 'encode_plus Error') - self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), ('mask', '', 6), ('eod', '', 7), ('eop', '', 0)]) - # def test_tokenizer_opt(self): - # tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') - # self.assertEqual(tokenizer.encode("day"), [1208], '') - # self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], - # [0, 21209, 5884, 817, 162, 1372, 2], '') - # 
self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), - # 'fried chicken makes me happy', 'DecodeIds Error') - # self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), - # ['fried', 'Ġchicken', 'Ġmakes', 'Ġme', 'Ġhappy'], 'tokenize Error') - # self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], - # [0, 21209, 5884, 817, 162, 1372, 2], 'encode_plus Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # [('cls', '', 0), ('pad', '', 1), ('bos', '', 2), ('eos', '', 2), ('unk', '', 3)], 'SpecialTokens error') - + def test_tokenizer_opt(self): + tokenizer = Tokenizer.from_pretrained('opt-125m-en') + self.assertEqual(tokenizer.encode("day"), [1208], '') + self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], + [50260, 21209, 5884, 817, 162, 1372, 50260], '') + self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), + 'fried chicken makes me happy', 'DecodeIds Error') - # def test_tokenizer_clip(self): - # loader = AutoLoader(task_name="txt_img_matching", - # model_name="clip-base-p32-224", - # only_download_config=True) - # tokenizer = loader.get_tokenizer() - # self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') + def test_tokenizer_clip(self): + loader = AutoLoader(task_name="txt_img_matching", + model_name="clip-base-p32-224") + tokenizer = loader.get_tokenizer() + self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') - # def test_tokenizer_evaclip(self): - # loader = AutoLoader(task_name="txt_img_matching", - # model_name="eva-clip", - # only_download_config=True) - # tokenizer = loader.get_tokenizer() - # self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') + def test_tokenizer_evaclip(self): + loader = AutoLoader(task_name="txt_img_matching", + model_name="eva-clip") + tokenizer = 
loader.get_tokenizer() + self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') def suite(): suite = unittest.TestSuite() - # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) - # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) + suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) + suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) # suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) - # suite.addTest(TokenizerTestCase('test_tokenizer_t5')) - # suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) - # suite.addTest(TokenizerTestCase('test_tokenizer_bert')) - # suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) - # suite.addTest(TokenizerTestCase('test_tokenizer_cpm2_large')) - suite.addTest(TokenizerTestCase('test_tokenizer_gpt')) - # suite.addTest(TokenizerTestCase('test_tokenizer_opt')) - # suite.addTest(TokenizerTestCase('test_tokenizer_clip')) - # suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) + suite.addTest(TokenizerTestCase('test_tokenizer_t5')) + suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) + suite.addTest(TokenizerTestCase('test_tokenizer_bert')) + suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) + suite.addTest(TokenizerTestCase('test_tokenizer_opt')) + suite.addTest(TokenizerTestCase('test_tokenizer_clip')) + suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) return suite if __name__ == '__main__': runner = unittest.TextTestRunner() - runner.run(suite()) + runner.run(suite()) \ No newline at end of file From 0abaed610a0da04334d03223ae9e044aafdf1581 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 15:36:39 +0800 Subject: [PATCH 17/54] upadted Signed-off-by: ftgreat --- tests/test_tokenizer.py | 629 +++++++++++++++++++++++++++++++++------- 1 file changed, 522 insertions(+), 107 deletions(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 87b7fa63..1e7f9721 100644 --- 
a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,110 +1,525 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") -import unittest -from flagai.data.tokenizer import Tokenizer -from flagai.auto_model.auto_loader import AutoLoader - -class TokenizerTestCase(unittest.TestCase): - - def test_tokenizer_GLM_large_ch(self): - tokenizer = Tokenizer.from_pretrained("GLM-large-ch") - self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') - self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - [3378, 1567, 2613, 20282], 'EncodeAsIds Error') - self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), - '今天吃饭吃了肯德基', 'DecodeIds Error') - - def test_tokenizer_GLM_large_en(self): - tokenizer = Tokenizer.from_pretrained("GLM-large-en") - self.assertEqual(tokenizer.TokenToId("day"), 2154, '') - self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - [13017, 7975, 3084, 2033, 3407], '') - self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), - 'fried chicken makes me happy', 'DecodeIds Error') - - # def test_tokenizer_glm_10b_en(self): - # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") - # self.assertEqual(tokenizer.TokenToId("day"), 820, '') - # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - # [25520, 9015, 1838, 502, 3772], '') - # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), - # 'fried chicken makes me happy', 'DecodeIds Error') - - def test_tokenizer_t5(self): - tokenizer = Tokenizer.from_pretrained('t5-base-en') - self.assertEqual(tokenizer.TokenToId("day"), 1135, '') - self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - [3, 7704, 3832, 656, 140, 1095], '') - self.assertEqual(tokenizer.DecodeIds([3, 7704, 3832, 656, 140, 1095]), - 'fried chicken makes me happy', 'DecodeIds Error') - - def test_tokenizer_roberta(self): - tokenizer = 
Tokenizer.from_pretrained('RoBERTa-base-ch') - # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) - self.assertEqual(tokenizer.TokenToId("人"), 782, '') - self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') - self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), - '今天吃饭吃了肯德基', 'DecodeIds Error') - - def test_tokenizer_bert(self): - tokenizer = Tokenizer.from_pretrained('BERT-base-en') - self.assertEqual(tokenizer.TokenToId("day"), 2154, '') - self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - [13017, 7975, 3084, 2033, 3407], '') - self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), - 'fried chicken makes me happy', 'DecodeIds Error') - - def test_tokenizer_cpm1(self): - loader = AutoLoader(task_name="lm", - model_name="CPM-large-ch", - model_dir="./checkpoints/", - only_download_config=True) - tokenizer = loader.get_tokenizer() - self.assertEqual(tokenizer.encode("day"), [8, 8275], '') - self.assertEqual(tokenizer.encode("fried chicken makes me happy"), - [2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239], '') - self.assertEqual(tokenizer.decode([2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239]), - 'fried chicken makes me happy', 'DecodeIds Error') - - def test_tokenizer_opt(self): - tokenizer = Tokenizer.from_pretrained('opt-125m-en') - self.assertEqual(tokenizer.encode("day"), [1208], '') - self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], - [50260, 21209, 5884, 817, 162, 1372, 50260], '') - self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), - 'fried chicken makes me happy', 'DecodeIds Error') - - def test_tokenizer_clip(self): - loader = AutoLoader(task_name="txt_img_matching", - model_name="clip-base-p32-224") - tokenizer = loader.get_tokenizer() - self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 
49407], '') - - def test_tokenizer_evaclip(self): - loader = AutoLoader(task_name="txt_img_matching", - model_name="eva-clip") - tokenizer = loader.get_tokenizer() - self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') - - -def suite(): - suite = unittest.TestSuite() - suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) - suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) - # suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) - suite.addTest(TokenizerTestCase('test_tokenizer_t5')) - suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) - suite.addTest(TokenizerTestCase('test_tokenizer_bert')) - suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) - suite.addTest(TokenizerTestCase('test_tokenizer_opt')) - suite.addTest(TokenizerTestCase('test_tokenizer_clip')) - suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) - - return suite - - -if __name__ == '__main__': - runner = unittest.TextTestRunner() - runner.run(suite()) \ No newline at end of file +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Utilities for using and training tokenizers (char, wordpiece, sentencepiece)""" +from collections import namedtuple +import itertools + +print_rank_0 = print +"""define some default command tokens for the tokenizer to use""" +token_format = "<{0}>" + +COMMAND_TUPLE = namedtuple('CommandToken', ('name', 'token', 'Id')) + + +def prep_command_tokens(tokenlist, token_format=token_format): + return [ + CommandToken(tok[0], token_format.format(tok[0]), tok[1]) + for tok in tokenlist + ] + + +class CommandToken(object): + + def __init__(self, name, token, Id, lstrip=False, rstrip=False): + self.name = name + self.token = token + self.Id = Id + self.lstrip = lstrip + self.rstrip = rstrip + + def __str__(self): + return str(COMMAND_TUPLE(self.name, self.token, self.Id)) + + +DEFAULT_COMMAND_TOKENS = [ + ('pad', 0), + ('eos', 1), + ('bos', 2), + ('unk', 3), + ('sep', 4), + ('L2R', 5), + ('cls', 6), + ('MASK', 7), +] +DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS) +"""define some default type tokens for bert training""" + +TYPE_TUPLE = namedtuple('TypeToken', ('name', 'token', 'Id')) + + +def prep_type_tokens(tokenlist, token_format=token_format): + return [ + TypeToken(tok[0], token_format.format(tok[0]), tok[1]) + for tok in tokenlist + ] + + +class TypeToken(object): + + def __init__(self, name, token, Id): + self.name = name + self.token = token + self.Id = Id + + def __str__(self): + return str(TYPE_TUPLE(self.name, self.token, self.Id)) + + +DEFAULT_TYPE_TOKENS = [ + ('function', 0), + ('command', 1), + ('str0', 2), + ('str1', 3), + ('str2', 4), + ('embedding0', 5), + ('embedding1', 6), + ('embedding2', 7), + ('arg0', 8), + ('arg1', 9), + ('arg2', 10), +] +DEFAULT_TYPE_TOKENS = prep_type_tokens(DEFAULT_TYPE_TOKENS) + + + + + +class GLMTokenizer(object): + """ + Tokenizer object that handles text tokenization, command tokens, and type tokens. 
+ Command tokens and text tokens are stored together in one mapping of size + `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as first + `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`. + Token types are stored in a separate mapping of size `len(type_tokens)`. + """ + + def __init__(self, text_tokenizer, command_tokens=None, type_tokens=None): + # set text tokenizer + self.text_tokenizer = text_tokenizer + if not hasattr(self, 'num_text_tokens'): + self.num_text_tokens = len(self.text_tokenizer) + + # set command tokens + if command_tokens is None: + command_tokens = DEFAULT_COMMAND_TOKENS + self._command_tokens = command_tokens + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + if not hasattr(self, 'num_command_tokens'): + self.num_command_tokens = len(self._command_tokens) + if not hasattr(self, 'num_tokens'): + self.num_tokens = self.num_command_tokens + self.num_text_tokens + + # set type tokens + if type_tokens is None: + type_tokens = DEFAULT_TYPE_TOKENS + self.type_tokens = type_tokens + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + if not hasattr(self, 'num_type_tokens'): + self.num_type_tokens = len(self.type_tokens) + + # parse tokens and vocabs from tokenizer + self._tokens = list(self.command_token_map.keys()) + list( + self.text_tokenizer.tokens) + self._vocab = {t: Id for Id, t in self.command_id_map.items()} + self._vocab.update({ + t: Id + self.num_command_tokens + for t, Id in self.text_tokenizer.vocab.items() + }) + + self._text_tokens = list(self.text_tokenizer.tokens) + self._text_token_vocab = { + t: Id + self.num_command_tokens + for t, Id in 
self.text_tokenizer.vocab.items() + } + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = { + t: Id + for Id, t in self.command_id_map.items() + } + + self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} + + def __call__(self, text, process_fn=None): + """run preprocessing and encode text as Ids""" + return self.EncodeAsIds(text, process_fn=process_fn) + + def __len__(self): + """total number of tokens""" + return self.num_tokens + + def get_command_id(self, name): + """get command token corresponding to `name`""" + return self.command_name_map[name] + + def get_type(self, name): + """get type token corresponding to `name`""" + return self.type_name_map[name] + + @property + def tokens(self): + """list (or iterable) of all tokens for tokenizer""" + return self._tokens + + @property + def vocab(self): + """dictionary mapping tokens to ids for tokenizer""" + return self._vocab + + @property + def token_types(self): + """list (or iterable) of all token types for tokenizer""" + return self._token_types + + @property + def token_type_vocab(self): + """dictionary mapping token types to ids for tokenizer""" + return self._token_type_vocab + + @property + def command_tokens(self): + """list (or iterable) of all command tokens for tokenizer""" + return self._command_token_tokens + + @property + def command_token_vocab(self): + """dictionary mapping command tokens to ids for tokenizer""" + return self._command_token_vocab + + @property + def text_tokens(self): + """list (or iterable) of text tokens for text tokenizer""" + return self._text_tokens + + @property + def text_token_vocab(self): + """dictionary mapping text tokens to ids for text tokenizer""" + return self._text_token_vocab + + def EncodeAsIds(self, text, process_fn=None): + """ + encode text using text tokenizer and shift Id values for command tokens + """ + processed_text = text + if process_fn 
is not None: + processed_text = process_fn(processed_text) + + def split_on_token(tok_extended: CommandToken, text): + result = [] + tok = tok_extended.token + split_text = text.split(tok) + for i, sub_text in enumerate(split_text): + # CommandToken can control whitespace stripping around them. + # We use them for GPT2 and Roberta to have different behavior depending on the special token + # Cf. https://github.com/huggingface/transformers/pull/2778 + # and https://github.com/huggingface/transformers/issues/3788 + # Strip white spaces on the right + if tok_extended.rstrip and i > 0: + # A bit counter-intuitive but we strip the left of the string + # since tok_extended.rstrip means the special token is eating all white spaces on its right + sub_text = sub_text.lstrip() + # Strip white spaces on the left + if tok_extended.lstrip and i < len(split_text) - 1: + sub_text = sub_text.rstrip() # Opposite here + + if i == 0 and not sub_text: + result.append(tok) + elif i == len(split_text) - 1: + if sub_text: + result.append(sub_text) + else: + pass + else: + if sub_text: + result.append(sub_text) + result.append(tok) + return result + + def split_on_tokens(tok_list, text): + if not text.strip(): + return [] + if not tok_list: + return self.text_tokenizer.encode(text) + + tokenized_text = [] + text_list = [text] + for tok in tok_list: + tokenized_text = [] + for sub_text in text_list: + if sub_text not in self._command_token_tokens: + tokenized_text.extend(split_on_token(tok, sub_text)) + else: + tokenized_text.append(sub_text) + text_list = tokenized_text + + return list( + itertools.chain.from_iterable( + (self._encode(token) + if token not in self._command_token_tokens else + [self.command_token_map[token].Id] + for token in tokenized_text))) + + no_split_tokens = self._command_tokens + Ids = split_on_tokens(no_split_tokens, processed_text) + return Ids + + def _encode(self, text): + raise NotImplementedError + + def EncodeAsTokens(self, text, process_fn=None): + """ + 
encode text as tokens using text tokenizer + """ + tokenization = self.text_tokenizer.EncodeAsTokens( + text, process_fn=process_fn) + tokenization.set_command_tokens(self._command_tokens) + return tokenization + + def IdToToken(self, Id, type_token=False): + """convert Id to token accounting for command and type tokens""" + if isinstance(Id, (TypeToken, CommandToken)): + return Id.token + if type_token: + return self.type_id_map[Id].token + if Id < self.num_command_tokens: + return self.command_id_map[Id].token + return self.text_tokenizer.IdToToken(Id - self.num_command_tokens) + + def TokenToId(self, token, type_token=False): + """convert token to Id accounting for command and type tokens""" + if isinstance(token, (TypeToken, CommandToken)): + return token.Id + if type_token: + return self.type_token_map[token].Id + if token in self.command_token_map: + return self.command_token_map[token].Id + return self.text_tokenizer.TokenToId(token) + self.num_command_tokens + + def DecodeIds(self, Ids, type_token=False): + """ + convert Ids to tokens accounting for command and type tokens, tokens + are joined and returned as a string. + """ + if type_token: + return ' '.join(Id.token if isinstance(Id, TypeToken) else self. + type_id_map[Id].token for Id in Ids) + rtn_strs = [] + current_str = [] + for Id in Ids: + if isinstance(Id, CommandToken): + rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) + current_str = [] + rtn_strs.append(Id.token) + elif Id < self.num_command_tokens: + rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) + current_str = [] + rtn_strs.append(self.command_id_map[Id].token) + else: + current_str.append(Id - self.num_command_tokens) + if current_str != []: + rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) + return ' '.join(rtn_strs) + + def DecodeTokens(self, Tokens, type_token=False): + """ + convert tokens to a string accounting for command and type tokens. 
+ """ + if type_token: + return ' '.join(t.token if isinstance(t, TypeToken) else t + for t in Tokens) + rtn_strs = [] + current_str = [] + for t in Tokens: + if isinstance(t, CommandToken): + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + current_str = [] + rtn_strs.append(t.token) + elif t in self.command_token_map: + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + current_str = [] + rtn_strs.append(t) + else: + current_str.append(t) + if current_str != []: + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + return ' '.join(rtn_strs) + + + + +class Tokenizer(object): + """ + Tokenizer object that handles text tokenization, command tokens, and type tokens. + Command tokens and text tokens are stored together in one mapping of size + `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as first + `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`. + Token types are stored in a separate mapping of size `len(type_tokens)`. 
+ """ + + def __init__(self, text_tokenizer): + # set text tokenizer + self.text_tokenizer = text_tokenizer + if not hasattr(self, 'num_text_tokens'): + self.num_text_tokens = len(self.text_tokenizer) + + # parse tokens and vocabs from tokenizer + self._tokens = list(self.text_tokenizer.tokens) + self._vocab = {t: Id for t, Id in self.text_tokenizer.vocab.items()} + + self._text_tokens = list(self.text_tokenizer.tokens) + self._text_token_vocab = { + t: Id + for t, Id in self.text_tokenizer.vocab.items() + } + + def __call__(self, text, process_fn=None): + """run preprocessing and encode text as Ids""" + return self.EncodeAsIds(text, process_fn=process_fn) + + def __len__(self): + """total number of tokens""" + return self.num_tokens + + @property + def tokens(self): + """list (or iterable) of all tokens for tokenizer""" + return self._tokens + + @property + def vocab(self): + """dictionary mapping tokens to ids for tokenizer""" + return self._vocab + + @property + def text_tokens(self): + """list (or iterable) of text tokens for text tokenizer""" + return self._text_tokens + + @property + def text_token_vocab(self): + """dictionary mapping text tokens to ids for text tokenizer""" + return self._text_token_vocab + + def EncodeAsIds(self, text: str, process_fn=None): + """Input text string => a list of token ids""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + + tokens = self.EncodeAsTokens(processed_text, process_fn=process_fn) + Ids = [self.TokenToId(token) for token in tokens] + return Ids + + def EncodeAsTokens(self, text: str, process_fn=None): + """Input text string => a list of tokens""" + return self.text_tokenizer._tokenize(text) + + def IdToToken(self, Id: int): + """Token id => token""" + return self.text_tokenizer._convert_id_to_token(Id) + + def TokenToId(self, token: str): + """Token => token id""" + return self.text_tokenizer._convert_token_to_id(token) + + def DecodeIds(self, Ids): + """A list of 
token ids => recovered text string""" + return self.DecodeTokens([self.IdToToken(id) for id in Ids]) + + def DecodeTokens(self, tokens): + """A list of tokens => recovered text string""" + return self.text_tokenizer.convert_tokens_to_string(tokens) + + +class TextTokenizer(object): + """ + Interface for text tokenizer + """ + + def __init__(self): + if not hasattr(self, 'num_text_tokens'): + self.num_text_tokens = 0 + if not hasattr(self, 'num_tokens'): + self.num_tokens = self.num_text_tokens + + def __call__(self, text, process_fn=None): + return self.EncodeAsIds(text, process_fn) + + def __len__(self): + return self.num_text_tokens + + @property + def tokens(self): + """list (or iterable) of text tokens for text tokenizer""" + raise NotImplementedError( + 'TextTokenizer tokens property not implemented') + + @property + def vocab(self): + """dictionary mapping tokens to ids""" + raise NotImplementedError( + 'TextTokenizer vocab property not implemented') + + @staticmethod + def exists(model_path): + """check if the filepath for a text tokenizer exists""" + raise NotImplementedError( + 'TextTokenizer exists method not implemented') + + def Train(self, corpus): + """train a tokenizer on a data corpus and save model for future use""" + raise NotImplementedError('TextTokenizer Train not implemented') + + def EncodeAsIds(self, text, process_fn=None): + """ + Preprocess text and encode as ids. Return a tokenization object with + original text, processed text, and id tokenization. + """ + raise NotImplementedError('TextTokenizer EncodeAsIds not implemented') + + def EncodeAsTokens(self, text, process_fn=None): + """ + Preprocess text and encode as tokens. Return a tokenization object with + original text, processed text, and token tokenization. + """ + raise NotImplementedError( + 'TextTokenizer EncodeAsTokens not implemented') + + def IdToToken(self, Id): + """Convert an Id to Token. 
Reverse lookup of self.vocab""" + raise NotImplementedError('TextTokenizer IdToToken not implemented') + + def TokenToId(self, token): + """Convert a Token to Id. Lookup of self.vocab""" + raise NotImplementedError('TextTokenizer TokenToId not implemented') + + def DecodeIds(self, Ids): + """Convert a list or tokenization object of Ids to a text string""" + raise NotImplementedError('TextTokenizer DecodeIds not implemented') + + def DecodeTokens(self, Tokens): + """Convert a list or tokenization object of tokens to a text string""" + raise NotImplementedError('TextTokenizer DecodeTokens not implemented') \ No newline at end of file From 5279f8a0a6f14ab0cc4d4fb20fd54d0e0e9b2f1b Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 15:37:54 +0800 Subject: [PATCH 18/54] updated Signed-off-by: ftgreat --- flagai/data/tokenizer/galactica/galactica_tokenizer.py | 2 +- flagai/data/tokenizer/roberta/roberta_tokenizer.py | 2 +- flagai/data/tokenizer/t5/t5_tokenizer.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flagai/data/tokenizer/galactica/galactica_tokenizer.py b/flagai/data/tokenizer/galactica/galactica_tokenizer.py index 87a28412..fdaf5be6 100644 --- a/flagai/data/tokenizer/galactica/galactica_tokenizer.py +++ b/flagai/data/tokenizer/galactica/galactica_tokenizer.py @@ -14,7 +14,7 @@ def __init__(self, download_dir) -> None: self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), - CommandToken('ENC', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), + CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), CommandToken('MASK', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), diff --git a/flagai/data/tokenizer/roberta/roberta_tokenizer.py b/flagai/data/tokenizer/roberta/roberta_tokenizer.py index a525f2a6..553a8a83 100644 --- a/flagai/data/tokenizer/roberta/roberta_tokenizer.py 
+++ b/flagai/data/tokenizer/roberta/roberta_tokenizer.py @@ -37,7 +37,7 @@ def __init__(self, tokenizer_model_type="roberta-base", cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), - CommandToken('ENC', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), + CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), CommandToken('MASK', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), diff --git a/flagai/data/tokenizer/t5/t5_tokenizer.py b/flagai/data/tokenizer/t5/t5_tokenizer.py index 8774b3af..ef793b67 100644 --- a/flagai/data/tokenizer/t5/t5_tokenizer.py +++ b/flagai/data/tokenizer/t5/t5_tokenizer.py @@ -44,7 +44,7 @@ def __init__(self, tokenizer_model_type="t5-base", cache_dir=None): CommandToken('sep', '[SEP]', self.num_tokens), CommandToken('pad', '[PAD]', self.num_tokens + 1), - CommandToken('ENC', '[CLS]', self.num_tokens + 2), + CommandToken('cls', '[CLS]', self.num_tokens + 2), CommandToken('MASK', '[MASK]', self.num_tokens + 3), ] From c50c662e01473db1bdc36cc78c4c459fcfc49f3b Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 15:41:58 +0800 Subject: [PATCH 19/54] fixed inconsistency Signed-off-by: ftgreat --- .../data/tokenizer/uni_tokenizer/tokenizer.py | 384 ++++++++--- tests/test_tokenizer.py | 631 +++--------------- 2 files changed, 383 insertions(+), 632 deletions(-) diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index f94b9123..457c6412 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -22,7 +22,6 @@ import itertools import logging import torch -import json logger = logging.getLogger(__name__) from flagai.data.tokenizer.tokenizer import CommandToken from flagai.data.tokenizer.uni_tokenizer.wp_tokenizer import WordpieceTokenizer @@ 
-39,7 +38,6 @@ def is_control(ch): https://en.wikipedia.org/wiki/Control_character https://www.fileformat.info/info/unicode/category/Cc/index.htm https://www.fileformat.info/info/unicode/category/Cf/index.htm - """ return unicodedata.category(ch) in ('Cc', 'Cf') @@ -47,20 +45,22 @@ def is_control(ch): class Tokenizer(BaseTokenizer): def __init__(self, - add_block_symbols=False, + add_block_symbols=True, add_sentinel_token=0, add_task_mask=True, add_decoder_mask=False, - fix_command_token=False, - pre_tokenizer=None, + fix_command_token=True, **kwargs): super().__init__(**kwargs) + if self.tokenizer_class == "wp": - if self.tokenizer_model_name.lower().startswith('clip-cn'): - self.text_tokenizer = FullTokenizer(self.vocab_file) + if self.tokenizer_model_name.lower().endswith("ch"): + self.text_tokenizer = WordpieceTokenizer(self.vocab_file, + is_ch=True) + elif self.tokenizer_model_name.lower().startswith('clip-cn'): + self.text_tokenizer = FullTokenizer(self.vocab_file) else: - self.text_tokenizer = WordpieceTokenizer(self.vocab_file, is_ch=False) - # is_ch=self.tokenizer_model_name.lower().endswith("ch")) + self.text_tokenizer = WordpieceTokenizer(self.vocab_file) elif self.tokenizer_class == "bpe": if self.tokenizer_model_name.lower().startswith('clip'): self.text_tokenizer = MMBPETokenizer(self.vocab_file, @@ -69,83 +69,277 @@ def __init__(self, self.text_tokenizer = BPETokenizer(self.vocab_file, self.merges_file) elif self.tokenizer_class == "sp": - if self.tokenizer_model_name.lower().startswith('cpm1'): - from flagai.data.tokenizer.cpm_1.cpm1_tokenizer import CPMTokenizer - self.text_tokenizer = CPMTokenizer(self.tokenizer_json_file, self.sp_model_file) - elif self.tokenizer_model_name.lower().startswith('cpm3'): - from flagai.data.tokenizer.cpm_3.cpm3_tokenizer import CPMTokenizer - self.text_tokenizer = CPMTokenizer(self.tokenizer_json_file, self.sp_model_file) - else: - self.text_tokenizer = SentencePieceTokenizer(self.sp_model_file) + self.text_tokenizer 
= SentencePieceTokenizer(self.sp_model_file) else: raise NotImplementedError("cannot assign a tokenize class") - if self.tokenizer_model_name.lower().startswith('glm') or self.tokenizer_model_name.lower().startswith('alm'): - add_block_symbols=True + self.is_glm = self.tokenizer_model_name.lower().startswith('glm') # self.is_clip = self.tokenizer_model_name.startswith('clip') - self.num_tokens = self.text_tokenizer.vocab_size - try: - with open(self.special_tokens_map, encoding='utf8') as file: dct=json.load(file) - sp_tokens = [(k.replace("_token",""),v['content']) for k,v in dct.items()] - except FileNotFoundError: - dct = None - sp_tokens = [] - - self._command_tokens = [CommandToken(e[0], e[1], self.text_tokenizer.convert_token_to_id(e[1])) for e in sp_tokens] - - if self.tokenizer_model_name.lower().startswith("glm"): - if self.tokenizer_class == "wp": - self.text_tokenizer._token_cls = "[CLS]" - self.text_tokenizer._token_sep = "[SEP]" - fix_command_token = False - elif self.tokenizer_class == "sp": - fix_command_token = True + + if self.tokenizer_class == "wp": + # set command tokens from wordpiece tokenizer values + self.num_command_tokens = 6 + self.num_text_tokens = self.num_tokens - 5 + self.num_type_tokens = 2 + self.token_start_id = None + self.token_end_id = None + self.token_pad_id = None + try: self._command_tokens = [ - CommandToken('pad', '<|endoftext|>', self.num_tokens), - CommandToken('eos', '<|endoftext|>', self.num_tokens), - CommandToken('sep', '[SEP]', self.num_tokens + 1), - CommandToken('cls', '[CLS]', self.num_tokens + 2), - CommandToken('MASK', '[MASK]', self.num_tokens + 3, lstrip=True), - CommandToken('unk', '[UNK]', self.num_tokens + 4) + CommandToken( + 'pad', '[PAD]', + self.text_tokenizer.convert_token_to_id('[PAD]')), + CommandToken( + 'cls', '[CLS]', + self.text_tokenizer.convert_token_to_id('[CLS]')), + CommandToken( + 'MASK', '[MASK]', + self.text_tokenizer.convert_token_to_id('[MASK]')), + CommandToken( + 'unk', '[UNK]', + 
self.text_tokenizer.convert_token_to_id('[UNK]')), + CommandToken( + 'sep', '[SEP]', + self.text_tokenizer.convert_token_to_id('[SEP]')), + CommandToken( + 'eos', '[PAD]', + self.text_tokenizer.convert_token_to_id('[PAD]')), ] - self.num_tokens += 6 - elif self.tokenizer_class == "bpe": + self.token_start_id = self.text_tokenizer.convert_token_to_id( + '[CLS]') + self.token_end_id = self.text_tokenizer.convert_token_to_id( + '[SEP]') + self.token_pad_id = self.text_tokenizer.convert_token_to_id( + '[PAD]') + self.text_tokenizer._token_cls = "[CLS]" + self.text_tokenizer._token_sep = "[SEP]" + + except KeyError: self._command_tokens = [ - CommandToken('pad', '<|endoftext|>', - self.text_tokenizer.encoder['<|endoftext|>']), - CommandToken('eos', '<|endoftext|>', - self.text_tokenizer.encoder['<|endoftext|>']) + CommandToken( + 'pad', '[PAD]', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'cls', '[CLS]', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'MASK', '[MASK]', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'unk', '[UNK]', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'sep', '[SEP]', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'eos', '[PAD]', + self.text_tokenizer.convert_token_to_id('')), ] - self._command_tokens.extend([ - CommandToken('sop', '<|startofpiece|>', self.num_tokens), - CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1), - CommandToken('cls', '[CLS]', self.num_tokens + 2), - CommandToken('MASK', - '[MASK]', - self.num_tokens + 3, - lstrip=True), - CommandToken('sep', '[SEP]', self.num_tokens + 4), - CommandToken('unk', '[UNK]', self.num_tokens + 5) - ]) - self.num_tokens += 6 - if add_block_symbols: - if not self.tokenizer_class == "bpe": + self.token_start_id = self.text_tokenizer.convert_token_to_id( + '') + self.token_end_id = self.text_tokenizer.convert_token_to_id( + '') + self.token_pad_id = self.text_tokenizer.convert_token_to_id( + 
'') + self.text_tokenizer._token_cls = "" + self.text_tokenizer._token_sep = "" + if add_block_symbols: self.add_command_token('sop', '<|startofpiece|>') - self.add_command_token('eop', '<|endofpiece|>') - if add_task_mask: - if fix_command_token: - self.add_command_token('sMASK', '[sMASK]') - self.add_command_token('gMASK', '[gMASK]') - else: + self.add_command_token('eop', '<|endofpiece|>',) + if add_task_mask: self.add_command_token('gMASK', '[gMASK]') self.add_command_token('sMASK', '[sMASK]') - if add_decoder_mask: - self.add_command_token('dBLOCK', '[dBLOCK]') + if add_decoder_mask: + self.add_command_token('dBLOCK', '[dBLOCK]') if add_sentinel_token > 0: for i in range(1, add_sentinel_token): self.add_command_token(f'MASK{i}', f'[MASK{i}]') self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>') + elif self.tokenizer_class == "bpe": + if self.tokenizer_model_name.lower().startswith('roberta'): + self.num_command_tokens = 6 + self.num_text_tokens = self.num_tokens - 3 + self._command_tokens = [ + CommandToken( + 'pad', '<|endoftext|>', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'eos', '<|endoftext|>', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'sep', '[SEP]', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'cls', '[CLS]', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'MASK', + '[MASK]', + self.text_tokenizer.convert_token_to_id(''), + lstrip=True), + CommandToken( + 'unk', '[UNK]', + self.text_tokenizer.convert_token_to_id('')) + ] + if add_block_symbols: + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', + self.num_tokens), + CommandToken('eop', '<|endofpiece|>', + self.num_tokens + 1) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + self.token_end_id = self.text_tokenizer.convert_token_to_id( + '') + elif self.tokenizer_model_name.lower().startswith('clip'): + self.num_command_tokens = 2 + self._command_tokens = [ + CommandToken( + 'sot', 
'', + self.text_tokenizer.convert_token_to_id('')), + CommandToken( + 'eot', '', + self.text_tokenizer.convert_token_to_id('')), + ] + self.num_tokens += self.num_command_tokens + self.token_end_id = self.text_tokenizer.convert_token_to_id( + '') + else: + self.num_command_tokens = 2 + self.num_text_tokens = self.num_tokens - 1 + self._command_tokens = [ + CommandToken( + 'pad', '<|endoftext|>', + self.text_tokenizer.convert_token_to_id( + '<|endoftext|>')), + CommandToken( + 'eos', '<|endoftext|>', + self.text_tokenizer.convert_token_to_id( + '<|endoftext|>')) + ] + self.token_end_id = self.text_tokenizer.convert_token_to_id( + '<|endoftext|>') + if add_block_symbols: + if self.tokenizer_model_name.lower().startswith('glm'): + unk_token_id = self.num_tokens + 5 + cls_token_id = self.num_tokens + 2 + num_tokens_to_add = 5 + else: + unk_token_id = self.text_tokenizer.convert_token_to_id( + '<|endoftext|>') + cls_token_id = self.text_tokenizer.convert_token_to_id( + '<|endoftext|>') + num_tokens_to_add = 4 + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', + self.num_tokens), + CommandToken('eop', '<|endofpiece|>', + self.num_tokens + 1), + CommandToken('cls', '[CLS]', cls_token_id), + CommandToken('MASK', + '[MASK]', + self.num_tokens + 3, + lstrip=True), + CommandToken('sep', '[SEP]', self.num_tokens + 4), + CommandToken('unk', '[UNK]', unk_token_id) + ]) + self.num_tokens += num_tokens_to_add + self.num_command_tokens += 6 + + if add_block_symbols: + if add_task_mask: + self._command_tokens.extend([ + CommandToken('gMASK', + '[gMASK]', + self.num_tokens, + lstrip=True), + CommandToken('sMASK', + '[sMASK]', + self.num_tokens + 1, + lstrip=True) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_decoder_mask: + self._command_tokens.extend( + [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) + self.num_tokens += 1 + self.num_command_tokens += 1 + + elif self.tokenizer_class == "sp": + self.num_command_tokens = 0 + 
self.num_text_tokens = self.text_tokenizer.vocab_size + self.num_tokens = self.num_text_tokens + + if self.tokenizer_model_name.lower().startswith('glm'): + pad_token_id = self.num_tokens + eos_token_id = self.num_tokens + unk_token_id = self.num_tokens + 4 + else: + pad_token_id = self.text_tokenizer.convert_token_to_id('') + eos_token_id = self.text_tokenizer.convert_token_to_id('') + unk_token_id = self.text_tokenizer.convert_token_to_id('') + self._command_tokens = [ + CommandToken('pad', '<|endoftext|>', self.num_text_tokens), + CommandToken('eos', '<|endoftext|>', self.num_text_tokens), + CommandToken('sep', '[SEP]', self.num_text_tokens + 1), + CommandToken('cls', '[CLS]', self.num_text_tokens + 2), + CommandToken('MASK', + '[MASK]', + self.num_text_tokens + 3, + lstrip=True), + CommandToken('unk', '[UNK]', self.num_text_tokens + 4) + ] + + self.num_tokens += 5 + self.num_command_tokens += 6 + self.token_end_id = self.text_tokenizer.convert_token_to_id( + '') + if add_block_symbols: + sop_id = self.text_tokenizer.convert_token_to_id('<|startofpiece|>') + eop_id = self.text_tokenizer.convert_token_to_id('<|endofpiece|>') + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', + self.num_tokens + 1), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 2) + ]) + if fix_command_token: + self.num_tokens += 3 + else: + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_task_mask: + if fix_command_token: + self._command_tokens.extend([ + CommandToken('sMASK', + '[sMASK]', + self.num_tokens, + lstrip=True), + CommandToken('gMASK', + '[gMASK]', + self.num_tokens + 1, + lstrip=True) + ]) + else: + self._command_tokens.extend([ + CommandToken('gMASK', + '[gMASK]', + self.num_tokens, + lstrip=True), + CommandToken('sMASK', + '[sMASK]', + self.num_tokens + 1, + lstrip=True) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_decoder_mask: + self._command_tokens.extend( + [CommandToken('dBLOCK', '[dBLOCK]', 
self.num_tokens)]) + self.num_tokens += 1 + self.num_command_tokens += 1 self.command_name_map = {tok.name: tok for tok in self._command_tokens} self.command_token_map = { tok.token: tok @@ -153,19 +347,7 @@ def __init__(self, } self.command_id_map = {tok.Id: tok for tok in self._command_tokens} self._command_token_tokens = list(self.command_token_map.keys()) - vocab = self.text_tokenizer.get_vocab() - self.token_start_id = vocab.get('', None) - if not self.token_start_id: - self.token_start_id = vocab.get('[CLS]', None) - - self.token_end_id = vocab.get('', None) - if not self.token_end_id: - self.token_end_id = vocab.get('<|endoftext|>', None) - if not self.token_end_id: - self.token_end_id = vocab.get('[SEP]', None) - print("All special tokens: ", str([(k, v.token, v.Id) for k,v in self.command_name_map.items()])) - # logger.info("All special tokens: %s", str([(k,v.Id) for k,v in self.command_name_map.items()])) - + logger.info("All special tokens: %s", str([(k,v.Id) for k,v in self.command_name_map.items()])) def get_vocab(self): return self.text_tokenizer.get_vocab() @@ -175,10 +357,13 @@ def get_command_id(self, name): return self.command_name_map[name].Id def add_command_token(self, name, token): - self._command_tokens.append(CommandToken(name, token, self.num_tokens)) - self.num_tokens += 1 + try: + id = self.text_tokenizer.convert_token_to_id(token) + except KeyError: + id = self.num_tokens + self.num_tokens += 1 + self._command_tokens.append(CommandToken(name, token, id)) return - def rematch(self, text, tokens): """output the mapping relation between raw text and tokenizezd text """ @@ -235,6 +420,8 @@ def convert_tokens_to_ids(self, tokens): return res def convert_ids_to_tokens(self, ids): + # if torch.is_tensor(ids): + # ids = ids.tolist() res = [] for id in ids: if id in self.command_id_map: @@ -377,8 +564,7 @@ def encode_plus_non_glm( truncation=True, max_length=None, ): - if self.tokenizer_model_name.lower().startswith('t5'): - assert second_text is 
None, "t5 does not support multi-sentence encoding" + def get_input_ids(text): tokens = self.text_tokenizer.tokenize(text) return self.text_tokenizer.convert_tokens_to_ids(tokens) @@ -453,8 +639,6 @@ def encode_plus( # for Seq2seq "alm"): return self.encode_plus_non_glm(source_text, second_text, truncation, max_length) - elif self.tokenizer_model_name.lower().startswith("opt"): - return None sop_id = self.get_command_id('sop') # start of piece eop_id = self.get_command_id('eop') # end of piece sep_id = self.get_command_id('sep') # seperation @@ -524,14 +708,12 @@ def truncate_sequence(max_length, def tokenize_as_tensor(self, texts): """ Returns the tokenized representation of given input string(s) - Parameters ---------- texts : Union[str, List[str]] An input string or a list of input strings to tokenize context_length : int The context length to use; all CLIP models use 77 as the context length - Returns ------- A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] @@ -542,25 +724,7 @@ def tokenize_as_tensor(self, texts): sot_token=sot_token, eot_token=eot_token) - def tokenize_t5(self, text, *arg, **kwargs): - split_tokens = [] - for token in self.pre_tokenizer(text): - if token in self.vocab: - split_tokens.append(token) - else: - split_tokens.extend(self.text_tokenizer.tokenize(token)) - return split_tokens - def tokenize(self, text, maxlen=None, add_spatial_tokens=False): - """ - add_spatial_token: (bool) Add cls at the front and sep at the end - max_len: Truncate the length to max_len - """ - if self.tokenizer_model_name.lower().startswith('t5'): - import jieba - self.pre_tokenizer = lambda x: jieba.cut(x, HMM=False) - return self.tokenize_t5(text) - tokens = self.text_tokenizer.tokenize(text) if add_spatial_tokens: diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 1e7f9721..c897f3e7 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,525 +1,112 @@ # Copyright © 2022 
BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") -# coding=utf-8 -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Utilities for using and training tokenizers (char, wordpiece, sentencepiece)""" -from collections import namedtuple -import itertools - -print_rank_0 = print -"""define some default command tokens for the tokenizer to use""" -token_format = "<{0}>" - -COMMAND_TUPLE = namedtuple('CommandToken', ('name', 'token', 'Id')) - - -def prep_command_tokens(tokenlist, token_format=token_format): - return [ - CommandToken(tok[0], token_format.format(tok[0]), tok[1]) - for tok in tokenlist - ] - - -class CommandToken(object): - - def __init__(self, name, token, Id, lstrip=False, rstrip=False): - self.name = name - self.token = token - self.Id = Id - self.lstrip = lstrip - self.rstrip = rstrip - - def __str__(self): - return str(COMMAND_TUPLE(self.name, self.token, self.Id)) - - -DEFAULT_COMMAND_TOKENS = [ - ('pad', 0), - ('eos', 1), - ('bos', 2), - ('unk', 3), - ('sep', 4), - ('L2R', 5), - ('cls', 6), - ('MASK', 7), -] -DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS) -"""define some default type tokens for bert training""" - -TYPE_TUPLE = namedtuple('TypeToken', ('name', 'token', 'Id')) - - -def prep_type_tokens(tokenlist, token_format=token_format): - return [ - TypeToken(tok[0], token_format.format(tok[0]), tok[1]) - for tok in tokenlist 
- ] - - -class TypeToken(object): - - def __init__(self, name, token, Id): - self.name = name - self.token = token - self.Id = Id - - def __str__(self): - return str(TYPE_TUPLE(self.name, self.token, self.Id)) - - -DEFAULT_TYPE_TOKENS = [ - ('function', 0), - ('command', 1), - ('str0', 2), - ('str1', 3), - ('str2', 4), - ('embedding0', 5), - ('embedding1', 6), - ('embedding2', 7), - ('arg0', 8), - ('arg1', 9), - ('arg2', 10), -] -DEFAULT_TYPE_TOKENS = prep_type_tokens(DEFAULT_TYPE_TOKENS) - - - - - -class GLMTokenizer(object): - """ - Tokenizer object that handles text tokenization, command tokens, and type tokens. - Command tokens and text tokens are stored together in one mapping of size - `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as first - `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`. - Token types are stored in a separate mapping of size `len(type_tokens)`. - """ - - def __init__(self, text_tokenizer, command_tokens=None, type_tokens=None): - # set text tokenizer - self.text_tokenizer = text_tokenizer - if not hasattr(self, 'num_text_tokens'): - self.num_text_tokens = len(self.text_tokenizer) - - # set command tokens - if command_tokens is None: - command_tokens = DEFAULT_COMMAND_TOKENS - self._command_tokens = command_tokens - self.command_name_map = {tok.name: tok for tok in self._command_tokens} - self.command_token_map = { - tok.token: tok - for tok in self._command_tokens - } - self.command_id_map = {tok.Id: tok for tok in self._command_tokens} - if not hasattr(self, 'num_command_tokens'): - self.num_command_tokens = len(self._command_tokens) - if not hasattr(self, 'num_tokens'): - self.num_tokens = self.num_command_tokens + self.num_text_tokens - - # set type tokens - if type_tokens is None: - type_tokens = DEFAULT_TYPE_TOKENS - self.type_tokens = type_tokens - self.type_name_map = {tok.name: tok for tok in self.type_tokens} - self.type_token_map = {tok.token: tok for tok in self.type_tokens} - 
self.type_id_map = {tok.Id: tok for tok in self.type_tokens} - if not hasattr(self, 'num_type_tokens'): - self.num_type_tokens = len(self.type_tokens) - - # parse tokens and vocabs from tokenizer - self._tokens = list(self.command_token_map.keys()) + list( - self.text_tokenizer.tokens) - self._vocab = {t: Id for Id, t in self.command_id_map.items()} - self._vocab.update({ - t: Id + self.num_command_tokens - for t, Id in self.text_tokenizer.vocab.items() - }) - - self._text_tokens = list(self.text_tokenizer.tokens) - self._text_token_vocab = { - t: Id + self.num_command_tokens - for t, Id in self.text_tokenizer.vocab.items() - } - - self._command_token_tokens = list(self.command_token_map.keys()) - self._command_token_vocab = { - t: Id - for Id, t in self.command_id_map.items() - } - - self._token_types = list(self.type_token_map.keys()) - self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} - - def __call__(self, text, process_fn=None): - """run preprocessing and encode text as Ids""" - return self.EncodeAsIds(text, process_fn=process_fn) - - def __len__(self): - """total number of tokens""" - return self.num_tokens - - def get_command_id(self, name): - """get command token corresponding to `name`""" - return self.command_name_map[name] - - def get_type(self, name): - """get type token corresponding to `name`""" - return self.type_name_map[name] - - @property - def tokens(self): - """list (or iterable) of all tokens for tokenizer""" - return self._tokens - - @property - def vocab(self): - """dictionary mapping tokens to ids for tokenizer""" - return self._vocab - - @property - def token_types(self): - """list (or iterable) of all token types for tokenizer""" - return self._token_types - - @property - def token_type_vocab(self): - """dictionary mapping token types to ids for tokenizer""" - return self._token_type_vocab - - @property - def command_tokens(self): - """list (or iterable) of all command tokens for tokenizer""" - return 
self._command_token_tokens - - @property - def command_token_vocab(self): - """dictionary mapping command tokens to ids for tokenizer""" - return self._command_token_vocab - - @property - def text_tokens(self): - """list (or iterable) of text tokens for text tokenizer""" - return self._text_tokens - - @property - def text_token_vocab(self): - """dictionary mapping text tokens to ids for text tokenizer""" - return self._text_token_vocab - - def EncodeAsIds(self, text, process_fn=None): - """ - encode text using text tokenizer and shift Id values for command tokens - """ - processed_text = text - if process_fn is not None: - processed_text = process_fn(processed_text) - - def split_on_token(tok_extended: CommandToken, text): - result = [] - tok = tok_extended.token - split_text = text.split(tok) - for i, sub_text in enumerate(split_text): - # CommandToken can control whitespace stripping around them. - # We use them for GPT2 and Roberta to have different behavior depending on the special token - # Cf. 
https://github.com/huggingface/transformers/pull/2778 - # and https://github.com/huggingface/transformers/issues/3788 - # Strip white spaces on the right - if tok_extended.rstrip and i > 0: - # A bit counter-intuitive but we strip the left of the string - # since tok_extended.rstrip means the special token is eating all white spaces on its right - sub_text = sub_text.lstrip() - # Strip white spaces on the left - if tok_extended.lstrip and i < len(split_text) - 1: - sub_text = sub_text.rstrip() # Opposite here - - if i == 0 and not sub_text: - result.append(tok) - elif i == len(split_text) - 1: - if sub_text: - result.append(sub_text) - else: - pass - else: - if sub_text: - result.append(sub_text) - result.append(tok) - return result - - def split_on_tokens(tok_list, text): - if not text.strip(): - return [] - if not tok_list: - return self.text_tokenizer.encode(text) - - tokenized_text = [] - text_list = [text] - for tok in tok_list: - tokenized_text = [] - for sub_text in text_list: - if sub_text not in self._command_token_tokens: - tokenized_text.extend(split_on_token(tok, sub_text)) - else: - tokenized_text.append(sub_text) - text_list = tokenized_text - - return list( - itertools.chain.from_iterable( - (self._encode(token) - if token not in self._command_token_tokens else - [self.command_token_map[token].Id] - for token in tokenized_text))) - - no_split_tokens = self._command_tokens - Ids = split_on_tokens(no_split_tokens, processed_text) - return Ids - - def _encode(self, text): - raise NotImplementedError - - def EncodeAsTokens(self, text, process_fn=None): - """ - encode text as tokens using text tokenizer - """ - tokenization = self.text_tokenizer.EncodeAsTokens( - text, process_fn=process_fn) - tokenization.set_command_tokens(self._command_tokens) - return tokenization - - def IdToToken(self, Id, type_token=False): - """convert Id to token accounting for command and type tokens""" - if isinstance(Id, (TypeToken, CommandToken)): - return Id.token - if 
type_token: - return self.type_id_map[Id].token - if Id < self.num_command_tokens: - return self.command_id_map[Id].token - return self.text_tokenizer.IdToToken(Id - self.num_command_tokens) - - def TokenToId(self, token, type_token=False): - """convert token to Id accounting for command and type tokens""" - if isinstance(token, (TypeToken, CommandToken)): - return token.Id - if type_token: - return self.type_token_map[token].Id - if token in self.command_token_map: - return self.command_token_map[token].Id - return self.text_tokenizer.TokenToId(token) + self.num_command_tokens - - def DecodeIds(self, Ids, type_token=False): - """ - convert Ids to tokens accounting for command and type tokens, tokens - are joined and returned as a string. - """ - if type_token: - return ' '.join(Id.token if isinstance(Id, TypeToken) else self. - type_id_map[Id].token for Id in Ids) - rtn_strs = [] - current_str = [] - for Id in Ids: - if isinstance(Id, CommandToken): - rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) - current_str = [] - rtn_strs.append(Id.token) - elif Id < self.num_command_tokens: - rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) - current_str = [] - rtn_strs.append(self.command_id_map[Id].token) - else: - current_str.append(Id - self.num_command_tokens) - if current_str != []: - rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) - return ' '.join(rtn_strs) - - def DecodeTokens(self, Tokens, type_token=False): - """ - convert tokens to a string accounting for command and type tokens. 
- """ - if type_token: - return ' '.join(t.token if isinstance(t, TypeToken) else t - for t in Tokens) - rtn_strs = [] - current_str = [] - for t in Tokens: - if isinstance(t, CommandToken): - rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) - current_str = [] - rtn_strs.append(t.token) - elif t in self.command_token_map: - rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) - current_str = [] - rtn_strs.append(t) - else: - current_str.append(t) - if current_str != []: - rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) - return ' '.join(rtn_strs) - - - - -class Tokenizer(object): - """ - Tokenizer object that handles text tokenization, command tokens, and type tokens. - Command tokens and text tokens are stored together in one mapping of size - `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as first - `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`. - Token types are stored in a separate mapping of size `len(type_tokens)`. 
- """ - - def __init__(self, text_tokenizer): - # set text tokenizer - self.text_tokenizer = text_tokenizer - if not hasattr(self, 'num_text_tokens'): - self.num_text_tokens = len(self.text_tokenizer) - - # parse tokens and vocabs from tokenizer - self._tokens = list(self.text_tokenizer.tokens) - self._vocab = {t: Id for t, Id in self.text_tokenizer.vocab.items()} - - self._text_tokens = list(self.text_tokenizer.tokens) - self._text_token_vocab = { - t: Id - for t, Id in self.text_tokenizer.vocab.items() - } - - def __call__(self, text, process_fn=None): - """run preprocessing and encode text as Ids""" - return self.EncodeAsIds(text, process_fn=process_fn) - - def __len__(self): - """total number of tokens""" - return self.num_tokens - - @property - def tokens(self): - """list (or iterable) of all tokens for tokenizer""" - return self._tokens - - @property - def vocab(self): - """dictionary mapping tokens to ids for tokenizer""" - return self._vocab - - @property - def text_tokens(self): - """list (or iterable) of text tokens for text tokenizer""" - return self._text_tokens - - @property - def text_token_vocab(self): - """dictionary mapping text tokens to ids for text tokenizer""" - return self._text_token_vocab - - def EncodeAsIds(self, text: str, process_fn=None): - """Input text string => a list of token ids""" - processed_text = text - if process_fn is not None: - processed_text = process_fn(processed_text) - - tokens = self.EncodeAsTokens(processed_text, process_fn=process_fn) - Ids = [self.TokenToId(token) for token in tokens] - return Ids - - def EncodeAsTokens(self, text: str, process_fn=None): - """Input text string => a list of tokens""" - return self.text_tokenizer._tokenize(text) - - def IdToToken(self, Id: int): - """Token id => token""" - return self.text_tokenizer._convert_id_to_token(Id) - - def TokenToId(self, token: str): - """Token => token id""" - return self.text_tokenizer._convert_token_to_id(token) - - def DecodeIds(self, Ids): - """A list of 
token ids => recovered text string""" - return self.DecodeTokens([self.IdToToken(id) for id in Ids]) - - def DecodeTokens(self, tokens): - """A list of tokens => recovered text string""" - return self.text_tokenizer.convert_tokens_to_string(tokens) - - -class TextTokenizer(object): - """ - Interface for text tokenizer - """ - - def __init__(self): - if not hasattr(self, 'num_text_tokens'): - self.num_text_tokens = 0 - if not hasattr(self, 'num_tokens'): - self.num_tokens = self.num_text_tokens - - def __call__(self, text, process_fn=None): - return self.EncodeAsIds(text, process_fn) - - def __len__(self): - return self.num_text_tokens - - @property - def tokens(self): - """list (or iterable) of text tokens for text tokenizer""" - raise NotImplementedError( - 'TextTokenizer tokens property not implemented') - - @property - def vocab(self): - """dictionary mapping tokens to ids""" - raise NotImplementedError( - 'TextTokenizer vocab property not implemented') - - @staticmethod - def exists(model_path): - """check if the filepath for a text tokenizer exists""" - raise NotImplementedError( - 'TextTokenizer exists method not implemented') - - def Train(self, corpus): - """train a tokenizer on a data corpus and save model for future use""" - raise NotImplementedError('TextTokenizer Train not implemented') - - def EncodeAsIds(self, text, process_fn=None): - """ - Preprocess text and encode as ids. Return a tokenization object with - original text, processed text, and id tokenization. - """ - raise NotImplementedError('TextTokenizer EncodeAsIds not implemented') - - def EncodeAsTokens(self, text, process_fn=None): - """ - Preprocess text and encode as tokens. Return a tokenization object with - original text, processed text, and token tokenization. - """ - raise NotImplementedError( - 'TextTokenizer EncodeAsTokens not implemented') - - def IdToToken(self, Id): - """Convert an Id to Token. 
Reverse lookup of self.vocab""" - raise NotImplementedError('TextTokenizer IdToToken not implemented') - - def TokenToId(self, token): - """Convert a Token to Id. Lookup of self.vocab""" - raise NotImplementedError('TextTokenizer TokenToId not implemented') - - def DecodeIds(self, Ids): - """Convert a list or tokenization object of Ids to a text string""" - raise NotImplementedError('TextTokenizer DecodeIds not implemented') - - def DecodeTokens(self, Tokens): - """Convert a list or tokenization object of tokens to a text string""" - raise NotImplementedError('TextTokenizer DecodeTokens not implemented') \ No newline at end of file +import unittest +from flagai.data.tokenizer import Tokenizer +from flagai.auto_model.auto_loader import AutoLoader +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") + +class TokenizerTestCase(unittest.TestCase): + + def test_tokenizer_GLM_large_ch(self): + tokenizer = Tokenizer.from_pretrained("GLM-large-ch") + self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') + self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + [3378, 1567, 2613, 20282], 'EncodeAsIds Error') + self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), + '今天吃饭吃了肯德基', 'DecodeIds Error') + + def test_tokenizer_GLM_large_en(self): + tokenizer = Tokenizer.from_pretrained("GLM-large-en") + self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + [13017, 7975, 3084, 2033, 3407], '') + self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + 'fried chicken makes me happy', 'DecodeIds Error') + + # def test_tokenizer_glm_10b_en(self): + # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") + # self.assertEqual(tokenizer.TokenToId("day"), 820, '') + # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + # [25520, 9015, 1838, 502, 3772], '') + # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), + # 
'fried chicken makes me happy', 'DecodeIds Error') + + def test_tokenizer_t5(self): + tokenizer = Tokenizer.from_pretrained('t5-base-en') + self.assertEqual(tokenizer.TokenToId("day"), 1135, '') + self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + [3, 7704, 3832, 656, 140, 1095], '') + self.assertEqual(tokenizer.DecodeIds([3, 7704, 3832, 656, 140, 1095]), + 'fried chicken makes me happy', 'DecodeIds Error') + + def test_tokenizer_roberta(self): + tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') + # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) + self.assertEqual(tokenizer.TokenToId("人"), 782, '') + self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') + self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), + '今天吃饭吃了肯德基', 'DecodeIds Error') + + def test_tokenizer_bert(self): + tokenizer = Tokenizer.from_pretrained('BERT-base-en') + self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + [13017, 7975, 3084, 2033, 3407], '') + self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + 'fried chicken makes me happy', 'DecodeIds Error') + + def test_tokenizer_cpm1(self): + loader = AutoLoader(task_name="lm", + model_name="CPM-large-ch", + model_dir="./checkpoints/", + only_download_config=True) + tokenizer = loader.get_tokenizer() + self.assertEqual(tokenizer.encode("day"), [8, 8275], '') + self.assertEqual(tokenizer.encode("fried chicken makes me happy"), + [2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239], '') + self.assertEqual(tokenizer.decode([2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239]), + 'fried chicken makes me happy', 'DecodeIds Error') + + def test_tokenizer_opt(self): + tokenizer = Tokenizer.from_pretrained('opt-125m-en') + self.assertEqual(tokenizer.encode("day"), [1208], '') + 
self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], + [50260, 21209, 5884, 817, 162, 1372, 50260], '') + self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), + 'fried chicken makes me happy', 'DecodeIds Error') + + def test_tokenizer_clip(self): + loader = AutoLoader(task_name="txt_img_matching", + model_name="clip-base-p32-224") + tokenizer = loader.get_tokenizer() + self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') + + def test_tokenizer_evaclip(self): + loader = AutoLoader(task_name="txt_img_matching", + model_name="eva-clip") + tokenizer = loader.get_tokenizer() + self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) + suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) + # suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) + suite.addTest(TokenizerTestCase('test_tokenizer_t5')) + suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) + suite.addTest(TokenizerTestCase('test_tokenizer_bert')) + suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) + suite.addTest(TokenizerTestCase('test_tokenizer_opt')) + suite.addTest(TokenizerTestCase('test_tokenizer_clip')) + suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) + + return suite + + +if __name__ == '__main__': + runner = unittest.TextTestRunner() + runner.run(suite()) \ No newline at end of file From 3c990371ae8ba7bfcce5567111b0e79c028f6e47 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 15:43:30 +0800 Subject: [PATCH 20/54] updated Signed-off-by: ftgreat --- tests/test_tokenizer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index c897f3e7..87b7fa63 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -4,8 +4,6 @@ import unittest from 
flagai.data.tokenizer import Tokenizer from flagai.auto_model.auto_loader import AutoLoader -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") class TokenizerTestCase(unittest.TestCase): From 173ce998c5020024ee47f547bd27320a571b71ca Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 15:45:31 +0800 Subject: [PATCH 21/54] new version Signed-off-by: ftgreat --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f4a690c5..71b9c6f2 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flagai", - version="v1.6.0", + version="v1.6.2", description="FlagAI aims to help researchers and developers to freely train and test large-scale models for NLP/CV/VL tasks.", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", From 948e3f93c7208a4eaf71132835db8d697b0ca6b5 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 15:52:07 +0800 Subject: [PATCH 22/54] removed unused bminf Signed-off-by: ftgreat --- flagai/model/galactica_model.py | 2 +- flagai/model/gpt2_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flagai/model/galactica_model.py b/flagai/model/galactica_model.py index f94e62f4..6e28fddb 100644 --- a/flagai/model/galactica_model.py +++ b/flagai/model/galactica_model.py @@ -22,7 +22,7 @@ from torch.nn import CrossEntropyLoss from flagai.model.layers.activations import ACT2FN from flagai.model.gpt2_model import GPT2Model, GPT2Stack, GPT2Config -import bminf +# import bminf class OPTLearnedPositionalEmbedding(nn.Embedding): diff --git a/flagai/model/gpt2_model.py b/flagai/model/gpt2_model.py index 15197888..96f8781e 100644 --- a/flagai/model/gpt2_model.py +++ b/flagai/model/gpt2_model.py @@ -9,7 +9,7 @@ from flagai.model.utils import normal_init_method from flagai.model.base_model import BaseModel import torch.nn.functional as F -import bminf +# import bminf if os.getenv('ENV_TYPE') == 
'deepspeed+mpu': from flagai.mpu.utils import divide from flagai.mpu.random import checkpoint From df91be4bee888dc6b0eebfe3e1981377c9d717d8 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 16:41:56 +0800 Subject: [PATCH 23/54] modified according to comments Signed-off-by: ftgreat --- examples/bminf_generate/galactica_6.7b_generate.py | 2 +- examples/glm_blank_filling/glm_generate_samples.py | 7 ++++--- examples/gpt2_title_generation/deepspeed.json | 2 +- examples/gpt2_title_generation/train_multi_gpu.py | 2 +- examples/t5_title_generation/generate.py | 14 +++++++------- flagai/model/galactica_model.py | 1 - flagai/model/gpt2_model.py | 8 -------- flagai/mp_tools.py | 7 +++---- setup.py | 2 +- 9 files changed, 18 insertions(+), 27 deletions(-) diff --git a/examples/bminf_generate/galactica_6.7b_generate.py b/examples/bminf_generate/galactica_6.7b_generate.py index 15b1b068..29e8d8df 100644 --- a/examples/bminf_generate/galactica_6.7b_generate.py +++ b/examples/bminf_generate/galactica_6.7b_generate.py @@ -9,7 +9,7 @@ loader = AutoLoader(task_name="lm", model_name="galactica-6.7b-en", - model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/") + model_dir="./checkpoints/") model = loader.get_model() with torch.cuda.device(0): diff --git a/examples/glm_blank_filling/glm_generate_samples.py b/examples/glm_blank_filling/glm_generate_samples.py index 603c1127..f6266887 100644 --- a/examples/glm_blank_filling/glm_generate_samples.py +++ b/examples/glm_blank_filling/glm_generate_samples.py @@ -1,6 +1,7 @@ # Copyright © 2022 BAAI. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License") + import torch from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer @@ -20,7 +21,7 @@ model.cuda(torch.cuda.current_device()) predictor = Predictor(model, tokenizer) - + # generate samples text = [ '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]" ] @@ -34,7 +35,7 @@ output = predictor.predict_generate_randomsample( t, top_k=50, repetition_penalty=4.0, top_p=1.0) print(t, '\n', output) - + # text = [ "人工智能是一个以计算机科学为基础,由计算机、数学、哲学等多学科交叉融合的交叉学科,[sMASK],具有非常巨大的前景。", "最近十多年来,人工神经网络的研究工作不断深入,已经取得了很大的进展,[sMASK],表现出了良好的智能特性。" @@ -42,4 +43,4 @@ for t in text: output = predictor.predict_generate_randomsample( t, top_k=50, repetition_penalty=4.0, top_p=1.0) - print(t, '\n', output) + print(t, '\n', output) \ No newline at end of file diff --git a/examples/gpt2_title_generation/deepspeed.json b/examples/gpt2_title_generation/deepspeed.json index 36810822..c18e41e6 100644 --- a/examples/gpt2_title_generation/deepspeed.json +++ b/examples/gpt2_title_generation/deepspeed.json @@ -4,7 +4,7 @@ "steps_per_print": 50, "gradient_clipping": 1.0, "zero_optimization": { - "stage": 2, + "stage": 1, "contiguous_gradients": false, "overlap_comm": true, "reduce_scatter": true, diff --git a/examples/gpt2_title_generation/train_multi_gpu.py b/examples/gpt2_title_generation/train_multi_gpu.py index d6e8d563..5173066e 100644 --- a/examples/gpt2_title_generation/train_multi_gpu.py +++ b/examples/gpt2_title_generation/train_multi_gpu.py @@ -11,7 +11,7 @@ # device = torch.device("cpu") # single gpu trainer = Trainer( - env_type="deepspeed+mpu", + env_type="pytorchDDP", experiment_name="roberta_seq2seq", batch_size=1, gradient_accumulation_steps=1, diff --git a/examples/t5_title_generation/generate.py b/examples/t5_title_generation/generate.py index ea20a285..53bbceca 100644 --- a/examples/t5_title_generation/generate.py +++ b/examples/t5_title_generation/generate.py @@ -17,12 
+17,12 @@ beam_size=3, input_max_length=512, out_max_length=100) -# out_2 = predictor.predict_generate_randomsample(text, -# input_max_length=512, -# out_max_length=100, -# repetition_penalty=1.5, -# top_k=20, -# top_p=0.8) + out_2 = predictor.predict_generate_randomsample(text, + input_max_length=512, + out_max_length=100, + repetition_penalty=1.5, + top_k=20, + top_p=0.8) print(f"out_1 is {out_1}") -# print(f"out_2 is {out_2}") + print(f"out_2 is {out_2}") diff --git a/flagai/model/galactica_model.py b/flagai/model/galactica_model.py index 6e28fddb..98e1f9cb 100644 --- a/flagai/model/galactica_model.py +++ b/flagai/model/galactica_model.py @@ -22,7 +22,6 @@ from torch.nn import CrossEntropyLoss from flagai.model.layers.activations import ACT2FN from flagai.model.gpt2_model import GPT2Model, GPT2Stack, GPT2Config -# import bminf class OPTLearnedPositionalEmbedding(nn.Embedding): diff --git a/flagai/model/gpt2_model.py b/flagai/model/gpt2_model.py index 96f8781e..b0277667 100644 --- a/flagai/model/gpt2_model.py +++ b/flagai/model/gpt2_model.py @@ -9,7 +9,6 @@ from flagai.model.utils import normal_init_method from flagai.model.base_model import BaseModel import torch.nn.functional as F -# import bminf if os.getenv('ENV_TYPE') == 'deepspeed+mpu': from flagai.mpu.utils import divide from flagai.mpu.random import checkpoint @@ -112,10 +111,6 @@ def __init__(self, config): self.drop = nn.Dropout(config.embd_pdrop) self.project_in = None self.project_out = None - # self.h = bminf.TransformerBlockList([ - # GPT2Block(config.n_ctx, config, scale=True) - # for _ in range(config.n_layer) - # ],[0]) self.h = nn.ModuleList([ GPT2Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer) @@ -279,9 +274,6 @@ def __init__(self, config, **kwargs): self.parallel_output = True self.transformer = GPT2Stack(config_gpt) - # self.lm_head = bminf.QuantizedLinear(nn.Linear(config_gpt.n_embd, - # config_gpt.vocab_size, - # bias=False)) self.lm_head = 
nn.Linear(config_gpt.n_embd, config_gpt.vocab_size, bias=False) diff --git a/flagai/mp_tools.py b/flagai/mp_tools.py index 774159c5..f4dfff0b 100644 --- a/flagai/mp_tools.py +++ b/flagai/mp_tools.py @@ -7,7 +7,7 @@ import copy from_1_to_n_models = { - "gpt2": { + "gpt": { "wte.weight": 0, "attn.c_attn.weight": 30, "attn.c_attn.bias": 30, @@ -238,8 +238,7 @@ def change_pytorch_model_mp_from_1_to_n_new(model_name_brief, checkpoint: str, t d = d["module"] for k, v in d.items(): - if len(v.shape)>2: - continue + assert len(v.shape) < 3 flag = 0 for keys in trans_keys: if keys in k: @@ -413,4 +412,4 @@ def change_pytorch_model_mp_from_n_to_1(model_name_brief, checkpoint): if __name__ == "__main__": change_pytorch_model_mp_from_1_to_n( - '/mnt/test_10b_models/state_dict/GLM-10b-en', 2) + '/mnt/test_10b_models/state_dict/GLM-10b-en', 2) \ No newline at end of file diff --git a/setup.py b/setup.py index 71b9c6f2..f4a690c5 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flagai", - version="v1.6.2", + version="v1.6.0", description="FlagAI aims to help researchers and developers to freely train and test large-scale models for NLP/CV/VL tasks.", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", From f27f9e91dbbd597f6f57aed6af47778013ad5d15 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 16:56:20 +0800 Subject: [PATCH 24/54] updated Signed-off-by: ftgreat --- examples/gpt2_title_generation/train_multi_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gpt2_title_generation/train_multi_gpu.py b/examples/gpt2_title_generation/train_multi_gpu.py index 5173066e..a0ed862d 100644 --- a/examples/gpt2_title_generation/train_multi_gpu.py +++ b/examples/gpt2_title_generation/train_multi_gpu.py @@ -30,7 +30,7 @@ num_nodes=1, num_gpus=2, checkpoint_activations=False, - model_parallel_size=2, + model_parallel_size=1, hostfile='./hostfile', 
deepspeed_config='./deepspeed.json', training_script=__file__, From bd8d6573bd312a83ff2209cb695affc2a146de2c Mon Sep 17 00:00:00 2001 From: ftgreat Date: Thu, 2 Mar 2023 18:11:51 +0800 Subject: [PATCH 25/54] updated Signed-off-by: ftgreat --- README.md | 1 + README_zh.md | 1 + setup.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7dee911d..2448663f 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ FlagAI (Fast LArge-scale General AI models) is a fast, easy-to-use and extensibl The code is partially based on [GLM](https://github.com/THUDM/GLM), [Transformers](https://github.com/huggingface/transformers),[timm](https://github.com/rwightman/pytorch-image-models) and [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM). ## News +- [2 Mar 2023] release v1.6.1, Support Galactica model [#234](https://github.com/FlagAI-Open/FlagAI/pull/234); BMInf, a low-resource inference package [#238](https://github.com/FlagAI-Open/FlagAI/pull/238), and examples for p-tuning [#227](https://github.com/FlagAI-Open/FlagAI/pull/238) - [12 Jan 2023] release v1.6.0, support a new parallel lib called [**BMTrain**](https://github.com/OpenBMB/BMTrain) and integate [**Flash Attention**](https://github.com/HazyResearch/flash-attention) to speedup training of Bert and Vit models, examples in [FlashAttentionBERT](https://github.com/FlagAI-Open/FlagAI/blob/master/examples/bert_title_generation_english/train_flash_atten.py) and [FlashAttentionViT](https://github.com/FlagAI-Open/FlagAI/blob/master/examples/vit_cifar100/train_single_gpu_flash_atten.py). Also add the contrastive search based text generation method [**SimCTG**](https://github.com/yxuansu/SimCTG) and DreamBooth finetuning based on AltDiffusion, examples in [AltDiffusionNaruto](https://github.com/FlagAI-Open/FlagAI/blob/master/examples/AltDiffusion/dreambooth.py). 
- [28 Nov 2022] release v1.5.0, support 1.1B [**EVA-CLIP**](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/EVA_CLIP) and [ALM: A large Arabic Language Model based on GLM], examples in [**ALM**](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/ALM) - [10 Nov 2022] release v1.4.0, support [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679v1), examples in [**AltCLIP**](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/AltCLIP) and [**AltDiffusion**](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/AltDiffusion) diff --git a/README_zh.md b/README_zh.md index 2ab9eff9..eaf54562 100644 --- a/README_zh.md +++ b/README_zh.md @@ -21,6 +21,7 @@ 本项目的部分代码基于[GLM](https://github.com/THUDM/GLM),[Transformers](https://github.com/huggingface/transformers),[timm](https://github.com/rwightman/pytorch-image-models) 和 [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM). ## 动态 +- [2 Mar 2023] 支持v1.6.1版本, 增加Galactica模型 [#234](https://github.com/FlagAI-Open/FlagAI/pull/234), 大模型推理的低资源工具包BMInf [#238](https://github.com/FlagAI-Open/FlagAI/pull/238), 以及P-tuning样例 [#227](https://github.com/FlagAI-Open/FlagAI/pull/238) - [12 Jan 2023] 发布v1.6.0版本, 新增支持并行训练库 [**BMTrain**](https://github.com/OpenBMB/BMTrain) 以及集成 [**Flash Attention**](https://github.com/HazyResearch/flash-attention) 到 Bert 和 Vit 模型提速端到端训练, 示例见 [FlashAttentionBERT](https://github.com/FlagAI-Open/FlagAI/blob/master/examples/bert_title_generation_english/train_flash_atten.py)和 [FlashAttentionViT](https://github.com/FlagAI-Open/FlagAI/blob/master/examples/vit_cifar100/train_single_gpu_flash_atten.py). 同时增加了基于对比搜索的文本生成方法 [**SimCTG**](https://github.com/yxuansu/SimCTG) 以及基于 AltDiffusion 进行 DreamBooth 个性化微调, 示例见 [AltDiffusionNaruto](https://github.com/FlagAI-Open/FlagAI/blob/master/examples/AltDiffusion/dreambooth.py). 
- [28 Nov 2022] 发布v1.5.0版本, 支持1.1B参数的 [**EVA-CLIP**](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/EVA_CLIP) 以及[ALM: 基于GLM的阿拉伯语大模型], 示例见[**ALM**](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/ALM) - [10 Nov 2022] 发布v1.4.0版本, 支持[AltCLIP: 更改CLIP中的语言编码器以扩展语言功能](https://arxiv.org/abs/2211.06679v1), 示例见[**AltCLIP**](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/AltCLIP)以及[**AltDiffusion**](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/AltDiffusion) diff --git a/setup.py b/setup.py index f4a690c5..beaf4f63 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flagai", - version="v1.6.0", + version="v1.6.1", description="FlagAI aims to help researchers and developers to freely train and test large-scale models for NLP/CV/VL tasks.", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", From 4d9c638a92bac2fc0473a05d452758227bba79c4 Mon Sep 17 00:00:00 2001 From: ldwang Date: Mon, 6 Mar 2023 09:54:49 +0800 Subject: [PATCH 26/54] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2448663f..54ce31f3 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ FlagAI (Fast LArge-scale General AI models) is a fast, easy-to-use and extensibl * These models can be applied to (Chinese/English) Text, for tasks like text classification, information extraction, question answering, summarization, and text generation. -* FlagAI is backed by the three most popular data/model parallel libraries — [PyTorch](https://pytorch.org/)/[Deepspeed](https://www.deepspeed.ai/)/[Megatron-LM](https://github.com/NVIDIA/Megatron-LM)/[BMTrain](https://github.com/OpenBMB/BMTrain) — with seamless integration between them. Users can parallel their training/testing process with less than ten lines of code. 
+* FlagAI is backed by the four most popular data/model parallel libraries — [PyTorch](https://pytorch.org/)/[Deepspeed](https://www.deepspeed.ai/)/[Megatron-LM](https://github.com/NVIDIA/Megatron-LM)/[BMTrain](https://github.com/OpenBMB/BMTrain) — with seamless integration between them. Users can parallel their training/testing process with less than ten lines of code. The code is partially based on [GLM](https://github.com/THUDM/GLM), [Transformers](https://github.com/huggingface/transformers),[timm](https://github.com/rwightman/pytorch-image-models) and [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM). From 257d79eeca12f238a7a508276b26d6ec40809732 Mon Sep 17 00:00:00 2001 From: ldwang Date: Tue, 7 Mar 2023 10:03:38 +0800 Subject: [PATCH 27/54] Update README_zh.md --- README_zh.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_zh.md b/README_zh.md index eaf54562..2ed999f8 100644 --- a/README_zh.md +++ b/README_zh.md @@ -15,7 +15,7 @@ * 这些模型可以应用于文本,用于文本分类、信息提取、问答、摘要、文本生成等任务,尤其是中文。 -* 飞智由三个最流行的数据/模型并行库([PyTorch](https://pytorch.org/)/[Deepspeed](https://www.deepspeed.ai/)/[Megatron-LM](https://github.com/NVIDIA/Megatron-LM)/[BMTrain](https://github.com/OpenBMB/BMTrain))提供支持,它们之间实现了无缝集成。 你可以用不到十行代码来并行你的训练/测试过程。 +* 飞智由四个最流行的数据/模型并行库([PyTorch](https://pytorch.org/)/[Deepspeed](https://www.deepspeed.ai/)/[Megatron-LM](https://github.com/NVIDIA/Megatron-LM)/[BMTrain](https://github.com/OpenBMB/BMTrain))提供支持,它们之间实现了无缝集成。 你可以用不到十行代码来并行你的训练/测试过程。 本项目的部分代码基于[GLM](https://github.com/THUDM/GLM),[Transformers](https://github.com/huggingface/transformers),[timm](https://github.com/rwightman/pytorch-image-models) 和 [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM). 
From 32c4fce7b8176a473ac85ba25e1d16aac4161140 Mon Sep 17 00:00:00 2001 From: zhaohu xing <920232796@qq.com> Date: Tue, 7 Mar 2023 10:15:50 +0800 Subject: [PATCH 28/54] add llama model Signed-off-by: zhaohu xing <920232796@qq.com> --- examples/llama/llama_generate.py | 26 ++ flagai/auto_model/auto_loader.py | 10 +- flagai/data/tokenizer/llama/tokenizer.py | 40 +++ flagai/model/base_model.py | 2 +- flagai/model/llama.py | 296 +++++++++++++++++++++++ flagai/model/predictor/llama.py | 64 +++++ flagai/model/predictor/predictor.py | 5 + 7 files changed, 441 insertions(+), 2 deletions(-) create mode 100755 examples/llama/llama_generate.py create mode 100644 flagai/data/tokenizer/llama/tokenizer.py create mode 100644 flagai/model/llama.py create mode 100644 flagai/model/predictor/llama.py diff --git a/examples/llama/llama_generate.py b/examples/llama/llama_generate.py new file mode 100755 index 00000000..3f11466a --- /dev/null +++ b/examples/llama/llama_generate.py @@ -0,0 +1,26 @@ +from flagai.model.predictor.predictor import Predictor +from flagai.auto_model.auto_loader import AutoLoader +from flagai.model.llama import setup_model_parallel + +# torchrun --nproc_per_node 2 llama_generate.py + +local_rank, world_size = setup_model_parallel() + +loader = AutoLoader(task_name="lm", + model_name="llama-13b-en", + ) +model = loader.get_model() +tokenizer = loader.get_tokenizer() +predictor = Predictor(model, tokenizer) + +prompts = ["The capital of Germany is the city of", + "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] +for text in prompts: + result = predictor.predict_generate_randomsample(text, + out_max_length=256, + temperature=1.0, + top_p=0.9) + if local_rank == 0: + print(result) + print("\n==================================\n") + diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py index cfbbe0db..d32b6280 100644 --- a/flagai/auto_model/auto_loader.py +++ b/flagai/auto_model/auto_loader.py @@ -57,6 +57,7 
@@ def __getattr__(self, name): "opt_seq2seq": ("flagai.model.opt_model", "OPTModel"), "opt_lm": ("flagai.model.opt_model", "OPTModel"), "galactica_lm": ("flagai.model.galactica_model", "GalacticaModel"), + "llama_lm": ("flagai.model.llama", "LLAMA",), "vit_classification": ("flagai.model.vision.vit", "VisionTransformer"), "clip_txt_img_matching": ("flagai.model.mm.clip_model", "CLIP"), "swinv1_classification": ("flagai.model.vision.swinv1", "SwinTransformer"), @@ -96,6 +97,10 @@ def __getattr__(self, name): "galactica-6.7b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"], "galactica-30b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"], "galactica-120b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"], + "llama-7b-en": ["flagai.model.llama", "LLAMA", "llama", "nlp", "flagai.data.tokenizer.llama.tokenizer", "Tokenizer"], + "llama-13b-en": ["flagai.model.llama", "LLAMA", "llama", "nlp", "flagai.data.tokenizer.llama.tokenizer", "Tokenizer"], + "llama-30b-en": ["flagai.model.llama", "LLAMA", "llama", "nlp", "flagai.data.tokenizer.llama.tokenizer", "Tokenizer"], + "llama-65b-en": ["flagai.model.llama", "LLAMA", "llama", "nlp", "flagai.data.tokenizer.llama.tokenizer", "Tokenizer"], "vit-base-p16-224": ["flagai.model.vision.vit", "VisionTransformer", "vit", "vision"], "vit-base-p16-384": @@ -212,9 +217,12 @@ def __init__(self, self.model.half() if model_type == "nlp": - if brief_model_name in ["galactica", ]: + if brief_model_name in ["galactica",]: self.tokenizer = getattr(LazyImport(MODEL_DICT[model_name][4]), MODEL_DICT[model_name][5])(download_path) + elif brief_model_name in ["llama",]: + self.tokenizer = getattr(LazyImport(MODEL_DICT[model_name][4]), + 
MODEL_DICT[model_name][5])(os.path.join(download_path, "tokenizer.model")) else : tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"), "Tokenizer") diff --git a/flagai/data/tokenizer/llama/tokenizer.py b/flagai/data/tokenizer/llama/tokenizer.py new file mode 100644 index 00000000..e4315856 --- /dev/null +++ b/flagai/data/tokenizer/llama/tokenizer.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the GNU General Public License version 3. + +from sentencepiece import SentencePieceProcessor +from logging import getLogger +from typing import List +import os + + +logger = getLogger() + + +class Tokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + logger.info(f"Reloaded SentencePiece model from {model_path}") + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.pad_id() + logger.info( + f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" + ) + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + def encode(self, s: str, bos: bool, eos: bool) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + return self.sp_model.decode(t) diff --git a/flagai/model/base_model.py b/flagai/model/base_model.py index 46367e28..1b390d61 100644 --- a/flagai/model/base_model.py +++ b/flagai/model/base_model.py @@ -222,7 +222,7 @@ def download(cls, model_files = eval(_get_model_files(model_name)) print("model files:" + str(model_files)) for file_name in model_files: - if not file_name.endswith("bin"): + if not file_name.endswith("bin") and not 
file_name.endswith("pth"): _get_vocab_path(os.path.join(download_path, model_name), file_name, model_id) else : _get_checkpoint_path(os.path.join(download_path, model_name), file_name, model_id) \ No newline at end of file diff --git a/flagai/model/llama.py b/flagai/model/llama.py new file mode 100644 index 00000000..5ba77b4d --- /dev/null +++ b/flagai/model/llama.py @@ -0,0 +1,296 @@ + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the GNU General Public License version 3. + +from typing import Optional, Tuple +from dataclasses import dataclass +import math + +import torch +from torch import nn +import torch.nn.functional as F +from pathlib import Path + +import fairscale.nn.model_parallel.initialize as fs_init +from fairscale.nn.model_parallel.layers import ( + ParallelEmbedding, + RowParallelLinear, + ColumnParallelLinear, +) +import os +from fairscale.nn.model_parallel.initialize import initialize_model_parallel + +from flagai.model.base_model import BaseModel + +def setup_model_parallel() -> Tuple[int, int]: + local_rank = int(os.environ.get("LOCAL_RANK", -1)) + world_size = int(os.environ.get("WORLD_SIZE", -1)) + + torch.distributed.init_process_group("nccl") + initialize_model_parallel(world_size) + torch.cuda.set_device(local_rank) + + # seed must be the same in all processes + torch.manual_seed(1) + return local_rank, world_size + + +@dataclass +class ModelArgs: + dim: int = 512 + n_layers: int = 8 + n_heads: int = 8 + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 + norm_eps: float = 1e-5 + + max_batch_size: int = 32 + max_seq_len: int = 1024 + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) 
+ self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) + t = torch.arange(end, device=freqs.device) # type: ignore + freqs = torch.outer(t, freqs).float() # type: ignore + freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 + return freqs_cis + + +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): + ndim = x.ndim + assert 0 <= 1 < ndim + assert freqs_cis.shape == (x.shape[1], x.shape[-1]) + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(*shape) + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + freqs_cis = reshape_for_broadcast(freqs_cis, xq_) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq), xk_out.type_as(xk) + + +class Attention(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + + self.n_local_heads = args.n_heads // fs_init.get_model_parallel_world_size() + self.head_dim = args.dim // args.n_heads + + self.wq = ColumnParallelLinear( + args.dim, + args.n_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.wk = ColumnParallelLinear( + args.dim, + args.n_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.wv = ColumnParallelLinear( + args.dim, + args.n_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.wo = RowParallelLinear( + args.n_heads * self.head_dim, + args.dim, + bias=False, + 
input_is_parallel=True, + init_method=lambda x: x, + ) + + self.cache_k = torch.zeros( + (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) + ).cuda() + self.cache_v = torch.zeros( + (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) + ).cuda() + + def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]): + bsz, seqlen, _ = x.shape + xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) + + xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) + xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim) + xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim) + + xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) + + self.cache_k = self.cache_k.to(xq) + self.cache_v = self.cache_v.to(xq) + + self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk + self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv + + keys = self.cache_k[:bsz, : start_pos + seqlen] + values = self.cache_v[:bsz, : start_pos + seqlen] + + xq = xq.transpose(1, 2) + keys = keys.transpose(1, 2) + values = values.transpose(1, 2) + scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim) + if mask is not None: + scores = scores + mask # (bs, n_local_heads, slen, cache_len + slen) + scores = F.softmax(scores.float(), dim=-1).type_as(xq) + output = torch.matmul(scores, values) # (bs, n_local_heads, slen, head_dim) + output = output.transpose( + 1, 2 + ).contiguous().view(bsz, seqlen, -1) + + return self.wo(output) + + +class FeedForward(nn.Module): + def __init__( + self, + dim: int, + hidden_dim: int, + multiple_of: int, + ): + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = ColumnParallelLinear( + dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x + ) + self.w2 = RowParallelLinear( + hidden_dim, dim, bias=False, input_is_parallel=True, 
init_method=lambda x: x + ) + self.w3 = ColumnParallelLinear( + dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x + ) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + +class TransformerBlock(nn.Module): + def __init__(self, layer_id: int, args: ModelArgs): + super().__init__() + self.n_heads = args.n_heads + self.dim = args.dim + self.head_dim = args.dim // args.n_heads + self.attention = Attention(args) + self.feed_forward = FeedForward( + dim=args.dim, hidden_dim=4 * args.dim, multiple_of=args.multiple_of + ) + self.layer_id = layer_id + self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) + self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) + + def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]): + h = x + self.attention.forward(self.attention_norm(x), start_pos, freqs_cis, mask) + out = h + self.feed_forward.forward(self.ffn_norm(h)) + return out + + +class LLAMA(BaseModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + params: ModelArgs = ModelArgs(max_seq_len=1024, max_batch_size=32, + multiple_of=config["multiple_of"], + dim=config["dim"], + n_heads=config["n_heads"], + vocab_size=config["vocab_size"], + norm_eps=config["norm_eps"], + n_layers=config["n_layers"]) + + self.params = params + self.vocab_size = params.vocab_size + self.n_layers = params.n_layers + + self.tok_embeddings = ParallelEmbedding( + params.vocab_size, params.dim, init_method=lambda x: x + ) + + self.layers = torch.nn.ModuleList() + for layer_id in range(params.n_layers): + self.layers.append(TransformerBlock(layer_id, params)) + + self.norm = RMSNorm(params.dim, eps=params.norm_eps) + self.output = ColumnParallelLinear( + params.dim, params.vocab_size, bias=False, init_method=lambda x: x + ) + + self.freqs_cis = precompute_freqs_cis( + self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 + ) + + @torch.inference_mode() + 
def forward(self, tokens: torch.Tensor, start_pos: int): + _bsz, seqlen = tokens.shape + h = self.tok_embeddings(tokens) + self.freqs_cis = self.freqs_cis.to(h.device) + freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] + + mask = None + if seqlen > 1: + mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=tokens.device) + mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) + + for layer in self.layers: + h = layer(h, start_pos, freqs_cis, mask) + h = self.norm(h) + output = self.output(h[:, -1, :]) # only compute last logits + return output.float() + + def load_weights(self, checkpoint_path): + self.load_state_dict(torch.load(checkpoint_path, map_location="cpu")) + print(f"model params are loaded successfully...") + + @classmethod + def from_pretrain(cls, + download_path='./checkpoints/', + model_name='RoBERTa-base-ch', + only_download_config=False, + device="cpu", + **kwargs): + super().download(download_path, model_name) + pretrained_model_name_or_path = os.path.join(download_path, model_name) + local_rank = int(os.environ.get("LOCAL_RANK", -1)) + world_size = int(os.environ.get("WORLD_SIZE", -1)) + + checkpoints = sorted(Path(pretrained_model_name_or_path).glob("*.pth")) + assert ( + world_size == len(checkpoints) + ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" + ckpt_path = checkpoints[local_rank] + print("Loading") + checkpoint = torch.load(ckpt_path, map_location="cpu") + torch.set_default_tensor_type(torch.cuda.HalfTensor) + model = LLAMA.init_from_json(os.path.join(pretrained_model_name_or_path, "config.json")) + torch.set_default_tensor_type(torch.FloatTensor) + + model.load_state_dict(checkpoint, strict=False) + + return model \ No newline at end of file diff --git a/flagai/model/predictor/llama.py b/flagai/model/predictor/llama.py new file mode 100644 index 00000000..c246bdd1 --- /dev/null +++ b/flagai/model/predictor/llama.py @@ -0,0 +1,64 @@ +from typing import List +import torch + + 
+def llama_generate( + tokenizer, + model, + prompts: List[str], + max_gen_len: int, + temperature: float = 0.8, + top_p: float = 0.95, + ) -> List[str]: + bsz = len(prompts) + + prompt_tokens = [tokenizer.encode(x, bos=True, eos=False) for x in prompts] + + min_prompt_size = min([len(t) for t in prompt_tokens]) + max_prompt_size = max([len(t) for t in prompt_tokens]) + + total_len = min(1024, max_gen_len + max_prompt_size) + + tokens = torch.full((bsz, total_len), tokenizer.pad_id).cuda().long() + for k, t in enumerate(prompt_tokens): + tokens[k, : len(t)] = torch.tensor(t).long() + input_text_mask = tokens != tokenizer.pad_id + start_pos = min_prompt_size + prev_pos = 0 + for cur_pos in range(start_pos, total_len): + logits = model.forward(tokens[:, prev_pos:cur_pos], prev_pos) + if temperature > 0: + probs = torch.softmax(logits / temperature, dim=-1) + next_token = sample_top_p(probs, top_p) + else: + next_token = torch.argmax(logits, dim=-1) + next_token = next_token.reshape(-1) + # only replace token if prompt has already been generated + next_token = torch.where( + input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token + ) + tokens[:, cur_pos] = next_token + prev_pos = cur_pos + + decoded = [] + for i, t in enumerate(tokens.tolist()): + # cut to max gen len + t = t[: len(prompt_tokens[i]) + max_gen_len] + # cut to eos tok if any + try: + t = t[: t.index(tokenizer.eos_id)] + except ValueError: + pass + decoded.append(tokenizer.decode(t)) + return decoded + + +def sample_top_p(probs, p): + probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) + probs_sum = torch.cumsum(probs_sort, dim=-1) + mask = probs_sum - probs_sort > p + probs_sort[mask] = 0.0 + probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) + next_token = torch.multinomial(probs_sort, num_samples=1) + next_token = torch.gather(probs_idx, -1, next_token) + return next_token \ No newline at end of file diff --git a/flagai/model/predictor/predictor.py 
b/flagai/model/predictor/predictor.py index 03cf7ff4..33e660cb 100644 --- a/flagai/model/predictor/predictor.py +++ b/flagai/model/predictor/predictor.py @@ -18,6 +18,7 @@ from contextlib import contextmanager, nullcontext from einops import rearrange from torch.cuda.amp import autocast as autocast +from .llama import llama_generate class Predictor: def __init__(self, model, tokenizer=None): @@ -345,6 +346,10 @@ def predict_generate_randomsample(self, input_max_length, out_max_length, top_k, top_p, repetition_penalty, temperature, device) + elif "llama" in self.class_name.lower(): + return llama_generate(self.tokenizer, self.model, + [text], out_max_length, + temperature, top_p) else: print("Unsupported decoding mode") From 254226aa3a2cca53e66b3a25a4954cb041689b6f Mon Sep 17 00:00:00 2001 From: zhaohu xing <920232796@qq.com> Date: Tue, 7 Mar 2023 10:21:07 +0800 Subject: [PATCH 29/54] add a weight merging tool file Signed-off-by: zhaohu xing <920232796@qq.com> --- flagai/mp_tools.py | 51 +++++++++++++++++++++--- flagai/tools/merge_huggingface_weight.py | 25 ++++++++++++ 2 files changed, 70 insertions(+), 6 deletions(-) create mode 100644 flagai/tools/merge_huggingface_weight.py diff --git a/flagai/mp_tools.py b/flagai/mp_tools.py index f4dfff0b..201540a0 100644 --- a/flagai/mp_tools.py +++ b/flagai/mp_tools.py @@ -7,15 +7,24 @@ import copy from_1_to_n_models = { - "gpt": { + "gpt2": { "wte.weight": 0, "attn.c_attn.weight": 30, "attn.c_attn.bias": 30, - "attn.c_proj.weight": 1, - "mlp.c_fc.weight": 0, + "attn.c_proj.weight": 0, + "mlp.c_fc.weight": 1, "mlp.c_fc.bias": 0, - "mlp.c_proj.weight": 1, + "mlp.c_proj.weight": 0, }, + # "gpt2": { + # "wte.weight": 0, + # "attn.c_attn.weight": 30, + # "attn.c_attn.bias": 30, + # "attn.c_proj.weight": 1, + # "mlp.c_fc.weight": 0, + # "mlp.c_fc.bias": 0, + # "mlp.c_proj.weight": 1, + # }, "opt": { "decoder.embed_tokens.weight": 0, "self_attn.k_proj.weight": 0, @@ -30,6 +39,20 @@ "fc1.bias": 0, "fc2.weight": 1, }, + "galactica": 
{ + "decoder.embed_tokens.weight": 0, + "self_attn.k_proj.weight": 0, + "self_attn.k_proj.bias": 0, + "self_attn.q_proj.weight": 0, + "self_attn.q_proj.bias": 0, + "self_attn.v_proj.weight": 0, + "self_attn.v_proj.bias": 0, + + "self_attn.out_proj.weight": 1, + "fc1.weight": 0, + "fc1.bias": 0, + "fc2.weight": 1, + }, "glm": { "word_embeddings.weight": 0, "attention.query_key_value.weight": 30, @@ -238,7 +261,8 @@ def change_pytorch_model_mp_from_1_to_n_new(model_name_brief, checkpoint: str, t d = d["module"] for k, v in d.items(): - assert len(v.shape) < 3 + if len(v.shape) > 2: + continue flag = 0 for keys in trans_keys: if keys in k: @@ -261,6 +285,21 @@ def change_pytorch_model_mp_from_1_to_n_new(model_name_brief, checkpoint: str, t ], 0) break + elif dim == 31: + v = v.permute(1, 0) + part = v.shape[0] // ratio // 3 + v = torch.cat([ + v[shift * part:(shift + 1) * + part, :].clone(), + v[(shift + ratio) * + part:(shift + 1 + ratio) * + part, :].clone(), + v[(shift + 2 * ratio) * + part:(shift + 1 + 2 * ratio) * + part, :].clone() + ], 0) + v = v.permute(1, 0) + elif dim == 0: part = v.shape[dim] // ratio d_new['module'][k] = v[shift * @@ -412,4 +451,4 @@ def change_pytorch_model_mp_from_n_to_1(model_name_brief, checkpoint): if __name__ == "__main__": change_pytorch_model_mp_from_1_to_n( - '/mnt/test_10b_models/state_dict/GLM-10b-en', 2) \ No newline at end of file + '/mnt/test_10b_models/state_dict/GLM-10b-en', 2) diff --git a/flagai/tools/merge_huggingface_weight.py b/flagai/tools/merge_huggingface_weight.py new file mode 100644 index 00000000..28641539 --- /dev/null +++ b/flagai/tools/merge_huggingface_weight.py @@ -0,0 +1,25 @@ + +import os +import torch + +def merge_weight(model_dir): + model_files = os.listdir(model_dir) + checkpoint_merge = {} + print(f"merging the model weight....") + # multi weights files + for file_to_load in model_files: + if "pytorch_model-0" in file_to_load: + checkpoint_to_load = torch.load(os.path.join(model_dir, 
file_to_load),map_location="cpu") + for k, v in checkpoint_to_load.items(): + checkpoint_merge[k] = v + print(f"{file_to_load} is merged successfully.") + # save all parameters + torch.save( + checkpoint_merge, + os.path.join(model_dir, "pytorch_model.bin")) + print(f"models are merged successfully.") + + +if __name__ == "__main__": + # merge_weight(model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/galactica-6.7b-en/") + merge_weight(model_dir="./state_dict/opt-6.7b-en") \ No newline at end of file From e8bd3b4d56cea7de14371f03908acbbcc58e213b Mon Sep 17 00:00:00 2001 From: zhaohu xing <920232796@qq.com> Date: Tue, 7 Mar 2023 10:23:52 +0800 Subject: [PATCH 30/54] modify the file name Signed-off-by: zhaohu xing <920232796@qq.com> --- ...lama_generate.py => llama_13b_generate.py} | 2 +- examples/llama/llama_33b_generate.py | 26 +++++++++++++++++++ examples/llama/llama_65b_generate.py | 26 +++++++++++++++++++ examples/llama/llama_7b_generate.py | 26 +++++++++++++++++++ 4 files changed, 79 insertions(+), 1 deletion(-) rename examples/llama/{llama_generate.py => llama_13b_generate.py} (94%) create mode 100755 examples/llama/llama_33b_generate.py create mode 100755 examples/llama/llama_65b_generate.py create mode 100755 examples/llama/llama_7b_generate.py diff --git a/examples/llama/llama_generate.py b/examples/llama/llama_13b_generate.py similarity index 94% rename from examples/llama/llama_generate.py rename to examples/llama/llama_13b_generate.py index 3f11466a..505bef07 100755 --- a/examples/llama/llama_generate.py +++ b/examples/llama/llama_13b_generate.py @@ -2,7 +2,7 @@ from flagai.auto_model.auto_loader import AutoLoader from flagai.model.llama import setup_model_parallel -# torchrun --nproc_per_node 2 llama_generate.py +# torchrun --nproc_per_node 2 llama_13b_generate.py local_rank, world_size = setup_model_parallel() diff --git a/examples/llama/llama_33b_generate.py b/examples/llama/llama_33b_generate.py new file mode 100755 index 00000000..d04f01b0 --- 
/dev/null +++ b/examples/llama/llama_33b_generate.py @@ -0,0 +1,26 @@ +from flagai.model.predictor.predictor import Predictor +from flagai.auto_model.auto_loader import AutoLoader +from flagai.model.llama import setup_model_parallel + +# torchrun --nproc_per_node 4 llama_33b_generate.py + +local_rank, world_size = setup_model_parallel() + +loader = AutoLoader(task_name="lm", + model_name="llama-33b-en", + ) +model = loader.get_model() +tokenizer = loader.get_tokenizer() +predictor = Predictor(model, tokenizer) + +prompts = ["The capital of Germany is the city of", + "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] +for text in prompts: + result = predictor.predict_generate_randomsample(text, + out_max_length=256, + temperature=1.0, + top_p=0.9) + if local_rank == 0: + print(result) + print("\n==================================\n") + diff --git a/examples/llama/llama_65b_generate.py b/examples/llama/llama_65b_generate.py new file mode 100755 index 00000000..04ec28be --- /dev/null +++ b/examples/llama/llama_65b_generate.py @@ -0,0 +1,26 @@ +from flagai.model.predictor.predictor import Predictor +from flagai.auto_model.auto_loader import AutoLoader +from flagai.model.llama import setup_model_parallel + +# torchrun --nproc_per_node 8 llama_65b_generate.py + +local_rank, world_size = setup_model_parallel() + +loader = AutoLoader(task_name="lm", + model_name="llama-65b-en", + ) +model = loader.get_model() +tokenizer = loader.get_tokenizer() +predictor = Predictor(model, tokenizer) + +prompts = ["The capital of Germany is the city of", + "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] +for text in prompts: + result = predictor.predict_generate_randomsample(text, + out_max_length=256, + temperature=1.0, + top_p=0.9) + if local_rank == 0: + print(result) + print("\n==================================\n") + diff --git a/examples/llama/llama_7b_generate.py b/examples/llama/llama_7b_generate.py new file mode 
100755 index 00000000..91aeb4c6 --- /dev/null +++ b/examples/llama/llama_7b_generate.py @@ -0,0 +1,26 @@ +from flagai.model.predictor.predictor import Predictor +from flagai.auto_model.auto_loader import AutoLoader +from flagai.model.llama import setup_model_parallel + +# torchrun --nproc_per_node 1 llama_7b_generate.py + +local_rank, world_size = setup_model_parallel() + +loader = AutoLoader(task_name="lm", + model_name="llama-7b-en", + ) +model = loader.get_model() +tokenizer = loader.get_tokenizer() +predictor = Predictor(model, tokenizer) + +prompts = ["The capital of Germany is the city of", + "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] +for text in prompts: + result = predictor.predict_generate_randomsample(text, + out_max_length=256, + temperature=1.0, + top_p=0.9) + if local_rank == 0: + print(result) + print("\n==================================\n") + From 06af33f8919648318afd31fdea230e22a3142e49 Mon Sep 17 00:00:00 2001 From: ldwang Date: Tue, 7 Mar 2023 15:50:42 +0800 Subject: [PATCH 31/54] Revert "add llama model" --- examples/llama/llama_13b_generate.py | 26 -- examples/llama/llama_33b_generate.py | 26 -- examples/llama/llama_65b_generate.py | 26 -- examples/llama/llama_7b_generate.py | 26 -- flagai/auto_model/auto_loader.py | 10 +- flagai/data/tokenizer/llama/tokenizer.py | 40 --- flagai/model/base_model.py | 2 +- flagai/model/llama.py | 296 ----------------------- flagai/model/predictor/llama.py | 64 ----- flagai/model/predictor/predictor.py | 5 - flagai/mp_tools.py | 51 +--- flagai/tools/merge_huggingface_weight.py | 25 -- 12 files changed, 8 insertions(+), 589 deletions(-) delete mode 100755 examples/llama/llama_13b_generate.py delete mode 100755 examples/llama/llama_33b_generate.py delete mode 100755 examples/llama/llama_65b_generate.py delete mode 100755 examples/llama/llama_7b_generate.py delete mode 100644 flagai/data/tokenizer/llama/tokenizer.py delete mode 100644 flagai/model/llama.py delete mode 100644 
flagai/model/predictor/llama.py delete mode 100644 flagai/tools/merge_huggingface_weight.py diff --git a/examples/llama/llama_13b_generate.py b/examples/llama/llama_13b_generate.py deleted file mode 100755 index 505bef07..00000000 --- a/examples/llama/llama_13b_generate.py +++ /dev/null @@ -1,26 +0,0 @@ -from flagai.model.predictor.predictor import Predictor -from flagai.auto_model.auto_loader import AutoLoader -from flagai.model.llama import setup_model_parallel - -# torchrun --nproc_per_node 2 llama_13b_generate.py - -local_rank, world_size = setup_model_parallel() - -loader = AutoLoader(task_name="lm", - model_name="llama-13b-en", - ) -model = loader.get_model() -tokenizer = loader.get_tokenizer() -predictor = Predictor(model, tokenizer) - -prompts = ["The capital of Germany is the city of", - "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] -for text in prompts: - result = predictor.predict_generate_randomsample(text, - out_max_length=256, - temperature=1.0, - top_p=0.9) - if local_rank == 0: - print(result) - print("\n==================================\n") - diff --git a/examples/llama/llama_33b_generate.py b/examples/llama/llama_33b_generate.py deleted file mode 100755 index d04f01b0..00000000 --- a/examples/llama/llama_33b_generate.py +++ /dev/null @@ -1,26 +0,0 @@ -from flagai.model.predictor.predictor import Predictor -from flagai.auto_model.auto_loader import AutoLoader -from flagai.model.llama import setup_model_parallel - -# torchrun --nproc_per_node 4 llama_33b_generate.py - -local_rank, world_size = setup_model_parallel() - -loader = AutoLoader(task_name="lm", - model_name="llama-33b-en", - ) -model = loader.get_model() -tokenizer = loader.get_tokenizer() -predictor = Predictor(model, tokenizer) - -prompts = ["The capital of Germany is the city of", - "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] -for text in prompts: - result = predictor.predict_generate_randomsample(text, - 
out_max_length=256, - temperature=1.0, - top_p=0.9) - if local_rank == 0: - print(result) - print("\n==================================\n") - diff --git a/examples/llama/llama_65b_generate.py b/examples/llama/llama_65b_generate.py deleted file mode 100755 index 04ec28be..00000000 --- a/examples/llama/llama_65b_generate.py +++ /dev/null @@ -1,26 +0,0 @@ -from flagai.model.predictor.predictor import Predictor -from flagai.auto_model.auto_loader import AutoLoader -from flagai.model.llama import setup_model_parallel - -# torchrun --nproc_per_node 8 llama_65b_generate.py - -local_rank, world_size = setup_model_parallel() - -loader = AutoLoader(task_name="lm", - model_name="llama-65b-en", - ) -model = loader.get_model() -tokenizer = loader.get_tokenizer() -predictor = Predictor(model, tokenizer) - -prompts = ["The capital of Germany is the city of", - "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] -for text in prompts: - result = predictor.predict_generate_randomsample(text, - out_max_length=256, - temperature=1.0, - top_p=0.9) - if local_rank == 0: - print(result) - print("\n==================================\n") - diff --git a/examples/llama/llama_7b_generate.py b/examples/llama/llama_7b_generate.py deleted file mode 100755 index 91aeb4c6..00000000 --- a/examples/llama/llama_7b_generate.py +++ /dev/null @@ -1,26 +0,0 @@ -from flagai.model.predictor.predictor import Predictor -from flagai.auto_model.auto_loader import AutoLoader -from flagai.model.llama import setup_model_parallel - -# torchrun --nproc_per_node 1 llama_7b_generate.py - -local_rank, world_size = setup_model_parallel() - -loader = AutoLoader(task_name="lm", - model_name="llama-7b-en", - ) -model = loader.get_model() -tokenizer = loader.get_tokenizer() -predictor = Predictor(model, tokenizer) - -prompts = ["The capital of Germany is the city of", - "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] -for text in prompts: - result = 
predictor.predict_generate_randomsample(text, - out_max_length=256, - temperature=1.0, - top_p=0.9) - if local_rank == 0: - print(result) - print("\n==================================\n") - diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py index d32b6280..cfbbe0db 100644 --- a/flagai/auto_model/auto_loader.py +++ b/flagai/auto_model/auto_loader.py @@ -57,7 +57,6 @@ def __getattr__(self, name): "opt_seq2seq": ("flagai.model.opt_model", "OPTModel"), "opt_lm": ("flagai.model.opt_model", "OPTModel"), "galactica_lm": ("flagai.model.galactica_model", "GalacticaModel"), - "llama_lm": ("flagai.model.llama", "LLAMA",), "vit_classification": ("flagai.model.vision.vit", "VisionTransformer"), "clip_txt_img_matching": ("flagai.model.mm.clip_model", "CLIP"), "swinv1_classification": ("flagai.model.vision.swinv1", "SwinTransformer"), @@ -97,10 +96,6 @@ def __getattr__(self, name): "galactica-6.7b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"], "galactica-30b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"], "galactica-120b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"], - "llama-7b-en": ["flagai.model.llama", "LLAMA", "llama", "nlp", "flagai.data.tokenizer.llama.tokenizer", "Tokenizer"], - "llama-13b-en": ["flagai.model.llama", "LLAMA", "llama", "nlp", "flagai.data.tokenizer.llama.tokenizer", "Tokenizer"], - "llama-30b-en": ["flagai.model.llama", "LLAMA", "llama", "nlp", "flagai.data.tokenizer.llama.tokenizer", "Tokenizer"], - "llama-65b-en": ["flagai.model.llama", "LLAMA", "llama", "nlp", "flagai.data.tokenizer.llama.tokenizer", "Tokenizer"], "vit-base-p16-224": ["flagai.model.vision.vit", "VisionTransformer", "vit", "vision"], 
"vit-base-p16-384": @@ -217,12 +212,9 @@ def __init__(self, self.model.half() if model_type == "nlp": - if brief_model_name in ["galactica",]: + if brief_model_name in ["galactica", ]: self.tokenizer = getattr(LazyImport(MODEL_DICT[model_name][4]), MODEL_DICT[model_name][5])(download_path) - elif brief_model_name in ["llama",]: - self.tokenizer = getattr(LazyImport(MODEL_DICT[model_name][4]), - MODEL_DICT[model_name][5])(os.path.join(download_path, "tokenizer.model")) else : tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"), "Tokenizer") diff --git a/flagai/data/tokenizer/llama/tokenizer.py b/flagai/data/tokenizer/llama/tokenizer.py deleted file mode 100644 index e4315856..00000000 --- a/flagai/data/tokenizer/llama/tokenizer.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the GNU General Public License version 3. - -from sentencepiece import SentencePieceProcessor -from logging import getLogger -from typing import List -import os - - -logger = getLogger() - - -class Tokenizer: - def __init__(self, model_path: str): - # reload tokenizer - assert os.path.isfile(model_path), model_path - self.sp_model = SentencePieceProcessor(model_file=model_path) - logger.info(f"Reloaded SentencePiece model from {model_path}") - - # BOS / EOS token IDs - self.n_words: int = self.sp_model.vocab_size() - self.bos_id: int = self.sp_model.bos_id() - self.eos_id: int = self.sp_model.eos_id() - self.pad_id: int = self.sp_model.pad_id() - logger.info( - f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" - ) - assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() - - def encode(self, s: str, bos: bool, eos: bool) -> List[int]: - assert type(s) is str - t = self.sp_model.encode(s) - if bos: - t = [self.bos_id] + t - if eos: - t = t + [self.eos_id] - return t - - def decode(self, t: List[int]) -> str: - return self.sp_model.decode(t) 
diff --git a/flagai/model/base_model.py b/flagai/model/base_model.py index 1b390d61..46367e28 100644 --- a/flagai/model/base_model.py +++ b/flagai/model/base_model.py @@ -222,7 +222,7 @@ def download(cls, model_files = eval(_get_model_files(model_name)) print("model files:" + str(model_files)) for file_name in model_files: - if not file_name.endswith("bin") and not file_name.endswith("pth"): + if not file_name.endswith("bin"): _get_vocab_path(os.path.join(download_path, model_name), file_name, model_id) else : _get_checkpoint_path(os.path.join(download_path, model_name), file_name, model_id) \ No newline at end of file diff --git a/flagai/model/llama.py b/flagai/model/llama.py deleted file mode 100644 index 5ba77b4d..00000000 --- a/flagai/model/llama.py +++ /dev/null @@ -1,296 +0,0 @@ - -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the GNU General Public License version 3. - -from typing import Optional, Tuple -from dataclasses import dataclass -import math - -import torch -from torch import nn -import torch.nn.functional as F -from pathlib import Path - -import fairscale.nn.model_parallel.initialize as fs_init -from fairscale.nn.model_parallel.layers import ( - ParallelEmbedding, - RowParallelLinear, - ColumnParallelLinear, -) -import os -from fairscale.nn.model_parallel.initialize import initialize_model_parallel - -from flagai.model.base_model import BaseModel - -def setup_model_parallel() -> Tuple[int, int]: - local_rank = int(os.environ.get("LOCAL_RANK", -1)) - world_size = int(os.environ.get("WORLD_SIZE", -1)) - - torch.distributed.init_process_group("nccl") - initialize_model_parallel(world_size) - torch.cuda.set_device(local_rank) - - # seed must be the same in all processes - torch.manual_seed(1) - return local_rank, world_size - - -@dataclass -class ModelArgs: - dim: int = 512 - n_layers: int = 8 - n_heads: int = 8 - vocab_size: int = -1 # defined later by tokenizer - 
multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 - norm_eps: float = 1e-5 - - max_batch_size: int = 32 - max_seq_len: int = 1024 - - -class RMSNorm(torch.nn.Module): - def __init__(self, dim: int, eps: float = 1e-6): - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim)) - - def _norm(self, x): - return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) - - def forward(self, x): - output = self._norm(x.float()).type_as(x) - return output * self.weight - - -def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): - freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) - t = torch.arange(end, device=freqs.device) # type: ignore - freqs = torch.outer(t, freqs).float() # type: ignore - freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 - return freqs_cis - - -def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): - ndim = x.ndim - assert 0 <= 1 < ndim - assert freqs_cis.shape == (x.shape[1], x.shape[-1]) - shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] - return freqs_cis.view(*shape) - - -def apply_rotary_emb( - xq: torch.Tensor, - xk: torch.Tensor, - freqs_cis: torch.Tensor, -) -> Tuple[torch.Tensor, torch.Tensor]: - xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) - xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) - freqs_cis = reshape_for_broadcast(freqs_cis, xq_) - xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) - xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) - return xq_out.type_as(xq), xk_out.type_as(xk) - - -class Attention(nn.Module): - def __init__(self, args: ModelArgs): - super().__init__() - - self.n_local_heads = args.n_heads // fs_init.get_model_parallel_world_size() - self.head_dim = args.dim // args.n_heads - - self.wq = ColumnParallelLinear( - args.dim, - args.n_heads * self.head_dim, - bias=False, - gather_output=False, - 
init_method=lambda x: x, - ) - self.wk = ColumnParallelLinear( - args.dim, - args.n_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.wv = ColumnParallelLinear( - args.dim, - args.n_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.wo = RowParallelLinear( - args.n_heads * self.head_dim, - args.dim, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.cache_k = torch.zeros( - (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) - ).cuda() - self.cache_v = torch.zeros( - (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) - ).cuda() - - def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]): - bsz, seqlen, _ = x.shape - xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) - - xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) - xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim) - xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim) - - xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) - - self.cache_k = self.cache_k.to(xq) - self.cache_v = self.cache_v.to(xq) - - self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk - self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv - - keys = self.cache_k[:bsz, : start_pos + seqlen] - values = self.cache_v[:bsz, : start_pos + seqlen] - - xq = xq.transpose(1, 2) - keys = keys.transpose(1, 2) - values = values.transpose(1, 2) - scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim) - if mask is not None: - scores = scores + mask # (bs, n_local_heads, slen, cache_len + slen) - scores = F.softmax(scores.float(), dim=-1).type_as(xq) - output = torch.matmul(scores, values) # (bs, n_local_heads, slen, head_dim) - output = output.transpose( - 1, 2 - ).contiguous().view(bsz, seqlen, -1) - - return self.wo(output) - - -class FeedForward(nn.Module): - def __init__( - self, 
- dim: int, - hidden_dim: int, - multiple_of: int, - ): - super().__init__() - hidden_dim = int(2 * hidden_dim / 3) - hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - - self.w1 = ColumnParallelLinear( - dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x - ) - self.w2 = RowParallelLinear( - hidden_dim, dim, bias=False, input_is_parallel=True, init_method=lambda x: x - ) - self.w3 = ColumnParallelLinear( - dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x - ) - - def forward(self, x): - return self.w2(F.silu(self.w1(x)) * self.w3(x)) - - -class TransformerBlock(nn.Module): - def __init__(self, layer_id: int, args: ModelArgs): - super().__init__() - self.n_heads = args.n_heads - self.dim = args.dim - self.head_dim = args.dim // args.n_heads - self.attention = Attention(args) - self.feed_forward = FeedForward( - dim=args.dim, hidden_dim=4 * args.dim, multiple_of=args.multiple_of - ) - self.layer_id = layer_id - self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) - self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) - - def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]): - h = x + self.attention.forward(self.attention_norm(x), start_pos, freqs_cis, mask) - out = h + self.feed_forward.forward(self.ffn_norm(h)) - return out - - -class LLAMA(BaseModel): - def __init__(self, config, **kwargs): - super().__init__(config, **kwargs) - - params: ModelArgs = ModelArgs(max_seq_len=1024, max_batch_size=32, - multiple_of=config["multiple_of"], - dim=config["dim"], - n_heads=config["n_heads"], - vocab_size=config["vocab_size"], - norm_eps=config["norm_eps"], - n_layers=config["n_layers"]) - - self.params = params - self.vocab_size = params.vocab_size - self.n_layers = params.n_layers - - self.tok_embeddings = ParallelEmbedding( - params.vocab_size, params.dim, init_method=lambda x: x - ) - - self.layers = torch.nn.ModuleList() - for layer_id in 
range(params.n_layers): - self.layers.append(TransformerBlock(layer_id, params)) - - self.norm = RMSNorm(params.dim, eps=params.norm_eps) - self.output = ColumnParallelLinear( - params.dim, params.vocab_size, bias=False, init_method=lambda x: x - ) - - self.freqs_cis = precompute_freqs_cis( - self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 - ) - - @torch.inference_mode() - def forward(self, tokens: torch.Tensor, start_pos: int): - _bsz, seqlen = tokens.shape - h = self.tok_embeddings(tokens) - self.freqs_cis = self.freqs_cis.to(h.device) - freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] - - mask = None - if seqlen > 1: - mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=tokens.device) - mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) - - for layer in self.layers: - h = layer(h, start_pos, freqs_cis, mask) - h = self.norm(h) - output = self.output(h[:, -1, :]) # only compute last logits - return output.float() - - def load_weights(self, checkpoint_path): - self.load_state_dict(torch.load(checkpoint_path, map_location="cpu")) - print(f"model params are loaded successfully...") - - @classmethod - def from_pretrain(cls, - download_path='./checkpoints/', - model_name='RoBERTa-base-ch', - only_download_config=False, - device="cpu", - **kwargs): - super().download(download_path, model_name) - pretrained_model_name_or_path = os.path.join(download_path, model_name) - local_rank = int(os.environ.get("LOCAL_RANK", -1)) - world_size = int(os.environ.get("WORLD_SIZE", -1)) - - checkpoints = sorted(Path(pretrained_model_name_or_path).glob("*.pth")) - assert ( - world_size == len(checkpoints) - ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" - ckpt_path = checkpoints[local_rank] - print("Loading") - checkpoint = torch.load(ckpt_path, map_location="cpu") - torch.set_default_tensor_type(torch.cuda.HalfTensor) - model = LLAMA.init_from_json(os.path.join(pretrained_model_name_or_path, 
"config.json")) - torch.set_default_tensor_type(torch.FloatTensor) - - model.load_state_dict(checkpoint, strict=False) - - return model \ No newline at end of file diff --git a/flagai/model/predictor/llama.py b/flagai/model/predictor/llama.py deleted file mode 100644 index c246bdd1..00000000 --- a/flagai/model/predictor/llama.py +++ /dev/null @@ -1,64 +0,0 @@ -from typing import List -import torch - - -def llama_generate( - tokenizer, - model, - prompts: List[str], - max_gen_len: int, - temperature: float = 0.8, - top_p: float = 0.95, - ) -> List[str]: - bsz = len(prompts) - - prompt_tokens = [tokenizer.encode(x, bos=True, eos=False) for x in prompts] - - min_prompt_size = min([len(t) for t in prompt_tokens]) - max_prompt_size = max([len(t) for t in prompt_tokens]) - - total_len = min(1024, max_gen_len + max_prompt_size) - - tokens = torch.full((bsz, total_len), tokenizer.pad_id).cuda().long() - for k, t in enumerate(prompt_tokens): - tokens[k, : len(t)] = torch.tensor(t).long() - input_text_mask = tokens != tokenizer.pad_id - start_pos = min_prompt_size - prev_pos = 0 - for cur_pos in range(start_pos, total_len): - logits = model.forward(tokens[:, prev_pos:cur_pos], prev_pos) - if temperature > 0: - probs = torch.softmax(logits / temperature, dim=-1) - next_token = sample_top_p(probs, top_p) - else: - next_token = torch.argmax(logits, dim=-1) - next_token = next_token.reshape(-1) - # only replace token if prompt has already been generated - next_token = torch.where( - input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token - ) - tokens[:, cur_pos] = next_token - prev_pos = cur_pos - - decoded = [] - for i, t in enumerate(tokens.tolist()): - # cut to max gen len - t = t[: len(prompt_tokens[i]) + max_gen_len] - # cut to eos tok if any - try: - t = t[: t.index(tokenizer.eos_id)] - except ValueError: - pass - decoded.append(tokenizer.decode(t)) - return decoded - - -def sample_top_p(probs, p): - probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) - 
probs_sum = torch.cumsum(probs_sort, dim=-1) - mask = probs_sum - probs_sort > p - probs_sort[mask] = 0.0 - probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) - next_token = torch.multinomial(probs_sort, num_samples=1) - next_token = torch.gather(probs_idx, -1, next_token) - return next_token \ No newline at end of file diff --git a/flagai/model/predictor/predictor.py b/flagai/model/predictor/predictor.py index 33e660cb..03cf7ff4 100644 --- a/flagai/model/predictor/predictor.py +++ b/flagai/model/predictor/predictor.py @@ -18,7 +18,6 @@ from contextlib import contextmanager, nullcontext from einops import rearrange from torch.cuda.amp import autocast as autocast -from .llama import llama_generate class Predictor: def __init__(self, model, tokenizer=None): @@ -346,10 +345,6 @@ def predict_generate_randomsample(self, input_max_length, out_max_length, top_k, top_p, repetition_penalty, temperature, device) - elif "llama" in self.class_name.lower(): - return llama_generate(self.tokenizer, self.model, - [text], out_max_length, - temperature, top_p) else: print("Unsupported decoding mode") diff --git a/flagai/mp_tools.py b/flagai/mp_tools.py index 201540a0..f4dfff0b 100644 --- a/flagai/mp_tools.py +++ b/flagai/mp_tools.py @@ -7,24 +7,15 @@ import copy from_1_to_n_models = { - "gpt2": { + "gpt": { "wte.weight": 0, "attn.c_attn.weight": 30, "attn.c_attn.bias": 30, - "attn.c_proj.weight": 0, - "mlp.c_fc.weight": 1, + "attn.c_proj.weight": 1, + "mlp.c_fc.weight": 0, "mlp.c_fc.bias": 0, - "mlp.c_proj.weight": 0, + "mlp.c_proj.weight": 1, }, - # "gpt2": { - # "wte.weight": 0, - # "attn.c_attn.weight": 30, - # "attn.c_attn.bias": 30, - # "attn.c_proj.weight": 1, - # "mlp.c_fc.weight": 0, - # "mlp.c_fc.bias": 0, - # "mlp.c_proj.weight": 1, - # }, "opt": { "decoder.embed_tokens.weight": 0, "self_attn.k_proj.weight": 0, @@ -39,20 +30,6 @@ "fc1.bias": 0, "fc2.weight": 1, }, - "galactica": { - "decoder.embed_tokens.weight": 0, - "self_attn.k_proj.weight": 0, - 
"self_attn.k_proj.bias": 0, - "self_attn.q_proj.weight": 0, - "self_attn.q_proj.bias": 0, - "self_attn.v_proj.weight": 0, - "self_attn.v_proj.bias": 0, - - "self_attn.out_proj.weight": 1, - "fc1.weight": 0, - "fc1.bias": 0, - "fc2.weight": 1, - }, "glm": { "word_embeddings.weight": 0, "attention.query_key_value.weight": 30, @@ -261,8 +238,7 @@ def change_pytorch_model_mp_from_1_to_n_new(model_name_brief, checkpoint: str, t d = d["module"] for k, v in d.items(): - if len(v.shape) > 2: - continue + assert len(v.shape) < 3 flag = 0 for keys in trans_keys: if keys in k: @@ -285,21 +261,6 @@ def change_pytorch_model_mp_from_1_to_n_new(model_name_brief, checkpoint: str, t ], 0) break - elif dim == 31: - v = v.permute(1, 0) - part = v.shape[0] // ratio // 3 - v = torch.cat([ - v[shift * part:(shift + 1) * - part, :].clone(), - v[(shift + ratio) * - part:(shift + 1 + ratio) * - part, :].clone(), - v[(shift + 2 * ratio) * - part:(shift + 1 + 2 * ratio) * - part, :].clone() - ], 0) - v = v.permute(1, 0) - elif dim == 0: part = v.shape[dim] // ratio d_new['module'][k] = v[shift * @@ -451,4 +412,4 @@ def change_pytorch_model_mp_from_n_to_1(model_name_brief, checkpoint): if __name__ == "__main__": change_pytorch_model_mp_from_1_to_n( - '/mnt/test_10b_models/state_dict/GLM-10b-en', 2) + '/mnt/test_10b_models/state_dict/GLM-10b-en', 2) \ No newline at end of file diff --git a/flagai/tools/merge_huggingface_weight.py b/flagai/tools/merge_huggingface_weight.py deleted file mode 100644 index 28641539..00000000 --- a/flagai/tools/merge_huggingface_weight.py +++ /dev/null @@ -1,25 +0,0 @@ - -import os -import torch - -def merge_weight(model_dir): - model_files = os.listdir(model_dir) - checkpoint_merge = {} - print(f"merging the model weight....") - # multi weights files - for file_to_load in model_files: - if "pytorch_model-0" in file_to_load: - checkpoint_to_load = torch.load(os.path.join(model_dir, file_to_load),map_location="cpu") - for k, v in checkpoint_to_load.items(): - 
checkpoint_merge[k] = v - print(f"{file_to_load} is merged successfully.") - # save all parameters - torch.save( - checkpoint_merge, - os.path.join(model_dir, "pytorch_model.bin")) - print(f"models are merged successfully.") - - -if __name__ == "__main__": - # merge_weight(model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/galactica-6.7b-en/") - merge_weight(model_dir="./state_dict/opt-6.7b-en") \ No newline at end of file From 249b127b3bed08c6889d8aefa8161f0e21e1a977 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Tue, 7 Mar 2023 16:55:38 +0800 Subject: [PATCH 32/54] updated Signed-off-by: ftgreat --- flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py | 1 + .../data/tokenizer/uni_tokenizer/tokenizer.py | 3 +- tests/test_tokenizer.py | 47 ++++++------------- 3 files changed, 16 insertions(+), 35 deletions(-) diff --git a/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py b/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py index 7bae0deb..bd2616a4 100644 --- a/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py +++ b/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py @@ -35,6 +35,7 @@ class CPMTokenizer(object): def __init__(self, vocab_file, model_file, max_length=None): self.max_len = max_length if max_length is not None else int(1e12) self.encoder = json.load(open(vocab_file)) + import pdb;pdb.set_trace() self.decoder = {v: k for k, v in self.encoder.items()} self.sp = spm.SentencePieceProcessor(model_file=model_file) diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index 7ea53166..f14b4314 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -69,7 +69,7 @@ def __init__(self, self.text_tokenizer = BPETokenizer(self.vocab_file, self.merges_file) elif self.tokenizer_class == "sp": - if self.tokenizer_model_name.lower().startswith('cpm1'): + if self.tokenizer_model_name.lower().startswith('cpm'): from flagai.data.tokenizer.cpm_1.cpm1_tokenizer import 
CPMTokenizer self.text_tokenizer = CPMTokenizer(self.tokenizer_json_file, self.sp_model_file) elif self.tokenizer_model_name.lower().startswith('cpm3'): @@ -226,7 +226,6 @@ def _encode(self, text): return ids def convert_tokens_to_ids(self, tokens): - import pdb;pdb.set_trace() res = [] for token in tokens: if token in self.command_token_map: diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 40d276ae..01ecd062 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -91,41 +91,23 @@ class TokenizerTestCase(unittest.TestCase): # [('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), # ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)], 'SpecialTokens error') - # def test_tokenizer_cpm1(self): - # loader = AutoLoader(task_name="lm", - # model_name="CPM-large-ch", - # model_dir="./checkpoints/", - # only_download_config=True) - # tokenizer = loader.get_tokenizer() - # self.assertEqual(tokenizer.encode("day"), [8, 8275], '') - # self.assertEqual(tokenizer.encode("fried chicken makes me happy"), - # [2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239], '') - # self.assertEqual(tokenizer.decode([2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239]), - # 'fried chicken makes me happy', 'DecodeIds Error') - # self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), - # ['▁f', 'ried', '▁ch', 'ick', 'en', '▁make', 's', '▁me', '▁happy'], 'tokenize Error') - # self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], - # [1, 2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239, 2], 'encode_plus Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), ('mask', '', 6), ('eod', '', 7), ('eop', '', 0)]) - - def test_tokenizer_cpm2_large(self): + def test_tokenizer_cpm1(self): loader = AutoLoader(task_name="lm", - model_name="CPM2-Xlarge-ch", + 
model_name="CPM-large-ch", model_dir="./checkpoints/", only_download_config=True) - tokenizer = Tokenizer.from_pretrained("CPM2-Xlarge-ch") - self.assertEqual(tokenizer.TokenToId("人"), 38, '') - self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - [1540, 243, 225, 1511, 225, 21, 3041, 467, 995], '') - self.assertEqual(tokenizer.DecodeIds([1540, 243, 225, 1511, 225, 21, 3041, 467, 995]), - '今天吃饭吃了肯德基', 'DecodeIds Error') - self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), - ['今', '天', '吃', '饭', '吃', '了', '肯', '德', '基'], 'tokenize Error') - self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], - [1, 1540, 243, 225, 1511, 225, 21, 3041, 467, 995, 2], 'encode_plus Error') + tokenizer = loader.get_tokenizer() + self.assertEqual(tokenizer.encode("day"), [8, 8275], '') + self.assertEqual(tokenizer.encode("fried chicken makes me happy"), + [2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239], '') + self.assertEqual(tokenizer.decode([2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239]), + 'fried chicken makes me happy', 'DecodeIds Error') + self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), + ['▁f', 'ried', '▁ch', 'ick', 'en', '▁make', 's', '▁me', '▁happy'], 'tokenize Error') + self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], + [1, 2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239, 2], 'encode_plus Error') self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), ('mask', '', 6), ('eod', '', 7)]) + [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), ('mask', '', 6), ('eod', '', 7), ('eop', '', 0)]) # def test_tokenizer_opt(self): # tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') @@ -165,8 +147,7 @@ def suite(): # suite.addTest(TokenizerTestCase('test_tokenizer_t5')) # suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) # suite.addTest(TokenizerTestCase('test_tokenizer_bert')) - 
# suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) - suite.addTest(TokenizerTestCase('test_tokenizer_cpm2_large')) + suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) # suite.addTest(TokenizerTestCase('test_tokenizer_opt')) # suite.addTest(TokenizerTestCase('test_tokenizer_clip')) # suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) From b01e285e6875d8390e523c3d23435496fa4e7422 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Wed, 8 Mar 2023 14:35:24 +0800 Subject: [PATCH 33/54] fixed issue246 Signed-off-by: ftgreat --- examples/glm_blank_filling/README.md | 33 +++++++------------ .../glm_blank_filling/glm_generate_samples.py | 3 +- 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/examples/glm_blank_filling/README.md b/examples/glm_blank_filling/README.md index ac1803f0..27a65aa3 100644 --- a/examples/glm_blank_filling/README.md +++ b/examples/glm_blank_filling/README.md @@ -43,17 +43,14 @@ filling task ```python import torch from flagai.model.glm_model import GLMModel -from flagai.data.tokenizer import GLMLargeChTokenizer +from flagai.data.tokenizer import Tokenizer from flagai.model.predictor.predictor import Predictor if __name__ == "__main__": """Main training program.""" print('Generate Samples') - tokenizer = GLMLargeChTokenizer(vocab_path='./checkpoints/glm-large-ch/cog-pretrain.model', - add_block_symbols=True, - add_task_mask=True, - add_decoder_mask=False, - fix_command_token=False) - model = GLMModel.from_pretrain(model_name='glm-large-ch', only_download_config=False) + tokenizer = Tokenizer.from_pretrained(model_name) + model = GLMModel.from_pretrain(model_name=model_name, + download_path="./checkpoints") model.cuda(torch.cuda.current_device()) predictor = Predictor(model, tokenizer) # question-answering @@ -67,17 +64,14 @@ Similar to BERT, GLM can predict masked tokens as ```python import torch from flagai.model.glm_model import GLMModel -from flagai.data.tokenizer import GLMLargeChTokenizer +from flagai.data.tokenizer 
import Tokenizer from flagai.model.predictor.predictor import Predictor if __name__ == "__main__": """Main training program.""" print('Generate Samples') - tokenizer = GLMLargeChTokenizer(vocab_path='./checkpoints/glm-large-ch/cog-pretrain.model', - add_block_symbols=True, - add_task_mask=True, - add_decoder_mask=False, - fix_command_token=False) - model = GLMModel.from_pretrain(model_name='glm-large-ch', only_download_config=False) + tokenizer = Tokenizer.from_pretrained(model_name) + model = GLMModel.from_pretrain(model_name=model_name, + download_path="./checkpoints") model.cuda(torch.cuda.current_device()) predictor = Predictor(model, tokenizer) # question-answering @@ -90,17 +84,14 @@ and predict masked sentences as ```python import torch from flagai.model.glm_model import GLMModel -from flagai.data.tokenizer import GLMLargeChTokenizer +from flagai.data.tokenizer import Tokenizer from flagai.model.predictor.predictor import Predictor if __name__ == "__main__": """Main training program.""" print('Generate Samples') - tokenizer = GLMLargeChTokenizer(vocab_path='./checkpoints/glm-large-ch/cog-pretrain.model', - add_block_symbols=True, - add_task_mask=True, - add_decoder_mask=False, - fix_command_token=False) - model = GLMModel.from_pretrain(model_name='glm-large-ch', only_download_config=False) + tokenizer = Tokenizer.from_pretrained(model_name) + model = GLMModel.from_pretrain(model_name=model_name, + download_path="./checkpoints") model.cuda(torch.cuda.current_device()) predictor = Predictor(model, tokenizer) # question-answering diff --git a/examples/glm_blank_filling/glm_generate_samples.py b/examples/glm_blank_filling/glm_generate_samples.py index f6266887..b1837478 100644 --- a/examples/glm_blank_filling/glm_generate_samples.py +++ b/examples/glm_blank_filling/glm_generate_samples.py @@ -1,7 +1,8 @@ # Copyright © 2022 BAAI. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License") - +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer From a792538c4ef2e991374b0e89ad7d9fa61611e7ac Mon Sep 17 00:00:00 2001 From: ftgreat Date: Wed, 8 Mar 2023 14:37:51 +0800 Subject: [PATCH 34/54] updated Signed-off-by: ftgreat --- .../bminf_generate/gpt2_generate_original.py | 35 +++++++++++++++++++ .../glm_blank_filling/glm_generate_samples.py | 2 -- 2 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 examples/bminf_generate/gpt2_generate_original.py diff --git a/examples/bminf_generate/gpt2_generate_original.py b/examples/bminf_generate/gpt2_generate_original.py new file mode 100644 index 00000000..54ada5c6 --- /dev/null +++ b/examples/bminf_generate/gpt2_generate_original.py @@ -0,0 +1,35 @@ +# Copyright © 2022 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +import torch +from flagai.auto_model.auto_loader import AutoLoader +from flagai.model.predictor.predictor import Predictor +import bminf +import time + + +if __name__ == '__main__': + loader = AutoLoader("seq2seq", + "GPT2-base-ch", + model_dir="./checkpoints/") + model = loader.get_model() + tokenizer = loader.get_tokenizer() + time_start=time.time() + # with torch.cuda.device(0): + # model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30) + model.cuda() + predictor = Predictor(model, tokenizer) + + text = "今天天气不错" + + out_2 = predictor.predict_generate_randomsample(text, + input_max_length=512, + out_max_length=100, + repetition_penalty=1.5, + top_k=20, + top_p=0.8) + + time_end=time.time() + print('time cost',time_end-time_start,'s') + # print(f"out_1 is {out_1}") + print(f"out_2 is {out_2}") diff --git a/examples/glm_blank_filling/glm_generate_samples.py b/examples/glm_blank_filling/glm_generate_samples.py index b1837478..01b1bf00 100644 --- 
a/examples/glm_blank_filling/glm_generate_samples.py +++ b/examples/glm_blank_filling/glm_generate_samples.py @@ -1,8 +1,6 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer From ff5028be0d0121a73f0e165e0ab0af4622d906c5 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Fri, 10 Mar 2023 10:52:17 +0800 Subject: [PATCH 35/54] updated Signed-off-by: ftgreat --- .gitignore | 1 + examples/AltCLIP/altclip_finetuning.py | 4 +- examples/AltDiffusion/generate.py | 10 +- .../glm_blank_filling/glm_generate_samples.py | 25 +- examples/opt/generate_opt_350m.py | 1 + examples/roberta_ner/generate.py | 2 - flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py | 48 +++- .../tokenizer/uni_tokenizer/base_tokenizer.py | 3 +- .../tokenizer/uni_tokenizer/bpe_tokenizer.py | 2 +- .../data/tokenizer/uni_tokenizer/tokenizer.py | 48 +++- flagai/model/predictor/utils.py | 1 - tests/test_model.py | 162 ++++++++++++ tests/test_tokenizer.py | 234 +++++++++--------- 13 files changed, 391 insertions(+), 150 deletions(-) create mode 100644 tests/test_model.py diff --git a/.gitignore b/.gitignore index 2a481889..d8c8629e 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ datasets qqp glm_large_qqp_pytorch wandb +clip_benchmark_datasets diff --git a/examples/AltCLIP/altclip_finetuning.py b/examples/AltCLIP/altclip_finetuning.py index 2b95fd4c..d98095e7 100644 --- a/examples/AltCLIP/altclip_finetuning.py +++ b/examples/AltCLIP/altclip_finetuning.py @@ -1,6 +1,7 @@ # Copyright © 2022 BAAI. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License") +import sys;sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.auto_model.auto_loader import AutoLoader import os @@ -32,6 +33,7 @@ trainer = Trainer(env_type="pytorch", pytorch_device=device, experiment_name="clip_finetuning", + eval_interval=10, batch_size=4, lr=1e-4, epochs=10, @@ -62,4 +64,4 @@ def cifar10_collate_fn(batch): } if __name__ == "__main__": - trainer.train(model=model, train_dataset=dataset, collate_fn=cifar10_collate_fn) \ No newline at end of file + trainer.train(model=model, train_dataset=dataset, collate_fn=cifar10_collate_fn, metric_methods=["accuracy"]) \ No newline at end of file diff --git a/examples/AltDiffusion/generate.py b/examples/AltDiffusion/generate.py index 1fa2c88f..3fb72642 100644 --- a/examples/AltDiffusion/generate.py +++ b/examples/AltDiffusion/generate.py @@ -1,6 +1,7 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") +import sys;sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor @@ -9,7 +10,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") loader = AutoLoader(task_name="text2img", #contrastive learning - model_name="AltDiffusion-m9", + model_name="AltDiffusion", model_dir="./checkpoints", use_fp16=False) @@ -17,6 +18,9 @@ model.eval() model.to(device) predictor = Predictor(model) +# predictor.predict_generate_images( +# "Anime portrait of natalie portman as an anime girl by stanley artgerm lau, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation" +# ) predictor.predict_generate_images( - "Anime portrait of natalie portman as an anime girl by stanley artgerm lau, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation" -) + 
"Anime portrait of a crouching smiling baby inside a glass bottle, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation" +) \ No newline at end of file diff --git a/examples/glm_blank_filling/glm_generate_samples.py b/examples/glm_blank_filling/glm_generate_samples.py index 700407a3..f12fdfe9 100644 --- a/examples/glm_blank_filling/glm_generate_samples.py +++ b/examples/glm_blank_filling/glm_generate_samples.py @@ -7,22 +7,27 @@ from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer from flagai.model.predictor.predictor import Predictor +import bminf if __name__ == "__main__": """Main training program.""" print('Generate Samples') # Random seeds for reproducability. # Model, - model_name = 'GLM-large-ch' + model_name = 'GLM-10b-ch' model = GLMModel.from_pretrain(model_name=model_name, download_path="./checkpoints") tokenizer = Tokenizer.from_pretrained(model_name) - - # model.load_state_dict(torch.load("../glm_pretrain/checkpoints/1000/pytorch_model.bin")["module"]) - model.cuda(torch.cuda.current_device()) - + with torch.cuda.device(0): + model = bminf.wrapper(model, quantization=False, memory_limit=30 << 39) predictor = Predictor(model, tokenizer) + text = ["今天天气不错[gMASK]"] + for t in text: + output = predictor.predict_generate_randomsample( + t, top_k=50, repetition_penalty=4.0, top_p=1.0) + print(t, '\n', output) + # text = [ # '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]" # ] @@ -31,11 +36,11 @@ # t, top_k=50, repetition_penalty=4.0, top_p=1.0) # print(t, '\n', output) - text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"] - for t in text: - output = predictor.predict_generate_randomsample( - t, top_k=50, repetition_penalty=4.0, top_p=1.0) - print(t, '\n', output) + # text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"] + # for t in text: + # output = predictor.predict_generate_randomsample( + # t, top_k=50, 
repetition_penalty=4.0, top_p=1.0) + # print(t, '\n', output) # text = [ # "人工智能是一个以计算机科学为基础,由计算机、数学、哲学等多学科交叉融合的交叉学科,[sMASK],具有非常巨大的前景。", diff --git a/examples/opt/generate_opt_350m.py b/examples/opt/generate_opt_350m.py index 132eeffd..747266cf 100644 --- a/examples/opt/generate_opt_350m.py +++ b/examples/opt/generate_opt_350m.py @@ -1,3 +1,4 @@ +import sys;sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.model.predictor.predictor import Predictor from flagai.auto_model.auto_loader import AutoLoader import torch diff --git a/examples/roberta_ner/generate.py b/examples/roberta_ner/generate.py index 303a4d93..79b2dbb1 100644 --- a/examples/roberta_ner/generate.py +++ b/examples/roberta_ner/generate.py @@ -1,8 +1,6 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") -# import sys -# sys.path.append("/home/yanzhaodong/anhforth/FlagAI") import torch from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor diff --git a/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py b/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py index bd2616a4..24275e1f 100644 --- a/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py +++ b/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py @@ -35,10 +35,9 @@ class CPMTokenizer(object): def __init__(self, vocab_file, model_file, max_length=None): self.max_len = max_length if max_length is not None else int(1e12) self.encoder = json.load(open(vocab_file)) - import pdb;pdb.set_trace() self.decoder = {v: k for k, v in self.encoder.items()} - self.sp = spm.SentencePieceProcessor(model_file=model_file) + self.sp_model = spm.SentencePieceProcessor(model_file=model_file) self.translator = str.maketrans(" \n", "\u2582\u2583") self.token_start_id = 0 self.token_end_id = 3 @@ -49,6 +48,13 @@ def __init__(self, vocab_file, model_file, max_length=None): def vocab_size(self): return len(self.encoder) + def get_vocab(self): + vocab = { + 
self.convert_id_to_token(i): i + for i in range(self.vocab_size) + } + return vocab + def __len__(self): return len(self.encoder) + len(self.special_tokens) @@ -58,19 +64,28 @@ def eod(self): def tokenize(self, text): """ Tokenize a string. """ - seg_list = [ - x.translate(self.translator) - for x in jieba.cut(text, cut_all=False) - ] - new_seg = " ".join(seg_list) - return self.sp.encode(new_seg) + seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)] + new_seg = "".join(seg_list) + return self.sp_model.encode(new_seg) def encode(self, text): res = self.tokenize(text) return res + + def convert_tokens_to_ids(self, tokens): + return [self.sp_model.PieceToId(token) for token in tokens] + + def convert_token_to_id(self, token): + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, idx): + return self.sp_model.IdToPiece(int(idx)) + + def convert_ids_to_tokens(self, idxs): + return [self.sp_model.IdToPiece(int(idx)) for idx in idxs] def decode(self, tokens): - text = self.sp.decode(tokens) + text = self.sp_model.decode(tokens) text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n') return text @@ -79,3 +94,18 @@ def encode_plus(self, text, max_length=None): res = self.encode(text) return {"input_ids": res} + + def convert_tokens_to_string(self, tokens, all_command_token={}): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in all_command_token: + out_string += self.sp_model.decode_pieces( + current_sub_tokens) + token + " " + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + out_string += self.sp_model.decode_pieces(current_sub_tokens) + return out_string.strip() diff --git a/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py index f3583437..37629623 100644 
--- a/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py @@ -83,7 +83,8 @@ def from_pretrained(cls, *inputs, **kwargs) elif tokenizer_class == "sp": - return cls(sp_model_file=resolved_sp_file, + return cls(vocab_file=resolved_vocab_json_file, + sp_model_file=resolved_sp_file, tokenizer_class=tokenizer_class, tokenizer_model_name=tokenizer_model_name, tokenizer_json_file=resolved_tokenizer_json_file, diff --git a/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py index a873c54e..1f175e16 100644 --- a/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py @@ -152,7 +152,7 @@ def tokenize(self, text): def convert_token_to_id(self, token): """ Converts a sequence of tokens into ids using the vocab. """ - return self.encoder.get(token, 0) + return self.encoder[token] def convert_tokens_to_ids(self, tokens): """ Converts a sequence of tokens into ids using the vocab. 
""" diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index f14b4314..cec7a8fd 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -71,7 +71,7 @@ def __init__(self, elif self.tokenizer_class == "sp": if self.tokenizer_model_name.lower().startswith('cpm'): from flagai.data.tokenizer.cpm_1.cpm1_tokenizer import CPMTokenizer - self.text_tokenizer = CPMTokenizer(self.tokenizer_json_file, self.sp_model_file) + self.text_tokenizer = CPMTokenizer(self.vocab_file, self.sp_model_file) elif self.tokenizer_model_name.lower().startswith('cpm3'): from flagai.data.tokenizer.cpm_3.cpm3_tokenizer import CPMTokenizer self.text_tokenizer = CPMTokenizer(self.tokenizer_json_file, self.sp_model_file) @@ -85,6 +85,7 @@ def __init__(self, # self.is_clip = self.tokenizer_model_name.startswith('clip') self.num_tokens = self.text_tokenizer.vocab_size + # import pdb;pdb.set_trace() try: with open(self.special_tokens_map, encoding='utf8') as file: dct=json.load(file) sp_tokens = [(k.replace("_token",""),v['content']) for k,v in dct.items()] @@ -273,7 +274,7 @@ def TokenToId(self, token): def DecodeIds(self, ids): """converts ids to wordpiece tokens and joins them as a text string""" - tokens = [] + tokens = [] for id in ids: if id in self.command_id_map: tokens.append(self.command_id_map[id].token) @@ -287,10 +288,14 @@ def DecodeIds(self, ids): tokens, self.command_token_map) def encode(self, text): + if hasattr(self.text_tokenizer, "encode"): + return self.text_tokenizer.encode(text) return self.convert_tokens_to_ids( self.text_tokenizer.tokenize(text)) def decode(self, ids): + if hasattr(self.text_tokenizer, "decode"): + return self.text_tokenizer.decode(ids) return self.DecodeIds(ids) def DecodeTokens(self, tokens): @@ -451,12 +456,16 @@ def encode_plus( # for Seq2seq max_length=None, padding=True, ): - if not 
self.tokenizer_model_name.lower().startswith("glm") and not self.tokenizer_model_name.lower().startswith( + if hasattr(self.text_tokenizer, "encode_plus"): + return self.text_tokenizer.encode_plus(source_text) + elif not self.tokenizer_model_name.lower().startswith("glm") and not self.tokenizer_model_name.lower().startswith( "alm"): return self.encode_plus_non_glm(source_text, second_text, truncation, max_length) - elif self.tokenizer_model_name.lower().startswith("opt"): - return None + + + # elif self.tokenizer_model_name.lower().startswith("opt"): + # return None sop_id = self.get_command_id('sop') # start of piece eop_id = self.get_command_id('eop') # end of piece sep_id = self.get_command_id('sep') # seperation @@ -572,4 +581,31 @@ def tokenize(self, text, maxlen=None, add_spatial_tokens=False): if maxlen is not None: index = int(self.get_command_id('sep') is not None) + 1 self.truncate_sequence(maxlen, tokens, pop_index=-index) - return tokens \ No newline at end of file + return tokens + + # def search_special(self, name): + # if name == "cls": + # if self.check_special(''): return '' + # elif self.check_special('[CLS]'): return '' + # elif name == "pad": + # if self.check_special(''): return '' + # elif self.check_special(''): return '[PAD]' + # elif self.check_special(''): return '<|endoftext|>' + # elif name == "eos": + # if self.check_special(''): return '' + # elif self.check_special('|endoftext|'): return '|endoftext|' + # elif name == "sep": + # if self.check_special(''): return '' + # elif self.check_special('[SEP]'): return '[SEP]' + # elif name == "unk": + # if self.check_special(''): return '' + # elif self.check_special('[UNK]'): return '[UNK]' + # elif name == "bos": + # if self.check_special(''): return '' + + # def check_special(self, tk): + # try: + # self.text_tokenizer.convert_token_to_id(tk) + # return True + # except KeyError: + # return False diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py index 
0b4f92e8..f86f5ad5 100644 --- a/flagai/model/predictor/utils.py +++ b/flagai/model/predictor/utils.py @@ -1446,7 +1446,6 @@ def glm_generate_sample( mask_positions.sort() output_ = model(tokens, position_ids, attention_mask, return_memory=True) mems = output_['hidden_states'] - import pdb;pdb.set_trace() for mask_position in mask_positions: position = mask_position tokens, mems = glm_sample_sequence(model, diff --git a/tests/test_model.py b/tests/test_model.py new file mode 100644 index 00000000..aa93e4f7 --- /dev/null +++ b/tests/test_model.py @@ -0,0 +1,162 @@ +# Copyright © 2022 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") +import unittest +from flagai.data.tokenizer import Tokenizer +from flagai.auto_model.auto_loader import AutoLoader +from flagai.data.tokenizer import Tokenizer + +class TokenizerTestCase(unittest.TestCase): + + # def test_tokenizer_GLM_large_ch(self): + # tokenizer = Tokenizer.from_pretrained("GLM-large-ch") + # self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') + # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + # [3378, 1567, 2613, 20282], 'EncodeAsIds Error') + # self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + # ['▁今天', '吃饭', '吃了', '肯德基'], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + # [50006, 3378, 1567, 2613, 20282, 50001], 'encode_plus Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('pad', '<|endoftext|>', 50000), ('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), + # ('cls', '[CLS]', 50002), ('mask', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), + # ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)], 
'SpecialTokens error') + + # def test_tokenizer_GLM_large_en(self): + # tokenizer = Tokenizer.from_pretrained("GLM-large-en") + # self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + # [13017, 7975, 3084, 2033, 3407], '') + # self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + # 'fried chicken makes me happy', 'DecodeIds Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), + # ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), + # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) + + # # def test_tokenizer_glm_10b_en(self): + # # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") + # # self.assertEqual(tokenizer.TokenToId("day"), 820, '') + # # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + # # [25520, 9015, 1838, 502, 3772], '') + # # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), + # # 'fried chicken makes me happy', 'DecodeIds Error') + + # def test_tokenizer_t5(self): + # tokenizer = Tokenizer.from_pretrained('T5-base-ch') + # # import pdb;pdb.set_trace() + # self.assertEqual(tokenizer.TokenToId("人"), 297, '') + # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + # [306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166], '') + # self.assertEqual(tokenizer.DecodeIds([306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + # encode_plus_result = tokenizer.encode_plus("今天吃饭吃了肯德基") + # self.assertEqual(list(encode_plus_result.keys()), + # ['input_ids', 'token_type_ids'], 'encode_plus Error') + # self.assertEqual(encode_plus_result['input_ids'], + # [101, 306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166, 102], 'encode_plus Error') + + # def test_tokenizer_roberta(self): + # 
tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') + # # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) + # self.assertEqual(tokenizer.TokenToId("人"), 782, '') + # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + # [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') + # self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + # ['今', '天', '吃', '饭', '吃', '了', '肯', '德', '基'], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + # [101, 791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825, 102], 'encode_plus Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), + # ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)], 'SpecialTokens error') + + # def test_tokenizer_bert(self): + # tokenizer = Tokenizer.from_pretrained('BERT-base-en') + # self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + # [13017, 7975, 3084, 2033, 3407], '') + # self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + # 'fried chicken makes me happy', 'DecodeIds Error') + # self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), + # ['fried', 'chicken', 'makes', 'me', 'happy'], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], + # [101, 13017, 7975, 3084, 2033, 3407, 102], 'encode_plus Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), + # ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)], 'SpecialTokens error') + + def test_tokenizer_cpm1(self): + loader = 
AutoLoader(task_name="lm", + model_name="CPM-large-ch", + model_dir="./checkpoints/", + only_download_config=True) + + tokenizer = loader.get_tokenizer() + self.assertEqual(tokenizer.TokenToId("人"), 62, '') + self.assertEqual(tokenizer.encode("今天吃饭吃了肯德基"), + [837, 3079, 1777, 3079, 139, 3687, 513, 1463], '') + self.assertEqual(tokenizer.DecodeIds([837, 3079, 1777, 3079, 139, 3687, 513, 1463]), + '今天吃饭吃了肯德基', 'DecodeIds Error') + self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'tokenize Error') + self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'encode_plus Error') + self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), + ('mask', '', 6), ('eod', '', 7)], 'SpecialTokens error') + + # def test_tokenizer_opt(self): + # tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') + # self.assertEqual(tokenizer.encode("day"), [1208], '') + # self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], + # [0, 21209, 5884, 817, 162, 1372, 2], '') + # self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), + # 'fried chicken makes me happy', 'DecodeIds Error') + # self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), + # ['fried', 'Ġchicken', 'Ġmakes', 'Ġme', 'Ġhappy'], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], + # [0, 21209, 5884, 817, 162, 1372, 2], 'encode_plus Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('cls', '', 0), ('pad', '', 1), ('bos', '', 2), ('eos', '', 2), ('unk', '', 3)], 'SpecialTokens error') + + + # def test_tokenizer_clip(self): + # loader = AutoLoader(task_name="txt_img_matching", + # model_name="clip-base-p32-224", + # only_download_config=True) + # tokenizer = 
loader.get_tokenizer() + # self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') + + # def test_tokenizer_evaclip(self): + # loader = AutoLoader(task_name="txt_img_matching", + # model_name="eva-clip", + # only_download_config=True) + # tokenizer = loader.get_tokenizer() + # self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') + + +def suite(): + suite = unittest.TestSuite() + # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) + # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) + # suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) + # suite.addTest(TokenizerTestCase('test_tokenizer_t5')) + # suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) + # suite.addTest(TokenizerTestCase('test_tokenizer_bert')) + suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) + # suite.addTest(TokenizerTestCase('test_tokenizer_opt')) + # suite.addTest(TokenizerTestCase('test_tokenizer_clip')) + # suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) + + return suite + + +if __name__ == '__main__': + runner = unittest.TextTestRunner() + runner.run(suite()) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 01ecd062..00160d9b 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -10,147 +10,149 @@ class TokenizerTestCase(unittest.TestCase): - # def test_tokenizer_GLM_large_ch(self): - # tokenizer = Tokenizer.from_pretrained("GLM-large-ch") - # self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') - # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - # [3378, 1567, 2613, 20282], 'EncodeAsIds Error') - # self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), - # '今天吃饭吃了肯德基', 'DecodeIds Error') - # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), - # ['▁今天', '吃饭', '吃了', '肯德基'], 'tokenize Error') - # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], - # [50006, 3378, 
1567, 2613, 20282, 50001], 'encode_plus Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # [('pad', '<|endoftext|>', 50000), ('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), - # ('cls', '[CLS]', 50002), ('mask', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), - # ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)], 'SpecialTokens error') + def test_tokenizer_GLM_large_ch(self): + tokenizer = Tokenizer.from_pretrained("GLM-large-ch") + self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') + self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + [3378, 1567, 2613, 20282], 'EncodeAsIds Error') + self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), + '今天吃饭吃了肯德基', 'DecodeIds Error') + self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + ['▁今天', '吃饭', '吃了', '肯德基'], 'tokenize Error') + self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + [50006, 3378, 1567, 2613, 20282, 50001], 'encode_plus Error') + self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + [('pad', '<|endoftext|>', 50000), ('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), + ('cls', '[CLS]', 50002), ('mask', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), + ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)], 'SpecialTokens error') - # def test_tokenizer_GLM_large_en(self): - # tokenizer = Tokenizer.from_pretrained("GLM-large-en") - # self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + def test_tokenizer_GLM_large_en(self): + tokenizer = Tokenizer.from_pretrained("GLM-large-en") + self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + [13017, 7975, 3084, 2033, 3407], '') + self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + 
'fried chicken makes me happy', 'DecodeIds Error') + self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), + ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), + ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) + + # def test_tokenizer_glm_10b_en(self): + # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") + # self.assertEqual(tokenizer.TokenToId("day"), 820, '') # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - # [13017, 7975, 3084, 2033, 3407], '') - # self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + # [25520, 9015, 1838, 502, 3772], '') + # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), # 'fried chicken makes me happy', 'DecodeIds Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), - # ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), - # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) - - # # def test_tokenizer_glm_10b_en(self): - # # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") - # # self.assertEqual(tokenizer.TokenToId("day"), 820, '') - # # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - # # [25520, 9015, 1838, 502, 3772], '') - # # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), - # # 'fried chicken makes me happy', 'DecodeIds Error') - # def test_tokenizer_t5(self): - # tokenizer = Tokenizer.from_pretrained('T5-base-ch') - # # import pdb;pdb.set_trace() - # self.assertEqual(tokenizer.TokenToId("人"), 297, '') - # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - # [306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166], 
'') - # self.assertEqual(tokenizer.DecodeIds([306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166]), - # '今天吃饭吃了肯德基', 'DecodeIds Error') - # encode_plus_result = tokenizer.encode_plus("今天吃饭吃了肯德基") - # self.assertEqual(list(encode_plus_result.keys()), - # ['input_ids', 'token_type_ids'], 'encode_plus Error') - # self.assertEqual(encode_plus_result['input_ids'], - # [101, 306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166, 102], 'encode_plus Error') + def test_tokenizer_t5(self): + tokenizer = Tokenizer.from_pretrained('T5-base-ch') + # import pdb;pdb.set_trace() + self.assertEqual(tokenizer.TokenToId("人"), 297, '') + self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + [306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166], '') + self.assertEqual(tokenizer.DecodeIds([306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166]), + '今天吃饭吃了肯德基', 'DecodeIds Error') + encode_plus_result = tokenizer.encode_plus("今天吃饭吃了肯德基") + self.assertEqual(list(encode_plus_result.keys()), + ['input_ids', 'token_type_ids'], 'encode_plus Error') + self.assertEqual(encode_plus_result['input_ids'], + [101, 306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166, 102], 'encode_plus Error') - # def test_tokenizer_roberta(self): - # tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') - # # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) - # self.assertEqual(tokenizer.TokenToId("人"), 782, '') - # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - # [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') - # self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), - # '今天吃饭吃了肯德基', 'DecodeIds Error') - # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), - # ['今', '天', '吃', '饭', '吃', '了', '肯', '德', '基'], 'tokenize Error') - # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], - # [101, 791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825, 102], 'encode_plus Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in 
tokenizer.command_name_map.items()], - # [('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), - # ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)], 'SpecialTokens error') + def test_tokenizer_roberta(self): + tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') + # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) + self.assertEqual(tokenizer.TokenToId("人"), 782, '') + self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') + self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), + '今天吃饭吃了肯德基', 'DecodeIds Error') + self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + ['今', '天', '吃', '饭', '吃', '了', '肯', '德', '基'], 'tokenize Error') + self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + [101, 791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825, 102], 'encode_plus Error') + self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + [('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), + ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)], 'SpecialTokens error') - # def test_tokenizer_bert(self): - # tokenizer = Tokenizer.from_pretrained('BERT-base-en') - # self.assertEqual(tokenizer.TokenToId("day"), 2154, '') - # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - # [13017, 7975, 3084, 2033, 3407], '') - # self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), - # 'fried chicken makes me happy', 'DecodeIds Error') - # self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), - # ['fried', 'chicken', 'makes', 'me', 'happy'], 'tokenize Error') - # self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], - # [101, 13017, 7975, 3084, 2033, 3407, 102], 'encode_plus Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # 
[('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), - # ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)], 'SpecialTokens error') + def test_tokenizer_bert(self): + tokenizer = Tokenizer.from_pretrained('BERT-base-en') + self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + [13017, 7975, 3084, 2033, 3407], '') + self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + 'fried chicken makes me happy', 'DecodeIds Error') + self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), + ['fried', 'chicken', 'makes', 'me', 'happy'], 'tokenize Error') + self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], + [101, 13017, 7975, 3084, 2033, 3407, 102], 'encode_plus Error') + self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + [('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), + ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)], 'SpecialTokens error') def test_tokenizer_cpm1(self): loader = AutoLoader(task_name="lm", model_name="CPM-large-ch", model_dir="./checkpoints/", only_download_config=True) + tokenizer = loader.get_tokenizer() - self.assertEqual(tokenizer.encode("day"), [8, 8275], '') - self.assertEqual(tokenizer.encode("fried chicken makes me happy"), - [2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239], '') - self.assertEqual(tokenizer.decode([2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239]), + self.assertEqual(tokenizer.TokenToId("人"), 62, '') + self.assertEqual(tokenizer.encode("今天吃饭吃了肯德基"), + [837, 3079, 1777, 3079, 139, 3687, 513, 1463], '') + self.assertEqual(tokenizer.DecodeIds([837, 3079, 1777, 3079, 139, 3687, 513, 1463]), + '今天吃饭吃了肯德基', 'DecodeIds Error') + self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'tokenize Error') + 
self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'encode_plus Error') + self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), + ('mask', '', 6), ('eod', '', 7)], 'SpecialTokens error') + + def test_tokenizer_opt(self): + tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') + self.assertEqual(tokenizer.encode("day"), [1208], '') + self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], + [0, 21209, 5884, 817, 162, 1372, 2], '') + self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), 'fried chicken makes me happy', 'DecodeIds Error') self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), - ['▁f', 'ried', '▁ch', 'ick', 'en', '▁make', 's', '▁me', '▁happy'], 'tokenize Error') + ['fried', 'Ġchicken', 'Ġmakes', 'Ġme', 'Ġhappy'], 'tokenize Error') self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], - [1, 2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239, 2], 'encode_plus Error') + [0, 21209, 5884, 817, 162, 1372, 2], 'encode_plus Error') self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), ('mask', '', 6), ('eod', '', 7), ('eop', '', 0)]) - - # def test_tokenizer_opt(self): - # tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') - # self.assertEqual(tokenizer.encode("day"), [1208], '') - # self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], - # [0, 21209, 5884, 817, 162, 1372, 2], '') - # self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), - # 'fried chicken makes me happy', 'DecodeIds Error') - # self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), - # ['fried', 'Ġchicken', 'Ġmakes', 'Ġme', 'Ġhappy'], 'tokenize Error') - # 
self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], - # [0, 21209, 5884, 817, 162, 1372, 2], 'encode_plus Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # [('cls', '', 0), ('pad', '', 1), ('bos', '', 2), ('eos', '', 2), ('unk', '', 3)], 'SpecialTokens error') + [('cls', '', 0), ('pad', '', 1), ('bos', '', 2), ('eos', '', 2), ('unk', '', 3)], 'SpecialTokens error') - # def test_tokenizer_clip(self): - # loader = AutoLoader(task_name="txt_img_matching", - # model_name="clip-base-p32-224", - # only_download_config=True) - # tokenizer = loader.get_tokenizer() - # self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') + def test_tokenizer_clip(self): + loader = AutoLoader(task_name="txt_img_matching", + model_name="clip-base-p32-224", + only_download_config=True) + tokenizer = loader.get_tokenizer() + self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') - # def test_tokenizer_evaclip(self): - # loader = AutoLoader(task_name="txt_img_matching", - # model_name="eva-clip", - # only_download_config=True) - # tokenizer = loader.get_tokenizer() - # self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') + def test_tokenizer_evaclip(self): + loader = AutoLoader(task_name="txt_img_matching", + model_name="eva-clip", + only_download_config=True) + tokenizer = loader.get_tokenizer() + self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') def suite(): suite = unittest.TestSuite() - # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) - # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) - # suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) - # suite.addTest(TokenizerTestCase('test_tokenizer_t5')) - # suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) - # 
suite.addTest(TokenizerTestCase('test_tokenizer_bert')) + suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) + suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) + suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) + suite.addTest(TokenizerTestCase('test_tokenizer_t5')) + suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) + suite.addTest(TokenizerTestCase('test_tokenizer_bert')) suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) - # suite.addTest(TokenizerTestCase('test_tokenizer_opt')) - # suite.addTest(TokenizerTestCase('test_tokenizer_clip')) - # suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) + suite.addTest(TokenizerTestCase('test_tokenizer_opt')) + suite.addTest(TokenizerTestCase('test_tokenizer_clip')) + suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) return suite From 523bb6156e450019a3e420e72846b381ac0881a4 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Fri, 10 Mar 2023 11:03:39 +0800 Subject: [PATCH 36/54] ignore file updated Signed-off-by: ftgreat --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2a481889..9a6b945c 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ datasets qqp glm_large_qqp_pytorch wandb +clip_benchmark_datasets/ \ No newline at end of file From c8c3e60caa1cf65c69de51a80bdc1e9cad3beb65 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Fri, 10 Mar 2023 15:38:09 +0800 Subject: [PATCH 37/54] saved work Signed-off-by: ftgreat --- README.md | 2 +- .../data/tokenizer/uni_tokenizer/tokenizer.py | 59 +++--- tests/test_tokenizer.py | 182 +++++++++--------- 3 files changed, 125 insertions(+), 118 deletions(-) diff --git a/README.md b/README.md index 7dee911d..42315c56 100644 --- a/README.md +++ b/README.md @@ -259,6 +259,6 @@ The majority of FlagAI is licensed under the [Apache 2.0 license](LICENSE), howe ### ↳ Star History
-[![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)](https://star-history.com/#baaivision/EVA&Date) +[![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)]
diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index cec7a8fd..efdfb34e 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -53,6 +53,7 @@ def __init__(self, add_decoder_mask=False, fix_command_token=False, pre_tokenizer=None, + special_tokens=['cls','pad','unk','eos','bos','sep'], **kwargs): super().__init__(**kwargs) if self.tokenizer_class == "wp": @@ -92,6 +93,10 @@ def __init__(self, except FileNotFoundError: dct = None sp_tokens = [] + for tk in special_tokens: + res = self.search_special(tk) + if res: + sp_tokens += [(tk, res)] self._command_tokens = [CommandToken(e[0], e[1], self.text_tokenizer.convert_token_to_id(e[1])) for e in sp_tokens] if self.tokenizer_model_name.lower().startswith("glm"): @@ -583,29 +588,31 @@ def tokenize(self, text, maxlen=None, add_spatial_tokens=False): self.truncate_sequence(maxlen, tokens, pop_index=-index) return tokens - # def search_special(self, name): - # if name == "cls": - # if self.check_special(''): return '' - # elif self.check_special('[CLS]'): return '' - # elif name == "pad": - # if self.check_special(''): return '' - # elif self.check_special(''): return '[PAD]' - # elif self.check_special(''): return '<|endoftext|>' - # elif name == "eos": - # if self.check_special(''): return '' - # elif self.check_special('|endoftext|'): return '|endoftext|' - # elif name == "sep": - # if self.check_special(''): return '' - # elif self.check_special('[SEP]'): return '[SEP]' - # elif name == "unk": - # if self.check_special(''): return '' - # elif self.check_special('[UNK]'): return '[UNK]' - # elif name == "bos": - # if self.check_special(''): return '' - - # def check_special(self, tk): - # try: - # self.text_tokenizer.convert_token_to_id(tk) - # return True - # except KeyError: - # return False + def search_special(self, name): + if name == "cls": + if self.check_special(''): return '' + elif 
self.check_special('[CLS]'): return '[CLS]' + elif name == "pad": + if self.check_special(''): return '' + elif self.check_special('[PAD]'): return '[PAD]' + elif self.check_special('<|endoftext|>'): return '<|endoftext|>' + elif name == "eos": + if self.check_special(''): return '' + elif self.check_special('|endoftext|'): return '|endoftext|' + elif self.check_special('[PAD]'): return '[PAD]' + elif name == "sep": + if self.check_special(''): return '' + elif self.check_special('[SEP]'): return '[SEP]' + elif name == "unk": + if self.check_special(''): return '' + elif self.check_special('[UNK]'): return '[UNK]' + elif name == "bos": + if self.check_special(''): return '' + return None + + def check_special(self, tk): + try: + self.text_tokenizer.convert_token_to_id(tk) + return True + except KeyError: + return False diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 00160d9b..2c08dd32 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -10,33 +10,33 @@ class TokenizerTestCase(unittest.TestCase): - def test_tokenizer_GLM_large_ch(self): - tokenizer = Tokenizer.from_pretrained("GLM-large-ch") - self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') - self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - [3378, 1567, 2613, 20282], 'EncodeAsIds Error') - self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), - '今天吃饭吃了肯德基', 'DecodeIds Error') - self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), - ['▁今天', '吃饭', '吃了', '肯德基'], 'tokenize Error') - self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], - [50006, 3378, 1567, 2613, 20282, 50001], 'encode_plus Error') - self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - [('pad', '<|endoftext|>', 50000), ('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), - ('cls', '[CLS]', 50002), ('mask', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), - ('eop', '<|endofpiece|>', 50007), 
('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)], 'SpecialTokens error') + # def test_tokenizer_GLM_large_ch(self): + # tokenizer = Tokenizer.from_pretrained("GLM-large-ch") + # self.assertEqual(tokenizer.TokenToId("人"), 43371, 'Token id "人" error') + # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + # [3378, 1567, 2613, 20282], 'EncodeAsIds Error') + # self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + # ['▁今天', '吃饭', '吃了', '肯德基'], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + # [50006, 3378, 1567, 2613, 20282, 50001], 'encode_plus Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('pad', '<|endoftext|>', 50000), ('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001), + # ('cls', '[CLS]', 50002), ('mask', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006), + # ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)], 'SpecialTokens error') - def test_tokenizer_GLM_large_en(self): - tokenizer = Tokenizer.from_pretrained("GLM-large-en") - self.assertEqual(tokenizer.TokenToId("day"), 2154, '') - self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - [13017, 7975, 3084, 2033, 3407], '') - self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), - 'fried chicken makes me happy', 'DecodeIds Error') - self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), - ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), - ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) + # def test_tokenizer_GLM_large_en(self): + # tokenizer = Tokenizer.from_pretrained("GLM-large-en") + # 
self.assertEqual(tokenizer.TokenToId("day"), 2154, '') + # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + # [13017, 7975, 3084, 2033, 3407], '') + # self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]), + # 'fried chicken makes me happy', 'DecodeIds Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), + # ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), + # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) # def test_tokenizer_glm_10b_en(self): # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") @@ -46,35 +46,35 @@ def test_tokenizer_GLM_large_en(self): # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), # 'fried chicken makes me happy', 'DecodeIds Error') - def test_tokenizer_t5(self): - tokenizer = Tokenizer.from_pretrained('T5-base-ch') - # import pdb;pdb.set_trace() - self.assertEqual(tokenizer.TokenToId("人"), 297, '') - self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - [306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166], '') - self.assertEqual(tokenizer.DecodeIds([306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166]), - '今天吃饭吃了肯德基', 'DecodeIds Error') - encode_plus_result = tokenizer.encode_plus("今天吃饭吃了肯德基") - self.assertEqual(list(encode_plus_result.keys()), - ['input_ids', 'token_type_ids'], 'encode_plus Error') - self.assertEqual(encode_plus_result['input_ids'], - [101, 306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166, 102], 'encode_plus Error') + # def test_tokenizer_t5(self): + # tokenizer = Tokenizer.from_pretrained('T5-base-ch') + # # import pdb;pdb.set_trace() + # self.assertEqual(tokenizer.TokenToId("人"), 297, '') + # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + # [306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166], '') + # self.assertEqual(tokenizer.DecodeIds([306, 
1231, 798, 5447, 798, 266, 4017, 1738, 1166]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + # encode_plus_result = tokenizer.encode_plus("今天吃饭吃了肯德基") + # self.assertEqual(list(encode_plus_result.keys()), + # ['input_ids', 'token_type_ids'], 'encode_plus Error') + # self.assertEqual(encode_plus_result['input_ids'], + # [101, 306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166, 102], 'encode_plus Error') - def test_tokenizer_roberta(self): - tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') - # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) - self.assertEqual(tokenizer.TokenToId("人"), 782, '') - self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), - [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') - self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), - '今天吃饭吃了肯德基', 'DecodeIds Error') - self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), - ['今', '天', '吃', '饭', '吃', '了', '肯', '德', '基'], 'tokenize Error') - self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], - [101, 791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825, 102], 'encode_plus Error') - self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - [('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), - ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)], 'SpecialTokens error') + # def test_tokenizer_roberta(self): + # tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') + # # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) + # self.assertEqual(tokenizer.TokenToId("人"), 782, '') + # self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), + # [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') + # self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + # ['今', '天', '吃', '饭', '吃', '了', '肯', '德', '基'], 
'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + # [101, 791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825, 102], 'encode_plus Error') + # self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), + # {('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), ('eos', '[PAD]', 0), + # ('pad', '[PAD]', 0)}, 'SpecialTokens error') def test_tokenizer_bert(self): tokenizer = Tokenizer.from_pretrained('BERT-base-en') @@ -87,9 +87,9 @@ def test_tokenizer_bert(self): ['fried', 'chicken', 'makes', 'me', 'happy'], 'tokenize Error') self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], [101, 13017, 7975, 3084, 2033, 3407, 102], 'encode_plus Error') - self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - [('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), - ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)], 'SpecialTokens error') + self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), + {('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), + ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)}, 'SpecialTokens error') def test_tokenizer_cpm1(self): loader = AutoLoader(task_name="lm", @@ -111,48 +111,48 @@ def test_tokenizer_cpm1(self): [('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), ('mask', '', 6), ('eod', '', 7)], 'SpecialTokens error') - def test_tokenizer_opt(self): - tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') - self.assertEqual(tokenizer.encode("day"), [1208], '') - self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], - [0, 21209, 5884, 817, 162, 1372, 2], '') - self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), - 'fried chicken makes me happy', 'DecodeIds Error') - self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), - ['fried', 
'Ġchicken', 'Ġmakes', 'Ġme', 'Ġhappy'], 'tokenize Error') - self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], - [0, 21209, 5884, 817, 162, 1372, 2], 'encode_plus Error') - self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - [('cls', '', 0), ('pad', '', 1), ('bos', '', 2), ('eos', '', 2), ('unk', '', 3)], 'SpecialTokens error') + # def test_tokenizer_opt(self): + # tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') + # self.assertEqual(tokenizer.encode("day"), [1208], '') + # self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], + # [0, 21209, 5884, 817, 162, 1372, 2], '') + # self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), + # 'fried chicken makes me happy', 'DecodeIds Error') + # self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), + # ['fried', 'Ġchicken', 'Ġmakes', 'Ġme', 'Ġhappy'], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], + # [0, 21209, 5884, 817, 162, 1372, 2], 'encode_plus Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('cls', '', 0), ('pad', '', 1), ('bos', '', 2), ('eos', '', 2), ('unk', '', 3)], 'SpecialTokens error') - def test_tokenizer_clip(self): - loader = AutoLoader(task_name="txt_img_matching", - model_name="clip-base-p32-224", - only_download_config=True) - tokenizer = loader.get_tokenizer() - self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') + # def test_tokenizer_clip(self): + # loader = AutoLoader(task_name="txt_img_matching", + # model_name="clip-base-p32-224", + # only_download_config=True) + # tokenizer = loader.get_tokenizer() + # self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') - def test_tokenizer_evaclip(self): - loader = AutoLoader(task_name="txt_img_matching", - model_name="eva-clip", - 
only_download_config=True) - tokenizer = loader.get_tokenizer() - self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') + # def test_tokenizer_evaclip(self): + # loader = AutoLoader(task_name="txt_img_matching", + # model_name="eva-clip", + # only_download_config=True) + # tokenizer = loader.get_tokenizer() + # self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '') def suite(): suite = unittest.TestSuite() - suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) - suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) - suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) - suite.addTest(TokenizerTestCase('test_tokenizer_t5')) - suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) - suite.addTest(TokenizerTestCase('test_tokenizer_bert')) + # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) + # suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) + # suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) + # suite.addTest(TokenizerTestCase('test_tokenizer_t5')) + # suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) + # suite.addTest(TokenizerTestCase('test_tokenizer_bert')) suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) - suite.addTest(TokenizerTestCase('test_tokenizer_opt')) - suite.addTest(TokenizerTestCase('test_tokenizer_clip')) - suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) + # suite.addTest(TokenizerTestCase('test_tokenizer_opt')) + # suite.addTest(TokenizerTestCase('test_tokenizer_clip')) + # suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) return suite From 1a3bc5e9e8e0262778a815db94f121ed1f70d5cb Mon Sep 17 00:00:00 2001 From: ftgreat Date: Tue, 14 Mar 2023 09:58:03 +0800 Subject: [PATCH 38/54] fixed Signed-off-by: ftgreat --- examples/bminf_generate/glm_generate.py | 20 ++++++++++++++++++++ flagai/trainer.py | 1 - 2 files changed, 20 insertions(+), 1 deletion(-) create 
mode 100644 examples/bminf_generate/glm_generate.py diff --git a/examples/bminf_generate/glm_generate.py b/examples/bminf_generate/glm_generate.py new file mode 100644 index 00000000..ddf3f730 --- /dev/null +++ b/examples/bminf_generate/glm_generate.py @@ -0,0 +1,20 @@ +from flagai.model.glm_model import GLMModel +from flagai.data.tokenizer import Tokenizer +from flagai.auto_model.auto_loader import AutoLoader +from flagai.model.predictor.predictor import Predictor +import torch +import bminf + +model_name = 'GLM-10b-ch' +loader = AutoLoader("lm", 'GLM-10b-ch', model_dir="./checkpoints/") +model = loader.get_model() +tokenizer = loader.get_tokenizer() +with torch.cuda.device(0): + model = bminf.wrapper(model, quantization=False, memory_limit=30 << 39) + +tokenizer = Tokenizer.from_pretrained(model_name) +predictor = Predictor(model, tokenizer) + +text = "今天天气不错[gMASK]" +output = predictor.predict_generate_randomsample(text, out_max_length=10) +print(text, '\n', output) \ No newline at end of file diff --git a/flagai/trainer.py b/flagai/trainer.py index 577909d6..07d257d8 100644 --- a/flagai/trainer.py +++ b/flagai/trainer.py @@ -1009,7 +1009,6 @@ def evaluate(self, labels = data_iterator['labels'] else: labels = data_iterator['target_ids'] - loss_mask = data_iterator['loss_mask'] if len(self.metric_methods) != 0: if {metric_tuple[0] for metric_tuple in self.metric_methods} & {"rouge", "bleu"}: batch_preds = torch.argmax(logits.detach(), dim=-1).cpu() From 3f46c38f4cfa8ba5023e95666a5a1f93175d9632 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Wed, 15 Mar 2023 14:34:15 +0800 Subject: [PATCH 39/54] add optimzier Signed-off-by: ftgreat --- flagai/optimizers.py | 27 +++++++++++++++++++++++++++ flagai/trainer.py | 5 ++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/flagai/optimizers.py b/flagai/optimizers.py index 43be0138..1ed3ccd1 100644 --- a/flagai/optimizers.py +++ b/flagai/optimizers.py @@ -103,6 +103,33 @@ def get_optimizer(param_groups, lr=lr, 
relative_step=False, warmup_init=False) + elif optimizer == 'AdamW': + optimizer = AdamW(param_groups, + lr=lr, + weight_decay=weight_decay, + betas=(adam_beta1, adam_beta2), + eps=adam_eps) + elif optimizer == 'Lion': + from lion_pytorch import Lion + optimizer = Lion(param_groups, + lr=lr, + weight_decay=weight_decay, + betas=(adam_beta1, adam_beta2) + ) + elif optimizer == 'Adan': + from adan import Adan + optimizer = Adan(param_groups, + lr=lr, + weight_decay=weight_decay, + betas=(adam_beta1, adam_beta2, 0.99), + eps=adam_eps) + elif optimizer == 'LAMB': + from torch_optimizer import Lamb + optimizer = Lamb(param_groups, + lr=lr, + weight_decay=weight_decay, + betas=(adam_beta1, adam_beta2), + eps=adam_eps) else: raise NotImplementedError diff --git a/flagai/trainer.py b/flagai/trainer.py index 07d257d8..50309849 100644 --- a/flagai/trainer.py +++ b/flagai/trainer.py @@ -159,6 +159,7 @@ def __init__( deepspeed_config=None, model_parallel_size=1, training_script="train.py", + optimizer_type='adam', ): if timers is not None: @@ -185,6 +186,8 @@ def __init__( self.eval_interval = eval_interval self.tokenizer = tokenizer + self.optimizer_type = optimizer_type + # model checkpointing self.save_dir = save_dir self.save_interval = save_interval @@ -471,7 +474,7 @@ def train(self, cpu_optimizer=False, cpu_torch_adam=False, fp16=self.fp16, - optimizer='adam') # if not self.fp16 else 'adafactor') + optimizer=self.optimizer_type) # if not self.fp16 else 'adafactor') if lr_scheduler == None and optimizer != None and self.warm_up > 0 and 'deepspeed' not in self.env_type and self.epochs > 0: if self.env_type == 'bmtrain': From 12433c8b1bde1b5b4c753cffa7114b0b0872b6ce Mon Sep 17 00:00:00 2001 From: ftgreat Date: Wed, 15 Mar 2023 14:56:49 +0800 Subject: [PATCH 40/54] updated Signed-off-by: ftgreat --- README.md | 2 +- examples/AltCLIP/altclip_finetuning.py | 3 +- examples/AltDiffusion/generate.py | 7 +--- .../bert_title_generation_english/generate.py | 2 +- 
.../glm_blank_filling/glm_generate_samples.py | 42 +++---------------- examples/roberta_semantic_matching/train.py | 2 +- flagai/data/dataset/block/blocklm_utils.py | 8 ++-- .../data/dataset/data_collator/collate_fn.py | 14 +++---- flagai/data/dataset/data_utils.py | 4 +- flagai/data/dataset/language_model/dataset.py | 4 +- flagai/data/dataset/seq2seq/dataset.py | 4 +- flagai/data/dataset/superglue/pvp.py | 8 ++-- flagai/data/tokenizer/bert/bert_tokenizer.py | 2 +- .../galactica/galactica_tokenizer.py | 2 +- .../glm_10b_en/glm_10b_en_bpe_tokenizer.py | 4 +- .../glm_large_ch/glm_large_ch_tokenizer.py | 2 +- .../glm_large_en/glm_large_en_tokenizer.py | 2 +- flagai/data/tokenizer/opt/opt_en_tokenizer.py | 2 +- .../tokenizer/roberta/roberta_tokenizer.py | 2 +- flagai/data/tokenizer/t5/t5_tokenizer.py | 2 +- flagai/data/tokenizer/tokenizer.py | 2 +- flagai/model/predictor/utils.py | 4 +- flagai/test_utils.py | 2 +- 23 files changed, 46 insertions(+), 80 deletions(-) diff --git a/README.md b/README.md index 416ccc80..d48f1be3 100644 --- a/README.md +++ b/README.md @@ -260,6 +260,6 @@ The majority of FlagAI is licensed under the [Apache 2.0 license](LICENSE), howe ### ↳ Star History
-[![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)] +![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)]
diff --git a/examples/AltCLIP/altclip_finetuning.py b/examples/AltCLIP/altclip_finetuning.py index 7eeb9703..2b95fd4c 100644 --- a/examples/AltCLIP/altclip_finetuning.py +++ b/examples/AltCLIP/altclip_finetuning.py @@ -32,7 +32,6 @@ trainer = Trainer(env_type="pytorch", pytorch_device=device, experiment_name="clip_finetuning", - eval_interval=10, batch_size=4, lr=1e-4, epochs=10, @@ -63,4 +62,4 @@ def cifar10_collate_fn(batch): } if __name__ == "__main__": - trainer.train(model=model, train_dataset=dataset, collate_fn=cifar10_collate_fn, metric_methods=["accuracy"]) \ No newline at end of file + trainer.train(model=model, train_dataset=dataset, collate_fn=cifar10_collate_fn) \ No newline at end of file diff --git a/examples/AltDiffusion/generate.py b/examples/AltDiffusion/generate.py index d55a364c..79e59d17 100644 --- a/examples/AltDiffusion/generate.py +++ b/examples/AltDiffusion/generate.py @@ -17,9 +17,6 @@ model.eval() model.to(device) predictor = Predictor(model) -# predictor.predict_generate_images( -# "Anime portrait of natalie portman as an anime girl by stanley artgerm lau, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation" -# ) predictor.predict_generate_images( - "Anime portrait of a crouching smiling baby inside a glass bottle, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation" -) \ No newline at end of file + "Anime portrait of natalie portman as an anime girl by stanley artgerm lau, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation" +) diff --git a/examples/bert_title_generation_english/generate.py b/examples/bert_title_generation_english/generate.py index 1124d16d..fdfa2f41 100644 --- a/examples/bert_title_generation_english/generate.py +++ b/examples/bert_title_generation_english/generate.py @@ -7,7 +7,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -model_dir 
= "../state_dict/" +model_dir = "./checkpoints/" # Note "./checkpoints_seq2seq/{}/mp_rank_00_model_states.pt", {} is a directory in the checkpoints_seq2seq. model_save_path = "./checkpoints_seq2seq/7079/mp_rank_00_model_states.pt" diff --git a/examples/glm_blank_filling/glm_generate_samples.py b/examples/glm_blank_filling/glm_generate_samples.py index 4bcdf374..01b1bf00 100644 --- a/examples/glm_blank_filling/glm_generate_samples.py +++ b/examples/glm_blank_filling/glm_generate_samples.py @@ -5,58 +5,29 @@ from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer from flagai.model.predictor.predictor import Predictor -import bminf if __name__ == "__main__": """Main training program.""" print('Generate Samples') # Random seeds for reproducability. # Model, - model_name = 'GLM-10b-ch' + model_name = 'GLM-large-ch' model = GLMModel.from_pretrain(model_name=model_name, download_path="./checkpoints") tokenizer = Tokenizer.from_pretrained(model_name) - with torch.cuda.device(0): - model = bminf.wrapper(model, quantization=False, memory_limit=30 << 39) - predictor = Predictor(model, tokenizer) -<<<<<<< HEAD + + # model.load_state_dict(torch.load("../glm_pretrain/checkpoints/1000/pytorch_model.bin")["module"]) + model.cuda(torch.cuda.current_device()) - text = ["今天天气不错[gMASK]"] -======= + predictor = Predictor(model, tokenizer) # generate samples text = [ '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]" ] ->>>>>>> master for t in text: output = predictor.predict_generate_randomsample( t, top_k=50, repetition_penalty=4.0, top_p=1.0) print(t, '\n', output) -<<<<<<< HEAD - - # text = [ - # '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]" - # ] - # for t in text: - # output = predictor.predict_generate_randomsample( - # t, top_k=50, repetition_penalty=4.0, top_p=1.0) - # print(t, '\n', output) - - # text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"] - # for t in text: - # output = 
predictor.predict_generate_randomsample( - # t, top_k=50, repetition_penalty=4.0, top_p=1.0) - # print(t, '\n', output) - - # text = [ - # "人工智能是一个以计算机科学为基础,由计算机、数学、哲学等多学科交叉融合的交叉学科,[sMASK],具有非常巨大的前景。", - # "最近十多年来,人工神经网络的研究工作不断深入,已经取得了很大的进展,[sMASK],表现出了良好的智能特性。" - # ] - # for t in text: - # output = predictor.predict_generate_randomsample( - # t, top_k=50, repetition_penalty=4.0, top_p=1.0) - # print(t, '\n', output) -======= text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"] for t in text: @@ -71,5 +42,4 @@ for t in text: output = predictor.predict_generate_randomsample( t, top_k=50, repetition_penalty=4.0, top_p=1.0) - print(t, '\n', output) ->>>>>>> master + print(t, '\n', output) \ No newline at end of file diff --git a/examples/roberta_semantic_matching/train.py b/examples/roberta_semantic_matching/train.py index e0648063..30e9821f 100644 --- a/examples/roberta_semantic_matching/train.py +++ b/examples/roberta_semantic_matching/train.py @@ -27,7 +27,7 @@ cur_dir = os.path.dirname(os.path.abspath(__file__)) train_path = cur_dir + "/data/train.tsv" -model_dir = "./state_dict/" +model_dir = "./checkpoints/" maxlen = 256 auto_loader = AutoLoader("semantic-matching", diff --git a/flagai/data/dataset/block/blocklm_utils.py b/flagai/data/dataset/block/blocklm_utils.py index 4687305f..44fda3d2 100644 --- a/flagai/data/dataset/block/blocklm_utils.py +++ b/flagai/data/dataset/block/blocklm_utils.py @@ -86,10 +86,10 @@ def __init__(self, self.encoder_decoder = encoder_decoder self.shuffle_blocks = shuffle_blocks self.sentinel_token = sentinel_token - self.generation_mask = 'gMASK' if task_mask else 'MASK' + self.generation_mask = 'gMASK' if task_mask else 'mask' self.generation_mask = self.tokenizer.get_command_id( self.generation_mask) - self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK' + self.gap_sentence_mask = 'sMASK' if task_mask else 'mask' self.gap_sentence_mask = self.tokenizer.get_command_id( self.gap_sentence_mask) 
self.random_position = random_position @@ -205,7 +205,7 @@ def make_masked_data(self, # position_ids = np.arange(len(tokens), dtype=np.int64) targets = copy.deepcopy(tokens) - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') mlm_masks = np.zeros(len(tokens), dtype=np.int64) for start, end in block_spans: for idx in range(start, end): @@ -273,7 +273,7 @@ def make_block_data(self, elif task == 'gap_sentence': mask_id = self.gap_sentence_mask else: - mask_token = 'MASK' if idx == 0 else f'MASK{idx}' + mask_token = 'mask' if idx == 0 else f'MASK{idx}' mask_id = self.tokenizer.get_command_id(mask_token) local_spans.append((current_length, current_length + start - last)) source_tokens.append(tokens[last:start]) diff --git a/flagai/data/dataset/data_collator/collate_fn.py b/flagai/data/dataset/data_collator/collate_fn.py index 73b2f8e5..6eb629d5 100644 --- a/flagai/data/dataset/data_collator/collate_fn.py +++ b/flagai/data/dataset/data_collator/collate_fn.py @@ -126,7 +126,7 @@ def __init__(self, args, tokenizer, task_name): def encode(self, example): cls_id = self.tokenizer.get_command_id('cls') - mask_token = 'sMASK' if self.args.task_mask else 'MASK' + mask_token = 'sMASK' if self.args.task_mask else 'mask' mask_id = self.tokenizer.get_command_id(mask_token) pad_id = self.tokenizer.get_command_id('pad') sop_id = self.tokenizer.get_command_id('sop') @@ -175,7 +175,7 @@ def sub_finder(mylist, pattern): source_tokens = [cls_id] + source_tokens + [mask_id ] + answer_tokens elif self.task_name in ["cmrc"]: - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') source_text = example.text_a target_text = example.meta["answer"].strip() question = example.meta["question"].strip() @@ -191,7 +191,7 @@ def sub_finder(mylist, pattern): mask_id ] + source_tokens[:max_src_length] elif self.task_name in ["wsc"]: - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = 
self.tokenizer.get_command_id('mask') source_text = example.text_a target_text = example.meta["answer"].strip() question = example.meta["question"].strip() @@ -307,10 +307,10 @@ def __init__(self, self.encoder_decoder = encoder_decoder self.shuffle_blocks = shuffle_blocks self.sentinel_token = sentinel_token - self.generation_mask = 'gMASK' if task_mask else 'MASK' + self.generation_mask = 'gMASK' if task_mask else 'mask' self.generation_mask = self.tokenizer.get_command_id( self.generation_mask) - self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK' + self.gap_sentence_mask = 'sMASK' if task_mask else 'mask' self.gap_sentence_mask = self.tokenizer.get_command_id( self.gap_sentence_mask) self.random_position = random_position @@ -426,7 +426,7 @@ def make_masked_data(self, position_ids = np.arange(len(tokens), dtype=np.int64) targets = copy.deepcopy(tokens) - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') mlm_masks = np.zeros(len(tokens), dtype=np.int64) for start, end in block_spans: for idx in range(start, end): @@ -494,7 +494,7 @@ def make_block_data(self, elif task == 'gap_sentence': mask_id = self.gap_sentence_mask else: - mask_token = 'MASK' if idx == 0 else f'MASK{idx}' + mask_token = 'mask' if idx == 0 else f'MASK{idx}' mask_id = self.tokenizer.get_command_id(mask_token) local_spans.append((current_length, current_length + start - last)) source_tokens.append(tokens[last:start]) diff --git a/flagai/data/dataset/data_utils.py b/flagai/data/dataset/data_utils.py index 4f0ee38d..1efee372 100644 --- a/flagai/data/dataset/data_utils.py +++ b/flagai/data/dataset/data_utils.py @@ -134,7 +134,7 @@ def build_input_from_ids(text_a_ids, # Prepare ids for special tokens if mask_id is None: - mask_id = tokenizer.get_command_id('MASK') + mask_id = tokenizer.get_command_id('mask') eos_id = tokenizer.get_command_id('eos') # end of sentence token cls_id = tokenizer.get_command_id('cls') # start of sentence token sep_id = 
tokenizer.get_command_id('sep') # seperator of two texts token @@ -235,7 +235,7 @@ def build_input_from_ids(text_a_ids, # def build_decoder_input(enc_ids, answer_ids, max_seq_length, max_dec_seq_length, tokenizer): - mask_id = tokenizer.get_command_id('MASK') + mask_id = tokenizer.get_command_id('mask') eos_id = tokenizer.get_command_id('eos') sop_id = tokenizer.get_command_id('sop') masks = [] diff --git a/flagai/data/dataset/language_model/dataset.py b/flagai/data/dataset/language_model/dataset.py index b291251b..a911df81 100644 --- a/flagai/data/dataset/language_model/dataset.py +++ b/flagai/data/dataset/language_model/dataset.py @@ -38,7 +38,7 @@ def __init__(self, args, documents, tokenizer, num_original_tokens, self.left_weights = [0] + self.weights[:-1] self.unidirectional = args.unidirectional self.block_lm = args.block_lm - mask_token = "gMASK" if args.task_mask else 'MASK' + mask_token = "gMASK" if args.task_mask else 'mask' self.mask_id = self.tokenizer.get_command_id(mask_token) def __len__(self): @@ -115,7 +115,7 @@ def __init__(self, args, tokenizer, strict=True): self.strict = strict self.block_lm = args.block_lm self.unidirectional = args.unidirectional - mask_token = "gMASK" if args.task_mask else 'MASK' + mask_token = "gMASK" if args.task_mask else 'mask' self.mask_id = self.tokenizer.get_command_id(mask_token) self.tokens = [] diff --git a/flagai/data/dataset/seq2seq/dataset.py b/flagai/data/dataset/seq2seq/dataset.py index adc28149..b0bc4148 100644 --- a/flagai/data/dataset/seq2seq/dataset.py +++ b/flagai/data/dataset/seq2seq/dataset.py @@ -477,7 +477,7 @@ def __len__(self): def __getitem__(self, idx): example = self.example_list[idx] source_text, target_text = example.text_a, example.text_b - mask_token = 'MASK' + mask_token = 'mask' mask_id = self.tokenizer.get_command_id(mask_token) sop_id = self.tokenizer.get_command_id('sop') eop_id = self.tokenizer.get_command_id('eop') @@ -612,7 +612,7 @@ def __len__(self): def __getitem__(self, idx): 
example = self.example_list[idx] source_text = example.text_a - mask_token = 'gMASK' if self.args.task_mask else 'MASK' + mask_token = 'gMASK' if self.args.task_mask else 'mask' mask_id = self.tokenizer.get_command_id(mask_token) sop_id = self.tokenizer.get_command_id('sop') eop_id = self.tokenizer.get_command_id('eop') diff --git a/flagai/data/dataset/superglue/pvp.py b/flagai/data/dataset/superglue/pvp.py index d4d07b39..8a4d6ee3 100644 --- a/flagai/data/dataset/superglue/pvp.py +++ b/flagai/data/dataset/superglue/pvp.py @@ -97,12 +97,12 @@ def spell_length(self): @property def mask(self) -> str: """Return the underlying LM's mask token""" - return self.tokenizer.get_command_id('MASK') + return self.tokenizer.get_command_id('mask') @property def mask_id(self) -> int: """Return the underlying LM's mask id""" - return self.tokenizer.get_command_id('MASK') + return self.tokenizer.get_command_id('mask') @property def max_num_verbalizers(self) -> int: @@ -574,13 +574,13 @@ def spell_length(self): @property def mask(self) -> str: """Return the underlying LM's mask token""" - mask_token = 'MASK' + mask_token = 'mask' return self.tokenizer.get_command_id(mask_token) @property def mask_id(self) -> int: """Return the underlying LM's mask id""" - mask_token = 'MASK' + mask_token = 'mask' return self.tokenizer.get_command_id(mask_token) def get_answers(self, example: InputExample): diff --git a/flagai/data/tokenizer/bert/bert_tokenizer.py b/flagai/data/tokenizer/bert/bert_tokenizer.py index eec168ea..3c935713 100644 --- a/flagai/data/tokenizer/bert/bert_tokenizer.py +++ b/flagai/data/tokenizer/bert/bert_tokenizer.py @@ -75,7 +75,7 @@ def __init__(self, tokenizer_model_type=None, cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', 
self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git a/flagai/data/tokenizer/galactica/galactica_tokenizer.py b/flagai/data/tokenizer/galactica/galactica_tokenizer.py index fdaf5be6..f028d0f0 100644 --- a/flagai/data/tokenizer/galactica/galactica_tokenizer.py +++ b/flagai/data/tokenizer/galactica/galactica_tokenizer.py @@ -15,7 +15,7 @@ def __init__(self, download_dir) -> None: self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git a/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py b/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py index b762b66b..e592d33d 100644 --- a/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py +++ b/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py @@ -60,7 +60,7 @@ def __init__(self, self.text_tokenizer.encoder['']), CommandToken('cls', '[CLS]', self.text_tokenizer.encoder['']), - CommandToken('MASK', + CommandToken('mask', '[MASK]', self.text_tokenizer.encoder[''], lstrip=True), @@ -88,7 +88,7 @@ def __init__(self, CommandToken('sop', '<|startofpiece|>', self.num_tokens), CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1), CommandToken('cls', '[CLS]', self.num_tokens + 2), - CommandToken('MASK', + CommandToken('mask', '[MASK]', self.num_tokens + 3, lstrip=True), diff --git a/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py b/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py index 69048d3a..b91797f6 100644 --- 
a/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py +++ b/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py @@ -55,7 +55,7 @@ def __init__(self, CommandToken('eos', '<|endoftext|>', self.num_text_tokens), CommandToken('sep', '[SEP]', self.num_text_tokens + 1), CommandToken('cls', '[CLS]', self.num_text_tokens + 2), - CommandToken('MASK', + CommandToken('mask', '[MASK]', self.num_text_tokens + 3, lstrip=True), diff --git a/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py b/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py index ff4e1e4a..db4c726f 100644 --- a/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py +++ b/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py @@ -59,7 +59,7 @@ def __init__(self, self._command_tokens = [ CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']), CommandToken('cls', '[CLS]', self.text_tokenizer.vocab['[CLS]']), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.text_tokenizer.vocab['[MASK]']), CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']), CommandToken('sep', '[SEP]', self.text_tokenizer.vocab['[SEP]']), diff --git a/flagai/data/tokenizer/opt/opt_en_tokenizer.py b/flagai/data/tokenizer/opt/opt_en_tokenizer.py index 5c1c0de8..9e8e528c 100644 --- a/flagai/data/tokenizer/opt/opt_en_tokenizer.py +++ b/flagai/data/tokenizer/opt/opt_en_tokenizer.py @@ -35,7 +35,7 @@ def __init__(self, tokenizer_model_type="facebook/opt-125m", cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git 
a/flagai/data/tokenizer/roberta/roberta_tokenizer.py b/flagai/data/tokenizer/roberta/roberta_tokenizer.py index 553a8a83..f1b270e4 100644 --- a/flagai/data/tokenizer/roberta/roberta_tokenizer.py +++ b/flagai/data/tokenizer/roberta/roberta_tokenizer.py @@ -38,7 +38,7 @@ def __init__(self, tokenizer_model_type="roberta-base", cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git a/flagai/data/tokenizer/t5/t5_tokenizer.py b/flagai/data/tokenizer/t5/t5_tokenizer.py index ef793b67..499aa83e 100644 --- a/flagai/data/tokenizer/t5/t5_tokenizer.py +++ b/flagai/data/tokenizer/t5/t5_tokenizer.py @@ -45,7 +45,7 @@ def __init__(self, tokenizer_model_type="t5-base", cache_dir=None): CommandToken('pad', '[PAD]', self.num_tokens + 1), CommandToken('cls', '[CLS]', self.num_tokens + 2), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.num_tokens + 3), ] self._command_tokens.extend([ diff --git a/flagai/data/tokenizer/tokenizer.py b/flagai/data/tokenizer/tokenizer.py index c3ba085f..43585688 100644 --- a/flagai/data/tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/tokenizer.py @@ -54,7 +54,7 @@ def __str__(self): ('sep', 4), ('L2R', 5), ('cls', 6), - ('MASK', 7), + ('mask', 7), ] DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS) """define some default type tokens for bert training""" diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py index 72077041..61d91ce6 100644 --- a/flagai/model/predictor/utils.py +++ b/flagai/model/predictor/utils.py @@ -1133,7 +1133,7 @@ def alm_beamsearch(model, tokenizer, 
text, out_max_length, beam_size, eod_token= dtype=torch.long) position_ids = torch.stack((position_ids, block_position_ids), dim=0) position_ids = position_ids.unsqueeze(0) - mask_tokens = ['MASK', 'sMASK', 'gMASK'] + mask_tokens = ['mask', 'sMASK', 'gMASK'] mask_tokens = [tokenizer.get_command_id(token) for token in mask_tokens] end_tokens = [tokenizer.get_command_id('eop'), eod_token] mask_positions = [] @@ -1434,7 +1434,7 @@ def glm_generate_sample( dtype=torch.long) position_ids = torch.stack((position_ids, block_position_ids), dim=0) position_ids = position_ids.unsqueeze(0) - mask_tokens = ['MASK', 'sMASK', 'gMASK'] + mask_tokens = ['mask', 'sMASK', 'gMASK'] mask_tokens = [tokenizer.get_command_id(token) for token in mask_tokens] end_tokens = [tokenizer.get_command_id('eop'), eod_token] mask_positions = [] diff --git a/flagai/test_utils.py b/flagai/test_utils.py index 83dacde3..5faa0aec 100644 --- a/flagai/test_utils.py +++ b/flagai/test_utils.py @@ -14,7 +14,7 @@ def build_input_from_ids(text_a_ids=None, mask_id=None, masked_lm=False): if mask_id is None: - mask_id = tokenizer.get_command_id('MASK') + mask_id = tokenizer.get_command_id('mask') eos_id = tokenizer.get_command_id('eos') cls_id = tokenizer.get_command_id('cls') sep_id = tokenizer.get_command_id('sep') From ade08953f346644ebd7fb41b0ac342c3952df4c4 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Wed, 15 Mar 2023 15:43:03 +0800 Subject: [PATCH 41/54] updated Signed-off-by: ftgreat --- examples/AltDiffusion/generate.py | 2 +- flagai/data/tokenizer/uni_tokenizer/tokenizer.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/AltDiffusion/generate.py b/examples/AltDiffusion/generate.py index 79e59d17..1fa2c88f 100644 --- a/examples/AltDiffusion/generate.py +++ b/examples/AltDiffusion/generate.py @@ -9,7 +9,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") loader = AutoLoader(task_name="text2img", #contrastive learning - model_name="AltDiffusion", + 
model_name="AltDiffusion-m9", model_dir="./checkpoints", use_fp16=False) diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index 4660b994..4f0c0cde 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -397,8 +397,8 @@ def encode_plus_non_glm( ): def get_input_ids(text): - tokens = self.text_tokenizer.tokenize(text) - return self.text_tokenizer.convert_tokens_to_ids(tokens) + tokens = self.tokenize(text) + return self.convert_tokens_to_ids(tokens) first_ids = get_input_ids(text) second_ids = get_input_ids( From 3560b608c46a9d2245cbcee8ddc823d67e194ea4 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Wed, 15 Mar 2023 16:02:40 +0800 Subject: [PATCH 42/54] can assert new special tokens Signed-off-by: ftgreat --- tests/test_tokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 9ba33089..7b1d92de 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -67,7 +67,6 @@ def test_tokenizer_t5(self): def test_tokenizer_roberta(self): tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch') - # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825])) self.assertEqual(tokenizer.TokenToId("人"), 782, '') self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"), [791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '') From 5f7edd3f6d0ed265279ef1896c7ecaf0556c3bba Mon Sep 17 00:00:00 2001 From: shunxing1234 Date: Wed, 15 Mar 2023 08:04:12 +0000 Subject: [PATCH 43/54] add optimzier Signed-off-by: shunxing1234 --- flagai/optimizers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/flagai/optimizers.py b/flagai/optimizers.py index 1ed3ccd1..fa9710d2 100644 --- a/flagai/optimizers.py +++ b/flagai/optimizers.py @@ -103,27 +103,27 @@ def get_optimizer(param_groups, lr=lr, relative_step=False, warmup_init=False) - elif optimizer == 'AdamW': + 
elif optimizer == 'adamw': optimizer = AdamW(param_groups, lr=lr, weight_decay=weight_decay, betas=(adam_beta1, adam_beta2), eps=adam_eps) - elif optimizer == 'Lion': + elif optimizer == 'lion': from lion_pytorch import Lion optimizer = Lion(param_groups, lr=lr, weight_decay=weight_decay, betas=(adam_beta1, adam_beta2) ) - elif optimizer == 'Adan': + elif optimizer == 'adan': from adan import Adan optimizer = Adan(param_groups, lr=lr, weight_decay=weight_decay, betas=(adam_beta1, adam_beta2, 0.99), eps=adam_eps) - elif optimizer == 'LAMB': + elif optimizer == 'lamb': from torch_optimizer import Lamb optimizer = Lamb(param_groups, lr=lr, From 0ce3898086672f328f30116e08e523d573607cc0 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Wed, 15 Mar 2023 16:23:17 +0800 Subject: [PATCH 44/54] removed testing cpm tokenizer Signed-off-by: ftgreat --- tests/test_tokenizer.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 7b1d92de..c72d34a7 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -95,25 +95,25 @@ def test_tokenizer_bert(self): {('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)}, 'SpecialTokens error') - def test_tokenizer_cpm1(self): - loader = AutoLoader(task_name="lm", - model_name="CPM-large-ch", - model_dir="./checkpoints/", - only_download_config=True) + # def test_tokenizer_cpm1(self): + # loader = AutoLoader(task_name="lm", + # model_name="CPM-large-ch", + # model_dir="./checkpoints/", + # only_download_config=True) - tokenizer = loader.get_tokenizer() - self.assertEqual(tokenizer.TokenToId("人"), 62, '') - self.assertEqual(tokenizer.encode("今天吃饭吃了肯德基"), - [837, 3079, 1777, 3079, 139, 3687, 513, 1463], '') - self.assertEqual(tokenizer.DecodeIds([837, 3079, 1777, 3079, 139, 3687, 513, 1463]), - '今天吃饭吃了肯德基', 'DecodeIds Error') - 
self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), - [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'tokenize Error') - self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], - [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'encode_plus Error') - self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), - {('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), - ('mask', '', 6), ('pad', '', 5),('eod', '', 7)}, 'SpecialTokens error') + # tokenizer = loader.get_tokenizer() + # self.assertEqual(tokenizer.TokenToId("人"), 62, '') + # self.assertEqual(tokenizer.encode("今天吃饭吃了肯德基"), + # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], '') + # self.assertEqual(tokenizer.DecodeIds([837, 3079, 1777, 3079, 139, 3687, 513, 1463]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'encode_plus Error') + # self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), + # {('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), + # ('mask', '', 6), ('pad', '', 5),('eod', '', 7)}, 'SpecialTokens error') def test_tokenizer_opt(self): tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') @@ -152,7 +152,7 @@ def suite(): suite.addTest(TokenizerTestCase('test_tokenizer_t5')) suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) suite.addTest(TokenizerTestCase('test_tokenizer_bert')) - suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) + # suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) suite.addTest(TokenizerTestCase('test_tokenizer_opt')) suite.addTest(TokenizerTestCase('test_tokenizer_clip')) suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) From 269ca8948abfa28ed46759eb73239c543fdac520 Mon Sep 17 00:00:00 2001 From: shunxing1234 Date: Wed, 15 Mar 
2023 08:24:22 +0000 Subject: [PATCH 45/54] fix Signed-off-by: shunxing1234 --- flagai/optimizers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flagai/optimizers.py b/flagai/optimizers.py index fa9710d2..8d0867b3 100644 --- a/flagai/optimizers.py +++ b/flagai/optimizers.py @@ -104,6 +104,7 @@ def get_optimizer(param_groups, relative_step=False, warmup_init=False) elif optimizer == 'adamw': + from torch.optim import AdamW optimizer = AdamW(param_groups, lr=lr, weight_decay=weight_decay, From b8f86391cb7532b558f61c8ac97a74f6e875442f Mon Sep 17 00:00:00 2001 From: jongjyh Date: Thu, 16 Mar 2023 21:41:46 +0800 Subject: [PATCH 46/54] fix bug in setting for mp size >1 --- flagai/model/base_model.py | 6 ++++-- flagai/trainer.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/flagai/model/base_model.py b/flagai/model/base_model.py index 46367e28..c385c52a 100644 --- a/flagai/model/base_model.py +++ b/flagai/model/base_model.py @@ -77,9 +77,11 @@ def from_pretrain(cls, config_path = os.path.join(download_path, "config.json") checkpoint_path = os.path.join(download_path, "pytorch_model.bin") - def load_local(checkpoint_path): + def load_local(checkpoint_path, only_download_config=False): model = cls.init_from_json(config_path, **kwargs) model.to(device) + if only_download_config: + return model if os.getenv('ENV_TYPE') != 'deepspeed+mpu': if os.path.exists(checkpoint_path): model.load_weights(checkpoint_path) @@ -146,7 +148,7 @@ def load_diffusion_local(yaml_path, only_download_config=False, **kwargs): It is fine when checkpoint_path does not exist, for the case that only_download_config=True At that time the model will not be loaded. 
""" - return load_local(checkpoint_path) + return load_local(checkpoint_path, only_download_config=only_download_config) try: model_id = _get_model_id(model_name) diff --git a/flagai/trainer.py b/flagai/trainer.py index 50309849..17703d73 100644 --- a/flagai/trainer.py +++ b/flagai/trainer.py @@ -335,9 +335,9 @@ def get_dataloader(self, dataset, collate_fn, shuffle=False): shuffle=shuffle) else: if self.env_type == 'deepspeed+mpu': - rank = mpu.get_model_parallel_src_rank() + rank = mpu.get_data_parallel_rank() print("*"*80) - print("local rank",self.rank, "model rank", rank) + print("local rank",self.rank, "data parallel rank", rank) print("*"*80) sampler = torch.utils.data.distributed.DistributedSampler( dataset, From ad9e3a39d07a8706462805437dec24cdb452438e Mon Sep 17 00:00:00 2001 From: shunxing1234 Date: Fri, 17 Mar 2023 06:24:25 +0000 Subject: [PATCH 47/54] add optimizer tutorial Signed-off-by: shunxing1234 --- docs/TUTORIAL_21_OPTIMIZER.md | 41 +++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 docs/TUTORIAL_21_OPTIMIZER.md diff --git a/docs/TUTORIAL_21_OPTIMIZER.md b/docs/TUTORIAL_21_OPTIMIZER.md new file mode 100644 index 00000000..9e606f4f --- /dev/null +++ b/docs/TUTORIAL_21_OPTIMIZER.md @@ -0,0 +1,41 @@ +# How to use Optimizer + +## What is Optimizer? +In the context of machine learning and deep learning, +an optimizer is an algorithm or method used to update the parameters of a model in order to minimize the error between the predicted output and the actual output. + +The goal of an optimizer is to find the optimal set of parameters that can achieve the best performance on a given task. +This process is typically performed during the training phase of a machine learning model. + +Optimizers work by computing the gradients of the loss function with respect to the model parameters, +and using this information to update the parameters in the direction that reduces the loss. 
+There are various optimization algorithms available, +such as stochastic gradient descent (SGD), Adagrad, Adam, RMSprop, and more, each with their own advantages and disadvantages. + +The choice of optimizer depends on the specific problem, the size of the dataset, +the complexity of the model, and other factors. +A good optimizer can significantly improve the training speed and accuracy of a model. + + + + +## Loading optimizer +```python +>>> trainer = Trainer(env_type='pytorch', +>>> epochs=1, +>>> batch_size=2, +>>> eval_interval=100, +>>> log_interval=10, +>>> experiment_name='glm_large_bmtrain', +>>> pytorch_device='cuda', +>>> load_dir=None, +>>> lr=1e-4, +>>> num_gpus = 1, +>>> weight_decay=1e-2, +>>> save_interval=1000, +>>> hostfile='./hostfile', +>>> training_script=__file__, +>>> deepspeed_config='./deepspeed.json', +>>> optimizer_type='lion') #load optimizer +``` + From d71563be3469c642f9221e1ac6b2bcb187cca70c Mon Sep 17 00:00:00 2001 From: shunxing1234 Date: Fri, 17 Mar 2023 06:36:01 +0000 Subject: [PATCH 48/54] add optimizer tutorial Signed-off-by: shunxing1234 --- doc_zh/TUTORIAL_21_OPTIMIZER.md | 38 +++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 doc_zh/TUTORIAL_21_OPTIMIZER.md diff --git a/doc_zh/TUTORIAL_21_OPTIMIZER.md b/doc_zh/TUTORIAL_21_OPTIMIZER.md new file mode 100644 index 00000000..93065fd5 --- /dev/null +++ b/doc_zh/TUTORIAL_21_OPTIMIZER.md @@ -0,0 +1,38 @@ +# 如何使用优化器 + +## 优化器是什么? 
+在机器学习和深度学习的语境下, +优化器(Optimizer)是指用于更新模型参数的算法或方法,以便最小化预测输出和实际输出之间的误差。 + +优化器的目标是找到最优的参数组合,以在给定任务上获得最佳性能。 +这个过程通常在机器学习模型的训练阶段执行。 + +优化器通过计算损失函数相对于模型参数的梯度,并使用这些信息来更新参数,以减少损失。 +有多种可用的优化算法,例如随机梯度下降(SGD)、Adagrad、Adam、RMSprop等,每种算法都有其优点和缺点。 + +优化器的选择取决于特定问题、数据集的大小、模型的复杂性和其他因素。 +一个好的优化器可以显著提高模型的训练速度和准确性。 + + + + +## 加载 优化器 +```python +>>> trainer = Trainer(env_type='pytorch', +>>> epochs=1, +>>> batch_size=2, +>>> eval_interval=100, +>>> log_interval=10, +>>> experiment_name='glm_large_bmtrain', +>>> pytorch_device='cuda', +>>> load_dir=None, +>>> lr=1e-4, +>>> num_gpus = 1, +>>> weight_decay=1e-2, +>>> save_interval=1000, +>>> hostfile='./hostfile', +>>> training_script=__file__, +>>> deepspeed_config='./deepspeed.json', +>>> optimizer_type='lion') #load optimizer +``` + From 5f816cf4ef1f9c43561796e32b2b5e9dc609b010 Mon Sep 17 00:00:00 2001 From: shunxing1234 Date: Fri, 17 Mar 2023 06:39:51 +0000 Subject: [PATCH 49/54] fix optimizer zh tutorial Signed-off-by: shunxing1234 --- doc_zh/TUTORIAL_21_OPTIMIZER.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc_zh/TUTORIAL_21_OPTIMIZER.md b/doc_zh/TUTORIAL_21_OPTIMIZER.md index 93065fd5..2762ae36 100644 --- a/doc_zh/TUTORIAL_21_OPTIMIZER.md +++ b/doc_zh/TUTORIAL_21_OPTIMIZER.md @@ -16,7 +16,7 @@ -## 加载 优化器 +## 加载优化器 ```python >>> trainer = Trainer(env_type='pytorch', >>> epochs=1, From 9900ed493d210c25309d395f2282adba438310a1 Mon Sep 17 00:00:00 2001 From: shunxing1234 Date: Fri, 17 Mar 2023 07:06:18 +0000 Subject: [PATCH 50/54] add optimizer tutorial Signed-off-by: shunxing1234 --- doc_zh/TUTORIAL_21_OPTIMIZER.md | 1 + docs/TUTORIAL_21_OPTIMIZER.md | 1 + 2 files changed, 2 insertions(+) diff --git a/doc_zh/TUTORIAL_21_OPTIMIZER.md b/doc_zh/TUTORIAL_21_OPTIMIZER.md index 2762ae36..d5c8de04 100644 --- a/doc_zh/TUTORIAL_21_OPTIMIZER.md +++ b/doc_zh/TUTORIAL_21_OPTIMIZER.md @@ -18,6 +18,7 @@ ## 加载优化器 ```python +>>> # currently FlagAI support adam, adamw, lion, adan, adafactor and lamb, which can be defined by 
setting optimizer_type when defining Trainer >>> trainer = Trainer(env_type='pytorch', >>> epochs=1, >>> batch_size=2, diff --git a/docs/TUTORIAL_21_OPTIMIZER.md b/docs/TUTORIAL_21_OPTIMIZER.md index 9e606f4f..90ae13b8 100644 --- a/docs/TUTORIAL_21_OPTIMIZER.md +++ b/docs/TUTORIAL_21_OPTIMIZER.md @@ -21,6 +21,7 @@ A good optimizer can significantly improve the training speed and accuracy of a ## Loading optimizer ```python +>>> # currently FlagAI support adam, adamw, lion, adan, adafactor and lamb, which can be defined by setting optimizer_type when defining Trainer >>> trainer = Trainer(env_type='pytorch', >>> epochs=1, >>> batch_size=2, From 84993b6fc8d4fc53c7fd6d504009370dce8922ac Mon Sep 17 00:00:00 2001 From: shunxing1234 Date: Fri, 17 Mar 2023 07:37:35 +0000 Subject: [PATCH 51/54] add tutorial Signed-off-by: shunxing1234 --- doc_zh/TUTORIAL_21_OPTIMIZER.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc_zh/TUTORIAL_21_OPTIMIZER.md b/doc_zh/TUTORIAL_21_OPTIMIZER.md index d5c8de04..c30ac609 100644 --- a/doc_zh/TUTORIAL_21_OPTIMIZER.md +++ b/doc_zh/TUTORIAL_21_OPTIMIZER.md @@ -17,6 +17,21 @@ ## 加载优化器 + +### 依赖 +#### adan +``` +python3 -m pip install git+https://github.com/sail-sg/Adan.git +``` +#### lion +``` +$ pip install lion-pytorch +``` +#### lamb +``` +$ pip install torch_optimizer +``` + ```python >>> # currently FlagAI support adam, adamw, lion, adan, adafactor and lamb, which can be defined by setting optimizer_type when defining Trainer >>> trainer = Trainer(env_type='pytorch', From 9967b3c529c4ad2903e14c76f19fcff1963d33c3 Mon Sep 17 00:00:00 2001 From: shunxing1234 Date: Fri, 17 Mar 2023 07:41:34 +0000 Subject: [PATCH 52/54] add tutorial Signed-off-by: shunxing1234 --- doc_zh/TUTORIAL_21_OPTIMIZER.md | 2 +- docs/TUTORIAL_21_OPTIMIZER.md | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/doc_zh/TUTORIAL_21_OPTIMIZER.md b/doc_zh/TUTORIAL_21_OPTIMIZER.md index c30ac609..d1527b59 100644 --- 
a/doc_zh/TUTORIAL_21_OPTIMIZER.md +++ b/doc_zh/TUTORIAL_21_OPTIMIZER.md @@ -31,7 +31,7 @@ $ pip install lion-pytorch ``` $ pip install torch_optimizer ``` - +#### 例子 ```python >>> # currently FlagAI support adam, adamw, lion, adan, adafactor and lamb, which can be defined by setting optimizer_type when defining Trainer >>> trainer = Trainer(env_type='pytorch', diff --git a/docs/TUTORIAL_21_OPTIMIZER.md b/docs/TUTORIAL_21_OPTIMIZER.md index 90ae13b8..b77db929 100644 --- a/docs/TUTORIAL_21_OPTIMIZER.md +++ b/docs/TUTORIAL_21_OPTIMIZER.md @@ -20,6 +20,21 @@ A good optimizer can significantly improve the training speed and accuracy of a ## Loading optimizer + +### dependencies +#### adan +``` +python3 -m pip install git+https://github.com/sail-sg/Adan.git +``` +#### lion +``` +$ pip install lion-pytorch +``` +#### lamb +``` +$ pip install torch_optimizer +``` +#### example ```python >>> # currently FlagAI support adam, adamw, lion, adan, adafactor and lamb, which can be defined by setting optimizer_type when defining Trainer >>> trainer = Trainer(env_type='pytorch', From 5352f6ad0727977274d326ea2958693c4b8d4ecf Mon Sep 17 00:00:00 2001 From: ldwang Date: Mon, 20 Mar 2023 15:53:30 +0800 Subject: [PATCH 53/54] env_trainer fix data_loader sampler when ds&mpu --- flagai/env_trainer.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/flagai/env_trainer.py b/flagai/env_trainer.py index c7ef8678..afa0dc4e 100644 --- a/flagai/env_trainer.py +++ b/flagai/env_trainer.py @@ -192,18 +192,15 @@ def get_dataloader(self, dataset, collate_fn, shuffle=False): shuffle=shuffle) else: if self.env_type == 'deepspeed+mpu': - # num_replicas = self.world_size // mpu.get_model_parallel_world_size( - # ) - # rank = self.rank // mpu.get_model_parallel_world_size() - # rank = mpu.get_model_parallel_rank() rank = mpu.get_model_parallel_src_rank() - print("*"*80) - print("local rank",self.rank, "model rank", rank) - print("*"*80) + data_rank = 
mpu.get_data_parallel_rank() + log_dist("*"*80) + log_dist(f"local rank {self.rank} src rank {rank} data rank {data_rank}") + log_dist("*"*80) sampler = torch.utils.data.distributed.DistributedSampler( dataset, - # num_replicas=num_replicas, - rank=rank, + num_replicas=self.world_size//self.model_parallel_size, + rank=data_rank, shuffle=shuffle) else: num_replicas = self.world_size @@ -917,4 +914,4 @@ def evaluate_and_print_results( log_dist('-' * length, [0]) log_dist(string, [0]) log_dist('-' * length, [0]) - return eval_dict \ No newline at end of file + return eval_dict From 059603e7a05617d92db73c18c1ae57bcfd093f77 Mon Sep 17 00:00:00 2001 From: ldwang Date: Mon, 20 Mar 2023 15:54:44 +0800 Subject: [PATCH 54/54] fix trainer dataloader sampler when ds&mpu --- flagai/trainer.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/flagai/trainer.py b/flagai/trainer.py index 17703d73..235e632e 100644 --- a/flagai/trainer.py +++ b/flagai/trainer.py @@ -335,14 +335,15 @@ def get_dataloader(self, dataset, collate_fn, shuffle=False): shuffle=shuffle) else: if self.env_type == 'deepspeed+mpu': - rank = mpu.get_data_parallel_rank() - print("*"*80) - print("local rank",self.rank, "data parallel rank", rank) - print("*"*80) + rank = mpu.get_model_parallel_src_rank() + data_rank = mpu.get_data_parallel_rank() + log_dist("*"*80) + log_dist(f"local rank {self.rank} src rank {rank} data rank {data_rank}") + log_dist("*"*80) sampler = torch.utils.data.distributed.DistributedSampler( dataset, - # num_replicas=num_replicas, - rank=rank, + num_replicas=self.world_size//self.model_parallel_size, + rank=data_rank, shuffle=shuffle) elif self.env_type == 'bmtrain': print("*"*80)