diff --git a/.gitignore b/.gitignore
index 9a6b945c..d8c8629e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,4 +27,4 @@ datasets
qqp
glm_large_qqp_pytorch
wandb
-clip_benchmark_datasets/
\ No newline at end of file
+clip_benchmark_datasets
diff --git a/README.md b/README.md
index 54ce31f3..d48f1be3 100644
--- a/README.md
+++ b/README.md
@@ -260,6 +260,6 @@ The majority of FlagAI is licensed under the [Apache 2.0 license](LICENSE), howe
### ↳ Star History
-[](https://star-history.com/#baaivision/EVA&Date)
+]
diff --git a/examples/bert_title_generation_english/generate.py b/examples/bert_title_generation_english/generate.py
index 1124d16d..fdfa2f41 100644
--- a/examples/bert_title_generation_english/generate.py
+++ b/examples/bert_title_generation_english/generate.py
@@ -7,7 +7,7 @@
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model_dir = "../state_dict/"
+model_dir = "./checkpoints/"
# Note "./checkpoints_seq2seq/{}/mp_rank_00_model_states.pt", {} is a directory in the checkpoints_seq2seq.
model_save_path = "./checkpoints_seq2seq/7079/mp_rank_00_model_states.pt"
diff --git a/examples/roberta_semantic_matching/train.py b/examples/roberta_semantic_matching/train.py
index e0648063..30e9821f 100644
--- a/examples/roberta_semantic_matching/train.py
+++ b/examples/roberta_semantic_matching/train.py
@@ -27,7 +27,7 @@
cur_dir = os.path.dirname(os.path.abspath(__file__))
train_path = cur_dir + "/data/train.tsv"
-model_dir = "./state_dict/"
+model_dir = "./checkpoints/"
maxlen = 256
auto_loader = AutoLoader("semantic-matching",
diff --git a/flagai/data/dataset/block/blocklm_utils.py b/flagai/data/dataset/block/blocklm_utils.py
index 4687305f..44fda3d2 100644
--- a/flagai/data/dataset/block/blocklm_utils.py
+++ b/flagai/data/dataset/block/blocklm_utils.py
@@ -86,10 +86,10 @@ def __init__(self,
self.encoder_decoder = encoder_decoder
self.shuffle_blocks = shuffle_blocks
self.sentinel_token = sentinel_token
- self.generation_mask = 'gMASK' if task_mask else 'MASK'
+ self.generation_mask = 'gMASK' if task_mask else 'mask'
self.generation_mask = self.tokenizer.get_command_id(
self.generation_mask)
- self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK'
+ self.gap_sentence_mask = 'sMASK' if task_mask else 'mask'
self.gap_sentence_mask = self.tokenizer.get_command_id(
self.gap_sentence_mask)
self.random_position = random_position
@@ -205,7 +205,7 @@ def make_masked_data(self,
#
position_ids = np.arange(len(tokens), dtype=np.int64)
targets = copy.deepcopy(tokens)
- mask_id = self.tokenizer.get_command_id('MASK')
+ mask_id = self.tokenizer.get_command_id('mask')
mlm_masks = np.zeros(len(tokens), dtype=np.int64)
for start, end in block_spans:
for idx in range(start, end):
@@ -273,7 +273,7 @@ def make_block_data(self,
elif task == 'gap_sentence':
mask_id = self.gap_sentence_mask
else:
- mask_token = 'MASK' if idx == 0 else f'MASK{idx}'
+ mask_token = 'mask' if idx == 0 else f'MASK{idx}'
mask_id = self.tokenizer.get_command_id(mask_token)
local_spans.append((current_length, current_length + start - last))
source_tokens.append(tokens[last:start])
diff --git a/flagai/data/dataset/data_collator/collate_fn.py b/flagai/data/dataset/data_collator/collate_fn.py
index 73b2f8e5..6eb629d5 100644
--- a/flagai/data/dataset/data_collator/collate_fn.py
+++ b/flagai/data/dataset/data_collator/collate_fn.py
@@ -126,7 +126,7 @@ def __init__(self, args, tokenizer, task_name):
def encode(self, example):
cls_id = self.tokenizer.get_command_id('cls')
- mask_token = 'sMASK' if self.args.task_mask else 'MASK'
+ mask_token = 'sMASK' if self.args.task_mask else 'mask'
mask_id = self.tokenizer.get_command_id(mask_token)
pad_id = self.tokenizer.get_command_id('pad')
sop_id = self.tokenizer.get_command_id('sop')
@@ -175,7 +175,7 @@ def sub_finder(mylist, pattern):
source_tokens = [cls_id] + source_tokens + [mask_id
] + answer_tokens
elif self.task_name in ["cmrc"]:
- mask_id = self.tokenizer.get_command_id('MASK')
+ mask_id = self.tokenizer.get_command_id('mask')
source_text = example.text_a
target_text = example.meta["answer"].strip()
question = example.meta["question"].strip()
@@ -191,7 +191,7 @@ def sub_finder(mylist, pattern):
mask_id
] + source_tokens[:max_src_length]
elif self.task_name in ["wsc"]:
- mask_id = self.tokenizer.get_command_id('MASK')
+ mask_id = self.tokenizer.get_command_id('mask')
source_text = example.text_a
target_text = example.meta["answer"].strip()
question = example.meta["question"].strip()
@@ -307,10 +307,10 @@ def __init__(self,
self.encoder_decoder = encoder_decoder
self.shuffle_blocks = shuffle_blocks
self.sentinel_token = sentinel_token
- self.generation_mask = 'gMASK' if task_mask else 'MASK'
+ self.generation_mask = 'gMASK' if task_mask else 'mask'
self.generation_mask = self.tokenizer.get_command_id(
self.generation_mask)
- self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK'
+ self.gap_sentence_mask = 'sMASK' if task_mask else 'mask'
self.gap_sentence_mask = self.tokenizer.get_command_id(
self.gap_sentence_mask)
self.random_position = random_position
@@ -426,7 +426,7 @@ def make_masked_data(self,
position_ids = np.arange(len(tokens), dtype=np.int64)
targets = copy.deepcopy(tokens)
- mask_id = self.tokenizer.get_command_id('MASK')
+ mask_id = self.tokenizer.get_command_id('mask')
mlm_masks = np.zeros(len(tokens), dtype=np.int64)
for start, end in block_spans:
for idx in range(start, end):
@@ -494,7 +494,7 @@ def make_block_data(self,
elif task == 'gap_sentence':
mask_id = self.gap_sentence_mask
else:
- mask_token = 'MASK' if idx == 0 else f'MASK{idx}'
+ mask_token = 'mask' if idx == 0 else f'MASK{idx}'
mask_id = self.tokenizer.get_command_id(mask_token)
local_spans.append((current_length, current_length + start - last))
source_tokens.append(tokens[last:start])
diff --git a/flagai/data/dataset/data_utils.py b/flagai/data/dataset/data_utils.py
index 4f0ee38d..1efee372 100644
--- a/flagai/data/dataset/data_utils.py
+++ b/flagai/data/dataset/data_utils.py
@@ -134,7 +134,7 @@ def build_input_from_ids(text_a_ids,
# Prepare ids for special tokens
if mask_id is None:
- mask_id = tokenizer.get_command_id('MASK')
+ mask_id = tokenizer.get_command_id('mask')
eos_id = tokenizer.get_command_id('eos') # end of sentence token
cls_id = tokenizer.get_command_id('cls') # start of sentence token
sep_id = tokenizer.get_command_id('sep') # seperator of two texts token
@@ -235,7 +235,7 @@ def build_input_from_ids(text_a_ids,
#
def build_decoder_input(enc_ids, answer_ids, max_seq_length,
max_dec_seq_length, tokenizer):
- mask_id = tokenizer.get_command_id('MASK')
+ mask_id = tokenizer.get_command_id('mask')
eos_id = tokenizer.get_command_id('eos')
sop_id = tokenizer.get_command_id('sop')
masks = []
diff --git a/flagai/data/dataset/language_model/dataset.py b/flagai/data/dataset/language_model/dataset.py
index b291251b..a911df81 100644
--- a/flagai/data/dataset/language_model/dataset.py
+++ b/flagai/data/dataset/language_model/dataset.py
@@ -38,7 +38,7 @@ def __init__(self, args, documents, tokenizer, num_original_tokens,
self.left_weights = [0] + self.weights[:-1]
self.unidirectional = args.unidirectional
self.block_lm = args.block_lm
- mask_token = "gMASK" if args.task_mask else 'MASK'
+ mask_token = "gMASK" if args.task_mask else 'mask'
self.mask_id = self.tokenizer.get_command_id(mask_token)
def __len__(self):
@@ -115,7 +115,7 @@ def __init__(self, args, tokenizer, strict=True):
self.strict = strict
self.block_lm = args.block_lm
self.unidirectional = args.unidirectional
- mask_token = "gMASK" if args.task_mask else 'MASK'
+ mask_token = "gMASK" if args.task_mask else 'mask'
self.mask_id = self.tokenizer.get_command_id(mask_token)
self.tokens = []
diff --git a/flagai/data/dataset/seq2seq/dataset.py b/flagai/data/dataset/seq2seq/dataset.py
index adc28149..b0bc4148 100644
--- a/flagai/data/dataset/seq2seq/dataset.py
+++ b/flagai/data/dataset/seq2seq/dataset.py
@@ -477,7 +477,7 @@ def __len__(self):
def __getitem__(self, idx):
example = self.example_list[idx]
source_text, target_text = example.text_a, example.text_b
- mask_token = 'MASK'
+ mask_token = 'mask'
mask_id = self.tokenizer.get_command_id(mask_token)
sop_id = self.tokenizer.get_command_id('sop')
eop_id = self.tokenizer.get_command_id('eop')
@@ -612,7 +612,7 @@ def __len__(self):
def __getitem__(self, idx):
example = self.example_list[idx]
source_text = example.text_a
- mask_token = 'gMASK' if self.args.task_mask else 'MASK'
+ mask_token = 'gMASK' if self.args.task_mask else 'mask'
mask_id = self.tokenizer.get_command_id(mask_token)
sop_id = self.tokenizer.get_command_id('sop')
eop_id = self.tokenizer.get_command_id('eop')
diff --git a/flagai/data/dataset/superglue/pvp.py b/flagai/data/dataset/superglue/pvp.py
index d4d07b39..8a4d6ee3 100644
--- a/flagai/data/dataset/superglue/pvp.py
+++ b/flagai/data/dataset/superglue/pvp.py
@@ -97,12 +97,12 @@ def spell_length(self):
@property
def mask(self) -> str:
"""Return the underlying LM's mask token"""
- return self.tokenizer.get_command_id('MASK')
+ return self.tokenizer.get_command_id('mask')
@property
def mask_id(self) -> int:
"""Return the underlying LM's mask id"""
- return self.tokenizer.get_command_id('MASK')
+ return self.tokenizer.get_command_id('mask')
@property
def max_num_verbalizers(self) -> int:
@@ -574,13 +574,13 @@ def spell_length(self):
@property
def mask(self) -> str:
"""Return the underlying LM's mask token"""
- mask_token = 'MASK'
+ mask_token = 'mask'
return self.tokenizer.get_command_id(mask_token)
@property
def mask_id(self) -> int:
"""Return the underlying LM's mask id"""
- mask_token = 'MASK'
+ mask_token = 'mask'
return self.tokenizer.get_command_id(mask_token)
def get_answers(self, example: InputExample):
diff --git a/flagai/data/tokenizer/bert/bert_tokenizer.py b/flagai/data/tokenizer/bert/bert_tokenizer.py
index eec168ea..3c935713 100644
--- a/flagai/data/tokenizer/bert/bert_tokenizer.py
+++ b/flagai/data/tokenizer/bert/bert_tokenizer.py
@@ -75,7 +75,7 @@ def __init__(self, tokenizer_model_type=None, cache_dir=None):
self._command_tokens = [
CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
- CommandToken('MASK', '[MASK]',
+ CommandToken('mask', '[MASK]',
self.get_specialid_from_text_tokenizer('mask')),
CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
diff --git a/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py b/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py
index 7bae0deb..24275e1f 100644
--- a/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py
+++ b/flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py
@@ -37,7 +37,7 @@ def __init__(self, vocab_file, model_file, max_length=None):
self.encoder = json.load(open(vocab_file))
self.decoder = {v: k for k, v in self.encoder.items()}
- self.sp = spm.SentencePieceProcessor(model_file=model_file)
+ self.sp_model = spm.SentencePieceProcessor(model_file=model_file)
self.translator = str.maketrans(" \n", "\u2582\u2583")
self.token_start_id = 0
self.token_end_id = 3
@@ -48,6 +48,13 @@ def __init__(self, vocab_file, model_file, max_length=None):
def vocab_size(self):
return len(self.encoder)
+ def get_vocab(self):
+ vocab = {
+ self.convert_id_to_token(i): i
+ for i in range(self.vocab_size)
+ }
+ return vocab
+
def __len__(self):
return len(self.encoder) + len(self.special_tokens)
@@ -57,19 +64,28 @@ def eod(self):
def tokenize(self, text):
""" Tokenize a string. """
- seg_list = [
- x.translate(self.translator)
- for x in jieba.cut(text, cut_all=False)
- ]
- new_seg = " ".join(seg_list)
- return self.sp.encode(new_seg)
+ seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]
+ new_seg = "".join(seg_list)
+ return self.sp_model.encode(new_seg)
def encode(self, text):
res = self.tokenize(text)
return res
+
+ def convert_tokens_to_ids(self, tokens):
+ return [self.sp_model.PieceToId(token) for token in tokens]
+
+ def convert_token_to_id(self, token):
+ return self.sp_model.PieceToId(token)
+
+ def convert_id_to_token(self, idx):
+ return self.sp_model.IdToPiece(int(idx))
+
+ def convert_ids_to_tokens(self, idxs):
+ return [self.sp_model.IdToPiece(int(idx)) for idx in idxs]
def decode(self, tokens):
- text = self.sp.decode(tokens)
+ text = self.sp_model.decode(tokens)
text = text.replace(' ', '').replace('\u2582',
' ').replace('\u2583', '\n')
return text
@@ -78,3 +94,18 @@ def encode_plus(self, text, max_length=None):
res = self.encode(text)
return {"input_ids": res}
+
+ def convert_tokens_to_string(self, tokens, all_command_token={}):
+ """Converts a sequence of tokens (string) in a single string."""
+ current_sub_tokens = []
+ out_string = ""
+ for token in tokens:
+ # make sure that special tokens are not decoded using sentencepiece model
+ if token in all_command_token:
+ out_string += self.sp_model.decode_pieces(
+ current_sub_tokens) + token + " "
+ current_sub_tokens = []
+ else:
+ current_sub_tokens.append(token)
+ out_string += self.sp_model.decode_pieces(current_sub_tokens)
+ return out_string.strip()
diff --git a/flagai/data/tokenizer/galactica/galactica_tokenizer.py b/flagai/data/tokenizer/galactica/galactica_tokenizer.py
index fdaf5be6..f028d0f0 100644
--- a/flagai/data/tokenizer/galactica/galactica_tokenizer.py
+++ b/flagai/data/tokenizer/galactica/galactica_tokenizer.py
@@ -15,7 +15,7 @@ def __init__(self, download_dir) -> None:
self._command_tokens = [
CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
- CommandToken('MASK', '[MASK]',
+ CommandToken('mask', '[MASK]',
self.get_specialid_from_text_tokenizer('mask')),
CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
diff --git a/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py b/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py
index b762b66b..e592d33d 100644
--- a/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py
+++ b/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py
@@ -60,7 +60,7 @@ def __init__(self,
self.text_tokenizer.encoder['']),
CommandToken('cls', '[CLS]',
self.text_tokenizer.encoder['']),
- CommandToken('MASK',
+ CommandToken('mask',
'[MASK]',
self.text_tokenizer.encoder[''],
lstrip=True),
@@ -88,7 +88,7 @@ def __init__(self,
CommandToken('sop', '<|startofpiece|>', self.num_tokens),
CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1),
CommandToken('cls', '[CLS]', self.num_tokens + 2),
- CommandToken('MASK',
+ CommandToken('mask',
'[MASK]',
self.num_tokens + 3,
lstrip=True),
diff --git a/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py b/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py
index 69048d3a..b91797f6 100644
--- a/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py
+++ b/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py
@@ -55,7 +55,7 @@ def __init__(self,
CommandToken('eos', '<|endoftext|>', self.num_text_tokens),
CommandToken('sep', '[SEP]', self.num_text_tokens + 1),
CommandToken('cls', '[CLS]', self.num_text_tokens + 2),
- CommandToken('MASK',
+ CommandToken('mask',
'[MASK]',
self.num_text_tokens + 3,
lstrip=True),
diff --git a/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py b/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py
index ff4e1e4a..db4c726f 100644
--- a/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py
+++ b/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py
@@ -59,7 +59,7 @@ def __init__(self,
self._command_tokens = [
CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']),
CommandToken('cls', '[CLS]', self.text_tokenizer.vocab['[CLS]']),
- CommandToken('MASK', '[MASK]',
+ CommandToken('mask', '[MASK]',
self.text_tokenizer.vocab['[MASK]']),
CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']),
CommandToken('sep', '[SEP]', self.text_tokenizer.vocab['[SEP]']),
diff --git a/flagai/data/tokenizer/opt/opt_en_tokenizer.py b/flagai/data/tokenizer/opt/opt_en_tokenizer.py
index 5c1c0de8..9e8e528c 100644
--- a/flagai/data/tokenizer/opt/opt_en_tokenizer.py
+++ b/flagai/data/tokenizer/opt/opt_en_tokenizer.py
@@ -35,7 +35,7 @@ def __init__(self, tokenizer_model_type="facebook/opt-125m", cache_dir=None):
self._command_tokens = [
CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
- CommandToken('MASK', '[MASK]',
+ CommandToken('mask', '[MASK]',
self.get_specialid_from_text_tokenizer('mask')),
CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
diff --git a/flagai/data/tokenizer/roberta/roberta_tokenizer.py b/flagai/data/tokenizer/roberta/roberta_tokenizer.py
index 553a8a83..f1b270e4 100644
--- a/flagai/data/tokenizer/roberta/roberta_tokenizer.py
+++ b/flagai/data/tokenizer/roberta/roberta_tokenizer.py
@@ -38,7 +38,7 @@ def __init__(self, tokenizer_model_type="roberta-base", cache_dir=None):
self._command_tokens = [
CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
- CommandToken('MASK', '[MASK]',
+ CommandToken('mask', '[MASK]',
self.get_specialid_from_text_tokenizer('mask')),
CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
diff --git a/flagai/data/tokenizer/t5/t5_tokenizer.py b/flagai/data/tokenizer/t5/t5_tokenizer.py
index ef793b67..499aa83e 100644
--- a/flagai/data/tokenizer/t5/t5_tokenizer.py
+++ b/flagai/data/tokenizer/t5/t5_tokenizer.py
@@ -45,7 +45,7 @@ def __init__(self, tokenizer_model_type="t5-base", cache_dir=None):
CommandToken('pad', '[PAD]', self.num_tokens + 1),
CommandToken('cls', '[CLS]', self.num_tokens + 2),
- CommandToken('MASK', '[MASK]',
+ CommandToken('mask', '[MASK]',
self.num_tokens + 3),
]
self._command_tokens.extend([
diff --git a/flagai/data/tokenizer/tokenizer.py b/flagai/data/tokenizer/tokenizer.py
index c3ba085f..43585688 100644
--- a/flagai/data/tokenizer/tokenizer.py
+++ b/flagai/data/tokenizer/tokenizer.py
@@ -54,7 +54,7 @@ def __str__(self):
('sep', 4),
('L2R', 5),
('cls', 6),
- ('MASK', 7),
+ ('mask', 7),
]
DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS)
"""define some default type tokens for bert training"""
diff --git a/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py
index f3583437..37629623 100644
--- a/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py
+++ b/flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py
@@ -83,7 +83,8 @@ def from_pretrained(cls,
*inputs,
**kwargs)
elif tokenizer_class == "sp":
- return cls(sp_model_file=resolved_sp_file,
+ return cls(vocab_file=resolved_vocab_json_file,
+ sp_model_file=resolved_sp_file,
tokenizer_class=tokenizer_class,
tokenizer_model_name=tokenizer_model_name,
tokenizer_json_file=resolved_tokenizer_json_file,
diff --git a/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py
index a873c54e..1f175e16 100644
--- a/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py
+++ b/flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py
@@ -152,7 +152,7 @@ def tokenize(self, text):
def convert_token_to_id(self, token):
""" Converts a sequence of tokens into ids using the vocab. """
- return self.encoder.get(token, 0)
+ return self.encoder[token]
def convert_tokens_to_ids(self, tokens):
""" Converts a sequence of tokens into ids using the vocab. """
diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py
index 457c6412..4f0c0cde 100644
--- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py
+++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py
@@ -31,6 +31,7 @@
from flagai.data.tokenizer.uni_tokenizer.diffusion_bert_tokenizer import FullTokenizer
from typing import List, Union, Optional
import unicodedata
+import json
def is_control(ch):
@@ -49,7 +50,9 @@ def __init__(self,
add_sentinel_token=0,
add_task_mask=True,
add_decoder_mask=False,
- fix_command_token=True,
+ fix_command_token=False,
+ pre_tokenizer=None,
+ special_tokens=['cls','pad','unk','eos','sep','mask'],
**kwargs):
super().__init__(**kwargs)
@@ -69,277 +72,88 @@ def __init__(self,
self.text_tokenizer = BPETokenizer(self.vocab_file,
self.merges_file)
elif self.tokenizer_class == "sp":
- self.text_tokenizer = SentencePieceTokenizer(self.sp_model_file)
+ if self.tokenizer_model_name.lower().startswith('cpm'):
+ from flagai.data.tokenizer.cpm_1.cpm1_tokenizer import CPMTokenizer
+ self.text_tokenizer = CPMTokenizer(self.vocab_file, self.sp_model_file)
+ elif self.tokenizer_model_name.lower().startswith('cpm3'):
+ from flagai.data.tokenizer.cpm_3.cpm3_tokenizer import CPMTokenizer
+ self.text_tokenizer = CPMTokenizer(self.tokenizer_json_file, self.sp_model_file)
+ else:
+ self.text_tokenizer = SentencePieceTokenizer(self.sp_model_file)
else:
raise NotImplementedError("cannot assign a tokenize class")
self.is_glm = self.tokenizer_model_name.lower().startswith('glm')
# self.is_clip = self.tokenizer_model_name.startswith('clip')
self.num_tokens = self.text_tokenizer.vocab_size
-
- if self.tokenizer_class == "wp":
- # set command tokens from wordpiece tokenizer values
- self.num_command_tokens = 6
- self.num_text_tokens = self.num_tokens - 5
- self.num_type_tokens = 2
- self.token_start_id = None
- self.token_end_id = None
- self.token_pad_id = None
- try:
- self._command_tokens = [
- CommandToken(
- 'pad', '[PAD]',
- self.text_tokenizer.convert_token_to_id('[PAD]')),
- CommandToken(
- 'cls', '[CLS]',
- self.text_tokenizer.convert_token_to_id('[CLS]')),
- CommandToken(
- 'MASK', '[MASK]',
- self.text_tokenizer.convert_token_to_id('[MASK]')),
- CommandToken(
- 'unk', '[UNK]',
- self.text_tokenizer.convert_token_to_id('[UNK]')),
- CommandToken(
- 'sep', '[SEP]',
- self.text_tokenizer.convert_token_to_id('[SEP]')),
- CommandToken(
- 'eos', '[PAD]',
- self.text_tokenizer.convert_token_to_id('[PAD]')),
- ]
- self.token_start_id = self.text_tokenizer.convert_token_to_id(
- '[CLS]')
- self.token_end_id = self.text_tokenizer.convert_token_to_id(
- '[SEP]')
- self.token_pad_id = self.text_tokenizer.convert_token_to_id(
- '[PAD]')
+ if self.tokenizer_model_name.startswith('cpm'):
+ special_tokens.append('eod')
+ if self.tokenizer_model_name.startswith('opt'):
+ special_tokens.append('bos')
+
+ try:
+ with open(self.special_tokens_map, encoding='utf8') as file: dct=json.load(file)
+ sp_tokens = [(k.replace("_token",""),v['content']) for k,v in dct.items()]
+ except FileNotFoundError:
+ dct = None
+ sp_tokens = []
+ for tk in special_tokens:
+ res = self.search_special(tk)
+ if res:
+ sp_tokens += [(tk, res)]
+ self._command_tokens = [CommandToken(e[0], e[1], self.text_tokenizer.convert_token_to_id(e[1])) for e in sp_tokens]
+ if self.tokenizer_model_name.lower().startswith("glm"):
+ if self.tokenizer_class == "wp":
self.text_tokenizer._token_cls = "[CLS]"
self.text_tokenizer._token_sep = "[SEP]"
-
- except KeyError:
+ fix_command_token = False
+ elif self.tokenizer_class == "sp":
+ fix_command_token = True
self._command_tokens = [
- CommandToken(
- 'pad', '[PAD]',
- self.text_tokenizer.convert_token_to_id('')),
- CommandToken(
- 'cls', '[CLS]',
- self.text_tokenizer.convert_token_to_id('')),
- CommandToken(
- 'MASK', '[MASK]',
- self.text_tokenizer.convert_token_to_id('')),
- CommandToken(
- 'unk', '[UNK]',
- self.text_tokenizer.convert_token_to_id('')),
- CommandToken(
- 'sep', '[SEP]',
- self.text_tokenizer.convert_token_to_id('')),
- CommandToken(
- 'eos', '[PAD]',
- self.text_tokenizer.convert_token_to_id('')),
+ CommandToken('pad', '<|endoftext|>', self.num_tokens),
+ CommandToken('eos', '<|endoftext|>', self.num_tokens),
+ CommandToken('sep', '[SEP]', self.num_tokens + 1),
+ CommandToken('cls', '[CLS]', self.num_tokens + 2),
+ CommandToken('mask', '[MASK]', self.num_tokens + 3, lstrip=True),
+ CommandToken('unk', '[UNK]', self.num_tokens + 4)
]
- self.token_start_id = self.text_tokenizer.convert_token_to_id(
- '')
- self.token_end_id = self.text_tokenizer.convert_token_to_id(
- '')
- self.token_pad_id = self.text_tokenizer.convert_token_to_id(
- '')
- self.text_tokenizer._token_cls = ""
- self.text_tokenizer._token_sep = ""
- if add_block_symbols:
- self.add_command_token('sop', '<|startofpiece|>')
- self.add_command_token('eop', '<|endofpiece|>',)
- if add_task_mask:
- self.add_command_token('gMASK', '[gMASK]')
- self.add_command_token('sMASK', '[sMASK]')
- if add_decoder_mask:
- self.add_command_token('dBLOCK', '[dBLOCK]')
- if add_sentinel_token > 0:
- for i in range(1, add_sentinel_token):
- self.add_command_token(f'MASK{i}', f'[MASK{i}]')
- self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>')
- elif self.tokenizer_class == "bpe":
- if self.tokenizer_model_name.lower().startswith('roberta'):
- self.num_command_tokens = 6
- self.num_text_tokens = self.num_tokens - 3
- self._command_tokens = [
- CommandToken(
- 'pad', '<|endoftext|>',
- self.text_tokenizer.convert_token_to_id('')),
- CommandToken(
- 'eos', '<|endoftext|>',
- self.text_tokenizer.convert_token_to_id('')),
- CommandToken(
- 'sep', '[SEP]',
- self.text_tokenizer.convert_token_to_id('')),
- CommandToken(
- 'cls', '[CLS]',
- self.text_tokenizer.convert_token_to_id('')),
- CommandToken(
- 'MASK',
- '[MASK]',
- self.text_tokenizer.convert_token_to_id(''),
- lstrip=True),
- CommandToken(
- 'unk', '[UNK]',
- self.text_tokenizer.convert_token_to_id(''))
- ]
- if add_block_symbols:
- self._command_tokens.extend([
- CommandToken('sop', '<|startofpiece|>',
- self.num_tokens),
- CommandToken('eop', '<|endofpiece|>',
- self.num_tokens + 1)
- ])
- self.num_tokens += 2
- self.num_command_tokens += 2
- self.token_end_id = self.text_tokenizer.convert_token_to_id(
- '')
- elif self.tokenizer_model_name.lower().startswith('clip'):
- self.num_command_tokens = 2
+ self.num_tokens += 6
+ elif self.tokenizer_class == "bpe":
self._command_tokens = [
- CommandToken(
- 'sot', '',
- self.text_tokenizer.convert_token_to_id('')),
- CommandToken(
- 'eot', '',
- self.text_tokenizer.convert_token_to_id('')),
+ CommandToken('pad', '<|endoftext|>',
+ self.text_tokenizer.encoder['<|endoftext|>']),
+ CommandToken('eos', '<|endoftext|>',
+ self.text_tokenizer.encoder['<|endoftext|>'])
]
- self.num_tokens += self.num_command_tokens
- self.token_end_id = self.text_tokenizer.convert_token_to_id(
- '')
- else:
- self.num_command_tokens = 2
- self.num_text_tokens = self.num_tokens - 1
- self._command_tokens = [
- CommandToken(
- 'pad', '<|endoftext|>',
- self.text_tokenizer.convert_token_to_id(
- '<|endoftext|>')),
- CommandToken(
- 'eos', '<|endoftext|>',
- self.text_tokenizer.convert_token_to_id(
- '<|endoftext|>'))
- ]
- self.token_end_id = self.text_tokenizer.convert_token_to_id(
- '<|endoftext|>')
- if add_block_symbols:
- if self.tokenizer_model_name.lower().startswith('glm'):
- unk_token_id = self.num_tokens + 5
- cls_token_id = self.num_tokens + 2
- num_tokens_to_add = 5
- else:
- unk_token_id = self.text_tokenizer.convert_token_to_id(
- '<|endoftext|>')
- cls_token_id = self.text_tokenizer.convert_token_to_id(
- '<|endoftext|>')
- num_tokens_to_add = 4
- self._command_tokens.extend([
- CommandToken('sop', '<|startofpiece|>',
- self.num_tokens),
- CommandToken('eop', '<|endofpiece|>',
- self.num_tokens + 1),
- CommandToken('cls', '[CLS]', cls_token_id),
- CommandToken('MASK',
- '[MASK]',
- self.num_tokens + 3,
- lstrip=True),
- CommandToken('sep', '[SEP]', self.num_tokens + 4),
- CommandToken('unk', '[UNK]', unk_token_id)
- ])
- self.num_tokens += num_tokens_to_add
- self.num_command_tokens += 6
-
- if add_block_symbols:
- if add_task_mask:
- self._command_tokens.extend([
- CommandToken('gMASK',
- '[gMASK]',
- self.num_tokens,
- lstrip=True),
- CommandToken('sMASK',
- '[sMASK]',
- self.num_tokens + 1,
- lstrip=True)
- ])
- self.num_tokens += 2
- self.num_command_tokens += 2
- if add_decoder_mask:
- self._command_tokens.extend(
- [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)])
- self.num_tokens += 1
- self.num_command_tokens += 1
-
- elif self.tokenizer_class == "sp":
- self.num_command_tokens = 0
- self.num_text_tokens = self.text_tokenizer.vocab_size
- self.num_tokens = self.num_text_tokens
-
- if self.tokenizer_model_name.lower().startswith('glm'):
- pad_token_id = self.num_tokens
- eos_token_id = self.num_tokens
- unk_token_id = self.num_tokens + 4
- else:
- pad_token_id = self.text_tokenizer.convert_token_to_id('')
- eos_token_id = self.text_tokenizer.convert_token_to_id('')
- unk_token_id = self.text_tokenizer.convert_token_to_id('')
- self._command_tokens = [
- CommandToken('pad', '<|endoftext|>', self.num_text_tokens),
- CommandToken('eos', '<|endoftext|>', self.num_text_tokens),
- CommandToken('sep', '[SEP]', self.num_text_tokens + 1),
- CommandToken('cls', '[CLS]', self.num_text_tokens + 2),
- CommandToken('MASK',
- '[MASK]',
- self.num_text_tokens + 3,
- lstrip=True),
- CommandToken('unk', '[UNK]', self.num_text_tokens + 4)
- ]
-
- self.num_tokens += 5
- self.num_command_tokens += 6
- self.token_end_id = self.text_tokenizer.convert_token_to_id(
- '')
- if add_block_symbols:
- sop_id = self.text_tokenizer.convert_token_to_id('<|startofpiece|>')
- eop_id = self.text_tokenizer.convert_token_to_id('<|endofpiece|>')
self._command_tokens.extend([
- CommandToken('sop', '<|startofpiece|>',
- self.num_tokens + 1),
- CommandToken('eop', '<|endofpiece|>', self.num_tokens + 2)
+ CommandToken('sop', '<|startofpiece|>', self.num_tokens),
+ CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1),
+ CommandToken('cls', '[CLS]', self.num_tokens + 2),
+ CommandToken('mask',
+ '[MASK]',
+ self.num_tokens + 3,
+ lstrip=True),
+ CommandToken('sep', '[SEP]', self.num_tokens + 4),
+ CommandToken('unk', '[UNK]', self.num_tokens + 5)
])
- if fix_command_token:
- self.num_tokens += 3
- else:
- self.num_tokens += 2
- self.num_command_tokens += 2
+ self.num_tokens += 6
+ if add_block_symbols:
+ if not self.tokenizer_class == "bpe":
+ self.add_command_token('sop', '<|startofpiece|>',self.tokenizer_class)
+ self.add_command_token('eop', '<|endofpiece|>',self.tokenizer_class)
if add_task_mask:
if fix_command_token:
- self._command_tokens.extend([
- CommandToken('sMASK',
- '[sMASK]',
- self.num_tokens,
- lstrip=True),
- CommandToken('gMASK',
- '[gMASK]',
- self.num_tokens + 1,
- lstrip=True)
- ])
+ self.add_command_token('sMASK', '[sMASK]',self.tokenizer_class)
+ self.add_command_token('gMASK', '[gMASK]',self.tokenizer_class)
else:
- self._command_tokens.extend([
- CommandToken('gMASK',
- '[gMASK]',
- self.num_tokens,
- lstrip=True),
- CommandToken('sMASK',
- '[sMASK]',
- self.num_tokens + 1,
- lstrip=True)
- ])
- self.num_tokens += 2
- self.num_command_tokens += 2
+ self.add_command_token('gMASK', '[gMASK]',self.tokenizer_class)
+ self.add_command_token('sMASK', '[sMASK]',self.tokenizer_class)
if add_decoder_mask:
- self._command_tokens.extend(
- [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)])
- self.num_tokens += 1
- self.num_command_tokens += 1
+ self.add_command_token('dBLOCK', '[dBLOCK]',self.tokenizer_class)
+ if add_sentinel_token > 0:
+ for i in range(1, add_sentinel_token):
+ self.add_command_token(f'MASK{i}', f'[MASK{i}]',self.tokenizer_class)
+ self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>',self.tokenizer_class)
self.command_name_map = {tok.name: tok for tok in self._command_tokens}
self.command_token_map = {
tok.token: tok
@@ -347,7 +161,17 @@ def __init__(self,
}
self.command_id_map = {tok.Id: tok for tok in self._command_tokens}
self._command_token_tokens = list(self.command_token_map.keys())
- logger.info("All special tokens: %s", str([(k,v.Id) for k,v in self.command_name_map.items()]))
+ vocab = self.text_tokenizer.get_vocab()
+ self.token_start_id = vocab.get('', None)
+ if not self.token_start_id:
+ self.token_start_id = vocab.get('[CLS]', None)
+
+ self.token_end_id = vocab.get('', None)
+ if not self.token_end_id:
+ self.token_end_id = vocab.get('<|endoftext|>', None)
+ if not self.token_end_id:
+ self.token_end_id = vocab.get('[SEP]', None)
+ print("All special tokens: ", str([(k, v.token, v.Id) for k,v in self.command_name_map.items()]))
def get_vocab(self):
return self.text_tokenizer.get_vocab()
@@ -356,9 +180,12 @@ def get_command_id(self, name):
"""get command token corresponding to `name`"""
return self.command_name_map[name].Id
- def add_command_token(self, name, token):
+ def add_command_token(self, name, token, tokenizer_class="wp"):
try:
- id = self.text_tokenizer.convert_token_to_id(token)
+ if tokenizer_class == "sp":
+ id = self.text_tokenizer.get_vocab()[token]
+ else:
+ id = self.text_tokenizer.convert_token_to_id(token)
except KeyError:
id = self.num_tokens
self.num_tokens += 1
@@ -458,7 +285,7 @@ def TokenToId(self, token):
def DecodeIds(self, ids):
"""converts ids to wordpiece tokens and joins them as a text string"""
- tokens = []
+ tokens = []
for id in ids:
if id in self.command_id_map:
tokens.append(self.command_id_map[id].token)
@@ -472,10 +299,14 @@ def DecodeIds(self, ids):
tokens, self.command_token_map)
def encode(self, text):
+ if hasattr(self.text_tokenizer, "encode"):
+ return self.text_tokenizer.encode(text)
return self.convert_tokens_to_ids(
self.text_tokenizer.tokenize(text))
def decode(self, ids):
+ if hasattr(self.text_tokenizer, "decode"):
+ return self.text_tokenizer.decode(ids)
return self.DecodeIds(ids)
def DecodeTokens(self, tokens):
@@ -566,8 +397,8 @@ def encode_plus_non_glm(
):
def get_input_ids(text):
- tokens = self.text_tokenizer.tokenize(text)
- return self.text_tokenizer.convert_tokens_to_ids(tokens)
+ tokens = self.tokenize(text)
+ return self.convert_tokens_to_ids(tokens)
first_ids = get_input_ids(text)
second_ids = get_input_ids(
@@ -635,10 +466,16 @@ def encode_plus( # for Seq2seq
max_length=None,
padding=True,
):
- if not self.tokenizer_model_name.lower().startswith("glm") and not self.tokenizer_model_name.lower().startswith(
+ if hasattr(self.text_tokenizer, "encode_plus"):
+ return self.text_tokenizer.encode_plus(source_text)
+ elif not self.tokenizer_model_name.lower().startswith("glm") and not self.tokenizer_model_name.lower().startswith(
"alm"):
return self.encode_plus_non_glm(source_text, second_text,
truncation, max_length)
+
+
+ # elif self.tokenizer_model_name.lower().startswith("opt"):
+ # return None
sop_id = self.get_command_id('sop') # start of piece
eop_id = self.get_command_id('eop') # end of piece
sep_id = self.get_command_id('sep') # seperation
@@ -734,4 +571,42 @@ def tokenize(self, text, maxlen=None, add_spatial_tokens=False):
if maxlen is not None:
index = int(self.get_command_id('sep') is not None) + 1
self.truncate_sequence(maxlen, tokens, pop_index=-index)
- return tokens
\ No newline at end of file
+ return tokens
+
+ def search_special(self, name):
+ if name == "cls":
+ if self.check_special(''): return ''
+ elif self.check_special('[CLS]'): return '[CLS]'
+ elif name == "pad":
+ if self.check_special(''): return ''
+ elif self.check_special('[PAD]'): return '[PAD]'
+ elif self.check_special('<|endoftext|>'): return '<|endoftext|>'
+ elif name == "eos":
+ if self.check_special(''): return ''
+ elif self.check_special('|endoftext|'): return '|endoftext|'
+ elif self.check_special('[PAD]'): return '[PAD]'
+ elif name == "sep":
+ if self.check_special(''): return ''
+ elif self.check_special('[SEP]'): return '[SEP]'
+ elif name == "unk":
+ if self.check_special(''): return ''
+ elif self.check_special('[UNK]'): return '[UNK]'
+ elif name == "bos":
+ if self.check_special(''): return ''
+ elif name == "mask":
+ if self.check_special('[MASK]'): return '[MASK]'
+ elif self.check_special(''): return ''
+ elif name == "eod":
+ if self.check_special(''): return ''
+ return None
+
+ def check_special(self, tk):
+
+ try:
+ if self.tokenizer_class == 'sp':
+ self.text_tokenizer.get_vocab()[tk]
+ else:
+ self.text_tokenizer.convert_token_to_id(tk)
+ return True
+ except KeyError:
+ return False
diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py
index 72077041..61d91ce6 100644
--- a/flagai/model/predictor/utils.py
+++ b/flagai/model/predictor/utils.py
@@ -1133,7 +1133,7 @@ def alm_beamsearch(model, tokenizer, text, out_max_length, beam_size, eod_token=
dtype=torch.long)
position_ids = torch.stack((position_ids, block_position_ids), dim=0)
position_ids = position_ids.unsqueeze(0)
- mask_tokens = ['MASK', 'sMASK', 'gMASK']
+ mask_tokens = ['mask', 'sMASK', 'gMASK']
mask_tokens = [tokenizer.get_command_id(token) for token in mask_tokens]
end_tokens = [tokenizer.get_command_id('eop'), eod_token]
mask_positions = []
@@ -1434,7 +1434,7 @@ def glm_generate_sample(
dtype=torch.long)
position_ids = torch.stack((position_ids, block_position_ids), dim=0)
position_ids = position_ids.unsqueeze(0)
- mask_tokens = ['MASK', 'sMASK', 'gMASK']
+ mask_tokens = ['mask', 'sMASK', 'gMASK']
mask_tokens = [tokenizer.get_command_id(token) for token in mask_tokens]
end_tokens = [tokenizer.get_command_id('eop'), eod_token]
mask_positions = []
diff --git a/flagai/test_utils.py b/flagai/test_utils.py
index 83dacde3..5faa0aec 100644
--- a/flagai/test_utils.py
+++ b/flagai/test_utils.py
@@ -14,7 +14,7 @@ def build_input_from_ids(text_a_ids=None,
mask_id=None,
masked_lm=False):
if mask_id is None:
- mask_id = tokenizer.get_command_id('MASK')
+ mask_id = tokenizer.get_command_id('mask')
eos_id = tokenizer.get_command_id('eos')
cls_id = tokenizer.get_command_id('cls')
sep_id = tokenizer.get_command_id('sep')
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 87b7fa63..c72d34a7 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -14,6 +14,14 @@ def test_tokenizer_GLM_large_ch(self):
[3378, 1567, 2613, 20282], 'EncodeAsIds Error')
self.assertEqual(tokenizer.DecodeIds([3378, 1567, 2613, 20282]),
'今天吃饭吃了肯德基', 'DecodeIds Error')
+ self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'),
+ ['▁今天', '吃饭', '吃了', '肯德基'], 'tokenize Error')
+ self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'],
+ [50006, 3378, 1567, 2613, 20282, 50001], 'encode_plus Error')
+ self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]),
+ {('pad', '<|endoftext|>', 50000), ('eos', '<|endoftext|>', 50000), ('sep', '[SEP]', 50001),
+ ('cls', '[CLS]', 50002), ('mask', '[MASK]', 50003), ('unk', '[UNK]', 50004), ('sop', '<|startofpiece|>', 50006),
+ ('eop', '<|endofpiece|>', 50007), ('sMASK', '[sMASK]', 50008), ('gMASK', '[gMASK]', 50009)}, 'SpecialTokens error')
def test_tokenizer_GLM_large_en(self):
tokenizer = Tokenizer.from_pretrained("GLM-large-en")
@@ -22,6 +30,10 @@ def test_tokenizer_GLM_large_en(self):
[13017, 7975, 3084, 2033, 3407], '')
self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]),
'fried chicken makes me happy', 'DecodeIds Error')
+ self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]),
+ {('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100),
+ ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523),
+ ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)})
# def test_tokenizer_glm_10b_en(self):
# tokenizer = Tokenizer.from_pretrained("GLM-10b-en")
@@ -30,23 +42,43 @@ def test_tokenizer_GLM_large_en(self):
# [25520, 9015, 1838, 502, 3772], '')
# self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]),
# 'fried chicken makes me happy', 'DecodeIds Error')
+ # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()],
+ # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100),
+ # ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523),
+ # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)])
+
def test_tokenizer_t5(self):
- tokenizer = Tokenizer.from_pretrained('t5-base-en')
- self.assertEqual(tokenizer.TokenToId("day"), 1135, '')
- self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"),
- [3, 7704, 3832, 656, 140, 1095], '')
- self.assertEqual(tokenizer.DecodeIds([3, 7704, 3832, 656, 140, 1095]),
- 'fried chicken makes me happy', 'DecodeIds Error')
+ tokenizer = Tokenizer.from_pretrained('T5-base-ch')
+ self.assertEqual(tokenizer.TokenToId("人"), 297, '')
+ self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"),
+ [306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166], '')
+ self.assertEqual(tokenizer.DecodeIds([306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166]),
+ '今天吃饭吃了肯德基', 'DecodeIds Error')
+ encode_plus_result = tokenizer.encode_plus("今天吃饭吃了肯德基")
+ self.assertEqual(list(encode_plus_result.keys()),
+ ['input_ids', 'token_type_ids'], 'encode_plus Error')
+ self.assertEqual(encode_plus_result['input_ids'],
+ [101, 306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166, 102], 'encode_plus Error')
+ self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]),
+ {('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100),
+ ('sep', '[SEP]', 102), ('pad', '[PAD]', 0)}, 'SpecialTokens error')
+
def test_tokenizer_roberta(self):
tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch')
- # print(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]))
self.assertEqual(tokenizer.TokenToId("人"), 782, '')
self.assertEqual(tokenizer.EncodeAsIds("今天吃饭吃了肯德基"),
[791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825], '')
self.assertEqual(tokenizer.DecodeIds([791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825]),
'今天吃饭吃了肯德基', 'DecodeIds Error')
+ self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'),
+ ['今', '天', '吃', '饭', '吃', '了', '肯', '德', '基'], 'tokenize Error')
+ self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'],
+ [101, 791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825, 102], 'encode_plus Error')
+ self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]),
+ {('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), ('mask', '[MASK]', 103),
+ ('eos', '[PAD]', 0), ('pad', '[PAD]', 0)}, 'SpecialTokens error')
def test_tokenizer_bert(self):
tokenizer = Tokenizer.from_pretrained('BERT-base-en')
@@ -55,26 +87,48 @@ def test_tokenizer_bert(self):
[13017, 7975, 3084, 2033, 3407], '')
self.assertEqual(tokenizer.DecodeIds([13017, 7975, 3084, 2033, 3407]),
'fried chicken makes me happy', 'DecodeIds Error')
+ self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'),
+ ['fried', 'chicken', 'makes', 'me', 'happy'], 'tokenize Error')
+ self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'],
+ [101, 13017, 7975, 3084, 2033, 3407, 102], 'encode_plus Error')
+ self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]),
+ {('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102),
+ ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)}, 'SpecialTokens error')
- def test_tokenizer_cpm1(self):
- loader = AutoLoader(task_name="lm",
- model_name="CPM-large-ch",
- model_dir="./checkpoints/",
- only_download_config=True)
- tokenizer = loader.get_tokenizer()
- self.assertEqual(tokenizer.encode("day"), [8, 8275], '')
- self.assertEqual(tokenizer.encode("fried chicken makes me happy"),
- [2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239], '')
- self.assertEqual(tokenizer.decode([2487, 27385, 9291, 9412, 3531, 14588, 289, 4406, 25239]),
- 'fried chicken makes me happy', 'DecodeIds Error')
+ # def test_tokenizer_cpm1(self):
+ # loader = AutoLoader(task_name="lm",
+ # model_name="CPM-large-ch",
+ # model_dir="./checkpoints/",
+ # only_download_config=True)
+
+ # tokenizer = loader.get_tokenizer()
+ # self.assertEqual(tokenizer.TokenToId("人"), 62, '')
+ # self.assertEqual(tokenizer.encode("今天吃饭吃了肯德基"),
+ # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], '')
+ # self.assertEqual(tokenizer.DecodeIds([837, 3079, 1777, 3079, 139, 3687, 513, 1463]),
+ # '今天吃饭吃了肯德基', 'DecodeIds Error')
+ # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'),
+ # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'tokenize Error')
+ # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'],
+ # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'encode_plus Error')
+ # self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]),
+ # {('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4),
+ # ('mask', '', 6), ('pad', '', 5),('eod', '', 7)}, 'SpecialTokens error')
def test_tokenizer_opt(self):
- tokenizer = Tokenizer.from_pretrained('opt-125m-en')
+ tokenizer = Tokenizer.from_pretrained('opt-1.3b-en')
self.assertEqual(tokenizer.encode("day"), [1208], '')
self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"],
- [50260, 21209, 5884, 817, 162, 1372, 50260], '')
+ [0, 21209, 5884, 817, 162, 1372, 2], '')
self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]),
'fried chicken makes me happy', 'DecodeIds Error')
+ self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'),
+ ['fried', 'Ġchicken', 'Ġmakes', 'Ġme', 'Ġhappy'], 'tokenize Error')
+ self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'],
+ [0, 21209, 5884, 817, 162, 1372, 2], 'encode_plus Error')
+ self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]),
+ {('cls', '', 0), ('pad', '', 1), ('bos', '', 2), ('eos', '', 2), ('unk', '', 3),
+ ('mask', '', 50264)}, 'SpecialTokens error')
def test_tokenizer_clip(self):
loader = AutoLoader(task_name="txt_img_matching",
@@ -89,6 +143,7 @@ def test_tokenizer_evaclip(self):
self.assertEqual(tokenizer.tokenize_as_tensor("cat")[0][:3].tolist(), [49406, 2368, 49407], '')
+
def suite():
suite = unittest.TestSuite()
suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch'))
@@ -97,7 +152,7 @@ def suite():
suite.addTest(TokenizerTestCase('test_tokenizer_t5'))
suite.addTest(TokenizerTestCase('test_tokenizer_roberta'))
suite.addTest(TokenizerTestCase('test_tokenizer_bert'))
- suite.addTest(TokenizerTestCase('test_tokenizer_cpm1'))
+ # suite.addTest(TokenizerTestCase('test_tokenizer_cpm1'))
suite.addTest(TokenizerTestCase('test_tokenizer_opt'))
suite.addTest(TokenizerTestCase('test_tokenizer_clip'))
suite.addTest(TokenizerTestCase('test_tokenizer_evaclip'))