Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@ datasets
qqp
glm_large_qqp_pytorch
wandb
clip_benchmark_datasets/
clip_benchmark_datasets
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,6 @@ The majority of FlagAI is licensed under the [Apache 2.0 license](LICENSE), howe
### ↳ Star History
<div align="center">

[![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)](https://star-history.com/#baaivision/EVA&Date)
![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)

</div>
2 changes: 1 addition & 1 deletion examples/bert_title_generation_english/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_dir = "../state_dict/"
model_dir = "./checkpoints/"

# Note "./checkpoints_seq2seq/{}/mp_rank_00_model_states.pt", {} is a directory in the checkpoints_seq2seq.
model_save_path = "./checkpoints_seq2seq/7079/mp_rank_00_model_states.pt"
Expand Down
2 changes: 1 addition & 1 deletion examples/roberta_semantic_matching/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

cur_dir = os.path.dirname(os.path.abspath(__file__))
train_path = cur_dir + "/data/train.tsv"
model_dir = "./state_dict/"
model_dir = "./checkpoints/"
maxlen = 256

auto_loader = AutoLoader("semantic-matching",
Expand Down
8 changes: 4 additions & 4 deletions flagai/data/dataset/block/blocklm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,10 @@ def __init__(self,
self.encoder_decoder = encoder_decoder
self.shuffle_blocks = shuffle_blocks
self.sentinel_token = sentinel_token
self.generation_mask = 'gMASK' if task_mask else 'MASK'
self.generation_mask = 'gMASK' if task_mask else 'mask'
self.generation_mask = self.tokenizer.get_command_id(
self.generation_mask)
self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK'
self.gap_sentence_mask = 'sMASK' if task_mask else 'mask'
self.gap_sentence_mask = self.tokenizer.get_command_id(
self.gap_sentence_mask)
self.random_position = random_position
Expand Down Expand Up @@ -205,7 +205,7 @@ def make_masked_data(self,
#
position_ids = np.arange(len(tokens), dtype=np.int64)
targets = copy.deepcopy(tokens)
mask_id = self.tokenizer.get_command_id('MASK')
mask_id = self.tokenizer.get_command_id('mask')
mlm_masks = np.zeros(len(tokens), dtype=np.int64)
for start, end in block_spans:
for idx in range(start, end):
Expand Down Expand Up @@ -273,7 +273,7 @@ def make_block_data(self,
elif task == 'gap_sentence':
mask_id = self.gap_sentence_mask
else:
mask_token = 'MASK' if idx == 0 else f'MASK{idx}'
mask_token = 'mask' if idx == 0 else f'MASK{idx}'
mask_id = self.tokenizer.get_command_id(mask_token)
local_spans.append((current_length, current_length + start - last))
source_tokens.append(tokens[last:start])
Expand Down
14 changes: 7 additions & 7 deletions flagai/data/dataset/data_collator/collate_fn.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def __init__(self, args, tokenizer, task_name):

def encode(self, example):
cls_id = self.tokenizer.get_command_id('cls')
mask_token = 'sMASK' if self.args.task_mask else 'MASK'
mask_token = 'sMASK' if self.args.task_mask else 'mask'
mask_id = self.tokenizer.get_command_id(mask_token)
pad_id = self.tokenizer.get_command_id('pad')
sop_id = self.tokenizer.get_command_id('sop')
Expand Down Expand Up @@ -175,7 +175,7 @@ def sub_finder(mylist, pattern):
source_tokens = [cls_id] + source_tokens + [mask_id
] + answer_tokens
elif self.task_name in ["cmrc"]:
mask_id = self.tokenizer.get_command_id('MASK')
mask_id = self.tokenizer.get_command_id('mask')
source_text = example.text_a
target_text = example.meta["answer"].strip()
question = example.meta["question"].strip()
Expand All @@ -191,7 +191,7 @@ def sub_finder(mylist, pattern):
mask_id
] + source_tokens[:max_src_length]
elif self.task_name in ["wsc"]:
mask_id = self.tokenizer.get_command_id('MASK')
mask_id = self.tokenizer.get_command_id('mask')
source_text = example.text_a
target_text = example.meta["answer"].strip()
question = example.meta["question"].strip()
Expand Down Expand Up @@ -307,10 +307,10 @@ def __init__(self,
self.encoder_decoder = encoder_decoder
self.shuffle_blocks = shuffle_blocks
self.sentinel_token = sentinel_token
self.generation_mask = 'gMASK' if task_mask else 'MASK'
self.generation_mask = 'gMASK' if task_mask else 'mask'
self.generation_mask = self.tokenizer.get_command_id(
self.generation_mask)
self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK'
self.gap_sentence_mask = 'sMASK' if task_mask else 'mask'
self.gap_sentence_mask = self.tokenizer.get_command_id(
self.gap_sentence_mask)
self.random_position = random_position
Expand Down Expand Up @@ -426,7 +426,7 @@ def make_masked_data(self,

position_ids = np.arange(len(tokens), dtype=np.int64)
targets = copy.deepcopy(tokens)
mask_id = self.tokenizer.get_command_id('MASK')
mask_id = self.tokenizer.get_command_id('mask')
mlm_masks = np.zeros(len(tokens), dtype=np.int64)
for start, end in block_spans:
for idx in range(start, end):
Expand Down Expand Up @@ -494,7 +494,7 @@ def make_block_data(self,
elif task == 'gap_sentence':
mask_id = self.gap_sentence_mask
else:
mask_token = 'MASK' if idx == 0 else f'MASK{idx}'
mask_token = 'mask' if idx == 0 else f'MASK{idx}'
mask_id = self.tokenizer.get_command_id(mask_token)
local_spans.append((current_length, current_length + start - last))
source_tokens.append(tokens[last:start])
Expand Down
4 changes: 2 additions & 2 deletions flagai/data/dataset/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def build_input_from_ids(text_a_ids,

# Prepare ids for special tokens
if mask_id is None:
mask_id = tokenizer.get_command_id('MASK')
mask_id = tokenizer.get_command_id('mask')
eos_id = tokenizer.get_command_id('eos') # end of sentence token
cls_id = tokenizer.get_command_id('cls') # start of sentence token
sep_id = tokenizer.get_command_id('sep') # separator of two texts token
Expand Down Expand Up @@ -235,7 +235,7 @@ def build_input_from_ids(text_a_ids,
#
def build_decoder_input(enc_ids, answer_ids, max_seq_length,
max_dec_seq_length, tokenizer):
mask_id = tokenizer.get_command_id('MASK')
mask_id = tokenizer.get_command_id('mask')
eos_id = tokenizer.get_command_id('eos')
sop_id = tokenizer.get_command_id('sop')
masks = []
Expand Down
4 changes: 2 additions & 2 deletions flagai/data/dataset/language_model/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(self, args, documents, tokenizer, num_original_tokens,
self.left_weights = [0] + self.weights[:-1]
self.unidirectional = args.unidirectional
self.block_lm = args.block_lm
mask_token = "gMASK" if args.task_mask else 'MASK'
mask_token = "gMASK" if args.task_mask else 'mask'
self.mask_id = self.tokenizer.get_command_id(mask_token)

def __len__(self):
Expand Down Expand Up @@ -115,7 +115,7 @@ def __init__(self, args, tokenizer, strict=True):
self.strict = strict
self.block_lm = args.block_lm
self.unidirectional = args.unidirectional
mask_token = "gMASK" if args.task_mask else 'MASK'
mask_token = "gMASK" if args.task_mask else 'mask'
self.mask_id = self.tokenizer.get_command_id(mask_token)

self.tokens = []
Expand Down
4 changes: 2 additions & 2 deletions flagai/data/dataset/seq2seq/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ def __len__(self):
def __getitem__(self, idx):
example = self.example_list[idx]
source_text, target_text = example.text_a, example.text_b
mask_token = 'MASK'
mask_token = 'mask'
mask_id = self.tokenizer.get_command_id(mask_token)
sop_id = self.tokenizer.get_command_id('sop')
eop_id = self.tokenizer.get_command_id('eop')
Expand Down Expand Up @@ -612,7 +612,7 @@ def __len__(self):
def __getitem__(self, idx):
example = self.example_list[idx]
source_text = example.text_a
mask_token = 'gMASK' if self.args.task_mask else 'MASK'
mask_token = 'gMASK' if self.args.task_mask else 'mask'
mask_id = self.tokenizer.get_command_id(mask_token)
sop_id = self.tokenizer.get_command_id('sop')
eop_id = self.tokenizer.get_command_id('eop')
Expand Down
8 changes: 4 additions & 4 deletions flagai/data/dataset/superglue/pvp.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,12 @@ def spell_length(self):
@property
def mask(self) -> str:
"""Return the underlying LM's mask token"""
return self.tokenizer.get_command_id('MASK')
return self.tokenizer.get_command_id('mask')

@property
def mask_id(self) -> int:
"""Return the underlying LM's mask id"""
return self.tokenizer.get_command_id('MASK')
return self.tokenizer.get_command_id('mask')

@property
def max_num_verbalizers(self) -> int:
Expand Down Expand Up @@ -574,13 +574,13 @@ def spell_length(self):
@property
def mask(self) -> str:
"""Return the underlying LM's mask token"""
mask_token = 'MASK'
mask_token = 'mask'
return self.tokenizer.get_command_id(mask_token)

@property
def mask_id(self) -> int:
"""Return the underlying LM's mask id"""
mask_token = 'MASK'
mask_token = 'mask'
return self.tokenizer.get_command_id(mask_token)

def get_answers(self, example: InputExample):
Expand Down
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/bert/bert_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def __init__(self, tokenizer_model_type=None, cache_dir=None):
self._command_tokens = [
CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
CommandToken('MASK', '[MASK]',
CommandToken('mask', '[MASK]',
self.get_specialid_from_text_tokenizer('mask')),
CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
Expand Down
47 changes: 39 additions & 8 deletions flagai/data/tokenizer/cpm_1/cpm1_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(self, vocab_file, model_file, max_length=None):
self.encoder = json.load(open(vocab_file))
self.decoder = {v: k for k, v in self.encoder.items()}

self.sp = spm.SentencePieceProcessor(model_file=model_file)
self.sp_model = spm.SentencePieceProcessor(model_file=model_file)
self.translator = str.maketrans(" \n", "\u2582\u2583")
self.token_start_id = 0
self.token_end_id = 3
Expand All @@ -48,6 +48,13 @@ def __init__(self, vocab_file, model_file, max_length=None):
def vocab_size(self):
return len(self.encoder)

def get_vocab(self):
    """Return a dict mapping each token piece to its id.

    Ids range over ``range(self.vocab_size)``; each id is turned into its
    piece with ``self.convert_id_to_token``.
    """
    token_to_id = {}
    for token_id in range(self.vocab_size):
        token_to_id[self.convert_id_to_token(token_id)] = token_id
    return token_to_id

def __len__(self):
    """Return the number of entries in ``self.encoder`` plus ``self.special_tokens``."""
    base_size = len(self.encoder)
    extra_size = len(self.special_tokens)
    return base_size + extra_size

Expand All @@ -57,19 +64,28 @@ def eod(self):

def tokenize(self, text):
""" Tokenize a string. """
seg_list = [
x.translate(self.translator)
for x in jieba.cut(text, cut_all=False)
]
new_seg = " ".join(seg_list)
return self.sp.encode(new_seg)
seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]
new_seg = "".join(seg_list)
return self.sp_model.encode(new_seg)

def encode(self, text):
    """Encode ``text`` by delegating to ``self.tokenize`` and returning its result."""
    return self.tokenize(text)

def convert_tokens_to_ids(self, tokens):
    """Return the list of ids obtained by calling ``sp_model.PieceToId`` on each token."""
    ids = []
    for piece in tokens:
        ids.append(self.sp_model.PieceToId(piece))
    return ids

def convert_token_to_id(self, token):
    """Return the id that ``sp_model.PieceToId`` assigns to a single token."""
    token_id = self.sp_model.PieceToId(token)
    return token_id

def convert_id_to_token(self, idx):
    """Return the piece ``sp_model.IdToPiece`` gives for ``idx`` (coerced with ``int``)."""
    piece = self.sp_model.IdToPiece(int(idx))
    return piece

def convert_ids_to_tokens(self, idxs):
    """Return the pieces for ``idxs``; each id is coerced with ``int`` before lookup."""
    pieces = []
    for raw_id in idxs:
        pieces.append(self.sp_model.IdToPiece(int(raw_id)))
    return pieces

def decode(self, tokens):
text = self.sp.decode(tokens)
text = self.sp_model.decode(tokens)
text = text.replace(' ', '').replace('\u2582',
' ').replace('\u2583', '\n')
return text
Expand All @@ -78,3 +94,18 @@ def encode_plus(self, text, max_length=None):
res = self.encode(text)

return {"input_ids": res}

def convert_tokens_to_string(self, tokens, all_command_token=None):
    """Convert a sequence of token strings into a single string.

    Runs of ordinary tokens are decoded together with
    ``self.sp_model.decode_pieces``. A token found in ``all_command_token``
    is emitted verbatim, followed by one space, and is never handed to the
    SentencePiece model.

    Args:
        tokens: Iterable of token strings.
        all_command_token: Container of command-token strings, tested
            with ``in``. ``None`` (the default) means there are none.

    Returns:
        The concatenated text with leading and trailing whitespace stripped.
    """
    # A None default avoids sharing one mutable dict object across calls.
    if all_command_token is None:
        all_command_token = ()

    parts = []    # decoded fragments, joined once at the end
    pending = []  # ordinary tokens waiting to be decoded together
    for token in tokens:
        if token in all_command_token:
            # Flush pending tokens first so the command token keeps its
            # position and is not decoded by the SentencePiece model.
            parts.append(self.sp_model.decode_pieces(pending))
            parts.append(token)
            parts.append(" ")
            pending = []
        else:
            pending.append(token)
    parts.append(self.sp_model.decode_pieces(pending))
    return "".join(parts).strip()
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/galactica/galactica_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def __init__(self, download_dir) -> None:
self._command_tokens = [
CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
CommandToken('MASK', '[MASK]',
CommandToken('mask', '[MASK]',
self.get_specialid_from_text_tokenizer('mask')),
CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
Expand Down
4 changes: 2 additions & 2 deletions flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def __init__(self,
self.text_tokenizer.encoder['</s>']),
CommandToken('cls', '[CLS]',
self.text_tokenizer.encoder['<s>']),
CommandToken('MASK',
CommandToken('mask',
'[MASK]',
self.text_tokenizer.encoder['<mask>'],
lstrip=True),
Expand Down Expand Up @@ -88,7 +88,7 @@ def __init__(self,
CommandToken('sop', '<|startofpiece|>', self.num_tokens),
CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1),
CommandToken('cls', '[CLS]', self.num_tokens + 2),
CommandToken('MASK',
CommandToken('mask',
'[MASK]',
self.num_tokens + 3,
lstrip=True),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def __init__(self,
CommandToken('eos', '<|endoftext|>', self.num_text_tokens),
CommandToken('sep', '[SEP]', self.num_text_tokens + 1),
CommandToken('cls', '[CLS]', self.num_text_tokens + 2),
CommandToken('MASK',
CommandToken('mask',
'[MASK]',
self.num_text_tokens + 3,
lstrip=True),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def __init__(self,
self._command_tokens = [
CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']),
CommandToken('cls', '[CLS]', self.text_tokenizer.vocab['[CLS]']),
CommandToken('MASK', '[MASK]',
CommandToken('mask', '[MASK]',
self.text_tokenizer.vocab['[MASK]']),
CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']),
CommandToken('sep', '[SEP]', self.text_tokenizer.vocab['[SEP]']),
Expand Down
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/opt/opt_en_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def __init__(self, tokenizer_model_type="facebook/opt-125m", cache_dir=None):
self._command_tokens = [
CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
CommandToken('MASK', '[MASK]',
CommandToken('mask', '[MASK]',
self.get_specialid_from_text_tokenizer('mask')),
CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
Expand Down
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/roberta/roberta_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(self, tokenizer_model_type="roberta-base", cache_dir=None):
self._command_tokens = [
CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
CommandToken('MASK', '[MASK]',
CommandToken('mask', '[MASK]',
self.get_specialid_from_text_tokenizer('mask')),
CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
Expand Down
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/t5/t5_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(self, tokenizer_model_type="t5-base", cache_dir=None):

CommandToken('pad', '[PAD]', self.num_tokens + 1),
CommandToken('cls', '[CLS]', self.num_tokens + 2),
CommandToken('MASK', '[MASK]',
CommandToken('mask', '[MASK]',
self.num_tokens + 3),
]
self._command_tokens.extend([
Expand Down
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def __str__(self):
('sep', 4),
('L2R', 5),
('cls', 6),
('MASK', 7),
('mask', 7),
]
DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS)
"""define some default type tokens for bert training"""
Expand Down
3 changes: 2 additions & 1 deletion flagai/data/tokenizer/uni_tokenizer/base_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def from_pretrained(cls,
*inputs,
**kwargs)
elif tokenizer_class == "sp":
return cls(sp_model_file=resolved_sp_file,
return cls(vocab_file=resolved_vocab_json_file,
sp_model_file=resolved_sp_file,
tokenizer_class=tokenizer_class,
tokenizer_model_name=tokenizer_model_name,
tokenizer_json_file=resolved_tokenizer_json_file,
Expand Down
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def tokenize(self, text):

def convert_token_to_id(self, token):
""" Converts a sequence of tokens into ids using the vocab. """
return self.encoder.get(token, 0)
return self.encoder[token]

def convert_tokens_to_ids(self, tokens):
""" Converts a sequence of tokens into ids using the vocab. """
Expand Down
Loading