diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index c6551cdfb..7d258c26a 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -103,6 +103,9 @@ def inv_vocab(self):
     def tokenize(self, text):
         pass
 
+    def batch_tokenize(self, texts):
+        return [self.tokenize(text) for text in texts]
+
     def detokenize(self, token_ids):
         raise NotImplementedError('detokenizer is not implemented for {} '
                                   'tokenizer'.format(self.name))
@@ -323,6 +326,9 @@ def inv_vocab(self):
     def tokenize(self, text):
         return self.tokenizer.encode(text)
 
+    def batch_tokenize(self, texts):
+        return self.tokenizer.batch_encode_plus(texts, return_attention_mask=False)["input_ids"]
+
     def detokenize(self, token_ids):
         return self.tokenizer.decode(token_ids)
diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py
index da05b4e4d..e69aeb14c 100644
--- a/tools/preprocess_data.py
+++ b/tools/preprocess_data.py
@@ -80,11 +80,12 @@ def encode(self, json_line):
         ids = {}
         for key in self.args.json_keys:
             text = data[key]
-            doc_ids = []
-            for sentence in Encoder.splitter.tokenize(text):
-                sentence_ids = Encoder.tokenizer.tokenize(sentence)
-                if len(sentence_ids) > 0:
-                    doc_ids.append(sentence_ids)
+            sentences = Encoder.splitter.tokenize(text)
+            doc_ids = [
+                sentence_ids
+                for sentence_ids in Encoder.tokenizer.batch_tokenize(sentences)
+                if len(sentence_ids) > 0
+            ]
             if len(doc_ids) > 0 and self.args.append_eod:
                 doc_ids[-1].append(Encoder.tokenizer.eod)
             ids[key] = doc_ids