Merge pull request #1 from rioyokotalab/v0.3-dev
Support Qwen2MoE, Deepseek
okoge-kaz authored Jul 18, 2024
2 parents 93f317e + 87a13e0 commit 02701df
Showing 22 changed files with 2,646 additions and 37 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,4 +1,7 @@
configs/*.json
mixtral-8x7b.json
qwen2-57B.json
deepseek-moe-16b.json

.DS_Store
__pycache__
2 changes: 2 additions & 0 deletions .vscode/settings.json
@@ -20,6 +20,7 @@
"bettertransformer",
"colour",
"Concatenator",
"deepseek",
"detokenize",
"detokenizer",
"ELYZA",
@@ -48,6 +49,7 @@
"plamo",
"psutil",
"pubmed",
"Qwen",
"samsum",
"seqyebce",
"Sharded",
2 changes: 1 addition & 1 deletion megatron_lm/megatron/core/datasets/helpers.cpp
@@ -122,7 +122,7 @@ py::array build_sample_idx(const py::array_t<int32_t> &sizes_,
while (sample_index <= num_samples)
{
// Start with a fresh sequence.
int32_t remaining_seq_length = seq_length + 1;
int32_t remaining_seq_length = seq_length;
while (remaining_seq_length != 0)
{
// Get the document length.
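For context on the hunk above: upstream Megatron-LM reserves seq_length + 1 tokens per sample here (the extra token covers the shifted labels), so after this change each packed sample draws exactly seq_length tokens. A rough Python sketch of the packing loop follows; the document/offset bookkeeping is an assumption modelled on upstream build_sample_idx, not on anything shown in this commit:

def build_sample_idx(sizes, doc_idx, seq_length, num_samples):
    # (document index, document offset) marking where each sample starts.
    sample_idx = [(0, 0)]
    doc_idx_index, doc_offset = 0, 0
    sample_index = 1
    while sample_index <= num_samples:
        # Start with a fresh sequence.
        remaining_seq_length = seq_length  # previously seq_length + 1
        while remaining_seq_length != 0:
            # Tokens still available in the current document.
            doc_length = sizes[doc_idx[doc_idx_index]] - doc_offset
            remaining_seq_length -= doc_length
            if remaining_seq_length <= 0:
                # This document fills the sample; the next sample starts inside it.
                doc_offset += doc_length + remaining_seq_length
                remaining_seq_length = 0
            else:
                # Document exhausted; keep packing from the next one.
                doc_idx_index += 1
                doc_offset = 0
        sample_idx.append((doc_idx_index, doc_offset))
        sample_index += 1
    return sample_idx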
126 changes: 126 additions & 0 deletions megatron_lm/megatron/core/datasets/megatron_tokenizer.py
@@ -0,0 +1,126 @@
import json
from abc import ABC, abstractmethod
from collections import OrderedDict
from typing import Any

import numpy


class MegatronTokenizer(ABC):
"""Abstract class for tokenizer
Absent a config or class-specific tracking of which objects are uniquely identifying, we must
include all keyword arguments as unique identifiers
Args:
tokenizer_paths (Tuple[str]): All tokenizer source paths or prefixes
kwargs (Dict[str, Any]): All tokenizer options
"""

def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):

self.unique_identifiers = OrderedDict()
self.unique_identifiers["class"] = type(self).__name__
self.unique_identifiers["tokenizer_path"] = list(tokenizer_paths)
for option in tokenizer_options:
self.unique_identifiers[option] = str(tokenizer_options[option])

self.unique_description = json.dumps(self.unique_identifiers, indent=4)

super().__init__()

@abstractmethod
def tokenize(self, text: str) -> numpy.ndarray:
"""Convert text to embedding ids
Args:
text (str): The text to convert
Returns:
numpy.ndarray: The converted embedding ids
"""
pass

def detokenize(self, ids: numpy.ndarray) -> str:
"""Convert embedding ids to text
Args:
ids (numpy.ndarray): The ids to convert
Returns:
str: The converted text
Raises:
NotImplementedError: Non-abstract, optional method
"""
raise NotImplementedError("{} has no method 'detokenize'".format(type(self).__name__))

@property
@abstractmethod
def vocab(self):
"""Dictionary from vocab text token to id token
"""
pass

@property
@abstractmethod
def inv_vocab(self):
"""Dictionary from vocab id token to text token
"""
pass

@property
@abstractmethod
def vocab_size(self):
"""The vocabulary size
"""
pass

@property
def cls(self):
"""The CLS token id
Raises:
NotImplementedError: Non-abstract, optional attribute
"""
raise NotImplementedError("{} has no attribute 'cls'".format(type(self).__name__))

@property
def sep(self):
"""The SEP token id
Raises:
NotImplementedError: Non-abstract, optional attribute
"""
raise NotImplementedError("{} has no attribute 'sep'".format(type(self).__name__))

@property
def pad(self):
"""The PAD token id
Raises:
NotImplementedError: Non-abstract, optional attribute
"""
raise NotImplementedError("{} has no attribute 'pad'".format(type(self).__name__))

@property
def eod(self):
"""The EOD token id
Raises:
NotImplementedError: Non-abstract, optional attribute
"""
raise NotImplementedError("{} has no attribute 'eod'".format(type(self).__name__))

@property
def bos(self):
"""The BOS token id
Raises:
NotImplementedError: Non-abstract, optional attribute
"""
raise NotImplementedError("{} has no attribute 'bos'".format(type(self).__name__))

@property
def eos(self):
"""The EOS token id
Raises:
NotImplementedError: Non-abstract, optional attribute
"""
raise NotImplementedError("{} has no attribute 'eos'".format(type(self).__name__))

@property
def mask(self):
"""The MASK token id
Raises:
NotImplementedError: Non-abstract, optional attribute
"""
raise NotImplementedError("{} has no attribute 'mask'".format(type(self).__name__))
149 changes: 149 additions & 0 deletions megatron_lm/megatron/tokenizer/tokenizer.py
@@ -4,7 +4,11 @@
import argparse
from abc import ABC
from abc import abstractmethod
import os
from typing import Optional

from llama_recipes.utils.distributed import is_rank_0
from megatron_lm.megatron.core.datasets.megatron_tokenizer import MegatronTokenizer


def build_tokenizer(args: argparse.Namespace):
@@ -22,6 +26,12 @@ def build_tokenizer(args: argparse.Namespace):
elif args.tokenizer_type == 'Llama2Tokenizer':
assert args.tokenizer_model is not None
tokenizer = _Llama2Tokenizer(args.tokenizer_model)
elif args.tokenizer_type == 'Qwen2Tokenizer':
assert args.tokenizer_model is not None
tokenizer = _Qwen2Tokenizer(args.tokenizer_model)
elif args.tokenizer_type == 'DeepseekTokenizer':
assert args.tokenizer_model is not None
tokenizer = _DeepseekTokenizer(args.tokenizer_model)
elif args.tokenizer_type == 'NullTokenizer':
assert args.vocab_size is not None
tokenizer = _NullTokenizer(args.vocab_size)
@@ -371,6 +381,145 @@ def additional_special_tokens_ids(self):
return None


class _Qwen2Tokenizer(MegatronTokenizer):
def __init__(
self,
model_file: str,
vocab_extra_ids=0,
) -> None:
self.name = "Qwe2Tokenizer"
super().__init__(model_file, vocab_extra_ids=vocab_extra_ids)

from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=os.path.dirname(model_file)
)
self.bos_id: Optional[int] = self.tokenizer.bos_token_id
self.eos_id: Optional[int] = self.tokenizer.eos_token_id
self.pad_id: Optional[int] = self.tokenizer.pad_token_id

assert self.tokenizer.pad_token_id is not None and self.tokenizer.pad_token_id == 151643
assert self.tokenizer.bos_token_id is None
assert self.tokenizer.eos_token_id is not None and self.tokenizer.eos_token_id == 151643
assert self.tokenizer.pad_token_id == self.tokenizer.eos_token_id
assert len(self.tokenizer) >= 151646, f"vocab_size: {len(self.tokenizer)}"

# Qwen2 tokenizer has no bos token, so we don't need to add it.
def tokenize(self, text: str, bos=False, eos=False) -> list[int]:
'''Default args for text completion, not chat/dialog.'''
assert type(text) is str
t = self.tokenizer.encode(text, add_special_tokens=False) # type: ignore
if bos and self.bos_id is not None:
t = [self.bos_id] + t
if eos and self.eos_id is not None:
t = t + [self.eos_id]
return t

def detokenize(self, ids: list[int]):
return self.tokenizer.decode(ids, skip_special_tokens=True)

@property
def cls(self):
return -1

@property
def sep(self):
return -1

@property
def mask(self):
return -1

@property
def eod(self):
return self.tokenizer.eos_token_id

@property
def additional_special_tokens_ids(self):
return None

@property
def vocab(self):
return self.tokenizer.get_vocab()

@property
def inv_vocab(self):
return {v: k for k, v in self.tokenizer.get_vocab().items()}

@property
def vocab_size(self):
return len(self.tokenizer)


class _DeepseekTokenizer(MegatronTokenizer):
def __init__(
self,
model_file: str,
vocab_extra_ids=0,
) -> None:
self.name = "DeepseekTokenizer"
super().__init__(model_file, vocab_extra_ids=vocab_extra_ids)

from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=os.path.dirname(model_file)
)
self.bos_id: Optional[int] = self.tokenizer.bos_token_id
self.eos_id: Optional[int] = self.tokenizer.eos_token_id
self.pad_id: Optional[int] = self.tokenizer.pad_token_id

assert self.tokenizer.pad_token_id is not None and self.tokenizer.pad_token_id == 100001
assert self.tokenizer.bos_token_id is not None and self.tokenizer.bos_token_id == 100000
assert self.tokenizer.eos_token_id is not None and self.tokenizer.eos_token_id == 100001
assert self.tokenizer.pad_token_id == self.tokenizer.eos_token_id
assert len(self.tokenizer) >= 100015, f"vocab_size: {len(self.tokenizer)}"

def tokenize(self, text: str, bos=True, eos=False) -> list[int]:
'''Default args for text completion, not chat/dialog.'''
assert type(text) is str
t = self.tokenizer.encode(text, add_special_tokens=False) # type: ignore
if bos and self.bos_id is not None:
t = [self.bos_id] + t
if eos and self.eos_id is not None:
t = t + [self.eos_id]
return t

def detokenize(self, ids: list[int]):
return self.tokenizer.decode(ids, skip_special_tokens=True)

@property
def cls(self):
return -1

@property
def sep(self):
return -1

@property
def mask(self):
return -1

@property
def eod(self):
return self.tokenizer.eos_token_id

@property
def additional_special_tokens_ids(self):
return None

@property
def vocab(self):
return self.tokenizer.get_vocab()

@property
def inv_vocab(self):
return {v: k for k, v in self.tokenizer.get_vocab().items()}

@property
def vocab_size(self):
return len(self.tokenizer)


class _NullTokenizer:
def __init__(self, vocab_size):
vocab_size = int(vocab_size)
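A short usage sketch of the new wrappers: both constructors take a path to a tokenizer file and load the Hugging Face tokenizer from its parent directory via AutoTokenizer.from_pretrained, so the paths below are hypothetical and each directory must already hold the matching Qwen2 / Deepseek tokenizer files (otherwise the hard-coded token-id asserts fail at construction time).

from megatron_lm.megatron.tokenizer.tokenizer import _DeepseekTokenizer, _Qwen2Tokenizer

# Hypothetical paths; each directory must contain tokenizer.json, tokenizer_config.json, etc.
qwen_tok = _Qwen2Tokenizer("/path/to/Qwen2-57B-A14B/tokenizer.json")
ids = qwen_tok.tokenize("hello world", eos=True)      # no BOS: Qwen2 defines no bos token
print(qwen_tok.detokenize(ids), qwen_tok.vocab_size)

deepseek_tok = _DeepseekTokenizer("/path/to/deepseek-moe-16b/tokenizer.json")
ids = deepseek_tok.tokenize("hello world")            # BOS (100000) prepended by default
print(deepseek_tok.detokenize(ids), deepseek_tok.vocab_size)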
2 changes: 1 addition & 1 deletion megatron_lm/tools/preprocess_data.py
@@ -225,7 +225,7 @@ def get_args():
group.add_argument('--tokenizer-type', type=str, required=True,
choices=['MambaTokenizer', 'SentencePieceTokenizer',
'GPTSentencePieceTokenizer', 'Llama2Tokenizer',
'NullTokenizer'],
'Qwen2Tokenizer', 'DeepseekTokenizer', 'NullTokenizer'],
help='What type of tokenizer to use.')
group.add_argument('--tokenizer-model', type=str, default=None,
help='YTTM tokenizer model.')
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
--find-links https://download.pytorch.org/whl/torch_stable.html
torch==2.2.2+cu121
torch==2.3.1+cu121

# huggingface
transformers>=4.35.0