-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from rioyokotalab/v0.3-dev
Support Qwen2MoE, DeepSeek
- Loading branch information
Showing
22 changed files
with
2,646 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,7 @@ | ||
configs/*.json | ||
mixtral-8x7b.json | ||
qwen2-57B.json | ||
deepseek-moe-16b.json | ||
|
||
.DS_Store | ||
__pycache__ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
126 changes: 126 additions & 0 deletions
126
megatron_lm/megatron/core/datasets/megatron_tokenizer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
import json | ||
from abc import ABC, abstractmethod | ||
from collections import OrderedDict | ||
from typing import Any | ||
|
||
import numpy | ||
|
||
|
||
class MegatronTokenizer(ABC):
    """Base interface that all Megatron tokenizers implement.

    Absent a config or class-specific tracking of which objects are uniquely
    identifying, every positional path and every keyword option is folded
    into the unique-identifier set.

    Args:
        tokenizer_paths (Tuple[str]): All tokenizer source paths or prefixes
        tokenizer_options (Dict[str, Any]): All tokenizer options
    """

    def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
        # Record everything that distinguishes this tokenizer instance, in a
        # stable order: class name first, then the paths, then each option
        # rendered as a string.
        identifiers = OrderedDict()
        identifiers["class"] = type(self).__name__
        identifiers["tokenizer_path"] = list(tokenizer_paths)
        for key, value in tokenizer_options.items():
            identifiers[key] = str(value)
        self.unique_identifiers = identifiers

        # Pretty-printed JSON rendering of the identifiers, kept for callers
        # (e.g. logging / cache keys).
        self.unique_description = json.dumps(self.unique_identifiers, indent=4)

        super().__init__()

    @abstractmethod
    def tokenize(self, text: str) -> numpy.ndarray:
        """Convert text to embedding ids.

        Args:
            text (str): The text to convert

        Returns:
            numpy.ndarray: The converted embedding ids
        """
        ...

    def detokenize(self, ids: numpy.ndarray) -> str:
        """Convert embedding ids back to text.

        Args:
            ids (numpy.ndarray): The ids to convert

        Returns:
            str: The converted text

        Raises:
            NotImplementedError: Non-abstract, optional method
        """
        raise NotImplementedError(f"{type(self).__name__} has no method 'detokenize'")

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token."""
        ...

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        ...

    @property
    @abstractmethod
    def vocab_size(self):
        """The vocabulary size."""
        ...

    @property
    def cls(self):
        """The CLS token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'cls'")

    @property
    def sep(self):
        """The SEP token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'sep'")

    @property
    def pad(self):
        """The PAD token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'pad'")

    @property
    def eod(self):
        """The EOD token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'eod'")

    @property
    def bos(self):
        """The BOS token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'bos'")

    @property
    def eos(self):
        """The EOS token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'eos'")

    @property
    def mask(self):
        """The MASK token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'mask'")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.