 import gguf
 
 
-def bytes_to_unicode():
-    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
-
-
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):
@@ -133,50 +111,32 @@ def parse_args() -> argparse.Namespace:
 print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
-
-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
+scores: list[float] = []
+toktypes: list[int] = []
 
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")
 
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
-
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v: k for k, v in byte_encoder.items()}
 
 for i in range(vocab_size):
-    if i in reverse_vocab:
-        try:
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-        except KeyError:
-            text = bytearray()
-            for c in reverse_vocab[i]:
-                if ord(c) < 256:  # single byte character
-                    text.append(byte_decoder[ord(c)])
-                else:  # multibyte special token character
-                    text.extend(c.encode('utf-8'))
-    else:
-        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-        pad_token = f"[PAD{i}]".encode("utf8")
-        text = bytearray(pad_token)
-
-    tokens.append(text)
+    tokens.append(reverse_vocab[i])
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
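
Condensed out of the diff above, the new tokenizer-export path amounts to the sketch below. It is illustrative only: the export_gpt2_vocab wrapper is hypothetical (the real script runs this code at module level), gguf_writer, dir_model, and hparams are assumed to be set up earlier in the conversion script, and tokens is typed list[str] here because reverse_vocab yields strings, whereas the script keeps its earlier list[bytearray] annotation.

from pathlib import Path

import gguf
from transformers import AutoTokenizer


def export_gpt2_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict) -> None:
    # Let transformers load tokenizer.json instead of decoding it by hand.
    tokenizer = AutoTokenizer.from_pretrained(dir_model)

    # tokenizer.json may hold fewer entries than the model's vocab_size;
    # trusting hparams avoids mismatched tensor sizes at inference time.
    vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
    assert max(tokenizer.vocab.values()) < vocab_size

    reverse_vocab = {tok_id: tok for tok, tok_id in tokenizer.vocab.items()}

    tokens: list[str] = []
    scores: list[float] = []
    toktypes: list[int] = []
    for i in range(vocab_size):
        tokens.append(reverse_vocab[i])
        scores.append(0.0)                      # dummy score for every token
        toktypes.append(gguf.TokenType.NORMAL)  # every vocab entry treated as a normal token

    gguf_writer.add_tokenizer_model("gpt2")
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
    gguf_writer.add_token_types(toktypes)

    # Merges and special-token IDs still come from the model directory.
    special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
    special_vocab.add_to_gguf(gguf_writer)

The dummy scores and NORMAL token types appear to exist only so the score and type arrays line up one-to-one with the token list; the diff itself labels the scores as dummy values.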