Skip to content

Commit a137273

Browse files
committed
py : pad with unknown tokens when data is missing
ggml-ci
1 parent 9b464b4 commit a137273

File tree

1 file changed

+9
-0
lines changed

1 file changed

+9
-0
lines changed

convert.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1098,6 +1098,15 @@ def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]
10981098
scores.append(score)
10991099
toktypes.append(toktype)
11001100

1101+
# pad with unknown tokens and print warnings
1102+
# ref: https://github.com/ggerganov/llama.cpp/issues/4958
1103+
if len(tokens) < vocab.vocab_size:
1104+
for i in range(len(tokens), vocab.vocab_size):
1105+
tokens.append(f"<unk{i}>".encode("utf-8"))
1106+
scores.append(-1000.0)
1107+
toktypes.append(gguf.TokenType.UNKNOWN)
1108+
print(f"Warning: token {i} not found in vocab - padding with {tokens[-1]}")
1109+
11011110
return tokens, scores, toktypes
11021111

11031112
def add_meta_vocab(self, vocab: Vocab) -> None:

0 commit comments

Comments
 (0)