Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions gguf-py/gguf/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,11 @@ def __init__(self, base_path: Path):
all_files = [f.as_posix() for f in base_path.glob("**/*") if f.is_file()]
valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)

if len(valid_tokenizer_files) >= 1 and isinstance(valid_tokenizer_files[0], tuple):
# Newer mistral-common versions return a list of (file_name, file_path) tuples instead of a plain list of file-name strings; keep only the file names.
# ref: https://github.com/ggml-org/llama.cpp/issues/17691
valid_tokenizer_files = [vf[0] for vf in valid_tokenizer_files]

if len(valid_tokenizer_files) == 0:
raise ValueError(f"No tokenizer file found in the directory: {base_path}")
# If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
Expand Down