
Commit dd62dcf

convert : Make mistral-common dependency optional (#16738)
* Make mistral-common dependency optional
* Fix typing
1 parent d0660f2 commit dd62dcf

File tree

3 files changed (+38, -15 lines)
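For context, the change applies the standard optional-dependency pattern: try the import, record whether it succeeded, and raise only when the optional code path is actually requested. A minimal, self-contained sketch of the pattern (the package name `some_extra` and the helper below are illustrative, not part of this commit):

# Minimal sketch of the optional-dependency pattern used in this commit.
# "some_extra" is a placeholder package, not a real llama.cpp dependency.
try:
    import some_extra  # optional; only one conversion path needs it
    _some_extra_installed = True
    _some_extra_error_msg = ""
except ImportError:
    some_extra = None
    _some_extra_installed = False
    _some_extra_error_msg = (
        "This feature requires `some_extra`. Please run `pip install some_extra`."
    )


def run_optional_feature(data: bytes) -> bytes:
    # Fail lazily, with an actionable message, instead of failing at import time.
    if not _some_extra_installed:
        raise ImportError(_some_extra_error_msg)
    return some_extra.process(data)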

convert_hf_to_gguf.py

Lines changed: 34 additions & 9 deletions
@@ -29,12 +29,29 @@
 sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 from gguf.vocab import MistralTokenizerType, MistralVocab
-from mistral_common.tokens.tokenizers.base import TokenizerVersion
-from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD
-from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-from mistral_common.tokens.tokenizers.sentencepiece import (
-    SentencePieceTokenizer,
-)
+
+try:
+    from mistral_common.tokens.tokenizers.base import TokenizerVersion  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.sentencepiece import (  # pyright: ignore[reportMissingImports]
+        SentencePieceTokenizer,
+    )
+
+    _mistral_common_installed = True
+    _mistral_import_error_msg = ""
+except ImportError:
+    _MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+    _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+    _mistral_common_installed = False
+    TokenizerVersion = None
+    Tekkenizer = None
+    SentencePieceTokenizer = None
+    _mistral_import_error_msg = (
+        "Mistral format requires `mistral-common` to be installed. Please run "
+        "`pip install mistral-common[image,audio]` to install it."
+    )
 
 
 logger = logging.getLogger("hf-to-gguf")

@@ -107,6 +124,9 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
                 type(self) is MmprojModel:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
+        if self.is_mistral_format and not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
         self.dir_model = dir_model
         self.ftype = ftype
         self.fname_out = fname_out

@@ -1363,8 +1383,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
 
         # preprocessor config
-        image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
-        image_std = DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
+        image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
+        image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
 
         self.gguf_writer.add_vision_image_mean(image_mean)
         self.gguf_writer.add_vision_image_std(image_std)

@@ -2033,6 +2053,9 @@ def __init__(self, *args, **kwargs):
         self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
 
     def _set_vocab_mistral(self):
+        if not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
         vocab = MistralVocab(self.dir_model)
         logger.info(
             f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."

@@ -9212,7 +9235,7 @@ class MistralModel(LlamaModel):
 
     @staticmethod
     def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
-        assert TokenizerVersion is not None, "mistral_common is not installed"
+        assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
         assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
             f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
         )

@@ -9594,6 +9617,8 @@ def main() -> None:
         fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
 
     is_mistral_format = args.mistral_format
+    if is_mistral_format and not _mistral_common_installed:
+        raise ImportError(_mistral_import_error_msg)
     disable_mistral_community_chat_template = args.disable_mistral_community_chat_template
 
     with torch.inference_mode():
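The net effect in convert_hf_to_gguf.py: importing the script no longer requires mistral-common, and the dependency is enforced only where the Mistral format is actually used (the model constructor, _set_vocab_mistral(), and main()). A rough, condensed illustration of that deferred check, where require_mistral_common is a hypothetical helper that the real code inlines at each call site:

# Condensed illustration of the guard added above (not a drop-in excerpt).
_mistral_common_installed = False  # as set by the except ImportError branch
_mistral_import_error_msg = (
    "Mistral format requires `mistral-common` to be installed. Please run "
    "`pip install mistral-common[image,audio]` to install it."
)


def require_mistral_common(is_mistral_format: bool) -> None:
    # Hypothetical helper: the commit inlines this check at each call site.
    if is_mistral_format and not _mistral_common_installed:
        raise ImportError(_mistral_import_error_msg)


require_mistral_common(is_mistral_format=False)   # HF conversions keep working
# require_mistral_common(is_mistral_format=True)  # would raise the ImportError above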

gguf-py/gguf/vocab.py

Lines changed: 4 additions & 4 deletions
@@ -14,12 +14,12 @@
 SentencePieceProcessor = None
 
 try:
-    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-    from mistral_common.tokens.tokenizers.utils import (
+    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.utils import (  # pyright: ignore[reportMissingImports]
         _filter_valid_tokenizer_files,
     )
-    from mistral_common.tokens.tokenizers.sentencepiece import (
+    from mistral_common.tokens.tokenizers.sentencepiece import (  # pyright: ignore[reportMissingImports]
         SentencePieceTokenizer,
     )
 except ImportError:
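The only change to gguf-py/gguf/vocab.py is the per-line pyright suppression: since the package may now legitimately be absent, pyright would otherwise flag these imports as unresolved in environments that do not have it installed. A minimal example of the same suppression, with optional_pkg as a placeholder name:

try:
    # Without the trailing comment, pyright reports reportMissingImports
    # whenever optional_pkg is not installed in the analysis environment.
    import optional_pkg  # pyright: ignore[reportMissingImports]
except ImportError:
    optional_pkg = None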

requirements/requirements-convert_hf_to_gguf.txt

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
-mistral-common>=1.8.3
-
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
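With the pin removed from the requirements file, converting Mistral-format checkpoints now requires installing the package manually, e.g. with the command the new error message suggests: `pip install mistral-common[image,audio]`. Non-Mistral conversions no longer pull in mistral-common at all.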
