Skip to content

Commit d6a65ca

Browse files
authored
Support local GGUF in VLLM and use HF tokenizer #943 (#972)
* Support local GGUF in VLLM and use HF tokenizer #943
* Improve the readability of the implementation
1 parent 16318bb commit d6a65ca

File tree

1 file changed

+4
-1
lines changed

1 file changed

+4
-1
lines changed

src/lighteval/models/vllm/vllm_model.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ class VLLMModelConfig(ModelConfig):
8585
Attributes:
8686
model_name (str):
8787
HuggingFace Hub model ID or path to the model to load.
88+
tokenizer (str | None):
89+
HuggingFace Hub model ID or path to the tokenizer to load.
8890
revision (str):
8991
Git revision of the model. Defaults to "main".
9092
dtype (str):
@@ -150,6 +152,7 @@ class VLLMModelConfig(ModelConfig):
150152
"""
151153

152154
model_name: str
155+
tokenizer: str | None = None
153156
revision: str = "main" # revision of the model
154157
dtype: str = "bfloat16"
155158
tensor_parallel_size: PositiveInt = 1 # how many GPUs to use for tensor parallelism
@@ -289,7 +292,7 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
289292

290293
def _create_auto_tokenizer(self, config: VLLMModelConfig):
291294
tokenizer = get_tokenizer(
292-
config.model_name,
295+
config.tokenizer or config.model_name, # use HF tokenizer for non-HF models, like GGUF model.
293296
tokenizer_mode="auto",
294297
trust_remote_code=config.trust_remote_code,
295298
revision=config.revision,

0 commit comments

Comments (0)