-
-
Notifications
You must be signed in to change notification settings - Fork 5.4k
FP8: Load model on-the-fly in vLLM #3717
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -290,12 +290,15 @@ def from_pretrained( | |
| load_in_4bit, | ||
| load_in_8bit, | ||
| load_in_16bit, | ||
| use_exact_model_name, | ||
| ) | ||
| model_name = _offline_quantize_to_fp8(model_name, fp8_mode) | ||
| else: | ||
|
Comment on lines
290
to
295
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
When Useful? React with 👍 / 👎. |
||
| assert new_model_name is not None | ||
| model_name = new_model_name | ||
| # If mapper resolved to a pre-quantized FP8 model, disable | ||
| # on-the-fly quantization to avoid double quantization | ||
| if load_in_fp8 != False and new_model_name != old_model_name: | ||
| load_in_fp8 = False | ||
|
|
||
| # Check if pre-quantized models are allowed | ||
| # For eg AMD Instinct GPUs need blocksize = 128, but our pre-quants are blocksize = 64 | ||
|
|
@@ -615,6 +618,7 @@ def from_pretrained( | |
| random_state = random_state, | ||
| max_lora_rank = max_lora_rank, | ||
| disable_log_stats = disable_log_stats, | ||
| load_in_fp8 = load_in_fp8, | ||
| *args, | ||
| **kwargs, | ||
| ) | ||
|
|
@@ -894,12 +898,15 @@ def from_pretrained( | |
| load_in_4bit, | ||
| load_in_8bit, | ||
| load_in_16bit, | ||
| use_exact_model_name, | ||
| ) | ||
| model_name = _offline_quantize_to_fp8(model_name, fp8_mode) | ||
| else: | ||
| assert new_model_name is not None | ||
| model_name = new_model_name | ||
| # If mapper resolved to a pre-quantized FP8 model, disable | ||
| # on-the-fly quantization to avoid double quantization | ||
| if load_in_fp8 != False and new_model_name != old_model_name: | ||
| load_in_fp8 = False | ||
|
|
||
| # Check if pre-quantized models are allowed | ||
| # For eg AMD Instinct GPUs need blocksize = 128, but our pre-quants are blocksize = 64 | ||
|
|
@@ -1311,6 +1318,7 @@ def from_pretrained( | |
| random_state = random_state, | ||
| max_lora_rank = max_lora_rank, | ||
| disable_log_stats = disable_log_stats, | ||
| load_in_fp8 = load_in_fp8, | ||
| *args, | ||
| **kwargs, | ||
| ) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,7 @@ | |
| ) | ||
| from ._utils import __version__, importlib_version, _prepare_model_for_qat | ||
| from ._utils import * | ||
| from .loader_utils import _get_fp8_mode_and_check_settings | ||
| from ..save import patch_saving_functions | ||
| from ..models.loader_utils import is_distributed | ||
| from unsloth_zoo.gradient_checkpointing import ( | ||
|
|
@@ -433,6 +434,7 @@ def from_pretrained( | |
| max_lora_rank = 64, | ||
| disable_log_stats = False, | ||
| unsloth_vllm_standby = False, | ||
| load_in_fp8 = False, # fp8 LoRA (True, False, 'block') | ||
| **kwargs, | ||
| ): | ||
| if unsloth_vllm_standby and os.environ.get("UNSLOTH_VLLM_STANDBY", "0") != "1": | ||
|
|
@@ -838,6 +840,17 @@ def from_pretrained( | |
| model_name, model_config | ||
| ) | ||
|
|
||
| fp8_mode = None | ||
| if load_in_fp8 != False: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| fp8_mode = _get_fp8_mode_and_check_settings( | ||
| load_in_fp8, | ||
| fast_inference, | ||
| full_finetuning, | ||
| load_in_4bit, | ||
| load_in_8bit, | ||
| load_in_16bit, | ||
| ) | ||
|
|
||
| allowed_args = inspect.getfullargspec(load_vllm).args | ||
| load_vllm_kwargs = dict( | ||
| model_name = model_name, | ||
|
|
@@ -852,6 +865,7 @@ def from_pretrained( | |
| use_bitsandbytes = load_in_4bit, | ||
| unsloth_vllm_standby = unsloth_vllm_standby, | ||
| is_vision_model = is_vlm, | ||
| fp8_mode = fp8_mode, | ||
| ) | ||
| for allowed_arg in allowed_args: | ||
| if allowed_arg not in load_vllm_kwargs and allowed_arg in kwargs: | ||
|
|
@@ -865,6 +879,7 @@ def from_pretrained( | |
| llm, | ||
| config = model_config, | ||
| is_vision_model = is_vlm, | ||
| load_in_fp8 = load_in_fp8, | ||
| ) | ||
| model = convert_vllm_to_huggingface( | ||
| quant_state_dict, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For boolean checks, it's more idiomatic in Python to use the truthiness of the value directly rather than comparing with
`False`. The `load_in_fp8` parameter can be `True`, `False`, or a string like `'block'`. Both `True` and non-empty strings are truthy, while `False` is falsy. Using `if load_in_fp8:` is more concise and readable, and achieves the same result. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah agree with gemini here :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sure I can change it, I just had it this way because I saw that's how Daniel wrote it in a few existing places