From 8d33530f6c77df820a2edb572d5e3a51c2074bf4 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 07:16:20 +0000
Subject: [PATCH 1/7] Done

Signed-off-by: Jee Jee Li
---
 .../lora_with_quantization_inference.py | 56 +++++++++----------
 vllm/engine/arg_utils.py                | 31 +++++-----
 .../model_loader/weight_utils.py        | 11 +---
 3 files changed, 41 insertions(+), 57 deletions(-)

diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
index ab235ddd7545..2b2ae685e814 100644
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -75,43 +75,37 @@ def initialize_engine(model: str, quantization: str,
                       lora_repo: Optional[str]) -> LLMEngine:
     """Initialize the LLMEngine."""
 
-    if quantization == "bitsandbytes":
-        # QLoRA (https://arxiv.org/abs/2305.14314) is a quantization technique.
-        # It quantizes the model when loading, with some config info from the
-        # LoRA adapter repo. So need to set the parameter of load_format and
-        # qlora_adapter_name_or_path as below.
-        engine_args = EngineArgs(model=model,
-                                 quantization=quantization,
-                                 qlora_adapter_name_or_path=lora_repo,
-                                 enable_lora=True,
-                                 max_lora_rank=64)
-    else:
-        engine_args = EngineArgs(model=model,
-                                 quantization=quantization,
-                                 enable_lora=True,
-                                 max_loras=4)
+    engine_args = EngineArgs(model=model,
+                             quantization=quantization,
+                             enable_lora=True,
+                             max_loras=4)
     return LLMEngine.from_engine_args(engine_args)
 
 
 def main():
     """Main function that sets up and runs the prompt processing."""
-    test_configs = [{
-        "name": "qlora_inference_example",
-        'model': "huggyllama/llama-7b",
-        'quantization': "bitsandbytes",
-        'lora_repo': 'timdettmers/qlora-flan-7b'
-    }, {
-        "name": "AWQ_inference_with_lora_example",
-        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ',
-        'quantization': "awq",
-        'lora_repo': 'jashing/tinyllama-colorist-lora'
-    }, {
-        "name": "GPTQ_inference_with_lora_example",
-        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ',
-        'quantization': "gptq",
-        'lora_repo': 'jashing/tinyllama-colorist-lora'
-    }]
+    test_configs = [
+        # QLoRA (https://arxiv.org/abs/2305.14314)
+        {
+            "name": "qlora_inference_example",
+            'model': "huggyllama/llama-7b",
+            'quantization': "bitsandbytes",
+            'lora_repo': 'timdettmers/qlora-flan-7b'
+        },
+        {
+            "name": "AWQ_inference_with_lora_example",
+            'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ',
+            'quantization': "awq",
+            'lora_repo': 'jashing/tinyllama-colorist-lora'
+        },
+        {
+            "name": "GPTQ_inference_with_lora_example",
+            'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ',
+            'quantization': "gptq",
+            'lora_repo': 'jashing/tinyllama-colorist-lora'
+        }
+    ]
 
     for test_config in test_configs:
         print(
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 08dbb4c45039..d48df285fb64 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -6,6 +6,7 @@
 import json
 import re
 import threading
+import warnings
 from dataclasses import MISSING, dataclass, fields
 from itertools import permutations
 from typing import (Any, Callable, Dict, List, Literal, Optional, Type,
@@ -394,7 +395,13 @@ def __post_init__(self):
         if isinstance(self.compilation_config, (int, dict)):
             self.compilation_config = CompilationConfig.from_cli(
                 str(self.compilation_config))
-
+        if self.qlora_adapter_name_or_path is not None:
+            warnings.warn(
+                "The 'qlora_adapter_name_or_path' is deprecated "
+                "and will be removed in a future release. ",
+                DeprecationWarning,
+                stacklevel=2,
+            )
         # Setup plugins
         from vllm.plugins import load_general_plugins
         load_general_plugins()
@@ -504,10 +511,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                                 **load_kwargs["ignore_patterns"])
         load_group.add_argument("--use-tqdm-on-load",
                                 **load_kwargs["use_tqdm_on_load"])
-        load_group.add_argument('--qlora-adapter-name-or-path',
-                                type=str,
-                                default=None,
-                                help='Name or path of the QLoRA adapter.')
+        load_group.add_argument(
+            "--qlora-adapter-name-or-path",
+            type=str,
+            default=None,
+            help="The `--qlora-adapter-name-or-path` is "
+            "deprecated and will be removed in a future release.",
+        )
         load_group.add_argument('--pt-load-map-location',
                                 **load_kwargs["pt_load_map_location"])
 
@@ -895,12 +905,6 @@ def create_model_config(self) -> ModelConfig:
 
     def create_load_config(self) -> LoadConfig:
 
-        if(self.qlora_adapter_name_or_path is not None) and \
-            self.quantization != "bitsandbytes":
-            raise ValueError(
-                "QLoRA adapter only support "
-                f"'bitsandbytes' quantization, but got {self.quantization}")
-
         if self.quantization == "bitsandbytes":
             self.load_format = "bitsandbytes"
 
@@ -1097,11 +1101,6 @@ def create_engine_config(
             max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
             and self.max_cpu_loras > 0 else None) if self.enable_lora else None
 
-        if self.qlora_adapter_name_or_path is not None and \
-            self.qlora_adapter_name_or_path != "":
-            self.model_loader_extra_config[
-                "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path
-
         # bitsandbytes pre-quantized model need a specific model loader
         if model_config.quantization == "bitsandbytes":
             self.quantization = self.load_format = "bitsandbytes"
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 10bc55ca5f7d..567e8da696f6 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -162,17 +162,8 @@ def get_quant_config(model_config: ModelConfig,
                                             None)
     if hf_quant_config is not None:
         return quant_cls.from_config(hf_quant_config)
-    # In case of bitsandbytes/QLoRA, get quant config from the adapter model.
-    if model_config.quantization == "bitsandbytes":
-        if (not load_config.model_loader_extra_config
-                or "qlora_adapter_name_or_path"
-                not in load_config.model_loader_extra_config):
-            return quant_cls.from_config({"adapter_name_or_path": ""})
-        model_name_or_path = load_config.model_loader_extra_config[
-            "qlora_adapter_name_or_path"]
-    else:
-        model_name_or_path = model_config.model
+
+    model_name_or_path = model_config.model
     is_local = os.path.isdir(model_name_or_path)
     if not is_local:
         # Download the config files.
From 5e7ec0878d27f35373d1eb5b4ae8657f95ab66ff Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 07:26:31 +0000
Subject: [PATCH 2/7] Done

Signed-off-by: Jee Jee Li
---
 examples/offline_inference/lora_with_quantization_inference.py | 1 +
 vllm/engine/arg_utils.py                                       | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
index 2b2ae685e814..b6608ec6e958 100644
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -78,6 +78,7 @@ def initialize_engine(model: str, quantization: str,
     engine_args = EngineArgs(model=model,
                              quantization=quantization,
                              enable_lora=True,
+                             max_lora_rank=64,
                              max_loras=4)
     return LLMEngine.from_engine_args(engine_args)
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index d48df285fb64..795633ae8416 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -397,7 +397,7 @@ def __post_init__(self):
                 str(self.compilation_config))
         if self.qlora_adapter_name_or_path is not None:
             warnings.warn(
-                "The 'qlora_adapter_name_or_path' is deprecated "
+                "The `qlora_adapter_name_or_path` is deprecated "
                 "and will be removed in a future release. ",
                 DeprecationWarning,
                 stacklevel=2,

From 77e2a288267d87478a9cb34029534386a9e1c86e Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 07:39:24 +0000
Subject: [PATCH 3/7] Fix typo

Signed-off-by: Jee Jee Li
---
 vllm/engine/arg_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 795633ae8416..2bcd7df9ab12 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -543,7 +543,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             action=argparse.BooleanOptionalAction,
             help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
             "of v0.8.6. Use `--reasoning-parser` to specify the reasoning "
-            "parser backend insteadThis flag (`--enable-reasoning`) will be "
+            "parser backend instead. This flag (`--enable-reasoning`) will be "
             "removed in v0.10.0. When `--reasoning-parser` is specified, "
             "reasoning mode is automatically enabled.")
         guided_decoding_group.add_argument(

From 20a468319f07196dca8b5d56af6c817c36eec536 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 07:45:04 +0000
Subject: [PATCH 4/7] Modify comment

Signed-off-by: Jee Jee Li
---
 vllm/engine/arg_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 2bcd7df9ab12..bee96a217db2 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -515,8 +515,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--qlora-adapter-name-or-path",
             type=str,
             default=None,
-            help="The `--qlora-adapter-name-or-path` is "
-            "deprecated and will be removed in a future release.",
+            help="The `--qlora-adapter-name-or-path` has no effect, do not set"
+            " it, and it will be removed in a future release.",
         )
         load_group.add_argument('--pt-load-map-location',
                                 **load_kwargs["pt_load_map_location"])

From f934ce840f84752e2d02d029263f410da9f60f77 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 07:54:21 +0000
Subject: [PATCH 5/7] Modify comment

Signed-off-by: Jee Jee Li
---
 vllm/engine/arg_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index bee96a217db2..580c4baf1797 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -398,7 +398,7 @@ def __post_init__(self):
         if self.qlora_adapter_name_or_path is not None:
             warnings.warn(
                 "The `qlora_adapter_name_or_path` is deprecated "
-                "and will be removed in a future release. ",
+                "and will be removed in v0.10.0. ",
                 DeprecationWarning,
                 stacklevel=2,
             )
@@ -516,7 +516,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             type=str,
             default=None,
             help="The `--qlora-adapter-name-or-path` has no effect, do not set"
-            " it, and it will be removed in a future release.",
+            " it, and it will be removed in v0.10.0.",
         )
         load_group.add_argument('--pt-load-map-location',
                                 **load_kwargs["pt_load_map_location"])

From 8b6906808570849590b68cb625fdedb1a448bea4 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 10:38:28 +0000
Subject: [PATCH 6/7] Fix bug

Signed-off-by: Jee Jee Li
---
 requirements/test.txt            | 21 +++++++++++++++++--
 vllm/engine/arg_utils.py         |  1 +
 .../model_loader/weight_utils.py | 15 ++++++-------
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/requirements/test.txt b/requirements/test.txt
index 9a15d9a0d824..e2a853a1469d 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -27,6 +27,10 @@ argcomplete==3.5.1
     # via datamodel-code-generator
 arrow==1.3.0
     # via isoduration
+async-timeout==5.0.1
+    # via
+    #   aiohttp
+    #   redis
 attrs==24.2.0
     # via
     #   aiohttp
@@ -126,6 +130,11 @@ encodec==0.1.1
     # via vocos
 evaluate==0.4.3
     # via lm-eval
+exceptiongroup==1.2.2
+    # via
+    #   anyio
+    #   hypothesis
+    #   pytest
 fastparquet==2024.11.0
     # via genai-perf
 fastrlock==0.8.2
@@ -623,7 +632,6 @@ setuptools==77.0.3
     # via
     #   mamba-ssm
     #   pytablewriter
-    #   torch
     #   triton
 shellingham==1.5.4
     # via typer
@@ -683,8 +691,13 @@ tokenizers==0.21.1
     # via
     #   -r requirements/test.in
     #   transformers
+toml==0.10.2
+    # via datamodel-code-generator
 tomli==2.2.1
-    # via schemathesis
+    # via
+    #   black
+    #   pytest
+    #   schemathesis
 tomli-w==1.2.0
     # via schemathesis
 torch==2.7.0+cu128
@@ -756,12 +769,16 @@ types-python-dateutil==2.9.0.20241206
     # via arrow
 typing-extensions==4.12.2
     # via
+    #   anyio
+    #   black
     #   huggingface-hub
     #   librosa
     #   mistral-common
+    #   multidict
     #   pqdm
     #   pydantic
     #   pydantic-core
+    #   rich
     #   torch
     #   typer
 tzdata==2024.2
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 580c4baf1797..c2a7ff90be14 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -517,6 +517,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=None,
             help="The `--qlora-adapter-name-or-path` has no effect, do not set"
             " it, and it will be removed in v0.10.0.",
+            deprecated=True,
         )
         load_group.add_argument('--pt-load-map-location',
                                 **load_kwargs["pt_load_map_location"])
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 567e8da696f6..beff33414ad7 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -162,14 +162,15 @@ def get_quant_config(model_config: ModelConfig,
                                             None)
     if hf_quant_config is not None:
         return quant_cls.from_config(hf_quant_config)
-
-    model_name_or_path = model_config.model
-    is_local = os.path.isdir(model_name_or_path)
+    # Inflight BNB quantization
+    if model_config.quantization == "bitsandbytes":
+        return quant_cls.from_config({})
+    is_local = os.path.isdir(model_config.model)
     if not is_local:
         # Download the config files.
-        with get_lock(model_name_or_path, load_config.download_dir):
+        with get_lock(model_config.model, load_config.download_dir):
             hf_folder = snapshot_download(
-                model_name_or_path,
+                model_config.model,
                 revision=model_config.revision,
                 allow_patterns="*.json",
                 cache_dir=load_config.download_dir,
@@ -177,7 +178,7 @@ def get_quant_config(model_config: ModelConfig,
                 tqdm_class=DisabledTqdm,
             )
     else:
-        hf_folder = model_name_or_path
+        hf_folder = model_config.model
 
     possible_config_filenames = quant_cls.get_config_filenames()
 
@@ -204,7 +205,7 @@ def get_quant_config(model_config: ModelConfig,
         config = json.load(f)
 
         if model_config.quantization == "bitsandbytes":
-            config["adapter_name_or_path"] = model_name_or_path
+            config["adapter_name_or_path"] = model_config.model
         elif model_config.quantization == "modelopt":
             if config["producer"]["name"] == "modelopt":
                 return quant_cls.from_config(config)

From c2b467a1e772902abb822c2217b3564a8797d9d4 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 11:18:27 +0000
Subject: [PATCH 7/7] Revert test.txt

Signed-off-by: Jee Jee Li
---
 requirements/test.txt | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/requirements/test.txt b/requirements/test.txt
index e2a853a1469d..9a15d9a0d824 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -27,10 +27,6 @@ argcomplete==3.5.1
     # via datamodel-code-generator
 arrow==1.3.0
     # via isoduration
-async-timeout==5.0.1
-    # via
-    #   aiohttp
-    #   redis
 attrs==24.2.0
     # via
     #   aiohttp
@@ -130,11 +126,6 @@ encodec==0.1.1
     # via vocos
 evaluate==0.4.3
     # via lm-eval
-exceptiongroup==1.2.2
-    # via
-    #   anyio
-    #   hypothesis
-    #   pytest
 fastparquet==2024.11.0
     # via genai-perf
 fastrlock==0.8.2
@@ -632,6 +623,7 @@ setuptools==77.0.3
     # via
     #   mamba-ssm
     #   pytablewriter
+    #   torch
     #   triton
 shellingham==1.5.4
     # via typer
@@ -691,13 +683,8 @@ tokenizers==0.21.1
     # via
     #   -r requirements/test.in
     #   transformers
-toml==0.10.2
-    # via datamodel-code-generator
 tomli==2.2.1
-    # via
-    #   black
-    #   pytest
-    #   schemathesis
+    # via schemathesis
 tomli-w==1.2.0
     # via schemathesis
 torch==2.7.0+cu128
@@ -769,16 +756,12 @@ types-python-dateutil==2.9.0.20241206
     # via arrow
 typing-extensions==4.12.2
     # via
-    #   anyio
-    #   black
     #   huggingface-hub
     #   librosa
     #   mistral-common
-    #   multidict
     #   pqdm
     #   pydantic
     #   pydantic-core
-    #   rich
     #   torch
     #   typer
 tzdata==2024.2
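Note on the series as a whole: --qlora-adapter-name-or-path is now accepted but inert. It only emits a DeprecationWarning and is marked deprecated=True in argparse, and in-flight bitsandbytes quantization no longer reads its quant config from the adapter repo (get_quant_config returns quant_cls.from_config({}) instead). A minimal sketch of how the warning can be observed (not part of the series; facebook/opt-125m is only a placeholder model and no engine is actually built):

    # Sketch only: exercising the deprecation path added in patches 1-6.
    import warnings

    from vllm.engine.arg_utils import EngineArgs

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # Constructing EngineArgs runs __post_init__, where the warning fires.
        EngineArgs(model="facebook/opt-125m",
                   qlora_adapter_name_or_path="timdettmers/qlora-flan-7b")

    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
    # Expected message (per patch 5): "The `qlora_adapter_name_or_path` is
    # deprecated and will be removed in v0.10.0."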