From 8d33530f6c77df820a2edb572d5e3a51c2074bf4 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 07:16:20 +0000
Subject: [PATCH 1/7] Done

Signed-off-by: Jee Jee Li
---
 .../lora_with_quantization_inference.py | 56 +++++++++----------
 vllm/engine/arg_utils.py                | 31 +++++-----
 .../model_loader/weight_utils.py        | 11 +---
 3 files changed, 41 insertions(+), 57 deletions(-)

diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
index ab235ddd7545..2b2ae685e814 100644
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -75,43 +75,37 @@ def initialize_engine(model: str, quantization: str,
                       lora_repo: Optional[str]) -> LLMEngine:
     """Initialize the LLMEngine."""
 
-    if quantization == "bitsandbytes":
-        # QLoRA (https://arxiv.org/abs/2305.14314) is a quantization technique.
-        # It quantizes the model when loading, with some config info from the
-        # LoRA adapter repo. So need to set the parameter of load_format and
-        # qlora_adapter_name_or_path as below.
-        engine_args = EngineArgs(model=model,
-                                 quantization=quantization,
-                                 qlora_adapter_name_or_path=lora_repo,
-                                 enable_lora=True,
-                                 max_lora_rank=64)
-    else:
-        engine_args = EngineArgs(model=model,
-                                 quantization=quantization,
-                                 enable_lora=True,
-                                 max_loras=4)
+    engine_args = EngineArgs(model=model,
+                             quantization=quantization,
+                             enable_lora=True,
+                             max_loras=4)
     return LLMEngine.from_engine_args(engine_args)
 
 
 def main():
     """Main function that sets up and runs the prompt processing."""
-    test_configs = [{
-        "name": "qlora_inference_example",
-        'model': "huggyllama/llama-7b",
-        'quantization': "bitsandbytes",
-        'lora_repo': 'timdettmers/qlora-flan-7b'
-    }, {
-        "name": "AWQ_inference_with_lora_example",
-        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ',
-        'quantization': "awq",
-        'lora_repo': 'jashing/tinyllama-colorist-lora'
-    }, {
-        "name": "GPTQ_inference_with_lora_example",
-        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ',
-        'quantization': "gptq",
-        'lora_repo': 'jashing/tinyllama-colorist-lora'
-    }]
+    test_configs = [
+        # QLoRA (https://arxiv.org/abs/2305.14314)
+        {
+            "name": "qlora_inference_example",
+            'model': "huggyllama/llama-7b",
+            'quantization': "bitsandbytes",
+            'lora_repo': 'timdettmers/qlora-flan-7b'
+        },
+        {
+            "name": "AWQ_inference_with_lora_example",
+            'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ',
+            'quantization': "awq",
+            'lora_repo': 'jashing/tinyllama-colorist-lora'
+        },
+        {
+            "name": "GPTQ_inference_with_lora_example",
+            'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ',
+            'quantization': "gptq",
+            'lora_repo': 'jashing/tinyllama-colorist-lora'
+        }
+    ]
 
     for test_config in test_configs:
         print(
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 08dbb4c45039..d48df285fb64 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -6,6 +6,7 @@
 import json
 import re
 import threading
+import warnings
 from dataclasses import MISSING, dataclass, fields
 from itertools import permutations
 from typing import (Any, Callable, Dict, List, Literal, Optional, Type,
@@ -394,7 +395,13 @@ def __post_init__(self):
         if isinstance(self.compilation_config, (int, dict)):
             self.compilation_config = CompilationConfig.from_cli(
                 str(self.compilation_config))
-
+        if self.qlora_adapter_name_or_path is not None:
+            warnings.warn(
+                "The 'qlora_adapter_name_or_path' is deprecated "
+                "and will be removed in a future release. ",
+                DeprecationWarning,
+                stacklevel=2,
+            )
         # Setup plugins
         from vllm.plugins import load_general_plugins
         load_general_plugins()
@@ -504,10 +511,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                                 **load_kwargs["ignore_patterns"])
         load_group.add_argument("--use-tqdm-on-load",
                                 **load_kwargs["use_tqdm_on_load"])
-        load_group.add_argument('--qlora-adapter-name-or-path',
-                                type=str,
-                                default=None,
-                                help='Name or path of the QLoRA adapter.')
+        load_group.add_argument(
+            "--qlora-adapter-name-or-path",
+            type=str,
+            default=None,
+            help="The `--qlora-adapter-name-or-path` is "
+            "deprecated and will be removed in a future release.",
+        )
         load_group.add_argument('--pt-load-map-location',
                                 **load_kwargs["pt_load_map_location"])
 
@@ -895,12 +905,6 @@ def create_model_config(self) -> ModelConfig:
 
     def create_load_config(self) -> LoadConfig:
 
-        if(self.qlora_adapter_name_or_path is not None) and \
-            self.quantization != "bitsandbytes":
-            raise ValueError(
-                "QLoRA adapter only support "
-                f"'bitsandbytes' quantization, but got {self.quantization}")
-
         if self.quantization == "bitsandbytes":
             self.load_format = "bitsandbytes"
 
@@ -1097,11 +1101,6 @@ def create_engine_config(
             max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
             and self.max_cpu_loras > 0 else None) if self.enable_lora else None
 
-        if self.qlora_adapter_name_or_path is not None and \
-            self.qlora_adapter_name_or_path != "":
-            self.model_loader_extra_config[
-                "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path
-
         # bitsandbytes pre-quantized model need a specific model loader
         if model_config.quantization == "bitsandbytes":
             self.quantization = self.load_format = "bitsandbytes"
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 10bc55ca5f7d..567e8da696f6 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -162,17 +162,8 @@ def get_quant_config(model_config: ModelConfig,
                                             None)
     if hf_quant_config is not None:
         return quant_cls.from_config(hf_quant_config)
-    # In case of bitsandbytes/QLoRA, get quant config from the adapter model.
-    if model_config.quantization == "bitsandbytes":
-        if (not load_config.model_loader_extra_config
-                or "qlora_adapter_name_or_path"
-                not in load_config.model_loader_extra_config):
-            return quant_cls.from_config({"adapter_name_or_path": ""})
-        model_name_or_path = load_config.model_loader_extra_config[
-            "qlora_adapter_name_or_path"]
-    else:
-        model_name_or_path = model_config.model
+
+    model_name_or_path = model_config.model
     is_local = os.path.isdir(model_name_or_path)
     if not is_local:
         # Download the config files.
From 5e7ec0878d27f35373d1eb5b4ae8657f95ab66ff Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 07:26:31 +0000
Subject: [PATCH 2/7] Done

Signed-off-by: Jee Jee Li
---
 examples/offline_inference/lora_with_quantization_inference.py | 1 +
 vllm/engine/arg_utils.py                                       | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
index 2b2ae685e814..b6608ec6e958 100644
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -78,6 +78,7 @@ def initialize_engine(model: str, quantization: str,
     engine_args = EngineArgs(model=model,
                              quantization=quantization,
                              enable_lora=True,
+                             max_lora_rank=64,
                              max_loras=4)
     return LLMEngine.from_engine_args(engine_args)
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index d48df285fb64..795633ae8416 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -397,7 +397,7 @@ def __post_init__(self):
                 str(self.compilation_config))
         if self.qlora_adapter_name_or_path is not None:
             warnings.warn(
-                "The 'qlora_adapter_name_or_path' is deprecated "
+                "The `qlora_adapter_name_or_path` is deprecated "
                 "and will be removed in a future release. ",
                 DeprecationWarning,
                 stacklevel=2,

From 77e2a288267d87478a9cb34029534386a9e1c86e Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 07:39:24 +0000
Subject: [PATCH 3/7] Fix typo

Signed-off-by: Jee Jee Li
---
 vllm/engine/arg_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 795633ae8416..2bcd7df9ab12 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -543,7 +543,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             action=argparse.BooleanOptionalAction,
             help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
             "of v0.8.6. Use `--reasoning-parser` to specify the reasoning "
-            "parser backend insteadThis flag (`--enable-reasoning`) will be "
+            "parser backend instead. This flag (`--enable-reasoning`) will be "
             "removed in v0.10.0. When `--reasoning-parser` is specified, "
             "reasoning mode is automatically enabled.")
         guided_decoding_group.add_argument(

From 20a468319f07196dca8b5d56af6c817c36eec536 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 07:45:04 +0000
Subject: [PATCH 4/7] Modify comment

Signed-off-by: Jee Jee Li
---
 vllm/engine/arg_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 2bcd7df9ab12..bee96a217db2 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -515,8 +515,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--qlora-adapter-name-or-path",
             type=str,
             default=None,
-            help="The `--qlora-adapter-name-or-path` is "
-            "deprecated and will be removed in a future release.",
+            help="The `--qlora-adapter-name-or-path` has no effect, do not set"
+            " it, and it will be removed in a future release.",
         )
         load_group.add_argument('--pt-load-map-location',
                                 **load_kwargs["pt_load_map_location"])

From f934ce840f84752e2d02d029263f410da9f60f77 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 07:54:21 +0000
Subject: [PATCH 5/7] Modify comment

Signed-off-by: Jee Jee Li
---
 vllm/engine/arg_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index bee96a217db2..580c4baf1797 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -398,7 +398,7 @@ def __post_init__(self):
         if self.qlora_adapter_name_or_path is not None:
             warnings.warn(
                 "The `qlora_adapter_name_or_path` is deprecated "
-                "and will be removed in a future release. ",
+                "and will be removed in v0.10.0. ",
                 DeprecationWarning,
                 stacklevel=2,
             )
@@ -516,7 +516,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             type=str,
             default=None,
             help="The `--qlora-adapter-name-or-path` has no effect, do not set"
-            " it, and it will be removed in a future release.",
+            " it, and it will be removed in v0.10.0.",
         )
         load_group.add_argument('--pt-load-map-location',
                                 **load_kwargs["pt_load_map_location"])

From 8b6906808570849590b68cb625fdedb1a448bea4 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 10:38:28 +0000
Subject: [PATCH 6/7] Fix bug

Signed-off-by: Jee Jee Li
---
 requirements/test.txt            | 21 +++++++++++++++++--
 vllm/engine/arg_utils.py         |  1 +
 .../model_loader/weight_utils.py | 15 ++++++-------
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/requirements/test.txt b/requirements/test.txt
index 9a15d9a0d824..e2a853a1469d 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -27,6 +27,10 @@ argcomplete==3.5.1
     # via datamodel-code-generator
 arrow==1.3.0
     # via isoduration
+async-timeout==5.0.1
+    # via
+    #   aiohttp
+    #   redis
 attrs==24.2.0
     # via
     #   aiohttp
@@ -126,6 +130,11 @@ encodec==0.1.1
     # via vocos
 evaluate==0.4.3
     # via lm-eval
+exceptiongroup==1.2.2
+    # via
+    #   anyio
+    #   hypothesis
+    #   pytest
 fastparquet==2024.11.0
     # via genai-perf
 fastrlock==0.8.2
@@ -623,7 +632,6 @@ setuptools==77.0.3
     # via
     #   mamba-ssm
     #   pytablewriter
-    #   torch
     #   triton
 shellingham==1.5.4
     # via typer
@@ -683,8 +691,13 @@ tokenizers==0.21.1
     # via
     #   -r requirements/test.in
     #   transformers
+toml==0.10.2
+    # via datamodel-code-generator
 tomli==2.2.1
-    # via schemathesis
+    # via
+    #   black
+    #   pytest
+    #   schemathesis
 tomli-w==1.2.0
     # via schemathesis
 torch==2.7.0+cu128
@@ -756,12 +769,16 @@ types-python-dateutil==2.9.0.20241206
     # via arrow
 typing-extensions==4.12.2
     # via
+    #   anyio
+    #   black
     #   huggingface-hub
     #   librosa
     #   mistral-common
+    #   multidict
     #   pqdm
     #   pydantic
     #   pydantic-core
+    #   rich
     #   torch
     #   typer
 tzdata==2024.2
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 580c4baf1797..c2a7ff90be14 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -517,6 +517,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=None,
             help="The `--qlora-adapter-name-or-path` has no effect, do not set"
             " it, and it will be removed in v0.10.0.",
+            deprecated=True,
         )
         load_group.add_argument('--pt-load-map-location',
                                 **load_kwargs["pt_load_map_location"])
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 567e8da696f6..beff33414ad7 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -162,14 +162,15 @@ def get_quant_config(model_config: ModelConfig,
                                             None)
     if hf_quant_config is not None:
         return quant_cls.from_config(hf_quant_config)
-
-    model_name_or_path = model_config.model
-    is_local = os.path.isdir(model_name_or_path)
+    # Inflight BNB quantization
+    if model_config.quantization == "bitsandbytes":
+        return quant_cls.from_config({})
+    is_local = os.path.isdir(model_config.model)
     if not is_local:
         # Download the config files.
-        with get_lock(model_name_or_path, load_config.download_dir):
+        with get_lock(model_config.model, load_config.download_dir):
             hf_folder = snapshot_download(
-                model_name_or_path,
+                model_config.model,
                 revision=model_config.revision,
                 allow_patterns="*.json",
                 cache_dir=load_config.download_dir,
@@ -177,7 +178,7 @@ def get_quant_config(model_config: ModelConfig,
                 tqdm_class=DisabledTqdm,
             )
     else:
-        hf_folder = model_name_or_path
+        hf_folder = model_config.model
 
     possible_config_filenames = quant_cls.get_config_filenames()
 
@@ -204,7 +205,7 @@ def get_quant_config(model_config: ModelConfig,
         config = json.load(f)
 
         if model_config.quantization == "bitsandbytes":
-            config["adapter_name_or_path"] = model_name_or_path
+            config["adapter_name_or_path"] = model_config.model
         elif model_config.quantization == "modelopt":
             if config["producer"]["name"] == "modelopt":
                 return quant_cls.from_config(config)

From c2b467a1e772902abb822c2217b3564a8797d9d4 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 6 May 2025 11:18:27 +0000
Subject: [PATCH 7/7] Revert test.txt

Signed-off-by: Jee Jee Li
---
 requirements/test.txt | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/requirements/test.txt b/requirements/test.txt
index e2a853a1469d..9a15d9a0d824 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -27,10 +27,6 @@ argcomplete==3.5.1
     # via datamodel-code-generator
 arrow==1.3.0
     # via isoduration
-async-timeout==5.0.1
-    # via
-    #   aiohttp
-    #   redis
 attrs==24.2.0
     # via
     #   aiohttp
@@ -130,11 +126,6 @@ encodec==0.1.1
     # via vocos
 evaluate==0.4.3
     # via lm-eval
-exceptiongroup==1.2.2
-    # via
-    #   anyio
-    #   hypothesis
-    #   pytest
 fastparquet==2024.11.0
     # via genai-perf
 fastrlock==0.8.2
@@ -632,6 +623,7 @@ setuptools==77.0.3
     # via
     #   mamba-ssm
     #   pytablewriter
+    #   torch
     #   triton
 shellingham==1.5.4
     # via typer
@@ -691,13 +683,8 @@ tokenizers==0.21.1
     # via
     #   -r requirements/test.in
     #   transformers
-toml==0.10.2
-    # via datamodel-code-generator
 tomli==2.2.1
-    # via
-    #   black
-    #   pytest
-    #   schemathesis
+    # via schemathesis
 tomli-w==1.2.0
     # via schemathesis
 torch==2.7.0+cu128
@@ -769,16 +756,12 @@ types-python-dateutil==2.9.0.20241206
     # via arrow
 typing-extensions==4.12.2
     # via
-    #   anyio
-    #   black
     #   huggingface-hub
     #   librosa
     #   mistral-common
-    #   multidict
     #   pqdm
     #   pydantic
     #   pydantic-core
-    #   rich
     #   torch
     #   typer
 tzdata==2024.2
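Note on the series as a whole: --qlora-adapter-name-or-path is now accepted but inert. It only emits a DeprecationWarning and is marked deprecated=True in argparse, and in-flight bitsandbytes quantization no longer reads its quant config from the adapter repo (get_quant_config returns quant_cls.from_config({}) instead). A minimal sketch of how the warning can be observed (not part of the series; facebook/opt-125m is only a placeholder model and no engine is actually built):

    # Sketch only: exercising the deprecation path added in patches 1-6.
    import warnings

    from vllm.engine.arg_utils import EngineArgs

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # Constructing EngineArgs runs __post_init__, where the warning fires.
        EngineArgs(model="facebook/opt-125m",
                   qlora_adapter_name_or_path="timdettmers/qlora-flan-7b")

    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
    # Expected message (per patch 5): "The `qlora_adapter_name_or_path` is
    # deprecated and will be removed in v0.10.0."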