Commit 0a461fc

[Bugfix] Fix broken OpenAI tensorizer test (vllm-project#8258)

DarkLight1337 authored and dtrifiro committed

1 parent bbee60b
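The OpenAI server test helper used to pre-download weights by building a full EngineConfig and calling the private DefaultModelLoader._prepare_weights(), which assumed the default load format and so broke the tensorizer test. This commit makes downloading part of the loader interface: every BaseModelLoader gains a public download_model() method, EngineArgs gains create_model_config() and create_load_config() factory helpers, and the test helper now calls get_model_loader(load_config).download_model(model_config).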

File tree

4 files changed: +81 −40 lines


tests/utils.py

+6 −6

@@ -20,7 +20,7 @@
                                  init_distributed_environment)
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.model_executor.model_loader.loader import DefaultModelLoader
+from vllm.model_executor.model_loader.loader import get_model_loader
 from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip

@@ -89,11 +89,11 @@ def __init__(self,
         is_local = os.path.isdir(model)
         if not is_local:
             engine_args = AsyncEngineArgs.from_cli_args(args)
-            engine_config = engine_args.create_engine_config()
-            dummy_loader = DefaultModelLoader(engine_config.load_config)
-            dummy_loader._prepare_weights(engine_config.model_config.model,
-                                          engine_config.model_config.revision,
-                                          fall_back_to_pt=True)
+            model_config = engine_args.create_model_config()
+            load_config = engine_args.create_load_config()
+
+            model_loader = get_model_loader(load_config)
+            model_loader.download_model(model_config)

         env = os.environ.copy()
         # the current process might initialize cuda,
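In short, the helper now goes through the public loader interface instead of a private method on one concrete loader. A minimal sketch of the same flow outside the test, assuming a vLLM checkout at this commit (the model name is an arbitrary stand-in, not the one the test uses):

```python
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.model_executor.model_loader.loader import get_model_loader

# Arbitrary example model for illustration.
engine_args = AsyncEngineArgs(model="facebook/opt-125m")

# The two new factory methods sidestep create_engine_config(), which
# would also build device/cache/parallel configs the download never needs.
model_config = engine_args.create_model_config()
load_config = engine_args.create_load_config()

# get_model_loader() returns the loader matching load_config.load_format,
# so tensorizer-format models no longer fall through to DefaultModelLoader.
get_model_loader(load_config).download_model(model_config)
```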

vllm/engine/arg_utils.py

+39 −33

@@ -771,33 +771,8 @@ def from_cli_args(cls, args: argparse.Namespace):
         engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
         return engine_args

-    def create_engine_config(self) -> EngineConfig:
-        # gguf file needs a specific model loader and doesn't use hf_repo
-        if check_gguf_file(self.model):
-            self.quantization = self.load_format = "gguf"
-
-        # bitsandbytes quantization needs a specific model loader
-        # so we make sure the quant method and the load format are consistent
-        if (self.quantization == "bitsandbytes" or
-           self.qlora_adapter_name_or_path is not None) and \
-           self.load_format != "bitsandbytes":
-            raise ValueError(
-                "BitsAndBytes quantization and QLoRA adapter only support "
-                f"'bitsandbytes' load format, but got {self.load_format}")
-
-        if (self.load_format == "bitsandbytes" or
-           self.qlora_adapter_name_or_path is not None) and \
-           self.quantization != "bitsandbytes":
-            raise ValueError(
-                "BitsAndBytes load format and QLoRA adapter only support "
-                f"'bitsandbytes' quantization, but got {self.quantization}")
-
-        assert self.cpu_offload_gb >= 0, (
-            "CPU offload space must be non-negative"
-            f", but got {self.cpu_offload_gb}")
-
-        device_config = DeviceConfig(device=self.device)
-        model_config = ModelConfig(
+    def create_model_config(self) -> ModelConfig:
+        return ModelConfig(
             model=self.model,
             tokenizer=self.tokenizer,
             tokenizer_mode=self.tokenizer_mode,

@@ -825,6 +800,42 @@ def create_engine_config(self) -> EngineConfig:
             config_format=self.config_format,
         )

+    def create_load_config(self) -> LoadConfig:
+        return LoadConfig(
+            load_format=self.load_format,
+            download_dir=self.download_dir,
+            model_loader_extra_config=self.model_loader_extra_config,
+            ignore_patterns=self.ignore_patterns,
+        )
+
+    def create_engine_config(self) -> EngineConfig:
+        # gguf file needs a specific model loader and doesn't use hf_repo
+        if check_gguf_file(self.model):
+            self.quantization = self.load_format = "gguf"
+
+        # bitsandbytes quantization needs a specific model loader
+        # so we make sure the quant method and the load format are consistent
+        if (self.quantization == "bitsandbytes" or
+           self.qlora_adapter_name_or_path is not None) and \
+           self.load_format != "bitsandbytes":
+            raise ValueError(
+                "BitsAndBytes quantization and QLoRA adapter only support "
+                f"'bitsandbytes' load format, but got {self.load_format}")
+
+        if (self.load_format == "bitsandbytes" or
+           self.qlora_adapter_name_or_path is not None) and \
+           self.quantization != "bitsandbytes":
+            raise ValueError(
+                "BitsAndBytes load format and QLoRA adapter only support "
+                f"'bitsandbytes' quantization, but got {self.quantization}")
+
+        assert self.cpu_offload_gb >= 0, (
+            "CPU offload space must be non-negative"
+            f", but got {self.cpu_offload_gb}")
+
+        device_config = DeviceConfig(device=self.device)
+        model_config = self.create_model_config()
+
         cache_config = CacheConfig(
             block_size=self.block_size if self.device != "neuron" else
             self.max_model_len,  # neuron needs block_size = max_model_len

@@ -967,12 +978,7 @@ def create_engine_config(self) -> EngineConfig:
             self.model_loader_extra_config[
                 "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path

-        load_config = LoadConfig(
-            load_format=self.load_format,
-            download_dir=self.download_dir,
-            model_loader_extra_config=self.model_loader_extra_config,
-            ignore_patterns=self.ignore_patterns,
-        )
+        load_config = self.create_load_config()

         prompt_adapter_config = PromptAdapterConfig(
             max_prompt_adapters=self.max_prompt_adapters,
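The net effect is that callers needing only model and loader metadata can stop at the two lightweight helpers. A hedged usage sketch (the argument values are illustrative, not from the diff):

```python
from vllm.engine.arg_utils import AsyncEngineArgs

# Illustrative values; any model and download_dir would do.
engine_args = AsyncEngineArgs(model="facebook/opt-125m",
                              download_dir="/tmp/vllm-weights")

# Neither helper touches device, cache, or parallel state, so both are
# safe in a process that must not initialize CUDA yet (the situation the
# fixed test helper is in before it spawns the server).
model_config = engine_args.create_model_config()
load_config = engine_args.create_load_config()

print(model_config.model)        # facebook/opt-125m
print(load_config.download_dir)  # /tmp/vllm-weights
```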

vllm/model_executor/model_loader/loader.py

+29 −1

@@ -185,6 +185,11 @@ class BaseModelLoader(ABC):
     def __init__(self, load_config: LoadConfig):
         self.load_config = load_config

+    @abstractmethod
+    def download_model(self, model_config: ModelConfig) -> None:
+        """Download a model so that it can be immediately loaded."""
+        raise NotImplementedError
+
     @abstractmethod
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,

@@ -193,7 +198,7 @@ def load_model(self, *, model_config: ModelConfig,
                    scheduler_config: SchedulerConfig,
                    cache_config: CacheConfig) -> nn.Module:
         """Load a model with the given configurations."""
-        ...
+        raise NotImplementedError


 class DefaultModelLoader(BaseModelLoader):

@@ -335,6 +340,11 @@ def _xla_weights_iterator(iterator: Generator):
             weights_iterator = _xla_weights_iterator(weights_iterator)
         return weights_iterator

+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model,
+                              model_config.revision,
+                              fall_back_to_pt=True)
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],

@@ -377,6 +387,9 @@ def __init__(self, load_config: LoadConfig):
             raise ValueError(f"Model loader extra config is not supported for "
                              f"load format {load_config.load_format}")

+    def download_model(self, model_config: ModelConfig) -> None:
+        pass  # Nothing to download
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],

@@ -467,6 +480,12 @@ def _load_model_serialized(
         model = load_with_tensorizer(tensorizer_config, **extra_kwargs)
         return model.eval()

+    def download_model(self, model_config: ModelConfig) -> None:
+        self.tensorizer_config.verify_with_model_config(model_config)
+
+        with self.tensorizer_config.open_stream():
+            pass
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],

@@ -568,6 +587,9 @@ def _prepare_weights(self, model_name_or_path: str,
             ignore_patterns=self.load_config.ignore_patterns,
         )

+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model, model_config.revision)
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],

@@ -995,6 +1017,9 @@ def _load_weights(self, model_config: ModelConfig,
             set_weight_attrs(
                 param, {"matmul_state": [None] * len(quant_states)})

+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model, model_config.revision)
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],

@@ -1070,6 +1095,9 @@ def _get_weights_iterator(
         return gguf_quant_weights_iterator(model_name_or_path,
                                            gguf_to_hf_name_map)

+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model)
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
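Since download_model() is now abstract, every loader subclass must implement it. A minimal hypothetical subclass, shown only to illustrate the contract (the class name is invented):

```python
from torch import nn

from vllm.config import LoadConfig, ModelConfig
from vllm.model_executor.model_loader.loader import BaseModelLoader


class NoopModelLoader(BaseModelLoader):
    """Hypothetical loader, only to illustrate the new interface."""

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)

    def download_model(self, model_config: ModelConfig) -> None:
        # A real loader fetches weights here so that load_model() can run
        # fully offline afterwards; compare DefaultModelLoader, which
        # forwards to _prepare_weights(..., fall_back_to_pt=True).
        pass

    def load_model(self, **kwargs) -> nn.Module:
        raise NotImplementedError("illustrative only")
```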

vllm/model_executor/model_loader/tensorizer.py

+7 −0

@@ -99,6 +99,13 @@ def verify_with_model_config(self, model_config: "ModelConfig") -> None:
                 "Loading a model using Tensorizer with quantization on vLLM"
                 " is unstable and may lead to errors.")

+    def open_stream(self, tensorizer_args: Optional["TensorizerArgs"] = None):
+        if tensorizer_args is None:
+            tensorizer_args = self._construct_tensorizer_args()
+
+        return open_stream(self.tensorizer_uri,
+                           **tensorizer_args.stream_params)
+

 def load_with_tensorizer(tensorizer_config: TensorizerConfig,
                          **extra_kwargs) -> nn.Module:
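TensorizerLoader.download_model() (in loader.py above) has nothing to fetch; it opens and immediately closes this stream to fail fast if the serialized tensors are unreachable. A hedged standalone sketch (the URI and constructor keyword are illustrative assumptions):

```python
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

# Illustrative URI; anything tensorizer's open_stream() accepts (local
# path, s3://, http(s)://) behaves the same way.
tensorizer_config = TensorizerConfig(
    tensorizer_uri="s3://example-bucket/opt-125m/model.tensors")

# Entering and leaving the context without reading mirrors what
# TensorizerLoader.download_model() does: a cheap reachability check.
with tensorizer_config.open_stream():
    pass
```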
