Skip to content

Commit

Permalink
[Bugfix] Fix broken OpenAI tensorizer test (vllm-project#8258)
Browse files Browse the repository at this point in the history
  • Loading branch information
DarkLight1337 authored Sep 7, 2024
1 parent 0f5acfa commit 5f1c73d
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 40 deletions.
12 changes: 6 additions & 6 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
init_distributed_environment)
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.model_executor.model_loader.loader import DefaultModelLoader
from vllm.model_executor.model_loader.loader import get_model_loader
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip

Expand Down Expand Up @@ -89,11 +89,11 @@ def __init__(self,
is_local = os.path.isdir(model)
if not is_local:
engine_args = AsyncEngineArgs.from_cli_args(args)
engine_config = engine_args.create_engine_config()
dummy_loader = DefaultModelLoader(engine_config.load_config)
dummy_loader._prepare_weights(engine_config.model_config.model,
engine_config.model_config.revision,
fall_back_to_pt=True)
model_config = engine_args.create_model_config()
load_config = engine_args.create_load_config()

model_loader = get_model_loader(load_config)
model_loader.download_model(model_config)

env = os.environ.copy()
# the current process might initialize cuda,
Expand Down
72 changes: 39 additions & 33 deletions vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -771,33 +771,8 @@ def from_cli_args(cls, args: argparse.Namespace):
engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
return engine_args

def create_engine_config(self) -> EngineConfig:
# gguf file needs a specific model loader and doesn't use hf_repo
if check_gguf_file(self.model):
self.quantization = self.load_format = "gguf"

# bitsandbytes quantization needs a specific model loader
# so we make sure the quant method and the load format are consistent
if (self.quantization == "bitsandbytes" or
self.qlora_adapter_name_or_path is not None) and \
self.load_format != "bitsandbytes":
raise ValueError(
"BitsAndBytes quantization and QLoRA adapter only support "
f"'bitsandbytes' load format, but got {self.load_format}")

if (self.load_format == "bitsandbytes" or
self.qlora_adapter_name_or_path is not None) and \
self.quantization != "bitsandbytes":
raise ValueError(
"BitsAndBytes load format and QLoRA adapter only support "
f"'bitsandbytes' quantization, but got {self.quantization}")

assert self.cpu_offload_gb >= 0, (
"CPU offload space must be non-negative"
f", but got {self.cpu_offload_gb}")

device_config = DeviceConfig(device=self.device)
model_config = ModelConfig(
def create_model_config(self) -> ModelConfig:
return ModelConfig(
model=self.model,
tokenizer=self.tokenizer,
tokenizer_mode=self.tokenizer_mode,
Expand Down Expand Up @@ -825,6 +800,42 @@ def create_engine_config(self) -> EngineConfig:
config_format=self.config_format,
)

def create_load_config(self) -> LoadConfig:
    """Build a ``LoadConfig`` from the loading-related fields of this object.

    The ``load_format``, ``download_dir``, ``model_loader_extra_config`` and
    ``ignore_patterns`` attributes are passed through unchanged as keyword
    arguments of the same names.

    Returns:
        The newly constructed ``LoadConfig``.
    """
    load_kwargs = {
        "load_format": self.load_format,
        "download_dir": self.download_dir,
        "model_loader_extra_config": self.model_loader_extra_config,
        "ignore_patterns": self.ignore_patterns,
    }
    return LoadConfig(**load_kwargs)

def create_engine_config(self) -> EngineConfig:
# gguf file needs a specific model loader and doesn't use hf_repo
if check_gguf_file(self.model):
self.quantization = self.load_format = "gguf"

# bitsandbytes quantization needs a specific model loader
# so we make sure the quant method and the load format are consistent
if (self.quantization == "bitsandbytes" or
self.qlora_adapter_name_or_path is not None) and \
self.load_format != "bitsandbytes":
raise ValueError(
"BitsAndBytes quantization and QLoRA adapter only support "
f"'bitsandbytes' load format, but got {self.load_format}")

if (self.load_format == "bitsandbytes" or
self.qlora_adapter_name_or_path is not None) and \
self.quantization != "bitsandbytes":
raise ValueError(
"BitsAndBytes load format and QLoRA adapter only support "
f"'bitsandbytes' quantization, but got {self.quantization}")

assert self.cpu_offload_gb >= 0, (
"CPU offload space must be non-negative"
f", but got {self.cpu_offload_gb}")

device_config = DeviceConfig(device=self.device)
model_config = self.create_model_config()

cache_config = CacheConfig(
block_size=self.block_size if self.device != "neuron" else
self.max_model_len, # neuron needs block_size = max_model_len
Expand Down Expand Up @@ -967,12 +978,7 @@ def create_engine_config(self) -> EngineConfig:
self.model_loader_extra_config[
"qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path

load_config = LoadConfig(
load_format=self.load_format,
download_dir=self.download_dir,
model_loader_extra_config=self.model_loader_extra_config,
ignore_patterns=self.ignore_patterns,
)
load_config = self.create_load_config()

prompt_adapter_config = PromptAdapterConfig(
max_prompt_adapters=self.max_prompt_adapters,
Expand Down
30 changes: 29 additions & 1 deletion vllm/model_executor/model_loader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,11 @@ class BaseModelLoader(ABC):
def __init__(self, load_config: LoadConfig):
self.load_config = load_config

@abstractmethod
def download_model(self, model_config: ModelConfig) -> None:
    """Download a model so that it can be immediately loaded.

    This is the abstract declaration; the body here only raises.

    Args:
        model_config: Configuration identifying the model to download.

    Raises:
        NotImplementedError: Always, when this base version is invoked.
    """
    raise NotImplementedError()

@abstractmethod
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
Expand All @@ -193,7 +198,7 @@ def load_model(self, *, model_config: ModelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
"""Load a model with the given configurations."""
...
raise NotImplementedError


class DefaultModelLoader(BaseModelLoader):
Expand Down Expand Up @@ -335,6 +340,11 @@ def _xla_weights_iterator(iterator: Generator):
weights_iterator = _xla_weights_iterator(weights_iterator)
return weights_iterator

def download_model(self, model_config: ModelConfig) -> None:
    """Prepare the weights of the configured model without loading them.

    Delegates to ``self._prepare_weights`` with the model identifier and
    revision taken from ``model_config``, always passing
    ``fall_back_to_pt=True``. Whatever ``_prepare_weights`` returns is
    discarded.

    Args:
        model_config: Supplies ``model`` and ``revision``.
    """
    model_name = model_config.model
    model_revision = model_config.revision
    self._prepare_weights(model_name, model_revision, fall_back_to_pt=True)

def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
Expand Down Expand Up @@ -377,6 +387,9 @@ def __init__(self, load_config: LoadConfig):
raise ValueError(f"Model loader extra config is not supported for "
f"load format {load_config.load_format}")

def download_model(self, model_config: ModelConfig) -> None:
    """Do nothing: there is nothing to download.

    Args:
        model_config: Accepted for interface compatibility; not used.
    """
    return None

def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
Expand Down Expand Up @@ -467,6 +480,12 @@ def _load_model_serialized(
model = load_with_tensorizer(tensorizer_config, **extra_kwargs)
return model.eval()

def download_model(self, model_config: ModelConfig) -> None:
    """Verify the tensorizer config, then open and close its stream.

    First calls ``self.tensorizer_config.verify_with_model_config`` with
    ``model_config``. Then opens the stream returned by
    ``self.tensorizer_config.open_stream()`` as a context manager and leaves
    the block immediately; nothing is read from the stream here.

    Args:
        model_config: Configuration checked against the tensorizer config.
    """
    self.tensorizer_config.verify_with_model_config(model_config)

    # Enter and exit the stream context only; the body is empty.
    stream = self.tensorizer_config.open_stream()
    with stream:
        pass

def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
Expand Down Expand Up @@ -568,6 +587,9 @@ def _prepare_weights(self, model_name_or_path: str,
ignore_patterns=self.load_config.ignore_patterns,
)

def download_model(self, model_config: ModelConfig) -> None:
    """Prepare the weights of the configured model without loading them.

    Delegates to ``self._prepare_weights`` with the model identifier and
    revision taken from ``model_config``; the result is discarded.

    Args:
        model_config: Supplies ``model`` and ``revision``.
    """
    model_name, model_revision = model_config.model, model_config.revision
    self._prepare_weights(model_name, model_revision)

def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
Expand Down Expand Up @@ -995,6 +1017,9 @@ def _load_weights(self, model_config: ModelConfig,
set_weight_attrs(
param, {"matmul_state": [None] * len(quant_states)})

def download_model(self, model_config: ModelConfig) -> None:
    """Prepare the weights of the configured model without loading them.

    Forwards ``model_config.model`` and ``model_config.revision``, in that
    order, to ``self._prepare_weights`` and ignores what it returns.

    Args:
        model_config: Supplies ``model`` and ``revision``.
    """
    target = (model_config.model, model_config.revision)
    self._prepare_weights(*target)

def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
Expand Down Expand Up @@ -1070,6 +1095,9 @@ def _get_weights_iterator(
return gguf_quant_weights_iterator(model_name_or_path,
gguf_to_hf_name_map)

def download_model(self, model_config: ModelConfig) -> None:
    """Prepare the weights of the configured model without loading them.

    Passes only ``model_config.model`` to ``self._prepare_weights``; no
    revision is forwarded. The result is discarded.

    Args:
        model_config: Supplies ``model``.
    """
    model_name = model_config.model
    self._prepare_weights(model_name)

def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
Expand Down
7 changes: 7 additions & 0 deletions vllm/model_executor/model_loader/tensorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ def verify_with_model_config(self, model_config: "ModelConfig") -> None:
"Loading a model using Tensorizer with quantization on vLLM"
" is unstable and may lead to errors.")

def open_stream(self, tensorizer_args: Optional["TensorizerArgs"] = None):
    """Open a stream for ``self.tensorizer_uri``.

    Args:
        tensorizer_args: Source of ``stream_params``. When ``None``, the
            arguments are obtained from
            ``self._construct_tensorizer_args()``.

    Returns:
        Whatever the module-level ``open_stream`` returns when called with
        ``self.tensorizer_uri`` and ``stream_params`` expanded as keyword
        arguments.
    """
    effective_args = (self._construct_tensorizer_args()
                      if tensorizer_args is None else tensorizer_args)
    stream_kwargs = effective_args.stream_params
    return open_stream(self.tensorizer_uri, **stream_kwargs)


def load_with_tensorizer(tensorizer_config: TensorizerConfig,
**extra_kwargs) -> nn.Module:
Expand Down

0 comments on commit 5f1c73d

Please sign in to comment.