Merged
Commits
53 commits
9ed887a
Initial commit
hmellor Apr 24, 2025
4550a8c
Second commit
hmellor Apr 24, 2025
a75f74e
Merge branch 'main' into improve-model-config
hmellor Apr 25, 2025
6304a8d
Fix return type of `optional_type`
hmellor Apr 25, 2025
947b226
Use `Literal` types instead of `str` in `LLM`
hmellor Apr 25, 2025
1fb17d9
Use `Literal` instead of `str` for `override_quantization_method`
hmellor Apr 25, 2025
f00d97d
Add `cast` when assigning to `Literal` in some cases
hmellor Apr 25, 2025
4fc786c
Better handling of tokenizer falling back to model
hmellor Apr 25, 2025
eadc39f
Add missing pooler_config field and better handle its override
hmellor Apr 25, 2025
039d89d
Max model len will never be None by the time it's used
hmellor Apr 25, 2025
41bc717
max_model_len default is already None
hmellor Apr 25, 2025
378809c
quantization in SpeculativeConfig can also be Literally typed
hmellor Apr 25, 2025
bf6e36a
Add `nargs` to tuple arguments
hmellor Apr 25, 2025
9bcac01
field_type -> type_hint, better extraction from optional, add list type
hmellor Apr 25, 2025
3e061ce
Fix type reassignment
hmellor Apr 25, 2025
8a6e4df
Make naming of `compressed_tensors` consistent
hmellor Apr 25, 2025
1fe5a04
Stronger typing of QuantizationConfigs
hmellor Apr 25, 2025
e6b7a51
Don't assign tuple to list
hmellor Apr 25, 2025
c342876
Add `QuantizationMethods` to `__all__` because it's used externally
hmellor Apr 25, 2025
2f7f6de
Handle ordering of overrides
hmellor Apr 25, 2025
7a1b442
Use enum values for defaults
hmellor Apr 25, 2025
1f3e9ba
Fix config_format typing
hmellor Apr 25, 2025
96ebdec
Fix hf token arg
hmellor Apr 25, 2025
6089a47
Add `ModelConfig.__hash__()`
hmellor Apr 25, 2025
2a09c33
Use `unsafe_hash` instead
hmellor Apr 25, 2025
3c882da
Don't use `ModelConfig` as a dict key
hmellor Apr 26, 2025
066ebba
Fix `pooler_config` default
hmellor Apr 26, 2025
0b9a512
Merge branch 'main' into improve-model-config
hmellor Apr 26, 2025
4d1acbd
pre-commit missed something from main
hmellor Apr 26, 2025
d21b2e9
Merge branch 'main' into improve-model-config
hmellor Apr 28, 2025
8f99162
Merge branch 'main' into improve-model-config
hmellor Apr 28, 2025
c2d1557
Remove now unneeded changes
hmellor Apr 28, 2025
af54723
Merge branch 'main' into improve-model-config
hmellor Apr 29, 2025
aa31504
Get defaults from `MultiModalConfig`
hmellor Apr 29, 2025
fec0117
Remove RST from MD docstrings
hmellor Apr 29, 2025
48d2e7d
Fix pooling config tests
hmellor Apr 29, 2025
6d4fa38
Fix `--max-model-len` human readable int
hmellor Apr 29, 2025
beab5ae
Fix union of literals case
hmellor Apr 29, 2025
5a7e339
Fix arg_utils test
hmellor Apr 29, 2025
6df5dbb
Enforce that `Literal`s are merged with `Literal` not `Union`
hmellor Apr 29, 2025
08d5e20
Add tests for `config` decorator
hmellor Apr 29, 2025
92300b6
Create new helper function to handle sequences of literals
hmellor Apr 29, 2025
88c1479
Add test for literal to kwarg
hmellor Apr 29, 2025
4037013
Add test cases for `list[Literal]` and `Literal[Literal, Literal]`
hmellor Apr 29, 2025
0ecf76e
Fix pre-commit
hmellor Apr 29, 2025
8beb8df
Respond to comment
hmellor Apr 29, 2025
4510fa1
Merge branch 'config-literal-handling' into improve-model-config
hmellor Apr 29, 2025
cb08bdc
Merge branch 'main' into improve-model-config
hmellor Apr 29, 2025
1232ded
Remove change that was handled in another PR
hmellor Apr 29, 2025
0055700
Add change from other PR
hmellor Apr 29, 2025
7d6108a
Merge branch 'main' into improve-model-config
hmellor Apr 29, 2025
536f05c
Remove now unneeded test case
hmellor Apr 29, 2025
1282937
Remove test which is just testing `json.loads`
hmellor Apr 29, 2025
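
Read together, the commits describe one change applied in many places: plain `str` hints on the engine configuration are replaced with `Literal` aliases (and the config moves towards dataclass-style fields), so valid values can be checked statically and reused to build CLI choices. A minimal sketch of the pattern follows; the alias names and members below are illustrative, not the exact definitions in vLLM.

```python
from dataclasses import dataclass
from typing import Literal, Optional, get_args

# Hypothetical aliases: the real ones live in vllm/config.py and may have
# more (or different) members.
TokenizerMode = Literal["auto", "slow", "mistral"]
ModelDType = Literal["auto", "half", "bfloat16", "float32"]


@dataclass
class ExampleModelConfig:
    """Toy stand-in for the dataclass-style config this PR moves towards."""
    model: str = "facebook/opt-125m"
    tokenizer_mode: TokenizerMode = "auto"
    dtype: ModelDType = "auto"
    quantization: Optional[str] = None


# The same Literal members can be reused as CLI choices, keeping the
# Python API and the argparse front-end in sync.
print(get_args(TokenizerMode))  # ('auto', 'slow', 'mistral')
```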
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -738,7 +738,7 @@ class VllmRunner:
- `block_size`: Set to `16` instead of `None` to reduce memory usage.
- `enable_chunked_prefill`: Set to `False` instead of `None` for
test reproducibility.
- `enforce_eager`: Set to `False` instead of `None` to test CUDA graph.
- `enforce_eager`: Set to `False` to test CUDA graph.
"""

def __init__(
13 changes: 1 addition & 12 deletions tests/engine/test_arg_utils.py
@@ -8,7 +8,7 @@

import pytest

from vllm.config import PoolerConfig, config
from vllm.config import config
from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs,
get_type, is_not_builtin, is_type,
literal_to_kwargs, nullable_kvs,
@@ -222,17 +222,6 @@ def test_prefix_cache_default():
assert not engine_args.enable_prefix_caching


def test_valid_pooling_config():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args([
'--override-pooler-config',
'{"pooling_type": "MEAN"}',
])
engine_args = EngineArgs.from_cli_args(args=args)
assert engine_args.override_pooler_config == PoolerConfig(
pooling_type="MEAN", )


@pytest.mark.parametrize(
("arg"),
[
4 changes: 2 additions & 2 deletions tests/quantization/test_register_quantization_config.py
@@ -14,7 +14,7 @@
from vllm.model_executor.layers.linear import LinearBase # noqa: E501
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.layers.quantization import (
get_quantization_config, register_quantization_config)
QuantizationMethods, get_quantization_config, register_quantization_config)
from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501
QuantizationConfig)

@@ -54,7 +54,7 @@ def __init__(self, num_bits: int = 8) -> None:
"""Initialize the quantization config."""
self.num_bits = num_bits

def get_name(self) -> str:
def get_name(self) -> QuantizationMethods:
"""Name of the quantization method."""
return "custom_quant"

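
Annotating `get_name()` with `QuantizationMethods` works because that name is a `Literal` alias of the known method strings, exported from `vllm.model_executor.layers.quantization` (hence the commit adding it to `__all__`). A rough sketch, with an abbreviated member list that is an assumption rather than the real definition:

```python
from typing import Literal

# Assumed shape of the alias; the real definition enumerates every
# registered quantization method.
QuantizationMethods = Literal["aqlm", "awq", "awq_marlin", "bitsandbytes",
                              "compressed-tensors", "deepspeedfp", "fp8", "gguf"]


def get_name() -> QuantizationMethods:
    # Returning a string outside the Literal would be flagged by a type checker.
    return "fp8"
```

Note that the custom test config above returns `"custom_quant"`, which is not a built-in method, so names registered via `register_quantization_config` presumably need the alias (or a cast) to accommodate them; the diff shown here does not settle that detail.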
9 changes: 5 additions & 4 deletions tests/test_config.py
@@ -185,7 +185,7 @@ def test_get_pooling_config():
revision=None,
)

pooling_config = model_config._init_pooler_config(None)
pooling_config = model_config._init_pooler_config()
assert pooling_config is not None

assert pooling_config.normalize
@@ -205,11 +205,12 @@ def test_get_pooling_config_from_args():
dtype="float16",
revision=None)

override_config = PoolerConfig(pooling_type='CLS', normalize=True)
override_pooler_config = PoolerConfig(pooling_type='CLS', normalize=True)
model_config.override_pooler_config = override_pooler_config

pooling_config = model_config._init_pooler_config(override_config)
pooling_config = model_config._init_pooler_config()
assert pooling_config is not None
assert asdict(pooling_config) == asdict(override_config)
assert asdict(pooling_config) == asdict(override_pooler_config)


@pytest.mark.skipif(current_platform.is_rocm(),
513 changes: 258 additions & 255 deletions vllm/config.py

Large diffs are not rendered by default.

451 changes: 137 additions & 314 deletions vllm/engine/arg_utils.py

Large diffs are not rendered by default.
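
The `arg_utils.py` rewrite is not rendered, but the helpers the updated test imports (`contains_type`, `get_type`, `literal_to_kwargs`) point at the mechanism: inspect a field's type hint and, when it is a `Literal`, expose its members as argparse `choices`. A hedged sketch of that idea; the real helpers almost certainly differ in signature and edge-case handling.

```python
import argparse
from typing import Literal, Union, get_args, get_origin


def literal_choices(type_hint) -> dict:
    """Illustrative only: map a Literal (or Optional[Literal]) hint to argparse kwargs."""
    if get_origin(type_hint) is Union:
        # e.g. Optional[Literal[...]]: unwrap the non-None member.
        type_hint = next(a for a in get_args(type_hint) if a is not type(None))
    values = get_args(type_hint)                 # members of the Literal
    value_types = {type(v) for v in values}
    assert len(value_types) == 1, "mixed-type Literals need extra handling"
    return {"type": value_types.pop(), "choices": list(values)}


TokenizerMode = Literal["auto", "slow", "mistral"]   # hypothetical alias

parser = argparse.ArgumentParser()
parser.add_argument("--tokenizer-mode", **literal_choices(TokenizerMode), default="auto")
print(parser.parse_args(["--tokenizer-mode", "slow"]).tokenizer_mode)  # -> slow
```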

18 changes: 7 additions & 11 deletions vllm/entrypoints/llm.py
@@ -13,7 +13,7 @@

from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
BeamSearchSequence, get_beam_search_score)
from vllm.config import CompilationConfig
from vllm.config import CompilationConfig, ModelDType, TokenizerMode
from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
TaskOption)
from vllm.engine.llm_engine import LLMEngine
@@ -31,6 +31,7 @@
from vllm.lora.request import LoRARequest
from vllm.model_executor.guided_decoding.guided_fields import (
GuidedDecodingRequest, LLMGuidedOptions)
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput,
PoolingRequestOutput, RequestOutput,
ScoringRequestOutput)
@@ -162,20 +163,20 @@ def __init__(
self,
model: str,
tokenizer: Optional[str] = None,
tokenizer_mode: str = "auto",
tokenizer_mode: TokenizerMode = "auto",
skip_tokenizer_init: bool = False,
trust_remote_code: bool = False,
allowed_local_media_path: str = "",
tensor_parallel_size: int = 1,
dtype: str = "auto",
quantization: Optional[str] = None,
dtype: ModelDType = "auto",
quantization: Optional[QuantizationMethods] = None,
revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
seed: Optional[int] = None,
gpu_memory_utilization: float = 0.9,
swap_space: float = 4,
cpu_offload_gb: float = 0,
enforce_eager: Optional[bool] = None,
enforce_eager: bool = False,
max_seq_len_to_capture: int = 8192,
disable_custom_all_reduce: bool = False,
disable_async_output_proc: bool = False,
Expand All @@ -188,12 +189,7 @@ def __init__(
compilation_config: Optional[Union[int, dict[str, Any]]] = None,
**kwargs,
) -> None:
'''
LLM constructor.

Note: if enforce_eager is unset (enforce_eager is None)
it defaults to False.
'''
"""LLM constructor."""

if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True
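
Because the constructor parameters are now narrowed to `Literal` aliases (and `enforce_eager` is a plain `bool` defaulting to `False`), a type checker can reject invalid values at the call site instead of vLLM discovering them at runtime. A usage sketch; the model name is a placeholder, and actually constructing the engine requires a working vLLM install:

```python
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",   # placeholder model
    tokenizer_mode="auto",       # must be a TokenizerMode member
    dtype="bfloat16",            # must be a ModelDType member
    quantization=None,           # Optional[QuantizationMethods]
    enforce_eager=True,
)

# A typo such as dtype="bfloat1" would now be reported by mypy/pyright
# rather than surfacing as an engine error.
```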
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/quantization/aqlm.py
@@ -12,6 +12,7 @@

from vllm import _custom_ops as ops
from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.utils import set_weight_attrs
@@ -186,7 +187,7 @@ def __repr__(self) -> str:
f"out_group_size={self.out_group_size})")

@classmethod
def get_name(cls) -> str:
def get_name(cls) -> QuantizationMethods:
return "aqlm"

@classmethod
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/quantization/awq.py
@@ -7,6 +7,7 @@
from vllm import _custom_ops as ops
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.parameter import (GroupQuantScaleParameter,
@@ -44,7 +45,7 @@ def __repr__(self) -> str:
f"zero_point={self.zero_point}, "
f"modules_to_not_convert={self.modules_to_not_convert})")

def get_name(self) -> str:
def get_name(self) -> QuantizationMethods:
return "awq"

def get_supported_act_dtypes(self) -> List[torch.dtype]:
7 changes: 4 additions & 3 deletions vllm/model_executor/layers/quantization/awq_marlin.py
@@ -13,6 +13,7 @@
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
UnquantizedLinearMethod,
set_weight_attrs)
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.awq import (AWQConfig,
is_layer_skipped_awq)
from vllm.model_executor.layers.quantization.base_config import (
@@ -73,7 +74,7 @@ def __repr__(self) -> str:
f"modules_to_not_convert={self.modules_to_not_convert})")

@classmethod
def get_name(cls) -> str:
def get_name(cls) -> QuantizationMethods:
return "awq_marlin"

@classmethod
@@ -101,8 +102,8 @@ def from_config(cls, config: Dict[str, Any]) -> "AWQMarlinConfig":
modules_to_not_convert, config)

@classmethod
def override_quantization_method(cls, hf_quant_cfg,
user_quant) -> Optional[str]:
def override_quantization_method(
cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
can_convert = cls.is_awq_marlin_compatible(hf_quant_cfg)
is_valid_user_quant = (user_quant is None or user_quant == "marlin"
or user_quant == "awq_marlin")
13 changes: 9 additions & 4 deletions vllm/model_executor/layers/quantization/base_config.py
@@ -2,11 +2,16 @@

import inspect
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Type
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type

import torch
from torch import nn

if TYPE_CHECKING:
from vllm.model_executor.layers.quantization import QuantizationMethods
else:
QuantizationMethods = str


class QuantizeMethodBase(ABC):
"""Base class for different quantized methods."""
@@ -66,7 +71,7 @@ def __init__(self):
self.packed_modules_mapping: Dict[str, List[str]] = dict()

@abstractmethod
def get_name(self) -> str:
def get_name(self) -> QuantizationMethods:
"""Name of the quantization method."""
raise NotImplementedError

@@ -99,8 +104,8 @@ def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig":
raise NotImplementedError

@classmethod
def override_quantization_method(cls, hf_quant_cfg,
user_quant) -> Optional[str]:
def override_quantization_method(
cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
"""
Detects if this quantization method can support a given checkpoint
format by overriding the user specified quantization method --
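
The `TYPE_CHECKING` block above is the usual way to give `get_name()` the precise `Literal` return type without importing the quantization package at runtime, which would create a circular import (the package itself depends on `base_config`); at runtime the alias simply degrades to `str`. A self-contained sketch of the pattern, with hypothetical module names:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers, so it cannot create an import cycle at runtime.
    from mypkg.methods import MethodName  # hypothetical Literal alias
else:
    MethodName = str  # runtime fallback keeps annotations resolvable


class BaseConfig:
    def get_name(self) -> MethodName:
        raise NotImplementedError
```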
7 changes: 4 additions & 3 deletions vllm/model_executor/layers/quantization/bitblas.py
@@ -5,6 +5,7 @@

from vllm.logger import init_logger
from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
@@ -100,7 +101,7 @@ def __repr__(self) -> str:
f"quant_method={self.quant_method})")

@classmethod
def get_name(cls) -> str:
def get_name(cls) -> QuantizationMethods:
return "bitblas"

@classmethod
@@ -139,8 +140,8 @@ def from_config(cls, config: Dict[str, Any]) -> "BitBLASConfig":
lm_head_quantized)

@classmethod
def override_quantization_method(cls, hf_quant_cfg,
user_quant) -> Optional[str]:
def override_quantization_method(
cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
# compat: autogptq >=0.8.0 use checkpoint_format: str
# compat: autogptq <=0.7.1 is_bitblas_format: bool
is_bitblas_format = (hf_quant_cfg.get("checkpoint_format") == "bitblas"
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -7,6 +7,7 @@
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
UnquantizedLinearMethod,
set_weight_attrs)
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.utils import direct_register_custom_op
@@ -56,7 +57,7 @@ def __repr__(self) -> str:
f"llm_int8_skip_modules={self.llm_int8_skip_modules})")

@classmethod
def get_name(self) -> str:
def get_name(self) -> QuantizationMethods:
return "bitsandbytes"

@classmethod
@@ -16,6 +16,7 @@
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import ( # noqa: E501
@@ -71,7 +72,7 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]:
def get_min_capability(cls) -> int:
return 70

def get_name(self) -> str:
def get_name(self) -> QuantizationMethods:
return "compressed-tensors"

def get_quant_method(
5 changes: 3 additions & 2 deletions vllm/model_executor/layers/quantization/deepspeedfp.py
@@ -7,6 +7,7 @@
import torch.nn.functional as F

from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.utils import set_weight_attrs
@@ -41,8 +42,8 @@ def __repr__(self) -> str:
f"group_size={self.group_size}")

@classmethod
def get_name(cls) -> str:
return "DeepSpeedFP"
def get_name(cls) -> QuantizationMethods:
return "deepspeedfp"

@classmethod
def from_config(cls, config: Dict[str, Any]) -> "DeepSpeedFPConfig":
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/quantization/experts_int8.py
@@ -8,6 +8,7 @@
from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
from vllm.model_executor.layers.linear import (LinearBase,
UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.utils import set_weight_attrs
@@ -20,7 +21,7 @@ def __init__(self) -> None:
super().__init__()

@classmethod
def get_name(cls) -> str:
def get_name(cls) -> QuantizationMethods:
return "experts_int8"

@classmethod
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/quantization/fbgemm_fp8.py
@@ -9,6 +9,7 @@
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
@@ -38,7 +39,7 @@ def __init__(self, ignore_list: List[str], input_scale_ub: float):
self.fp8_linear = Fp8LinearOp()

@classmethod
def get_name(cls) -> str:
def get_name(cls) -> QuantizationMethods:
return "fbgemm_fp8"

@classmethod
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/quantization/fp8.py
@@ -16,6 +16,7 @@
FusedMoeWeightScaleSupported)
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
@@ -83,7 +84,7 @@ def __init__(
self.weight_block_size = weight_block_size

@classmethod
def get_name(cls) -> str:
def get_name(cls) -> QuantizationMethods:
return "fp8"

@classmethod
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/quantization/gguf.py
@@ -13,6 +13,7 @@
from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
FusedMoEMethodBase)
from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -31,7 +32,7 @@ def __init__(self, ) -> None:
def __repr__(self) -> str:
return ("GGUFConfig()")

def get_name(self) -> str:
def get_name(self) -> QuantizationMethods:
return "gguf"

def get_supported_act_dtypes(self) -> List[torch.dtype]: