From 1ffcc7387befafba468e18b4a8f78b639fd88c2c Mon Sep 17 00:00:00 2001
From: zxy
Date: Tue, 8 Jul 2025 17:39:38 +0800
Subject: [PATCH 01/10] hf config overrides

---
 lmdeploy/cli/cli.py                          |   6 +-
 lmdeploy/cli/serve.py                        |  42 +++----
 lmdeploy/cli/utils.py                        | 104 +++++++++++++++++-
 lmdeploy/messages.py                         |   5 +-
 lmdeploy/pytorch/config.py                   |  12 +-
 lmdeploy/pytorch/engine/executor/__init__.py |   4 +
 .../pytorch/engine/executor/ray_executor.py  |   5 +
 7 files changed, 148 insertions(+), 30 deletions(-)

diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py
index 6d594e1d75..e2f9582794 100644
--- a/lmdeploy/cli/cli.py
+++ b/lmdeploy/cli/cli.py
@@ -1,16 +1,16 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-import argparse
 import os
 
 from ..version import __version__
-from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args, get_chat_template, get_lora_adapters
+from .utils import (ArgumentHelper, DefaultsAndTypesHelpFormatter, FlexibleArgumentParser, convert_args,
+                    get_chat_template, get_lora_adapters)
 
 
 class CLI(object):
     _desc = 'The CLI provides a unified API for converting, ' \
             'compressing and deploying large language models.'
-    parser = argparse.ArgumentParser(prog='lmdeploy', description=_desc, add_help=True)
+    parser = FlexibleArgumentParser(prog='lmdeploy', description=_desc, add_help=True)
     parser.add_argument('-v', '--version', action='version', version=__version__)
     subparsers = parser.add_subparsers(title='Commands', description='lmdeploy has following commands:', dest='command')
 
diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index 994ac1ea05..ea2b3fb76c 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -171,6 +171,7 @@ def add_parser_api_server():
         ArgumentHelper.enable_eplb(pt_group)
         ArgumentHelper.role(pt_group)
         ArgumentHelper.migration_backend(pt_group)
+        ArgumentHelper.hf_overrides(pt_group)
         # multi-node serving args
         ArgumentHelper.node_rank(parser)
         ArgumentHelper.num_nodes(parser)
@@ -317,25 +318,28 @@ def api_server(args):
     if backend == 'pytorch':
         from lmdeploy.messages import PytorchEngineConfig
         adapters = get_lora_adapters(args.adapters)
-        backend_config = PytorchEngineConfig(dtype=args.dtype,
-                                             tp=args.tp,
-                                             dp=args.dp,
-                                             ep=args.ep,
-                                             max_batch_size=max_batch_size,
-                                             cache_max_entry_count=args.cache_max_entry_count,
-                                             block_size=args.cache_block_seq_len,
-                                             session_len=args.session_len,
-                                             adapters=adapters,
-                                             enable_prefix_caching=args.enable_prefix_caching,
-                                             device_type=args.device,
-                                             quant_policy=args.quant_policy,
-                                             eager_mode=args.eager_mode,
-                                             max_prefill_token_num=args.max_prefill_token_num,
-                                             enable_microbatch=args.enable_microbatch,
-                                             enable_eplb=args.enable_eplb,
-                                             role=EngineRole[args.role],
-                                             migration_backend=MigrationBackend[args.migration_backend],
-                                             model_format=args.model_format)
+        backend_config = PytorchEngineConfig(
+            dtype=args.dtype,
+            tp=args.tp,
+            dp=args.dp,
+            ep=args.ep,
+            max_batch_size=max_batch_size,
+            cache_max_entry_count=args.cache_max_entry_count,
+            block_size=args.cache_block_seq_len,
+            session_len=args.session_len,
+            adapters=adapters,
+            enable_prefix_caching=args.enable_prefix_caching,
+            device_type=args.device,
+            quant_policy=args.quant_policy,
+            eager_mode=args.eager_mode,
+            max_prefill_token_num=args.max_prefill_token_num,
+            enable_microbatch=args.enable_microbatch,
+            enable_eplb=args.enable_eplb,
+            role=EngineRole[args.role],
+            migration_backend=MigrationBackend[args.migration_backend],
+            model_format=args.model_format,
+            hf_overrides=args.hf_overrides,
+        )
     else:
         from lmdeploy.messages import TurbomindEngineConfig
         backend_config = TurbomindEngineConfig(dtype=args.dtype,
diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py
index dc5767bea9..220138314d 100644
--- a/lmdeploy/cli/utils.py
+++ b/lmdeploy/cli/utils.py
@@ -1,7 +1,11 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import argparse
-from typing import List
+import json
+import re
+import sys
+from collections import defaultdict
+from typing import Any, List
 
 
 class DefaultsAndTypesHelpFormatter(argparse.HelpFormatter):
@@ -231,6 +235,14 @@ def rope_scaling_factor(parser):
         return parser.add_argument('--rope-scaling-factor', type=float, default=0.0, help='Rope scaling factor')
 
+    @staticmethod
+    def hf_overrides(parser):
+        """Add argument hf_overrides to parser."""
+        return parser.add_argument('--hf-overrides',
+                                   type=json.loads,
+                                   default=None,
+                                   help='Extra arguments to be forwarded to for the HuggingFace config.')
+
     @staticmethod
     def use_logn_attn(parser):
         """Add argument use_logn_attn to parser."""
@@ -575,3 +587,93 @@ def migration_backend(parser):
                                    default='DLSlime',
                                    choices=['DLSlime', 'Mooncake'],
                                    help='kvcache migration management backend when PD disaggregation')
+
+
+# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/utils/__init__.py
+class FlexibleArgumentParser(argparse.ArgumentParser):
+    """More flexible argument parser."""
+
+    def parse_args(self, args=None, namespace=None):
+        # If args is not provided, use arguments from the command line
+        if args is None:
+            args = sys.argv[1:]
+
+        def repl(match: re.Match) -> str:
+            """Replaces underscores with dashes in the matched string."""
+            return match.group(0).replace('_', '-')
+
+        # Everything between the first -- and the first .
+        pattern = re.compile(r'(?<=--)[^\.]*')
+
+        # Convert underscores to dashes in argument names
+        processed_args = []
+        for arg in args:
+            if arg.startswith('--'):
+                if '=' in arg:
+                    key, value = arg.split('=', 1)
+                    key = pattern.sub(repl, key, count=1)
+                    processed_args.append(f'{key}={value}')
+                else:
+                    key = pattern.sub(repl, arg, count=1)
+                    processed_args.append(key)
+            elif arg.startswith('-O') and arg != '-O' and arg[2] != '.':
+                # allow -O flag to be used without space, e.g. -O3
+                processed_args.append('-O')
+                processed_args.append(arg[2:])
+            else:
+                processed_args.append(arg)
+
+        def _try_convert(value: str):
+            """Try to parse the string as JSON."""
+            if not isinstance(value, str):
+                return value
+            # try loads from json
+            try:
+                return json.loads(value)
+            except json.JSONDecodeError:
+                pass
+            return value
+
+        def create_nested_dict(keys: list[str], value: str):
+            """Creates a nested dictionary from a list of keys and a value.
+
+            For example, `keys = ["a", "b", "c"]` and `value = 1` will create: `{"a": {"b": {"c": 1}}}`
+            """
+            nested_dict: Any = _try_convert(value)
+            for key in reversed(keys):
+                nested_dict = {key: nested_dict}
+            return nested_dict
+
+        def recursive_dict_update(original: dict, update: dict):
+            """Recursively updates a dictionary with another dictionary."""
+            for k, v in update.items():
+                if isinstance(v, dict) and isinstance(original.get(k), dict):
+                    recursive_dict_update(original[k], v)
+                else:
+                    original[k] = v
+
+        delete = set()
+        dict_args: dict[str, dict] = defaultdict(dict)
+        for i, processed_arg in enumerate(processed_args):
+            if processed_arg.startswith('--') and '.' in processed_arg:
+                if '=' in processed_arg:
+                    processed_arg, value = processed_arg.split('=', 1)
+                    if '.' not in processed_arg:
+                        # False positive, . was only in the value
+                        continue
+                else:
+                    value = processed_args[i + 1]
+                    delete.add(i + 1)
+                key, *keys = processed_arg.split('.')
+                # Merge all values with the same key into a single dict
+                arg_dict = create_nested_dict(keys, value)
+                recursive_dict_update(dict_args[key], arg_dict)
+                delete.add(i)
+        # Filter out the dict args we merged above
+        processed_args = [a for i, a in enumerate(processed_args) if i not in delete]
+        # Add the dict args back as if they were originally passed as JSON
+        for dict_arg, dict_value in dict_args.items():
+            processed_args.append(dict_arg)
+            processed_args.append(json.dumps(dict_value))
+
+        return super().parse_args(processed_args, namespace)
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
index d067d2f7ce..56ac180865 100644
--- a/lmdeploy/messages.py
+++ b/lmdeploy/messages.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import enum
 from dataclasses import dataclass, field
-from typing import Callable, Dict, List, Literal, Optional
+from typing import Any, Callable, Dict, List, Literal, Optional
 
 import torch
 from pydantic.dataclasses import dataclass as pydantic_dataclass
@@ -320,6 +320,8 @@ class PytorchEngineConfig:
             Default to `MigrationBackend.DLSlime`.
         enable_mp_engine (bool): run engine in multi-process mode.
         model_format (str): weight quantization policy, options: ['fp8'].
+        hf_overrides (Dict[str, Any]): Huggingface overrides for the model.
+            It can be used to override the default config of the model.
     """
     dtype: str = 'auto'
     tp: int = 1
@@ -349,6 +351,7 @@ class PytorchEngineConfig:
     enable_eplb: bool = False
     enable_mp_engine: bool = False
     model_format: str = None
+    hf_overrides: Optional[Dict[str, Any]] = None
     role: EngineRole = EngineRole.Hybrid
     migration_backend: MigrationBackend = MigrationBackend.DLSlime
 
diff --git a/lmdeploy/pytorch/config.py b/lmdeploy/pytorch/config.py
index 3b620a8cff..bbcabba51f 100644
--- a/lmdeploy/pytorch/config.py
+++ b/lmdeploy/pytorch/config.py
@@ -223,14 +223,14 @@ class MiscConfig:
     custom_module_map: str = None
     empty_init: bool = False
     model_format: str = None
+    hf_overrides: Dict[str, Any] = None
 
     @classmethod
     def from_engine_config(cls, engine_config: PytorchEngineConfig):
         """From engine config."""
-        misc_config = cls(
-            custom_module_map=engine_config.custom_module_map,
-            empty_init=engine_config.empty_init,
-            prefill_interval=engine_config.prefill_interval,
-            model_format=engine_config.model_format,
-        )
+        misc_config = cls(custom_module_map=engine_config.custom_module_map,
+                          empty_init=engine_config.empty_init,
+                          prefill_interval=engine_config.prefill_interval,
+                          model_format=engine_config.model_format,
+                          hf_overrides=engine_config.hf_overrides)
         return misc_config
diff --git a/lmdeploy/pytorch/engine/executor/__init__.py b/lmdeploy/pytorch/engine/executor/__init__.py
index 9aad5aa5bb..d15a12a423 100644
--- a/lmdeploy/pytorch/engine/executor/__init__.py
+++ b/lmdeploy/pytorch/engine/executor/__init__.py
@@ -83,6 +83,10 @@ def build_executor(model_path: str,
                          'empty_init requires distributed_executor_backend="ray", ',
                          f'get distributed_executor_backend="{distributed_executor_backend}"')
 
+    if misc_config.hf_overrides is not None:
+        logger.warning(f'Overriding HF config with {misc_config.hf_overrides}')
+        model_config.hf_config.update(misc_config.hf_overrides)
+
     if distributed_executor_backend is not None:
         logger.info(f'Build <{distributed_executor_backend}> executor.')
         if distributed_executor_backend == 'uni':
diff --git a/lmdeploy/pytorch/engine/executor/ray_executor.py b/lmdeploy/pytorch/engine/executor/ray_executor.py
index 0e7dc395a5..d29dd9842d 100644
--- a/lmdeploy/pytorch/engine/executor/ray_executor.py
+++ b/lmdeploy/pytorch/engine/executor/ray_executor.py
@@ -258,6 +258,11 @@ def __init__(
         from lmdeploy.tokenizer import Tokenizer
         tokenizer = Tokenizer(model_path).model.model
         model_config = ModelConfig.from_pretrained(model_path, dtype=dtype, dist_config=dist_config)
+
+        if misc_config.hf_overrides is not None:
+            logger.warning(f'Overriding HF config with {misc_config.hf_overrides}')
+            model_config.hf_config.update(misc_config.hf_overrides)
+
         super().__init__(
             model_path=model_path,
             cache_config=cache_config,
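
A usage sketch of the flag introduced above (not part of the patch; model path and override values are illustrative). The JSON form feeds `--hf-overrides` directly; the dotted form relies on the FlexibleArgumentParser added in this patch, which merges `--key.sub` style arguments back into a single JSON value:

    # pass the overrides as one JSON document
    lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --backend pytorch \
        --hf-overrides '{"rope_scaling": {"rope_type": "yarn", "factor": 4.0}}'

    # equivalent dotted form, merged by FlexibleArgumentParser
    lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --backend pytorch \
        --hf-overrides.rope_scaling.rope_type yarn \
        --hf-overrides.rope_scaling.factor 4.0
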
From 3a342ba8230e1d4fc237eb7227c7cadf7a00c4ec Mon Sep 17 00:00:00 2001
From: zxy
Date: Wed, 9 Jul 2025 18:10:45 +0800
Subject: [PATCH 02/10] TM support

---
 lmdeploy/cli/serve.py               |  3 ++-
 lmdeploy/messages.py                |  3 +++
 lmdeploy/turbomind/deploy/config.py | 19 +++++++++++++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index ea2b3fb76c..dedc2faef6 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -353,7 +353,8 @@ def api_server(args):
                                                cache_block_seq_len=args.cache_block_seq_len,
                                                enable_prefix_caching=args.enable_prefix_caching,
                                                max_prefill_token_num=args.max_prefill_token_num,
-                                               communicator=args.communicator)
+                                               communicator=args.communicator,
+                                               hf_overrides=args.hf_overrides)
     chat_template_config = get_chat_template(args.chat_template)
 
     from lmdeploy.messages import VisionConfig
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
index 56ac180865..9bc26ec6b7 100644
--- a/lmdeploy/messages.py
+++ b/lmdeploy/messages.py
@@ -222,6 +222,8 @@ class TurbomindEngineConfig:
         devices(List[int]): the used devices
         empty_init (bool): Whether to load the model weights, you should
             set it to True if you want to update weights after create the pipeline
+        hf_overrides (Dict[str, Any]): Huggingface overrides for the model.
+            It can be used to override the default config of the model.
     """
 
     dtype: str = 'auto'
@@ -251,6 +253,7 @@ class TurbomindEngineConfig:
     devices: Optional[List[int]] = None
     empty_init: bool = False
     communicator: str = 'nccl'
+    hf_overrides: Optional[Dict[str, Any]] = None
 
     def __post_init__(self):
         """Check input validation."""
diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
index 8f2ab0a3bd..5ac965f20f 100644
--- a/lmdeploy/turbomind/deploy/config.py
+++ b/lmdeploy/turbomind/deploy/config.py
@@ -8,6 +8,9 @@
 from pydantic.dataclasses import dataclass
 
 from lmdeploy.messages import TurbomindEngineConfig
+from lmdeploy.utils import get_logger
+
+logger = get_logger('lmdeploy')
 
 
 def config_from_dict(cls, env):
@@ -150,6 +153,22 @@ def update_from_engine_config(self, config: TurbomindEngineConfig):
             if hasattr(self.attention_config, key):
                 setattr(self.attention_config, key, value)
 
+        # update from hf_overrides
+        if hasattr(config, 'hf_overrides') and config.hf_overrides:
+            hf_overrides = config.hf_overrides
+
+            if hf_overrides.get('rope_scaling'):
+                override_param = hf_overrides.get('rope_scaling')
+                if self.attention_config.rope_param is None:
+                    self.attention_config.rope_param = RopeParam(type='', base=0, dim=0)
+
+                self.attention_config.rope_param.__dict__.update(
+                    type=override_param.get('rope_type'),
+                    factor=override_param.get('factor'),
+                    max_position_embeddings=override_param.get('original_max_position_embeddings'))
+
+            logger.warning(f'Overriding HF config with {hf_overrides}')
+
         # use dynamic ntk
         if config.rope_scaling_factor:
             if self.attention_config.rope_param is None:
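
A sketch of the override shape consumed by the TurboMind path above (values illustrative). Keys follow the HF `rope_scaling` schema; the comments show the RopeParam field each key lands in per this patch:

    hf_overrides = {
        'rope_scaling': {
            'rope_type': 'yarn',  # stored as rope_param.type
            'factor': 4.0,  # stored as rope_param.factor
            'original_max_position_embeddings': 32768,  # stored as rope_param.max_position_embeddings
        }
    }
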
From 65087cbc9818c02b4665a68e19f4fab5f0ac553b Mon Sep 17 00:00:00 2001
From: zxy
Date: Wed, 9 Jul 2025 18:27:10 +0800
Subject: [PATCH 03/10] add default val

---
 lmdeploy/turbomind/deploy/config.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
index 5ac965f20f..2852cb645f 100644
--- a/lmdeploy/turbomind/deploy/config.py
+++ b/lmdeploy/turbomind/deploy/config.py
@@ -158,14 +158,14 @@ def update_from_engine_config(self, config: TurbomindEngineConfig):
             hf_overrides = config.hf_overrides
 
             if hf_overrides.get('rope_scaling'):
-                override_param = hf_overrides.get('rope_scaling')
+                override_params = hf_overrides.get('rope_scaling')
                 if self.attention_config.rope_param is None:
                     self.attention_config.rope_param = RopeParam(type='', base=0, dim=0)
 
-                self.attention_config.rope_param.__dict__.update(
-                    type=override_param.get('rope_type'),
-                    factor=override_param.get('factor'),
-                    max_position_embeddings=override_param.get('original_max_position_embeddings'))
+                self.attention_config.rope_param.__dict__.update(type=override_params.get('rope_type', ''),
+                                                                 factor=override_params.get('factor', 1.0),
+                                                                 max_position_embeddings=override_params.get(
+                                                                     'original_max_position_embeddings', None))
 
             logger.warning(f'Overriding HF config with {hf_overrides}')
 
From 28a8e4e400111691a6207e9b5b16bcc31e5bb7f9 Mon Sep 17 00:00:00 2001
From: zxy
Date: Thu, 10 Jul 2025 20:19:52 +0800
Subject: [PATCH 04/10] fix for yaml safe dump

---
 lmdeploy/turbomind/turbomind.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index 0c5632bc94..f290d5deed 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -255,7 +255,12 @@ def _postprocess_config(self, tm_config: TurbomindModelConfig, engine_config: Tu
 
         # pack `self.config` and `self.engine_config` into a dict
         self.config_dict = self.config.to_dict()
-        self.config_dict.update(dict(engine_config=asdict(self.engine_config)))
+        engine_config_dict = asdict(engine_config)
+        # Sanitize `engine_config` for YAML serialization.
+        # `PyYAML` raises a `RepresenterError` on `mmengine.ConfigDict` objects
+        # passed by frameworks like OpenCompass. This ensures a standard dict.
+        engine_config_dict = json.loads(json.dumps(engine_config_dict))
+        self.config_dict.update(dict(engine_config=engine_config_dict))
         logger.info(f'turbomind model config:\n\n'
                     f'{json.dumps(self.config_dict, indent=2)}')
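
The json.dumps/json.loads round trip above works because json serializes dict subclasses like any mapping, while json.loads always rebuilds plain dicts. A minimal standalone sketch of the effect; the ConfigDict class here is a hypothetical stand-in for mmengine.ConfigDict:

    import json

    class ConfigDict(dict):  # stand-in for mmengine.ConfigDict
        pass

    cfg = {'engine_config': ConfigDict(tp=1)}
    clean = json.loads(json.dumps(cfg))
    assert type(clean['engine_config']) is dict  # plain dict, safe for yaml.safe_dump
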
From 31d9f589a1ccc924e96075e72d69bc9f01ee4544 Mon Sep 17 00:00:00 2001
From: zxy
Date: Fri, 11 Jul 2025 14:35:27 +0800
Subject: [PATCH 05/10] add testcases

---
 tests/test_lmdeploy/test_hf_overrides.py | 41 ++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 tests/test_lmdeploy/test_hf_overrides.py

diff --git a/tests/test_lmdeploy/test_hf_overrides.py b/tests/test_lmdeploy/test_hf_overrides.py
new file mode 100644
index 0000000000..6425e9de37
--- /dev/null
+++ b/tests/test_lmdeploy/test_hf_overrides.py
@@ -0,0 +1,41 @@
+import pytest
+
+from lmdeploy import PytorchEngineConfig, TurbomindEngineConfig, pipeline
+
+
+@pytest.mark.parametrize('model_path', ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen3-30B-A3B'])
+def test_hf_overrides_turbomind(model_path):
+    # Define a custom rope_scaling configuration to override the model's default settings.
+    rope_scaling_override = {'rope_type': 'yarn', 'factor': 4.0, 'original_max_position_embeddings': 32768}
+    hf_overrides = {'rope_scaling': rope_scaling_override}
+
+    backend_config = TurbomindEngineConfig(hf_overrides=hf_overrides)
+    with pipeline(model_path, backend_config=backend_config) as pipe:
+        processed_config = pipe.engine.config.attention_config
+
+        assert getattr(processed_config, 'rope_param') is not None
+        for key, value in rope_scaling_override.items():
+            # Adjust key for compatibility with Turbomind's config
+            if key == 'rope_type':
+                key = 'type'
+            if key == 'original_max_position_embeddings':
+                key = 'max_position_embeddings'
+
+            assert getattr(processed_config.rope_param, key) == value, \
+                f'Expected {key} to be {value}, but got {getattr(processed_config.rope_param, key)}'
+
+
+@pytest.mark.parametrize('model_path', ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen3-30B-A3B'])
+def test_hf_overrides_pytorch(model_path):
+    # Define a custom rope_scaling configuration to override the model's default settings.
+    rope_scaling_override = {'rope_type': 'yarn', 'factor': 4.0, 'original_max_position_embeddings': 32768}
+    hf_overrides = {'rope_scaling': rope_scaling_override}
+
+    backend_config = PytorchEngineConfig(hf_overrides=hf_overrides)
+    with pipeline(model_path, backend_config=backend_config) as pipe:
+        processed_config = pipe.engine.get_model_config()
+
+        assert processed_config.hf_config.rope_scaling is not None
+        for key, value in rope_scaling_override.items():
+            assert processed_config.hf_config.rope_scaling.get(key) == value, \
+                f'Expected {key} to be {value}, but got {processed_config.hf_config.rope_scaling.get(key)}'
From a748cfb385089caf04f2dc10d4dcbf1267fb4e43 Mon Sep 17 00:00:00 2001
From: zxy
Date: Fri, 11 Jul 2025 18:29:27 +0800
Subject: [PATCH 06/10] change hf_overrides positions

---
 lmdeploy/cli/utils.py                        |  2 +-
 lmdeploy/pytorch/config.py                   | 20 +++++++++++++++++--
 lmdeploy/pytorch/engine/executor/__init__.py | 10 +++++-----
 .../pytorch/engine/executor/ray_executor.py  |  9 ++++-----
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py
index 954b205d73..92971dc11f 100644
--- a/lmdeploy/cli/utils.py
+++ b/lmdeploy/cli/utils.py
@@ -241,7 +241,7 @@ def hf_overrides(parser):
         return parser.add_argument('--hf-overrides',
                                    type=json.loads,
                                    default=None,
-                                   help='Extra arguments to be forwarded to for the HuggingFace config.')
+                                   help='Extra arguments to be forwarded to the HuggingFace config.')
 
     @staticmethod
     def use_logn_attn(parser):
diff --git a/lmdeploy/pytorch/config.py b/lmdeploy/pytorch/config.py
index bbcabba51f..2ecd04e3a2 100644
--- a/lmdeploy/pytorch/config.py
+++ b/lmdeploy/pytorch/config.py
@@ -158,7 +158,8 @@ def from_pretrained(cls,
                         pretrained_model_name_or_path: str,
                         trust_remote_code: bool = True,
                         dtype: str = 'auto',
-                        dist_config: DistConfig = None):
+                        dist_config: DistConfig = None,
+                        hf_overrides: Dict[str, Any] = None):
         """Instantiate one of the configuration classes of the library from a
         pretrained model configuration.
 
@@ -168,13 +169,28 @@ def from_pretrained(cls,
                 models defined on the Hub in their own modeling files.
             dtype (str): user specified data type for model weights and
                 activations. Refer to `PyTorchEngineConfig` for details
+            hf_overrides (Dict[str, Any]): overrides for the HF config.
         """
         from transformers import AutoConfig
+
+        from lmdeploy.utils import get_logger
+
         hf_config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
         if getattr(hf_config, 'model_type', None) in ['phi3']:
             # phi3 + trust_remote_code leads to error when tp.
             hf_config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
-        return cls.from_hf_config(hf_config, pretrained_model_name_or_path, dtype=dtype, dist_config=dist_config)
+
+        model_config = cls.from_hf_config(hf_config,
+                                          pretrained_model_name_or_path,
+                                          dtype=dtype,
+                                          dist_config=dist_config)
+
+        if hf_overrides is not None:
+            logger = get_logger('lmdeploy')
+            logger.warning(f'Overriding HF config with {hf_overrides}')
+            model_config.hf_config.update(hf_overrides)
+
+        return model_config
 
     @classmethod
     def from_hf_config(cls,
diff --git a/lmdeploy/pytorch/engine/executor/__init__.py b/lmdeploy/pytorch/engine/executor/__init__.py
index d15a12a423..0e26701748 100644
--- a/lmdeploy/pytorch/engine/executor/__init__.py
+++ b/lmdeploy/pytorch/engine/executor/__init__.py
@@ -68,7 +68,11 @@ def build_executor(model_path: str,
     dp = dist_config.dp
     world_size = dist_config.world_size
 
-    model_config = ModelConfig.from_pretrained(model_path, trust_remote_code=True, dtype=dtype, dist_config=dist_config)
+    model_config = ModelConfig.from_pretrained(model_path,
+                                               trust_remote_code=True,
+                                               dtype=dtype,
+                                               hf_overrides=misc_config.hf_overrides,
+                                               dist_config=dist_config)
 
     if distributed_executor_backend is None:
         distributed_executor_backend = get_distributed_executor_backend(world_size, dp, device_type, logger)
@@ -83,10 +87,6 @@ def build_executor(model_path: str,
                          'empty_init requires distributed_executor_backend="ray", ',
                          f'get distributed_executor_backend="{distributed_executor_backend}"')
 
-    if misc_config.hf_overrides is not None:
-        logger.warning(f'Overriding HF config with {misc_config.hf_overrides}')
-        model_config.hf_config.update(misc_config.hf_overrides)
-
     if distributed_executor_backend is not None:
         logger.info(f'Build <{distributed_executor_backend}> executor.')
         if distributed_executor_backend == 'uni':
diff --git a/lmdeploy/pytorch/engine/executor/ray_executor.py b/lmdeploy/pytorch/engine/executor/ray_executor.py
index d29dd9842d..0e6bd1114f 100644
--- a/lmdeploy/pytorch/engine/executor/ray_executor.py
+++ b/lmdeploy/pytorch/engine/executor/ray_executor.py
@@ -257,11 +257,10 @@ def __init__(
 
         from lmdeploy.tokenizer import Tokenizer
         tokenizer = Tokenizer(model_path).model.model
-        model_config = ModelConfig.from_pretrained(model_path, dtype=dtype, dist_config=dist_config)
-
-        if misc_config.hf_overrides is not None:
-            logger.warning(f'Overriding HF config with {misc_config.hf_overrides}')
-            model_config.hf_config.update(misc_config.hf_overrides)
+        model_config = ModelConfig.from_pretrained(model_path,
+                                                   dtype=dtype,
+                                                   hf_overrides=misc_config.hf_overrides,
+                                                   dist_config=dist_config)
 
         super().__init__(
             model_path=model_path,
From 5141f37030fb95fbb47682291040dba05cee0e96 Mon Sep 17 00:00:00 2001
From: zxy
Date: Tue, 15 Jul 2025 14:02:29 +0800
Subject: [PATCH 07/10] optimize

---
 lmdeploy/cli/serve.py               |  1 +
 lmdeploy/turbomind/deploy/config.py | 23 ++++++++++-------------
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index 3b6b19c5c8..fd11048185 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -194,6 +194,7 @@ def add_parser_api_server():
         ArgumentHelper.num_tokens_per_iter(tb_group)
         ArgumentHelper.max_prefill_iters(tb_group)
         ArgumentHelper.communicator(tb_group)
+        ArgumentHelper.hf_overrides(tb_group)
 
         # vlm args
         vision_group = parser.add_argument_group('Vision model arguments')
diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
index 2852cb645f..2890638286 100644
--- a/lmdeploy/turbomind/deploy/config.py
+++ b/lmdeploy/turbomind/deploy/config.py
@@ -159,25 +159,22 @@ def update_from_engine_config(self, config: TurbomindEngineConfig):
             hf_overrides = config.hf_overrides
 
             if hf_overrides.get('rope_scaling'):
                 override_params = hf_overrides.get('rope_scaling')
-                if self.attention_config.rope_param is None:
-                    self.attention_config.rope_param = RopeParam(type='', base=0, dim=0)
 
-                self.attention_config.rope_param.__dict__.update(type=override_params.get('rope_type', ''),
-                                                                 factor=override_params.get('factor', 1.0),
-                                                                 max_position_embeddings=override_params.get(
-                                                                     'original_max_position_embeddings', None))
+                rope_param = self.attention_config.rope_param or RopeParam(type='', base=0, dim=0)
+                rope_param.type = override_params.get('rope_type', '')
+                rope_param.factor = override_params.get('factor', 1.0)
+                rope_param.max_position_embeddings = override_params.get('original_max_position_embeddings', None)
 
             logger.warning(f'Overriding HF config with {hf_overrides}')
 
         # use dynamic ntk
         if config.rope_scaling_factor:
-            if self.attention_config.rope_param is None:
-                # some ut will create empty RopeParam, will check base/dim in src code
-                self.attention_config.rope_param = RopeParam(type='', base=0, dim=0)
-            self.attention_config.rope_param.__dict__.update(
-                type='dynamic',
-                factor=config.rope_scaling_factor,
-                max_position_embeddings=self.attention_config.max_position_embeddings)
+            # some ut will create empty RopeParam, will check base/dim in src code
+            rope_param = self.attention_config.rope_param or RopeParam(type='', base=0, dim=0)
+
+            rope_param.type = 'dynamic'
+            rope_param.factor = config.rope_scaling_factor
+            rope_param.max_position_embeddings = self.attention_config.max_position_embeddings
 
     @classmethod
     def from_dict(cls, config: dict = {}):
From 5730a83b9be3ac1d24ac69f61eb2db3bfa0582b1 Mon Sep 17 00:00:00 2001
From: zxy
Date: Tue, 15 Jul 2025 14:22:12 +0800
Subject: [PATCH 08/10] fix arg helper, add warnings

---
 lmdeploy/cli/serve.py               | 4 ++--
 lmdeploy/turbomind/deploy/config.py | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index fd11048185..74f268b5b1 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -165,6 +165,7 @@ def add_parser_api_server():
         max_prefill_token_num_act = ArgumentHelper.max_prefill_token_num(pt_group)
         quant_policy = ArgumentHelper.quant_policy(pt_group)
         model_format = ArgumentHelper.model_format(pt_group)
+        hf_overrides = ArgumentHelper.hf_overrides(pt_group)
         ArgumentHelper.dp(pt_group)
         ArgumentHelper.ep(pt_group)
         ArgumentHelper.enable_microbatch(pt_group)
@@ -172,7 +173,6 @@ def add_parser_api_server():
         ArgumentHelper.enable_metrics(pt_group)
         ArgumentHelper.role(pt_group)
         ArgumentHelper.migration_backend(pt_group)
-        ArgumentHelper.hf_overrides(pt_group)
         # multi-node serving args
         ArgumentHelper.node_rank(parser)
         ArgumentHelper.num_nodes(parser)
@@ -190,11 +190,11 @@ def add_parser_api_server():
         tb_group._group_actions.append(max_prefill_token_num_act)
         tb_group._group_actions.append(quant_policy)
         tb_group._group_actions.append(model_format)
+        tb_group._group_actions.append(hf_overrides)
         ArgumentHelper.rope_scaling_factor(tb_group)
         ArgumentHelper.num_tokens_per_iter(tb_group)
         ArgumentHelper.max_prefill_iters(tb_group)
         ArgumentHelper.communicator(tb_group)
-        ArgumentHelper.hf_overrides(tb_group)
 
         # vlm args
         vision_group = parser.add_argument_group('Vision model arguments')
diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
index 2890638286..49763b78ea 100644
--- a/lmdeploy/turbomind/deploy/config.py
+++ b/lmdeploy/turbomind/deploy/config.py
@@ -176,6 +176,9 @@ def update_from_engine_config(self, config: TurbomindEngineConfig):
             rope_param.factor = config.rope_scaling_factor
             rope_param.max_position_embeddings = self.attention_config.max_position_embeddings
 
+            logger.warning(
+                '`--rope-scaling-factor` will be removed in a future release. Please use `--hf-overrides` instead.')
+
     @classmethod
     def from_dict(cls, config: dict = {}):
         """Construct TurbomindModelConfig instance from config in a dict."""
From 090c370d23072fd9192144855a4df92eb0f1198c Mon Sep 17 00:00:00 2001
From: zxy
Date: Tue, 15 Jul 2025 17:06:03 +0800
Subject: [PATCH 09/10] remove UT

---
 tests/test_lmdeploy/test_hf_overrides.py | 41 ------------------------
 1 file changed, 41 deletions(-)
 delete mode 100644 tests/test_lmdeploy/test_hf_overrides.py

diff --git a/tests/test_lmdeploy/test_hf_overrides.py b/tests/test_lmdeploy/test_hf_overrides.py
deleted file mode 100644
index 6425e9de37..0000000000
--- a/tests/test_lmdeploy/test_hf_overrides.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import pytest
-
-from lmdeploy import PytorchEngineConfig, TurbomindEngineConfig, pipeline
-
-
-@pytest.mark.parametrize('model_path', ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen3-30B-A3B'])
-def test_hf_overrides_turbomind(model_path):
-    # Define a custom rope_scaling configuration to override the model's default settings.
-    rope_scaling_override = {'rope_type': 'yarn', 'factor': 4.0, 'original_max_position_embeddings': 32768}
-    hf_overrides = {'rope_scaling': rope_scaling_override}
-
-    backend_config = TurbomindEngineConfig(hf_overrides=hf_overrides)
-    with pipeline(model_path, backend_config=backend_config) as pipe:
-        processed_config = pipe.engine.config.attention_config
-
-        assert getattr(processed_config, 'rope_param') is not None
-        for key, value in rope_scaling_override.items():
-            # Adjust key for compatibility with Turbomind's config
-            if key == 'rope_type':
-                key = 'type'
-            if key == 'original_max_position_embeddings':
-                key = 'max_position_embeddings'
-
-            assert getattr(processed_config.rope_param, key) == value, \
-                f'Expected {key} to be {value}, but got {getattr(processed_config.rope_param, key)}'
-
-
-@pytest.mark.parametrize('model_path', ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen3-30B-A3B'])
-def test_hf_overrides_pytorch(model_path):
-    # Define a custom rope_scaling configuration to override the model's default settings.
-    rope_scaling_override = {'rope_type': 'yarn', 'factor': 4.0, 'original_max_position_embeddings': 32768}
-    hf_overrides = {'rope_scaling': rope_scaling_override}
-
-    backend_config = PytorchEngineConfig(hf_overrides=hf_overrides)
-    with pipeline(model_path, backend_config=backend_config) as pipe:
-        processed_config = pipe.engine.get_model_config()
-
-        assert processed_config.hf_config.rope_scaling is not None
-        for key, value in rope_scaling_override.items():
-            assert processed_config.hf_config.rope_scaling.get(key) == value, \
-                f'Expected {key} to be {value}, but got {processed_config.hf_config.rope_scaling.get(key)}'
From 77cd89d196a980e5565e30f02d8891749be60c01 Mon Sep 17 00:00:00 2001
From: zxy
Date: Tue, 15 Jul 2025 17:38:06 +0800
Subject: [PATCH 10/10] fix UT

---
 lmdeploy/turbomind/deploy/config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
index 49763b78ea..b629927a8b 100644
--- a/lmdeploy/turbomind/deploy/config.py
+++ b/lmdeploy/turbomind/deploy/config.py
@@ -165,17 +165,18 @@ def update_from_engine_config(self, config: TurbomindEngineConfig):
                 rope_param.factor = override_params.get('factor', 1.0)
                 rope_param.max_position_embeddings = override_params.get('original_max_position_embeddings', None)
+                self.attention_config.rope_param = rope_param
 
             logger.warning(f'Overriding HF config with {hf_overrides}')
 
         # use dynamic ntk
         if config.rope_scaling_factor:
             # some ut will create empty RopeParam, will check base/dim in src code
             rope_param = self.attention_config.rope_param or RopeParam(type='', base=0, dim=0)
-
             rope_param.type = 'dynamic'
             rope_param.factor = config.rope_scaling_factor
             rope_param.max_position_embeddings = self.attention_config.max_position_embeddings
+            self.attention_config.rope_param = rope_param
 
             logger.warning(
                 '`--rope-scaling-factor` will be removed in a future release. Please use `--hf-overrides` instead.')
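
With the full series applied, a minimal end-to-end sketch of the feature; the model path is illustrative and the override dict mirrors the one the removed tests exercised:

    from lmdeploy import PytorchEngineConfig, pipeline

    hf_overrides = {'rope_scaling': {'rope_type': 'yarn', 'factor': 4.0, 'original_max_position_embeddings': 32768}}
    backend_config = PytorchEngineConfig(hf_overrides=hf_overrides)
    with pipeline('Qwen/Qwen2.5-7B-Instruct', backend_config=backend_config) as pipe:
        print(pipe('Hello'))

The same dict can be passed to TurbomindEngineConfig(hf_overrides=...), where only the rope_scaling entry is mapped onto the attention config.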