From 1ffcc7387befafba468e18b4a8f78b639fd88c2c Mon Sep 17 00:00:00 2001
From: zxy
Date: Tue, 8 Jul 2025 17:39:38 +0800
Subject: [PATCH 01/10] hf config overrides

---
 lmdeploy/cli/cli.py                          |   6 +-
 lmdeploy/cli/serve.py                        |  42 +++----
 lmdeploy/cli/utils.py                        | 104 +++++++++++++++++-
 lmdeploy/messages.py                         |   5 +-
 lmdeploy/pytorch/config.py                   |  12 +-
 lmdeploy/pytorch/engine/executor/__init__.py |   4 +
 .../pytorch/engine/executor/ray_executor.py  |   5 +
 7 files changed, 148 insertions(+), 30 deletions(-)

diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py
index 6d594e1d75..e2f9582794 100644
--- a/lmdeploy/cli/cli.py
+++ b/lmdeploy/cli/cli.py
@@ -1,16 +1,16 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-import argparse
 import os
 
 from ..version import __version__
-from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args, get_chat_template, get_lora_adapters
+from .utils import (ArgumentHelper, DefaultsAndTypesHelpFormatter, FlexibleArgumentParser, convert_args,
+                    get_chat_template, get_lora_adapters)
 
 
 class CLI(object):
     _desc = 'The CLI provides a unified API for converting, ' \
             'compressing and deploying large language models.'
-    parser = argparse.ArgumentParser(prog='lmdeploy', description=_desc, add_help=True)
+    parser = FlexibleArgumentParser(prog='lmdeploy', description=_desc, add_help=True)
     parser.add_argument('-v', '--version', action='version', version=__version__)
     subparsers = parser.add_subparsers(title='Commands', description='lmdeploy has following commands:', dest='command')
 
diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index 994ac1ea05..ea2b3fb76c 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -171,6 +171,7 @@ def add_parser_api_server():
         ArgumentHelper.enable_eplb(pt_group)
         ArgumentHelper.role(pt_group)
         ArgumentHelper.migration_backend(pt_group)
+        ArgumentHelper.hf_overrides(pt_group)
         # multi-node serving args
         ArgumentHelper.node_rank(parser)
         ArgumentHelper.num_nodes(parser)
@@ -317,25 +318,28 @@ def api_server(args):
     if backend == 'pytorch':
         from lmdeploy.messages import PytorchEngineConfig
         adapters = get_lora_adapters(args.adapters)
-        backend_config = PytorchEngineConfig(dtype=args.dtype,
-                                             tp=args.tp,
-                                             dp=args.dp,
-                                             ep=args.ep,
-                                             max_batch_size=max_batch_size,
-                                             cache_max_entry_count=args.cache_max_entry_count,
-                                             block_size=args.cache_block_seq_len,
-                                             session_len=args.session_len,
-                                             adapters=adapters,
-                                             enable_prefix_caching=args.enable_prefix_caching,
-                                             device_type=args.device,
-                                             quant_policy=args.quant_policy,
-                                             eager_mode=args.eager_mode,
-                                             max_prefill_token_num=args.max_prefill_token_num,
-                                             enable_microbatch=args.enable_microbatch,
-                                             enable_eplb=args.enable_eplb,
-                                             role=EngineRole[args.role],
-                                             migration_backend=MigrationBackend[args.migration_backend],
-                                             model_format=args.model_format)
+        backend_config = PytorchEngineConfig(
+            dtype=args.dtype,
+            tp=args.tp,
+            dp=args.dp,
+            ep=args.ep,
+            max_batch_size=max_batch_size,
+            cache_max_entry_count=args.cache_max_entry_count,
+            block_size=args.cache_block_seq_len,
+            session_len=args.session_len,
+            adapters=adapters,
+            enable_prefix_caching=args.enable_prefix_caching,
+            device_type=args.device,
+            quant_policy=args.quant_policy,
+            eager_mode=args.eager_mode,
+            max_prefill_token_num=args.max_prefill_token_num,
+            enable_microbatch=args.enable_microbatch,
+            enable_eplb=args.enable_eplb,
+            role=EngineRole[args.role],
+            migration_backend=MigrationBackend[args.migration_backend],
+            model_format=args.model_format,
+            hf_overrides=args.hf_overrides,
+        )
     else:
         from lmdeploy.messages import TurbomindEngineConfig
         backend_config = TurbomindEngineConfig(dtype=args.dtype,
diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py
index dc5767bea9..220138314d 100644
--- a/lmdeploy/cli/utils.py
+++ b/lmdeploy/cli/utils.py
@@ -1,7 +1,11 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import argparse
-from typing import List
+import json
+import re
+import sys
+from collections import defaultdict
+from typing import Any, List
 
 
 class DefaultsAndTypesHelpFormatter(argparse.HelpFormatter):
@@ -231,6 +235,14 @@ def rope_scaling_factor(parser):
         return parser.add_argument('--rope-scaling-factor', type=float, default=0.0, help='Rope scaling factor')
 
+    @staticmethod
+    def hf_overrides(parser):
+        """Add argument hf_overrides to parser."""
+        return parser.add_argument('--hf-overrides',
+                                   type=json.loads,
+                                   default=None,
+                                   help='Extra arguments to be forwarded to for the HuggingFace config.')
+
     @staticmethod
     def use_logn_attn(parser):
         """Add argument use_logn_attn to parser."""
@@ -575,3 +587,93 @@ def migration_backend(parser):
                                    default='DLSlime',
                                    choices=['DLSlime', 'Mooncake'],
                                    help='kvcache migration management backend when PD disaggregation')
+
+
+# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/utils/__init__.py
+class FlexibleArgumentParser(argparse.ArgumentParser):
+    """More flexible argument parser."""
+
+    def parse_args(self, args=None, namespace=None):
+        # If args is not provided, use arguments from the command line
+        if args is None:
+            args = sys.argv[1:]
+
+        def repl(match: re.Match) -> str:
+            """Replaces underscores with dashes in the matched string."""
+            return match.group(0).replace('_', '-')
+
+        # Everything between the first -- and the first .
+        pattern = re.compile(r'(?<=--)[^\.]*')
+
+        # Convert underscores to dashes in argument names
+        processed_args = []
+        for arg in args:
+            if arg.startswith('--'):
+                if '=' in arg:
+                    key, value = arg.split('=', 1)
+                    key = pattern.sub(repl, key, count=1)
+                    processed_args.append(f'{key}={value}')
+                else:
+                    key = pattern.sub(repl, arg, count=1)
+                    processed_args.append(key)
+            elif arg.startswith('-O') and arg != '-O' and arg[2] != '.':
+                # allow -O flag to be used without space, e.g. -O3
+                processed_args.append('-O')
+                processed_args.append(arg[2:])
+            else:
+                processed_args.append(arg)
+
+        def _try_convert(value: str):
+            """Try to parse the string as JSON."""
+            if not isinstance(value, str):
+                return value
+            # try loads from json
+            try:
+                return json.loads(value)
+            except json.JSONDecodeError:
+                pass
+            return value
+
+        def create_nested_dict(keys: list[str], value: str):
+            """Creates a nested dictionary from a list of keys and a value.
+
+            For example, `keys = ["a", "b", "c"]` and `value = 1` will create: `{"a": {"b": {"c": 1}}}`
+            """
+            nested_dict: Any = _try_convert(value)
+            for key in reversed(keys):
+                nested_dict = {key: nested_dict}
+            return nested_dict
+
+        def recursive_dict_update(original: dict, update: dict):
+            """Recursively updates a dictionary with another dictionary."""
+            for k, v in update.items():
+                if isinstance(v, dict) and isinstance(original.get(k), dict):
+                    recursive_dict_update(original[k], v)
+                else:
+                    original[k] = v
+
+        delete = set()
+        dict_args: dict[str, dict] = defaultdict(dict)
+        for i, processed_arg in enumerate(processed_args):
+            if processed_arg.startswith('--') and '.' in processed_arg:
+                if '=' in processed_arg:
+                    processed_arg, value = processed_arg.split('=', 1)
+                    if '.' not in processed_arg:
+                        # False positive, . was only in the value
+                        continue
+                else:
+                    value = processed_args[i + 1]
+                    delete.add(i + 1)
+                key, *keys = processed_arg.split('.')
+                # Merge all values with the same key into a single dict
+                arg_dict = create_nested_dict(keys, value)
+                recursive_dict_update(dict_args[key], arg_dict)
+                delete.add(i)
+        # Filter out the dict args we merged above
+        processed_args = [a for i, a in enumerate(processed_args) if i not in delete]
+        # Add the dict args back as if they were originally passed as JSON
+        for dict_arg, dict_value in dict_args.items():
+            processed_args.append(dict_arg)
+            processed_args.append(json.dumps(dict_value))
+
+        return super().parse_args(processed_args, namespace)
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
index d067d2f7ce..56ac180865 100644
--- a/lmdeploy/messages.py
+++ b/lmdeploy/messages.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import enum
 from dataclasses import dataclass, field
-from typing import Callable, Dict, List, Literal, Optional
+from typing import Any, Callable, Dict, List, Literal, Optional
 
 import torch
 from pydantic.dataclasses import dataclass as pydantic_dataclass
@@ -320,6 +320,8 @@ class PytorchEngineConfig:
             Default to `MigrationBackend.DLSlime`.
         enable_mp_engine (bool): run engine in multi-process mode.
         model_format (str): weight quantization policy, options: ['fp8'].
+        hf_overrides (Dict[str, Any]): Huggingface overrides for the model.
+            It can be used to override the default config of the model.
     """
     dtype: str = 'auto'
     tp: int = 1
@@ -349,6 +351,7 @@ class PytorchEngineConfig:
     enable_eplb: bool = False
     enable_mp_engine: bool = False
     model_format: str = None
+    hf_overrides: Optional[Dict[str, Any]] = None
     role: EngineRole = EngineRole.Hybrid
     migration_backend: MigrationBackend = MigrationBackend.DLSlime
 
diff --git a/lmdeploy/pytorch/config.py b/lmdeploy/pytorch/config.py
index 3b620a8cff..bbcabba51f 100644
--- a/lmdeploy/pytorch/config.py
+++ b/lmdeploy/pytorch/config.py
@@ -223,14 +223,14 @@ class MiscConfig:
     custom_module_map: str = None
     empty_init: bool = False
     model_format: str = None
+    hf_overrides: Dict[str, Any] = None
 
     @classmethod
     def from_engine_config(cls, engine_config: PytorchEngineConfig):
         """From engine config."""
-        misc_config = cls(
-            custom_module_map=engine_config.custom_module_map,
-            empty_init=engine_config.empty_init,
-            prefill_interval=engine_config.prefill_interval,
-            model_format=engine_config.model_format,
-        )
+        misc_config = cls(custom_module_map=engine_config.custom_module_map,
+                          empty_init=engine_config.empty_init,
+                          prefill_interval=engine_config.prefill_interval,
+                          model_format=engine_config.model_format,
+                          hf_overrides=engine_config.hf_overrides)
         return misc_config
diff --git a/lmdeploy/pytorch/engine/executor/__init__.py b/lmdeploy/pytorch/engine/executor/__init__.py
index 9aad5aa5bb..d15a12a423 100644
--- a/lmdeploy/pytorch/engine/executor/__init__.py
+++ b/lmdeploy/pytorch/engine/executor/__init__.py
@@ -83,6 +83,10 @@ def build_executor(model_path: str,
                          'empty_init requires distributed_executor_backend="ray", ',
                          f'get distributed_executor_backend="{distributed_executor_backend}"')
 
+    if misc_config.hf_overrides is not None:
+        logger.warning(f'Overriding HF config with {misc_config.hf_overrides}')
+        model_config.hf_config.update(misc_config.hf_overrides)
+
     if distributed_executor_backend is not None:
         logger.info(f'Build <{distributed_executor_backend}> executor.')
         if distributed_executor_backend == 'uni':
diff --git a/lmdeploy/pytorch/engine/executor/ray_executor.py b/lmdeploy/pytorch/engine/executor/ray_executor.py
index 0e7dc395a5..d29dd9842d 100644
--- a/lmdeploy/pytorch/engine/executor/ray_executor.py
+++ b/lmdeploy/pytorch/engine/executor/ray_executor.py
@@ -258,6 +258,11 @@ def __init__(
         from lmdeploy.tokenizer import Tokenizer
         tokenizer = Tokenizer(model_path).model.model
         model_config = ModelConfig.from_pretrained(model_path, dtype=dtype, dist_config=dist_config)
+
+        if misc_config.hf_overrides is not None:
+            logger.warning(f'Overriding HF config with {misc_config.hf_overrides}')
+            model_config.hf_config.update(misc_config.hf_overrides)
+
         super().__init__(
             model_path=model_path,
             cache_config=cache_config,
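
A usage sketch of the flag introduced above (not part of the patch; model path and override values are illustrative). The JSON form feeds `--hf-overrides` directly; the dotted form relies on the FlexibleArgumentParser added in this patch, which merges `--key.sub` style arguments back into a single JSON value:

    # pass the overrides as one JSON document
    lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --backend pytorch \
        --hf-overrides '{"rope_scaling": {"rope_type": "yarn", "factor": 4.0}}'

    # equivalent dotted form, merged by FlexibleArgumentParser
    lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --backend pytorch \
        --hf-overrides.rope_scaling.rope_type yarn \
        --hf-overrides.rope_scaling.factor 4.0
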
From 3a342ba8230e1d4fc237eb7227c7cadf7a00c4ec Mon Sep 17 00:00:00 2001
From: zxy
Date: Wed, 9 Jul 2025 18:10:45 +0800
Subject: [PATCH 02/10] TM support

---
 lmdeploy/cli/serve.py               |  3 ++-
 lmdeploy/messages.py                |  3 +++
 lmdeploy/turbomind/deploy/config.py | 19 +++++++++++++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index ea2b3fb76c..dedc2faef6 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -353,7 +353,8 @@ def api_server(args):
                                                cache_block_seq_len=args.cache_block_seq_len,
                                                enable_prefix_caching=args.enable_prefix_caching,
                                                max_prefill_token_num=args.max_prefill_token_num,
-                                               communicator=args.communicator)
+                                               communicator=args.communicator,
+                                               hf_overrides=args.hf_overrides)
     chat_template_config = get_chat_template(args.chat_template)
 
     from lmdeploy.messages import VisionConfig
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
index 56ac180865..9bc26ec6b7 100644
--- a/lmdeploy/messages.py
+++ b/lmdeploy/messages.py
@@ -222,6 +222,8 @@ class TurbomindEngineConfig:
         devices(List[int]): the used devices
         empty_init (bool): Whether to load the model weights, you should
             set it to True if you want to update weights after create the pipeline
+        hf_overrides (Dict[str, Any]): Huggingface overrides for the model.
+            It can be used to override the default config of the model.
     """
 
     dtype: str = 'auto'
@@ -251,6 +253,7 @@ class TurbomindEngineConfig:
     devices: Optional[List[int]] = None
     empty_init: bool = False
     communicator: str = 'nccl'
+    hf_overrides: Optional[Dict[str, Any]] = None
 
     def __post_init__(self):
         """Check input validation."""
diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
index 8f2ab0a3bd..5ac965f20f 100644
--- a/lmdeploy/turbomind/deploy/config.py
+++ b/lmdeploy/turbomind/deploy/config.py
@@ -8,6 +8,9 @@
 from pydantic.dataclasses import dataclass
 
 from lmdeploy.messages import TurbomindEngineConfig
+from lmdeploy.utils import get_logger
+
+logger = get_logger('lmdeploy')
 
 
 def config_from_dict(cls, env):
@@ -150,6 +153,22 @@ def update_from_engine_config(self, config: TurbomindEngineConfig):
             if hasattr(self.attention_config, key):
                 setattr(self.attention_config, key, value)
 
+        # update from hf_overrides
+        if hasattr(config, 'hf_overrides') and config.hf_overrides:
+            hf_overrides = config.hf_overrides
+
+            if hf_overrides.get('rope_scaling'):
+                override_param = hf_overrides.get('rope_scaling')
+                if self.attention_config.rope_param is None:
+                    self.attention_config.rope_param = RopeParam(type='', base=0, dim=0)
+
+                self.attention_config.rope_param.__dict__.update(
+                    type=override_param.get('rope_type'),
+                    factor=override_param.get('factor'),
+                    max_position_embeddings=override_param.get('original_max_position_embeddings'))
+
+            logger.warning(f'Overriding HF config with {hf_overrides}')
+
         # use dynamic ntk
         if config.rope_scaling_factor:
             if self.attention_config.rope_param is None:
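
A sketch of the override shape consumed by the TurboMind path above (values illustrative). Keys follow the HF `rope_scaling` schema; the comments show the RopeParam field each key lands in per this patch:

    hf_overrides = {
        'rope_scaling': {
            'rope_type': 'yarn',  # stored as rope_param.type
            'factor': 4.0,  # stored as rope_param.factor
            'original_max_position_embeddings': 32768,  # stored as rope_param.max_position_embeddings
        }
    }
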
From 65087cbc9818c02b4665a68e19f4fab5f0ac553b Mon Sep 17 00:00:00 2001
From: zxy
Date: Wed, 9 Jul 2025 18:27:10 +0800
Subject: [PATCH 03/10] add default val

---
 lmdeploy/turbomind/deploy/config.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
index 5ac965f20f..2852cb645f 100644
--- a/lmdeploy/turbomind/deploy/config.py
+++ b/lmdeploy/turbomind/deploy/config.py
@@ -158,14 +158,14 @@ def update_from_engine_config(self, config: TurbomindEngineConfig):
             hf_overrides = config.hf_overrides
 
             if hf_overrides.get('rope_scaling'):
-                override_param = hf_overrides.get('rope_scaling')
+                override_params = hf_overrides.get('rope_scaling')
                 if self.attention_config.rope_param is None:
                     self.attention_config.rope_param = RopeParam(type='', base=0, dim=0)
 
-                self.attention_config.rope_param.__dict__.update(
-                    type=override_param.get('rope_type'),
-                    factor=override_param.get('factor'),
-                    max_position_embeddings=override_param.get('original_max_position_embeddings'))
+                self.attention_config.rope_param.__dict__.update(type=override_params.get('rope_type', ''),
+                                                                 factor=override_params.get('factor', 1.0),
+                                                                 max_position_embeddings=override_params.get(
+                                                                     'original_max_position_embeddings', None))
 
             logger.warning(f'Overriding HF config with {hf_overrides}')
 
From 28a8e4e400111691a6207e9b5b16bcc31e5bb7f9 Mon Sep 17 00:00:00 2001
From: zxy
Date: Thu, 10 Jul 2025 20:19:52 +0800
Subject: [PATCH 04/10] fix for yaml safe dump

---
 lmdeploy/turbomind/turbomind.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index 0c5632bc94..f290d5deed 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -255,7 +255,12 @@ def _postprocess_config(self, tm_config: TurbomindModelConfig, engine_config: Tu
 
         # pack `self.config` and `self.engine_config` into a dict
         self.config_dict = self.config.to_dict()
-        self.config_dict.update(dict(engine_config=asdict(self.engine_config)))
+        engine_config_dict = asdict(engine_config)
+        # Sanitize `engine_config` for YAML serialization.
+        # `PyYAML` raises a `RepresenterError` on `mmengine.ConfigDict` objects
+        # passed by frameworks like OpenCompass. This ensures a standard dict.
+        engine_config_dict = json.loads(json.dumps(engine_config_dict))
+        self.config_dict.update(dict(engine_config=engine_config_dict))
         logger.info(f'turbomind model config:\n\n'
                     f'{json.dumps(self.config_dict, indent=2)}')
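
The json.dumps/json.loads round trip above works because json serializes dict subclasses like any mapping, while json.loads always rebuilds plain dicts. A minimal standalone sketch of the effect; the ConfigDict class here is a hypothetical stand-in for mmengine.ConfigDict:

    import json

    class ConfigDict(dict):  # stand-in for mmengine.ConfigDict
        pass

    cfg = {'engine_config': ConfigDict(tp=1)}
    clean = json.loads(json.dumps(cfg))
    assert type(clean['engine_config']) is dict  # plain dict, safe for yaml.safe_dump
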
From 31d9f589a1ccc924e96075e72d69bc9f01ee4544 Mon Sep 17 00:00:00 2001
From: zxy
Date: Fri, 11 Jul 2025 14:35:27 +0800
Subject: [PATCH 05/10] add testcases

---
 tests/test_lmdeploy/test_hf_overrides.py | 41 ++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 tests/test_lmdeploy/test_hf_overrides.py

diff --git a/tests/test_lmdeploy/test_hf_overrides.py b/tests/test_lmdeploy/test_hf_overrides.py
new file mode 100644
index 0000000000..6425e9de37
--- /dev/null
+++ b/tests/test_lmdeploy/test_hf_overrides.py
@@ -0,0 +1,41 @@
+import pytest
+
+from lmdeploy import PytorchEngineConfig, TurbomindEngineConfig, pipeline
+
+
+@pytest.mark.parametrize('model_path', ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen3-30B-A3B'])
+def test_hf_overrides_turbomind(model_path):
+    # Define a custom rope_scaling configuration to override the model's default settings.
+    rope_scaling_override = {'rope_type': 'yarn', 'factor': 4.0, 'original_max_position_embeddings': 32768}
+    hf_overrides = {'rope_scaling': rope_scaling_override}
+
+    backend_config = TurbomindEngineConfig(hf_overrides=hf_overrides)
+    with pipeline(model_path, backend_config=backend_config) as pipe:
+        processed_config = pipe.engine.config.attention_config
+
+        assert getattr(processed_config, 'rope_param') is not None
+        for key, value in rope_scaling_override.items():
+            # Adjust key for compatibility with Turbomind's config
+            if key == 'rope_type':
+                key = 'type'
+            if key == 'original_max_position_embeddings':
+                key = 'max_position_embeddings'
+
+            assert getattr(processed_config.rope_param, key) == value, \
+                f'Expected {key} to be {value}, but got {getattr(processed_config.rope_param, key)}'
+
+
+@pytest.mark.parametrize('model_path', ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen3-30B-A3B'])
+def test_hf_overrides_pytorch(model_path):
+    # Define a custom rope_scaling configuration to override the model's default settings.
+    rope_scaling_override = {'rope_type': 'yarn', 'factor': 4.0, 'original_max_position_embeddings': 32768}
+    hf_overrides = {'rope_scaling': rope_scaling_override}
+
+    backend_config = PytorchEngineConfig(hf_overrides=hf_overrides)
+    with pipeline(model_path, backend_config=backend_config) as pipe:
+        processed_config = pipe.engine.get_model_config()
+
+        assert processed_config.hf_config.rope_scaling is not None
+        for key, value in rope_scaling_override.items():
+            assert processed_config.hf_config.rope_scaling.get(key) == value, \
+                f'Expected {key} to be {value}, but got {processed_config.hf_config.rope_scaling.get(key)}'
From a748cfb385089caf04f2dc10d4dcbf1267fb4e43 Mon Sep 17 00:00:00 2001
From: zxy
Date: Fri, 11 Jul 2025 18:29:27 +0800
Subject: [PATCH 06/10] change hf_overrides positions

---
 lmdeploy/cli/utils.py                        |  2 +-
 lmdeploy/pytorch/config.py                   | 20 +++++++++++++++++--
 lmdeploy/pytorch/engine/executor/__init__.py | 10 +++++-----
 .../pytorch/engine/executor/ray_executor.py  |  9 ++++-----
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py
index 954b205d73..92971dc11f 100644
--- a/lmdeploy/cli/utils.py
+++ b/lmdeploy/cli/utils.py
@@ -241,7 +241,7 @@ def hf_overrides(parser):
         return parser.add_argument('--hf-overrides',
                                    type=json.loads,
                                    default=None,
-                                   help='Extra arguments to be forwarded to for the HuggingFace config.')
+                                   help='Extra arguments to be forwarded to the HuggingFace config.')
 
     @staticmethod
     def use_logn_attn(parser):
diff --git a/lmdeploy/pytorch/config.py b/lmdeploy/pytorch/config.py
index bbcabba51f..2ecd04e3a2 100644
--- a/lmdeploy/pytorch/config.py
+++ b/lmdeploy/pytorch/config.py
@@ -158,7 +158,8 @@ def from_pretrained(cls,
                         pretrained_model_name_or_path: str,
                         trust_remote_code: bool = True,
                         dtype: str = 'auto',
-                        dist_config: DistConfig = None):
+                        dist_config: DistConfig = None,
+                        hf_overrides: Dict[str, Any] = None):
         """Instantiate one of the configuration classes of the library from a
         pretrained model configuration.
 
@@ -168,13 +169,28 @@ def from_pretrained(cls,
                 models defined on the Hub in their own modeling files.
             dtype (str): user specified data type for model weights and
                 activations. Refer to `PyTorchEngineConfig` for details
+            hf_overrides (Dict[str, Any]): overrides for the HF config.
         """
         from transformers import AutoConfig
+
+        from lmdeploy.utils import get_logger
+
         hf_config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
         if getattr(hf_config, 'model_type', None) in ['phi3']:
             # phi3 + trust_remote_code leads to error when tp.
             hf_config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
-        return cls.from_hf_config(hf_config, pretrained_model_name_or_path, dtype=dtype, dist_config=dist_config)
+
+        model_config = cls.from_hf_config(hf_config,
+                                          pretrained_model_name_or_path,
+                                          dtype=dtype,
+                                          dist_config=dist_config)
+
+        if hf_overrides is not None:
+            logger = get_logger('lmdeploy')
+            logger.warning(f'Overriding HF config with {hf_overrides}')
+            model_config.hf_config.update(hf_overrides)
+
+        return model_config
 
     @classmethod
     def from_hf_config(cls,
diff --git a/lmdeploy/pytorch/engine/executor/__init__.py b/lmdeploy/pytorch/engine/executor/__init__.py
index d15a12a423..0e26701748 100644
--- a/lmdeploy/pytorch/engine/executor/__init__.py
+++ b/lmdeploy/pytorch/engine/executor/__init__.py
@@ -68,7 +68,11 @@ def build_executor(model_path: str,
     dp = dist_config.dp
     world_size = dist_config.world_size
 
-    model_config = ModelConfig.from_pretrained(model_path, trust_remote_code=True, dtype=dtype, dist_config=dist_config)
+    model_config = ModelConfig.from_pretrained(model_path,
+                                               trust_remote_code=True,
+                                               dtype=dtype,
+                                               hf_overrides=misc_config.hf_overrides,
+                                               dist_config=dist_config)
 
     if distributed_executor_backend is None:
         distributed_executor_backend = get_distributed_executor_backend(world_size, dp, device_type, logger)
@@ -83,10 +87,6 @@ def build_executor(model_path: str,
                          'empty_init requires distributed_executor_backend="ray", ',
                          f'get distributed_executor_backend="{distributed_executor_backend}"')
 
-    if misc_config.hf_overrides is not None:
-        logger.warning(f'Overriding HF config with {misc_config.hf_overrides}')
-        model_config.hf_config.update(misc_config.hf_overrides)
-
     if distributed_executor_backend is not None:
         logger.info(f'Build <{distributed_executor_backend}> executor.')
         if distributed_executor_backend == 'uni':
diff --git a/lmdeploy/pytorch/engine/executor/ray_executor.py b/lmdeploy/pytorch/engine/executor/ray_executor.py
index d29dd9842d..0e6bd1114f 100644
--- a/lmdeploy/pytorch/engine/executor/ray_executor.py
+++ b/lmdeploy/pytorch/engine/executor/ray_executor.py
@@ -257,11 +257,10 @@ def __init__(
 
         from lmdeploy.tokenizer import Tokenizer
         tokenizer = Tokenizer(model_path).model.model
-        model_config = ModelConfig.from_pretrained(model_path, dtype=dtype, dist_config=dist_config)
-
-        if misc_config.hf_overrides is not None:
-            logger.warning(f'Overriding HF config with {misc_config.hf_overrides}')
-            model_config.hf_config.update(misc_config.hf_overrides)
+        model_config = ModelConfig.from_pretrained(model_path,
+                                                   dtype=dtype,
+                                                   hf_overrides=misc_config.hf_overrides,
+                                                   dist_config=dist_config)
 
         super().__init__(
             model_path=model_path,
From 5141f37030fb95fbb47682291040dba05cee0e96 Mon Sep 17 00:00:00 2001
From: zxy
Date: Tue, 15 Jul 2025 14:02:29 +0800
Subject: [PATCH 07/10] optimize

---
 lmdeploy/cli/serve.py               |  1 +
 lmdeploy/turbomind/deploy/config.py | 23 ++++++++++-------------
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index 3b6b19c5c8..fd11048185 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -194,6 +194,7 @@ def add_parser_api_server():
         ArgumentHelper.num_tokens_per_iter(tb_group)
         ArgumentHelper.max_prefill_iters(tb_group)
         ArgumentHelper.communicator(tb_group)
+        ArgumentHelper.hf_overrides(tb_group)
 
         # vlm args
         vision_group = parser.add_argument_group('Vision model arguments')
diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
index 2852cb645f..2890638286 100644
--- a/lmdeploy/turbomind/deploy/config.py
+++ b/lmdeploy/turbomind/deploy/config.py
@@ -159,25 +159,22 @@ def update_from_engine_config(self, config: TurbomindEngineConfig):
             hf_overrides = config.hf_overrides
 
             if hf_overrides.get('rope_scaling'):
                 override_params = hf_overrides.get('rope_scaling')
-                if self.attention_config.rope_param is None:
-                    self.attention_config.rope_param = RopeParam(type='', base=0, dim=0)
 
-                self.attention_config.rope_param.__dict__.update(type=override_params.get('rope_type', ''),
-                                                                 factor=override_params.get('factor', 1.0),
-                                                                 max_position_embeddings=override_params.get(
-                                                                     'original_max_position_embeddings', None))
+                rope_param = self.attention_config.rope_param or RopeParam(type='', base=0, dim=0)
+                rope_param.type = override_params.get('rope_type', '')
+                rope_param.factor = override_params.get('factor', 1.0)
+                rope_param.max_position_embeddings = override_params.get('original_max_position_embeddings', None)
 
             logger.warning(f'Overriding HF config with {hf_overrides}')
 
         # use dynamic ntk
         if config.rope_scaling_factor:
-            if self.attention_config.rope_param is None:
-                # some ut will create empty RopeParam, will check base/dim in src code
-                self.attention_config.rope_param = RopeParam(type='', base=0, dim=0)
-            self.attention_config.rope_param.__dict__.update(
-                type='dynamic',
-                factor=config.rope_scaling_factor,
-                max_position_embeddings=self.attention_config.max_position_embeddings)
+            # some ut will create empty RopeParam, will check base/dim in src code
+            rope_param = self.attention_config.rope_param or RopeParam(type='', base=0, dim=0)
+
+            rope_param.type = 'dynamic'
+            rope_param.factor = config.rope_scaling_factor
+            rope_param.max_position_embeddings = self.attention_config.max_position_embeddings
 
     @classmethod
     def from_dict(cls, config: dict = {}):
From 5730a83b9be3ac1d24ac69f61eb2db3bfa0582b1 Mon Sep 17 00:00:00 2001
From: zxy
Date: Tue, 15 Jul 2025 14:22:12 +0800
Subject: [PATCH 08/10] fix arg helper, add warnings

---
 lmdeploy/cli/serve.py               | 4 ++--
 lmdeploy/turbomind/deploy/config.py | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index fd11048185..74f268b5b1 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -165,6 +165,7 @@ def add_parser_api_server():
         max_prefill_token_num_act = ArgumentHelper.max_prefill_token_num(pt_group)
         quant_policy = ArgumentHelper.quant_policy(pt_group)
         model_format = ArgumentHelper.model_format(pt_group)
+        hf_overrides = ArgumentHelper.hf_overrides(pt_group)
         ArgumentHelper.dp(pt_group)
         ArgumentHelper.ep(pt_group)
         ArgumentHelper.enable_microbatch(pt_group)
@@ -172,7 +173,6 @@ def add_parser_api_server():
         ArgumentHelper.enable_metrics(pt_group)
         ArgumentHelper.role(pt_group)
         ArgumentHelper.migration_backend(pt_group)
-        ArgumentHelper.hf_overrides(pt_group)
         # multi-node serving args
         ArgumentHelper.node_rank(parser)
         ArgumentHelper.num_nodes(parser)
@@ -190,11 +190,11 @@ def add_parser_api_server():
         tb_group._group_actions.append(max_prefill_token_num_act)
         tb_group._group_actions.append(quant_policy)
         tb_group._group_actions.append(model_format)
+        tb_group._group_actions.append(hf_overrides)
         ArgumentHelper.rope_scaling_factor(tb_group)
         ArgumentHelper.num_tokens_per_iter(tb_group)
         ArgumentHelper.max_prefill_iters(tb_group)
         ArgumentHelper.communicator(tb_group)
-        ArgumentHelper.hf_overrides(tb_group)
 
         # vlm args
         vision_group = parser.add_argument_group('Vision model arguments')
diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
index 2890638286..49763b78ea 100644
--- a/lmdeploy/turbomind/deploy/config.py
+++ b/lmdeploy/turbomind/deploy/config.py
@@ -176,6 +176,9 @@ def update_from_engine_config(self, config: TurbomindEngineConfig):
             rope_param.factor = config.rope_scaling_factor
             rope_param.max_position_embeddings = self.attention_config.max_position_embeddings
 
+            logger.warning(
+                '`--rope-scaling-factor` will be removed in a future release. Please use `--hf-overrides` instead.')
+
     @classmethod
     def from_dict(cls, config: dict = {}):
         """Construct TurbomindModelConfig instance from config in a dict."""
From 090c370d23072fd9192144855a4df92eb0f1198c Mon Sep 17 00:00:00 2001
From: zxy
Date: Tue, 15 Jul 2025 17:06:03 +0800
Subject: [PATCH 09/10] remove UT

---
 tests/test_lmdeploy/test_hf_overrides.py | 41 ------------------------
 1 file changed, 41 deletions(-)
 delete mode 100644 tests/test_lmdeploy/test_hf_overrides.py

diff --git a/tests/test_lmdeploy/test_hf_overrides.py b/tests/test_lmdeploy/test_hf_overrides.py
deleted file mode 100644
index 6425e9de37..0000000000
--- a/tests/test_lmdeploy/test_hf_overrides.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import pytest
-
-from lmdeploy import PytorchEngineConfig, TurbomindEngineConfig, pipeline
-
-
-@pytest.mark.parametrize('model_path', ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen3-30B-A3B'])
-def test_hf_overrides_turbomind(model_path):
-    # Define a custom rope_scaling configuration to override the model's default settings.
-    rope_scaling_override = {'rope_type': 'yarn', 'factor': 4.0, 'original_max_position_embeddings': 32768}
-    hf_overrides = {'rope_scaling': rope_scaling_override}
-
-    backend_config = TurbomindEngineConfig(hf_overrides=hf_overrides)
-    with pipeline(model_path, backend_config=backend_config) as pipe:
-        processed_config = pipe.engine.config.attention_config
-
-        assert getattr(processed_config, 'rope_param') is not None
-        for key, value in rope_scaling_override.items():
-            # Adjust key for compatibility with Turbomind's config
-            if key == 'rope_type':
-                key = 'type'
-            if key == 'original_max_position_embeddings':
-                key = 'max_position_embeddings'
-
-            assert getattr(processed_config.rope_param, key) == value, \
-                f'Expected {key} to be {value}, but got {getattr(processed_config.rope_param, key)}'
-
-
-@pytest.mark.parametrize('model_path', ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen3-30B-A3B'])
-def test_hf_overrides_pytorch(model_path):
-    # Define a custom rope_scaling configuration to override the model's default settings.
-    rope_scaling_override = {'rope_type': 'yarn', 'factor': 4.0, 'original_max_position_embeddings': 32768}
-    hf_overrides = {'rope_scaling': rope_scaling_override}
-
-    backend_config = PytorchEngineConfig(hf_overrides=hf_overrides)
-    with pipeline(model_path, backend_config=backend_config) as pipe:
-        processed_config = pipe.engine.get_model_config()
-
-        assert processed_config.hf_config.rope_scaling is not None
-        for key, value in rope_scaling_override.items():
-            assert processed_config.hf_config.rope_scaling.get(key) == value, \
-                f'Expected {key} to be {value}, but got {processed_config.hf_config.rope_scaling.get(key)}'
From 77cd89d196a980e5565e30f02d8891749be60c01 Mon Sep 17 00:00:00 2001
From: zxy
Date: Tue, 15 Jul 2025 17:38:06 +0800
Subject: [PATCH 10/10] fix UT

---
 lmdeploy/turbomind/deploy/config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
index 49763b78ea..b629927a8b 100644
--- a/lmdeploy/turbomind/deploy/config.py
+++ b/lmdeploy/turbomind/deploy/config.py
@@ -165,17 +165,18 @@ def update_from_engine_config(self, config: TurbomindEngineConfig):
                 rope_param.factor = override_params.get('factor', 1.0)
                 rope_param.max_position_embeddings = override_params.get('original_max_position_embeddings', None)
+                self.attention_config.rope_param = rope_param
 
             logger.warning(f'Overriding HF config with {hf_overrides}')
 
         # use dynamic ntk
         if config.rope_scaling_factor:
             # some ut will create empty RopeParam, will check base/dim in src code
             rope_param = self.attention_config.rope_param or RopeParam(type='', base=0, dim=0)
-
             rope_param.type = 'dynamic'
             rope_param.factor = config.rope_scaling_factor
             rope_param.max_position_embeddings = self.attention_config.max_position_embeddings
+            self.attention_config.rope_param = rope_param
 
             logger.warning(
                 '`--rope-scaling-factor` will be removed in a future release. Please use `--hf-overrides` instead.')
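
With the full series applied, a minimal end-to-end sketch of the feature; the model path is illustrative and the override dict mirrors the one the removed tests exercised:

    from lmdeploy import PytorchEngineConfig, pipeline

    hf_overrides = {'rope_scaling': {'rope_type': 'yarn', 'factor': 4.0, 'original_max_position_embeddings': 32768}}
    backend_config = PytorchEngineConfig(hf_overrides=hf_overrides)
    with pipeline('Qwen/Qwen2.5-7B-Instruct', backend_config=backend_config) as pipe:
        print(pipe('Hello'))

The same dict can be passed to TurbomindEngineConfig(hf_overrides=...), where only the rope_scaling entry is mapped onto the attention config.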