Fallback to pytorch engine when the model is quantized by smooth quant #2953

Merged 7 commits on Dec 26, 2024
2 changes: 1 addition & 1 deletion autotest/utils/config_utils.py
@@ -97,7 +97,7 @@ def get_all_model_list(tp_num: int = None,
                                 model_type=model_type):
        if case not in case_list:
            case_list.append(case)
-     return [x for x in case_list if 'w8a8' not in x]
+     return case_list


def get_quantization_model_list(type):
18 changes: 18 additions & 0 deletions lmdeploy/archs.py
@@ -193,3 +193,21 @@ def get_model_arch(model_path: str):
        raise RuntimeError(
            f'Could not find model architecture from config: {_cfg}')
    return arch, cfg


def get_quantization_config(config):
    """Get the quantization config from a model's config."""
    if isinstance(config, dict):
        for k, v in config.items():
            if k == 'quantization_config':
                return v
            if isinstance(v, (dict, list)):
                result = get_quantization_config(v)
                if result is not None:
                    return result
    elif isinstance(config, list):
        for item in config:
            result = get_quantization_config(item)
            if result is not None:
                return result
    return None
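
A minimal usage sketch of the new helper. The nested cfg dict below is hypothetical, shaped like a multimodal config that buries quantization_config inside a sub-config, which is why the helper recurses through dicts and lists:

from lmdeploy.archs import get_quantization_config

# Hypothetical nested config; in practice it comes from
# get_model_arch(model_path)[1].to_dict().
cfg = {
    'model_type': 'llava',
    'text_config': {
        'hidden_size': 4096,
        'quantization_config': {'quant_method': 'smooth_quant'},
    },
}

assert get_quantization_config(cfg) == {'quant_method': 'smooth_quant'}
assert get_quantization_config({'model_type': 'llama'}) is None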
22 changes: 2 additions & 20 deletions lmdeploy/turbomind/deploy/converter.py
@@ -6,7 +6,7 @@
import fire
import torch

- from lmdeploy.archs import get_model_arch
+ from lmdeploy.archs import get_model_arch, get_quantization_config
from lmdeploy.messages import TurbomindEngineConfig
from lmdeploy.model import MODELS, best_match_model
from lmdeploy.utils import get_logger, get_model
@@ -174,23 +174,6 @@ def pack_model_repository(workspace_path: str):
dst=osp.join(model_repo_dir, 'postprocessing'))


- def find_quantization_config(nested, target_key):
-     if isinstance(nested, dict):
-         for key, value in nested.items():
-             if key == target_key:
-                 return value
-             if isinstance(value, (dict, list)):
-                 result = find_quantization_config(value, target_key)
-                 if result is not None:
-                     return result
-     elif isinstance(nested, list):
-         for item in nested:
-             result = find_quantization_config(item, target_key)
-             if result is not None:
-                 return result
-     return None


def get_tm_model(model_path,
                 model_name,
                 chat_template_name,
@@ -213,8 +196,7 @@ def get_tm_model(model_path,
            If it is None, the turbomind model won't be saved
    """
    _, cfg = get_model_arch(model_path)
-     quant_config = find_quantization_config(cfg.to_dict(),
-                                             'quantization_config')
+     quant_config = get_quantization_config(cfg.to_dict())
    if quant_config:
        quant_method = quant_config.get('quant_method')
        _group_size = int(quant_config.get('group_size', 0))
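
For context, the converter only consumes two keys from whatever get_quantization_config returns. The quantization_config below is hypothetical (shaped like a typical AWQ entry in a model's config.json); 'quant_method' and 'group_size' are exactly the fields read above:

# Hypothetical quantization_config from a model's config.json.
quant_config = {'quant_method': 'awq', 'group_size': 128, 'bits': 4}

quant_method = quant_config.get('quant_method')      # e.g. 'awq'
group_size = int(quant_config.get('group_size', 0))  # defaults to 0 if absent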
8 changes: 7 additions & 1 deletion lmdeploy/turbomind/supported_models.py
@@ -1,5 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
- from lmdeploy.archs import get_model_arch
+ from lmdeploy.archs import get_model_arch, get_quantization_config
from lmdeploy.utils import get_logger

logger = get_logger('lmdeploy')
@@ -80,7 +80,13 @@ def _is_head_dim_supported(cfg):
    if os.path.exists(triton_model_path):
        support_by_turbomind = True
    else:
        arch, cfg = get_model_arch(model_path)
        quant_config = get_quantization_config(cfg.to_dict())
        if (quant_config
                and quant_config.get('quant_method') in ['smooth_quant']):
            # turbomind doesn't support models quantized by smooth quant yet
            return False

        if arch in SUPPORTED_ARCHS.keys():
            support_by_turbomind = True
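
Tying this back to the PR title: a hedged sketch of the fallback this check enables, assuming the enclosing function above is this module's is_supported helper; choose_backend is an illustrative caller, not lmdeploy API:

from lmdeploy.turbomind.supported_models import is_supported

def choose_backend(model_path: str) -> str:
    # Smooth-quant checkpoints make is_supported return False, so the
    # caller falls back to the pytorch engine instead of turbomind.
    return 'turbomind' if is_supported(model_path) else 'pytorch'

print(choose_backend('/path/to/smooth-quant-model'))  # -> 'pytorch'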