Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions swift/megatron/model/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class MLLMMegatronModelType:
qwen3_vl = 'qwen3_vl'
qwen2_5_omni = 'qwen2_5_omni'
qwen3_omni = 'qwen3_omni'
qwen3_5 = 'qwen3_5'
ovis2_5 = 'ovis2_5'

internvl3 = 'internvl3'
Expand Down
2 changes: 1 addition & 1 deletion swift/megatron/model/gpt_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,7 +713,7 @@ def _get_hf_grouped(self):
if self.args.hf_model_type in {
'qwen2_moe', 'qwen3_moe', 'deepseek_v2', 'deepseek_v3', 'dots1', 'ernie4_5_moe', 'glm4_moe',
'glm4_moe_lite', 'glm4v_moe', 'minimax_m2', 'olmoe', 'qwen3_next', 'kimi_vl', 'qwen3_omni_moe',
'qwen3_vl_moe'
'qwen3_vl_moe', 'qwen3_5_moe'
}:
return False, False
return None, None
Expand Down
9 changes: 6 additions & 3 deletions swift/megatron/model/gpts/qwen3_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from megatron.core.tensor_parallel import (gather_from_sequence_parallel_region,
reduce_scatter_to_sequence_parallel_region)
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.spec_utils import build_module
from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
from megatron.core.transformer.transformer_config import TransformerConfig
Expand Down Expand Up @@ -475,7 +476,7 @@ def forward(self, hidden_states: torch.Tensor, **kwargs):
return res, None


def get_qwen3_next_transformer_layer_spec(config, vp_stage=None):
def get_qwen3_next_transformer_layer_spec(config, vp_stage=None, gated_delta_net=None):
config.hetereogenous_dist_checkpoint = True
# compat Qwen3NextGatedDeltaNet
args = get_args()
Expand All @@ -500,16 +501,18 @@ def get_qwen3_next_transformer_layer_spec(config, vp_stage=None):
**kwargs,
)
layer_specs = []
gated_delta_net = gated_delta_net or Qwen3NextGatedDeltaNet
for layer_type in args.layer_types:
layer_spec = deepcopy(moe_layer_spec)
if layer_type == 'linear_attention':
layer_spec.submodules.self_attention.module = Qwen3NextGatedDeltaNet
layer_spec.submodules.self_attention.module = gated_delta_net
elif layer_type == 'full_attention':
layer_spec.submodules.self_attention.submodules.linear_qkv = TEColumnParallelLinear
layer_spec.submodules.self_attention.module = Qwen3NextSelfAttention
# Replace ALL layernorms with Qwen3NextRMSNorm (Zero-Centered)
layer_spec.submodules.input_layernorm = layer_norm_impl
if hasattr(layer_spec.submodules, 'pre_mlp_layernorm'):
if hasattr(layer_spec.submodules,
'pre_mlp_layernorm') and layer_spec.submodules.pre_mlp_layernorm is not IdentityOp:
layer_spec.submodules.pre_mlp_layernorm = layer_norm_impl
# Replace qk_layernorm if present
if hasattr(layer_spec.submodules.self_attention.submodules, 'q_layernorm'):
Expand Down
2 changes: 1 addition & 1 deletion swift/megatron/model/mm_gpts/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
from . import glm, internvl, kimi_vl, llama4, qwen, qwen3_vl
from . import glm, internvl, kimi_vl, llama4, qwen, qwen3_5, qwen3_vl
105 changes: 105 additions & 0 deletions swift/megatron/model/mm_gpts/qwen3_5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
from functools import partial

import torch
from megatron.core.extensions.transformer_engine import _get_extra_te_kwargs
from megatron.core.models.huggingface import HuggingFaceModule as _HuggingFaceModule
from megatron.core.tensor_parallel import (gather_from_sequence_parallel_region,
reduce_scatter_to_sequence_parallel_region)
from megatron.core.transformer.attention import SelfAttentionSubmodules
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.training import get_args

from swift.model import ModelType
from swift.template import Template
from ..constant import MegatronModelType
from ..gpts.qwen3_next import Qwen3NextBridge, get_qwen3_next_mtp_block_spec, get_qwen3_next_transformer_layer_spec
from ..register import MegatronModelMeta, register_megatron_model
from .utils import HuggingFaceModule

try:
from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeGatedDeltaNet as _Qwen3_5MoeGatedDeltaNet
except ImportError:
_Qwen3_5MoeGatedDeltaNet = object


class Qwen3_5MoeGatedDeltaNet(_HuggingFaceModule, _Qwen3_5MoeGatedDeltaNet):
    """Runs the `transformers` Qwen3.5-MoE gated delta-net as a Megatron self-attention module.

    ``forward`` treats dim 0 of ``hidden_states`` as the sequence axis, rearranges the input
    into the layout handed to the parent ``forward``, and restores the layout afterwards.
    """

    def __init__(self, config: TransformerConfig, submodules: SelfAttentionSubmodules, layer_number: int, **kwargs):
        """Initialise the wrapped `transformers` module and cast/move it per Transformer Engine kwargs.

        Args:
            config: Megatron transformer config; ``context_parallel_size`` must equal 1.
            submodules: Not used by this implementation.
            layer_number: Passed through to the `transformers` constructor.
            **kwargs: Not used by this implementation.

        Raises:
            AssertionError: If context parallelism is enabled, or if the `transformers`
                class could not be imported (the module-level fallback is ``object``).
        """
        assert config.context_parallel_size == 1, 'Qwen3_5 currently does not support context parallel.'
        assert _Qwen3_5MoeGatedDeltaNet is not object, 'please update the `transformers` version.'
        _Qwen3_5MoeGatedDeltaNet.__init__(self, config, layer_number)
        self.config = config
        te_kwargs = _get_extra_te_kwargs(config)
        self.to(dtype=te_kwargs['params_dtype'], device=te_kwargs['device'])

    def forward(self, hidden_states: torch.Tensor, **kwargs):
        """Apply the gated delta-net and return ``(output, None)``.

        Args:
            hidden_states: Input tensor whose dim 0 is the sequence axis
                (assumed ``(seq, batch, hidden)`` -- TODO confirm against the caller).
            **kwargs: ``packed_seq_params`` and ``attention_mask`` are read when present.

        Returns:
            A tuple of the output tensor, in the same layout convention as the input, and ``None``.
        """
        args = get_args()
        use_sp = args.sequence_parallel and args.tensor_model_parallel_size > 1
        if use_sp:
            hidden_states = gather_from_sequence_parallel_region(hidden_states)
        seq_len = hidden_states.shape[0]
        packed_seq_params = kwargs.get('packed_seq_params')
        thd_format = packed_seq_params is not None and packed_seq_params.qkv_format == 'thd'
        # Note: for packed inputs, we do not perform padding_free unpadding.
        # Doing so would allow different sequences to see each other; for efficiency we keep this implementation.
        unpad = thd_format and not args.packing
        if unpad:
            # Scatter the flattened samples into a zero-padded (num_samples, max_len, hidden) batch
            # and build the matching boolean validity mask.
            num_samples = packed_seq_params.num_samples
            max_len = packed_seq_params.max_seqlen_q.item()
            padded = hidden_states.new_zeros((num_samples, max_len, hidden_states.shape[-1]))
            attention_mask = hidden_states.new_zeros((num_samples, max_len), dtype=torch.bool)
            cu_seqlens_q = packed_seq_params.cu_seqlens_q
            for i in range(num_samples):
                start, end = cu_seqlens_q[i], cu_seqlens_q[i + 1]
                length = end - start
                attention_mask[i, :length] = True
                padded[i, :length] = hidden_states[start:end, 0]
            hidden_states = padded
        else:
            hidden_states = hidden_states.transpose(0, 1)
            attention_mask = kwargs.get('attention_mask')
            if attention_mask is not None:
                # Reduce the inverted mask over dims 1 and 2 to a boolean mask.
                attention_mask = (~attention_mask).sum(dim=(1, 2)) > 0
        res = super().forward(hidden_states=hidden_states, attention_mask=attention_mask)
        if unpad:
            # Keep only the valid positions, then zero-pad back to the original sequence length.
            res = res[attention_mask][:, None]
            tail = res.new_zeros(seq_len - res.shape[0], 1, res.shape[2])
            res = torch.concat([res, tail])
        else:
            res = res.transpose(0, 1)
        if use_sp:
            # Quick fix for dropout issue, awaiting ms-swift 4.0 refactor
            res = reduce_scatter_to_sequence_parallel_region(res) / args.tensor_model_parallel_size
        return res, None


class Qwen3_5Vit(HuggingFaceModule):
    """Vision-side module for Qwen3.5, built on the project's ``HuggingFaceModule`` base."""

    # Maps the 'model.visual' path to the local attribute name 'visual'.
    module_mapping = {'model.visual': 'visual'}
    _vision_tower = ['visual']
    _aligner = ['visual.merger']

    def __init__(self, config):
        """Construct the module, handing both Qwen3.5 text-model classes to the base class.

        NOTE(review): the meaning of the class list is defined by ``HuggingFaceModule.__init__``,
        which is not visible here -- confirm against that base class.
        """
        # Imported inside the method, so `transformers` Qwen3.5 support is only needed on instantiation.
        from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5TextModel
        from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeTextModel
        text_model_classes = [Qwen3_5TextModel, Qwen3_5MoeTextModel]
        super().__init__(config, text_model_classes)

    def get_inputs_embeds(self, inputs_embeds, **kwargs):
        """Delegate to ``Template._get_inputs_embeds_hf`` using this module's visual tower, processor and config."""
        return Template._get_inputs_embeds_hf(
            inputs_embeds,
            kwargs,
            self.visual,
            self.processor,
            self.model_config,
        )


class Qwen3_5Bridge(Qwen3NextBridge):
    """Qwen3-Next bridge variant whose HF weight keys live under ``model.language_model``."""

    # Embedding and final-norm weight keys.
    hf_embed_key = 'model.language_model.embed_tokens.weight'
    hf_final_layernorm_key = 'model.language_model.norm.weight'
    # Prefix of the per-layer weights.
    hf_layers_prefix = 'model.language_model.layers'


# Register the qwen3_5 Megatron model type, listing ModelType.qwen3_5 and ModelType.qwen3_5_moe.
register_megatron_model(
    MegatronModelMeta(
        MegatronModelType.qwen3_5,
        [
            ModelType.qwen3_5,
            ModelType.qwen3_5_moe,
        ],
        # Reuse the Qwen3-Next layer-spec builder, binding its `gated_delta_net` argument
        # to Qwen3_5MoeGatedDeltaNet.
        get_transformer_layer_spec=partial(
            get_qwen3_next_transformer_layer_spec, gated_delta_net=Qwen3_5MoeGatedDeltaNet),
        get_mtp_block_spec=get_qwen3_next_mtp_block_spec,
        bridge_cls=Qwen3_5Bridge,
        visual_cls=Qwen3_5Vit,
    ))
13 changes: 7 additions & 6 deletions swift/megatron/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,13 +106,14 @@ def convert_hf_config(config) -> Dict[str, Any]:
interleave_moe_layer_step = res.pop('interleave_moe_layer_step', None)
window_size = res.pop('window_size', None)
rope_scaling = res.get('rope_scaling') or {}
if llm_model_type in {'qwen3', 'qwen3_moe', 'qwen3_next'
} or hf_model_type in {'qwen3_omni_moe', 'qwen3_omni', 'qwen3_vl', 'qwen3_vl_moe'}:
if llm_model_type in {'qwen3', 'qwen3_moe', 'qwen3_next'} or hf_model_type in {
'qwen3_omni_moe', 'qwen3_omni', 'qwen3_vl', 'qwen3_vl_moe', 'qwen3_5', 'qwen3_5_moe'
}:
res['qk_layernorm'] = True
if llm_model_type in {'qwen2_moe', 'qwen3_moe', 'qwen3_next'
} or hf_model_type in {'qwen3_omni_moe', 'qwen3_vl_moe'}:
} or hf_model_type in {'qwen3_omni_moe', 'qwen3_vl_moe', 'qwen3_5_moe'}:
res.pop('ffn_hidden_size', None)
if llm_model_type in {'qwen2_moe', 'qwen3_next'}:
if llm_model_type in {'qwen2_moe', 'qwen3_next'} or hf_model_type == 'qwen3_5_moe':
res['use_shared_expert_gate'] = True
if llm_model_type in {
'deepseek',
Expand Down Expand Up @@ -156,8 +157,8 @@ def convert_hf_config(config) -> Dict[str, Any]:
if llm_model_type == 'glm4_moe_lite':
res['qk_layernorm'] = True
res.pop('num_query_groups', None)
elif llm_model_type == 'qwen3_next':
full_attention_interval = res.pop('full_attention_interval')
elif llm_model_type == 'qwen3_next' or hf_model_type in {'qwen3_5', 'qwen3_5_moe'}:
full_attention_interval = res.pop('full_attention_interval', 4)
num_layers = res['num_layers']
res['layer_types'] = [
'full_attention' if (i + 1) % full_attention_interval == 0 else 'linear_attention'
Expand Down
3 changes: 3 additions & 0 deletions swift/model/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ class MLLMModelType:
qwen3_vl_moe = 'qwen3_vl_moe'
qwen3_vl_emb = 'qwen3_vl_emb'
qwen3_vl_reranker = 'qwen3_vl_reranker'
qwen3_5 = 'qwen3_5'
qwen3_5_moe = 'qwen3_5_moe'

qwen2_gme = 'qwen2_gme'
ovis1_6 = 'ovis1_6'
ovis2 = 'ovis2'
Expand Down
40 changes: 40 additions & 0 deletions swift/model/models/qwen.py
Original file line number Diff line number Diff line change
Expand Up @@ -1111,6 +1111,46 @@ def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrain
tags=['vision', 'video']))


class Qwen3_5MoeLoader(Qwen3VLLoader):
    """Model loader for ``Qwen3_5MoeForConditionalGeneration`` checkpoints."""

    def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrainedModel:
        """Load the model, defaulting ``auto_model_cls`` to the Qwen3.5-MoE generation class.

        An ``auto_model_cls`` that is already set (truthy) is kept as-is.
        """
        # Imported inside the method, so the required `transformers` version is only needed at load time.
        from transformers import Qwen3_5MoeForConditionalGeneration as default_model_cls
        self.auto_model_cls = self.auto_model_cls or default_model_cls
        # Calls Qwen2VLLoader.get_model explicitly rather than super().get_model,
        # bypassing whatever Qwen3VLLoader defines for this method.
        return Qwen2VLLoader.get_model(self, model_dir, config, processor, model_kwargs)
Comment on lines +1114 to +1119

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The get_model method in Qwen3_5MoeLoader bypasses its parent Qwen3VLLoader and directly calls Qwen2VLLoader.get_model. While this is functionally correct, it makes the inheritance hierarchy confusing. For better clarity and maintainability, consider refactoring the class hierarchy. For instance, Qwen3_5MoeLoader could inherit from Qwen2VLLoader and override _check_qwen_vl_utils if needed.



# Register the qwen3_5_moe model type with the qwen3_5 template and Qwen3_5MoeLoader.
register_model(
    ModelMeta(
        MLLMModelType.qwen3_5_moe, [
            # The model-id list is empty in this revision.
            ModelGroup([], TemplateType.qwen3_5),
        ],
        Qwen3_5MoeLoader,
        model_arch=ModelArch.qwen2_vl,
        architectures=['Qwen3_5MoeForConditionalGeneration'],
        requires=['transformers>=5.0.0.dev', 'qwen_vl_utils>=0.0.14', 'decord'],
        tags=['vision', 'video']))


class Qwen3_5Loader(Qwen3VLLoader):
    """Model loader for ``Qwen3_5ForConditionalGeneration`` checkpoints."""

    def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrainedModel:
        """Load the model, defaulting ``auto_model_cls`` to the Qwen3.5 generation class.

        An ``auto_model_cls`` that is already set (truthy) is kept as-is.
        """
        # Imported inside the method, so the required `transformers` version is only needed at load time.
        from transformers import Qwen3_5ForConditionalGeneration as default_model_cls
        self.auto_model_cls = self.auto_model_cls or default_model_cls
        # Calls Qwen2VLLoader.get_model explicitly rather than super().get_model,
        # bypassing whatever Qwen3VLLoader defines for this method.
        return Qwen2VLLoader.get_model(self, model_dir, config, processor, model_kwargs)
Comment thread
Jintao-Huang marked this conversation as resolved.


# Register the qwen3_5 model type with the qwen3_5 template and Qwen3_5Loader.
register_model(
    ModelMeta(
        MLLMModelType.qwen3_5, [
            # The model-id list is empty in this revision.
            ModelGroup([], TemplateType.qwen3_5),
        ],
        Qwen3_5Loader,
        model_arch=ModelArch.qwen2_vl,
        architectures=['Qwen3_5ForConditionalGeneration'],
        requires=['transformers>=5.0.0.dev', 'qwen_vl_utils>=0.0.14', 'decord'],
        tags=['vision', 'video']))


class Qwen2_5OmniLoader(ModelLoader):

def get_config(self, model_dir):
Expand Down
2 changes: 2 additions & 0 deletions swift/template/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ class MLLMTemplateType:
qwen3_vl = 'qwen3_vl'
qwen3_vl_emb = 'qwen3_vl_emb'
qwen3_vl_reranker = 'qwen3_vl_reranker'
qwen3_5 = 'qwen3_5'

qwen2_gme = 'qwen2_gme'
qvq = 'qvq'
ovis1_6 = 'ovis1_6'
Expand Down
13 changes: 13 additions & 0 deletions swift/template/templates/qwen.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,19 @@ def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
MLLMTemplateType.qwen3_vl, template_cls=Qwen3VLTemplate, default_system=None, thinking_prefix='<think>\n'))


class Qwen3_5Template(Qwen3VLTemplate):
    """Qwen3-VL template variant with its own image/video token ids and Qwen2-VL post-encoding."""

    # NOTE(review): presumably the image/video placeholder ids of the Qwen3.5 tokenizer -- confirm
    # against the checkpoint's config.
    image_token_id = 248056
    video_token_id = 248057

    def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Delegate to ``Qwen2VLTemplate._post_encode`` instead of the direct parent's method."""
        encoded = Qwen2VLTemplate._post_encode(self, model, inputs)
        return encoded
Comment thread
Jintao-Huang marked this conversation as resolved.


# Register the qwen3_5 template type with Qwen3_5Template; no default system prompt is set,
# and '<think>\n' is supplied as the thinking prefix.
register_template(
    QwenTemplateMeta(
        MLLMTemplateType.qwen3_5, template_cls=Qwen3_5Template, default_system=None, thinking_prefix='<think>\n'))


class Qwen3VLEmbTemplate(Qwen3VLTemplate):

def _preprocess_inputs(self, inputs: StdTemplateInputs) -> None:
Expand Down
2 changes: 1 addition & 1 deletion swift/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def format_time(seconds):
days = int(seconds // (24 * 3600))
hours = int((seconds % (24 * 3600)) // 3600)
minutes = int((seconds % 3600) // 60)
seconds = round(seconds % 60, 2)
seconds = int(seconds % 60)
Comment thread
Jintao-Huang marked this conversation as resolved.
Outdated

if days > 0:
time_str = f'{days}d {hours}h {minutes}m {seconds}s'
Expand Down
Loading