1 change: 1 addition & 0 deletions swift/megatron/model/constant.py
@@ -15,6 +15,7 @@ class MLLMMegatronModelType:
qwen3_vl = 'qwen3_vl'
qwen2_5_omni = 'qwen2_5_omni'
qwen3_omni = 'qwen3_omni'
qwen3_5 = 'qwen3_5'
ovis2_5 = 'ovis2_5'

internvl3 = 'internvl3'
2 changes: 1 addition & 1 deletion swift/megatron/model/gpt_bridge.py
@@ -713,7 +713,7 @@ def _get_hf_grouped(self):
if self.args.hf_model_type in {
'qwen2_moe', 'qwen3_moe', 'deepseek_v2', 'deepseek_v3', 'dots1', 'ernie4_5_moe', 'glm4_moe',
'glm4_moe_lite', 'glm4v_moe', 'minimax_m2', 'olmoe', 'qwen3_next', 'kimi_vl', 'qwen3_omni_moe',
'qwen3_vl_moe'
'qwen3_vl_moe', 'qwen3_5_moe'
}:
return False, False
return None, None
9 changes: 6 additions & 3 deletions swift/megatron/model/gpts/qwen3_next.py
@@ -13,6 +13,7 @@
from megatron.core.tensor_parallel import (gather_from_sequence_parallel_region,
reduce_scatter_to_sequence_parallel_region)
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.spec_utils import build_module
from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
from megatron.core.transformer.transformer_config import TransformerConfig
@@ -475,7 +476,7 @@ def forward(self, hidden_states: torch.Tensor, **kwargs):
return res, None


def get_qwen3_next_transformer_layer_spec(config, vp_stage=None):
def get_qwen3_next_transformer_layer_spec(config, vp_stage=None, gated_delta_net=None):
config.hetereogenous_dist_checkpoint = True
# compat Qwen3NextGatedDeltaNet
args = get_args()
@@ -500,16 +501,18 @@ def get_qwen3_next_transformer_layer_spec(config, vp_stage=None):
**kwargs,
)
layer_specs = []
gated_delta_net = gated_delta_net or Qwen3NextGatedDeltaNet
for layer_type in args.layer_types:
layer_spec = deepcopy(moe_layer_spec)
if layer_type == 'linear_attention':
layer_spec.submodules.self_attention.module = Qwen3NextGatedDeltaNet
layer_spec.submodules.self_attention.module = gated_delta_net
elif layer_type == 'full_attention':
layer_spec.submodules.self_attention.submodules.linear_qkv = TEColumnParallelLinear
layer_spec.submodules.self_attention.module = Qwen3NextSelfAttention
# Replace ALL layernorms with Qwen3NextRMSNorm (Zero-Centered)
layer_spec.submodules.input_layernorm = layer_norm_impl
if hasattr(layer_spec.submodules, 'pre_mlp_layernorm'):
if hasattr(layer_spec.submodules,
'pre_mlp_layernorm') and layer_spec.submodules.pre_mlp_layernorm is not IdentityOp:
layer_spec.submodules.pre_mlp_layernorm = layer_norm_impl
# Replace qk_layernorm if present
if hasattr(layer_spec.submodules.self_attention.submodules, 'q_layernorm'):
2 changes: 1 addition & 1 deletion swift/megatron/model/mm_gpts/__init__.py
@@ -1,2 +1,2 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
from . import glm, internvl, kimi_vl, llama4, qwen, qwen3_vl
from . import glm, internvl, kimi_vl, llama4, qwen, qwen3_5, qwen3_vl
105 changes: 105 additions & 0 deletions swift/megatron/model/mm_gpts/qwen3_5.py
@@ -0,0 +1,105 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
from functools import partial

import torch
from megatron.core.extensions.transformer_engine import _get_extra_te_kwargs
from megatron.core.models.huggingface import HuggingFaceModule as _HuggingFaceModule
from megatron.core.tensor_parallel import (gather_from_sequence_parallel_region,
reduce_scatter_to_sequence_parallel_region)
from megatron.core.transformer.attention import SelfAttentionSubmodules
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.training import get_args

from swift.model import ModelType
from swift.template import Template
from ..constant import MegatronModelType
from ..gpts.qwen3_next import Qwen3NextBridge, get_qwen3_next_mtp_block_spec, get_qwen3_next_transformer_layer_spec
from ..register import MegatronModelMeta, register_megatron_model
from .utils import HuggingFaceModule

try:
from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeGatedDeltaNet as _Qwen3_5MoeGatedDeltaNet
except ImportError:
_Qwen3_5MoeGatedDeltaNet = object


class Qwen3_5MoeGatedDeltaNet(_HuggingFaceModule, _Qwen3_5MoeGatedDeltaNet):

def __init__(self, config: TransformerConfig, submodules: SelfAttentionSubmodules, layer_number: int, **kwargs):
assert config.context_parallel_size == 1, 'Qwen3_5 currently does not support context parallel.'
assert _Qwen3_5MoeGatedDeltaNet is not object, 'please update the `transformers` version.'
_Qwen3_5MoeGatedDeltaNet.__init__(self, config, layer_number)
self.config = config
extra_kwargs = _get_extra_te_kwargs(config)
self.to(dtype=extra_kwargs['params_dtype'], device=extra_kwargs['device'])

def forward(self, hidden_states: torch.Tensor, **kwargs):
args = get_args()
if args.sequence_parallel and args.tensor_model_parallel_size > 1:
hidden_states = gather_from_sequence_parallel_region(hidden_states)
seq_len = hidden_states.shape[0]
packed_seq_params = kwargs.get('packed_seq_params')
thd_format = packed_seq_params is not None and packed_seq_params.qkv_format == 'thd'
# Note: when packing is enabled, the packed (padding-free) sequence is not unpadded into per-sample batches,
# so different sequences within a pack remain visible to each other; we accept this for efficiency.
if thd_format and not args.packing:
new_hidden_states = hidden_states.new_zeros(
(packed_seq_params.num_samples, packed_seq_params.max_seqlen_q.item(), hidden_states.shape[-1]))
attention_mask = hidden_states.new_zeros(
(packed_seq_params.num_samples, packed_seq_params.max_seqlen_q.item()), dtype=torch.bool)
cu_seqlens_q = packed_seq_params.cu_seqlens_q
for i in range(packed_seq_params.num_samples):
start, end = cu_seqlens_q[i], cu_seqlens_q[i + 1]
attention_mask[i, :end - start] = True
new_hidden_states[i, :end - start] = hidden_states[start:end, 0]
hidden_states = new_hidden_states
else:
hidden_states = hidden_states.transpose(0, 1)
attention_mask = kwargs.get('attention_mask')
if attention_mask is not None:
attention_mask = (~attention_mask).sum(dim=(1, 2)) > 0
res = super().forward(hidden_states=hidden_states, attention_mask=attention_mask)
if thd_format and not args.packing:
res = res[attention_mask][:, None]
res = torch.concat([res, res.new_zeros(seq_len - res.shape[0], 1, res.shape[2])])
else:
res = res.transpose(0, 1)
if args.sequence_parallel and args.tensor_model_parallel_size > 1:
# Quick fix for dropout issue, awaiting ms-swift 4.0 refactor
res = reduce_scatter_to_sequence_parallel_region(res) / args.tensor_model_parallel_size
return res, None


class Qwen3_5Vit(HuggingFaceModule):
module_mapping = {'model.visual': 'visual'}
_vision_tower = ['visual']
_aligner = ['visual.merger']

def __init__(self, config):
from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5TextModel
from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeTextModel
super().__init__(config, [Qwen3_5TextModel, Qwen3_5MoeTextModel])

def get_inputs_embeds(self, inputs_embeds, **kwargs):
return Template._get_inputs_embeds_hf(inputs_embeds, kwargs, self.visual, self.processor, self.model_config)


class Qwen3_5Bridge(Qwen3NextBridge):
hf_layers_prefix = 'model.language_model.layers'
hf_embed_key = 'model.language_model.embed_tokens.weight'
hf_final_layernorm_key = 'model.language_model.norm.weight'


register_megatron_model(
MegatronModelMeta(
MegatronModelType.qwen3_5,
[
ModelType.qwen3_5,
ModelType.qwen3_5_moe,
],
get_transformer_layer_spec=partial(
get_qwen3_next_transformer_layer_spec, gated_delta_net=Qwen3_5MoeGatedDeltaNet),
get_mtp_block_spec=get_qwen3_next_mtp_block_spec,
bridge_cls=Qwen3_5Bridge,
visual_cls=Qwen3_5Vit,
))
13 changes: 7 additions & 6 deletions swift/megatron/utils/config.py
@@ -106,13 +106,14 @@ def convert_hf_config(config) -> Dict[str, Any]:
interleave_moe_layer_step = res.pop('interleave_moe_layer_step', None)
window_size = res.pop('window_size', None)
rope_scaling = res.get('rope_scaling') or {}
if llm_model_type in {'qwen3', 'qwen3_moe', 'qwen3_next'
} or hf_model_type in {'qwen3_omni_moe', 'qwen3_omni', 'qwen3_vl', 'qwen3_vl_moe'}:
if llm_model_type in {'qwen3', 'qwen3_moe', 'qwen3_next'} or hf_model_type in {
'qwen3_omni_moe', 'qwen3_omni', 'qwen3_vl', 'qwen3_vl_moe', 'qwen3_5', 'qwen3_5_moe'
}:
res['qk_layernorm'] = True
if llm_model_type in {'qwen2_moe', 'qwen3_moe', 'qwen3_next'
} or hf_model_type in {'qwen3_omni_moe', 'qwen3_vl_moe'}:
} or hf_model_type in {'qwen3_omni_moe', 'qwen3_vl_moe', 'qwen3_5_moe'}:
res.pop('ffn_hidden_size', None)
if llm_model_type in {'qwen2_moe', 'qwen3_next'}:
if llm_model_type in {'qwen2_moe', 'qwen3_next'} or hf_model_type == 'qwen3_5_moe':
res['use_shared_expert_gate'] = True
if llm_model_type in {
'deepseek',
@@ -156,8 +157,8 @@ def convert_hf_config(config) -> Dict[str, Any]:
if llm_model_type == 'glm4_moe_lite':
res['qk_layernorm'] = True
res.pop('num_query_groups', None)
elif llm_model_type == 'qwen3_next':
full_attention_interval = res.pop('full_attention_interval')
elif llm_model_type == 'qwen3_next' or hf_model_type in {'qwen3_5', 'qwen3_5_moe'}:
full_attention_interval = res.pop('full_attention_interval', 4)
num_layers = res['num_layers']
res['layer_types'] = [
'full_attention' if (i + 1) % full_attention_interval == 0 else 'linear_attention'
3 changes: 3 additions & 0 deletions swift/model/constant.py
@@ -144,6 +144,9 @@ class MLLMModelType:
qwen3_vl_moe = 'qwen3_vl_moe'
qwen3_vl_emb = 'qwen3_vl_emb'
qwen3_vl_reranker = 'qwen3_vl_reranker'
qwen3_5 = 'qwen3_5'
qwen3_5_moe = 'qwen3_5_moe'

qwen2_gme = 'qwen2_gme'
ovis1_6 = 'ovis1_6'
ovis2 = 'ovis2'
40 changes: 40 additions & 0 deletions swift/model/models/qwen.py
@@ -1111,6 +1111,46 @@ def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrainedModel:
tags=['vision', 'video']))


class Qwen3_5MoeLoader(Qwen3VLLoader):

def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrainedModel:
from transformers import Qwen3_5MoeForConditionalGeneration
self.auto_model_cls = self.auto_model_cls or Qwen3_5MoeForConditionalGeneration
return Qwen2VLLoader.get_model(self, model_dir, config, processor, model_kwargs)
Comment on lines +1114 to +1119 (Contributor review, severity: medium):
The get_model method in Qwen3_5MoeLoader bypasses its parent Qwen3VLLoader and directly calls Qwen2VLLoader.get_model. While this is functionally correct, it makes the inheritance hierarchy confusing. For better clarity and maintainability, consider refactoring the class hierarchy. For instance, Qwen3_5MoeLoader could inherit from Qwen2VLLoader and override _check_qwen_vl_utils if needed.
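
A minimal sketch of the suggested refactor (hypothetical, not part of this PR; it assumes Qwen2VLLoader's get_model has the signature shown above, and an override of `_check_qwen_vl_utils` would only be added if its default behavior does not fit):

```python
# Hypothetical sketch of the reviewer's suggestion: inherit from Qwen2VLLoader directly,
# so the class whose get_model is actually used is also the parent class.
class Qwen3_5MoeLoader(Qwen2VLLoader):

    def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrainedModel:
        from transformers import Qwen3_5MoeForConditionalGeneration
        self.auto_model_cls = self.auto_model_cls or Qwen3_5MoeForConditionalGeneration
        # No intermediate class is bypassed; Qwen2VLLoader.get_model is reached via super().
        return super().get_model(model_dir, config, processor, model_kwargs)
```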



register_model(
ModelMeta(
MLLMModelType.qwen3_5_moe, [
ModelGroup([], TemplateType.qwen3_5),
],
Qwen3_5MoeLoader,
model_arch=ModelArch.qwen2_vl,
architectures=['Qwen3_5MoeForConditionalGeneration'],
requires=['transformers>=5.0.0.dev', 'qwen_vl_utils>=0.0.14', 'decord'],
tags=['vision', 'video']))


class Qwen3_5Loader(Qwen3VLLoader):

def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrainedModel:
from transformers import Qwen3_5ForConditionalGeneration
self.auto_model_cls = self.auto_model_cls or Qwen3_5ForConditionalGeneration
return Qwen2VLLoader.get_model(self, model_dir, config, processor, model_kwargs)


register_model(
ModelMeta(
MLLMModelType.qwen3_5, [
ModelGroup([], TemplateType.qwen3_5),
],
Qwen3_5Loader,
model_arch=ModelArch.qwen2_vl,
architectures=['Qwen3_5ForConditionalGeneration'],
requires=['transformers>=5.0.0.dev', 'qwen_vl_utils>=0.0.14', 'decord'],
tags=['vision', 'video']))


class Qwen2_5OmniLoader(ModelLoader):

def get_config(self, model_dir):
2 changes: 2 additions & 0 deletions swift/template/constant.py
@@ -134,6 +134,8 @@ class MLLMTemplateType:
qwen3_vl = 'qwen3_vl'
qwen3_vl_emb = 'qwen3_vl_emb'
qwen3_vl_reranker = 'qwen3_vl_reranker'
qwen3_5 = 'qwen3_5'

qwen2_gme = 'qwen2_gme'
qvq = 'qvq'
ovis1_6 = 'ovis1_6'
13 changes: 13 additions & 0 deletions swift/template/templates/qwen.py
@@ -552,6 +552,19 @@ def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
MLLMTemplateType.qwen3_vl, template_cls=Qwen3VLTemplate, default_system=None, thinking_prefix='<think>\n'))


class Qwen3_5Template(Qwen3VLTemplate):
image_token_id = 248056
video_token_id = 248057

def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
return Qwen2VLTemplate._post_encode(self, model, inputs)


register_template(
QwenTemplateMeta(
MLLMTemplateType.qwen3_5, template_cls=Qwen3_5Template, default_system=None, thinking_prefix='<think>\n'))


class Qwen3VLEmbTemplate(Qwen3VLTemplate):

def _preprocess_inputs(self, inputs: StdTemplateInputs) -> None: