diff --git a/swift/megatron/model/constant.py b/swift/megatron/model/constant.py
index eaff0a9ae6..d91d9fa9b1 100644
--- a/swift/megatron/model/constant.py
+++ b/swift/megatron/model/constant.py
@@ -15,6 +15,7 @@ class MLLMMegatronModelType:
     qwen3_vl = 'qwen3_vl'
     qwen2_5_omni = 'qwen2_5_omni'
     qwen3_omni = 'qwen3_omni'
+    qwen3_5 = 'qwen3_5'
     ovis2_5 = 'ovis2_5'
     internvl3 = 'internvl3'
diff --git a/swift/megatron/model/gpt_bridge.py b/swift/megatron/model/gpt_bridge.py
index 02104507bc..c9b9830012 100644
--- a/swift/megatron/model/gpt_bridge.py
+++ b/swift/megatron/model/gpt_bridge.py
@@ -713,7 +713,7 @@ def _get_hf_grouped(self):
         if self.args.hf_model_type in {
                 'qwen2_moe', 'qwen3_moe', 'deepseek_v2', 'deepseek_v3', 'dots1', 'ernie4_5_moe', 'glm4_moe',
                 'glm4_moe_lite', 'glm4v_moe', 'minimax_m2', 'olmoe', 'qwen3_next', 'kimi_vl', 'qwen3_omni_moe',
-                'qwen3_vl_moe'
+                'qwen3_vl_moe', 'qwen3_5_moe'
         }:
             return False, False
         return None, None
diff --git a/swift/megatron/model/gpts/qwen3_next.py b/swift/megatron/model/gpts/qwen3_next.py
index 36beebbf45..8cf00b4592 100644
--- a/swift/megatron/model/gpts/qwen3_next.py
+++ b/swift/megatron/model/gpts/qwen3_next.py
@@ -13,6 +13,7 @@
 from megatron.core.tensor_parallel import (gather_from_sequence_parallel_region,
                                            reduce_scatter_to_sequence_parallel_region)
 from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
+from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.spec_utils import build_module
 from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
 from megatron.core.transformer.transformer_config import TransformerConfig
@@ -475,7 +476,7 @@ def forward(self, hidden_states: torch.Tensor, **kwargs):
         return res, None
 
 
-def get_qwen3_next_transformer_layer_spec(config, vp_stage=None):
+def get_qwen3_next_transformer_layer_spec(config, vp_stage=None, gated_delta_net=None):
     config.hetereogenous_dist_checkpoint = True  # compat Qwen3NextGatedDeltaNet
     args = get_args()
@@ -500,16 +501,18 @@
         **kwargs,
     )
     layer_specs = []
+    gated_delta_net = gated_delta_net or Qwen3NextGatedDeltaNet
     for layer_type in args.layer_types:
         layer_spec = deepcopy(moe_layer_spec)
         if layer_type == 'linear_attention':
-            layer_spec.submodules.self_attention.module = Qwen3NextGatedDeltaNet
+            layer_spec.submodules.self_attention.module = gated_delta_net
         elif layer_type == 'full_attention':
             layer_spec.submodules.self_attention.submodules.linear_qkv = TEColumnParallelLinear
             layer_spec.submodules.self_attention.module = Qwen3NextSelfAttention
         # Replace ALL layernorms with Qwen3NextRMSNorm (Zero-Centered)
         layer_spec.submodules.input_layernorm = layer_norm_impl
-        if hasattr(layer_spec.submodules, 'pre_mlp_layernorm'):
+        if hasattr(layer_spec.submodules,
+                   'pre_mlp_layernorm') and layer_spec.submodules.pre_mlp_layernorm is not IdentityOp:
             layer_spec.submodules.pre_mlp_layernorm = layer_norm_impl
         # Replace qk_layernorm if present
         if hasattr(layer_spec.submodules.self_attention.submodules, 'q_layernorm'):
diff --git a/swift/megatron/model/mm_gpts/__init__.py b/swift/megatron/model/mm_gpts/__init__.py
index 4f9882af6d..d71bc68e63 100644
--- a/swift/megatron/model/mm_gpts/__init__.py
+++ b/swift/megatron/model/mm_gpts/__init__.py
@@ -1,2 +1,2 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
-from . import glm, internvl, kimi_vl, llama4, qwen, qwen3_vl
+from . import glm, internvl, kimi_vl, llama4, qwen, qwen3_5, qwen3_vl
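The new `gated_delta_net` parameter above is the hook that lets Qwen3.5 reuse the Qwen3-Next layer spec while swapping in its own linear-attention module. A minimal sketch of the injection pattern, with stand-in classes rather than the real Megatron spec objects:

```python
from functools import partial


class DefaultGatedDeltaNet: ...         # stand-in for Qwen3NextGatedDeltaNet
class Qwen3_5MoeGatedDeltaNet: ...      # stand-in for the new module


def get_layer_spec(layer_types, gated_delta_net=None):
    # `or` keeps the old default, so existing Qwen3-Next callers are unaffected
    gated_delta_net = gated_delta_net or DefaultGatedDeltaNet
    return [gated_delta_net if t == 'linear_attention' else 'full_attention'
            for t in layer_types]


# qwen3_5.py binds its module once, at registration time
get_qwen3_5_layer_spec = partial(get_layer_spec, gated_delta_net=Qwen3_5MoeGatedDeltaNet)
specs = get_qwen3_5_layer_spec(['linear_attention', 'full_attention'])
assert specs[0] is Qwen3_5MoeGatedDeltaNet
```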
diff --git a/swift/megatron/model/mm_gpts/qwen3_5.py b/swift/megatron/model/mm_gpts/qwen3_5.py
new file mode 100644
index 0000000000..012e359922
--- /dev/null
+++ b/swift/megatron/model/mm_gpts/qwen3_5.py
@@ -0,0 +1,105 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from functools import partial
+
+import torch
+from megatron.core.extensions.transformer_engine import _get_extra_te_kwargs
+from megatron.core.models.huggingface import HuggingFaceModule as _HuggingFaceModule
+from megatron.core.tensor_parallel import (gather_from_sequence_parallel_region,
+                                           reduce_scatter_to_sequence_parallel_region)
+from megatron.core.transformer.attention import SelfAttentionSubmodules
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.training import get_args
+
+from swift.model import ModelType
+from swift.template import Template
+from ..constant import MegatronModelType
+from ..gpts.qwen3_next import Qwen3NextBridge, get_qwen3_next_mtp_block_spec, get_qwen3_next_transformer_layer_spec
+from ..register import MegatronModelMeta, register_megatron_model
+from .utils import HuggingFaceModule
+
+try:
+    from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeGatedDeltaNet as _Qwen3_5MoeGatedDeltaNet
+except ImportError:
+    _Qwen3_5MoeGatedDeltaNet = object
+
+
+class Qwen3_5MoeGatedDeltaNet(_HuggingFaceModule, _Qwen3_5MoeGatedDeltaNet):
+
+    def __init__(self, config: TransformerConfig, submodules: SelfAttentionSubmodules, layer_number: int, **kwargs):
+        assert config.context_parallel_size == 1, 'Qwen3_5 currently does not support context parallel.'
+        assert _Qwen3_5MoeGatedDeltaNet is not object, 'Please update the `transformers` version.'
+        _Qwen3_5MoeGatedDeltaNet.__init__(self, config, layer_number)
+        self.config = config
+        extra_kwargs = _get_extra_te_kwargs(config)
+        self.to(dtype=extra_kwargs['params_dtype'], device=extra_kwargs['device'])
+
+    def forward(self, hidden_states: torch.Tensor, **kwargs):
+        args = get_args()
+        if args.sequence_parallel and args.tensor_model_parallel_size > 1:
+            hidden_states = gather_from_sequence_parallel_region(hidden_states)
+        seq_len = hidden_states.shape[0]
+        packed_seq_params = kwargs.get('packed_seq_params')
+        thd_format = packed_seq_params is not None and packed_seq_params.qkv_format == 'thd'
+        # Note: when packing is enabled we skip the padding-free un-packing below,
+        # so packed sequences can see each other in the linear attention; we accept this for efficiency.
+        if thd_format and not args.packing:
+            new_hidden_states = hidden_states.new_zeros(
+                (packed_seq_params.num_samples, packed_seq_params.max_seqlen_q.item(), hidden_states.shape[-1]))
+            attention_mask = hidden_states.new_zeros(
+                (packed_seq_params.num_samples, packed_seq_params.max_seqlen_q.item()), dtype=torch.bool)
+            cu_seqlens_q = packed_seq_params.cu_seqlens_q
+            for i in range(packed_seq_params.num_samples):
+                start, end = cu_seqlens_q[i], cu_seqlens_q[i + 1]
+                attention_mask[i, :end - start] = True
+                new_hidden_states[i, :end - start] = hidden_states[start:end, 0]
+            hidden_states = new_hidden_states
+        else:
+            hidden_states = hidden_states.transpose(0, 1)
+            attention_mask = kwargs.get('attention_mask')
+            if attention_mask is not None:
+                attention_mask = (~attention_mask).sum(dim=(1, 2)) > 0
+        res = super().forward(hidden_states=hidden_states, attention_mask=attention_mask)
+        if thd_format and not args.packing:
+            res = res[attention_mask][:, None]
+            res = torch.concat([res, res.new_zeros(seq_len - res.shape[0], 1, res.shape[2])])
+        else:
+            res = res.transpose(0, 1)
+        if args.sequence_parallel and args.tensor_model_parallel_size > 1:
+            # Quick fix for the dropout issue; awaiting the ms-swift 4.0 refactor.
+            res = reduce_scatter_to_sequence_parallel_region(res) / args.tensor_model_parallel_size
+        return res, None
+
+
+class Qwen3_5Vit(HuggingFaceModule):
+    module_mapping = {'model.visual': 'visual'}
+    _vision_tower = ['visual']
+    _aligner = ['visual.merger']
+
+    def __init__(self, config):
+        from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5TextModel
+        from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeTextModel
+        super().__init__(config, [Qwen3_5TextModel, Qwen3_5MoeTextModel])
+
+    def get_inputs_embeds(self, inputs_embeds, **kwargs):
+        return Template._get_inputs_embeds_hf(inputs_embeds, kwargs, self.visual, self.processor, self.model_config)
+
+
+class Qwen3_5Bridge(Qwen3NextBridge):
+    hf_layers_prefix = 'model.language_model.layers'
+    hf_embed_key = 'model.language_model.embed_tokens.weight'
+    hf_final_layernorm_key = 'model.language_model.norm.weight'
+
+
+register_megatron_model(
+    MegatronModelMeta(
+        MegatronModelType.qwen3_5,
+        [
+            ModelType.qwen3_5,
+            ModelType.qwen3_5_moe,
+        ],
+        get_transformer_layer_spec=partial(
+            get_qwen3_next_transformer_layer_spec, gated_delta_net=Qwen3_5MoeGatedDeltaNet),
+        get_mtp_block_spec=get_qwen3_next_mtp_block_spec,
+        bridge_cls=Qwen3_5Bridge,
+        visual_cls=Qwen3_5Vit,
+    ))
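In the unpacked `thd` branch above, `forward` pads the flattened `(total_tokens, 1, hidden)` stream into a dense `(num_samples, max_seqlen, hidden)` batch before calling the HF gated delta net, then gathers the valid positions back. A self-contained sketch of that round trip with a toy `cu_seqlens`:

```python
import torch

# toy packed stream: two samples of lengths 3 and 2, hidden size 4
hidden = torch.arange(5 * 4, dtype=torch.float32).reshape(5, 1, 4)
cu_seqlens = torch.tensor([0, 3, 5])
num_samples, max_seqlen = 2, 3
seq_len = hidden.shape[0]

# pad: (total_tokens, 1, h) -> (num_samples, max_seqlen, h) plus a validity mask
padded = hidden.new_zeros(num_samples, max_seqlen, hidden.shape[-1])
mask = torch.zeros(num_samples, max_seqlen, dtype=torch.bool)
for i in range(num_samples):
    start, end = cu_seqlens[i], cu_seqlens[i + 1]
    mask[i, :end - start] = True
    padded[i, :end - start] = hidden[start:end, 0]

# ... the gated delta net would run on `padded` here ...

# unpad: gather valid positions back into the (total_tokens, 1, h) order
out = padded[mask][:, None]
out = torch.concat([out, out.new_zeros(seq_len - out.shape[0], 1, out.shape[2])])
assert torch.equal(out, hidden)  # round trip recovers the packed layout
```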
diff --git a/swift/megatron/utils/config.py b/swift/megatron/utils/config.py
index 24720ad3a4..cc2b275428 100644
--- a/swift/megatron/utils/config.py
+++ b/swift/megatron/utils/config.py
@@ -106,13 +106,14 @@ def convert_hf_config(config) -> Dict[str, Any]:
     interleave_moe_layer_step = res.pop('interleave_moe_layer_step', None)
     window_size = res.pop('window_size', None)
     rope_scaling = res.get('rope_scaling') or {}
-    if llm_model_type in {'qwen3', 'qwen3_moe', 'qwen3_next'
-                          } or hf_model_type in {'qwen3_omni_moe', 'qwen3_omni', 'qwen3_vl', 'qwen3_vl_moe'}:
+    if llm_model_type in {'qwen3', 'qwen3_moe', 'qwen3_next'} or hf_model_type in {
+            'qwen3_omni_moe', 'qwen3_omni', 'qwen3_vl', 'qwen3_vl_moe', 'qwen3_5', 'qwen3_5_moe'
+    }:
         res['qk_layernorm'] = True
     if llm_model_type in {'qwen2_moe', 'qwen3_moe', 'qwen3_next'
-                          } or hf_model_type in {'qwen3_omni_moe', 'qwen3_vl_moe'}:
+                          } or hf_model_type in {'qwen3_omni_moe', 'qwen3_vl_moe', 'qwen3_5_moe'}:
         res.pop('ffn_hidden_size', None)
-    if llm_model_type in {'qwen2_moe', 'qwen3_next'}:
+    if llm_model_type in {'qwen2_moe', 'qwen3_next'} or hf_model_type == 'qwen3_5_moe':
         res['use_shared_expert_gate'] = True
     if llm_model_type in {
         'deepseek',
@@ -156,8 +157,8 @@ def convert_hf_config(config) -> Dict[str, Any]:
     if llm_model_type == 'glm4_moe_lite':
         res['qk_layernorm'] = True
         res.pop('num_query_groups', None)
-    elif llm_model_type == 'qwen3_next':
-        full_attention_interval = res.pop('full_attention_interval')
+    elif llm_model_type == 'qwen3_next' or hf_model_type in {'qwen3_5', 'qwen3_5_moe'}:
+        full_attention_interval = res.pop('full_attention_interval', 4)
         num_layers = res['num_layers']
         res['layer_types'] = [
             'full_attention' if (i + 1) % full_attention_interval == 0 else 'linear_attention'
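With `full_attention_interval` defaulting to 4 here, the derived `layer_types` places a full-attention layer at every fourth position and linear attention everywhere else. For a hypothetical 8-layer config:

```python
full_attention_interval = 4  # default used when the HF config omits the key
num_layers = 8
layer_types = [
    'full_attention' if (i + 1) % full_attention_interval == 0 else 'linear_attention'
    for i in range(num_layers)
]
print(layer_types)
# ['linear_attention', 'linear_attention', 'linear_attention', 'full_attention',
#  'linear_attention', 'linear_attention', 'linear_attention', 'full_attention']
```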
diff --git a/swift/model/constant.py b/swift/model/constant.py
index 4367c60690..25dce2ed6f 100644
--- a/swift/model/constant.py
+++ b/swift/model/constant.py
@@ -144,6 +144,9 @@ class MLLMModelType:
     qwen3_vl_moe = 'qwen3_vl_moe'
     qwen3_vl_emb = 'qwen3_vl_emb'
     qwen3_vl_reranker = 'qwen3_vl_reranker'
+    qwen3_5 = 'qwen3_5'
+    qwen3_5_moe = 'qwen3_5_moe'
+
     qwen2_gme = 'qwen2_gme'
     ovis1_6 = 'ovis1_6'
     ovis2 = 'ovis2'
diff --git a/swift/model/models/qwen.py b/swift/model/models/qwen.py
index 54faa99ab2..60b058df61 100644
--- a/swift/model/models/qwen.py
+++ b/swift/model/models/qwen.py
@@ -1111,6 +1111,46 @@ def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrainedModel:
         tags=['vision', 'video']))
 
 
+class Qwen3_5MoeLoader(Qwen3VLLoader):
+
+    def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrainedModel:
+        from transformers import Qwen3_5MoeForConditionalGeneration
+        self.auto_model_cls = self.auto_model_cls or Qwen3_5MoeForConditionalGeneration
+        return Qwen2VLLoader.get_model(self, model_dir, config, processor, model_kwargs)
+
+
+register_model(
+    ModelMeta(
+        MLLMModelType.qwen3_5_moe, [
+            ModelGroup([], TemplateType.qwen3_5),
+        ],
+        Qwen3_5MoeLoader,
+        model_arch=ModelArch.qwen2_vl,
+        architectures=['Qwen3_5MoeForConditionalGeneration'],
+        requires=['transformers>=5.0.0.dev', 'qwen_vl_utils>=0.0.14', 'decord'],
+        tags=['vision', 'video']))
+
+
+class Qwen3_5Loader(Qwen3VLLoader):
+
+    def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrainedModel:
+        from transformers import Qwen3_5ForConditionalGeneration
+        self.auto_model_cls = self.auto_model_cls or Qwen3_5ForConditionalGeneration
+        return Qwen2VLLoader.get_model(self, model_dir, config, processor, model_kwargs)
+
+
+register_model(
+    ModelMeta(
+        MLLMModelType.qwen3_5, [
+            ModelGroup([], TemplateType.qwen3_5),
+        ],
+        Qwen3_5Loader,
+        model_arch=ModelArch.qwen2_vl,
+        architectures=['Qwen3_5ForConditionalGeneration'],
+        requires=['transformers>=5.0.0.dev', 'qwen_vl_utils>=0.0.14', 'decord'],
+        tags=['vision', 'video']))
+
+
 class Qwen2_5OmniLoader(ModelLoader):
 
     def get_config(self, model_dir):
diff --git a/swift/template/constant.py b/swift/template/constant.py
index fe8e948658..30c7c09d2f 100644
--- a/swift/template/constant.py
+++ b/swift/template/constant.py
@@ -134,6 +134,8 @@ class MLLMTemplateType:
     qwen3_vl = 'qwen3_vl'
     qwen3_vl_emb = 'qwen3_vl_emb'
    qwen3_vl_reranker = 'qwen3_vl_reranker'
+    qwen3_5 = 'qwen3_5'
+
     qwen2_gme = 'qwen2_gme'
     qvq = 'qvq'
     ovis1_6 = 'ovis1_6'
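Both loaders registered above import their `transformers` class inside `get_model` rather than at module scope, so registration still succeeds on installs that predate Qwen3.5; the `ImportError` only surfaces when such a checkpoint is actually loaded. The shape of the pattern (a simplified sketch, not the full loader API):

```python
class LazyQwen3_5Loader:
    auto_model_cls = None  # a subclass or caller may pre-set this

    def get_model(self, model_dir):
        # Deferred import: with an older `transformers`, importing this
        # module still works; failure is postponed until a Qwen3.5
        # model is actually loaded.
        from transformers import Qwen3_5ForConditionalGeneration
        self.auto_model_cls = self.auto_model_cls or Qwen3_5ForConditionalGeneration
        return self.auto_model_cls.from_pretrained(model_dir)
```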
diff --git a/swift/template/templates/qwen.py b/swift/template/templates/qwen.py
index 8dea52861b..c4963ef3ab 100644
--- a/swift/template/templates/qwen.py
+++ b/swift/template/templates/qwen.py
@@ -552,6 +552,19 @@ def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
         MLLMTemplateType.qwen3_vl, template_cls=Qwen3VLTemplate, default_system=None, thinking_prefix='\n'))
 
 
+class Qwen3_5Template(Qwen3VLTemplate):
+    image_token_id = 248056
+    video_token_id = 248057
+
+    def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return Qwen2VLTemplate._post_encode(self, model, inputs)
+
+
+register_template(
+    QwenTemplateMeta(
+        MLLMTemplateType.qwen3_5, template_cls=Qwen3_5Template, default_system=None, thinking_prefix='\n'))
+
+
 class Qwen3VLEmbTemplate(Qwen3VLTemplate):
 
     def _preprocess_inputs(self, inputs: StdTemplateInputs) -> None:
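`Qwen3_5Template._post_encode` calls `Qwen2VLTemplate._post_encode` as an unbound method, deliberately skipping `Qwen3VLTemplate`'s override in the MRO (the loaders above use the same trick with `Qwen2VLLoader.get_model`). A minimal illustration of why that works:

```python
class Base:
    def post_encode(self):
        return 'base'


class Mid(Base):
    def post_encode(self):
        return 'mid'


class Leaf(Mid):
    def post_encode(self):
        # super() would dispatch to Mid; calling the unbound method on a
        # specific ancestor class reuses Base's implementation directly
        return Base.post_encode(self)


assert Leaf().post_encode() == 'base'
```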