diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index 43e6250c0fd4..cad8b089a250 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -907,6 +907,7 @@ def __init__( self.audio_start_token_id = audio_start_token_id self.vision_start_token_id = vision_start_token_id self.speaker_id = speaker_id + self.initializer_range = self.text_config.initializer_range super().__init__(**kwargs) @@ -997,6 +998,7 @@ def __init__( upsampling_ratios=(2, 2), decoder_dim=1536, attention_dropout=0.0, + initializer_range=0.02, **kwargs, ): self.codebook_size = codebook_size @@ -1016,6 +1018,7 @@ def __init__( self.upsampling_ratios = upsampling_ratios self.decoder_dim = decoder_dim self.attention_dropout = attention_dropout + self.initializer_range = initializer_range self.rope_parameters = rope_parameters super().__init__(**kwargs) @@ -1104,6 +1107,7 @@ def __init__( self.thinker_config = Qwen3OmniMoeThinkerConfig(**thinker_config) self.talker_config = Qwen3OmniMoeTalkerConfig(**talker_config) self.code2wav_config = Qwen3OmniMoeCode2WavConfig(**code2wav_config) + self.initializer_range = self.thinker_config.initializer_range self.enable_audio_output = enable_audio_output self.im_start_token_id = im_start_token_id self.im_end_token_id = im_end_token_id diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py index ff3a5ef41c77..62966ac1563a 100644 --- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py @@ -3042,9 +3042,9 @@ def get_input_embeddings(self): @auto_docstring class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3OmniMoeThinkerTextPreTrainedModel, GenerationMixin): - _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"} - _tp_plan = {"lm_head": "colwise_rep"} - _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + _tied_weights_keys = {"codec_head": "model.codec_embedding.weight"} + _tp_plan = {"codec_head": "colwise_rep"} + _pp_plan = {"codec_head": (["hidden_states"], ["logits"])} config_class = Qwen3OmniMoeTalkerConfig base_model_prefix = "talker" _no_split_modules = ["Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration"] diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index 79ae9f95c374..ce53bc9e6e8f 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -673,6 +673,7 @@ def __init__( self.audio_start_token_id = audio_start_token_id self.vision_start_token_id = vision_start_token_id self.speaker_id = speaker_id + self.initializer_range = self.text_config.initializer_range super().__init__(**kwargs) @@ -763,6 +764,7 @@ def __init__( upsampling_ratios=(2, 2), decoder_dim=1536, attention_dropout=0.0, + initializer_range=0.02, **kwargs, ): self.codebook_size = codebook_size @@ -782,6 +784,7 @@ def __init__( self.upsampling_ratios = upsampling_ratios self.decoder_dim = decoder_dim self.attention_dropout = attention_dropout + self.initializer_range = initializer_range self.rope_parameters = rope_parameters super().__init__(**kwargs) @@ -870,6 +873,7 @@ def __init__( self.thinker_config = Qwen3OmniMoeThinkerConfig(**thinker_config) self.talker_config = Qwen3OmniMoeTalkerConfig(**talker_config) self.code2wav_config = Qwen3OmniMoeCode2WavConfig(**code2wav_config) + self.initializer_range = self.thinker_config.initializer_range self.enable_audio_output = enable_audio_output self.im_start_token_id = im_start_token_id self.im_end_token_id = im_end_token_id @@ -1869,6 +1873,9 @@ def get_input_embeddings(self): class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3MoeForCausalLM): + _tied_weights_keys = {"codec_head": "model.codec_embedding.weight"} + _tp_plan = {"codec_head": "colwise_rep"} + _pp_plan = {"codec_head": (["hidden_states"], ["logits"])} config_class = Qwen3OmniMoeTalkerConfig base_model_prefix = "talker" _no_split_modules = ["Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration"]