diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index 43e6250c0fd4..d434f91fa4f5 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -872,6 +872,7 @@ def __init__( position_id_per_seconds=25, audio_start_token_id=151669, speaker_id=None, + initializer_range=0.02, **kwargs, ): if code_predictor_config is None: @@ -907,6 +908,7 @@ def __init__( self.audio_start_token_id = audio_start_token_id self.vision_start_token_id = vision_start_token_id self.speaker_id = speaker_id + self.initializer_range = initializer_range super().__init__(**kwargs) @@ -997,6 +999,7 @@ def __init__( upsampling_ratios=(2, 2), decoder_dim=1536, attention_dropout=0.0, + initializer_range=0.02, **kwargs, ): self.codebook_size = codebook_size @@ -1017,6 +1020,7 @@ def __init__( self.decoder_dim = decoder_dim self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters + self.initializer_range = initializer_range super().__init__(**kwargs) @@ -1087,6 +1091,7 @@ def __init__( system_token_id=8948, user_token_id=872, assistant_token_id=77091, + initializer_range=0.02, **kwargs, ): if thinker_config is None: @@ -1113,6 +1118,7 @@ def __init__( self.system_token_id = system_token_id self.user_token_id = user_token_id self.assistant_token_id = assistant_token_id + self.initializer_range = initializer_range super().__init__(**kwargs) def get_text_config(self, decoder=False) -> "PreTrainedConfig": diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py index ff3a5ef41c77..21d8c6778df8 100644 --- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py @@ -3042,7 +3042,7 @@ def get_input_embeddings(self): @auto_docstring class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3OmniMoeThinkerTextPreTrainedModel, GenerationMixin): - _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"} + _tied_weights_keys = {} _tp_plan = {"lm_head": "colwise_rep"} _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} config_class = Qwen3OmniMoeTalkerConfig diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index 79ae9f95c374..593387111443 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -638,6 +638,7 @@ def __init__( position_id_per_seconds=25, audio_start_token_id=151669, speaker_id=None, + initializer_range=0.02, **kwargs, ): if code_predictor_config is None: @@ -673,6 +674,7 @@ def __init__( self.audio_start_token_id = audio_start_token_id self.vision_start_token_id = vision_start_token_id self.speaker_id = speaker_id + self.initializer_range = initializer_range super().__init__(**kwargs) @@ -763,6 +765,7 @@ def __init__( upsampling_ratios=(2, 2), decoder_dim=1536, attention_dropout=0.0, + initializer_range=0.02, **kwargs, ): self.codebook_size = codebook_size @@ -783,6 +786,7 @@ def __init__( self.decoder_dim = decoder_dim self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters + self.initializer_range = initializer_range super().__init__(**kwargs) @@ -853,6 +857,7 @@ def __init__( system_token_id=8948, user_token_id=872, assistant_token_id=77091, + initializer_range=0.02, **kwargs, ): if thinker_config is None: @@ -879,6 +884,7 @@ def __init__( self.system_token_id = system_token_id self.user_token_id = user_token_id self.assistant_token_id = assistant_token_id + self.initializer_range = initializer_range super().__init__(**kwargs) def get_text_config(self, decoder=False) -> "PreTrainedConfig": @@ -1869,6 +1875,7 @@ def get_input_embeddings(self): class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3MoeForCausalLM): + _tied_weights_keys = {} config_class = Qwen3OmniMoeTalkerConfig base_model_prefix = "talker" _no_split_modules = ["Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration"] diff --git a/tests/models/qwen3_omni_moe/test_configuration_and_loading.py b/tests/models/qwen3_omni_moe/test_configuration_and_loading.py new file mode 100644 index 000000000000..0f90df2c7a36 --- /dev/null +++ b/tests/models/qwen3_omni_moe/test_configuration_and_loading.py @@ -0,0 +1,24 @@ +from transformers.models.qwen3_omni_moe.configuration_qwen3_omni_moe import ( + Qwen3OmniMoeCode2WavConfig, + Qwen3OmniMoeConfig, + Qwen3OmniMoeTalkerConfig, +) +from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( + Qwen3OmniMoeTalkerForConditionalGeneration, +) + + +def test_qwen3_omni_moe_configs_have_initializer_range(): + talker_config = Qwen3OmniMoeTalkerConfig() + assert hasattr(talker_config, "initializer_range") + + code2wav_config = Qwen3OmniMoeCode2WavConfig() + assert hasattr(code2wav_config, "initializer_range") + + main_config = Qwen3OmniMoeConfig() + assert hasattr(main_config, "initializer_range") + + +def test_qwen3_omni_moe_talker_has_no_tied_weights(): + tied_keys = Qwen3OmniMoeTalkerForConditionalGeneration._tied_weights_keys + assert tied_keys in (None, {})