Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,7 @@ def __init__(
self.audio_start_token_id = audio_start_token_id
self.vision_start_token_id = vision_start_token_id
self.speaker_id = speaker_id
self.initializer_range = self.text_config.initializer_range
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we're using the text config's initializer range in every case anyway, we can instead update `init_weights` to use `config.get_text_config().initializer_range`.

super().__init__(**kwargs)


Expand Down Expand Up @@ -997,6 +998,7 @@ def __init__(
upsampling_ratios=(2, 2),
decoder_dim=1536,
attention_dropout=0.0,
initializer_range=0.02,
**kwargs,
):
self.codebook_size = codebook_size
Expand All @@ -1016,6 +1018,7 @@ def __init__(
self.upsampling_ratios = upsampling_ratios
self.decoder_dim = decoder_dim
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.rope_parameters = rope_parameters

super().__init__(**kwargs)
Expand Down Expand Up @@ -1104,6 +1107,7 @@ def __init__(
self.thinker_config = Qwen3OmniMoeThinkerConfig(**thinker_config)
self.talker_config = Qwen3OmniMoeTalkerConfig(**talker_config)
self.code2wav_config = Qwen3OmniMoeCode2WavConfig(**code2wav_config)
self.initializer_range = self.thinker_config.initializer_range
self.enable_audio_output = enable_audio_output
self.im_start_token_id = im_start_token_id
self.im_end_token_id = im_end_token_id
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3042,9 +3042,9 @@ def get_input_embeddings(self):

@auto_docstring
class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3OmniMoeThinkerTextPreTrainedModel, GenerationMixin):
_tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe the correct way is `codec_head: model.codec_embedding.weight`. It will allow users to tie weights if needed. We just need to make sure that the model is not tying weights; I see that the default is already `tie_word_embeddings=False` in the config.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you very much @zucchini-nlp ! Fixed.

_tp_plan = {"lm_head": "colwise_rep"}
_pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
_tied_weights_keys = {"codec_head": "model.codec_embedding.weight"}
_tp_plan = {"codec_head": "colwise_rep"}
_pp_plan = {"codec_head": (["hidden_states"], ["logits"])}
config_class = Qwen3OmniMoeTalkerConfig
base_model_prefix = "talker"
_no_split_modules = ["Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,7 @@ def __init__(
self.audio_start_token_id = audio_start_token_id
self.vision_start_token_id = vision_start_token_id
self.speaker_id = speaker_id
self.initializer_range = self.text_config.initializer_range
super().__init__(**kwargs)


Expand Down Expand Up @@ -763,6 +764,7 @@ def __init__(
upsampling_ratios=(2, 2),
decoder_dim=1536,
attention_dropout=0.0,
initializer_range=0.02,
**kwargs,
):
self.codebook_size = codebook_size
Expand All @@ -782,6 +784,7 @@ def __init__(
self.upsampling_ratios = upsampling_ratios
self.decoder_dim = decoder_dim
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.rope_parameters = rope_parameters

super().__init__(**kwargs)
Expand Down Expand Up @@ -870,6 +873,7 @@ def __init__(
self.thinker_config = Qwen3OmniMoeThinkerConfig(**thinker_config)
self.talker_config = Qwen3OmniMoeTalkerConfig(**talker_config)
self.code2wav_config = Qwen3OmniMoeCode2WavConfig(**code2wav_config)
self.initializer_range = self.thinker_config.initializer_range
self.enable_audio_output = enable_audio_output
self.im_start_token_id = im_start_token_id
self.im_end_token_id = im_end_token_id
Expand Down Expand Up @@ -1869,6 +1873,9 @@ def get_input_embeddings(self):


class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3MoeForCausalLM):
_tied_weights_keys = {"codec_head": "model.codec_embedding.weight"}
_tp_plan = {"codec_head": "colwise_rep"}
_pp_plan = {"codec_head": (["hidden_states"], ["logits"])}
config_class = Qwen3OmniMoeTalkerConfig
base_model_prefix = "talker"
_no_split_modules = ["Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration"]
Expand Down