Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,7 @@ def __init__(
rope_parameters: int | None = None,
attention_bias: bool | None = False,
sliding_window: int | None = None,
max_window_layers: int | None = 28,
layer_types: list[str] | None = None,
attention_dropout: int | None = 0,
num_code_groups: int | None = 32,
Expand All @@ -581,15 +582,15 @@ def __init__(
eos_token_id: int | None = None,
**kwargs,
):
self.sliding_window = sliding_window
self.num_code_groups = num_code_groups
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.sliding_window = sliding_window if self.use_sliding_window else None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok I checked what happened and it seems it was unintentionally added in #41541, but since I'm not super familiar with this model I'd rather wait for @zucchini-nlp to answer here

In my opinion we should just do `self.sliding_window = sliding_window` (`use_sliding_window` was never used at all and should be removed from the docstring); `max_window_layers` should be removed alongside it (not reintroduced).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, the changes should be made in the modular file and then reapplied via `python utils/modular_model_converter.py qwen3_omni_moe`.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed a bad copy, no need for use_sliding_window. Model always uses sliding layers together with full attention

self.sliding_window = sliding_window
self.max_window_layers = max_window_layers
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: duplicate


# for backward compatibility
if num_key_value_heads is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,7 @@ def __init__(
rope_parameters: int | None = None,
attention_bias: bool | None = False,
sliding_window: int | None = None,
max_window_layers: int | None = 28,
layer_types: list[str] | None = None,
attention_dropout: int | None = 0,
num_code_groups: int | None = 32,
Expand All @@ -473,7 +474,6 @@ def __init__(
eos_token_id: int | None = None,
**kwargs,
):
self.sliding_window = sliding_window
self.num_code_groups = num_code_groups
super().__init__(
vocab_size,
Expand Down Expand Up @@ -502,7 +502,8 @@ def __init__(
**kwargs,
)
del self.use_sliding_window
del self.max_window_layers
self.sliding_window = sliding_window
self.max_window_layers = max_window_layers


class Qwen3OmniMoeTalkerTextConfig(Qwen3MoeConfig):
Expand Down
26 changes: 26 additions & 0 deletions tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
is_torch_available,
is_vision_available,
)
from transformers.models.qwen3_omni_moe.configuration_qwen3_omni_moe import Qwen3OmniMoeTalkerCodePredictorConfig
from transformers.testing_utils import (
Expectations,
cleanup,
Expand Down Expand Up @@ -648,6 +649,31 @@ def _video_features_get_expected_num_hidden_states(self, model_tester=None):
model_tester = self.model_tester
return model_tester.vision_config["depth"] + 1

def test_code_predictor_config_init(self):
    """
    Ensure Qwen3OmniMoeTalkerCodePredictorConfig constructs cleanly,
    keeps max_window_layers and sliding_window, and no longer exposes
    the removed use_sliding_window attribute.
    """

    cfg = Qwen3OmniMoeTalkerCodePredictorConfig(
        vocab_size=100,
        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=4,
        max_window_layers=28,
        sliding_window=2048,
    )

    # Both window-related settings must survive construction unchanged.
    for attr_name, expected in (("max_window_layers", 28), ("sliding_window", 2048)):
        self.assertEqual(getattr(cfg, attr_name), expected)

    # use_sliding_window was deleted from this config; access must fail.
    with self.assertRaises(AttributeError):
        _ = cfg.use_sliding_window


@require_torch
class Qwen3OmniModelIntegrationTest(unittest.TestCase):
Expand Down