huggingface · qgallouedec · May 7, 2026 · May 4, 2026 · May 6, 2026 · May 6, 2026
diff --git a/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py b/scripts/generate_tiny_models/for_causal_lm/qwen3_moe_for_causal_lm.py
@@ -32,14 +32,22 @@
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 generation_config = GenerationConfig.from_pretrained(MODEL_ID)
 config = Qwen3MoeConfig(
-    vocab_size=len(tokenizer.vocab),
+    vocab_size=151936,
     hidden_size=8,
     num_attention_heads=4,
     num_key_value_heads=2,
     num_hidden_layers=2,
     intermediate_size=32,
     num_experts=4,
     num_experts_per_tok=2,
+    max_position_embeddings=40960,
+    rope_theta=1000000.0,
+    norm_topk_prob=True,
+    bos_token_id=151643,
+    eos_token_id=151645,
+    # Not Qwen3MoeConfig fields: forwarded through **kwargs, which PretrainedConfig accepts for arbitrary keys:
+    head_dim=128,
+    max_window_layers=48,
 )
 model = Qwen3MoeForCausalLM(config).to(dtype=torch.bfloat16)
 init_weights_tiny_model(model)

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -63,7 +63,14 @@ def apply_model_revisions(monkeypatch):
     if not MODEL_REVISIONS:
         return
 
-    from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin
+    from transformers import (
+        AutoConfig,
+        AutoModelForCausalLM,
+        AutoModelForSequenceClassification,
+        PreTrainedModel,
+        PreTrainedTokenizerBase,
+        ProcessorMixin,
+    )
 
     def create_classmethod_wrapper(original_classmethod):
         # Extract the underlying function from the classmethod
@@ -83,6 +90,9 @@ def wrapper(cls, pretrained_model_name_or_path, *args, **kwargs):
 
     # Patch all transformers Auto* classes
     for cls in [
+        AutoConfig,
+        AutoModelForCausalLM,
+        AutoModelForSequenceClassification,
         PreTrainedModel,
         PreTrainedTokenizerBase,
         ProcessorMixin,