huggingface · albertvillanova · May 15, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py
@@ -29,20 +29,35 @@
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 generation_config = GenerationConfig.from_pretrained(MODEL_ID)
 
+# The Gemma4 image processor uses aspect-ratio-preserving resizing, not a fixed image size. max_soft_tokens sets the
+# output token budget and must be one of (70, 140, 280, 560, 1120). With pooling_kernel_size = 3, the smallest value
+# (70) gives max_patches = 70 × pooling_kernel_size² = 70 × 9 = 630, so position_embedding_size must be at least 630.
+# intermediate_size is set explicitly, mirroring Gemma3: otherwise the production value (text: 6144, vision: 3072) is
+# inherited, and training activations [batch, patches, intermediate_size] dominate GPU memory, causing OOM in CI.
+IMAGE_TOKENS = 70  # minimum supported max_soft_tokens
+MAX_PATCHES = IMAGE_TOKENS * 3**2  # 630
+
 text_config = {
     "num_hidden_layers": 2,
     "hidden_size": 16,
     "num_attention_heads": 4,
     "num_key_value_heads": 2,
+    "intermediate_size": 32,
 }
 vision_config = {
     "num_hidden_layers": 2,
     "hidden_size": 16,
     "num_attention_heads": 4,
     "num_key_value_heads": 2,
     "embed_dim": 64,
+    "intermediate_size": 32,
+    "position_embedding_size": MAX_PATCHES,  # 630
+    "default_output_length": IMAGE_TOKENS,  # 70
 }
 
+processor.image_processor.image_seq_length = IMAGE_TOKENS
+processor.image_processor.max_soft_tokens = IMAGE_TOKENS
+
 config = AutoConfig.from_pretrained(MODEL_ID)
 for k, v in text_config.items():
     setattr(config.text_config, k, v)