diff --git a/scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py b/scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py
index 8d3cba21904..372a17cecdb 100644
--- a/scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py
+++ b/scripts/generate_tiny_models/for_conditional_generation/gemma4_for_conditional_generation.py
@@ -29,11 +29,20 @@
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 generation_config = GenerationConfig.from_pretrained(MODEL_ID)
 
+# Gemma4 image processor uses aspect-ratio-preserving resizing, not a fixed image size. max_soft_tokens controls
+# the output token budget and must be one of (70, 140, 280, 560, 1120). The smallest value (70) gives
+# max_patches = 70 × pooling_kernel_size² = 70 × 9 = 630, so position_embedding_size must be at least 630.
+# intermediate_size mirrors Gemma3: without it the production value (text: 6144, vision: 3072) is inherited, causing
+# training activations [batch, patches, intermediate_size] to dominate GPU memory and OOM in CI.
+IMAGE_TOKENS = 70  # minimum supported max_soft_tokens
+MAX_PATCHES = IMAGE_TOKENS * 3**2  # IMAGE_TOKENS * pooling_kernel_size**2 = 70 * 9 = 630
+
 text_config = {
     "num_hidden_layers": 2,
     "hidden_size": 16,
     "num_attention_heads": 4,
     "num_key_value_heads": 2,
+    "intermediate_size": 32,
 }
 vision_config = {
     "num_hidden_layers": 2,
@@ -41,8 +50,15 @@
     "num_attention_heads": 4,
     "num_key_value_heads": 2,
     "embed_dim": 64,
+    "intermediate_size": 32,
+    "position_embedding_size": MAX_PATCHES,  # 630
+    "default_output_length": IMAGE_TOKENS,  # 70
 }
 
+processor.image_seq_length = IMAGE_TOKENS  # top-level Gemma4Processor attribute (serialized to processor_config.json)
+processor.image_processor.image_seq_length = IMAGE_TOKENS  # nested Gemma4ImageProcessor attribute
+processor.image_processor.max_soft_tokens = IMAGE_TOKENS
+
 config = AutoConfig.from_pretrained(MODEL_ID)
 for k, v in text_config.items():
     setattr(config.text_config, k, v)