diff --git a/examples/modular-transformers/configuration_new_model.py b/examples/modular-transformers/configuration_new_model.py
index 5213682da45a..f27ab4917ee4 100644
--- a/examples/modular-transformers/configuration_new_model.py
+++ b/examples/modular-transformers/configuration_new_model.py
@@ -45,7 +45,6 @@ class NewModelConfig(PreTrainedConfig):
         "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
         "norm": (["hidden_states"], ["hidden_states"]),
     }
-
     vocab_size: int = 256030
     hidden_size: int = 64
     intermediate_size: int = 90
diff --git a/examples/modular-transformers/modeling_dummy_bert.py b/examples/modular-transformers/modeling_dummy_bert.py
index 3a8ec4d237f5..e0c4a03556f6 100644
--- a/examples/modular-transformers/modeling_dummy_bert.py
+++ b/examples/modular-transformers/modeling_dummy_bert.py
@@ -556,7 +556,6 @@ def forward(
         output_attentions: bool | None = None,
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
-        cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
         if (input_ids is None) ^ (inputs_embeds is not None):
diff --git a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py
index 8a080cce6a6a..6e739fa0dbf4 100644
--- a/examples/modular-transformers/modeling_new_task_model.py
+++ b/examples/modular-transformers/modeling_new_task_model.py
@@ -420,7 +420,6 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         past_key_values: Cache | None = None,
         token_type_ids: torch.LongTensor | None = None,
-        cache_position: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         use_cache: bool | None = None,
@@ -439,7 +438,6 @@ def forward(
             position_ids=position_ids,
             past_key_values=past_key_values,
             token_type_ids=token_type_ids,
-            cache_position=cache_position,
             inputs_embeds=inputs_embeds,
             labels=labels,
             use_cache=use_cache,
diff --git a/examples/modular-transformers/modeling_super.py b/examples/modular-transformers/modeling_super.py
index 825da3a0e932..5502ac642b99 100644
--- a/examples/modular-transformers/modeling_super.py
+++ b/examples/modular-transformers/modeling_super.py
@@ -359,7 +359,6 @@ def forward(
         output_attentions: bool | None = None,
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
-        cache_position: torch.LongTensor | None = None,
     ) -> tuple | CausalLMOutputWithPast:
         out = super().forward(
             input_ids,
@@ -371,7 +370,6 @@ def forward(
             output_attentions,
             output_hidden_states,
             return_dict,
-            cache_position,
         )
         out.logits *= 2**4
         return out
diff --git a/examples/modular-transformers/modular_dummy_bert.py b/examples/modular-transformers/modular_dummy_bert.py
index 8a38ef13fdab..0c900110fd82 100644
--- a/examples/modular-transformers/modular_dummy_bert.py
+++ b/examples/modular-transformers/modular_dummy_bert.py
@@ -22,7 +22,6 @@ def forward(
         output_attentions: bool | None = None,
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
-        cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
         return super().forward(input_ids, **kwargs)
diff --git a/examples/modular-transformers/modular_new_task_model.py b/examples/modular-transformers/modular_new_task_model.py
index 162919403e9c..ce6061d41fb2 100644
--- a/examples/modular-transformers/modular_new_task_model.py
+++ b/examples/modular-transformers/modular_new_task_model.py
@@ -28,7 +28,6 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         past_key_values: Cache | None = None,
         token_type_ids: torch.LongTensor | None = None,
-        cache_position: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         use_cache: bool | None = None,
@@ -47,7 +46,6 @@ def forward(
             position_ids=position_ids,
             past_key_values=past_key_values,
             token_type_ids=token_type_ids,
-            cache_position=cache_position,
             inputs_embeds=inputs_embeds,
             labels=labels,
             use_cache=use_cache,
diff --git a/examples/modular-transformers/modular_super.py b/examples/modular-transformers/modular_super.py
index a63b8ee43e28..89bd8af5ba3a 100644
--- a/examples/modular-transformers/modular_super.py
+++ b/examples/modular-transformers/modular_super.py
@@ -19,7 +19,6 @@ def forward(
         output_attentions: bool | None = None,
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
-        cache_position: torch.LongTensor | None = None,
     ) -> tuple | CausalLMOutputWithPast:
         out = super().forward(
             input_ids,
@@ -31,7 +30,6 @@ def forward(
             output_attentions,
             output_hidden_states,
             return_dict,
-            cache_position,
         )
         out.logits *= 2**4
         return out
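
For anyone re-verifying the regenerated example files, a minimal sketch like the one below checks that `cache_position` has in fact dropped out of every `forward` signature touched here. It is not part of the patch; it assumes it is run from `examples/modular-transformers` with `torch` and `transformers` installed so the generated modules import cleanly, and it hard-codes no class names.

# Illustrative check only, under the assumptions stated above.
import importlib
import inspect

for module_name in ("modeling_dummy_bert", "modeling_new_task_model", "modeling_super"):
    module = importlib.import_module(module_name)
    for name, cls in inspect.getmembers(module, inspect.isclass):
        forward = cls.__dict__.get("forward")  # only classes defining their own forward
        if forward is None:
            continue
        params = inspect.signature(forward).parameters
        status = "still accepts" if "cache_position" in params else "no longer accepts"
        print(f"{module_name}.{name}.forward {status} cache_position")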