From 76d3628f8e8fddd25e4af24482a426f741d02536 Mon Sep 17 00:00:00 2001 From: leot13 Date: Thu, 10 Aug 2023 17:16:45 +0200 Subject: [PATCH 01/25] add image_embeddings option in generate-related methods --- src/transformers/models/idefics/modeling_idefics.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 098adacc8dee..8e88853b1018 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -71,11 +71,18 @@ def expand_inputs_for_generation( if attention_mask is not None: model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx) + + if model_kwargs["image_attention_mask"] is not None: model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select( 0, expanded_return_idx ) + + if model_kwargs["pixel_values"] is not None: model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx) + elif model_kwargs["image_embeddings"] is not None: + model_kwargs["image_embeddings"] = model_kwargs["image_embeddings"].index_select(0, expanded_return_idx) + if is_encoder_decoder: if encoder_outputs is None: raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.") @@ -139,6 +146,7 @@ def prepare_inputs_for_generation(input_ids, past=None, **kwargs): position_ids = position_ids[:, -1].unsqueeze(-1) pixel_values = kwargs.get("pixel_values", None) + image_embeddings = kwargs.get("image_embeddings", None) image_attention_mask = kwargs.get("image_attention_mask", None) # if pixel_values is None or image_attention_mask is None: # raise ValueError("pixel values and image attention mask cannot be None") @@ -151,6 +159,7 @@ def prepare_inputs_for_generation(input_ids, past=None, **kwargs): "attention_mask": attention_mask, "token_type_ids": token_type_ids, "pixel_values": pixel_values, + "image_embeddings": image_embeddings, "image_attention_mask": image_attention_mask, } From 0687433066c2cd6b0afcaf5a3bb1c9fc61a6a609 Mon Sep 17 00:00:00 2001 From: leot13 Date: Thu, 10 Aug 2023 17:51:47 +0200 Subject: [PATCH 02/25] style --- src/transformers/models/idefics/modeling_idefics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 8e88853b1018..a6043becf84a 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -71,12 +71,12 @@ def expand_inputs_for_generation( if attention_mask is not None: model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx) - + if model_kwargs["image_attention_mask"] is not None: model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select( 0, expanded_return_idx ) - + if model_kwargs["pixel_values"] is not None: model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx) From 09143947827df15d56fa3b7c9d11ddbdd1b57700 Mon Sep 17 00:00:00 2001 From: leot13 Date: Fri, 11 Aug 2023 14:07:36 +0200 Subject: [PATCH 03/25] rename image_embeddings and allow perceiver embeddings precomputation --- .../models/idefics/modeling_idefics.py | 65 ++++++++++++++----- 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py 
b/src/transformers/models/idefics/modeling_idefics.py index a6043becf84a..c62c4531b34a 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -65,6 +65,11 @@ def expand_inputs_for_generation( ) input_ids = input_ids.index_select(0, expanded_return_idx) + model_kwargs["pixel_values"] = model_kwargs.get("pixel_values", None) + model_kwargs["image_encoder_embeddings"] = model_kwargs.get("image_encoder_embeddings", None) + model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings", None) + model_kwargs["image_attention_mask"] = model_kwargs.get("image_attention_mask", None) + if "token_type_ids" in model_kwargs: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = token_type_ids.index_select(0, expanded_return_idx) @@ -80,8 +85,15 @@ def expand_inputs_for_generation( if model_kwargs["pixel_values"] is not None: model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx) - elif model_kwargs["image_embeddings"] is not None: - model_kwargs["image_embeddings"] = model_kwargs["image_embeddings"].index_select(0, expanded_return_idx) + elif model_kwargs["image_encoder_embeddings"] is not None: + model_kwargs["image_encoder_embeddings"] = model_kwargs["image_encoder_embeddings"].index_select( + 0, expanded_return_idx + ) + + elif model_kwargs["perceiver_embeddings"] is not None: + model_kwargs["perceiver_embeddings"] = model_kwargs["perceiver_embeddings"].index_select( + 0, expanded_return_idx + ) if is_encoder_decoder: if encoder_outputs is None: @@ -146,7 +158,8 @@ def prepare_inputs_for_generation(input_ids, past=None, **kwargs): position_ids = position_ids[:, -1].unsqueeze(-1) pixel_values = kwargs.get("pixel_values", None) - image_embeddings = kwargs.get("image_embeddings", None) + image_encoder_embeddings = kwargs.get("image_encoder_embeddings", None) + perceiver_embeddings = kwargs.get("perceiver_embeddings", None) image_attention_mask = kwargs.get("image_attention_mask", None) # if pixel_values is None or image_attention_mask is None: # raise ValueError("pixel values and image attention mask cannot be None") @@ -159,7 +172,8 @@ def prepare_inputs_for_generation(input_ids, past=None, **kwargs): "attention_mask": attention_mask, "token_type_ids": token_type_ids, "pixel_values": pixel_values, - "image_embeddings": image_embeddings, + "image_encoder_embeddings": image_encoder_embeddings, + "perceiver_embeddings": perceiver_embeddings, "image_attention_mask": image_attention_mask, } @@ -1064,7 +1078,8 @@ def forward( past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, pixel_values: Optional[torch.FloatTensor] = None, - image_embeddings: Optional[torch.FloatTensor] = None, + image_encoder_embeddings: Optional[torch.FloatTensor] = None, + perceiver_embeddings: Optional[torch.FloatTensor] = None, image_attention_mask: Optional[torch.Tensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, @@ -1112,11 +1127,17 @@ def forward( position_ids = position_ids.view(-1, seq_length).long() no_images = False - if pixel_values is None and image_embeddings is None: - raise ValueError("Either pixel_values and image_embeddings have to be not-None.") - - elif pixel_values is not None and image_embeddings is not None: - raise ValueError("You cannot specify both pixel_values and image_embeddings at the same time") + if ( + sum( + 1 + for vision_input in (pixel_values, 
image_encoder_embeddings, perceiver_embeddings) + if vision_input is not None + ) + != 1 + ): + raise ValueError( + "Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None." + ) elif pixel_values is not None: no_images = len(torch.nonzero(pixel_values)) == 0 @@ -1127,14 +1148,22 @@ def forward( # Get sequence from the vision encoder image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state - elif image_embeddings is not None: - batch_size, num_images, image_seq_len, image_hidden_size = image_embeddings.size() - image_hidden_states = image_embeddings.to(dtype=self.dtype, device=input_ids.device) + elif image_encoder_embeddings is not None: + batch_size, num_images, image_seq_len, image_hidden_size = image_encoder_embeddings.size() + image_hidden_states = image_encoder_embeddings.to(dtype=self.dtype, device=input_ids.device) image_hidden_states = image_hidden_states.view(batch_size * num_images, image_seq_len, image_hidden_size) - if self.config.use_resampler: + if self.config.use_resampler and perceiver_embeddings is None: image_hidden_states = self.perceiver_resampler(image_hidden_states) - image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2) + image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2) + elif self.config.use_resampler and perceiver_embeddings is not None: + batch_size, num_images, image_seq_len, image_hidden_size = perceiver_embeddings.size() + image_hidden_states = perceiver_embeddings + elif not self.config.use_resampler and perceiver_embeddings is None: + image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2) + elif not self.config.use_resampler and perceiver_embeddings is not None: + raise ValueError("If perceiver_embeddings are passed, use_resampler should be True") + image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size) # # Hack to use the model in full language modeling mode # image_attention_mask = torch.zeros(batch_size, seq_length, 1, dtype=torch.long, device=image_hidden_states.device) @@ -1359,7 +1388,8 @@ def forward( past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, pixel_values: Optional[torch.FloatTensor] = None, - image_embeddings: Optional[torch.FloatTensor] = None, + image_encoder_embeddings: Optional[torch.FloatTensor] = None, + perceiver_embeddings: Optional[torch.FloatTensor] = None, image_attention_mask: Optional[torch.Tensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, @@ -1407,7 +1437,8 @@ def forward( past_key_values=past_key_values, inputs_embeds=inputs_embeds, pixel_values=pixel_values, - image_embeddings=image_embeddings, + image_encoder_embeddings=image_encoder_embeddings, + perceiver_embeddings=perceiver_embeddings, image_attention_mask=image_attention_mask, use_cache=use_cache, output_attentions=output_attentions, From 6bb49c4951fb39e417fd9e7222661a0553c750d5 Mon Sep 17 00:00:00 2001 From: leot13 Date: Mon, 14 Aug 2023 10:56:49 +0200 Subject: [PATCH 04/25] compute embeddings within generate --- .../models/idefics/modeling_idefics.py | 90 +++++++++++++------ 1 file changed, 61 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index c62c4531b34a..3716ab0b2341 100644 --- a/src/transformers/models/idefics/modeling_idefics.py 
+++ b/src/transformers/models/idefics/modeling_idefics.py @@ -65,7 +65,6 @@ def expand_inputs_for_generation( ) input_ids = input_ids.index_select(0, expanded_return_idx) - model_kwargs["pixel_values"] = model_kwargs.get("pixel_values", None) model_kwargs["image_encoder_embeddings"] = model_kwargs.get("image_encoder_embeddings", None) model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings", None) model_kwargs["image_attention_mask"] = model_kwargs.get("image_attention_mask", None) @@ -82,10 +81,7 @@ def expand_inputs_for_generation( 0, expanded_return_idx ) - if model_kwargs["pixel_values"] is not None: - model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx) - - elif model_kwargs["image_encoder_embeddings"] is not None: + if model_kwargs["image_encoder_embeddings"] is not None: model_kwargs["image_encoder_embeddings"] = model_kwargs["image_encoder_embeddings"].index_select( 0, expanded_return_idx ) @@ -95,17 +91,10 @@ def expand_inputs_for_generation( 0, expanded_return_idx ) - if is_encoder_decoder: - if encoder_outputs is None: - raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.") - encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.index_select( - 0, expanded_return_idx.to(encoder_outputs.last_hidden_state.device) - ) - model_kwargs["encoder_outputs"] = encoder_outputs return input_ids, model_kwargs -def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): +def update_model_kwargs_for_generation(outputs, model_kwargs): # must have this key set to at least None model_kwargs["past_key_values"] = model_kwargs.get("past_key_values", None) @@ -125,16 +114,15 @@ def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1) # update attention masks - if not is_encoder_decoder: - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - model_kwargs["attention_mask"] = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - if "image_attention_mask" in model_kwargs: - image_attention_mask = model_kwargs["image_attention_mask"] - last_mask = image_attention_mask[:, -1, :].unsqueeze(1) - model_kwargs["image_attention_mask"] = last_mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + if "image_attention_mask" in model_kwargs: + image_attention_mask = model_kwargs["image_attention_mask"] + last_mask = image_attention_mask[:, -1, :].unsqueeze(1) + model_kwargs["image_attention_mask"] = last_mask return model_kwargs @@ -157,12 +145,9 @@ def prepare_inputs_for_generation(input_ids, past=None, **kwargs): if past: position_ids = position_ids[:, -1].unsqueeze(-1) - pixel_values = kwargs.get("pixel_values", None) image_encoder_embeddings = kwargs.get("image_encoder_embeddings", None) perceiver_embeddings = kwargs.get("perceiver_embeddings", None) image_attention_mask = kwargs.get("image_attention_mask", None) - # if pixel_values is None or image_attention_mask is None: - # raise ValueError("pixel values and image attention mask cannot be None") return { "input_ids": input_ids, @@ -171,7 +156,6 @@ def prepare_inputs_for_generation(input_ids, past=None, **kwargs): 
"position_ids": position_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids, - "pixel_values": pixel_values, "image_encoder_embeddings": image_encoder_embeddings, "perceiver_embeddings": perceiver_embeddings, "image_attention_mask": image_attention_mask, @@ -1475,6 +1459,54 @@ def forward( attentions=outputs.attentions, ) + def _prepare_encoder_decoder_kwargs_for_generation(self, inputs_tensor, model_kwargs, model_input_name): + pixel_values = model_kwargs.get("pixel_values", None) + image_encoder_embeddings = model_kwargs.get("image_encoder_embeddings", None) + model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings", None) + + if image_encoder_embeddings is not None: + batch_size, num_images, image_seq_len, image_hidden_size = image_encoder_embeddings.size() + image_encoder_embeddings = image_encoder_embeddings.view( + batch_size * num_images, image_seq_len, image_hidden_size + ) + + elif pixel_values is not None: + batch_size, num_images = pixel_values.shape[:2] + pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:]) + image_encoder_embeddings = self.model.vision_model(pixel_values=pixel_values).last_hidden_state + + if self.config.use_resampler: + if model_kwargs["perceiver_embeddings"] is None: + model_kwargs["perceiver_embeddings"] = self.model.perceiver_resampler(image_encoder_embeddings) + + image_seq_len, image_hidden_size = model_kwargs["perceiver_embeddings"].size(1), model_kwargs[ + "perceiver_embeddings" + ].size(2) + model_kwargs["perceiver_embeddings"] = model_kwargs["perceiver_embeddings"].view( + batch_size, num_images, image_seq_len, image_hidden_size + ) + else: + image_seq_len, image_hidden_size = image_encoder_embeddings.size(1), image_encoder_embeddings.size(2) + model_kwargs["image_encoder_embeddings"] = image_encoder_embeddings.view( + batch_size, num_images, image_seq_len, image_hidden_size + ) + + model_kwargs["pixel_values"] = None + model_kwargs["input_ids"] = inputs_tensor + return model_kwargs + + def _prepare_decoder_input_ids_for_generation( + self, + batch_size, + model_input_name, + model_kwargs, + decoder_start_token_id, + bos_token_id, + device, + ): + decoder_input_ids = model_kwargs.pop("input_ids") + return decoder_input_ids, model_kwargs + def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs) unwanted_kwargs = ["token_type_ids"] @@ -1490,8 +1522,8 @@ def _expand_inputs_for_generation( return expand_inputs_for_generation(*args, **model_kwargs) @staticmethod - def _update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): - return update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder) + def _update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder): + return update_model_kwargs_for_generation(outputs, model_kwargs) @staticmethod def _reorder_cache(past, beam_idx): From 86bdb7ebb47bf6a242ad6a4c3d0d48adf53f509e Mon Sep 17 00:00:00 2001 From: leot13 Date: Mon, 14 Aug 2023 10:58:22 +0200 Subject: [PATCH 05/25] make is_encoder_decoder= True the default in config --- src/transformers/models/idefics/configuration_idefics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py index 528273a585cf..e045e2657a17 100644 --- a/src/transformers/models/idefics/configuration_idefics.py +++ 
b/src/transformers/models/idefics/configuration_idefics.py @@ -268,6 +268,7 @@ def __init__( freeze_vision_layers=True, freeze_vision_module_exceptions=[], use_resampler=False, + is_encoder_decoder=True, vision_config=None, perceiver_config=None, **kwargs, @@ -297,6 +298,7 @@ def __init__( self.freeze_lm_head = freeze_lm_head self.use_resampler = use_resampler + self.is_encoder_decoder = is_encoder_decoder if perceiver_config is None: self.perceiver_config = IdeficsPerceiverConfig() From c8cc8f7d419665823d7c85ef47d610b2de229e64 Mon Sep 17 00:00:00 2001 From: leot13 Date: Mon, 14 Aug 2023 11:05:44 +0200 Subject: [PATCH 06/25] nested if else fix --- .../models/idefics/modeling_idefics.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 3716ab0b2341..6cff18ea331f 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1137,16 +1137,18 @@ def forward( image_hidden_states = image_encoder_embeddings.to(dtype=self.dtype, device=input_ids.device) image_hidden_states = image_hidden_states.view(batch_size * num_images, image_seq_len, image_hidden_size) - if self.config.use_resampler and perceiver_embeddings is None: - image_hidden_states = self.perceiver_resampler(image_hidden_states) - image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2) - elif self.config.use_resampler and perceiver_embeddings is not None: - batch_size, num_images, image_seq_len, image_hidden_size = perceiver_embeddings.size() - image_hidden_states = perceiver_embeddings - elif not self.config.use_resampler and perceiver_embeddings is None: - image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2) - elif not self.config.use_resampler and perceiver_embeddings is not None: - raise ValueError("If perceiver_embeddings are passed, use_resampler should be True") + if self.config.use_resampler: + if perceiver_embeddings is None: + image_hidden_states = self.perceiver_resampler(image_hidden_states) + image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2) + else: + batch_size, num_images, image_seq_len, image_hidden_size = perceiver_embeddings.size() + image_hidden_states = perceiver_embeddings + else: + if perceiver_embeddings is None: + image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2) + else: + raise ValueError("If perceiver_embeddings are passed, use_resampler should be True") image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size) # # Hack to use the model in full language modeling mode From 08f9000810bc9cee2bd135e3aee066408f927653 Mon Sep 17 00:00:00 2001 From: leot13 Date: Mon, 14 Aug 2023 11:32:29 +0200 Subject: [PATCH 07/25] better triple check --- src/transformers/models/idefics/modeling_idefics.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 6cff18ea331f..a232496d9166 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1113,9 +1113,8 @@ def forward( no_images = False if ( sum( - 1 + vision_input is not None for vision_input in (pixel_values, image_encoder_embeddings, perceiver_embeddings) - if 
vision_input is not None ) != 1 ): From f6f53672b9076d45fbbb309f78f260c2e1c968de Mon Sep 17 00:00:00 2001 From: leot13 Date: Mon, 14 Aug 2023 13:52:43 +0200 Subject: [PATCH 08/25] switch if elif order for pixel values / img embeds --- src/transformers/models/idefics/modeling_idefics.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index a232496d9166..b7e6f95f1c41 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1465,17 +1465,17 @@ def _prepare_encoder_decoder_kwargs_for_generation(self, inputs_tensor, model_kw image_encoder_embeddings = model_kwargs.get("image_encoder_embeddings", None) model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings", None) - if image_encoder_embeddings is not None: + if pixel_values is not None: + batch_size, num_images = pixel_values.shape[:2] + pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:]) + image_encoder_embeddings = self.model.vision_model(pixel_values=pixel_values).last_hidden_state + + elif image_encoder_embeddings is not None: batch_size, num_images, image_seq_len, image_hidden_size = image_encoder_embeddings.size() image_encoder_embeddings = image_encoder_embeddings.view( batch_size * num_images, image_seq_len, image_hidden_size ) - elif pixel_values is not None: - batch_size, num_images = pixel_values.shape[:2] - pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:]) - image_encoder_embeddings = self.model.vision_model(pixel_values=pixel_values).last_hidden_state - if self.config.use_resampler: if model_kwargs["perceiver_embeddings"] is None: model_kwargs["perceiver_embeddings"] = self.model.perceiver_resampler(image_encoder_embeddings) From 2d4daf68e2942e71b180c007e2a2f8acbf8c3faa Mon Sep 17 00:00:00 2001 From: leot13 Date: Mon, 14 Aug 2023 22:26:15 +0200 Subject: [PATCH 09/25] update model_kwargs perceiver only at the end --- .../models/idefics/modeling_idefics.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index b7e6f95f1c41..a5b124833473 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1463,7 +1463,7 @@ def forward( def _prepare_encoder_decoder_kwargs_for_generation(self, inputs_tensor, model_kwargs, model_input_name): pixel_values = model_kwargs.get("pixel_values", None) image_encoder_embeddings = model_kwargs.get("image_encoder_embeddings", None) - model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings", None) + perceiver_embeddings = model_kwargs.get("perceiver_embeddings", None) if pixel_values is not None: batch_size, num_images = pixel_values.shape[:2] @@ -1477,15 +1477,14 @@ def _prepare_encoder_decoder_kwargs_for_generation(self, inputs_tensor, model_kw ) if self.config.use_resampler: - if model_kwargs["perceiver_embeddings"] is None: - model_kwargs["perceiver_embeddings"] = self.model.perceiver_resampler(image_encoder_embeddings) - - image_seq_len, image_hidden_size = model_kwargs["perceiver_embeddings"].size(1), model_kwargs[ - "perceiver_embeddings" - ].size(2) - model_kwargs["perceiver_embeddings"] = model_kwargs["perceiver_embeddings"].view( - batch_size, num_images, 
image_seq_len, image_hidden_size - ) + if perceiver_embeddings is None: + perceiver_embeddings = self.model.perceiver_resampler(image_encoder_embeddings) + image_seq_len, image_hidden_size = perceiver_embeddings.size(1), perceiver_embeddings.size(2) + model_kwargs["perceiver_embeddings"] = perceiver_embeddings.view( + batch_size, num_images, image_seq_len, image_hidden_size + ) + else: + model_kwargs["perceiver_embeddings"] = perceiver_embeddings else: image_seq_len, image_hidden_size = image_encoder_embeddings.size(1), image_encoder_embeddings.size(2) model_kwargs["image_encoder_embeddings"] = image_encoder_embeddings.view( From 784f270d3f3ac3635d0247761e8fedf8526141f5 Mon Sep 17 00:00:00 2001 From: leot13 Date: Mon, 14 Aug 2023 23:26:00 +0200 Subject: [PATCH 10/25] use _prepare_model_inputs instead of encoder_decoder logic --- .../models/idefics/modeling_idefics.py | 76 +++++++++++++++---- 1 file changed, 61 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index a5b124833473..b2dd5afdc708 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -23,6 +23,7 @@ import torch import torch.nn.functional as F import torch.utils.checkpoint +import inspect from torch import nn from torch.nn import CrossEntropyLoss @@ -1460,7 +1461,65 @@ def forward( attentions=outputs.attentions, ) - def _prepare_encoder_decoder_kwargs_for_generation(self, inputs_tensor, model_kwargs, model_input_name): + def _prepare_model_inputs( + self, + inputs, + bos_token_id, + model_kwargs, + ): + if ( + self.config.is_encoder_decoder + and hasattr(self, "encoder") + and self.encoder.main_input_name != self.main_input_name + ): + input_name = self.encoder.main_input_name + else: + input_name = self.main_input_name + + model_kwargs = {k: v for k, v in model_kwargs.items() if v is not None or k != input_name} + + # 2. check whether model_input_name is passed as kwarg + # if yes and `inputs` is None use kwarg inputs + inputs_kwarg = model_kwargs.pop(input_name, None) + if inputs_kwarg is not None and inputs is not None: + raise ValueError( + f"`inputs`: {inputs}` were passed alongside {input_name} which is not allowed." + f"Make sure to either pass {inputs} or {input_name}=..." + ) + elif inputs_kwarg is not None: + inputs = inputs_kwarg + + # 3. In the presence of `inputs_embeds` for text models: + # - decoder-only models should complain if the user attempts to pass `inputs_embeds`, but the model + # doesn't have its forwarding implemented. `inputs_embeds` is kept in `model_kwargs` and can coexist with + # input_ids (`inputs_embeds` will be used in the 1st generation step, as opposed to `input_ids`) + # - encoder-decoder models should complain if the user attempts to pass `inputs_embeds` and `input_ids`, and + # pull the former to inputs. It will be used in place of `input_ids` to get the encoder hidden states. + if input_name == "input_ids" and "inputs_embeds" in model_kwargs: + if not self.config.is_encoder_decoder: + has_inputs_embeds_forwarding = "inputs_embeds" in set( + inspect.signature(self.prepare_inputs_for_generation).parameters.keys() + ) + if not has_inputs_embeds_forwarding: + raise ValueError( + f"You passed `inputs_embeds` to `.generate()`, but the model class {self.__class__.__name__} " + "doesn't have its forwarding implemented. 
See the GPT2 implementation for an example " + "(https://github.com/huggingface/transformers/pull/21405), and feel free to open a PR with it!" + ) + # In this case, `input_ids` is moved to the `model_kwargs`, so a few automations (like the creation of + # the attention mask) can rely on the actual model input. + model_kwargs["input_ids"] = self._maybe_initialize_input_ids_for_generation( + inputs, bos_token_id, model_kwargs=model_kwargs + ) + else: + if inputs is not None: + raise ValueError("You passed `inputs_embeds` and `input_ids` to `.generate()`. Please pick one.") + inputs, input_name = model_kwargs["inputs_embeds"], "inputs_embeds" + + # 4. if `inputs` is still None, try to create `input_ids` from BOS token + inputs = self._maybe_initialize_input_ids_for_generation(inputs, bos_token_id, model_kwargs) + + # 5. Prepare model kwargs from IDEFICS vision component pixel_values = model_kwargs.get("pixel_values", None) image_encoder_embeddings = model_kwargs.get("image_encoder_embeddings", None) perceiver_embeddings = model_kwargs.get("perceiver_embeddings", None) @@ -1492,20 +1551,7 @@ def _prepare_encoder_decoder_kwargs_for_generation(self, inputs_tensor, model_kw ) model_kwargs["pixel_values"] = None - model_kwargs["input_ids"] = inputs_tensor - return model_kwargs - - def _prepare_decoder_input_ids_for_generation( - self, - batch_size, - model_input_name, - model_kwargs, - decoder_start_token_id, - bos_token_id, - device, - ): - decoder_input_ids = model_kwargs.pop("input_ids") - return decoder_input_ids, model_kwargs + return inputs, input_name, model_kwargs def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs) From aa791348c7a9bfbd9289ef985ad8a237cf8cb1ce Mon Sep 17 00:00:00 2001 From: leot13 Date: Mon, 14 Aug 2023 23:29:30 +0200 Subject: [PATCH 11/25] fix comment typo --- src/transformers/models/idefics/modeling_idefics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index b2dd5afdc708..8b108d3256ab 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1519,7 +1519,7 @@ def _prepare_model_inputs( # 4. if `inputs` is still None, try to create `input_ids` from BOS token inputs = self._maybe_initialize_input_ids_for_generation(inputs, bos_token_id, model_kwargs) - # 5. Prepare model kwargs from IDEFICS vision component + # 5. 
Prepare model_kwargs for IDEFICS vision component pixel_values = model_kwargs.get("pixel_values", None) image_encoder_embeddings = model_kwargs.get("image_encoder_embeddings", None) perceiver_embeddings = model_kwargs.get("perceiver_embeddings", None) From 190ea96f9e52877a1bc3925a12a57b074afc6ada Mon Sep 17 00:00:00 2001 From: leot13 Date: Mon, 14 Aug 2023 23:33:21 +0200 Subject: [PATCH 12/25] fix config default for is_encoder_decoder --- src/transformers/models/idefics/configuration_idefics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py index e045e2657a17..528273a585cf 100644 --- a/src/transformers/models/idefics/configuration_idefics.py +++ b/src/transformers/models/idefics/configuration_idefics.py @@ -268,7 +268,6 @@ def __init__( freeze_vision_layers=True, freeze_vision_module_exceptions=[], use_resampler=False, - is_encoder_decoder=True, vision_config=None, perceiver_config=None, **kwargs, @@ -298,7 +297,6 @@ def __init__( self.freeze_lm_head = freeze_lm_head self.use_resampler = use_resampler - self.is_encoder_decoder = is_encoder_decoder if perceiver_config is None: self.perceiver_config = IdeficsPerceiverConfig() From 6fdd61b2d4cb4404170bdb8fb39410a39106f36f Mon Sep 17 00:00:00 2001 From: leot13 Date: Tue, 15 Aug 2023 08:10:08 +0200 Subject: [PATCH 13/25] style --- src/transformers/models/idefics/modeling_idefics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 8b108d3256ab..23941fe3ed73 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -18,12 +18,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch Idefics model.""" +import inspect from typing import List, Optional, Tuple, Union import torch import torch.nn.functional as F import torch.utils.checkpoint -import inspect from torch import nn from torch.nn import CrossEntropyLoss From 480be3311945606eeb570f02634c547adcbfd0be Mon Sep 17 00:00:00 2001 From: leot13 Date: Tue, 15 Aug 2023 09:57:23 +0200 Subject: [PATCH 14/25] add typehints --- src/transformers/models/idefics/modeling_idefics.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 23941fe3ed73..6065d13d168c 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -19,7 +19,7 @@ # limitations under the License. 
""" PyTorch Idefics model.""" import inspect -from typing import List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -1463,10 +1463,10 @@ def forward( def _prepare_model_inputs( self, - inputs, - bos_token_id, - model_kwargs, - ): + inputs: Optional[torch.Tensor] = None, + bos_token_id: Optional[int] = None, + model_kwargs: Optional[Dict[str, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Optional[str], Dict[str, torch.Tensor]]: if ( self.config.is_encoder_decoder and hasattr(self, "encoder") From 79349ad8f7e2821a552909de66ed1e88d309b308 Mon Sep 17 00:00:00 2001 From: leot13 Date: Thu, 17 Aug 2023 21:02:46 +0200 Subject: [PATCH 15/25] precompute in forward --- .../models/idefics/modeling_idefics.py | 226 ++++++++++-------- 1 file changed, 123 insertions(+), 103 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 6065d13d168c..9f2abefd1573 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -18,8 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch Idefics model.""" -import inspect -from typing import Dict, List, Optional, Tuple, Union +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -29,7 +29,7 @@ from ... import PreTrainedModel from ...activations import ACT2FN -from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_outputs import ModelOutput from ...modeling_utils import PretrainedConfig from ...utils import ( add_start_docstrings, @@ -53,6 +53,96 @@ ] +@dataclass +class IdeficsBaseModelOutputWithPast(ModelOutput): + """ + Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): + Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, + sequence_length, hidden_size)`. + + image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + """ + + last_hidden_state: torch.FloatTensor = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class IdeficsCausalLMOutputWithPast(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Language modeling loss (for next-token prediction). + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (: + obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when + ``config.use_cache=True``): List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each + tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + ``past_key_values`` input) to speed up sequential decoding. + hidden_states (: + obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when + ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the + embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (: + obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when + ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + image_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`): + Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, + sequence_length, hidden_size)`. 
+ + image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + def expand_inputs_for_generation( input_ids, expand_size=1, @@ -65,7 +155,7 @@ def expand_inputs_for_generation( torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device) ) input_ids = input_ids.index_select(0, expanded_return_idx) - + model_kwargs["pixel_values"] = model_kwargs.get("pixel_values", None) model_kwargs["image_encoder_embeddings"] = model_kwargs.get("image_encoder_embeddings", None) model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings", None) model_kwargs["image_attention_mask"] = model_kwargs.get("image_attention_mask", None) @@ -82,7 +172,10 @@ def expand_inputs_for_generation( 0, expanded_return_idx ) - if model_kwargs["image_encoder_embeddings"] is not None: + if model_kwargs["pixel_values"] is not None: + model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx) + + elif model_kwargs["image_encoder_embeddings"] is not None: model_kwargs["image_encoder_embeddings"] = model_kwargs["image_encoder_embeddings"].index_select( 0, expanded_return_idx ) @@ -125,6 +218,9 @@ def update_model_kwargs_for_generation(outputs, model_kwargs): last_mask = image_attention_mask[:, -1, :].unsqueeze(1) model_kwargs["image_attention_mask"] = last_mask + # Get the precomputed image_hidden_states + model_kwargs["image_hidden_states"] = outputs.image_hidden_states + return model_kwargs @@ -146,6 +242,7 @@ def prepare_inputs_for_generation(input_ids, past=None, **kwargs): if past: position_ids = position_ids[:, -1].unsqueeze(-1) + pixel_values = kwargs.get("pixel_values", None) image_encoder_embeddings = kwargs.get("image_encoder_embeddings", None) perceiver_embeddings = kwargs.get("perceiver_embeddings", None) image_attention_mask = kwargs.get("image_attention_mask", None) @@ -157,6 +254,7 @@ def prepare_inputs_for_generation(input_ids, past=None, **kwargs): "position_ids": position_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids, + "pixel_values": pixel_values, "image_encoder_embeddings": image_encoder_embeddings, "perceiver_embeddings": perceiver_embeddings, "image_attention_mask": image_attention_mask, @@ -1070,7 +1168,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: + ) -> Union[Tuple, IdeficsBaseModelOutputWithPast]: device = input_ids.device if input_ids is not None else inputs_embeds.device output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -1296,13 +1394,19 @@ def vblock( all_hidden_states += (hidden_states,) next_cache = next_decoder_cache if use_cache else None + image_hidden_states = image_hidden_states.view(batch_size, num_images, image_seq_len, image_hidden_size) if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, image_hidden_states] + if v is 
not None + ) + return IdeficsBaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_self_attns, + image_hidden_states=image_hidden_states, ) @@ -1365,7 +1469,7 @@ def tie_weights(self): output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=IdeficsCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids: torch.LongTensor = None, @@ -1382,7 +1486,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: + ) -> Union[Tuple, IdeficsCausalLMOutputWithPast]: r""" Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1453,107 +1557,23 @@ def forward( output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output - return CausalLMOutputWithPast( + return IdeficsCausalLMOutputWithPast( loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, ) - def _prepare_model_inputs( - self, - inputs: Optional[torch.Tensor] = None, - bos_token_id: Optional[int] = None, - model_kwargs: Optional[Dict[str, torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, Optional[str], Dict[str, torch.Tensor]]: - if ( - self.config.is_encoder_decoder - and hasattr(self, "encoder") - and self.encoder.main_input_name != self.main_input_name - ): - input_name = self.encoder.main_input_name - else: - input_name = self.main_input_name - - model_kwargs = {k: v for k, v in model_kwargs.items() if v is not None or k != input_name} - - # 2. check whether model_input_name is passed as kwarg - # if yes and `inputs` is None use kwarg inputs - inputs_kwarg = model_kwargs.pop(input_name, None) - if inputs_kwarg is not None and inputs is not None: - raise ValueError( - f"`inputs`: {inputs}` were passed alongside {input_name} which is not allowed." - f"Make sure to either pass {inputs} or {input_name}=..." - ) - elif inputs_kwarg is not None: - inputs = inputs_kwarg - - # 3. In the presence of `inputs_embeds` for text models: - # - decoder-only models should complain if the user attempts to pass `inputs_embeds`, but the model - # doesn't have its forwarding implemented. `inputs_embeds` is kept in `model_kwargs` and can coexist with - # input_ids (`inputs_embeds` will be used in the 1st generation step, as opposed to `input_ids`) - # - encoder-decoder models should complain if the user attempts to pass `inputs_embeds` and `input_ids`, and - # pull the former to inputs. It will be used in place of `input_ids` to get the encoder hidden states. - if input_name == "input_ids" and "inputs_embeds" in model_kwargs: - if not self.config.is_encoder_decoder: - has_inputs_embeds_forwarding = "inputs_embeds" in set( - inspect.signature(self.prepare_inputs_for_generation).parameters.keys() - ) - if not has_inputs_embeds_forwarding: - raise ValueError( - f"You passed `inputs_embeds` to `.generate()`, but the model class {self.__class__.__name__} " - "doesn't have its forwarding implemented. 
See the GPT2 implementation for an example " - "(https://github.com/huggingface/transformers/pull/21405), and feel free to open a PR with it!" - ) - # In this case, `input_ids` is moved to the `model_kwargs`, so a few automations (like the creation of - # the attention mask) can rely on the actual model input. - model_kwargs["input_ids"] = self._maybe_initialize_input_ids_for_generation( - inputs, bos_token_id, model_kwargs=model_kwargs - ) - else: - if inputs is not None: - raise ValueError("You passed `inputs_embeds` and `input_ids` to `.generate()`. Please pick one.") - inputs, input_name = model_kwargs["inputs_embeds"], "inputs_embeds" - - # 4. if `inputs` is still None, try to create `input_ids` from BOS token - inputs = self._maybe_initialize_input_ids_for_generation(inputs, bos_token_id, model_kwargs) - - # 5. Prepare model_kwargs for IDEFICS vision component - pixel_values = model_kwargs.get("pixel_values", None) - image_encoder_embeddings = model_kwargs.get("image_encoder_embeddings", None) - perceiver_embeddings = model_kwargs.get("perceiver_embeddings", None) - - if pixel_values is not None: - batch_size, num_images = pixel_values.shape[:2] - pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:]) - image_encoder_embeddings = self.model.vision_model(pixel_values=pixel_values).last_hidden_state - - elif image_encoder_embeddings is not None: - batch_size, num_images, image_seq_len, image_hidden_size = image_encoder_embeddings.size() - image_encoder_embeddings = image_encoder_embeddings.view( - batch_size * num_images, image_seq_len, image_hidden_size - ) - - if self.config.use_resampler: - if perceiver_embeddings is None: - perceiver_embeddings = self.model.perceiver_resampler(image_encoder_embeddings) - image_seq_len, image_hidden_size = perceiver_embeddings.size(1), perceiver_embeddings.size(2) - model_kwargs["perceiver_embeddings"] = perceiver_embeddings.view( - batch_size, num_images, image_seq_len, image_hidden_size - ) - else: - model_kwargs["perceiver_embeddings"] = perceiver_embeddings - else: - image_seq_len, image_hidden_size = image_encoder_embeddings.size(1), image_encoder_embeddings.size(2) - model_kwargs["image_encoder_embeddings"] = image_encoder_embeddings.view( - batch_size, num_images, image_seq_len, image_hidden_size - ) - - model_kwargs["pixel_values"] = None - return inputs, input_name, model_kwargs - def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): + image_hidden_states = kwargs.get("image_hidden_states", None) + if image_hidden_states is not None: + if self.config.use_resampler: + kwargs["perceiver_embeddings"] = image_hidden_states + else: + kwargs["image_encoder_embeddings"] = image_hidden_states + kwargs["pixel_values"] = None inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs) unwanted_kwargs = ["token_type_ids"] for kwarg in unwanted_kwargs: From e6781fbed4ba8c341104a0c679e96cd953c59a58 Mon Sep 17 00:00:00 2001 From: leot13 Date: Thu, 17 Aug 2023 21:19:40 +0200 Subject: [PATCH 16/25] doc builder --- .../models/idefics/modeling_idefics.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 9f2abefd1573..8083572e89e5 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -104,33 +104,33 @@ class IdeficsCausalLMOutputWithPast(ModelOutput): Base class for 
causal language model (or autoregressive) outputs. Args: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Language modeling loss (for next-token prediction). - logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past_key_values (: - obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when - ``config.use_cache=True``): List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each - tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). + obj:*List[torch.FloatTensor]*, *optional*, returned when `use_cache=True` is passed or when + `config.use_cache=True`): List of `torch.FloatTensor` of length `config.n_layers`, with each + tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see - ``past_key_values`` input) to speed up sequential decoding. + `past_key_values` input) to speed up sequential decoding. hidden_states (: - obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when - ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the - embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. + obj:*tuple(torch.FloatTensor)*, *optional*, returned when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the + embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (: - obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when - ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + obj:*tuple(torch.FloatTensor)*, *optional*, returned when `output_attentions=True` is passed or when + `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - image_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`): - Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, - sequence_length, hidden_size)`. + image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): + Tuple of *torch.FloatTensor* (one for the output of the image embeddings, *(batch_size, num_images, + sequence_length, hidden_size)*. 
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver """ From df0d79bc7dc120ce5a3281bbce6b3d2e8035d3de Mon Sep 17 00:00:00 2001 From: leot13 Date: Thu, 17 Aug 2023 21:20:25 +0200 Subject: [PATCH 17/25] style --- src/transformers/models/idefics/modeling_idefics.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 8083572e89e5..150fd4539cf0 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -110,21 +110,21 @@ class IdeficsCausalLMOutputWithPast(ModelOutput): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past_key_values (: obj:*List[torch.FloatTensor]*, *optional*, returned when `use_cache=True` is passed or when - `config.use_cache=True`): List of `torch.FloatTensor` of length `config.n_layers`, with each - tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). + `config.use_cache=True`): List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of + shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. hidden_states (: obj:*tuple(torch.FloatTensor)*, *optional*, returned when `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the - embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (: obj:*tuple(torch.FloatTensor)*, *optional*, returned when `output_attentions=True` is passed or when - `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. + `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, + num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
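
At this point in the series, exactly one of `pixel_values`, `image_encoder_embeddings`, or `perceiver_embeddings` may be passed, and `forward` now returns the computed `image_hidden_states` so that generation can reuse them on later decoding steps instead of re-running the vision encoder. Below is a minimal usage sketch of that contract, not part of the patches themselves: the checkpoint name, image URL, processor call, and `IdeficsForVisionText2Text` class name are assumptions for illustration, while the submodule names (`model.vision_model`, `model.perceiver_resampler`) and the flatten/reshape logic mirror the diffs above.

    import requests
    import torch
    from PIL import Image
    from transformers import AutoProcessor, IdeficsForVisionText2Text  # class name assumed

    checkpoint = "HuggingFaceM4/idefics-9b"  # assumed checkpoint, for illustration only
    processor = AutoProcessor.from_pretrained(checkpoint)
    model = IdeficsForVisionText2Text.from_pretrained(checkpoint)

    url = "https://example.com/cat.png"  # placeholder image URL
    image = Image.open(requests.get(url, stream=True).raw)
    prompts = [["Describe this image:", image]]

    # Assumed processor signature; returns input_ids, attention_mask,
    # pixel_values and image_attention_mask as torch tensors.
    inputs = processor(prompts, return_tensors="pt")

    # Option 1: pass pixel_values and let the model run the vision encoder
    # (and the perceiver resampler when config.use_resampler is True).
    generated = model.generate(**inputs, max_new_tokens=16)

    # Option 2: precompute the vision embeddings once and reuse them,
    # passing exactly one of image_encoder_embeddings / perceiver_embeddings.
    pixel_values = inputs.pop("pixel_values")
    batch_size, num_images = pixel_values.shape[:2]
    with torch.no_grad():
        flat = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])
        image_encoder_embeddings = model.model.vision_model(pixel_values=flat).last_hidden_state
        if model.config.use_resampler:
            perceiver_embeddings = model.model.perceiver_resampler(image_encoder_embeddings)
            vision_kwargs = {
                "perceiver_embeddings": perceiver_embeddings.view(
                    batch_size, num_images, *perceiver_embeddings.shape[1:]
                )
            }
        else:
            vision_kwargs = {
                "image_encoder_embeddings": image_encoder_embeddings.view(
                    batch_size, num_images, *image_encoder_embeddings.shape[1:]
                )
            }

    generated = model.generate(**inputs, **vision_kwargs, max_new_tokens=16)

With the changes from the later patches in this series, the first decoding step stores the vision output in `image_hidden_states`, and `prepare_inputs_for_generation` feeds it back as `perceiver_embeddings` (or `image_encoder_embeddings` when no resampler is used) on subsequent steps, so the vision encoder runs at most once per `generate` call.
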
From cda719d46f60b67c14f11469e29ef581fe9d6019 Mon Sep 17 00:00:00 2001 From: leot13 Date: Thu, 17 Aug 2023 21:47:49 +0200 Subject: [PATCH 18/25] pop instead of get image hidden states --- src/transformers/models/idefics/modeling_idefics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 150fd4539cf0..b46bf4eb3fd8 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1567,7 +1567,7 @@ def forward( ) def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): - image_hidden_states = kwargs.get("image_hidden_states", None) + image_hidden_states = kwargs.pop("image_hidden_states", None) if image_hidden_states is not None: if self.config.use_resampler: kwargs["perceiver_embeddings"] = image_hidden_states From 698805104facb5cca4e2aa66d371ab0980d0360e Mon Sep 17 00:00:00 2001 From: leot13 Date: Thu, 17 Aug 2023 22:14:14 +0200 Subject: [PATCH 19/25] Trigger CI From 5f6fb1ea58e245485e30a200fd1ac23250cfef28 Mon Sep 17 00:00:00 2001 From: Leo Tronchon Date: Fri, 18 Aug 2023 12:59:35 +0200 Subject: [PATCH 20/25] Update src/transformers/models/idefics/modeling_idefics.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/models/idefics/modeling_idefics.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index b46bf4eb3fd8..4f675f6c7536 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1210,13 +1210,7 @@ def forward( position_ids = position_ids.view(-1, seq_length).long() no_images = False - if ( - sum( - vision_input is not None - for vision_input in (pixel_values, image_encoder_embeddings, perceiver_embeddings) - ) - != 1 - ): +if (pixel_values, image_encoder_embeddings, perceiver_embeddings).count(None) != 2: raise ValueError( "Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None." ) From 603efa81f3439cab2d180343b7fb34784d4c8d74 Mon Sep 17 00:00:00 2001 From: Leo Tronchon Date: Fri, 18 Aug 2023 13:09:01 +0200 Subject: [PATCH 21/25] Update src/transformers/models/idefics/modeling_idefics.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/models/idefics/modeling_idefics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 4f675f6c7536..229a53e811a8 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -129,8 +129,8 @@ class IdeficsCausalLMOutputWithPast(ModelOutput): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): - Tuple of *torch.FloatTensor* (one for the output of the image embeddings, *(batch_size, num_images, - sequence_length, hidden_size)*. + Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, + sequence_length, hidden_size)`. 
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver """ From 46fa4b0a1066a9749dfcac9526d59daf1cdcb765 Mon Sep 17 00:00:00 2001 From: leot13 Date: Fri, 18 Aug 2023 13:12:29 +0200 Subject: [PATCH 22/25] fix * + indentation + style --- src/transformers/models/idefics/modeling_idefics.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 229a53e811a8..4b48105d011a 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -104,31 +104,31 @@ class IdeficsCausalLMOutputWithPast(ModelOutput): Base class for causal language model (or autoregressive) outputs. Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + loss (`torch.FloatTensor` of shape `(1,)`, `optional`, returned when `labels` is provided): Language modeling loss (for next-token prediction). logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past_key_values (: - obj:*List[torch.FloatTensor]*, *optional*, returned when `use_cache=True` is passed or when + obj:`List[torch.FloatTensor]`, `optional`, returned when `use_cache=True` is passed or when `config.use_cache=True`): List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. hidden_states (: - obj:*tuple(torch.FloatTensor)*, *optional*, returned when `output_hidden_states=True` is passed or when + obj:`tuple(torch.FloatTensor)`, `optional`, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (: - obj:*tuple(torch.FloatTensor)*, *optional*, returned when `output_attentions=True` is passed or when + obj:`tuple(torch.FloatTensor)`, `optional`, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): + image_hidden_states (`tuple(torch.FloatTensor)`, `optional`): Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, sequence_length, hidden_size)`. @@ -1210,7 +1210,7 @@ def forward( position_ids = position_ids.view(-1, seq_length).long() no_images = False -if (pixel_values, image_encoder_embeddings, perceiver_embeddings).count(None) != 2: + if (pixel_values, image_encoder_embeddings, perceiver_embeddings).count(None) != 2: raise ValueError( "Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None." 
) From c5585e6f3ad665269c265b550b5529cbac20ae36 Mon Sep 17 00:00:00 2001 From: leot13 Date: Fri, 18 Aug 2023 14:03:12 +0200 Subject: [PATCH 23/25] simplify a bit the use_resampler logic using comments --- src/transformers/models/idefics/modeling_idefics.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 4b48105d011a..e61350d78c21 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1231,16 +1231,15 @@ def forward( if self.config.use_resampler: if perceiver_embeddings is None: - image_hidden_states = self.perceiver_resampler(image_hidden_states) - image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2) + perceiver_embeddings = self.perceiver_resampler(image_hidden_states) + image_seq_len, image_hidden_size = perceiver_embeddings.size(1), perceiver_embeddings.size(2) else: batch_size, num_images, image_seq_len, image_hidden_size = perceiver_embeddings.size() - image_hidden_states = perceiver_embeddings + image_hidden_states = perceiver_embeddings + elif perceiver_embeddings is None: + image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2) else: - if perceiver_embeddings is None: - image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2) - else: - raise ValueError("If perceiver_embeddings are passed, use_resampler should be True") + raise ValueError("If `perceiver_embeddings` are passed, use_resampler should be True") image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size) # # Hack to use the model in full language modeling mode From f0036e0a1dcb40268e991241620fb3b58fad15a8 Mon Sep 17 00:00:00 2001 From: leot13 Date: Fri, 18 Aug 2023 14:06:34 +0200 Subject: [PATCH 24/25] update diocstrings --- .../models/idefics/modeling_idefics.py | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index e61350d78c21..d46e9d9e731b 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -101,34 +101,31 @@ class IdeficsBaseModelOutputWithPast(ModelOutput): @dataclass class IdeficsCausalLMOutputWithPast(ModelOutput): """ - Base class for causal language model (or autoregressive) outputs. + Base class for Idefics causal language model (or autoregressive) outputs. Args: - loss (`torch.FloatTensor` of shape `(1,)`, `optional`, returned when `labels` is provided): + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Language modeling loss (for next-token prediction). logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - past_key_values (: - obj:`List[torch.FloatTensor]`, `optional`, returned when `use_cache=True` is passed or when - `config.use_cache=True`): List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of - shape `(2, batch_size, num_heads, sequence_length, embed_size_per_head)`). 
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) - Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - hidden_states (: - obj:`tuple(torch.FloatTensor)`, `optional`, returned when `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (: - obj:`tuple(torch.FloatTensor)`, `optional`, returned when `output_attentions=True` is passed or when - `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, - num_heads, sequence_length, sequence_length)`. + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - image_hidden_states (`tuple(torch.FloatTensor)`, `optional`): + image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, sequence_length, hidden_size)`. From de358eefb49a772db119a163012b0ccaaadf92ac Mon Sep 17 00:00:00 2001 From: leot13 Date: Fri, 18 Aug 2023 14:31:33 +0200 Subject: [PATCH 25/25] Trigger CI
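Taken together, the last few patches settle on two pieces of control flow: a compact check that exactly one of pixel_values, image_encoder_embeddings, or perceiver_embeddings is provided (a 3-tuple contains None exactly twice in that case), and a routing step that runs the perceiver resampler only when perceiver_embeddings was not precomputed. The sketch below re-implements that flow as a standalone function for illustration; vision_encoder and perceiver_resampler are stand-in callables rather than the real modules, and the shape bookkeeping done in the actual model is omitted.

import torch


def select_image_hidden_states(
    pixel_values=None,
    image_encoder_embeddings=None,
    perceiver_embeddings=None,
    use_resampler=False,
    vision_encoder=None,
    perceiver_resampler=None,
):
    # Exactly one of the three vision inputs must be set, i.e. the tuple holds None twice.
    if (pixel_values, image_encoder_embeddings, perceiver_embeddings).count(None) != 2:
        raise ValueError(
            "Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None."
        )

    if pixel_values is not None:
        image_hidden_states = vision_encoder(pixel_values)  # stand-in for the image encoder
    elif image_encoder_embeddings is not None:
        image_hidden_states = image_encoder_embeddings  # precomputed encoder output
    else:
        image_hidden_states = perceiver_embeddings  # precomputed resampler output

    if use_resampler:
        # Run the resampler only if its output was not passed in directly.
        if perceiver_embeddings is None:
            image_hidden_states = perceiver_resampler(image_hidden_states)
    elif perceiver_embeddings is not None:
        raise ValueError("If `perceiver_embeddings` are passed, use_resampler should be True")

    return image_hidden_states


# Usage with stand-in callables and illustrative shapes:
dummy_resampler = lambda h: h.mean(dim=1, keepdim=True)
encoder_out = torch.randn(2, 5, 16)  # (batch, visual tokens, hidden size)
resampled = select_image_hidden_states(
    image_encoder_embeddings=encoder_out,
    use_resampler=True,
    perceiver_resampler=dummy_resampler,
)
print(resampled.shape)  # torch.Size([2, 1, 16])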