From 789f2f4e99912e6403e005e78dacf4e47f13f0c7 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 8 Feb 2022 18:22:18 +0100 Subject: [PATCH 01/30] TF generate start refactor --- src/transformers/generation_tf_utils.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index f880cb368ea0..a1152ba90b64 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -764,14 +764,13 @@ def generate( cur_len < max_length ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" - if num_beams > 1: - output = self._generate_beam_search( + if num_beams == 1: + return self._generate_no_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, - early_stopping=early_stopping, temperature=temperature, top_k=top_k, top_p=top_p, @@ -781,25 +780,21 @@ def generate( pad_token_id=pad_token_id, eos_token_id=eos_token_id, batch_size=effective_batch_size, - num_return_sequences=num_return_sequences, - length_penalty=length_penalty, - num_beams=num_beams, vocab_size=vocab_size, encoder_outputs=encoder_outputs, attention_mask=attention_mask, use_cache=use_cache, - forced_bos_token_id=forced_bos_token_id, - forced_eos_token_id=forced_eos_token_id, return_dict_in_generate=return_dict_in_generate, **model_kwargs, ) else: - output = self._generate_no_beam_search( + return self._generate_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, + early_stopping=early_stopping, temperature=temperature, top_k=top_k, top_p=top_p, @@ -809,16 +804,19 @@ def generate( pad_token_id=pad_token_id, eos_token_id=eos_token_id, batch_size=effective_batch_size, + num_return_sequences=num_return_sequences, + length_penalty=length_penalty, + num_beams=num_beams, vocab_size=vocab_size, encoder_outputs=encoder_outputs, attention_mask=attention_mask, use_cache=use_cache, + forced_bos_token_id=forced_bos_token_id, + forced_eos_token_id=forced_eos_token_id, return_dict_in_generate=return_dict_in_generate, **model_kwargs, ) - return output - def _generate_no_beam_search( self, input_ids, From 6702ed3bbc994f29b8e1f8dd599132b36442a881 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 8 Feb 2022 19:06:51 +0100 Subject: [PATCH 02/30] Add tf tests for sample generate --- tests/test_modeling_tf_gpt2.py | 86 ++++++++++++++++++++++------------ tests/test_modeling_tf_t5.py | 28 +++++++++++ 2 files changed, 85 insertions(+), 29 deletions(-) diff --git a/tests/test_modeling_tf_gpt2.py b/tests/test_modeling_tf_gpt2.py index d653329a5e82..dea63a786ba6 100644 --- a/tests/test_modeling_tf_gpt2.py +++ b/tests/test_modeling_tf_gpt2.py @@ -26,6 +26,7 @@ if is_tf_available(): import tensorflow as tf + from transformers import GPT2Tokenizer from transformers.models.gpt2.modeling_tf_gpt2 import ( TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, TFGPT2DoubleHeadsModel, @@ -427,35 +428,6 @@ def test_model_from_pretrained(self): @require_tf class TFGPT2ModelLanguageGenerationTest(unittest.TestCase): - @slow - def test_lm_generate_gpt2(self): - model = TFGPT2LMHeadModel.from_pretrained("gpt2") - input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32) # The dog - expected_output_ids = [ - 464, - 3290, - 373, - 
1043, - 287, - 257, - 2214, - 1474, - 262, - 16246, - 286, - 2688, - 290, - 2688, - 27262, - 13, - 198, - 198, - 464, - 3290, - ] # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog - output_ids = model.generate(input_ids, do_sample=False) - self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) - @slow def test_lm_generate_distilgpt2(self): model = TFGPT2LMHeadModel.from_pretrained("distilgpt2") @@ -485,3 +457,59 @@ def test_lm_generate_distilgpt2(self): output_ids = model.generate(input_ids, do_sample=False) self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) + + @slow + def test_lm_generate_distilgpt2_batch_special(self): + model = TFGPT2LMHeadModel.from_pretrained("distilgpt2") + tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2") + + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" + + sentences = ["Today is a beautiful day and", "Yesterday was"] + input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids + + generation_kwargs = { + "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids], + "no_repeat_ngram_size": 2, + "do_sample": False, + "repetition_penalty": 1.3, + } + + output_ids = model.generate(input_ids, **generation_kwargs) + + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + expected_output_string = [ + "Today is a beautiful day and I am so happy to be able take part in this amazing event.", + "Yesterday was a very busy day for the first time since I started writing this post", + ] + self.assertListEqual(output_strings, expected_output_string) + + @slow + def a_test_lm_generate_gpt2(self): + model = TFGPT2LMHeadModel.from_pretrained("gpt2") + input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32) # The dog + expected_output_ids = [ + 464, + 3290, + 373, + 1043, + 287, + 257, + 2214, + 1474, + 262, + 16246, + 286, + 2688, + 290, + 2688, + 27262, + 13, + 198, + 198, + 464, + 3290, + ] # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) diff --git a/tests/test_modeling_tf_t5.py b/tests/test_modeling_tf_t5.py index 59ee70c53ec2..a9f3d4a2e9c6 100644 --- a/tests/test_modeling_tf_t5.py +++ b/tests/test_modeling_tf_t5.py @@ -439,6 +439,34 @@ def test_train_pipeline_custom_model(self): pass +@require_tf +@require_sentencepiece +@require_tokenizers +class TFT5GenerationIntegrationTests(unittest.TestCase): + @slow + def test_greedy_generate(self): + model = TFT5ForConditionalGeneration.from_pretrained("t5-small") + tokenizer = T5Tokenizer.from_pretrained("t5-small") + + sentences = ["Yesterday, my name was", "Today is a beautiful day and"] + input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids + + generation_kwargs = { + "bad_words_ids": [tokenizer("my").input_ids, tokenizer("ein schöner").input_ids], + "no_repeat_ngram_size": 3, + "do_sample": False, + "repetition_penalty": 2.2, + } + + output_ids = model.generate(input_ids, **generation_kwargs) + + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + expected_output_string = ["Yesterday, my name was", "Heute ist ein schöne Tag und"] + + self.assertListEqual(expected_output_string, output_strings) + + @require_tf @require_sentencepiece @require_tokenizers From cf6de09adc65fe0888891c857d49365a98b09e06 Mon Sep 17 00:00:00 
2001 From: Patrick von Platen Date: Wed, 9 Feb 2022 18:49:29 +0100 Subject: [PATCH 03/30] re-organize --- src/transformers/generation_tf_utils.py | 473 +++++++++++++++++++++++- 1 file changed, 471 insertions(+), 2 deletions(-) diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index a1152ba90b64..c41f243f74b1 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -544,6 +544,38 @@ def generate( input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids ) # generate sequences without allowing bad_words to be generated ```""" + num_beams = num_beams if num_beams is not None else self.config.num_beams + do_sample = do_sample if do_sample is not None else self.config.do_sample + + is_greedy_gen_mode = num_beams == 1 and do_sample is False + + if is_greedy_gen_mode: + return self._generate( + input_ids=input_ids, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + early_stopping=early_stopping, + num_beams=num_beams, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + bad_words_ids=bad_words_ids, + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + length_penalty=length_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + num_return_sequences=num_return_sequences, + attention_mask=attention_mask, + decoder_start_token_id=decoder_start_token_id, + use_cache=use_cache, + output_scores=output_scores, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict_in_generate=return_dict_in_generate, + ) # We cannot generate if the model does not have a LM head if self.get_output_embeddings() is None: @@ -554,12 +586,11 @@ def generate( max_length = max_length if max_length is not None else self.config.max_length min_length = min_length if min_length is not None else self.config.min_length - do_sample = do_sample if do_sample is not None else self.config.do_sample early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping - num_beams = num_beams if num_beams is not None else self.config.num_beams temperature = temperature if temperature is not None else self.config.temperature top_k = top_k if top_k is not None else self.config.top_k top_p = top_p if top_p is not None else self.config.top_p + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id @@ -1699,3 +1730,441 @@ def is_done(self, best_sum_logprobs, cur_len): cur_score = best_sum_logprobs / cur_len ** self.length_penalty ret = self.worst_score >= cur_score return ret + + def _generate( + self, + input_ids=None, + max_length=None, + min_length=None, + do_sample=None, + early_stopping=None, + num_beams=None, + temperature=None, + top_k=None, + top_p=None, + repetition_penalty=None, + bad_words_ids=None, + bos_token_id=None, + pad_token_id=None, + eos_token_id=None, + length_penalty=None, + no_repeat_ngram_size=None, + num_return_sequences=None, + attention_mask=None, + decoder_start_token_id=None, + use_cache=None, + output_scores=None, + output_attentions=None, + output_hidden_states=None, + return_dict_in_generate=None, + forced_bos_token_id=None, + forced_eos_token_id=None, + **model_kwargs, + ) -> Union[TFGreedySearchOutput, 
TFSampleOutput, TFBeamSearchOutput, TFBeamSampleOutput, tf.Tensor]:
+        r"""
+        Generates sequences for models with a language modeling head. The method currently supports greedy decoding,
+        beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling.
+
+        Adapted in part from [Facebook's XLM beam search
+        code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529).
+
+        Apart from `input_ids` and `attention_mask`, all the arguments below will default to the value of the
+        attribute of the same name inside the [`PretrainedConfig`] of the model. The default values indicated are the
+        default values of the model's config.
+
+        Most of these parameters are explained in more detail in [this blog
+        post](https://huggingface.co/blog/how-to-generate).
+
+        Parameters:
+
+            input_ids (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*):
+                The sequence used as a prompt for the generation. If `None`, the method initializes it with
+                `bos_token_id` and a batch size of 1.
+            max_length (`int`, *optional*, defaults to 20):
+                The maximum length of the sequence to be generated.
+            min_length (`int`, *optional*, defaults to 10):
+                The minimum length of the sequence to be generated.
+            do_sample (`bool`, *optional*, defaults to `False`):
+                Whether or not to use sampling; use greedy decoding otherwise.
+            early_stopping (`bool`, *optional*, defaults to `False`):
+                Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
+            num_beams (`int`, *optional*, defaults to 1):
+                Number of beams for beam search. 1 means no beam search.
+            temperature (`float`, *optional*, defaults to 1.0):
+                The value used to modulate the next token probabilities.
+            top_k (`int`, *optional*, defaults to 50):
+                The number of highest probability vocabulary tokens to keep for top-k-filtering.
+            top_p (`float`, *optional*, defaults to 1.0):
+                If set to a float < 1, only the most probable tokens with probabilities that add up to `top_p` or
+                higher are kept for generation.
+            repetition_penalty (`float`, *optional*, defaults to 1.0):
+                The parameter for repetition penalty. 1.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            bos_token_id (`int`, *optional*):
+                The id of the *beginning-of-sequence* token.
+            eos_token_id (`int`, *optional*):
+                The id of the *end-of-sequence* token.
+            length_penalty (`float`, *optional*, defaults to 1.0):
+                Exponential penalty to the length. 1.0 means no penalty.
+
+                Set to values < 1.0 in order to encourage the model to generate shorter sequences, or to a value > 1.0
+                in order to encourage the model to produce longer sequences.
+            no_repeat_ngram_size (`int`, *optional*, defaults to 0):
+                If set to an int > 0, all ngrams of that size can only occur once.
+            bad_words_ids (`List[List[int]]`, *optional*):
+                List of lists of token ids that are not allowed to be generated. In order to get the tokens of the
+                words that should not appear in the generated text, use
+                `tokenizer.encode(bad_word, add_prefix_space=True)`.
+            num_return_sequences (`int`, *optional*, defaults to 1):
+                The number of independently computed returned sequences for each element in the batch.
+            attention_mask (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`: 1 for tokens
+                that are not masked, and 0 for masked tokens.
+
+                If not provided, will default to a tensor the same shape as `input_ids` that masks the pad token.
+
+                [What are attention masks?](../glossary#attention-mask)
+            decoder_start_token_id (`int`, *optional*):
+                If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
+            use_cache (`bool`, *optional*, defaults to `True`):
+                Whether or not the model should use the past last key/values attentions (if applicable to the model)
+                to speed up decoding.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more details.
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more details.
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+            forced_bos_token_id (`int`, *optional*):
+                The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful
+                for multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be
+                the target language token.
+            forced_eos_token_id (`int`, *optional*):
+                The id of the token to force as the last generated token when `max_length` is reached.
+            model_kwargs:
+                Additional model specific kwargs will be forwarded to the `forward` function of the model.
+
+        Return:
+            [`~file_utils.ModelOutput`] or `tf.Tensor`: A [`~file_utils.ModelOutput`] (if
+            `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `tf.Tensor`.
+
+            If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
+            [`~file_utils.ModelOutput`] types are:
+
+                - [`~generation_utils.TFGreedySearchDecoderOnlyOutput`],
+                - [`~generation_utils.TFSampleDecoderOnlyOutput`],
+                - [`~generation_utils.TFBeamSearchDecoderOnlyOutput`],
+                - [`~generation_utils.TFBeamSampleDecoderOnlyOutput`]
+
+            If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
+            [`~file_utils.ModelOutput`] types are:
+
+                - [`~generation_utils.TFGreedySearchEncoderDecoderOutput`],
+                - [`~generation_utils.TFSampleEncoderDecoderOutput`],
+                - [`~generation_utils.TFBeamSearchEncoderDecoderOutput`],
+                - [`~generation_utils.TFBeamSampleEncoderDecoderOutput`]
+
+        Examples:
+
+        ```python
+        tokenizer = AutoTokenizer.from_pretrained("distilgpt2")  # Initialize tokenizer
+        model = TFAutoModelWithLMHead.from_pretrained(
+            "distilgpt2"
+        )  # Download model and configuration from huggingface.co and cache.
+        outputs = model.generate(max_length=40)  # do greedy decoding
+        print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")
+
+        tokenizer = AutoTokenizer.from_pretrained("openai-gpt")  # Initialize tokenizer
+        model = TFAutoModelWithLMHead.from_pretrained(
+            "openai-gpt"
+        )  # Download model and configuration from huggingface.co and cache.
+ input_context = "The dog" + input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context + outputs = model.generate( + input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5 + ) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' + for i in range(3): # 3 output sequences were generated + print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("distilgpt2") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained( + "distilgpt2" + ) # Download model and configuration from huggingface.co and cache. + input_context = "The dog" + input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context + outputs = model.generate( + input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True + ) # generate 3 candidates using sampling + for i in range(3): # 3 output sequences were generated + print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("ctrl") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained( + "ctrl" + ) # Download model and configuration from huggingface.co and cache. + input_context = "Legal My neighbor is" # "Legal" is one of the control codes for ctrl + input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context + outputs = model.generate( + input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2 + ) # generate sequences + print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("gpt2") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained( + "gpt2" + ) # Download model and configuration from huggingface.co and cache. + input_context = "My cute dog" + bad_words_ids = [ + tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ["idiot", "stupid", "shut up"] + ] + input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context + outputs = model.generate( + input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids + ) # generate sequences without allowing bad_words to be generated + ```""" + + # We cannot generate if the model does not have a LM head + if self.get_output_embeddings() is None: + raise AttributeError( + "You tried to generate sequences with a model that does not have a LM Head. " + "Please use another model class (e.g. 
`TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, `TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, `TFTransfoXLLMHeadModel`)" + ) + + num_beams = num_beams if num_beams is not None else self.config.num_beams + do_sample = do_sample if do_sample is not None else self.config.do_sample + max_length = max_length if max_length is not None else self.config.max_length + min_length = min_length if min_length is not None else self.config.min_length + early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping + temperature = temperature if temperature is not None else self.config.temperature + top_k = top_k if top_k is not None else self.config.top_k + top_p = top_p if top_p is not None else self.config.top_p + + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + no_repeat_ngram_size = ( + no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size + ) + bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) + decoder_start_token_id = ( + decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id + ) + forced_bos_token_id = ( + forced_bos_token_id if forced_bos_token_id is not None else self.config.forced_bos_token_id + ) + forced_eos_token_id = ( + forced_eos_token_id if forced_eos_token_id is not None else self.config.forced_eos_token_id + ) + + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + model_kwargs["output_scores"] = output_scores + model_kwargs["output_attentions"] = output_attentions + model_kwargs["output_hidden_states"] = output_hidden_states + + if self.config.is_encoder_decoder: + model_kwargs["encoder_attentions"] = None + model_kwargs["encoder_hidden_states"] = None + + if input_ids is not None: + batch_size = shape_list(input_ids)[0] # overridden by the input batch_size + else: + batch_size = 1 + + assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer." + assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." + assert isinstance(do_sample, bool), "`do_sample` should be a boolean." + assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." + assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer." + assert temperature > 0, "`temperature` should be strictly positive." + assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." 
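# A hedged aside on the sampling arguments being validated in these asserts:
# when `do_sample=True`, `top_k`/`top_p` control how the next-token distribution
# is filtered before sampling. A minimal standalone sketch of the top-k half
# (a hypothetical helper for illustration, not part of this patch) might be:

def top_k_filter(logits, top_k=50, filter_value=-float("inf")):
    # keep the k highest-scoring logits per row and mask out everything else
    kth_best = tf.math.top_k(logits, k=top_k).values[..., -1, None]
    return tf.where(logits < kth_best, filter_value, logits)

# e.g. `tf.random.categorical(top_k_filter(next_token_logits), num_samples=1)`
# would then sample only from the 50 most likely tokens.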
+ assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." + assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." + assert input_ids is not None or ( + isinstance(bos_token_id, int) and bos_token_id >= 0 + ), "If input_ids is not defined, `bos_token_id` should be a positive integer." + assert pad_token_id is None or ( + isinstance(pad_token_id, int) and (pad_token_id >= 0) + ), "`pad_token_id` should be a positive integer." + assert (eos_token_id is None) or ( + isinstance(eos_token_id, int) and (eos_token_id >= 0) + ), "`eos_token_id` should be a positive integer." + assert length_penalty > 0, "`length_penalty` should be strictly positive." + assert ( + isinstance(num_return_sequences, int) and num_return_sequences > 0 + ), "`num_return_sequences` should be a strictly positive integer." + assert ( + bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) + ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" + + # This block corresponds to the following line in `generation_utils`: + # "input_ids = self._prepare_input_ids_for_generation(bos_token_id, model_kwargs.get("encoder_outputs"))" + # with the following differences: + # 1. In PT, `generate()`'s `model_kwargs` can accept `encoder_outputs`, but not the case in TF. + # 2. There is no shape checking in PT. + # In both PT/TF, if `input_ids` is `None`, we try to create it as it is for a text model. + if input_ids is None: + assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( + "you should either supply a context to complete as `input_ids` input " + "or a `bos_token_id` (integer >= 0) as a first token to start the generation." + ) + input_ids = tf.fill((batch_size, 1), bos_token_id) + + # not allow to duplicate outputs when greedy decoding + if do_sample is False: + if num_beams == 1: + # no_beam_search greedy generation conditions + assert ( + num_return_sequences == 1 + ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" + + else: + # beam_search greedy generation conditions + assert ( + num_beams >= num_return_sequences + ), "Greedy beam search decoding cannot return more sequences than it has beams. 
Please set num_beams >= num_return_sequences" + + # create attention mask if necessary + # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 + if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): + attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) + elif attention_mask is None: + attention_mask = tf.ones_like(input_ids) + + if pad_token_id is None and eos_token_id is not None: + logger.warning(f"Setting `pad_token_id` to {eos_token_id} (first `eos_token_id`) to generate sequence") + pad_token_id = eos_token_id + + # current position and vocab size + cur_len = shape_list(input_ids)[1] # unused + vocab_size = getattr(self.config, "vocab_size", None) + if vocab_size is None and self.config.is_encoder_decoder: + decoder_config = getattr(self.config, "decoder", None) + if decoder_config is not None: + vocab_size = getattr(self.config.decoder, "vocab_size", None) + + # set effective batch size and effective batch multiplier according to do_sample + if do_sample: + effective_batch_size = batch_size * num_return_sequences + effective_batch_mult = num_return_sequences + else: + effective_batch_size = batch_size + effective_batch_mult = 1 + + if self.config.is_encoder_decoder: + if decoder_start_token_id is None: + decoder_start_token_id = bos_token_id + + assert ( + decoder_start_token_id is not None + ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" + assert hasattr(self, "get_encoder"), f"{self} should have a 'get_encoder' function defined" + assert callable(self.get_encoder), f"{self.get_encoder} should be a method" + + # get encoder and store encoder outputs + encoder = self.get_encoder() + + encoder_kwargs = { + "attention_mask": attention_mask, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict_in_generate, + } + + # vision models don't use `attention_mask`. + signature = dict(inspect.signature(encoder.call).parameters) + if "attention_mask" not in signature: + encoder_kwargs.pop("attention_mask") + + encoder_outputs = encoder(input_ids, **encoder_kwargs) + if return_dict_in_generate: + if output_attentions: + model_kwargs["encoder_attentions"] = encoder_outputs.attentions + if output_hidden_states: + model_kwargs["encoder_hidden_states"] = encoder_outputs.hidden_states + + # The condition `len(shape_list(input_ids)) == 2` is to make this block treats only text inputs. 
+ # (vision inputs might occur when the model is an encoder-decoder model) + # Expand input ids if num_beams > 1 or num_return_sequences > 1 + if len(shape_list(input_ids)) == 2 and (num_return_sequences > 1 or num_beams > 1): + input_ids_len = shape_list(input_ids)[-1] + input_ids = tf.broadcast_to( + tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) + ) + attention_mask = tf.broadcast_to( + tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) + ) + input_ids = tf.reshape( + input_ids, (effective_batch_size * num_beams, input_ids_len) + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + attention_mask = tf.reshape( + attention_mask, (effective_batch_size * num_beams, input_ids_len) + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + + if self.config.is_encoder_decoder: + + # create empty decoder_input_ids + input_ids = ( + tf.ones( + (effective_batch_size * num_beams, 1), + dtype=tf.int32, + ) + * decoder_start_token_id + ) + cur_len = 1 + + assert ( + batch_size == encoder_outputs[0].shape[0] + ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " + + # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) + expanded_batch_idxs = tf.reshape( + tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1), + shape=(-1,), + ) + # expand encoder_outputs + encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0),) + else: + encoder_outputs = None + cur_len = shape_list(input_ids)[-1] + + assert ( + cur_len < max_length + ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" + + is_greedy_gen_mode = (num_beams == 1) and do_sample is False + + if is_greedy_gen_mode: + if num_return_sequences > 1: + raise ValueError( + f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search." + ) + + # 10. run greedy search + return self.greedy_search( + input_ids, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + **model_kwargs, + ) From 7164d93e72a37af30c556106239b41e5846ffee9 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 9 Feb 2022 23:55:21 +0100 Subject: [PATCH 04/30] boom boom --- .../generation_tf_logits_process.py | 291 ++++++++++++++++++ src/transformers/generation_tf_utils.py | 280 ++++++++++++++++- 2 files changed, 563 insertions(+), 8 deletions(-) create mode 100644 src/transformers/generation_tf_logits_process.py diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py new file mode 100644 index 000000000000..b8454d011996 --- /dev/null +++ b/src/transformers/generation_tf_logits_process.py @@ -0,0 +1,291 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import tensorflow as tf +from abc import ABC + + +from .file_utils import add_start_docstrings +from .utils.logging import get_logger + + +logger = get_logger(__name__) + + +TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + scores (`tf.Tensor` of shape `(batch_size, config.vocab_size)`): + Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam + search or log softmax for each vocabulary token when using beam search + kwargs: + Additional logits processor specific kwargs. + + Return: + `tf.Tensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores. + +""" + + +class TFLogitsProcessor(ABC): + """Abstract base class for all logit processors that can be applied during generation.""" + + @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING) + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: + """TF method for processing logits.""" + raise NotImplementedError( + f"{self.__class__} is an abstract class. Only classes inheriting this class can be called." + ) + + +class TFLogitsProcessorList(list): + """ + This class can be used to create a list of [`TFLogitsProcessor`] to subsequently process + a `scores` input tensor. This class inherits from list and adds a specific *__call__* method to apply each + [`TFLogitsProcessor`] to the inputs. + """ + + @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING) + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int, **kwargs) -> tf.Tensor: + for processor in self: + function_args = inspect.signature(processor.__call__).parameters + if len(function_args) > 3: + if not all(arg in kwargs for arg in list(function_args.keys())[2:]): + raise ValueError( + f"Make sure that all the required parameters: {list(function_args.keys())} for " + f"{processor.__class__} are passed to the logits processor." + ) + scores = processor(input_ids, scores, cur_len, **kwargs) + else: + scores = processor(input_ids, scores, cur_len) + return scores + + +class TFMinLengthLogitsProcessor(TFLogitsProcessor): + r""" + [`TFLogitsProcessor`] enforcing a min-length by setting EOS probability to 0. + + Args: + min_length (`int`): + The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`. + eos_token_id (`int`): + The id of the *end-of-sequence* token. 
+ """ + + def __init__(self, min_length: int, eos_token_id: int): + if not isinstance(min_length, int) or min_length < 0: + raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}") + + if not isinstance(eos_token_id, int) or eos_token_id < 0: + raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}") + + self.min_length = min_length + self.eos_token_id = eos_token_id + + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: + # create boolean flag to decide if min length penalty should be applied + apply_penalty = 1 - tf.clip(cur_len - self.min_length, 0, 1) + + scores = tf.where( + apply_penalty, jax.ops.index_update(scores, jax.ops.index[:, self.eos_token_id], -float("inf")), scores + ) + + return scores + + +class TFRepetitionPenaltyLogitsProcessor(TFLogitsProcessor): + r""" + [`LogitsProcessor`] enforcing an exponential penalty on repeated sequences. + + Args: + repetition_penalty (`float`): + The parameter for repetition penalty. 1.0 means no penalty. See [this + paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + """ + + def __init__(self, penalty: float): + if not isinstance(penalty, float) or not (penalty > 0): + raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}") + + self.penalty = penalty + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + score = torch.gather(scores, 1, input_ids) + + # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability + score = torch.where(score < 0, score * self.penalty, score / self.penalty) + + scores.scatter_(1, input_ids, score) + return scores + + +class TFNoBadWordsLogitsProcessor(TFLogitsProcessor): + """ + [`LogitsProcessor`] that enforces that specified sequences will never be sampled. + + Args: + bad_words_ids (`List[List[int]]`): + List of list of token ids that are not allowed to be generated. In order to get the tokens of the words + that should not appear in the generated text, use `tokenizer(bad_word, add_prefix_space=True).input_ids`. + eos_token_id (`int`): + The id of the *end-of-sequence* token. + """ + + def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int): + + if not isinstance(bad_words_ids, List) or len(bad_words_ids) == 0: + raise ValueError(f"`bad_words_ids` has to be a non-emtpy list, but is {bad_words_ids}.") + if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids): + raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.") + if any( + any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in bad_word_ids) + for bad_word_ids in bad_words_ids + ): + raise ValueError( + f"Each list in `bad_words_ids` has to be a list of positive integers, but is {bad_words_ids}." 
+ ) + + bad_words_ids = list(filter(lambda bad_token_seq: bad_token_seq != [eos_token_id], bad_words_ids)) + self.bad_words_id_length_1 = [] + self.bad_words_id_length_greater_than_1 = [] + for word in bad_words_ids: + if len(word) == 1: + self.bad_words_id_length_1.append(word[0]) + else: + self.bad_words_id_length_greater_than_1.append(word) + + self.static_bad_words_mask: Optional[torch.LongTensor] = None + + for banned_token_seq in self.bad_words_id_length_greater_than_1: + if len(banned_token_seq) == 0: + raise ValueError(f"Banned words token sequences {bad_words_ids} cannot have an empty list") + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if self.static_bad_words_mask is None and len(self.bad_words_id_length_1) > 0: + self.static_bad_words_mask = self._calc_static_bad_word_mask(scores) + + dynamic_banned_tokens = self._calc_banned_bad_words_ids(input_ids.tolist()) + scores = self._set_scores_to_inf_for_banned_tokens(scores, dynamic_banned_tokens) + + return scores + + def _calc_static_bad_word_mask(self, scores: torch.FloatTensor) -> torch.BoolTensor: + static_bad_words_mask = torch.zeros(scores.shape[1]) + static_bad_words_mask[self.bad_words_id_length_1] = 1 + return static_bad_words_mask.unsqueeze(0).to(scores.device).bool() + + def _tokens_match(self, prev_tokens: List[int], tokens: List[int]) -> bool: + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + elif len(tokens) > len(prev_tokens): + # if bad word tokens are longer then prev input_ids they can't be equal + return False + else: + return prev_tokens[-len(tokens) :] == tokens + + def _calc_banned_bad_words_ids(self, prev_input_ids: List[List[int]]) -> Iterable[int]: + banned_tokens = [] + for prev_input_ids_slice in prev_input_ids: + banned_tokens_slice = [] + for banned_token_seq in self.bad_words_id_length_greater_than_1: + if self._tokens_match(prev_input_ids_slice, banned_token_seq[:-1]): + banned_tokens_slice.append(banned_token_seq[-1]) + + banned_tokens.append(banned_tokens_slice) + + return banned_tokens + + def _set_scores_to_inf_for_banned_tokens( + self, scores: torch.Tensor, banned_tokens: List[List[int]] + ) -> torch.Tensor: + """ + Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be a + list of list of banned tokens to ban in the format [[batch index, vocabulary position],... + + Args: + scores: logits distribution of shape (batch size, vocabulary size) + banned_tokens: list of list of tokens to ban of length (batch_size) + """ + banned_mask_list = [] + for idx, batch_banned_tokens in enumerate(banned_tokens): + for token in batch_banned_tokens: + # Eliminates invalid bad word IDs that are over the vocabulary size. + if token <= scores.shape[1]: + banned_mask_list.append([idx, token]) + else: + logger.error( + f"An invalid bad word ID is defined: {token}. This ID is not contained in the " + f"vocabulary, and is therefore ignored." + ) + if not banned_mask_list and self.static_bad_words_mask is None: + return scores + + else: + if banned_mask_list: + banned_mask = torch.LongTensor(banned_mask_list) + indices = torch.ones(len(banned_mask)) + # A sparse tensor is generated from a list of coordinates: [[0, 1], [0, 2], [2, 0]]. 
A conversion to dense tensor generates: + # [ 0 1 1 ] + # [ 0 0 0 ] + # [ 1 0 0 ] + + banned_mask = ( + torch.sparse.LongTensor(banned_mask.t(), indices, scores.size()) + .to(scores.device) + .to_dense() + .bool() + ) + + if self.static_bad_words_mask is not None: + banned_mask = torch.bitwise_or(banned_mask, self.static_bad_words_mask) + else: + banned_mask = self.static_bad_words_mask + + scores = scores.masked_fill(banned_mask, -float("inf")) + return scores + + +class TFNoRepeatNGramLogitsProcessor(TFLogitsProcessor): + r""" + [`LogitsProcessor`] that enforces no repetition of n-grams. See + [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345). + + Args: + ngram_size (`int`): + All ngrams of size `ngram_size` can only occur once. + """ + + def __init__(self, ngram_size: int): + if not isinstance(ngram_size, int) or ngram_size <= 0: + raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}") + self.ngram_size = ngram_size + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + num_batch_hypotheses = scores.shape[0] + cur_len = input_ids.shape[-1] + banned_batch_tokens = _calc_banned_ngram_tokens(self.ngram_size, input_ids, num_batch_hypotheses, cur_len) + + for i, banned_tokens in enumerate(banned_batch_tokens): + scores[i, banned_tokens] = -float("inf") + + return scores diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index c41f243f74b1..ced107c5cb37 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -16,11 +16,19 @@ import inspect from dataclasses import dataclass -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Union, List import numpy as np import tensorflow as tf +from .generation_tf_logits_process import ( + TFLogitsProcessorList, + TFMinLengthLogitsProcessor, + TFNoBadWordsLogitsProcessor, + TFNoRepeatNGramLogitsProcessor, + TFRepetitionPenaltyLogitsProcessor, +) + from .file_utils import ModelOutput from .utils import logging @@ -186,8 +194,6 @@ class TFBeamSearchDecoderOnlyOutput(ModelOutput): `(batch_size*num_beams*num_return_sequences, config.vocab_size)`). attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. - hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`. 
""" @@ -941,7 +947,7 @@ def _generate_no_beam_search( if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + # from fairseq: https://github.com/pytf/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) # create banned_tokens boolean mask banned_tokens_indices_mask = [] @@ -1138,7 +1144,7 @@ def _generate_beam_search( # cache compute states past = encoder_outputs - # to stay similar to torch : past = (encoder_outputs, None) if encoder_outputs is not None else None + # to stay similar to tf : past = (encoder_outputs, None) if encoder_outputs is not None else None # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and kwargs["output_scores"]) else None @@ -1211,7 +1217,7 @@ def _generate_beam_search( if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + # from fairseq: https://github.com/pytf/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 num_batch_hypotheses = batch_size * num_beams banned_tokens = calc_banned_ngram_tokens( input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len @@ -2151,6 +2157,15 @@ def _generate( is_greedy_gen_mode = (num_beams == 1) and do_sample is False + # prepare distribution pre_processing samplers + logits_processor = self._get_logits_processor( + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + min_length=min_length, + eos_token_id=eos_token_id, + ) + if is_greedy_gen_mode: if num_return_sequences > 1: raise ValueError( @@ -2160,11 +2175,260 @@ def _generate( # 10. run greedy search return self.greedy_search( input_ids, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, + max_length=max_length, pad_token_id=pad_token_id, eos_token_id=eos_token_id, + logits_processor=logits_processor, output_scores=output_scores, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, return_dict_in_generate=return_dict_in_generate, **model_kwargs, ) + + def _get_logits_processor( + self, + repetition_penalty: float, + no_repeat_ngram_size: int, + bad_words_ids: List[List[int]], + min_length: int, + eos_token_id: int, + ) -> TFLogitsProcessorList: + """ + This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`] + instances used to modify the scores of the language model head. 
+ """ + processors = TFLogitsProcessorList() + + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty + no_repeat_ngram_size = ( + no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size + ) + bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + + # instantiate processors list + if repetition_penalty is not None and repetition_penalty != 1.0: + processors.append(TFRepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) + if no_repeat_ngram_size is not None and no_repeat_ngram_size > 0: + processors.append(TFNoRepeatNGramLogitsProcessor(no_repeat_ngram_size)) + if bad_words_ids is not None: + processors.append(TFNoBadWordsLogitsProcessor(bad_words_ids, eos_token_id)) + if min_length is not None and eos_token_id is not None and min_length > -1: + processors.append(TFMinLengthLogitsProcessor(min_length, eos_token_id)) + + return processors + + def greedy_search( + self, + input_ids: tf.Tensor, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + logits_processor: Optional[TFLogitsProcessorList] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + **model_kwargs, + ) -> Union[TFGreedySearchOutput, tf.Tensor]: + r""" + Generates sequences for models with a language modeling head using greedy decoding. + + Parameters: + + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + + max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`int`, *optional*): + The id of the *end-of-sequence* token. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
+ synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + model_kwargs: + Additional model specific keyword arguments will be forwarded to the `forward` function of the model. + If model is an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`~generation_utils.GreedySearchDecoderOnlyOutput`], [`~generation_utils.GreedySearchEncoderDecoderOutput`] + or `tf.Tensor`: A `tf.Tensor` containing the generated tokens (default behaviour) or a + [`~generation_utils.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation_utils.GreedySearchEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + + Examples: + + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForCausalLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... ) + + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + + >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token + >>> model.config.pad_token_id = model.config.eos_token_id + + >>> input_prompt = "Today is a beautiful day, and" + >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id), + ... ] + ... ) + + >>> outputs = model.greedy_search(input_ids, logits_processor=logits_processor) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + ```""" + # init values + logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList() + + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # keep track of which sequences are already finished + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + cur_len = input_ids.shape[-1] + + while cur_len < max_length: + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + 
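# A hedged note on the `prepare_inputs_for_generation` call above: TF
# implementations of that hook typically slice `input_ids` down to the last
# generated token and forward the cached `past` key/values from `model_kwargs`,
# so each iteration of this loop only runs the forward pass for the new
# position rather than re-encoding the whole sequence.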
+ # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + next_token_logits = outputs.logits[:, -1, :] + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # pre-process distribution + next_tokens_scores = logits_processor(input_ids, next_token_logits) + + # argmax + next_tokens = tf.argmax(next_tokens_scores, dim=-1) + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + if pad_token_id is None: + raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + input_ids = tf.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + cur_len = cur_len + 1 + + # if eos_token was found in one sentence, set sentence to finished +# if eos_token_id is not None: +# unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long()) + + if eos_token_id is not None: + eos_in_sents = next_tokens == eos_token_id + # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length + is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply( + unfinished_sents, tf.cast(eos_in_sents, tf.int32) + ) + sent_lengths = ( + sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos) + + cur_len * is_sents_unfinished_and_token_to_add_is_eos + ) + + # unfinished_sents is set to zero if eos in sentence + unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos + + # stop when each sentence is finished, or if we exceed the maximum length + if tf.math.reduce_max(unfinished_sents) == 0: + break +# if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): +# break + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return TFGreedySearchEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return TFGreedySearchDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return input_ids From 0844c8367757199f1ee01d48af0c46d11e4f896f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 9 Feb 2022 23:58:43 +0100 Subject: [PATCH 05/30] Apply suggestions from code review --- src/transformers/generation_tf_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index ced107c5cb37..92c2471676ad 100644 --- 
a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -947,7 +947,7 @@ def _generate_no_beam_search( if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams - # from fairseq: https://github.com/pytf/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) # create banned_tokens boolean mask banned_tokens_indices_mask = [] @@ -1144,7 +1144,7 @@ def _generate_beam_search( # cache compute states past = encoder_outputs - # to stay similar to tf : past = (encoder_outputs, None) if encoder_outputs is not None else None + # to stay similar to torch : past = (encoder_outputs, None) if encoder_outputs is not None else None # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and kwargs["output_scores"]) else None @@ -1217,7 +1217,7 @@ def _generate_beam_search( if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams - # from fairseq: https://github.com/pytf/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 num_batch_hypotheses = batch_size * num_beams banned_tokens = calc_banned_ngram_tokens( input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len From b5ae0414dd12ba79bbae5be88e76ef941d9a46fc Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 9 Feb 2022 23:59:05 +0100 Subject: [PATCH 06/30] re-add --- src/transformers/generation_tf_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index ced107c5cb37..fa607e8a6a5a 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -194,6 +194,8 @@ class TFBeamSearchDecoderOnlyOutput(ModelOutput): `(batch_size*num_beams*num_return_sequences, config.vocab_size)`). attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. + hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`. 
""" From 0671b89be69e81a47057a3b759491b3473e7962a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 10 Feb 2022 00:10:30 +0100 Subject: [PATCH 07/30] add all code --- .../generation_tf_logits_process.py | 8 +-- src/transformers/generation_tf_utils.py | 54 ++++++++++++------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py index b8454d011996..344d765a8588 100644 --- a/src/transformers/generation_tf_logits_process.py +++ b/src/transformers/generation_tf_logits_process.py @@ -14,9 +14,9 @@ # limitations under the License. import inspect -import tensorflow as tf from abc import ABC +import tensorflow as tf from .file_utils import add_start_docstrings from .utils.logging import get_logger @@ -59,9 +59,9 @@ def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: class TFLogitsProcessorList(list): """ - This class can be used to create a list of [`TFLogitsProcessor`] to subsequently process - a `scores` input tensor. This class inherits from list and adds a specific *__call__* method to apply each - [`TFLogitsProcessor`] to the inputs. + This class can be used to create a list of [`TFLogitsProcessor`] to subsequently process a `scores` input tensor. + This class inherits from list and adds a specific *__call__* method to apply each [`TFLogitsProcessor`] to the + inputs. """ @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING) diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index d53a719d9879..108d367950b5 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -16,11 +16,12 @@ import inspect from dataclasses import dataclass -from typing import Optional, Tuple, Union, List +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np import tensorflow as tf +from .file_utils import ModelOutput from .generation_tf_logits_process import ( TFLogitsProcessorList, TFMinLengthLogitsProcessor, @@ -28,8 +29,6 @@ TFNoRepeatNGramLogitsProcessor, TFRepetitionPenaltyLogitsProcessor, ) - -from .file_utils import ModelOutput from .utils import logging @@ -2188,6 +2187,30 @@ def _generate( **model_kwargs, ) + @staticmethod + def _update_model_kwargs_for_generation( + outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False + ) -> Dict[str, Any]: + # update past + if "past_key_values" in outputs: + model_kwargs["past"] = outputs.past_key_values + elif "mems" in outputs: + model_kwargs["past"] = outputs.mems + elif "past_buckets_states" in outputs: + model_kwargs["past"] = outputs.past_buckets_states + else: + model_kwargs["past"] = None + + # update attention mask + if not is_encoder_decoder: + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = tf.concat( + [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 + ) + + return model_kwargs + def _get_logits_processor( self, repetition_penalty: float, @@ -2336,7 +2359,7 @@ def greedy_search( ) # keep track of which sequences are already finished - unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + unfinished_sequences = tf.ones_like(input_ids[:, 0]) cur_len = input_ids.shape[-1] while cur_len < max_length: @@ -2375,7 +2398,7 @@ def greedy_search( next_tokens_scores = logits_processor(input_ids, next_token_logits) # argmax - next_tokens = 
tf.argmax(next_tokens_scores, dim=-1) + next_tokens = tf.argmax(next_tokens_scores, axis=-1) # finished sentences should have their next token be a padding token if eos_token_id is not None: @@ -2384,35 +2407,26 @@ def greedy_search( next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) # update generated ids, model inputs, and length for next step - input_ids = tf.cat([input_ids, next_tokens[:, None]], dim=-1) + input_ids = tf.concat([input_ids, next_tokens[:, None]], axis=-1) model_kwargs = self._update_model_kwargs_for_generation( outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder ) cur_len = cur_len + 1 # if eos_token was found in one sentence, set sentence to finished -# if eos_token_id is not None: -# unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long()) - if eos_token_id is not None: eos_in_sents = next_tokens == eos_token_id - # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length + # if sentence is unfinished and the token to add is eos is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply( - unfinished_sents, tf.cast(eos_in_sents, tf.int32) - ) - sent_lengths = ( - sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos) - + cur_len * is_sents_unfinished_and_token_to_add_is_eos + unfinished_sequences, tf.cast(eos_in_sents, tf.int32) ) - # unfinished_sents is set to zero if eos in sentence - unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos + # unfinished_sequences is set to zero if eos in sentence + unfinished_sequences -= is_sents_unfinished_and_token_to_add_is_eos # stop when each sentence is finished, or if we exceed the maximum length - if tf.math.reduce_max(unfinished_sents) == 0: + if tf.math.reduce_max(unfinished_sequences) == 0: break -# if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): -# break if return_dict_in_generate: if self.config.is_encoder_decoder: From c23bff236115f0306fffe25887b3b0afaa25d585 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 10 Feb 2022 12:16:36 +0100 Subject: [PATCH 08/30] make random greedy pass --- .../generation_tf_logits_process.py | 80 +++- src/transformers/generation_tf_utils.py | 433 +++++++++--------- 2 files changed, 273 insertions(+), 240 deletions(-) diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py index 344d765a8588..b84f4a39686d 100644 --- a/src/transformers/generation_tf_logits_process.py +++ b/src/transformers/generation_tf_logits_process.py @@ -15,7 +15,9 @@ import inspect from abc import ABC +from typing import Iterable, List, Optional +import numpy as np import tensorflow as tf from .file_utils import add_start_docstrings @@ -103,11 +105,13 @@ def __init__(self, min_length: int, eos_token_id: int): def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: # create boolean flag to decide if min length penalty should be applied - apply_penalty = 1 - tf.clip(cur_len - self.min_length, 0, 1) + apply_penalty = 1 - tf.clip_by_value(cur_len - self.min_length, 0, 1) - scores = tf.where( - apply_penalty, jax.ops.index_update(scores, jax.ops.index[:, self.eos_token_id], -float("inf")), scores - ) + # TODO(Matt) - this if statement has to be rewritten for XLA. 
Leaving it now though since + # generate is not XLA - compileable anyways + if apply_penalty: + eos_token_id_mask = tf.broadcast_to(tf.range(scores.shape[-1]) == self.eos_token_id, scores.shape) + scores = tf.where(eos_token_id_mask, tf.ones_like(scores) * float("-inf"), scores) return scores @@ -128,11 +132,11 @@ def __init__(self, penalty: float): self.penalty = penalty - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - score = torch.gather(scores, 1, input_ids) + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: + score = tf.gather(scores, 1, input_ids) # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability - score = torch.where(score < 0, score * self.penalty, score / self.penalty) + score = tf.where(score < 0, score * self.penalty, score / self.penalty) scores.scatter_(1, input_ids, score) return scores @@ -173,13 +177,13 @@ def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int): else: self.bad_words_id_length_greater_than_1.append(word) - self.static_bad_words_mask: Optional[torch.LongTensor] = None + self.static_bad_words_mask: Optional[tf.Tensor] = None for banned_token_seq in self.bad_words_id_length_greater_than_1: if len(banned_token_seq) == 0: raise ValueError(f"Banned words token sequences {bad_words_ids} cannot have an empty list") - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: if self.static_bad_words_mask is None and len(self.bad_words_id_length_1) > 0: self.static_bad_words_mask = self._calc_static_bad_word_mask(scores) @@ -188,8 +192,8 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to return scores - def _calc_static_bad_word_mask(self, scores: torch.FloatTensor) -> torch.BoolTensor: - static_bad_words_mask = torch.zeros(scores.shape[1]) + def _calc_static_bad_word_mask(self, scores: tf.Tensor) -> tf.Tensor: + static_bad_words_mask = tf.zeros(scores.shape[1]) static_bad_words_mask[self.bad_words_id_length_1] = 1 return static_bad_words_mask.unsqueeze(0).to(scores.device).bool() @@ -215,9 +219,7 @@ def _calc_banned_bad_words_ids(self, prev_input_ids: List[List[int]]) -> Iterabl return banned_tokens - def _set_scores_to_inf_for_banned_tokens( - self, scores: torch.Tensor, banned_tokens: List[List[int]] - ) -> torch.Tensor: + def _set_scores_to_inf_for_banned_tokens(self, scores: tf.Tensor, banned_tokens: List[List[int]]) -> tf.Tensor: """ Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be a list of list of banned tokens to ban in the format [[batch index, vocabulary position],... @@ -242,22 +244,19 @@ def _set_scores_to_inf_for_banned_tokens( else: if banned_mask_list: - banned_mask = torch.LongTensor(banned_mask_list) - indices = torch.ones(len(banned_mask)) + banned_mask = tf.Tensor(banned_mask_list) + indices = tf.ones(len(banned_mask)) # A sparse tensor is generated from a list of coordinates: [[0, 1], [0, 2], [2, 0]]. 
A conversion to dense tensor generates: # [ 0 1 1 ] # [ 0 0 0 ] # [ 1 0 0 ] banned_mask = ( - torch.sparse.LongTensor(banned_mask.t(), indices, scores.size()) - .to(scores.device) - .to_dense() - .bool() + tf.sparse.Tensor(banned_mask.t(), indices, scores.size()).to(scores.device).to_dense().bool() ) if self.static_bad_words_mask is not None: - banned_mask = torch.bitwise_or(banned_mask, self.static_bad_words_mask) + banned_mask = tf.bitwise_or(banned_mask, self.static_bad_words_mask) else: banned_mask = self.static_bad_words_mask @@ -265,10 +264,45 @@ def _set_scores_to_inf_for_banned_tokens( return scores +def _get_ngrams(ngram_size: int, prev_input_ids: tf.Tensor, num_hypos: int): + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] for i in range(ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] + return generated_ngrams + + +def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur_len): + # Before decoding the next token, prevent decoding of ngrams that have already appeared + start_idx = cur_len + 1 - ngram_size + ngram_idx = tuple(prev_input_ids[start_idx:cur_len].tolist()) + return banned_ngrams.get(ngram_idx, []) + + +def _calc_banned_ngram_tokens( + ngram_size: int, prev_input_ids: tf.Tensor, num_hypos: int, cur_len: int +) -> List[Iterable[int]]: + """Copied from fairseq for no_repeat_ngram in beam_search""" + if cur_len + 1 < ngram_size: + # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + return [[] for _ in range(num_hypos)] + + generated_ngrams = _get_ngrams(ngram_size, prev_input_ids, num_hypos) + + banned_tokens = [ + _get_generated_ngrams(generated_ngrams[hypo_idx], prev_input_ids[hypo_idx], ngram_size, cur_len) + for hypo_idx in range(num_hypos) + ] + return banned_tokens + + class TFNoRepeatNGramLogitsProcessor(TFLogitsProcessor): r""" [`LogitsProcessor`] that enforces no repetition of n-grams. See - [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345). + [Fairseq](https://github.com/pytf/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345). 
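+
+    A rough sketch of the banning rule (illustrative only, not the exact implementation): if the last
+    `ngram_size - 1` generated tokens match an n-gram seen earlier in the sequence, the token that
+    would complete that n-gram is banned.
+
+    ```python
+    >>> ngram_size = 2
+    >>> generated = [5, 7, 9, 5]  # the 2-gram "5 7" has already been produced
+    >>> seen = {(5,): [7], (7,): [9], (9,): [5]}  # prefix -> seen continuations
+    >>> seen.get(tuple(generated[-(ngram_size - 1) :]), [])
+    [7]
+    ```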
Args: ngram_size (`int`): @@ -280,7 +314,7 @@ def __init__(self, ngram_size: int): raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}") self.ngram_size = ngram_size - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: num_batch_hypotheses = scores.shape[0] cur_len = input_ids.shape[-1] banned_batch_tokens = _calc_banned_ngram_tokens(self.ngram_size, input_ids, num_batch_hypotheses, cur_len) diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index 108d367950b5..9e0dd5255416 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -1528,216 +1528,6 @@ def adjust_logits_during_generation( else: return logits - -def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty): - # create logit penalties for already seen input_ids - token_penalties = np.ones(shape_list(logits)) - prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()] - for i, prev_input_id in enumerate(prev_input_ids): - logit_penalized = logits[i].numpy()[prev_input_id] - logit_penalties = np.zeros(logit_penalized.shape) - # if previous logit score is < 0 then multiply repetition penalty else divide - logit_penalties[logit_penalized < 0] = repetition_penalty - logit_penalties[logit_penalized > 0] = 1 / repetition_penalty - np.put(token_penalties[i], prev_input_id, logit_penalties) - return tf.convert_to_tensor(token_penalties, dtype=tf.float32) - - -def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): - # Copied from fairseq for no_repeat_ngram in beam_search - if cur_len + 1 < no_repeat_ngram_size: - # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet - return [[] for _ in range(num_hypos)] - generated_ngrams = [{} for _ in range(num_hypos)] - for idx in range(num_hypos): - gen_tokens = prev_input_ids[idx].numpy().tolist() - generated_ngram = generated_ngrams[idx] - for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): - prev_ngram_tuple = tuple(ngram[:-1]) - generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] - - def _get_generated_ngrams(hypo_idx): - # Before decoding the next token, prevent decoding of ngrams that have already appeared - start_idx = cur_len + 1 - no_repeat_ngram_size - ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist()) - return generated_ngrams[hypo_idx].get(ngram_idx, []) - - banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] - return banned_tokens - - -def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): - banned_tokens = [] - - def _tokens_match(prev_tokens, tokens): - if len(tokens) == 0: - # if bad word tokens is just one token always ban it - return True - if len(tokens) > len(prev_tokens): - # if bad word tokens are longer than prev tokens they can't be equal - return False - - if prev_tokens[-len(tokens) :] == tokens: - # if tokens match - return True - else: - return False - - for prev_input_ids_slice in prev_input_ids: - banned_tokens_slice = [] - - for banned_token_seq in bad_words_ids: - assert ( - len(banned_token_seq) > 0 - ), f"Banned words token sequences { bad_words_ids} cannot have an empty list" - - if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False: - # if tokens do 
not match continue - continue - - banned_tokens_slice.append(banned_token_seq[-1]) - - banned_tokens.append(banned_tokens_slice) - - return banned_tokens - - -def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): - """ - Filter a distribution of logits using top-k and/or nucleus (top-p) filtering - - Args: - logits: logits distribution shape (batch size, vocabulary size) - top_k (`int`, *optional*, defaults to 0): - If > 0, only keep the top k tokens with highest probability (top-k filtering) - top_p (`float`, *optional*, defaults to 1.0): - If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus - filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) - min_tokens_to_keep (`int`, *optional*, defaults to 1): - Minimumber of tokens we keep per batch example in the output. - - From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - """ - logits_shape = shape_list(logits) - - if top_k > 0: - top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1]) # Safety check - # Remove all tokens with a probability less than the last token of the top-k - indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None] - logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) - - if top_p < 1.0: - sorted_indices = tf.argsort(logits, direction="DESCENDING") - sorted_logits = tf.gather( - logits, sorted_indices, axis=-1, batch_dims=1 - ) # expects logits to be of dim (batch_size, vocab_size) - - cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1) - - # Remove tokens with cumulative probability above the threshold (token with 0 are kept) - sorted_indices_to_remove = cumulative_probs > top_p - - if min_tokens_to_keep > 1: - # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) - sorted_indices_to_remove = tf.concat( - [ - tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]), - sorted_indices_to_remove[:, min_tokens_to_keep:], - ], - -1, - ) - - # Shift the indices to the right to keep also the first token above the threshold - sorted_indices_to_remove = tf.concat( - [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, :-1]], - -1, - ) - # scatter sorted tensors to original indexing - indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices) - logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) - return logits - - -def scatter_values_on_batch_indices(values, batch_indices): - shape = shape_list(batch_indices) - # broadcast batch dim to shape - broad_casted_batch_dims = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1]) - # transform batch_indices to pair_indices - pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0)) - # scatter values to pair indices - return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape) - - -def set_tensor_by_indices_to_value(tensor, indices, value): - # create value_tensor since tensor value assignment is not possible in TF - value_tensor = tf.zeros_like(tensor) + value - return tf.where(indices, value_tensor, tensor) - - -def sample_without_replacement(logits, num_samples): - """ - categorical sampling without replacement is currently not implemented the gumbel-max trick will do for now see - 
https://github.com/tensorflow/tensorflow/issues/9260 for more info - """ - z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1)) - _, indices = tf.nn.top_k(logits + z, num_samples) - return indices - - -def shape_list(x): - """Deal with dynamic shape in tensorflow cleanly.""" - static = x.shape.as_list() - dynamic = tf.shape(x) - return [dynamic[i] if s is None else s for i, s in enumerate(static)] - - -class BeamHypotheses(object): - def __init__(self, num_beams, max_length, length_penalty, early_stopping): - """ - Initialize n-best list of hypotheses. - """ - self.max_length = max_length - 1 # ignoring bos_token - self.length_penalty = length_penalty - self.early_stopping = early_stopping - self.num_beams = num_beams - self.beams = [] - self.worst_score = 1e9 - - def __len__(self): - """ - Number of hypotheses in the list. - """ - return len(self.beams) - - def add(self, hyp, sum_logprobs): - """ - Add a new hypothesis to the list. - """ - score = sum_logprobs / len(hyp) ** self.length_penalty - if len(self) < self.num_beams or score > self.worst_score: - self.beams.append((score, hyp)) - if len(self) > self.num_beams: - sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) - del self.beams[sorted_scores[0][1]] - self.worst_score = sorted_scores[1][0] - else: - self.worst_score = min(score, self.worst_score) - - def is_done(self, best_sum_logprobs, cur_len): - """ - If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst - one in the heap, then we are done with this sentence. - """ - - if len(self) < self.num_beams: - return False - elif self.early_stopping: - return True - else: - cur_score = best_sum_logprobs / cur_len ** self.length_penalty - ret = self.worst_score >= cur_score - return ret - def _generate( self, input_ids=None, @@ -1982,10 +1772,6 @@ def _generate( return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate ) - model_kwargs["output_scores"] = output_scores - model_kwargs["output_attentions"] = output_attentions - model_kwargs["output_hidden_states"] = output_hidden_states - if self.config.is_encoder_decoder: model_kwargs["encoder_attentions"] = None model_kwargs["encoder_hidden_states"] = None @@ -2126,7 +1912,6 @@ def _generate( ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) if self.config.is_encoder_decoder: - # create empty decoder_input_ids input_ids = ( tf.ones( @@ -2363,6 +2148,10 @@ def greedy_search( cur_len = input_ids.shape[-1] while cur_len < max_length: + # TODO (Patrick): remove following two lines by cleaning up `prepare_inputs_for_generation` + # in all models + model_kwargs["past"] = None if "past" not in model_kwargs else model_kwargs["past"] + model_kwargs["use_cache"] = None if "use_cache" not in model_kwargs else model_kwargs["use_cache"] # prepare model inputs model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) @@ -2395,10 +2184,10 @@ def greedy_search( ) # pre-process distribution - next_tokens_scores = logits_processor(input_ids, next_token_logits) + next_tokens_scores = logits_processor(input_ids, next_token_logits, cur_len) # argmax - next_tokens = tf.argmax(next_tokens_scores, axis=-1) + next_tokens = tf.cast(tf.argmax(next_tokens_scores, axis=-1), tf.int32) # finished sentences should have their next token be a padding token if eos_token_id is not None: @@ -2448,3 +2237,213 @@ def greedy_search( ) else: return input_ids + + +def 
_create_next_token_logits_penalties(input_ids, logits, repetition_penalty): + # create logit penalties for already seen input_ids + token_penalties = np.ones(shape_list(logits)) + prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()] + for i, prev_input_id in enumerate(prev_input_ids): + logit_penalized = logits[i].numpy()[prev_input_id] + logit_penalties = np.zeros(logit_penalized.shape) + # if previous logit score is < 0 then multiply repetition penalty else divide + logit_penalties[logit_penalized < 0] = repetition_penalty + logit_penalties[logit_penalized > 0] = 1 / repetition_penalty + np.put(token_penalties[i], prev_input_id, logit_penalties) + return tf.convert_to_tensor(token_penalties, dtype=tf.float32) + + +def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): + # Copied from fairseq for no_repeat_ngram in beam_search + if cur_len + 1 < no_repeat_ngram_size: + # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + return [[] for _ in range(num_hypos)] + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].numpy().tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] + + def _get_generated_ngrams(hypo_idx): + # Before decoding the next token, prevent decoding of ngrams that have already appeared + start_idx = cur_len + 1 - no_repeat_ngram_size + ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist()) + return generated_ngrams[hypo_idx].get(ngram_idx, []) + + banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] + return banned_tokens + + +def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): + banned_tokens = [] + + def _tokens_match(prev_tokens, tokens): + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + if len(tokens) > len(prev_tokens): + # if bad word tokens are longer than prev tokens they can't be equal + return False + + if prev_tokens[-len(tokens) :] == tokens: + # if tokens match + return True + else: + return False + + for prev_input_ids_slice in prev_input_ids: + banned_tokens_slice = [] + + for banned_token_seq in bad_words_ids: + assert ( + len(banned_token_seq) > 0 + ), f"Banned words token sequences { bad_words_ids} cannot have an empty list" + + if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False: + # if tokens do not match continue + continue + + banned_tokens_slice.append(banned_token_seq[-1]) + + banned_tokens.append(banned_tokens_slice) + + return banned_tokens + + +def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): + """ + Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + + Args: + logits: logits distribution shape (batch size, vocabulary size) + top_k (`int`, *optional*, defaults to 0): + If > 0, only keep the top k tokens with highest probability (top-k filtering) + top_p (`float`, *optional*, defaults to 1.0): + If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus + filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751)
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
+            Minimum number of tokens we keep per batch example in the output.
+
+    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
+    """
+    logits_shape = shape_list(logits)
+
+    if top_k > 0:
+        top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1])  # Safety check
+        # Remove all tokens with a probability less than the last token of the top-k
+        indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None]
+        logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value)
+
+    if top_p < 1.0:
+        sorted_indices = tf.argsort(logits, direction="DESCENDING")
+        sorted_logits = tf.gather(
+            logits, sorted_indices, axis=-1, batch_dims=1
+        )  # expects logits to be of dim (batch_size, vocab_size)
+
+        cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1)
+
+        # Remove tokens with cumulative probability above the threshold (tokens with 0 are kept)
+        sorted_indices_to_remove = cumulative_probs > top_p
+
+        if min_tokens_to_keep > 1:
+            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
+            sorted_indices_to_remove = tf.concat(
+                [
+                    tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]),
+                    sorted_indices_to_remove[:, min_tokens_to_keep:],
+                ],
+                -1,
+            )
+
+        # Shift the indices to the right to keep also the first token above the threshold
+        sorted_indices_to_remove = tf.concat(
+            [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, :-1]],
+            -1,
+        )
+        # scatter sorted tensors to original indexing
+        indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices)
+        logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value)
+    return logits
+
+
+def scatter_values_on_batch_indices(values, batch_indices):
+    shape = shape_list(batch_indices)
+    # broadcast batch dim to shape
+    broad_casted_batch_dims = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1])
+    # transform batch_indices to pair_indices
+    pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0))
+    # scatter values to pair indices
+    return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape)
+
+
+def set_tensor_by_indices_to_value(tensor, indices, value):
+    # create value_tensor since tensor value assignment is not possible in TF
+    value_tensor = tf.zeros_like(tensor) + value
+    return tf.where(indices, value_tensor, tensor)
+
+
+def sample_without_replacement(logits, num_samples):
+    """
+    Categorical sampling without replacement is currently not implemented; the gumbel-max trick will do for now. See
+    https://github.com/tensorflow/tensorflow/issues/9260 for more info
+    """
+    z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1))
+    _, indices = tf.nn.top_k(logits + z, num_samples)
+    return indices
+
+
+def shape_list(x):
+    """Deal with dynamic shape in tensorflow cleanly."""
+    static = x.shape.as_list()
+    dynamic = tf.shape(x)
+    return [dynamic[i] if s is None else s for i, s in enumerate(static)]
+
+
+class BeamHypotheses(object):
+    def __init__(self, num_beams, max_length, length_penalty, early_stopping):
+        """
+        Initialize n-best list of hypotheses.
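+
+        Bookkeeping sketch: `beams` stores up to `num_beams` `(score, hypothesis)` pairs, where
+        `score = sum_logprobs / len(hyp) ** length_penalty`, and `worst_score` tracks the weakest
+        score still kept, so a new hypothesis is admitted only while the list is not full or when
+        it beats `worst_score`.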
+ """ + self.max_length = max_length - 1 # ignoring bos_token + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, hyp, sum_logprobs): + """ + Add a new hypothesis to the list. + """ + score = sum_logprobs / len(hyp) ** self.length_penalty + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp)) + if len(self) > self.num_beams: + sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) + del self.beams[sorted_scores[0][1]] + self.worst_score = sorted_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs, cur_len): + """ + If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst + one in the heap, then we are done with this sentence. + """ + + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + cur_score = best_sum_logprobs / cur_len ** self.length_penalty + ret = self.worst_score >= cur_score + return ret From 7786d182e47eb871cea70d8419b213be1f1f7598 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 10 Feb 2022 16:28:54 +0100 Subject: [PATCH 09/30] make encoder-decoder random work --- src/transformers/generation_tf_utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index 9e0dd5255416..8f2a885d0800 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -1937,6 +1937,10 @@ def _generate( encoder_outputs = None cur_len = shape_list(input_ids)[-1] + # TODO(Patrick) - not very clean here + model_kwargs["attention_mask"] = attention_mask + model_kwargs["past"] = encoder_outputs # defined for encoder-decoder models, None for decoder-only models + assert ( cur_len < max_length ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. 
Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" @@ -2138,10 +2142,8 @@ def greedy_search( # if model is an encoder-decoder, retrieve encoder attention weights and hidden states if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) + encoder_attentions = model_kwargs["encoder_attentions"] if output_attentions else None + encoder_hidden_states = model_kwargs["encoder_hidden_states"] if output_hidden_states else None # keep track of which sequences are already finished unfinished_sequences = tf.ones_like(input_ids[:, 0]) @@ -2150,8 +2152,8 @@ def greedy_search( while cur_len < max_length: # TODO (Patrick): remove following two lines by cleaning up `prepare_inputs_for_generation` # in all models - model_kwargs["past"] = None if "past" not in model_kwargs else model_kwargs["past"] model_kwargs["use_cache"] = None if "use_cache" not in model_kwargs else model_kwargs["use_cache"] + # prepare model inputs model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) From d04530fdc4afc3263a85d68f98f4d3e199e39a0c Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 10 Feb 2022 17:42:10 +0100 Subject: [PATCH 10/30] further improvements --- @! | 2394 +++++++++++++++++ .../generation_tf_logits_process.py | 228 +- src/transformers/generation_tf_utils.py | 235 +- 3 files changed, 2585 insertions(+), 272 deletions(-) create mode 100644 @! diff --git a/@! b/@! new file mode 100644 index 000000000000..8a21a47ce585 --- /dev/null +++ b/@! @@ -0,0 +1,2394 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from .file_utils import ModelOutput +from .generation_tf_logits_process import ( + TFLogitsProcessorList, + TFMinLengthLogitsProcessor, + TFNoBadWordsLogitsProcessor, + TFNoRepeatNGramLogitsProcessor, + TFRepetitionPenaltyLogitsProcessor, +) +from .utils import logging + + +logger = logging.get_logger(__name__) + + +@dataclass +class TFGreedySearchDecoderOnlyOutput(ModelOutput): + """ + Base class for outputs of decoder-only generation models using greedy search. + + + Args: + sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter + if all batches finished early due to the `eos_token_id`. 
+ scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): + Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) + at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor + of shape `(batch_size, config.vocab_size)`). + attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. + hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`. + """ + + sequences: tf.Tensor = None + scores: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None + hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None + + +@dataclass +class TFGreedySearchEncoderDecoderOutput(ModelOutput): + """ + Base class for outputs of encoder-decoder generation models using greedy search. Hidden states and attention + weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the + encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) + + + Args: + sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter + if all batches finished early due to the `eos_token_id`. + scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): + Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) + at each generation step. `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape + `(batch_size, config.vocab_size)`). + encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. + cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. 
+ decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`. + """ + + sequences: tf.Tensor = None + scores: Optional[Tuple[tf.Tensor]] = None + encoder_attentions: Optional[Tuple[tf.Tensor]] = None + encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None + decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None + cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None + + +@dataclass +class TFSampleDecoderOnlyOutput(ModelOutput): + """ + Base class for outputs of decoder-only generation models using sampling. + + + Args: + sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter + if all batches finished early due to the `eos_token_id`. + scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): + Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) + at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor + of shape `(batch_size*num_return_sequences, config.vocab_size)`). + attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(num_return_sequences*batch_size, num_heads, generated_length, sequence_length)`. + hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(num_return_sequences*batch_size, generated_length, hidden_size)`. + """ + + sequences: tf.Tensor = None + scores: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None + hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None + + +@dataclass +class TFSampleEncoderDecoderOutput(ModelOutput): + """ + Base class for outputs of encoder-decoder generation models using sampling. Hidden states and attention weights of + the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states + attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) + + + Args: + sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter + if all batches finished early due to the `eos_token_id`. + scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): + Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) + at each generation step. `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape + `(batch_size*num_return_sequences, config.vocab_size)`). 
+ encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size*num_return_sequences, + num_heads, sequence_length, sequence_length)`. + encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size*num_return_sequences, sequence_length, hidden_size)`. + decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size*num_return_sequences, num_heads, generated_length, sequence_length)`. + cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. + decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size*num_return_sequences, generated_length, hidden_size)`. + """ + + sequences: tf.Tensor = None + scores: Optional[Tuple[tf.Tensor]] = None + encoder_attentions: Optional[Tuple[tf.Tensor]] = None + encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None + decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None + cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None + + +@dataclass +class TFBeamSearchDecoderOnlyOutput(ModelOutput): + """ + Base class for outputs of decoder-only generation models using beam search. + + Args: + sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter + if all batches finished early due to the `eos_token_id`. + sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): + Final beam scores of the generated `sequences`. + scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): + Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam + . `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor of shape + `(batch_size*num_beams*num_return_sequences, config.vocab_size)`). + attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. 
+ hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`. + """ + + sequences: tf.Tensor = None + sequences_scores: Optional[tf.Tensor] = None + scores: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None + hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None + + +@dataclass +class TFBeamSearchEncoderDecoderOutput(ModelOutput): + """ + Base class for outputs of encoder-decoder generation models using beam search. Hidden states and attention weights + of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states + attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) + + Args: + sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter + if all batches finished early due to the `eos_token_id`. + sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): + Final beam scores of the generated `sequences`. + scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): + Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam + . `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape `(batch_size*num_beams, + config.vocab_size)`). + attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`. + decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length, + sequence_length)`. + cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. 
+ decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`. + """ + + sequences: tf.Tensor = None + sequences_scores: Optional[tf.Tensor] = None + scores: Optional[Tuple[tf.Tensor]] = None + encoder_attentions: Optional[Tuple[tf.Tensor]] = None + encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None + decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None + cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None + + +@dataclass +class TFBeamSampleDecoderOnlyOutput(ModelOutput): + """ + Base class for outputs of decoder-only generation models using beam sample. + + Args: + sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter + if all batches finished early due to the `eos_token_id`. + sequences_scores (`tf.Tensor` of shape `(batch_size * num_return_sequence)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): + Final beam scores of the generated `sequences`. + scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): + Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam + . `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor of shape + `(batch_size*num_beams*num_return_sequences, config.vocab_size)`). + attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. + hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`. + """ + + sequences: tf.Tensor = None + sequences_scores: Optional[tf.Tensor] = None + scores: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None + hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None + + +@dataclass +class TFBeamSampleEncoderDecoderOutput(ModelOutput): + """ + Base class for outputs of encoder-decoder generation models using beam sampling. Hidden states and attention + weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the + encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) + + Args: + sequences (`tf.Tensor` of shape `(batch_size*num_beams, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter + if all batches finished early due to the `eos_token_id`. 
+ sequences_scores (`tf.Tensor` of shape `(batch_size * num_return_sequence)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): + Final beam scores of the generated `sequences`. + scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): + Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam + . `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape `(batch_size*num_beams, + config.vocab_size)`). + encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size*num_beams, sequence_length, hidden_size)`. + decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. + cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. + decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `tf.Tensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`. + """ + + sequences: tf.Tensor = None + sequences_scores: Optional[tf.Tensor] = None + scores: Optional[Tuple[tf.Tensor]] = None + encoder_attentions: Optional[Tuple[tf.Tensor]] = None + encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None + decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None + cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None + + +TFGreedySearchOutput = Union[TFGreedySearchEncoderDecoderOutput, TFGreedySearchDecoderOnlyOutput] +TFSampleOutput = Union[TFSampleEncoderDecoderOutput, TFSampleDecoderOnlyOutput] +TFBeamSearchOutput = Union[TFBeamSearchEncoderDecoderOutput, TFBeamSearchDecoderOnlyOutput] +TFBeamSampleOutput = Union[TFBeamSampleEncoderDecoderOutput, TFBeamSampleDecoderOnlyOutput] + + +class TFGenerationMixin: + """ + A class containing all of the functions supporting generation, to be used as a mixin in [`TFPreTrainedModel`]. + """ + + def prepare_inputs_for_generation(self, inputs, **kwargs): + """ + Implement in subclasses of [`TFPreTrainedModel`] for custom behavior to prepare inputs in the generate method. 
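+
+        The base implementation below simply returns `{"input_ids": inputs}`; models that use a
+        cache are expected to also handle keyword arguments such as `past` and `use_cache` in their
+        overrides.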
+ """ + return {"input_ids": inputs} + + def _use_cache(self, outputs, use_cache): + """During generation, decide whether to pass the `past` variable to the next forward pass.""" + use_cache = getattr(self.config, "use_cache", False) + if len(outputs) <= 1 or use_cache is False: + return False + if hasattr(self.config, "mem_len") and self.config.mem_len == 0: + return False + return True + + def generate( + self, + input_ids=None, + max_length=None, + min_length=None, + do_sample=None, + early_stopping=None, + num_beams=None, + temperature=None, + top_k=None, + top_p=None, + repetition_penalty=None, + bad_words_ids=None, + bos_token_id=None, + pad_token_id=None, + eos_token_id=None, + length_penalty=None, + no_repeat_ngram_size=None, + num_return_sequences=None, + attention_mask=None, + decoder_start_token_id=None, + use_cache=None, + output_scores=None, + output_attentions=None, + output_hidden_states=None, + return_dict_in_generate=None, + forced_bos_token_id=None, + forced_eos_token_id=None, + **model_kwargs, + ) -> Union[TFGreedySearchOutput, TFSampleOutput, TFBeamSearchOutput, TFBeamSampleOutput, tf.Tensor]: + r""" + Generates sequences for models with a language modeling head. The method currently supports greedy decoding, + beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. + + Adapted in part from [Facebook's XLM beam search + code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529). + + Apart from `input_ids` and `attention_mask`, all the arguments below will default to the value of the attribute + of the same name inside the [`PretrainedConfig`] of the model. The default values indicated are the default + values of those config. + + Most of these parameters are explained in more detail in [this blog + post](https://huggingface.co/blog/how-to-generate). + + Parameters: + + input_ids (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*): + The sequence used as a prompt for the generation. If `None` the method initializes it with + `bos_token_id` and a batch size of 1. + max_length (`int`, *optional*, defaults to 20): + The maximum length of the sequence to be generated. + min_length (`int`, *optional*, defaults to 10): + The minimum length of the sequence to be generated. + do_sample (`bool`, *optional*, defaults to `False`): + Whether or not to use sampling ; use greedy decoding otherwise. + early_stopping (`bool`, *optional*, defaults to `False`): + Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not. + num_beams (`int`, *optional*, defaults to 1): + Number of beams for beam search. 1 means no beam search. + temperature (`float`, *optional*, defaults to 1.0): + The value used to module the next token probabilities. + top_k (`int`, *optional*, defaults to 50): + The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p (`float`, *optional*, defaults to 1.0): + If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or higher + are kept for generation. + repetition_penalty (`float`, *optional*, defaults to 1.0): + The parameter for repetition penalty. 1.0 means no penalty. See [this + paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + bos_token_id (`int`, *optional*): + The id of the *beginning-of-sequence* token. 
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
+ length_penalty (`float`, *optional*, defaults to 1.0):
+ Exponential penalty to the length. 1.0 means no penalty.
+
+ Set to values < 1.0 in order to encourage the model to generate shorter sequences, or to a value > 1.0 in
+ order to encourage the model to produce longer sequences.
+ no_repeat_ngram_size (`int`, *optional*, defaults to 0):
+ If set to int > 0, all ngrams of that size can only occur once.
+ bad_words_ids (`List[List[int]]`, *optional*):
+ List of token ids that are not allowed to be generated. In order to get the tokens of the words that
+ should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`.
+ num_return_sequences (`int`, *optional*, defaults to 1):
+ The number of independently computed returned sequences for each element in the batch.
+ attention_mask (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, 1 for tokens
+ that are not masked, and 0 for masked tokens.
+
+ If not provided, will default to a tensor the same shape as `input_ids` that masks the pad token.
+
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_start_token_id (`int`, *optional*):
+ If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should use the past key/values attentions (if applicable to the model) to
+ speed up decoding.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more details.
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more details.
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ forced_bos_token_id (`int`, *optional*):
+ The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful
+ for multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be
+ the target language token.
+ forced_eos_token_id (`int`, *optional*):
+ The id of the token to force as the last generated token when `max_length` is reached.
+ model_kwargs:
+ Additional model specific kwargs will be forwarded to the `call` function of the model.
+
+ Return:
+ [`~file_utils.ModelOutput`] or `tf.Tensor`: A [`~file_utils.ModelOutput`] (if
+ `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `tf.Tensor`. 
+ + If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible + [`~file_utils.ModelOutput`] types are: + + - [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`], + - [`~generation_tf_utils.TFSampleDecoderOnlyOutput`], + - [`~generation_tf_utils.TFBeamSearchDecoderOnlyOutput`], + - [`~generation_tf_utils.TFBeamSampleDecoderOnlyOutput`] + + If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible + [`~file_utils.ModelOutput`] types are: + + - [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`], + - [`~generation_tf_utils.TFSampleEncoderDecoderOutput`], + - [`~generation_tf_utils.TFBeamSearchEncoderDecoderOutput`], + - [`~generation_tf_utils.TFBeamSampleEncoderDecoderOutput`] + + Examples: + + ```python + tokenizer = AutoTokenizer.from_pretrained("distilgpt2") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained( + "distilgpt2" + ) # Download model and configuration from huggingface.co and cache. + outputs = model.generate(max_length=40) # do greedy decoding + print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("openai-gpt") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained( + "openai-gpt" + ) # Download model and configuration from huggingface.co and cache. + input_context = "The dog" + input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context + outputs = model.generate( + input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5 + ) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' + for i in range(3): # 3 output sequences were generated + print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("distilgpt2") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained( + "distilgpt2" + ) # Download model and configuration from huggingface.co and cache. + input_context = "The dog" + input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context + outputs = model.generate( + input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True + ) # generate 3 candidates using sampling + for i in range(3): # 3 output sequences were generated + print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("ctrl") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained( + "ctrl" + ) # Download model and configuration from huggingface.co and cache. + input_context = "Legal My neighbor is" # "Legal" is one of the control codes for ctrl + input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context + outputs = model.generate( + input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2 + ) # generate sequences + print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("gpt2") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained( + "gpt2" + ) # Download model and configuration from huggingface.co and cache. 
+ input_context = "My cute dog" + bad_words_ids = [ + tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ["idiot", "stupid", "shut up"] + ] + input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context + outputs = model.generate( + input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids + ) # generate sequences without allowing bad_words to be generated + ```""" + num_beams = num_beams if num_beams is not None else self.config.num_beams + do_sample = do_sample if do_sample is not None else self.config.do_sample + + is_greedy_gen_mode = num_beams == 1 and do_sample is False + + if is_greedy_gen_mode: + return self._generate( + input_ids=input_ids, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + early_stopping=early_stopping, + num_beams=num_beams, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + bad_words_ids=bad_words_ids, + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + length_penalty=length_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + num_return_sequences=num_return_sequences, + attention_mask=attention_mask, + decoder_start_token_id=decoder_start_token_id, + use_cache=use_cache, + output_scores=output_scores, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict_in_generate=return_dict_in_generate, + ) + + # We cannot generate if the model does not have a LM head + if self.get_output_embeddings() is None: + raise AttributeError( + "You tried to generate sequences with a model that does not have a LM Head. " + "Please use another model class (e.g. `TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, `TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, `TFTransfoXLLMHeadModel`)" + ) + + max_length = max_length if max_length is not None else self.config.max_length + min_length = min_length if min_length is not None else self.config.min_length + early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping + temperature = temperature if temperature is not None else self.config.temperature + top_k = top_k if top_k is not None else self.config.top_k + top_p = top_p if top_p is not None else self.config.top_p + + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + no_repeat_ngram_size = ( + no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size + ) + bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) + decoder_start_token_id = ( + decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id + ) + forced_bos_token_id = ( + forced_bos_token_id if forced_bos_token_id is not None else self.config.forced_bos_token_id + ) + forced_eos_token_id = ( + forced_eos_token_id if forced_eos_token_id is not None else self.config.forced_eos_token_id + ) + + output_scores 
= output_scores if output_scores is not None else self.config.output_scores
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict_in_generate = (
+ return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
+ )
+
+ model_kwargs["output_scores"] = output_scores
+ model_kwargs["output_attentions"] = output_attentions
+ model_kwargs["output_hidden_states"] = output_hidden_states
+ if self.config.is_encoder_decoder:
+ model_kwargs["encoder_attentions"] = None
+ model_kwargs["encoder_hidden_states"] = None
+
+ if input_ids is not None:
+ batch_size = shape_list(input_ids)[0] # overridden by the input batch_size
+ else:
+ batch_size = 1
+
+ assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
+ assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
+ assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
+ assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
+ assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
+ assert temperature > 0, "`temperature` should be strictly positive."
+ assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
+ assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
+ assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
+ assert input_ids is not None or (
+ isinstance(bos_token_id, int) and bos_token_id >= 0
+ ), "If input_ids is not defined, `bos_token_id` should be a positive integer."
+ assert pad_token_id is None or (
+ isinstance(pad_token_id, int) and (pad_token_id >= 0)
+ ), "`pad_token_id` should be a positive integer."
+ assert (eos_token_id is None) or (
+ isinstance(eos_token_id, int) and (eos_token_id >= 0)
+ ), "`eos_token_id` should be a positive integer."
+ assert length_penalty > 0, "`length_penalty` should be strictly positive."
+ assert (
+ isinstance(num_return_sequences, int) and num_return_sequences > 0
+ ), "`num_return_sequences` should be a strictly positive integer."
+ assert (
+ bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list)
+ ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated"
+
+ # This block corresponds to the following line in `generation_tf_utils`:
+ # "input_ids = self._prepare_input_ids_for_generation(bos_token_id, model_kwargs.get("encoder_outputs"))"
+ # with the following differences:
+ # 1. In PT, `generate()`'s `model_kwargs` can accept `encoder_outputs`, but that is not the case in TF.
+ # 2. There is no shape checking in PT.
+ # In both PT and TF, if `input_ids` is `None`, we try to create it as is done for a text model.
+ if input_ids is None:
+ assert isinstance(bos_token_id, int) and bos_token_id >= 0, (
+ "you should either supply a context to complete as `input_ids` input "
+ "or a `bos_token_id` (integer >= 0) as a first token to start the generation." 
+ )
+ input_ids = tf.fill((batch_size, 1), bos_token_id)
+
+ # do not allow duplicate outputs when greedy decoding
+ if do_sample is False:
+ if num_beams == 1:
+ # no_beam_search greedy generation conditions
+ assert (
+ num_return_sequences == 1
+ ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1"
+
+ else:
+ # beam_search greedy generation conditions
+ assert (
+ num_beams >= num_return_sequences
+ ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences"
+
+ # create attention mask if necessary
+ # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140
+ if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()):
+ attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32)
+ elif attention_mask is None:
+ attention_mask = tf.ones_like(input_ids)
+
+ if pad_token_id is None and eos_token_id is not None:
+ logger.warning(f"Setting `pad_token_id` to {eos_token_id} (first `eos_token_id`) to generate sequence")
+ pad_token_id = eos_token_id
+
+ # current position and vocab size
+ cur_len = shape_list(input_ids)[1] # unused
+ vocab_size = getattr(self.config, "vocab_size", None)
+ if vocab_size is None and self.config.is_encoder_decoder:
+ decoder_config = getattr(self.config, "decoder", None)
+ if decoder_config is not None:
+ vocab_size = getattr(self.config.decoder, "vocab_size", None)
+
+ # set effective batch size and effective batch multiplier according to do_sample
+ if do_sample:
+ effective_batch_size = batch_size * num_return_sequences
+ effective_batch_mult = num_return_sequences
+ else:
+ effective_batch_size = batch_size
+ effective_batch_mult = 1
+
+ if self.config.is_encoder_decoder:
+ if decoder_start_token_id is None:
+ decoder_start_token_id = bos_token_id
+
+ assert (
+ decoder_start_token_id is not None
+ ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation"
+ assert hasattr(self, "get_encoder"), f"{self} should have a 'get_encoder' function defined"
+ assert callable(self.get_encoder), f"{self.get_encoder} should be a method"
+
+ # get encoder and store encoder outputs
+ encoder = self.get_encoder()
+
+ encoder_kwargs = {
+ "attention_mask": attention_mask,
+ "output_attentions": output_attentions,
+ "output_hidden_states": output_hidden_states,
+ "return_dict": return_dict_in_generate,
+ }
+
+ # vision models don't use `attention_mask`.
+ signature = dict(inspect.signature(encoder.call).parameters)
+ if "attention_mask" not in signature:
+ encoder_kwargs.pop("attention_mask")
+
+ encoder_outputs = encoder(input_ids, **encoder_kwargs)
+ if return_dict_in_generate:
+ if output_attentions:
+ model_kwargs["encoder_attentions"] = encoder_outputs.attentions
+ if output_hidden_states:
+ model_kwargs["encoder_hidden_states"] = encoder_outputs.hidden_states
+
+ # The condition `len(shape_list(input_ids)) == 2` is to make this block treat only text inputs.
+ # (vision inputs might occur when the model is an encoder-decoder model)
+ # Expand input ids if num_beams > 1 or num_return_sequences > 1
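+ # As an illustration (hypothetical sizes): with batch_size=2, cur_len=5, num_beams=3 and
+ # do_sample=False (so effective_batch_mult == 1), `input_ids` of shape (2, 5) is tiled to
+ # (2, 3, 5) and reshaped to (6, 5) below, giving every beam a copy of its prompt.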
+ if len(shape_list(input_ids)) == 2 and (num_return_sequences > 1 or num_beams > 1):
+ input_ids_len = shape_list(input_ids)[-1]
+ input_ids = tf.broadcast_to(
+ tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len)
+ )
+ attention_mask = tf.broadcast_to(
+ tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len)
+ )
+ input_ids = tf.reshape(
+ input_ids, (effective_batch_size * num_beams, input_ids_len)
+ ) # shape: (batch_size * num_return_sequences * num_beams, cur_len)
+ attention_mask = tf.reshape(
+ attention_mask, (effective_batch_size * num_beams, input_ids_len)
+ ) # shape: (batch_size * num_return_sequences * num_beams, cur_len)
+
+ if self.config.is_encoder_decoder:
+
+ # create empty decoder_input_ids
+ input_ids = (
+ tf.ones(
+ (effective_batch_size * num_beams, 1),
+ dtype=tf.int32,
+ )
+ * decoder_start_token_id
+ )
+ cur_len = 1
+
+ assert (
+ batch_size == encoder_outputs[0].shape[0]
+ ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} "
+
+ # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1)
+ expanded_batch_idxs = tf.reshape(
+ tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1),
+ shape=(-1,),
+ )
+ # expand encoder_outputs
+ encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0),)
+ else:
+ encoder_outputs = None
+ cur_len = shape_list(input_ids)[-1]
+
+ assert (
+ cur_len < max_length
+ ), f"The context has {cur_len} tokens, but `max_length` is only {max_length}. 
Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" + + if num_beams == 1: + return self._generate_no_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + batch_size=effective_batch_size, + vocab_size=vocab_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + use_cache=use_cache, + return_dict_in_generate=return_dict_in_generate, + **model_kwargs, + ) + else: + return self._generate_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + early_stopping=early_stopping, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + batch_size=effective_batch_size, + num_return_sequences=num_return_sequences, + length_penalty=length_penalty, + num_beams=num_beams, + vocab_size=vocab_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + use_cache=use_cache, + forced_bos_token_id=forced_bos_token_id, + forced_eos_token_id=forced_eos_token_id, + return_dict_in_generate=return_dict_in_generate, + **model_kwargs, + ) + + def _generate_no_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + pad_token_id, + eos_token_id, + batch_size, + vocab_size, + encoder_outputs, + attention_mask, + use_cache, + return_dict_in_generate, + **kwargs + ) -> Union[TFGreedySearchOutput, TFSampleOutput, tf.Tensor]: + """ + Generate sequences for each example without beam search (num_beams == 1). All returned sequences are generated + independently. 
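+ Sampling (`do_sample=True`) draws each next token from the temperature-scaled, top-k/top-p filtered
+ logits, while greedy decoding (`do_sample=False`) takes the argmax at every step.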
+ """ + + # length of generated sentences / unfinished sentences + unfinished_sents = tf.ones_like(input_ids[:, 0]) + sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length + + past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and kwargs["output_scores"]) else None + decoder_attentions = () if (return_dict_in_generate and kwargs["output_attentions"]) else None + cross_attentions = () if (return_dict_in_generate and kwargs["output_attentions"]) else None + decoder_hidden_states = () if (return_dict_in_generate and kwargs["output_hidden_states"]) else None + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if self.config.is_encoder_decoder: + encoder_attentions = ( + kwargs["encoder_attentions"] if (return_dict_in_generate and kwargs["encoder_attentions"]) else None + ) + encoder_hidden_states = ( + kwargs["encoder_hidden_states"] + if (return_dict_in_generate and kwargs["encoder_hidden_states"]) + else None + ) + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation( + input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **kwargs + ) + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=kwargs["output_attentions"], + output_hidden_states=kwargs["output_hidden_states"], + ) + next_token_logits = outputs.logits[:, -1, :] # (batch_size * num_beams, vocab_size) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if kwargs["output_scores"]: + scores += (next_token_logits,) + if kwargs["output_attentions"]: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if kwargs["output_hidden_states"]: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # if model has past, then set the past variable to speed up decoding + if self._use_cache(outputs, use_cache): + past = outputs[1] + + # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + next_token_logits_penalties = _create_next_token_logits_penalties( + input_ids, next_token_logits, repetition_penalty + ) + next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) + + if no_repeat_ngram_size > 0: + # calculate a list of banned tokens to prevent repetitively generating the same ngrams + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) + # create banned_tokens boolean mask + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + next_token_logits = set_tensor_by_indices_to_value( + next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + 
banned_tokens_indices_mask.append(
+ [True if token in banned_tokens_slice else False for token in range(vocab_size)]
+ )
+
+ next_token_logits = set_tensor_by_indices_to_value(
+ next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf")
+ )
+
+ # set eos token prob to zero if min_length is not reached
+ if eos_token_id is not None and cur_len < min_length:
+ # create eos_token_id boolean mask
+ is_token_logit_eos_token = tf.convert_to_tensor(
+ [True if token == eos_token_id else False for token in range(vocab_size)], dtype=tf.bool
+ )
+ eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [batch_size, vocab_size])
+
+ next_token_logits = set_tensor_by_indices_to_value(
+ next_token_logits, eos_token_indices_mask, -float("inf")
+ )
+
+ if do_sample:
+ # Temperature (higher temperature => more likely to sample low probability tokens)
+ if temperature != 1.0:
+ next_token_logits = next_token_logits / temperature
+ # Top-p/top-k filtering
+ next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
+ # Sample
+ next_token = tf.squeeze(
+ tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=1), axis=1
+ )
+ else:
+ # Greedy decoding
+ next_token = tf.math.argmax(next_token_logits, axis=-1, output_type=tf.int32)
+
+ # update generations and finished sentences
+ if eos_token_id is not None:
+ # pad finished sentences if eos_token_id exists
+ tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents)
+ else:
+ tokens_to_add = next_token
+
+ # add token and increase length by one
+ input_ids = tf.concat([input_ids, tf.expand_dims(tokens_to_add, -1)], 1)
+ cur_len = cur_len + 1
+
+ if eos_token_id is not None:
+ eos_in_sents = tokens_to_add == eos_token_id
+ # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length
+ is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply(
+ unfinished_sents, tf.cast(eos_in_sents, tf.int32)
+ )
+ sent_lengths = (
+ sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos)
+ + cur_len * is_sents_unfinished_and_token_to_add_is_eos
+ )
+
+ # unfinished_sents is set to zero if eos in sentence
+ unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos
+
+ # stop when there is an eos token in each sentence, or if we exceed the maximum length
+ if tf.math.reduce_max(unfinished_sents) == 0:
+ break
+
+ # extend attention_mask for new generated input if only decoder
+ if self.config.is_encoder_decoder is False:
+ attention_mask = tf.concat(
+ [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1
+ )
+
+ # if there are different sentence lengths in the batch, some sequences have to be padded
+ min_sent_length = tf.math.reduce_min(sent_lengths)
+ max_sent_length = tf.math.reduce_max(sent_lengths)
+ if min_sent_length != max_sent_length:
+ assert pad_token_id is not None, "`pad_token_id` has to be defined if batches have different lengths"
+ # finished sents are filled with pad_token
+ padding = tf.ones([batch_size, max_sent_length.numpy()], dtype=tf.int32) * pad_token_id
+
+ # create length masks for tf.where operation
+ broad_casted_sent_lengths = tf.broadcast_to(
+ tf.expand_dims(sent_lengths, -1), [batch_size, max_sent_length]
+ )
+ broad_casted_range = tf.transpose(
+ tf.broadcast_to(tf.expand_dims(tf.range(max_sent_length), -1), [max_sent_length, batch_size])
+ )
+
+ decoded = tf.where(broad_casted_range < broad_casted_sent_lengths, input_ids, padding)
+ else: 
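+ # all sentences already have the same length, so no padding is needed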
+ decoded = input_ids + + if return_dict_in_generate: + if do_sample: + if self.config.is_encoder_decoder: + return TFSampleEncoderDecoderOutput( + sequences=decoded, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return TFSampleDecoderOnlyOutput( + sequences=decoded, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + if self.config.is_encoder_decoder: + return TFGreedySearchEncoderDecoderOutput( + sequences=decoded, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return TFGreedySearchDecoderOnlyOutput( + sequences=decoded, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return decoded + + def _generate_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + early_stopping, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + pad_token_id, + eos_token_id, + batch_size, + num_return_sequences, + length_penalty, + num_beams, + vocab_size, + encoder_outputs, + attention_mask, + use_cache, + forced_bos_token_id, + forced_eos_token_id, + return_dict_in_generate, + **kwargs, + ) -> Union[TFBeamSearchOutput, TFBeamSampleOutput, tf.Tensor]: + """Generate sequences for each example with beam search.""" + + # generated hypotheses + generated_hyps = [ + BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) + for _ in range(batch_size) + ] + + # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times + if do_sample is False: + beam_scores_begin = tf.zeros((batch_size, 1), dtype=tf.float32) + beam_scores_end = tf.ones((batch_size, num_beams - 1), dtype=tf.float32) * (-1e9) + beam_scores = tf.concat([beam_scores_begin, beam_scores_end], -1) + else: + beam_scores = tf.zeros((batch_size, num_beams), dtype=tf.float32) + + beam_scores = tf.reshape(beam_scores, (batch_size * num_beams,)) + + # cache compute states + past = encoder_outputs + # to stay similar to torch : past = (encoder_outputs, None) if encoder_outputs is not None else None + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and kwargs["output_scores"]) else None + decoder_attentions = () if (return_dict_in_generate and kwargs["output_attentions"]) else None + cross_attentions = () if (return_dict_in_generate and kwargs["output_attentions"]) else None + decoder_hidden_states = () if (return_dict_in_generate and kwargs["output_hidden_states"]) else None + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if self.config.is_encoder_decoder: + encoder_attentions = ( + kwargs["encoder_attentions"] if (return_dict_in_generate and kwargs["encoder_attentions"]) else None + ) + encoder_hidden_states = ( + kwargs["encoder_hidden_states"] + if (return_dict_in_generate and kwargs["encoder_hidden_states"]) + else None + ) + + # done sentences + done = [False for _ in range(batch_size)] + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation( + input_ids, past=past, 
attention_mask=attention_mask, use_cache=use_cache, **kwargs + ) + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=kwargs["output_attentions"], + output_hidden_states=kwargs["output_hidden_states"], + ) + next_token_logits = outputs.logits[:, -1, :] # (batch_size * num_beams, vocab_size) + + # if model has past, then set the past variable to speed up decoding + if self._use_cache(outputs, use_cache): + past = outputs[1] + + # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + next_token_logits_penalties = _create_next_token_logits_penalties( + input_ids, next_token_logits, repetition_penalty + ) + next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) + + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + next_token_logits = next_token_logits / temperature + + if self.config.is_encoder_decoder and do_sample is False: + next_token_logits = self.adjust_logits_during_generation( + next_token_logits, + cur_len=cur_len, + max_length=max_length, + forced_bos_token_id=forced_bos_token_id, + forced_eos_token_id=forced_eos_token_id, + ) + # calculate log softmax score + scores = tf.nn.log_softmax(next_token_logits, axis=-1) # (batch_size * num_beams, vocab_size) + + # set eos token prob to zero if min_length is not reached + if eos_token_id is not None and cur_len < min_length: + # create eos_token_id boolean mask + num_batch_hypotheses = batch_size * num_beams + + is_token_logit_eos_token = tf.convert_to_tensor( + [True if token == eos_token_id else False for token in range(vocab_size)], dtype=tf.bool + ) + eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [num_batch_hypotheses, vocab_size]) + + scores = set_tensor_by_indices_to_value(scores, eos_token_indices_mask, -float("inf")) + + if no_repeat_ngram_size > 0: + # calculate a list of banned tokens to prevent repetitively generating the same ngrams + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + num_batch_hypotheses = batch_size * num_beams + banned_tokens = calc_banned_ngram_tokens( + input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len + ) + # create banned_tokens boolean mask + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + scores = set_tensor_by_indices_to_value( + scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + scores = set_tensor_by_indices_to_value( + scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + assert shape_list(scores) == [batch_size * num_beams, vocab_size] + + if do_sample: + _scores = scores + tf.broadcast_to( + beam_scores[:, None], (batch_size * num_beams, vocab_size) + ) # (batch_size * num_beams, vocab_size) + + # Top-p/top-k filtering + _scores = tf_top_k_top_p_filtering( + _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 + ) # 
(batch_size * num_beams, vocab_size) + # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) + _scores = tf.reshape(_scores, (batch_size, num_beams * vocab_size)) + + next_tokens = sample_without_replacement( + _scores, num_samples=2 * num_beams + ) # (batch_size, 2 * num_beams) + # Compute next scores + next_scores = tf.gather(_scores, next_tokens, batch_dims=1) # (batch_size, 2 * num_beams) + + # sort the sampled vector to make sure that the first num_beams samples are the best + next_scores_indices = tf.argsort(next_scores, direction="DESCENDING", axis=1) + next_scores = tf.gather(next_scores, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) + next_tokens = tf.gather(next_tokens, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) + else: + # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) + next_scores = scores + tf.broadcast_to( + beam_scores[:, None], (batch_size * num_beams, vocab_size) + ) # (batch_size * num_beams, vocab_size) + + # re-organize to group the beam together (we are keeping top hypothesis across beams) + next_scores = tf.reshape( + next_scores, (batch_size, num_beams * vocab_size) + ) # (batch_size, num_beams * vocab_size) + + next_scores, next_tokens = tf.math.top_k(next_scores, k=2 * num_beams, sorted=True) + + assert shape_list(next_scores) == shape_list(next_tokens) == [batch_size, 2 * num_beams] + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if kwargs["output_scores"]: + scores += (next_token_logits,) + if kwargs["output_attentions"]: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if kwargs["output_hidden_states"]: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # next batch beam content + next_batch_beam = [] + + # for each sentence + for batch_idx in range(batch_size): + + # if we are done with this sentence + if done[batch_idx]: + assert ( + len(generated_hyps[batch_idx]) >= num_beams + ), f"Batch can only be done if at least {num_beams} beams have been generated." 
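+ # a finished batch is padded with dummy beams below, which requires a usable pad_token_id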
+ assert (
+ eos_token_id is not None and pad_token_id is not None
+ ), "generated beams >= num_beams -> eos_token_id and pad_token_id have to be defined"
+ next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch
+ continue
+
+ # next sentence beam content
+ next_sent_beam = []
+
+ # next tokens for this sentence
+ for beam_token_rank, (beam_token_id, beam_token_score) in enumerate(
+ zip(next_tokens[batch_idx], next_scores[batch_idx])
+ ):
+ # get beam and token IDs
+ beam_id = beam_token_id // vocab_size
+ token_id = beam_token_id % vocab_size
+
+ effective_beam_id = batch_idx * num_beams + beam_id
+ # add to generated hypotheses if end of sentence or last iteration
+ if (eos_token_id is not None) and (token_id.numpy() == eos_token_id):
+ # if beam_token does not belong to top num_beams tokens, it should not be added
+ is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams
+ if is_beam_token_worse_than_top_num_beams:
+ continue
+ generated_hyps[batch_idx].add(
+ tf.identity(input_ids[effective_beam_id]), beam_token_score.numpy()
+ )
+ else:
+ # add next predicted token if it is not eos_token
+ next_sent_beam.append((beam_token_score, token_id, effective_beam_id))
+
+ # the beam for next step is full
+ if len(next_sent_beam) == num_beams:
+ break
+
+ # Check if we are done so that we can save a pad step if all(done)
+ done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done(
+ tf.reduce_max(next_scores[batch_idx]).numpy(), cur_len
+ )
+
+ # update next beam content
+ assert len(next_sent_beam) == num_beams, "Beam should always be full"
+ next_batch_beam.extend(next_sent_beam)
+ assert len(next_batch_beam) == num_beams * (batch_idx + 1)
+
+ # stop when we are done with each sentence
+ if all(done):
+ break
+
+ # sanity check / prepare next batch
+ assert len(next_batch_beam) == batch_size * num_beams
+ beam_scores = tf.convert_to_tensor([x[0] for x in next_batch_beam], dtype=tf.float32)
+ beam_tokens = tf.convert_to_tensor([x[1] for x in next_batch_beam], dtype=tf.int32)
+ beam_idx = tf.convert_to_tensor([x[2] for x in next_batch_beam], dtype=tf.int32)
+
+ # re-order batch and update current length
+ input_ids = tf.stack([tf.identity(input_ids[x, :]) for x in beam_idx])
+ input_ids = tf.concat([input_ids, tf.expand_dims(beam_tokens, 1)], axis=-1)
+ cur_len = cur_len + 1
+
+ # re-order internal states
+ if past is not None:
+ past = self._reorder_cache(past, beam_idx)
+
+ # extend attention_mask for new generated input if only decoder
+ if self.config.is_encoder_decoder is False:
+ attention_mask = tf.concat(
+ [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1
+ )
+
+ # finalize all open beam hypotheses and add them to generated hypotheses
+ for batch_idx in range(batch_size):
+ # Add all open beam hypotheses to generated_hyps
+ if done[batch_idx]:
+ continue
+ # test that beam scores match previously calculated scores if not eos and batch_idx not done
+ if eos_token_id is not None and all(
+ (token_id % vocab_size).numpy().item() != eos_token_id for token_id in next_tokens[batch_idx]
+ ):
+ if not tf.reduce_all(
+ next_scores[batch_idx, :num_beams] == tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx]
+ ):
+ raise ValueError(
+ f"If batch_idx is not done, final next scores: {next_scores[:, :num_beams][batch_idx]} have "
+ "to be equal to the accumulated beam_scores: "
+ f"{tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx]}"
+ )
+ # need to add best num_beams hypotheses to generated hyps
+ for beam_id 
in range(num_beams):
+ effective_beam_id = batch_idx * num_beams + beam_id
+ final_score = beam_scores[effective_beam_id].numpy().item()
+ final_tokens = input_ids[effective_beam_id]
+ generated_hyps[batch_idx].add(final_tokens, final_score)
+
+ # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch
+ output_batch_size = batch_size if do_sample else batch_size * num_return_sequences
+ output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences
+
+ # select the best hypotheses
+ sent_lengths_list = []
+ best = []
+
+ # retrieve best hypotheses
+ for i, hypotheses in enumerate(generated_hyps):
+ sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0])
+ for j in range(output_num_return_sequences_per_batch):
+ best_hyp = sorted_hyps.pop()[1]
+ sent_lengths_list.append(len(best_hyp))
+ best.append(best_hyp)
+ assert output_batch_size == len(
+ best
+ ), f"Output batch size {output_batch_size} must match output beam hypotheses {len(best)}"
+
+ sent_lengths = tf.convert_to_tensor(sent_lengths_list, dtype=tf.int32)
+
+ # shorter batches are filled with the pad_token_id
+ if tf.reduce_min(sent_lengths).numpy() != tf.reduce_max(sent_lengths).numpy():
+ assert pad_token_id is not None, "`pad_token_id` has to be defined"
+ sent_max_len = min(tf.reduce_max(sent_lengths).numpy() + 1, max_length)
+ decoded_list = []
+
+ # fill with hypothesis and eos_token_id if necessary
+ for i, hypo in enumerate(best):
+ assert sent_lengths[i] == shape_list(hypo)[0]
+ # if sent_length is max_len do not pad
+ if sent_lengths[i] == sent_max_len:
+ decoded_slice = hypo
+ else:
+ # else pad to sent_max_len
+ num_pad_tokens = sent_max_len - sent_lengths[i]
+ padding = pad_token_id * tf.ones((num_pad_tokens,), dtype=tf.int32)
+ decoded_slice = tf.concat([hypo, padding], axis=-1)
+
+ # finish sentence with EOS token
+ if sent_lengths[i] < max_length:
+ decoded_slice = tf.where(
+ tf.range(sent_max_len, dtype=tf.int32) == sent_lengths[i],
+ eos_token_id * tf.ones((sent_max_len,), dtype=tf.int32),
+ decoded_slice,
+ )
+ # add to list
+ decoded_list.append(decoded_slice)
+
+ decoded = tf.stack(decoded_list)
+ else:
+ # none of the hypotheses have an eos_token
+ assert all(len(hypo) == max_length for hypo in best)
+ decoded = tf.stack(best)
+
+ if return_dict_in_generate:
+ if do_sample and self.config.is_encoder_decoder:
+ return TFBeamSampleEncoderDecoderOutput(
+ sequences=decoded,
+ scores=scores,
+ encoder_attentions=encoder_attentions,
+ encoder_hidden_states=encoder_hidden_states,
+ decoder_attentions=decoder_attentions,
+ cross_attentions=cross_attentions,
+ decoder_hidden_states=decoder_hidden_states,
+ )
+ elif do_sample and not self.config.is_encoder_decoder:
+ return TFBeamSampleDecoderOnlyOutput(
+ sequences=decoded,
+ scores=scores,
+ attentions=decoder_attentions,
+ hidden_states=decoder_hidden_states,
+ )
+ elif self.config.is_encoder_decoder:
+ return TFBeamSearchEncoderDecoderOutput(
+ sequences=decoded,
+ scores=scores,
+ encoder_attentions=encoder_attentions,
+ encoder_hidden_states=encoder_hidden_states,
+ decoder_attentions=decoder_attentions,
+ cross_attentions=cross_attentions,
+ decoder_hidden_states=decoder_hidden_states,
+ )
+ else:
+ return TFBeamSearchDecoderOnlyOutput(
+ sequences=decoded,
+ scores=scores,
+ attentions=decoder_attentions,
+ hidden_states=decoder_hidden_states,
+ )
+ else:
+ return decoded
+
+ @staticmethod
+ def _reorder_cache(past, beam_idx):
+ return tuple(tf.gather(layer_past, beam_idx, 
axis=1) for layer_past in past)
+
+ def adjust_logits_during_generation(
+ self, logits, cur_len, max_length, forced_bos_token_id, forced_eos_token_id, **kwargs
+ ):
+ """
+ Implement in subclasses of [`TFPreTrainedModel`] for custom behavior to adjust the logits in the generate method.
+ """
+ vocab_size = getattr(self.config, "vocab_size", None)
+ if vocab_size is None and self.config.is_encoder_decoder:
+ decoder_config = getattr(self.config, "decoder", None)
+ if decoder_config is not None:
+ vocab_size = getattr(self.config.decoder, "vocab_size", None)
+
+ if cur_len == 1 and forced_bos_token_id is not None:
+ vocab_range = tf.constant(range(vocab_size))
+ return tf.where(vocab_range != forced_bos_token_id, -1e8, logits)
+ elif cur_len == max_length - 1 and forced_eos_token_id is not None:
+ vocab_range = tf.constant(range(vocab_size))
+ return tf.where(vocab_range != forced_eos_token_id, -1e8, logits)
+ else:
+ return logits
+
+ def _generate(
+ self,
+ input_ids=None,
+ max_length=None,
+ min_length=None,
+ do_sample=None,
+ early_stopping=None,
+ num_beams=None,
+ temperature=None,
+ top_k=None,
+ top_p=None,
+ repetition_penalty=None,
+ bad_words_ids=None,
+ bos_token_id=None,
+ pad_token_id=None,
+ eos_token_id=None,
+ length_penalty=None,
+ no_repeat_ngram_size=None,
+ num_return_sequences=None,
+ attention_mask=None,
+ decoder_start_token_id=None,
+ use_cache=None,
+ output_scores=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict_in_generate=None,
+ forced_bos_token_id=None,
+ forced_eos_token_id=None,
+ **model_kwargs,
+ ) -> Union[TFGreedySearchOutput, TFSampleOutput, TFBeamSearchOutput, TFBeamSampleOutput, tf.Tensor]:
+ r"""
+ Generates sequences for models with a language modeling head. The method currently supports greedy decoding,
+ beam-search decoding, sampling with temperature, and sampling with top-k or nucleus sampling.
+
+ Adapted in part from [Facebook's XLM beam search
+ code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529).
+
+ Apart from `input_ids` and `attention_mask`, all the arguments below will default to the value of the attribute
+ of the same name inside the [`PretrainedConfig`] of the model. The default values indicated are the default
+ values of those config attributes.
+
+ Most of these parameters are explained in more detail in [this blog
+ post](https://huggingface.co/blog/how-to-generate).
+
+ Parameters:
+
+ input_ids (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*):
+ The sequence used as a prompt for the generation. If `None` the method initializes it with
+ `bos_token_id` and a batch size of 1.
+ max_length (`int`, *optional*, defaults to 20):
+ The maximum length of the sequence to be generated.
+ min_length (`int`, *optional*, defaults to 10):
+ The minimum length of the sequence to be generated.
+ do_sample (`bool`, *optional*, defaults to `False`):
+ Whether or not to use sampling; use greedy decoding otherwise.
+ early_stopping (`bool`, *optional*, defaults to `False`):
+ Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
+ num_beams (`int`, *optional*, defaults to 1):
+ Number of beams for beam search. 1 means no beam search.
+ temperature (`float`, *optional*, defaults to 1.0):
+ The value used to modulate the next token probabilities.
+ top_k (`int`, *optional*, defaults to 50):
+ The number of highest probability vocabulary tokens to keep for top-k-filtering. 
+ top_p (`float`, *optional*, defaults to 1.0):
+ If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or higher
+ are kept for generation.
+ repetition_penalty (`float`, *optional*, defaults to 1.0):
+ The parameter for repetition penalty. 1.0 means no penalty. See [this
+ paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ bos_token_id (`int`, *optional*):
+ The id of the *beginning-of-sequence* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
+ length_penalty (`float`, *optional*, defaults to 1.0):
+ Exponential penalty to the length. 1.0 means no penalty.
+
+ Set to values < 1.0 in order to encourage the model to generate shorter sequences, or to a value > 1.0 in
+ order to encourage the model to produce longer sequences.
+ no_repeat_ngram_size (`int`, *optional*, defaults to 0):
+ If set to int > 0, all ngrams of that size can only occur once.
+ bad_words_ids (`List[List[int]]`, *optional*):
+ List of token ids that are not allowed to be generated. In order to get the tokens of the words that
+ should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`.
+ num_return_sequences (`int`, *optional*, defaults to 1):
+ The number of independently computed returned sequences for each element in the batch.
+ attention_mask (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, 1 for tokens
+ that are not masked, and 0 for masked tokens.
+
+ If not provided, will default to a tensor the same shape as `input_ids` that masks the pad token.
+
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_start_token_id (`int`, *optional*):
+ If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should use the past key/values attentions (if applicable to the model) to
+ speed up decoding.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more details.
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more details.
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ forced_bos_token_id (`int`, *optional*):
+ The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful
+ for multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be
+ the target language token.
+ forced_eos_token_id (`int`, *optional*):
+ The id of the token to force as the last generated token when `max_length` is reached.
+ model_kwargs:
+ Additional model specific kwargs will be forwarded to the `call` function of the model. 
+ + Return: + [`~file_utils.ModelOutput`] or `tf.Tensor`: A [`~file_utils.ModelOutput`] (if + `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `tf.Tensor`. + + If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible + [`~file_utils.ModelOutput`] types are: + + - [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`], + - [`~generation_tf_utils.TFSampleDecoderOnlyOutput`], + - [`~generation_tf_utils.TFBeamSearchDecoderOnlyOutput`], + - [`~generation_tf_utils.TFBeamSampleDecoderOnlyOutput`] + + If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible + [`~file_utils.ModelOutput`] types are: + + - [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`], + - [`~generation_tf_utils.TFSampleEncoderDecoderOutput`], + - [`~generation_tf_utils.TFBeamSearchEncoderDecoderOutput`], + - [`~generation_tf_utils.TFBeamSampleEncoderDecoderOutput`] + + Examples: + + ```python + tokenizer = AutoTokenizer.from_pretrained("distilgpt2") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained( + "distilgpt2" + ) # Download model and configuration from huggingface.co and cache. + outputs = model.generate(max_length=40) # do greedy decoding + print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("openai-gpt") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained( + "openai-gpt" + ) # Download model and configuration from huggingface.co and cache. + input_context = "The dog" + input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context + outputs = model.generate( + input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5 + ) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' + for i in range(3): # 3 output sequences were generated + print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("distilgpt2") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained( + "distilgpt2" + ) # Download model and configuration from huggingface.co and cache. + input_context = "The dog" + input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context + outputs = model.generate( + input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True + ) # generate 3 candidates using sampling + for i in range(3): # 3 output sequences were generated + print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("ctrl") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained( + "ctrl" + ) # Download model and configuration from huggingface.co and cache. + input_context = "Legal My neighbor is" # "Legal" is one of the control codes for ctrl + input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context + outputs = model.generate( + input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2 + ) # generate sequences + print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("gpt2") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained( + "gpt2" + ) # Download model and configuration from huggingface.co and cache. 
+        input_context = "My cute dog"
+        bad_words_ids = [
+            tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ["idiot", "stupid", "shut up"]
+        ]
+        input_ids = tokenizer.encode(input_context, return_tensors="tf")  # encode input context
+        outputs = model.generate(
+            input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids
+        )  # generate sequences without allowing bad_words to be generated
+        ```"""
+        # 1. Set generation parameters if not already defined
+        num_beams = num_beams if num_beams is not None else self.config.num_beams
+        do_sample = do_sample if do_sample is not None else self.config.do_sample
+        max_length = max_length if max_length is not None else self.config.max_length
+        min_length = min_length if min_length is not None else self.config.min_length
+        early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping
+
+        bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
+        pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
+
+        output_scores = output_scores if output_scores is not None else self.config.output_scores
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict_in_generate = (
+            return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
+        )
+
+        if self.config.is_encoder_decoder:
+            model_kwargs["encoder_attentions"] = None
+            model_kwargs["encoder_hidden_states"] = None
+
+        if input_ids is not None:
+            batch_size = shape_list(input_ids)[0]  # overridden by the input batch_size
+        else:
+            batch_size = 1
+
+        # 2. Define model inputs
+
+        # This block corresponds to the following line in `generation_tf_utils`:
+        # "input_ids = self._prepare_input_ids_for_generation(bos_token_id, model_kwargs.get("encoder_outputs"))"
+        # with the following differences:
+        # 1. In PT, `generate()`'s `model_kwargs` can accept `encoder_outputs`, but not the case in TF.
+        # 2. There is no shape checking in PT.
+        # In both PT/TF, if `input_ids` is `None`, we try to create it as it is for a text model.
+        if input_ids is None:
+            assert isinstance(bos_token_id, int) and bos_token_id >= 0, (
+                "you should either supply a context to complete as `input_ids` input "
+                "or a `bos_token_id` (integer >= 0) as a first token to start the generation."
+            )
+            input_ids = tf.fill((batch_size, 1), bos_token_id)
+
+        # do not allow duplicate outputs when greedy decoding
+        if do_sample is False:
+            if num_beams == 1:
+                # no_beam_search greedy generation conditions
+                assert (
+                    num_return_sequences == 1
+                ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1"
+
+            else:
+                # beam_search greedy generation conditions
+                assert (
+                    num_beams >= num_return_sequences
+                ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences"
+
+        # create attention mask if necessary
+        # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140
+        if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()):
+            attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32)
+        elif attention_mask is None:
+            attention_mask = tf.ones_like(input_ids)
+
+        if pad_token_id is None and eos_token_id is not None:
+            logger.warning(f"Setting `pad_token_id` to {eos_token_id} (first `eos_token_id`) to generate sequence")
+            pad_token_id = eos_token_id
+
+        # current position and vocab size
+        cur_len = shape_list(input_ids)[1]  # unused
+        vocab_size = getattr(self.config, "vocab_size", None)
+        if vocab_size is None and self.config.is_encoder_decoder:
+            decoder_config = getattr(self.config, "decoder", None)
+            if decoder_config is not None:
+                vocab_size = getattr(self.config.decoder, "vocab_size", None)
+
+        # set effective batch size and effective batch multiplier according to do_sample
+        if do_sample:
+            effective_batch_size = batch_size * num_return_sequences
+            effective_batch_mult = num_return_sequences
+        else:
+            effective_batch_size = batch_size
+            effective_batch_mult = 1
+
+        if self.config.is_encoder_decoder:
+            if decoder_start_token_id is None:
+                decoder_start_token_id = bos_token_id
+
+            assert (
+                decoder_start_token_id is not None
+            ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation"
+            assert hasattr(self, "get_encoder"), f"{self} should have a 'get_encoder' function defined"
+            assert callable(self.get_encoder), f"{self.get_encoder} should be a method"
+
+            # get encoder and store encoder outputs
+            encoder = self.get_encoder()
+
+            encoder_kwargs = {
+                "attention_mask": attention_mask,
+                "output_attentions": output_attentions,
+                "output_hidden_states": output_hidden_states,
+                "return_dict": return_dict_in_generate,
+            }
+
+            # vision models don't use `attention_mask`.
+            signature = dict(inspect.signature(encoder.call).parameters)
+            if "attention_mask" not in signature:
+                encoder_kwargs.pop("attention_mask")
+
+            encoder_outputs = encoder(input_ids, **encoder_kwargs)
+            if return_dict_in_generate:
+                if output_attentions:
+                    model_kwargs["encoder_attentions"] = encoder_outputs.attentions
+                if output_hidden_states:
+                    model_kwargs["encoder_hidden_states"] = encoder_outputs.hidden_states
+
+        # The condition `len(shape_list(input_ids)) == 2` is to make this block treat only text inputs.
+ # (vision inputs might occur when the model is an encoder-decoder model) + # Expand input ids if num_beams > 1 or num_return_sequences > 1 + if len(shape_list(input_ids)) == 2 and (num_return_sequences > 1 or num_beams > 1): + input_ids_len = shape_list(input_ids)[-1] + input_ids = tf.broadcast_to( + tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) + ) + attention_mask = tf.broadcast_to( + tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) + ) + input_ids = tf.reshape( + input_ids, (effective_batch_size * num_beams, input_ids_len) + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + attention_mask = tf.reshape( + attention_mask, (effective_batch_size * num_beams, input_ids_len) + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + + if self.config.is_encoder_decoder: + # create empty decoder_input_ids + input_ids = ( + tf.ones( + (effective_batch_size * num_beams, 1), + dtype=tf.int32, + ) + * decoder_start_token_id + ) + cur_len = 1 + + assert ( + batch_size == encoder_outputs[0].shape[0] + ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " + + # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) + expanded_batch_idxs = tf.reshape( + tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1), + shape=(-1,), + ) + # expand encoder_outputs + encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0),) + else: + encoder_outputs = None + cur_len = shape_list(input_ids)[-1] + + # TODO(Patrick) - not very clean here + model_kwargs["attention_mask"] = attention_mask + model_kwargs["past"] = encoder_outputs # defined for encoder-decoder models, None for decoder-only models + + assert ( + cur_len < max_length + ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" + + is_greedy_gen_mode = (num_beams == 1) and do_sample is False + + # prepare distribution pre_processing samplers + # 7. prepare distribution pre_processing samplers + logits_processor = self._get_logits_processor( + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + min_length=min_length, + eos_token_id=eos_token_id, + ) + + # 8. go into different generation modes + if is_greedy_gen_mode: + if num_return_sequences > 1: + raise ValueError( + f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search." + ) + + # 9. 
run greedy search
+            return self.greedy_search(
+                input_ids,
+                max_length=max_length,
+                pad_token_id=pad_token_id,
+                eos_token_id=eos_token_id,
+                logits_processor=logits_processor,
+                output_scores=output_scores,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict_in_generate=return_dict_in_generate,
+                **model_kwargs,
+            )
+
+    @staticmethod
+    def _update_model_kwargs_for_generation(
+        outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False
+    ) -> Dict[str, Any]:
+        # update past
+        if "past_key_values" in outputs:
+            model_kwargs["past"] = outputs.past_key_values
+        elif "mems" in outputs:
+            model_kwargs["past"] = outputs.mems
+        elif "past_buckets_states" in outputs:
+            model_kwargs["past"] = outputs.past_buckets_states
+        else:
+            model_kwargs["past"] = None
+
+        # update attention mask
+        if not is_encoder_decoder:
+            if "attention_mask" in model_kwargs:
+                attention_mask = model_kwargs["attention_mask"]
+                model_kwargs["attention_mask"] = tf.concat(
+                    [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1
+                )
+
+        return model_kwargs
+
+    def _get_logits_processor(
+        self,
+        repetition_penalty: float,
+        no_repeat_ngram_size: int,
+        bad_words_ids: List[List[int]],
+        min_length: int,
+        eos_token_id: int,
+    ) -> TFLogitsProcessorList:
+        """
+        This method returns a [`TFLogitsProcessorList`] list object that contains all relevant [`TFLogitsProcessor`]
+        instances used to modify the scores of the language model head.
+        """
+        processors = TFLogitsProcessorList()
+
+        repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty
+        no_repeat_ngram_size = (
+            no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size
+        )
+        bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids
+        eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
+
+        # instantiate processors list
+        if repetition_penalty is not None and repetition_penalty != 1.0:
+            processors.append(TFRepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
+        if no_repeat_ngram_size is not None and no_repeat_ngram_size > 0:
+            processors.append(TFNoRepeatNGramLogitsProcessor(no_repeat_ngram_size))
+        if bad_words_ids is not None:
+            processors.append(TFNoBadWordsLogitsProcessor(bad_words_ids, eos_token_id))
+        if min_length is not None and eos_token_id is not None and min_length > -1:
+            processors.append(TFMinLengthLogitsProcessor(min_length, eos_token_id))
+
+        return processors
+
+    def greedy_search(
+        self,
+        input_ids: tf.Tensor,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        logits_processor: Optional[TFLogitsProcessorList] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        **model_kwargs,
+    ) -> Union[TFGreedySearchOutput, tf.Tensor]:
+        r"""
+        Generates sequences for models with a language modeling head using greedy decoding.
+
+        Parameters:
+
+            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            logits_processor (`TFLogitsProcessorList`, *optional*):
+                An instance of [`TFLogitsProcessorList`]. List of instances of class derived from [`TFLogitsProcessor`]
+                used to modify the prediction scores of the language modeling head applied at each generation step.
+            max_length (`int`, *optional*, defaults to 20):
+                The maximum length of the sequence to be generated.
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            eos_token_id (`int`, *optional*):
+                The id of the *end-of-sequence* token.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more details.
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more details.
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+            model_kwargs:
+                Additional model-specific keyword arguments will be forwarded to the `forward` function of the model.
+                If the model is an encoder-decoder model, the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`],
+            [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`] or `tf.Tensor`: A `tf.Tensor` containing the
+            generated tokens (default behaviour) or a [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`] if
+            `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a
+            [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import (
+        ...     AutoTokenizer,
+        ...     TFAutoModelForCausalLM,
+        ...     TFLogitsProcessorList,
+        ...     TFMinLengthLogitsProcessor,
+        ... )
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        >>> model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+
+        >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
+        >>> model.config.pad_token_id = model.config.eos_token_id
+
+        >>> input_prompt = "Today is a beautiful day, and"
+        >>> input_ids = tokenizer(input_prompt, return_tensors="tf").input_ids
+
+        >>> # instantiate logits processors
+        >>> logits_processor = TFLogitsProcessorList(
+        ...     [
+        ...         TFMinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
+        ...     ]
+        ...
) + + >>> outputs = model.greedy_search(input_ids, logits_processor=logits_processor) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + ```""" + # init values + logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList() + + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_attentions"] if output_attentions else None + encoder_hidden_states = model_kwargs["encoder_hidden_states"] if output_hidden_states else None + + # keep track of which sequences are already finished + unfinished_sequences = tf.ones_like(input_ids[:, 0]) + cur_len = input_ids.shape[-1] + + while cur_len < max_length: + # TODO (Patrick): remove following line by cleaning up `prepare_inputs_for_generation` + # in all models + model_kwargs["use_cache"] = None if "use_cache" not in model_kwargs else model_kwargs["use_cache"] + + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + next_token_logits = outputs.logits[:, -1, :] + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # pre-process distribution + next_tokens_scores = logits_processor(input_ids, next_token_logits, cur_len) + + # argmax + next_tokens = tf.cast(tf.argmax(next_tokens_scores, axis=-1), tf.int32) + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + if pad_token_id is None: + raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + input_ids = 
tf.concat([input_ids, next_tokens[:, None]], axis=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + cur_len = cur_len + 1 + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id is not None: + eos_in_sents = next_tokens == eos_token_id + # if sentence is unfinished and the token to add is eos + is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply( + unfinished_sequences, tf.cast(eos_in_sents, tf.int32) + ) + + # unfinished_sequences is set to zero if eos in sentence + unfinished_sequences -= is_sents_unfinished_and_token_to_add_is_eos + + # stop when each sentence is finished, or if we exceed the maximum length + if tf.math.reduce_max(unfinished_sequences) == 0: + break + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return TFGreedySearchEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return TFGreedySearchDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return input_ids + + +def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty): + # create logit penalties for already seen input_ids + token_penalties = np.ones(shape_list(logits)) + prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()] + for i, prev_input_id in enumerate(prev_input_ids): + logit_penalized = logits[i].numpy()[prev_input_id] + logit_penalties = np.zeros(logit_penalized.shape) + # if previous logit score is < 0 then multiply repetition penalty else divide + logit_penalties[logit_penalized < 0] = repetition_penalty + logit_penalties[logit_penalized > 0] = 1 / repetition_penalty + np.put(token_penalties[i], prev_input_id, logit_penalties) + return tf.convert_to_tensor(token_penalties, dtype=tf.float32) + + +def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): + # Copied from fairseq for no_repeat_ngram in beam_search + if cur_len + 1 < no_repeat_ngram_size: + # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + return [[] for _ in range(num_hypos)] + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].numpy().tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] + + def _get_generated_ngrams(hypo_idx): + # Before decoding the next token, prevent decoding of ngrams that have already appeared + start_idx = cur_len + 1 - no_repeat_ngram_size + ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist()) + return generated_ngrams[hypo_idx].get(ngram_idx, []) + + banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] + return banned_tokens + + +def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): + banned_tokens = [] + + def _tokens_match(prev_tokens, tokens): + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + if len(tokens) > len(prev_tokens): + # if bad word tokens are longer 
than prev tokens they can't be equal
+            return False
+
+        if prev_tokens[-len(tokens) :] == tokens:
+            # if tokens match
+            return True
+        else:
+            return False
+
+    for prev_input_ids_slice in prev_input_ids:
+        banned_tokens_slice = []
+
+        for banned_token_seq in bad_words_ids:
+            assert (
+                len(banned_token_seq) > 0
+            ), f"Banned words token sequences {bad_words_ids} cannot have an empty list"
+
+            if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False:
+                # if tokens do not match continue
+                continue
+
+            banned_tokens_slice.append(banned_token_seq[-1])
+
+        banned_tokens.append(banned_tokens_slice)
+
+    return banned_tokens
+
+
+def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
+    """
+    Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
+
+    Args:
+        logits: logits distribution shape (batch size, vocabulary size)
+        top_k (`int`, *optional*, defaults to 0):
+            If > 0, only keep the top k tokens with highest probability (top-k filtering)
+        top_p (`float`, *optional*, defaults to 1.0):
+            If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus
+            filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
+            Minimum number of tokens we keep per batch example in the output.
+
+    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
+    """
+    logits_shape = shape_list(logits)
+
+    if top_k > 0:
+        top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1])  # Safety check
+        # Remove all tokens with a probability less than the last token of the top-k
+        indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None]
+        logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value)
+
+    if top_p < 1.0:
+        sorted_indices = tf.argsort(logits, direction="DESCENDING")
+        sorted_logits = tf.gather(
+            logits, sorted_indices, axis=-1, batch_dims=1
+        )  # expects logits to be of dim (batch_size, vocab_size)
+
+        cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1)
+
+        # Remove tokens with cumulative probability above the threshold (tokens with 0 are kept)
+        sorted_indices_to_remove = cumulative_probs > top_p
+
+        if min_tokens_to_keep > 1:
+            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
+            sorted_indices_to_remove = tf.concat(
+                [
+                    tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]),
+                    sorted_indices_to_remove[:, min_tokens_to_keep:],
+                ],
+                -1,
+            )
+
+        # Shift the indices to the right to keep also the first token above the threshold
+        sorted_indices_to_remove = tf.concat(
+            [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, :-1]],
+            -1,
+        )
+        # scatter sorted tensors to original indexing
+        indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices)
+        logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value)
+    return logits
+
+
+def scatter_values_on_batch_indices(values, batch_indices):
+    shape = shape_list(batch_indices)
+    # broadcast batch dim to shape
+    broad_casted_batch_dims = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1])
+    # transform batch_indices to pair_indices
+    pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0))
+    # scatter values to pair indices
+    return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape)
+
+
+def set_tensor_by_indices_to_value(tensor, indices, value):
+    # create value_tensor since tensor value assignment is not possible in TF
+    value_tensor = tf.zeros_like(tensor) + value
+    return tf.where(indices, value_tensor, tensor)
+
+
+def sample_without_replacement(logits, num_samples):
+    """
+    Categorical sampling without replacement is currently not implemented; the Gumbel-max trick will do for now. See
+    https://github.com/tensorflow/tensorflow/issues/9260 for more info.
+    """
+    # Gumbel(0, 1) noise is -log(-log(u)) for u ~ Uniform(0, 1)
+    z = -tf.math.log(-tf.math.log(tf.random.uniform(shape_list(logits), 0, 1)))
+    _, indices = tf.nn.top_k(logits + z, num_samples)
+    return indices
+
+
+def shape_list(x):
+    """Deal with dynamic shape in tensorflow cleanly."""
+    static = x.shape.as_list()
+    dynamic = tf.shape(x)
+    return [dynamic[i] if s is None else s for i, s in enumerate(static)]
+
+
+class BeamHypotheses(object):
+    def __init__(self, num_beams, max_length, length_penalty, early_stopping):
+        """
+        Initialize n-best list of hypotheses.
+        """
+        self.max_length = max_length - 1  # ignoring bos_token
+        self.length_penalty = length_penalty
+        self.early_stopping = early_stopping
+        self.num_beams = num_beams
+        self.beams = []
+        self.worst_score = 1e9
+
+    def __len__(self):
+        """
+        Number of hypotheses in the list.
+        """
+        return len(self.beams)
+
+    def add(self, hyp, sum_logprobs):
+        """
+        Add a new hypothesis to the list.
+        """
+        score = sum_logprobs / len(hyp) ** self.length_penalty
+        if len(self) < self.num_beams or score > self.worst_score:
+            self.beams.append((score, hyp))
+            if len(self) > self.num_beams:
+                sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)])
+                del self.beams[sorted_scores[0][1]]
+                self.worst_score = sorted_scores[1][0]
+            else:
+                self.worst_score = min(score, self.worst_score)
+
+    def is_done(self, best_sum_logprobs, cur_len):
+        """
+        If there are enough hypotheses and none of the hypotheses being generated can become better than the worst
+        one in the heap, then we are done with this sentence.
+        """
+
+        if len(self) < self.num_beams:
+            return False
+        elif self.early_stopping:
+            return True
+        else:
+            cur_score = best_sum_logprobs / cur_len ** self.length_penalty
+            ret = self.worst_score >= cur_score
+            return ret
diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py
index b84f4a39686d..48089989e91f 100644
--- a/src/transformers/generation_tf_logits_process.py
+++ b/src/transformers/generation_tf_logits_process.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team
+# Copyright 2022 The HuggingFace Inc. team
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
 import inspect
 from abc import ABC
-from typing import Iterable, List, Optional
+from typing import List
 
 import numpy as np
 import tensorflow as tf
@@ -44,10 +44,17 @@
         Return:
             `tf.Tensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
-
 """
+
+
+# TODO(Patrick) - this function is copied from `generation_tf_utils.py`
+# it should be moved into a `tf_utils.py` file.
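As a quick editorial aside (not part of the patch): the helper being copied here, together with `sample_without_replacement` above, can be sanity-checked standalone. The tensor values below are made up for illustration:

```python
import tensorflow as tf

# Masked assignment via tf.where, as in `set_tensor_by_indices_to_value`
logits = tf.constant([[1.0, 2.0, 3.0, 4.0]])
mask = tf.constant([[False, True, False, True]])
value_tensor = tf.zeros_like(logits) + float("-inf")
print(tf.where(mask, value_tensor, logits))  # [[1., -inf, 3., -inf]]

# Gumbel-top-k, as in `sample_without_replacement`: perturbing logits with
# Gumbel(0, 1) noise and taking the top-k indices draws k distinct tokens
# with probability proportional to softmax(logits)
gumbel_noise = -tf.math.log(-tf.math.log(tf.random.uniform(tf.shape(logits), 0, 1)))
_, sampled_indices = tf.nn.top_k(logits + gumbel_noise, k=2)
print(sampled_indices)  # two distinct token indices; varies between runs
```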
+def set_tensor_by_indices_to_value(tensor, indices, value): + # create value_tensor since tensor value assignment is not possible in TF + value_tensor = tf.zeros_like(tensor) + value + return tf.where(indices, value_tensor, tensor) + + class TFLogitsProcessor(ABC): """Abstract base class for all logit processors that can be applied during generation.""" @@ -111,7 +118,7 @@ def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf. # generate is not XLA - compileable anyways if apply_penalty: eos_token_id_mask = tf.broadcast_to(tf.range(scores.shape[-1]) == self.eos_token_id, scores.shape) - scores = tf.where(eos_token_id_mask, tf.ones_like(scores) * float("-inf"), scores) + scores = set_tensor_by_indices_to_value(scores, eos_token_id_mask, float("-inf")) return scores @@ -132,13 +139,25 @@ def __init__(self, penalty: float): self.penalty = penalty + def _create_score_penalties(self, input_ids, logits): + # create logit penalties for already seen input_ids + token_penalties = np.ones(logits.shape) + prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()] + for i, prev_input_id in enumerate(prev_input_ids): + logit_penalized = logits[i].numpy()[prev_input_id] + logit_penalties = np.zeros(logit_penalized.shape) + # if previous logit score is < 0 then multiply repetition penalty else divide + logit_penalties[logit_penalized < 0] = self.penalty + logit_penalties[logit_penalized > 0] = 1 / self.penalty + np.put(token_penalties[i], prev_input_id, logit_penalties) + return tf.convert_to_tensor(token_penalties, dtype=tf.float32) + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: - score = tf.gather(scores, 1, input_ids) - # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability - score = tf.where(score < 0, score * self.penalty, score / self.penalty) + score_penalties = self._create_score_penalties(input_ids, scores) + + scores = tf.math.multiply(scores, score_penalties) - scores.scatter_(1, input_ids, score) return scores @@ -168,135 +187,61 @@ def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int): f"Each list in `bad_words_ids` has to be a list of positive integers, but is {bad_words_ids}." 
) - bad_words_ids = list(filter(lambda bad_token_seq: bad_token_seq != [eos_token_id], bad_words_ids)) - self.bad_words_id_length_1 = [] - self.bad_words_id_length_greater_than_1 = [] - for word in bad_words_ids: - if len(word) == 1: - self.bad_words_id_length_1.append(word[0]) - else: - self.bad_words_id_length_greater_than_1.append(word) - - self.static_bad_words_mask: Optional[tf.Tensor] = None - - for banned_token_seq in self.bad_words_id_length_greater_than_1: - if len(banned_token_seq) == 0: - raise ValueError(f"Banned words token sequences {bad_words_ids} cannot have an empty list") + self.bad_words_ids = bad_words_ids - def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: - if self.static_bad_words_mask is None and len(self.bad_words_id_length_1) > 0: - self.static_bad_words_mask = self._calc_static_bad_word_mask(scores) - - dynamic_banned_tokens = self._calc_banned_bad_words_ids(input_ids.tolist()) - scores = self._set_scores_to_inf_for_banned_tokens(scores, dynamic_banned_tokens) + def calc_banned_bad_words_ids(self, prev_input_ids): + banned_tokens = [] - return scores + def _tokens_match(prev_tokens, tokens): + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + if len(tokens) > len(prev_tokens): + # if bad word tokens are longer than prev tokens they can't be equal + return False + + if prev_tokens[-len(tokens) :] == tokens: + # if tokens match + return True + else: + return False - def _calc_static_bad_word_mask(self, scores: tf.Tensor) -> tf.Tensor: - static_bad_words_mask = tf.zeros(scores.shape[1]) - static_bad_words_mask[self.bad_words_id_length_1] = 1 - return static_bad_words_mask.unsqueeze(0).to(scores.device).bool() - - def _tokens_match(self, prev_tokens: List[int], tokens: List[int]) -> bool: - if len(tokens) == 0: - # if bad word tokens is just one token always ban it - return True - elif len(tokens) > len(prev_tokens): - # if bad word tokens are longer then prev input_ids they can't be equal - return False - else: - return prev_tokens[-len(tokens) :] == tokens - - def _calc_banned_bad_words_ids(self, prev_input_ids: List[List[int]]) -> Iterable[int]: - banned_tokens = [] for prev_input_ids_slice in prev_input_ids: banned_tokens_slice = [] - for banned_token_seq in self.bad_words_id_length_greater_than_1: - if self._tokens_match(prev_input_ids_slice, banned_token_seq[:-1]): - banned_tokens_slice.append(banned_token_seq[-1]) - - banned_tokens.append(banned_tokens_slice) - return banned_tokens + for banned_token_seq in self.bad_words_ids: + assert ( + len(banned_token_seq) > 0 + ), f"Banned words token sequences {self.bad_words_ids} cannot have an empty list" - def _set_scores_to_inf_for_banned_tokens(self, scores: tf.Tensor, banned_tokens: List[List[int]]) -> tf.Tensor: - """ - Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be a - list of list of banned tokens to ban in the format [[batch index, vocabulary position],... - - Args: - scores: logits distribution of shape (batch size, vocabulary size) - banned_tokens: list of list of tokens to ban of length (batch_size) - """ - banned_mask_list = [] - for idx, batch_banned_tokens in enumerate(banned_tokens): - for token in batch_banned_tokens: - # Eliminates invalid bad word IDs that are over the vocabulary size. - if token <= scores.shape[1]: - banned_mask_list.append([idx, token]) - else: - logger.error( - f"An invalid bad word ID is defined: {token}. 
This ID is not contained in the " - f"vocabulary, and is therefore ignored." - ) - if not banned_mask_list and self.static_bad_words_mask is None: - return scores - - else: - if banned_mask_list: - banned_mask = tf.Tensor(banned_mask_list) - indices = tf.ones(len(banned_mask)) - # A sparse tensor is generated from a list of coordinates: [[0, 1], [0, 2], [2, 0]]. A conversion to dense tensor generates: - # [ 0 1 1 ] - # [ 0 0 0 ] - # [ 1 0 0 ] - - banned_mask = ( - tf.sparse.Tensor(banned_mask.t(), indices, scores.size()).to(scores.device).to_dense().bool() - ) - - if self.static_bad_words_mask is not None: - banned_mask = tf.bitwise_or(banned_mask, self.static_bad_words_mask) - else: - banned_mask = self.static_bad_words_mask + if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False: + # if tokens do not match continue + continue - scores = scores.masked_fill(banned_mask, -float("inf")) - return scores + banned_tokens_slice.append(banned_token_seq[-1]) + banned_tokens.append(banned_tokens_slice) -def _get_ngrams(ngram_size: int, prev_input_ids: tf.Tensor, num_hypos: int): - generated_ngrams = [{} for _ in range(num_hypos)] - for idx in range(num_hypos): - gen_tokens = prev_input_ids[idx].tolist() - generated_ngram = generated_ngrams[idx] - for ngram in zip(*[gen_tokens[i:] for i in range(ngram_size)]): - prev_ngram_tuple = tuple(ngram[:-1]) - generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] - return generated_ngrams + return banned_tokens + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: -def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur_len): - # Before decoding the next token, prevent decoding of ngrams that have already appeared - start_idx = cur_len + 1 - ngram_size - ngram_idx = tuple(prev_input_ids[start_idx:cur_len].tolist()) - return banned_ngrams.get(ngram_idx, []) + vocab_size = scores.shape[-1] + # calculate a list of banned tokens according to bad words + banned_tokens = self.calc_banned_bad_words_ids(input_ids) -def _calc_banned_ngram_tokens( - ngram_size: int, prev_input_ids: tf.Tensor, num_hypos: int, cur_len: int -) -> List[Iterable[int]]: - """Copied from fairseq for no_repeat_ngram in beam_search""" - if cur_len + 1 < ngram_size: - # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet - return [[] for _ in range(num_hypos)] + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) - generated_ngrams = _get_ngrams(ngram_size, prev_input_ids, num_hypos) + scores = set_tensor_by_indices_to_value( + scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) - banned_tokens = [ - _get_generated_ngrams(generated_ngrams[hypo_idx], prev_input_ids[hypo_idx], ngram_size, cur_len) - for hypo_idx in range(num_hypos) - ] - return banned_tokens + return scores class TFNoRepeatNGramLogitsProcessor(TFLogitsProcessor): @@ -314,12 +259,43 @@ def __init__(self, ngram_size: int): raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}") self.ngram_size = ngram_size + def calc_banned_ngram_tokens(self, prev_input_ids, num_hypos, cur_len): + # Copied from fairseq for no_repeat_ngram in beam_search + if cur_len + 1 < self.ngram_size: + # return no banned tokens if we haven't generated ngram_size tokens yet + return [[] for _ 
in range(num_hypos)] + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].numpy().tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] for i in range(self.ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] + + def _get_generated_ngrams(hypo_idx): + # Before decoding the next token, prevent decoding of ngrams that have already appeared + start_idx = cur_len + 1 - self.ngram_size + ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist()) + return generated_ngrams[hypo_idx].get(ngram_idx, []) + + banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] + + return banned_tokens + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: - num_batch_hypotheses = scores.shape[0] - cur_len = input_ids.shape[-1] - banned_batch_tokens = _calc_banned_ngram_tokens(self.ngram_size, input_ids, num_batch_hypotheses, cur_len) - for i, banned_tokens in enumerate(banned_batch_tokens): - scores[i, banned_tokens] = -float("inf") + batch_size, vocab_size = scores.shape + banned_tokens = self.calc_banned_ngram_tokens(input_ids, batch_size, cur_len) + + # create banned_tokens boolean mask + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + scores = set_tensor_by_indices_to_value( + scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) return scores diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index 8f2a885d0800..2ce2f9168036 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -480,18 +480,18 @@ def generate( If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible [`~file_utils.ModelOutput`] types are: - - [`~generation_utils.TFGreedySearchDecoderOnlyOutput`], - - [`~generation_utils.TFSampleDecoderOnlyOutput`], - - [`~generation_utils.TFBeamSearchDecoderOnlyOutput`], - - [`~generation_utils.TFBeamSampleDecoderOnlyOutput`] + - [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`], + - [`~generation_tf_utils.TFSampleDecoderOnlyOutput`], + - [`~generation_tf_utils.TFBeamSearchDecoderOnlyOutput`], + - [`~generation_tf_utils.TFBeamSampleDecoderOnlyOutput`] If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible [`~file_utils.ModelOutput`] types are: - - [`~generation_utils.TFGreedySearchEncoderDecoderOutput`], - - [`~generation_utils.TFSampleEncoderDecoderOutput`], - - [`~generation_utils.TFBeamSearchEncoderDecoderOutput`], - - [`~generation_utils.TFBeamSampleEncoderDecoderOutput`] + - [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`], + - [`~generation_tf_utils.TFSampleEncoderDecoderOutput`], + - [`~generation_tf_utils.TFBeamSearchEncoderDecoderOutput`], + - [`~generation_tf_utils.TFBeamSampleEncoderDecoderOutput`] Examples: @@ -667,7 +667,7 @@ def generate( bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" - # This block corresponds to the following line in `generation_utils`: + # This block corresponds to the following 
line in `generation_tf_utils`: # "input_ids = self._prepare_input_ids_for_generation(bos_token_id, model_kwargs.get("encoder_outputs"))" # with the following differences: # 1. In PT, `generate()`'s `model_kwargs` can accept `encoder_outputs`, but not the case in TF. @@ -1653,18 +1653,18 @@ def _generate( If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible [`~file_utils.ModelOutput`] types are: - - [`~generation_utils.TFGreedySearchDecoderOnlyOutput`], - - [`~generation_utils.TFSampleDecoderOnlyOutput`], - - [`~generation_utils.TFBeamSearchDecoderOnlyOutput`], - - [`~generation_utils.TFBeamSampleDecoderOnlyOutput`] + - [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`], + - [`~generation_tf_utils.TFSampleDecoderOnlyOutput`], + - [`~generation_tf_utils.TFBeamSearchDecoderOnlyOutput`], + - [`~generation_tf_utils.TFBeamSampleDecoderOnlyOutput`] If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible [`~file_utils.ModelOutput`] types are: - - [`~generation_utils.TFGreedySearchEncoderDecoderOutput`], - - [`~generation_utils.TFSampleEncoderDecoderOutput`], - - [`~generation_utils.TFBeamSearchEncoderDecoderOutput`], - - [`~generation_utils.TFBeamSampleEncoderDecoderOutput`] + - [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`], + - [`~generation_tf_utils.TFSampleEncoderDecoderOutput`], + - [`~generation_tf_utils.TFBeamSearchEncoderDecoderOutput`], + - [`~generation_tf_utils.TFBeamSampleEncoderDecoderOutput`] Examples: @@ -1724,44 +1724,16 @@ def _generate( input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids ) # generate sequences without allowing bad_words to be generated ```""" - - # We cannot generate if the model does not have a LM head - if self.get_output_embeddings() is None: - raise AttributeError( - "You tried to generate sequences with a model that does not have a LM Head. " - "Please use another model class (e.g. `TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, `TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, `TFTransfoXLLMHeadModel`)" - ) - + # 1. 
Set generation parameters if not already defined
         num_beams = num_beams if num_beams is not None else self.config.num_beams
         do_sample = do_sample if do_sample is not None else self.config.do_sample
         max_length = max_length if max_length is not None else self.config.max_length
         min_length = min_length if min_length is not None else self.config.min_length
         early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping
-        temperature = temperature if temperature is not None else self.config.temperature
-        top_k = top_k if top_k is not None else self.config.top_k
-        top_p = top_p if top_p is not None else self.config.top_p
-        repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty
 
         bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
         pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
         eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
-        length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
-        no_repeat_ngram_size = (
-            no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size
-        )
-        bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids
-        num_return_sequences = (
-            num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
-        )
-        decoder_start_token_id = (
-            decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id
-        )
-        forced_bos_token_id = (
-            forced_bos_token_id if forced_bos_token_id is not None else self.config.forced_bos_token_id
-        )
-        forced_eos_token_id = (
-            forced_eos_token_id if forced_eos_token_id is not None else self.config.forced_eos_token_id
-        )
 
         output_scores = output_scores if output_scores is not None else self.config.output_scores
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1772,68 +1744,38 @@ def _generate(
             return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
         )
 
-        if self.config.is_encoder_decoder:
-            model_kwargs["encoder_attentions"] = None
-            model_kwargs["encoder_hidden_states"] = None
+        if pad_token_id is None and eos_token_id is not None:
+            logger.warning(f"Setting `pad_token_id` to {eos_token_id} (first `eos_token_id`) to generate sequence")
+            pad_token_id = eos_token_id
+
+        # 2. Define model inputs
+        # input_ids now has to be defined
+        input_ids = self._prepare_model_inputs(input_ids, bos_token_id)
 
         if input_ids is not None:
             batch_size = shape_list(input_ids)[0]  # overridden by the input batch_size
         else:
             batch_size = 1
 
-        assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
-        assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
-        assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
-        assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
-        assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
-        assert temperature > 0, "`temperature` should be strictly positive."
-        assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
-        assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
-        assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
- assert input_ids is not None or ( - isinstance(bos_token_id, int) and bos_token_id >= 0 - ), "If input_ids is not defined, `bos_token_id` should be a positive integer." - assert pad_token_id is None or ( - isinstance(pad_token_id, int) and (pad_token_id >= 0) - ), "`pad_token_id` should be a positive integer." - assert (eos_token_id is None) or ( - isinstance(eos_token_id, int) and (eos_token_id >= 0) - ), "`eos_token_id` should be a positive integer." - assert length_penalty > 0, "`length_penalty` should be strictly positive." - assert ( - isinstance(num_return_sequences, int) and num_return_sequences > 0 - ), "`num_return_sequences` should be a strictly positive integer." - assert ( - bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) - ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" + # 3. Define other model kwargs + if self.config.is_encoder_decoder: + model_kwargs["encoder_attentions"] = None + model_kwargs["encoder_hidden_states"] = None - # This block corresponds to the following line in `generation_utils`: + # This block corresponds to the following line in `generation_tf_utils`: # "input_ids = self._prepare_input_ids_for_generation(bos_token_id, model_kwargs.get("encoder_outputs"))" # with the following differences: # 1. In PT, `generate()`'s `model_kwargs` can accept `encoder_outputs`, but not the case in TF. # 2. There is no shape checking in PT. # In both PT/TF, if `input_ids` is `None`, we try to create it as it is for a text model. if input_ids is None: - assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( - "you should either supply a context to complete as `input_ids` input " - "or a `bos_token_id` (integer >= 0) as a first token to start the generation." - ) + if not isinstance(bos_token_id, int) or bos_token_id < 0: + raise ValueError( + "you should either supply a context to complete as `input_ids` input " + "or a `bos_token_id` (integer >= 0) as a first token to start the generation." + ) input_ids = tf.fill((batch_size, 1), bos_token_id) - # not allow to duplicate outputs when greedy decoding - if do_sample is False: - if num_beams == 1: - # no_beam_search greedy generation conditions - assert ( - num_return_sequences == 1 - ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" - - else: - # beam_search greedy generation conditions - assert ( - num_beams >= num_return_sequences - ), "Greedy beam search decoding cannot return more sequences than it has beams. 
Please set num_beams >= num_return_sequences" - # create attention mask if necessary # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): @@ -1841,10 +1783,6 @@ def _generate( elif attention_mask is None: attention_mask = tf.ones_like(input_ids) - if pad_token_id is None and eos_token_id is not None: - logger.warning(f"Setting `pad_token_id` to {eos_token_id} (first `eos_token_id`) to generate sequence") - pad_token_id = eos_token_id - # current position and vocab size cur_len = shape_list(input_ids)[1] # unused vocab_size = getattr(self.config, "vocab_size", None) @@ -1893,23 +1831,9 @@ def _generate( if output_hidden_states: model_kwargs["encoder_hidden_states"] = encoder_outputs.hidden_states - # The condition `len(shape_list(input_ids)) == 2` is to make this block treats only text inputs. - # (vision inputs might occur when the model is an encoder-decoder model) - # Expand input ids if num_beams > 1 or num_return_sequences > 1 - if len(shape_list(input_ids)) == 2 and (num_return_sequences > 1 or num_beams > 1): - input_ids_len = shape_list(input_ids)[-1] - input_ids = tf.broadcast_to( - tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) - ) - attention_mask = tf.broadcast_to( - tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) - ) - input_ids = tf.reshape( - input_ids, (effective_batch_size * num_beams, input_ids_len) - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) - attention_mask = tf.reshape( - attention_mask, (effective_batch_size * num_beams, input_ids_len) - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + # TODO(Patrick) - not very clean here + model_kwargs["attention_mask"] = attention_mask + model_kwargs["past"] = encoder_outputs # defined for encoder-decoder models, None for decoder-only models if self.config.is_encoder_decoder: # create empty decoder_input_ids @@ -1937,17 +1861,16 @@ def _generate( encoder_outputs = None cur_len = shape_list(input_ids)[-1] - # TODO(Patrick) - not very clean here - model_kwargs["attention_mask"] = attention_mask - model_kwargs["past"] = encoder_outputs # defined for encoder-decoder models, None for decoder-only models - - assert ( - cur_len < max_length - ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" + if cur_len >= max_length: + raise ValueError( + f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" + ) + # 6. determine generation mode is_greedy_gen_mode = (num_beams == 1) and do_sample is False # prepare distribution pre_processing samplers + # 7. prepare distribution pre_processing samplers logits_processor = self._get_logits_processor( repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, @@ -1956,13 +1879,14 @@ def _generate( eos_token_id=eos_token_id, ) + # 8. go into different generation modes if is_greedy_gen_mode: if num_return_sequences > 1: raise ValueError( f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search." ) - # 10. 
run greedy search
+        # 9. run greedy search
         return self.greedy_search(
             input_ids,
             max_length=max_length,
@@ -1976,6 +1900,32 @@ def _generate(
             **model_kwargs,
         )
 
+    def _prepare_decoder_input_ids_for_generation(
+        self,
+        batch_size: int,
+        decoder_start_token_id: int = None,
+        bos_token_id: int = None,
+        model_kwargs: Optional[Dict[str, tf.Tensor]] = None,
+    ) -> tf.Tensor:
+
+        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
+            return model_kwargs.pop("decoder_input_ids")
+        else:
+            decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
+            return tf.ones((batch_size, 1), dtype=tf.int32) * decoder_start_token_id
+
+    def _prepare_model_inputs(self, inputs: Optional[tf.Tensor] = None, bos_token_id: Optional[int] = None):
+        if inputs is None:
+            if not isinstance(bos_token_id, int) or bos_token_id < 0:
+                raise ValueError(
+                    "you should either supply a context to complete as `input_ids` input "
+                    "or a `bos_token_id` (integer >= 0) as a first token to start the generation."
+                )
+            return tf.fill((1, 1), tf.cast(bos_token_id, tf.int32))  # tf.fill has no `dtype` argument; cast the value instead
+
+        # if inputs are passed return those
+        return inputs
+
     @staticmethod
     def _update_model_kwargs_for_generation(
         outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False
@@ -2009,7 +1959,7 @@ def _get_logits_processor(
         eos_token_id: int,
     ) -> TFLogitsProcessorList:
         """
-        This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`]
+        This method returns a [`TFLogitsProcessorList`] list object that contains all relevant [`TFLogitsProcessor`]
         instances used to modify the scores of the language model head.
         """
         processors = TFLogitsProcessorList()
@@ -2056,13 +2006,8 @@ def greedy_search(
             logits_processor (`TFLogitsProcessorList`, *optional*):
                 An instance of [`TFLogitsProcessorList`]. List of instances of class derived from [`TFLogitsProcessor`]
                 used to modify the prediction scores of the language modeling head applied at each generation step.
-            stopping_criteria (`StoppingCriteriaList`, *optional*):
-                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
-                used to tell if the generation loop should stop.
-
             max_length (`int`, *optional*, defaults to 20):
-                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
-                tokens. The maximum length of the sequence to be generated.
+                The maximum length of the sequence to be generated.
             pad_token_id (`int`, *optional*):
                 The id of the *padding* token.
             eos_token_id (`int`, *optional*):
@@ -2077,42 +2022,40 @@ def greedy_search(
                 Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
             return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                 Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
-            synced_gpus (`bool`, *optional*, defaults to `False`):
-                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
             model_kwargs:
                 Additional model-specific keyword arguments will be forwarded to the `forward` function of the model.
                 If the model is an encoder-decoder model, the kwargs should include `encoder_outputs`.
 
         Return:
-            [`~generation_utils.GreedySearchDecoderOnlyOutput`], [`~generation_utils.GreedySearchEncoderDecoderOutput`]
-            or `tf.Tensor`: A `tf.Tensor` containing the generated tokens (default behaviour) or a
-            [`~generation_utils.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
-            `return_dict_in_generate=True` or a [`~generation_utils.GreedySearchEncoderDecoderOutput`] if
-            `model.config.is_encoder_decoder=True`.
+            [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`],
+            [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`] or `tf.Tensor`: A `tf.Tensor` containing the
+            generated tokens (default behaviour) or a [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`] if
+            `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a
+            [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`.
 
         Examples:
 
         ```python
         >>> from transformers import (
-        ...     AutoTokenizer,
-        ...     AutoModelForCausalLM,
-        ...     LogitsProcessorList,
-        ...     MinLengthLogitsProcessor,
+        ...     AutoTokenizer,
+        ...     TFAutoModelForCausalLM,
+        ...     TFLogitsProcessorList,
+        ...     TFMinLengthLogitsProcessor,
         ... )
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        >>> model = TFAutoModelForCausalLM.from_pretrained("gpt2")
 
         >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
        >>> model.config.pad_token_id = model.config.eos_token_id
 
         >>> input_prompt = "Today is a beautiful day, and"
-        >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
+        >>> input_ids = tokenizer(input_prompt, return_tensors="tf").input_ids
 
         >>> # instantiate logits processors
-        >>> logits_processor = LogitsProcessorList(
+        >>> logits_processor = TFLogitsProcessorList(
         ...     [
-        ...         MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
+        ...         TFMinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
         ...     ]
         ... )
@@ -2150,7 +2093,7 @@ def greedy_search(
         cur_len = input_ids.shape[-1]
 
         while cur_len < max_length:
-            # TODO (Patrick): remove following two lines by cleaning up `prepare_inputs_for_generation`
+            # TODO (Patrick): remove following line by cleaning up `prepare_inputs_for_generation`
             # in all models
             model_kwargs["use_cache"] = None if "use_cache" not in model_kwargs else model_kwargs["use_cache"]
 

From 73090dd215816de85accd7bbc8690c1dd08348ee Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Thu, 10 Feb 2022 17:42:23 +0100
Subject: [PATCH 11/30] delete bogus file

---
 @! | 2394 ------------------------------------------------------------
 1 file changed, 2394 deletions(-)
 delete mode 100644 @!

diff --git a/@! b/@!
deleted file mode 100644
index 8a21a47ce585..000000000000
--- a/@!
+++ /dev/null
@@ -1,2394 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import tensorflow as tf - -from .file_utils import ModelOutput -from .generation_tf_logits_process import ( - TFLogitsProcessorList, - TFMinLengthLogitsProcessor, - TFNoBadWordsLogitsProcessor, - TFNoRepeatNGramLogitsProcessor, - TFRepetitionPenaltyLogitsProcessor, -) -from .utils import logging - - -logger = logging.get_logger(__name__) - - -@dataclass -class TFGreedySearchDecoderOnlyOutput(ModelOutput): - """ - Base class for outputs of decoder-only generation models using greedy search. - - - Args: - sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor - of shape `(batch_size, config.vocab_size)`). - attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`. - """ - - sequences: tf.Tensor = None - scores: Optional[Tuple[tf.Tensor]] = None - attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None - hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None - - -@dataclass -class TFGreedySearchEncoderDecoderOutput(ModelOutput): - """ - Base class for outputs of encoder-decoder generation models using greedy search. Hidden states and attention - weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the - encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) - - - Args: - sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape - `(batch_size, config.vocab_size)`). - encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. 
- encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape - `(batch_size, sequence_length, hidden_size)`. - decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`. - """ - - sequences: tf.Tensor = None - scores: Optional[Tuple[tf.Tensor]] = None - encoder_attentions: Optional[Tuple[tf.Tensor]] = None - encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None - decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None - cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None - - -@dataclass -class TFSampleDecoderOnlyOutput(ModelOutput): - """ - Base class for outputs of decoder-only generation models using sampling. - - - Args: - sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor - of shape `(batch_size*num_return_sequences, config.vocab_size)`). - attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(num_return_sequences*batch_size, num_heads, generated_length, sequence_length)`. - hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(num_return_sequences*batch_size, generated_length, hidden_size)`. 
- """ - - sequences: tf.Tensor = None - scores: Optional[Tuple[tf.Tensor]] = None - attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None - hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None - - -@dataclass -class TFSampleEncoderDecoderOutput(ModelOutput): - """ - Base class for outputs of encoder-decoder generation models using sampling. Hidden states and attention weights of - the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states - attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) - - - Args: - sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape - `(batch_size*num_return_sequences, config.vocab_size)`). - encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size*num_return_sequences, - num_heads, sequence_length, sequence_length)`. - encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape - `(batch_size*num_return_sequences, sequence_length, hidden_size)`. - decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size*num_return_sequences, num_heads, generated_length, sequence_length)`. - cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size*num_return_sequences, generated_length, hidden_size)`. - """ - - sequences: tf.Tensor = None - scores: Optional[Tuple[tf.Tensor]] = None - encoder_attentions: Optional[Tuple[tf.Tensor]] = None - encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None - decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None - cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None - - -@dataclass -class TFBeamSearchDecoderOnlyOutput(ModelOutput): - """ - Base class for outputs of decoder-only generation models using beam search. 
- - Args: - sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Final beam scores of the generated `sequences`. - scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log - softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam - . `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor of shape - `(batch_size*num_beams*num_return_sequences, config.vocab_size)`). - attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. - hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`. - """ - - sequences: tf.Tensor = None - sequences_scores: Optional[tf.Tensor] = None - scores: Optional[Tuple[tf.Tensor]] = None - attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None - hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None - - -@dataclass -class TFBeamSearchEncoderDecoderOutput(ModelOutput): - """ - Base class for outputs of encoder-decoder generation models using beam search. Hidden states and attention weights - of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states - attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) - - Args: - sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Final beam scores of the generated `sequences`. - scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log - softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam - . `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape `(batch_size*num_beams, - config.vocab_size)`). 
- attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape - `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`. - decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length, - sequence_length)`. - cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`. - """ - - sequences: tf.Tensor = None - sequences_scores: Optional[tf.Tensor] = None - scores: Optional[Tuple[tf.Tensor]] = None - encoder_attentions: Optional[Tuple[tf.Tensor]] = None - encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None - decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None - cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None - - -@dataclass -class TFBeamSampleDecoderOnlyOutput(ModelOutput): - """ - Base class for outputs of decoder-only generation models using beam sample. - - Args: - sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - sequences_scores (`tf.Tensor` of shape `(batch_size * num_return_sequence)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Final beam scores of the generated `sequences`. - scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log - softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam - . `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor of shape - `(batch_size*num_beams*num_return_sequences, config.vocab_size)`). 
- attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. - hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`. - """ - - sequences: tf.Tensor = None - sequences_scores: Optional[tf.Tensor] = None - scores: Optional[Tuple[tf.Tensor]] = None - attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None - hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None - - -@dataclass -class TFBeamSampleEncoderDecoderOutput(ModelOutput): - """ - Base class for outputs of encoder-decoder generation models using beam sampling. Hidden states and attention - weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the - encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) - - Args: - sequences (`tf.Tensor` of shape `(batch_size*num_beams, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - sequences_scores (`tf.Tensor` of shape `(batch_size * num_return_sequence)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Final beam scores of the generated `sequences`. - scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log - softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam - . `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape `(batch_size*num_beams, - config.vocab_size)`). - encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape - `(batch_size*num_beams, sequence_length, hidden_size)`. - decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. - cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. 
- decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `tf.Tensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`. - """ - - sequences: tf.Tensor = None - sequences_scores: Optional[tf.Tensor] = None - scores: Optional[Tuple[tf.Tensor]] = None - encoder_attentions: Optional[Tuple[tf.Tensor]] = None - encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None - decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None - cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None - - -TFGreedySearchOutput = Union[TFGreedySearchEncoderDecoderOutput, TFGreedySearchDecoderOnlyOutput] -TFSampleOutput = Union[TFSampleEncoderDecoderOutput, TFSampleDecoderOnlyOutput] -TFBeamSearchOutput = Union[TFBeamSearchEncoderDecoderOutput, TFBeamSearchDecoderOnlyOutput] -TFBeamSampleOutput = Union[TFBeamSampleEncoderDecoderOutput, TFBeamSampleDecoderOnlyOutput] - - -class TFGenerationMixin: - """ - A class containing all of the functions supporting generation, to be used as a mixin in [`TFPreTrainedModel`]. - """ - - def prepare_inputs_for_generation(self, inputs, **kwargs): - """ - Implement in subclasses of [`TFPreTrainedModel`] for custom behavior to prepare inputs in the generate method. - """ - return {"input_ids": inputs} - - def _use_cache(self, outputs, use_cache): - """During generation, decide whether to pass the `past` variable to the next forward pass.""" - use_cache = getattr(self.config, "use_cache", False) - if len(outputs) <= 1 or use_cache is False: - return False - if hasattr(self.config, "mem_len") and self.config.mem_len == 0: - return False - return True - - def generate( - self, - input_ids=None, - max_length=None, - min_length=None, - do_sample=None, - early_stopping=None, - num_beams=None, - temperature=None, - top_k=None, - top_p=None, - repetition_penalty=None, - bad_words_ids=None, - bos_token_id=None, - pad_token_id=None, - eos_token_id=None, - length_penalty=None, - no_repeat_ngram_size=None, - num_return_sequences=None, - attention_mask=None, - decoder_start_token_id=None, - use_cache=None, - output_scores=None, - output_attentions=None, - output_hidden_states=None, - return_dict_in_generate=None, - forced_bos_token_id=None, - forced_eos_token_id=None, - **model_kwargs, - ) -> Union[TFGreedySearchOutput, TFSampleOutput, TFBeamSearchOutput, TFBeamSampleOutput, tf.Tensor]: - r""" - Generates sequences for models with a language modeling head. The method currently supports greedy decoding, - beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. - - Adapted in part from [Facebook's XLM beam search - code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529). - - Apart from `input_ids` and `attention_mask`, all the arguments below will default to the value of the attribute - of the same name inside the [`PretrainedConfig`] of the model. The default values indicated are the default - values of those config. - - Most of these parameters are explained in more detail in [this blog - post](https://huggingface.co/blog/how-to-generate). - - Parameters: - - input_ids (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*): - The sequence used as a prompt for the generation. 
If `None` the method initializes it with - `bos_token_id` and a batch size of 1. - max_length (`int`, *optional*, defaults to 20): - The maximum length of the sequence to be generated. - min_length (`int`, *optional*, defaults to 10): - The minimum length of the sequence to be generated. - do_sample (`bool`, *optional*, defaults to `False`): - Whether or not to use sampling ; use greedy decoding otherwise. - early_stopping (`bool`, *optional*, defaults to `False`): - Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not. - num_beams (`int`, *optional*, defaults to 1): - Number of beams for beam search. 1 means no beam search. - temperature (`float`, *optional*, defaults to 1.0): - The value used to module the next token probabilities. - top_k (`int`, *optional*, defaults to 50): - The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p (`float`, *optional*, defaults to 1.0): - If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or higher - are kept for generation. - repetition_penalty (`float`, *optional*, defaults to 1.0): - The parameter for repetition penalty. 1.0 means no penalty. See [this - paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - bos_token_id (`int`, *optional*): - The id of the *beginning-of-sequence* token. - eos_token_id (`int`, *optional*): - The id of the *end-of-sequence* token. - length_penalty (`float`, *optional*, defaults to 1.0): - Exponential penalty to the length. 1.0 means no penalty. - - Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in - order to encourage the model to produce longer sequences. - no_repeat_ngram_size (`int`, *optional*, defaults to 0): - If set to int > 0, all ngrams of that size can only occur once. - bad_words_ids(`List[int]`, *optional*): - List of token ids that are not allowed to be generated. In order to get the tokens of the words that - should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. - num_return_sequences(`int`, *optional*, defaults to 1): - The number of independently computed returned sequences for each element in the batch. - attention_mask (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, 1 for tokens - that are not masked, and 0 for masked tokens. - - If not provided, will default to a tensor the same shape as `input_ids` that masks the pad token. - - [What are attention masks?](../glossary#attention-mask) - decoder_start_token_id (`int`, *optional*): - If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token. - use_cache: (`bool`, *optional*, defaults to `True`): - Whether or not the model should use the past last key/values attentions (if applicable to the model) to - speed up decoding. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. 
- output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. - forced_bos_token_id (`int`, *optional*): - The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful - for multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be - the target language token. - forced_eos_token_id (`int`, *optional*): - The id of the token to force as the last generated token when `max_length` is reached. - model_specific_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. - - Return: - [`~file_utils.ModelOutput`] or `tf.Tensor`: A [`~file_utils.ModelOutput`] (if - `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `tf.Tensor`. - - If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible - [`~file_utils.ModelOutput`] types are: - - - [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`], - - [`~generation_tf_utils.TFSampleDecoderOnlyOutput`], - - [`~generation_tf_utils.TFBeamSearchDecoderOnlyOutput`], - - [`~generation_tf_utils.TFBeamSampleDecoderOnlyOutput`] - - If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible - [`~file_utils.ModelOutput`] types are: - - - [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`], - - [`~generation_tf_utils.TFSampleEncoderDecoderOutput`], - - [`~generation_tf_utils.TFBeamSearchEncoderDecoderOutput`], - - [`~generation_tf_utils.TFBeamSampleEncoderDecoderOutput`] - - Examples: - - ```python - tokenizer = AutoTokenizer.from_pretrained("distilgpt2") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "distilgpt2" - ) # Download model and configuration from huggingface.co and cache. - outputs = model.generate(max_length=40) # do greedy decoding - print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}") - - tokenizer = AutoTokenizer.from_pretrained("openai-gpt") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "openai-gpt" - ) # Download model and configuration from huggingface.co and cache. - input_context = "The dog" - input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context - outputs = model.generate( - input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5 - ) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' - for i in range(3): # 3 output sequences were generated - print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") - - tokenizer = AutoTokenizer.from_pretrained("distilgpt2") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "distilgpt2" - ) # Download model and configuration from huggingface.co and cache. 
- input_context = "The dog" - input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context - outputs = model.generate( - input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True - ) # generate 3 candidates using sampling - for i in range(3): # 3 output sequences were generated - print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") - - tokenizer = AutoTokenizer.from_pretrained("ctrl") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "ctrl" - ) # Download model and configuration from huggingface.co and cache. - input_context = "Legal My neighbor is" # "Legal" is one of the control codes for ctrl - input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context - outputs = model.generate( - input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2 - ) # generate sequences - print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}") - - tokenizer = AutoTokenizer.from_pretrained("gpt2") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "gpt2" - ) # Download model and configuration from huggingface.co and cache. - input_context = "My cute dog" - bad_words_ids = [ - tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ["idiot", "stupid", "shut up"] - ] - input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context - outputs = model.generate( - input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids - ) # generate sequences without allowing bad_words to be generated - ```""" - num_beams = num_beams if num_beams is not None else self.config.num_beams - do_sample = do_sample if do_sample is not None else self.config.do_sample - - is_greedy_gen_mode = num_beams == 1 and do_sample is False - - if is_greedy_gen_mode: - return self._generate( - input_ids=input_ids, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - early_stopping=early_stopping, - num_beams=num_beams, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - bad_words_ids=bad_words_ids, - bos_token_id=bos_token_id, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - length_penalty=length_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - num_return_sequences=num_return_sequences, - attention_mask=attention_mask, - decoder_start_token_id=decoder_start_token_id, - use_cache=use_cache, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - ) - - # We cannot generate if the model does not have a LM head - if self.get_output_embeddings() is None: - raise AttributeError( - "You tried to generate sequences with a model that does not have a LM Head. " - "Please use another model class (e.g. 
`TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, `TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, `TFTransfoXLLMHeadModel`)" - ) - - max_length = max_length if max_length is not None else self.config.max_length - min_length = min_length if min_length is not None else self.config.min_length - early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping - temperature = temperature if temperature is not None else self.config.temperature - top_k = top_k if top_k is not None else self.config.top_k - top_p = top_p if top_p is not None else self.config.top_p - - repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty - bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id - pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id - length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty - no_repeat_ngram_size = ( - no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size - ) - bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids - num_return_sequences = ( - num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences - ) - decoder_start_token_id = ( - decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id - ) - forced_bos_token_id = ( - forced_bos_token_id if forced_bos_token_id is not None else self.config.forced_bos_token_id - ) - forced_eos_token_id = ( - forced_eos_token_id if forced_eos_token_id is not None else self.config.forced_eos_token_id - ) - - output_scores = output_scores if output_scores is not None else self.config.output_scores - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate - ) - - model_kwargs["output_scores"] = output_scores - model_kwargs["output_attentions"] = output_attentions - model_kwargs["output_hidden_states"] = output_hidden_states - if self.config.is_encoder_decoder: - model_kwargs["encoder_attentions"] = None - model_kwargs["encoder_hidden_states"] = None - - if input_ids is not None: - batch_size = shape_list(input_ids)[0] # overridden by the input batch_size - else: - batch_size = 1 - - assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer." - assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." - assert isinstance(do_sample, bool), "`do_sample` should be a boolean." - assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." - assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer." - assert temperature > 0, "`temperature` should be strictly positive." - assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." - assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." - assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." 
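The deleted `generate` above resolves every argument against the model config and then validates it eagerly with `assert`s. A condensed, self-contained sketch of that resolve-then-validate pattern (the `resolve` helper and `Config` stand-in are illustrative names, not from the file):

```python
# Illustrative resolve-then-validate pattern from the deleted `generate`:
# every argument falls back to the model config when the caller passes None,
# then gets checked eagerly with an assert.
class Config:  # stand-in for the model's PretrainedConfig (illustrative)
    max_length = 20
    temperature = 1.0


def resolve(user_value, config_value):
    # caller-supplied value wins; otherwise use the config default
    return user_value if user_value is not None else config_value


config = Config()
max_length = resolve(None, config.max_length)   # caller passed nothing -> 20
temperature = resolve(0.7, config.temperature)  # caller override wins -> 0.7

assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
assert temperature > 0, "`temperature` should be strictly positive."
```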
- assert input_ids is not None or ( - isinstance(bos_token_id, int) and bos_token_id >= 0 - ), "If input_ids is not defined, `bos_token_id` should be a positive integer." - assert pad_token_id is None or ( - isinstance(pad_token_id, int) and (pad_token_id >= 0) - ), "`pad_token_id` should be a positive integer." - assert (eos_token_id is None) or ( - isinstance(eos_token_id, int) and (eos_token_id >= 0) - ), "`eos_token_id` should be a positive integer." - assert length_penalty > 0, "`length_penalty` should be strictly positive." - assert ( - isinstance(num_return_sequences, int) and num_return_sequences > 0 - ), "`num_return_sequences` should be a strictly positive integer." - assert ( - bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) - ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" - - # This block corresponds to the following line in `generation_tf_utils`: - # "input_ids = self._prepare_input_ids_for_generation(bos_token_id, model_kwargs.get("encoder_outputs"))" - # with the following differences: - # 1. In PT, `generate()`'s `model_kwargs` can accept `encoder_outputs`, but not the case in TF. - # 2. There is no shape checking in PT. - # In both PT/TF, if `input_ids` is `None`, we try to create it as it is for a text model. - if input_ids is None: - assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( - "you should either supply a context to complete as `input_ids` input " - "or a `bos_token_id` (integer >= 0) as a first token to start the generation." - ) - input_ids = tf.fill((batch_size, 1), bos_token_id) - - # not allow to duplicate outputs when greedy decoding - if do_sample is False: - if num_beams == 1: - # no_beam_search greedy generation conditions - assert ( - num_return_sequences == 1 - ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" - - else: - # beam_search greedy generation conditions - assert ( - num_beams >= num_return_sequences - ), "Greedy beam search decoding cannot return more sequences than it has beams. 
Please set num_beams >= num_return_sequences" - - # create attention mask if necessary - # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 - if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): - attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) - elif attention_mask is None: - attention_mask = tf.ones_like(input_ids) - - if pad_token_id is None and eos_token_id is not None: - logger.warning(f"Setting `pad_token_id` to {eos_token_id} (first `eos_token_id`) to generate sequence") - pad_token_id = eos_token_id - - # current position and vocab size - cur_len = shape_list(input_ids)[1] # unused - vocab_size = getattr(self.config, "vocab_size", None) - if vocab_size is None and self.config.is_encoder_decoder: - decoder_config = getattr(self.config, "decoder", None) - if decoder_config is not None: - vocab_size = getattr(self.config.decoder, "vocab_size", None) - - # set effective batch size and effective batch multiplier according to do_sample - if do_sample: - effective_batch_size = batch_size * num_return_sequences - effective_batch_mult = num_return_sequences - else: - effective_batch_size = batch_size - effective_batch_mult = 1 - - if self.config.is_encoder_decoder: - if decoder_start_token_id is None: - decoder_start_token_id = bos_token_id - - assert ( - decoder_start_token_id is not None - ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" - assert hasattr(self, "get_encoder"), f"{self} should have a 'get_encoder' function defined" - assert callable(self.get_encoder), f"{self.get_encoder} should be a method" - - # get encoder and store encoder outputs - encoder = self.get_encoder() - - encoder_kwargs = { - "attention_mask": attention_mask, - "output_attentions": output_attentions, - "output_hidden_states": output_hidden_states, - "return_dict": return_dict_in_generate, - } - - # vision models don't use `attention_mask`. - signature = dict(inspect.signature(encoder.call).parameters) - if "attention_mask" not in signature: - encoder_kwargs.pop("attention_mask") - - encoder_outputs = encoder(input_ids, **encoder_kwargs) - if return_dict_in_generate: - if output_attentions: - model_kwargs["encoder_attentions"] = encoder_outputs.attentions - if output_hidden_states: - model_kwargs["encoder_hidden_states"] = encoder_outputs.hidden_states - - # The condition `len(shape_list(input_ids)) == 2` is to make this block treats only text inputs. 
- # (vision inputs might occur when the model is an encoder-decoder model) - # Expand input ids if num_beams > 1 or num_return_sequences > 1 - if len(shape_list(input_ids)) == 2 and (num_return_sequences > 1 or num_beams > 1): - input_ids_len = shape_list(input_ids)[-1] - input_ids = tf.broadcast_to( - tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) - ) - attention_mask = tf.broadcast_to( - tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) - ) - input_ids = tf.reshape( - input_ids, (effective_batch_size * num_beams, input_ids_len) - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) - attention_mask = tf.reshape( - attention_mask, (effective_batch_size * num_beams, input_ids_len) - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) - - if self.config.is_encoder_decoder: - - # create empty decoder_input_ids - input_ids = ( - tf.ones( - (effective_batch_size * num_beams, 1), - dtype=tf.int32, - ) - * decoder_start_token_id - ) - cur_len = 1 - - assert ( - batch_size == encoder_outputs[0].shape[0] - ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " - - # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) - expanded_batch_idxs = tf.reshape( - tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1), - shape=(-1,), - ) - # expand encoder_outputs - encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0),) - else: - encoder_outputs = None - cur_len = shape_list(input_ids)[-1] - - assert ( - cur_len < max_length - ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. 
Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" - - if num_beams == 1: - return self._generate_no_beam_search( - input_ids, - cur_len=cur_len, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - batch_size=effective_batch_size, - vocab_size=vocab_size, - encoder_outputs=encoder_outputs, - attention_mask=attention_mask, - use_cache=use_cache, - return_dict_in_generate=return_dict_in_generate, - **model_kwargs, - ) - else: - return self._generate_beam_search( - input_ids, - cur_len=cur_len, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - early_stopping=early_stopping, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - batch_size=effective_batch_size, - num_return_sequences=num_return_sequences, - length_penalty=length_penalty, - num_beams=num_beams, - vocab_size=vocab_size, - encoder_outputs=encoder_outputs, - attention_mask=attention_mask, - use_cache=use_cache, - forced_bos_token_id=forced_bos_token_id, - forced_eos_token_id=forced_eos_token_id, - return_dict_in_generate=return_dict_in_generate, - **model_kwargs, - ) - - def _generate_no_beam_search( - self, - input_ids, - cur_len, - max_length, - min_length, - do_sample, - temperature, - top_k, - top_p, - repetition_penalty, - no_repeat_ngram_size, - bad_words_ids, - pad_token_id, - eos_token_id, - batch_size, - vocab_size, - encoder_outputs, - attention_mask, - use_cache, - return_dict_in_generate, - **kwargs - ) -> Union[TFGreedySearchOutput, TFSampleOutput, tf.Tensor]: - """ - Generate sequences for each example without beam search (num_beams == 1). All returned sequences are generated - independently. 
- """ - - # length of generated sentences / unfinished sentences - unfinished_sents = tf.ones_like(input_ids[:, 0]) - sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length - - past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and kwargs["output_scores"]) else None - decoder_attentions = () if (return_dict_in_generate and kwargs["output_attentions"]) else None - cross_attentions = () if (return_dict_in_generate and kwargs["output_attentions"]) else None - decoder_hidden_states = () if (return_dict_in_generate and kwargs["output_hidden_states"]) else None - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if self.config.is_encoder_decoder: - encoder_attentions = ( - kwargs["encoder_attentions"] if (return_dict_in_generate and kwargs["encoder_attentions"]) else None - ) - encoder_hidden_states = ( - kwargs["encoder_hidden_states"] - if (return_dict_in_generate and kwargs["encoder_hidden_states"]) - else None - ) - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation( - input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **kwargs - ) - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=kwargs["output_attentions"], - output_hidden_states=kwargs["output_hidden_states"], - ) - next_token_logits = outputs.logits[:, -1, :] # (batch_size * num_beams, vocab_size) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if kwargs["output_scores"]: - scores += (next_token_logits,) - if kwargs["output_attentions"]: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if kwargs["output_hidden_states"]: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # if model has past, then set the past variable to speed up decoding - if self._use_cache(outputs, use_cache): - past = outputs[1] - - # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) - if repetition_penalty != 1.0: - next_token_logits_penalties = _create_next_token_logits_penalties( - input_ids, next_token_logits, repetition_penalty - ) - next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) - - if no_repeat_ngram_size > 0: - # calculate a list of banned tokens to prevent repetitively generating the same ngrams - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) - # create banned_tokens boolean mask - banned_tokens_indices_mask = [] - for banned_tokens_slice in banned_tokens: - banned_tokens_indices_mask.append( - [True if token in banned_tokens_slice else False for token in range(vocab_size)] - ) - - next_token_logits = set_tensor_by_indices_to_value( - next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") - ) - - if bad_words_ids is not None: - # calculate a list of banned tokens according to bad words - banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) - - banned_tokens_indices_mask = [] - for banned_tokens_slice in banned_tokens: - 
banned_tokens_indices_mask.append(
-                        [True if token in banned_tokens_slice else False for token in range(vocab_size)]
-                    )
-
-                next_token_logits = set_tensor_by_indices_to_value(
-                    next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf")
-                )
-
-            # set eos token prob to zero if min_length is not reached
-            if eos_token_id is not None and cur_len < min_length:
-                # create eos_token_id boolean mask
-                is_token_logit_eos_token = tf.convert_to_tensor(
-                    [True if token == eos_token_id else False for token in range(vocab_size)], dtype=tf.bool
-                )
-                eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [batch_size, vocab_size])
-
-                next_token_logits = set_tensor_by_indices_to_value(
-                    next_token_logits, eos_token_indices_mask, -float("inf")
-                )
-
-            if do_sample:
-                # Temperature (higher temperature => more likely to sample low probability tokens)
-                if temperature != 1.0:
-                    next_token_logits = next_token_logits / temperature
-                # Top-p/top-k filtering
-                next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
-                # Sample
-                next_token = tf.squeeze(
-                    tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=1), axis=1
-                )
-            else:
-                # Greedy decoding
-                next_token = tf.math.argmax(next_token_logits, axis=-1, output_type=tf.int32)
-
-            # update generations and finished sentences
-            if eos_token_id is not None:
-                # pad finished sentences if eos_token_id exist
-                tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents)
-            else:
-                tokens_to_add = next_token
-
-            # add token and increase length by one
-            input_ids = tf.concat([input_ids, tf.expand_dims(tokens_to_add, -1)], 1)
-            cur_len = cur_len + 1
-
-            if eos_token_id is not None:
-                eos_in_sents = tokens_to_add == eos_token_id
-                # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length
-                is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply(
-                    unfinished_sents, tf.cast(eos_in_sents, tf.int32)
-                )
-                sent_lengths = (
-                    sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos)
-                    + cur_len * is_sents_unfinished_and_token_to_add_is_eos
-                )
-
-                # unfinished_sents is set to zero if eos in sentence
-                unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos
-
-            # stop when there is a </s> in each sentence, or if we exceed the maximum length
-            if tf.math.reduce_max(unfinished_sents) == 0:
-                break
-
-            # extend attention_mask for new generated input if only decoder
-            if self.config.is_encoder_decoder is False:
-                attention_mask = tf.concat(
-                    [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1
-                )
-
-        # if there are different sentences lengths in the batch, some batches have to be padded
-        min_sent_length = tf.math.reduce_min(sent_lengths)
-        max_sent_length = tf.math.reduce_max(sent_lengths)
-        if min_sent_length != max_sent_length:
-            assert pad_token_id is not None, "`pad_token_id` has to be defined if batches have different lengths"
-            # finished sents are filled with pad_token
-            padding = tf.ones([batch_size, max_sent_length.numpy()], dtype=tf.int32) * pad_token_id
-
-            # create length masks for tf.where operation
-            broad_casted_sent_lengths = tf.broadcast_to(
-                tf.expand_dims(sent_lengths, -1), [batch_size, max_sent_length]
-            )
-            broad_casted_range = tf.transpose(
-                tf.broadcast_to(tf.expand_dims(tf.range(max_sent_length), -1), [max_sent_length, batch_size])
-            )
-
-            decoded = tf.where(broad_casted_range < broad_casted_sent_lengths, input_ids, padding)
-        else:
- decoded = input_ids - - if return_dict_in_generate: - if do_sample: - if self.config.is_encoder_decoder: - return TFSampleEncoderDecoderOutput( - sequences=decoded, - scores=scores, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return TFSampleDecoderOnlyOutput( - sequences=decoded, - scores=scores, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - ) - else: - if self.config.is_encoder_decoder: - return TFGreedySearchEncoderDecoderOutput( - sequences=decoded, - scores=scores, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return TFGreedySearchDecoderOnlyOutput( - sequences=decoded, - scores=scores, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - ) - else: - return decoded - - def _generate_beam_search( - self, - input_ids, - cur_len, - max_length, - min_length, - do_sample, - early_stopping, - temperature, - top_k, - top_p, - repetition_penalty, - no_repeat_ngram_size, - bad_words_ids, - pad_token_id, - eos_token_id, - batch_size, - num_return_sequences, - length_penalty, - num_beams, - vocab_size, - encoder_outputs, - attention_mask, - use_cache, - forced_bos_token_id, - forced_eos_token_id, - return_dict_in_generate, - **kwargs, - ) -> Union[TFBeamSearchOutput, TFBeamSampleOutput, tf.Tensor]: - """Generate sequences for each example with beam search.""" - - # generated hypotheses - generated_hyps = [ - BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) - for _ in range(batch_size) - ] - - # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times - if do_sample is False: - beam_scores_begin = tf.zeros((batch_size, 1), dtype=tf.float32) - beam_scores_end = tf.ones((batch_size, num_beams - 1), dtype=tf.float32) * (-1e9) - beam_scores = tf.concat([beam_scores_begin, beam_scores_end], -1) - else: - beam_scores = tf.zeros((batch_size, num_beams), dtype=tf.float32) - - beam_scores = tf.reshape(beam_scores, (batch_size * num_beams,)) - - # cache compute states - past = encoder_outputs - # to stay similar to torch : past = (encoder_outputs, None) if encoder_outputs is not None else None - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and kwargs["output_scores"]) else None - decoder_attentions = () if (return_dict_in_generate and kwargs["output_attentions"]) else None - cross_attentions = () if (return_dict_in_generate and kwargs["output_attentions"]) else None - decoder_hidden_states = () if (return_dict_in_generate and kwargs["output_hidden_states"]) else None - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if self.config.is_encoder_decoder: - encoder_attentions = ( - kwargs["encoder_attentions"] if (return_dict_in_generate and kwargs["encoder_attentions"]) else None - ) - encoder_hidden_states = ( - kwargs["encoder_hidden_states"] - if (return_dict_in_generate and kwargs["encoder_hidden_states"]) - else None - ) - - # done sentences - done = [False for _ in range(batch_size)] - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation( - input_ids, past=past, 
attention_mask=attention_mask, use_cache=use_cache, **kwargs - ) - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=kwargs["output_attentions"], - output_hidden_states=kwargs["output_hidden_states"], - ) - next_token_logits = outputs.logits[:, -1, :] # (batch_size * num_beams, vocab_size) - - # if model has past, then set the past variable to speed up decoding - if self._use_cache(outputs, use_cache): - past = outputs[1] - - # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) - if repetition_penalty != 1.0: - next_token_logits_penalties = _create_next_token_logits_penalties( - input_ids, next_token_logits, repetition_penalty - ) - next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) - - # Temperature (higher temperature => more likely to sample low probability tokens) - if temperature != 1.0: - next_token_logits = next_token_logits / temperature - - if self.config.is_encoder_decoder and do_sample is False: - next_token_logits = self.adjust_logits_during_generation( - next_token_logits, - cur_len=cur_len, - max_length=max_length, - forced_bos_token_id=forced_bos_token_id, - forced_eos_token_id=forced_eos_token_id, - ) - # calculate log softmax score - scores = tf.nn.log_softmax(next_token_logits, axis=-1) # (batch_size * num_beams, vocab_size) - - # set eos token prob to zero if min_length is not reached - if eos_token_id is not None and cur_len < min_length: - # create eos_token_id boolean mask - num_batch_hypotheses = batch_size * num_beams - - is_token_logit_eos_token = tf.convert_to_tensor( - [True if token == eos_token_id else False for token in range(vocab_size)], dtype=tf.bool - ) - eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [num_batch_hypotheses, vocab_size]) - - scores = set_tensor_by_indices_to_value(scores, eos_token_indices_mask, -float("inf")) - - if no_repeat_ngram_size > 0: - # calculate a list of banned tokens to prevent repetitively generating the same ngrams - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - num_batch_hypotheses = batch_size * num_beams - banned_tokens = calc_banned_ngram_tokens( - input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len - ) - # create banned_tokens boolean mask - banned_tokens_indices_mask = [] - for banned_tokens_slice in banned_tokens: - banned_tokens_indices_mask.append( - [True if token in banned_tokens_slice else False for token in range(vocab_size)] - ) - - scores = set_tensor_by_indices_to_value( - scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") - ) - - if bad_words_ids is not None: - # calculate a list of banned tokens according to bad words - banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) - - banned_tokens_indices_mask = [] - for banned_tokens_slice in banned_tokens: - banned_tokens_indices_mask.append( - [True if token in banned_tokens_slice else False for token in range(vocab_size)] - ) - - scores = set_tensor_by_indices_to_value( - scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") - ) - - assert shape_list(scores) == [batch_size * num_beams, vocab_size] - - if do_sample: - _scores = scores + tf.broadcast_to( - beam_scores[:, None], (batch_size * num_beams, vocab_size) - ) # (batch_size * num_beams, vocab_size) - - # Top-p/top-k filtering - _scores = tf_top_k_top_p_filtering( - _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 - ) # 
(batch_size * num_beams, vocab_size) - # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) - _scores = tf.reshape(_scores, (batch_size, num_beams * vocab_size)) - - next_tokens = sample_without_replacement( - _scores, num_samples=2 * num_beams - ) # (batch_size, 2 * num_beams) - # Compute next scores - next_scores = tf.gather(_scores, next_tokens, batch_dims=1) # (batch_size, 2 * num_beams) - - # sort the sampled vector to make sure that the first num_beams samples are the best - next_scores_indices = tf.argsort(next_scores, direction="DESCENDING", axis=1) - next_scores = tf.gather(next_scores, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) - next_tokens = tf.gather(next_tokens, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) - else: - # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) - next_scores = scores + tf.broadcast_to( - beam_scores[:, None], (batch_size * num_beams, vocab_size) - ) # (batch_size * num_beams, vocab_size) - - # re-organize to group the beam together (we are keeping top hypothesis across beams) - next_scores = tf.reshape( - next_scores, (batch_size, num_beams * vocab_size) - ) # (batch_size, num_beams * vocab_size) - - next_scores, next_tokens = tf.math.top_k(next_scores, k=2 * num_beams, sorted=True) - - assert shape_list(next_scores) == shape_list(next_tokens) == [batch_size, 2 * num_beams] - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if kwargs["output_scores"]: - scores += (next_token_logits,) - if kwargs["output_attentions"]: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if kwargs["output_hidden_states"]: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # next batch beam content - next_batch_beam = [] - - # for each sentence - for batch_idx in range(batch_size): - - # if we are done with this sentence - if done[batch_idx]: - assert ( - len(generated_hyps[batch_idx]) >= num_beams - ), f"Batch can only be done if at least {num_beams} beams have been generated." 
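-                # a finished sentence still contributes `num_beams` placeholder
-                # (score, token, beam_idx) entries below, so that the flattened
-                # `next_batch_beam` list keeps its fixed batch_size * num_beams layout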
- assert ( - eos_token_id is not None and pad_token_id is not None - ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" - next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch - continue - - # next sentence beam content - next_sent_beam = [] - - # next tokens for this sentence - for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( - zip(next_tokens[batch_idx], next_scores[batch_idx]) - ): - # get beam and token IDs - beam_id = beam_token_id // vocab_size - token_id = beam_token_id % vocab_size - - effective_beam_id = batch_idx * num_beams + beam_id - # add to generated hypotheses if end of sentence or last iteration - if (eos_token_id is not None) and (token_id.numpy() == eos_token_id): - # if beam_token does not belong to top num_beams tokens, it should not be added - is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams - if is_beam_token_worse_than_top_num_beams: - continue - generated_hyps[batch_idx].add( - tf.identity(input_ids[effective_beam_id]), beam_token_score.numpy() - ) - else: - # add next predicted token if it is not eos_token - next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) - - # the beam for next step is full - if len(next_sent_beam) == num_beams: - break - - # Check if we are done so that we can save a pad step if all(done) - done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( - tf.reduce_max(next_scores[batch_idx]).numpy(), cur_len - ) - - # update next beam content - assert len(next_sent_beam) == num_beams, "Beam should always be full" - next_batch_beam.extend(next_sent_beam) - assert len(next_batch_beam) == num_beams * (batch_idx + 1) - - # stop when we are done with each sentence - if all(done): - break - - # sanity check / prepare next batch - assert len(next_batch_beam) == batch_size * num_beams - beam_scores = tf.convert_to_tensor([x[0] for x in next_batch_beam], dtype=tf.float32) - beam_tokens = tf.convert_to_tensor([x[1] for x in next_batch_beam], dtype=tf.int32) - beam_idx = tf.convert_to_tensor([x[2] for x in next_batch_beam], dtype=tf.int32) - - # re-order batch and update current length - input_ids = tf.stack([tf.identity(input_ids[x, :]) for x in beam_idx]) - input_ids = tf.concat([input_ids, tf.expand_dims(beam_tokens, 1)], axis=-1) - cur_len = cur_len + 1 - - # re-order internal states - if past is not None: - past = self._reorder_cache(past, beam_idx) - - # extend attention_mask for new generated input if only decoder - if self.config.is_encoder_decoder is False: - attention_mask = tf.concat( - [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 - ) - - # finalize all open beam hypotheses and end to generated hypotheses - for batch_idx in range(batch_size): - # Add all open beam hypothesis to generated_hyps - if done[batch_idx]: - continue - # test that beam scores match previously calculated scores if not eos and batch_idx not done - if eos_token_id is not None and all( - (token_id % vocab_size).numpy().item() != eos_token_id for token_id in next_tokens[batch_idx] - ): - if not tf.reduce_all( - next_scores[batch_idx, :num_beams] == tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx] - ): - raise ValueError( - f"If batch_idx is not done, final next scores: {next_scores[:, :num_beams][batch_idx]} have " - "to equal to accumulated beam_scores: " - f"{tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx]}" - ) - # need to add best num_beams hypotheses to generated hyps - for beam_id 
in range(num_beams):
-                    effective_beam_id = batch_idx * num_beams + beam_id
-                    final_score = beam_scores[effective_beam_id].numpy().item()
-                    final_tokens = input_ids[effective_beam_id]
-                    generated_hyps[batch_idx].add(final_tokens, final_score)
-
-        # depending on whether greedy generation is wanted or not, define different output_batch_size and output_num_return_sequences_per_batch
-        output_batch_size = batch_size if do_sample else batch_size * num_return_sequences
-        output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences
-
-        # select the best hypotheses
-        sent_lengths_list = []
-        best = []
-
-        # retrieve best hypotheses
-        for i, hypotheses in enumerate(generated_hyps):
-            sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0])
-            for j in range(output_num_return_sequences_per_batch):
-                best_hyp = sorted_hyps.pop()[1]
-                sent_lengths_list.append(len(best_hyp))
-                best.append(best_hyp)
-        assert output_batch_size == len(
-            best
-        ), f"Output batch size {output_batch_size} must match output beam hypotheses {len(best)}"
-
-        sent_lengths = tf.convert_to_tensor(sent_lengths_list, dtype=tf.int32)
-
-        # shorter batches are filled with pad_token
-        if tf.reduce_min(sent_lengths).numpy() != tf.reduce_max(sent_lengths).numpy():
-            assert pad_token_id is not None, "`pad_token_id` has to be defined"
-            sent_max_len = min(tf.reduce_max(sent_lengths).numpy() + 1, max_length)
-            decoded_list = []
-
-            # fill with hypothesis and eos_token_id if necessary
-            for i, hypo in enumerate(best):
-                assert sent_lengths[i] == shape_list(hypo)[0]
-                # if sent_length is max_len do not pad
-                if sent_lengths[i] == sent_max_len:
-                    decoded_slice = hypo
-                else:
-                    # else pad to sent_max_len
-                    num_pad_tokens = sent_max_len - sent_lengths[i]
-                    padding = pad_token_id * tf.ones((num_pad_tokens,), dtype=tf.int32)
-                    decoded_slice = tf.concat([hypo, padding], axis=-1)
-
-                # finish sentence with EOS token
-                if sent_lengths[i] < max_length:
-                    decoded_slice = tf.where(
-                        tf.range(sent_max_len, dtype=tf.int32) == sent_lengths[i],
-                        eos_token_id * tf.ones((sent_max_len,), dtype=tf.int32),
-                        decoded_slice,
-                    )
-                # add to list
-                decoded_list.append(decoded_slice)
-
-            decoded = tf.stack(decoded_list)
-        else:
-            # none of the hypotheses have an eos_token
-            assert all(len(hypo) == max_length for hypo in best)
-            decoded = tf.stack(best)
-
-        if return_dict_in_generate:
-            if do_sample and self.config.is_encoder_decoder:
-                return TFBeamSampleEncoderDecoderOutput(
-                    sequences=decoded,
-                    scores=scores,
-                    encoder_attentions=encoder_attentions,
-                    encoder_hidden_states=encoder_hidden_states,
-                    decoder_attentions=decoder_attentions,
-                    cross_attentions=cross_attentions,
-                    decoder_hidden_states=decoder_hidden_states,
-                )
-            elif do_sample and not self.config.is_encoder_decoder:
-                return TFBeamSampleDecoderOnlyOutput(
-                    sequences=decoded,
-                    scores=scores,
-                    attentions=decoder_attentions,
-                    hidden_states=decoder_hidden_states,
-                )
-            elif self.config.is_encoder_decoder:
-                return TFBeamSearchEncoderDecoderOutput(
-                    sequences=decoded,
-                    scores=scores,
-                    encoder_attentions=encoder_attentions,
-                    encoder_hidden_states=encoder_hidden_states,
-                    decoder_attentions=decoder_attentions,
-                    cross_attentions=cross_attentions,
-                    decoder_hidden_states=decoder_hidden_states,
-                )
-            else:
-                return TFBeamSearchDecoderOnlyOutput(
-                    sequences=decoded,
-                    scores=scores,
-                    attentions=decoder_attentions,
-                    hidden_states=decoder_hidden_states,
-                )
-        else:
-            return decoded
-
-    @staticmethod
-    def _reorder_cache(past, beam_idx):
-        return tuple(tf.gather(layer_past, beam_idx, 
axis=1) for layer_past in past)
-
-    def adjust_logits_during_generation(
-        self, logits, cur_len, max_length, forced_bos_token_id, forced_eos_token_id, **kwargs
-    ):
-        """
-        Implement in subclasses of [`TFPreTrainedModel`] for custom behavior to adjust the logits in the generate method.
-        """
-        vocab_size = getattr(self.config, "vocab_size", None)
-        if vocab_size is None and self.config.is_encoder_decoder:
-            decoder_config = getattr(self.config, "decoder", None)
-            if decoder_config is not None:
-                vocab_size = getattr(self.config.decoder, "vocab_size", None)
-
-        if cur_len == 1 and forced_bos_token_id is not None:
-            vocab_range = tf.constant(range(vocab_size))
-            return tf.where(vocab_range != forced_bos_token_id, -1e8, logits)
-        elif cur_len == max_length - 1 and forced_eos_token_id is not None:
-            vocab_range = tf.constant(range(vocab_size))
-            return tf.where(vocab_range != forced_eos_token_id, -1e8, logits)
-        else:
-            return logits
-
-    def _generate(
-        self,
-        input_ids=None,
-        max_length=None,
-        min_length=None,
-        do_sample=None,
-        early_stopping=None,
-        num_beams=None,
-        temperature=None,
-        top_k=None,
-        top_p=None,
-        repetition_penalty=None,
-        bad_words_ids=None,
-        bos_token_id=None,
-        pad_token_id=None,
-        eos_token_id=None,
-        length_penalty=None,
-        no_repeat_ngram_size=None,
-        num_return_sequences=None,
-        attention_mask=None,
-        decoder_start_token_id=None,
-        use_cache=None,
-        output_scores=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict_in_generate=None,
-        forced_bos_token_id=None,
-        forced_eos_token_id=None,
-        **model_kwargs,
-    ) -> Union[TFGreedySearchOutput, TFSampleOutput, TFBeamSearchOutput, TFBeamSampleOutput, tf.Tensor]:
-        r"""
-        Generates sequences for models with a language modeling head. The method currently supports greedy decoding,
-        beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling.
-
-        Adapted in part from [Facebook's XLM beam search
-        code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529).
-
-        Apart from `input_ids` and `attention_mask`, all the arguments below will default to the value of the attribute
-        of the same name inside the [`PretrainedConfig`] of the model. The default values indicated are the defaults
-        of those config attributes.
-
-        Most of these parameters are explained in more detail in [this blog
-        post](https://huggingface.co/blog/how-to-generate).
-
-        Parameters:
-
-            input_ids (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*):
-                The sequence used as a prompt for the generation. If `None` the method initializes it with
-                `bos_token_id` and a batch size of 1.
-            max_length (`int`, *optional*, defaults to 20):
-                The maximum length of the sequence to be generated.
-            min_length (`int`, *optional*, defaults to 10):
-                The minimum length of the sequence to be generated.
-            do_sample (`bool`, *optional*, defaults to `False`):
-                Whether or not to use sampling; use greedy decoding otherwise.
-            early_stopping (`bool`, *optional*, defaults to `False`):
-                Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
-            num_beams (`int`, *optional*, defaults to 1):
-                Number of beams for beam search. 1 means no beam search.
-            temperature (`float`, *optional*, defaults to 1.0):
-                The value used to modulate the next token probabilities.
-            top_k (`int`, *optional*, defaults to 50):
-                The number of highest probability vocabulary tokens to keep for top-k-filtering. 
-            top_p (`float`, *optional*, defaults to 1.0):
-                If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or higher
-                are kept for generation.
-            repetition_penalty (`float`, *optional*, defaults to 1.0):
-                The parameter for repetition penalty. 1.0 means no penalty. See [this
-                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
-            pad_token_id (`int`, *optional*):
-                The id of the *padding* token.
-            bos_token_id (`int`, *optional*):
-                The id of the *beginning-of-sequence* token.
-            eos_token_id (`int`, *optional*):
-                The id of the *end-of-sequence* token.
-            length_penalty (`float`, *optional*, defaults to 1.0):
-                Exponential penalty to the length. 1.0 means no penalty.
-
-                Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
-                order to encourage the model to produce longer sequences.
-            no_repeat_ngram_size (`int`, *optional*, defaults to 0):
-                If set to int > 0, all ngrams of that size can only occur once.
-            bad_words_ids (`List[List[int]]`, *optional*):
-                List of token ids that are not allowed to be generated. In order to get the tokens of the words that
-                should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`.
-            num_return_sequences (`int`, *optional*, defaults to 1):
-                The number of independently computed returned sequences for each element in the batch.
-            attention_mask (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, 1 for tokens
-                that are not masked, and 0 for masked tokens.
-
-                If not provided, will default to a tensor the same shape as `input_ids` that masks the pad token.
-
-                [What are attention masks?](../glossary#attention-mask)
-            decoder_start_token_id (`int`, *optional*):
-                If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
-            use_cache (`bool`, *optional*, defaults to `True`):
-                Whether or not the model should use the past key/values attentions (if applicable to the model) to
-                speed up decoding.
-            output_attentions (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more details.
-            output_scores (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
-                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
-            forced_bos_token_id (`int`, *optional*):
-                The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful
-                for multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be
-                the target language token.
-            forced_eos_token_id (`int`, *optional*):
-                The id of the token to force as the last generated token when `max_length` is reached.
-            model_kwargs:
-                Additional model specific kwargs will be forwarded to the `call` function of the model. 
- - Return: - [`~file_utils.ModelOutput`] or `tf.Tensor`: A [`~file_utils.ModelOutput`] (if - `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `tf.Tensor`. - - If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible - [`~file_utils.ModelOutput`] types are: - - - [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`], - - [`~generation_tf_utils.TFSampleDecoderOnlyOutput`], - - [`~generation_tf_utils.TFBeamSearchDecoderOnlyOutput`], - - [`~generation_tf_utils.TFBeamSampleDecoderOnlyOutput`] - - If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible - [`~file_utils.ModelOutput`] types are: - - - [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`], - - [`~generation_tf_utils.TFSampleEncoderDecoderOutput`], - - [`~generation_tf_utils.TFBeamSearchEncoderDecoderOutput`], - - [`~generation_tf_utils.TFBeamSampleEncoderDecoderOutput`] - - Examples: - - ```python - tokenizer = AutoTokenizer.from_pretrained("distilgpt2") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "distilgpt2" - ) # Download model and configuration from huggingface.co and cache. - outputs = model.generate(max_length=40) # do greedy decoding - print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}") - - tokenizer = AutoTokenizer.from_pretrained("openai-gpt") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "openai-gpt" - ) # Download model and configuration from huggingface.co and cache. - input_context = "The dog" - input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context - outputs = model.generate( - input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5 - ) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' - for i in range(3): # 3 output sequences were generated - print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") - - tokenizer = AutoTokenizer.from_pretrained("distilgpt2") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "distilgpt2" - ) # Download model and configuration from huggingface.co and cache. - input_context = "The dog" - input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context - outputs = model.generate( - input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True - ) # generate 3 candidates using sampling - for i in range(3): # 3 output sequences were generated - print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") - - tokenizer = AutoTokenizer.from_pretrained("ctrl") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "ctrl" - ) # Download model and configuration from huggingface.co and cache. - input_context = "Legal My neighbor is" # "Legal" is one of the control codes for ctrl - input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context - outputs = model.generate( - input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2 - ) # generate sequences - print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}") - - tokenizer = AutoTokenizer.from_pretrained("gpt2") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "gpt2" - ) # Download model and configuration from huggingface.co and cache. 
-            input_context = "My cute dog"
-            bad_words_ids = [
-                tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ["idiot", "stupid", "shut up"]
-            ]
-            input_ids = tokenizer.encode(input_context, return_tensors="tf")  # encode input context
-            outputs = model.generate(
-                input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids
-            )  # generate sequences without allowing bad_words to be generated
-        ```"""
-        # 1. Set generation parameters if not already defined
-        num_beams = num_beams if num_beams is not None else self.config.num_beams
-        do_sample = do_sample if do_sample is not None else self.config.do_sample
-        max_length = max_length if max_length is not None else self.config.max_length
-        min_length = min_length if min_length is not None else self.config.min_length
-        early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping
-
-        bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
-        pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
-        eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
-
-        output_scores = output_scores if output_scores is not None else self.config.output_scores
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict_in_generate = (
-            return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
-        )
-
-        if self.config.is_encoder_decoder:
-            model_kwargs["encoder_attentions"] = None
-            model_kwargs["encoder_hidden_states"] = None
-
-        if input_ids is not None:
-            batch_size = shape_list(input_ids)[0]  # overridden by the input batch_size
-        else:
-            batch_size = 1
-
-        # 2. Define model inputs
-
-        # This block corresponds to the following line in PT's `generation_utils`:
-        # "input_ids = self._prepare_input_ids_for_generation(bos_token_id, model_kwargs.get("encoder_outputs"))"
-        # with the following differences:
-        #   1. In PT, `generate()`'s `model_kwargs` can accept `encoder_outputs`, but that is not the case in TF.
-        #   2. There is no shape checking in PT.
-        # In both PT/TF, if `input_ids` is `None`, we try to create it as it is for a text model.
-        if input_ids is None:
-            assert isinstance(bos_token_id, int) and bos_token_id >= 0, (
-                "you should either supply a context to complete as `input_ids` input "
-                "or a `bos_token_id` (integer >= 0) as a first token to start the generation."
-            )
-            input_ids = tf.fill((batch_size, 1), bos_token_id)
-
-        # do not allow duplicate outputs when doing greedy decoding
-        if do_sample is False:
-            if num_beams == 1:
-                # no_beam_search greedy generation conditions
-                assert (
-                    num_return_sequences == 1
-                ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1"
-
-            else:
-                # beam_search greedy generation conditions
-                assert (
-                    num_beams >= num_return_sequences
-                ), "Greedy beam search decoding cannot return more sequences than it has beams. 
Please set num_beams >= num_return_sequences"
-
-        # create attention mask if necessary
-        # TODO (PVP): this should later be handled by the forward fn() in each model; see PR 3140
-        if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()):
-            attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32)
-        elif attention_mask is None:
-            attention_mask = tf.ones_like(input_ids)
-
-        if pad_token_id is None and eos_token_id is not None:
-            logger.warning(f"Setting `pad_token_id` to {eos_token_id} (first `eos_token_id`) to generate sequence")
-            pad_token_id = eos_token_id
-
-        # current position and vocab size
-        cur_len = shape_list(input_ids)[1]  # unused
-        vocab_size = getattr(self.config, "vocab_size", None)
-        if vocab_size is None and self.config.is_encoder_decoder:
-            decoder_config = getattr(self.config, "decoder", None)
-            if decoder_config is not None:
-                vocab_size = getattr(self.config.decoder, "vocab_size", None)
-
-        # set effective batch size and effective batch multiplier according to do_sample
-        if do_sample:
-            effective_batch_size = batch_size * num_return_sequences
-            effective_batch_mult = num_return_sequences
-        else:
-            effective_batch_size = batch_size
-            effective_batch_mult = 1
-
-        if self.config.is_encoder_decoder:
-            if decoder_start_token_id is None:
-                decoder_start_token_id = bos_token_id
-
-            assert (
-                decoder_start_token_id is not None
-            ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation"
-            assert hasattr(self, "get_encoder"), f"{self} should have a 'get_encoder' function defined"
-            assert callable(self.get_encoder), f"{self.get_encoder} should be a method"
-
-            # get encoder and store encoder outputs
-            encoder = self.get_encoder()
-
-            encoder_kwargs = {
-                "attention_mask": attention_mask,
-                "output_attentions": output_attentions,
-                "output_hidden_states": output_hidden_states,
-                "return_dict": return_dict_in_generate,
-            }
-
-            # vision models don't use `attention_mask`.
-            signature = dict(inspect.signature(encoder.call).parameters)
-            if "attention_mask" not in signature:
-                encoder_kwargs.pop("attention_mask")
-
-            encoder_outputs = encoder(input_ids, **encoder_kwargs)
-            if return_dict_in_generate:
-                if output_attentions:
-                    model_kwargs["encoder_attentions"] = encoder_outputs.attentions
-                if output_hidden_states:
-                    model_kwargs["encoder_hidden_states"] = encoder_outputs.hidden_states
-
-        # The condition `len(shape_list(input_ids)) == 2` is to make this block treat only text inputs. 
- # (vision inputs might occur when the model is an encoder-decoder model) - # Expand input ids if num_beams > 1 or num_return_sequences > 1 - if len(shape_list(input_ids)) == 2 and (num_return_sequences > 1 or num_beams > 1): - input_ids_len = shape_list(input_ids)[-1] - input_ids = tf.broadcast_to( - tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) - ) - attention_mask = tf.broadcast_to( - tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) - ) - input_ids = tf.reshape( - input_ids, (effective_batch_size * num_beams, input_ids_len) - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) - attention_mask = tf.reshape( - attention_mask, (effective_batch_size * num_beams, input_ids_len) - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) - - if self.config.is_encoder_decoder: - # create empty decoder_input_ids - input_ids = ( - tf.ones( - (effective_batch_size * num_beams, 1), - dtype=tf.int32, - ) - * decoder_start_token_id - ) - cur_len = 1 - - assert ( - batch_size == encoder_outputs[0].shape[0] - ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " - - # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) - expanded_batch_idxs = tf.reshape( - tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1), - shape=(-1,), - ) - # expand encoder_outputs - encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0),) - else: - encoder_outputs = None - cur_len = shape_list(input_ids)[-1] - - # TODO(Patrick) - not very clean here - model_kwargs["attention_mask"] = attention_mask - model_kwargs["past"] = encoder_outputs # defined for encoder-decoder models, None for decoder-only models - - assert ( - cur_len < max_length - ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" - - is_greedy_gen_mode = (num_beams == 1) and do_sample is False - - # prepare distribution pre_processing samplers - # 7. prepare distribution pre_processing samplers - logits_processor = self._get_logits_processor( - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - min_length=min_length, - eos_token_id=eos_token_id, - ) - - # 8. go into different generation modes - if is_greedy_gen_mode: - if num_return_sequences > 1: - raise ValueError( - f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search." - ) - - # 9. 
run greedy search - return self.greedy_search( - input_ids, - max_length=max_length, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - logits_processor=logits_processor, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - **model_kwargs, - ) - - @staticmethod - def _update_model_kwargs_for_generation( - outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False - ) -> Dict[str, Any]: - # update past - if "past_key_values" in outputs: - model_kwargs["past"] = outputs.past_key_values - elif "mems" in outputs: - model_kwargs["past"] = outputs.mems - elif "past_buckets_states" in outputs: - model_kwargs["past"] = outputs.past_buckets_states - else: - model_kwargs["past"] = None - - # update attention mask - if not is_encoder_decoder: - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - model_kwargs["attention_mask"] = tf.concat( - [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 - ) - - return model_kwargs - - def _get_logits_processor( - self, - repetition_penalty: float, - no_repeat_ngram_size: int, - bad_words_ids: List[List[int]], - min_length: int, - eos_token_id: int, - ) -> TFLogitsProcessorList: - """ - This class returns a [`TFLogitsProcessorList`] list object that contains all relevant [`TFLogitsProcessor`] - instances used to modify the scores of the language model head. - """ - processors = TFLogitsProcessorList() - - repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty - no_repeat_ngram_size = ( - no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size - ) - bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids - eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id - - # instantiate processors list - if repetition_penalty is not None and repetition_penalty != 1.0: - processors.append(TFRepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) - if no_repeat_ngram_size is not None and no_repeat_ngram_size > 0: - processors.append(TFNoRepeatNGramLogitsProcessor(no_repeat_ngram_size)) - if bad_words_ids is not None: - processors.append(TFNoBadWordsLogitsProcessor(bad_words_ids, eos_token_id)) - if min_length is not None and eos_token_id is not None and min_length > -1: - processors.append(TFMinLengthLogitsProcessor(min_length, eos_token_id)) - - return processors - - def greedy_search( - self, - input_ids: tf.Tensor, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[int] = None, - logits_processor: Optional[TFLogitsProcessorList] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - **model_kwargs, - ) -> Union[TFGreedySearchOutput, tf.Tensor]: - r""" - Generates sequences for models with a language modeling head using greedy decoding. - - Parameters: - - input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. 
List of instances of class derived from [`LogitsProcessor`]
-                used to modify the prediction scores of the language modeling head applied at each generation step.
-            max_length (`int`, *optional*, defaults to 20):
-                The maximum length of the sequence to be generated.
-            pad_token_id (`int`, *optional*):
-                The id of the *padding* token.
-            eos_token_id (`int`, *optional*):
-                The id of the *end-of-sequence* token.
-            output_attentions (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more details.
-            output_scores (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
-                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
-            model_kwargs:
-                Additional model specific keyword arguments will be forwarded to the `call` function of the model.
-                If the model is an encoder-decoder model, the kwargs should include `encoder_outputs`.
-
-        Return:
-            [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`], [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`]
-            or `tf.Tensor`: A `tf.Tensor` containing the generated tokens (default behaviour) or a
-            [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
-            `return_dict_in_generate=True` or a [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`] if
-            `model.config.is_encoder_decoder=True`.
-
-        Examples:
-
-        ```python
-        >>> from transformers import (
-        ...     AutoTokenizer,
-        ...     TFAutoModelForCausalLM,
-        ...     TFLogitsProcessorList,
-        ...     TFMinLengthLogitsProcessor,
-        ... )
-
-        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        >>> model = TFAutoModelForCausalLM.from_pretrained("gpt2")
-
-        >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
-        >>> model.config.pad_token_id = model.config.eos_token_id
-
-        >>> input_prompt = "Today is a beautiful day, and"
-        >>> input_ids = tokenizer(input_prompt, return_tensors="tf").input_ids
-
-        >>> # instantiate logits processors
-        >>> logits_processor = TFLogitsProcessorList(
-        ...     [
-        ...         TFMinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
-        ...     ]
-        ... 
) - - >>> outputs = model.greedy_search(input_ids, logits_processor=logits_processor) - - >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) - ```""" - # init values - logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList() - - pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id - output_scores = output_scores if output_scores is not None else self.config.output_scores - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_attentions"] if output_attentions else None - encoder_hidden_states = model_kwargs["encoder_hidden_states"] if output_hidden_states else None - - # keep track of which sequences are already finished - unfinished_sequences = tf.ones_like(input_ids[:, 0]) - cur_len = input_ids.shape[-1] - - while cur_len < max_length: - # TODO (Patrick): remove following line by cleaning up `prepare_inputs_for_generation` - # in all models - model_kwargs["use_cache"] = None if "use_cache" not in model_kwargs else model_kwargs["use_cache"] - - # prepare model inputs - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - next_token_logits = outputs.logits[:, -1, :] - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_logits,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # pre-process distribution - next_tokens_scores = logits_processor(input_ids, next_token_logits, cur_len) - - # argmax - next_tokens = tf.cast(tf.argmax(next_tokens_scores, axis=-1), tf.int32) - - # finished sentences should have their next token be a padding token - if eos_token_id is not None: - if pad_token_id is None: - raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") - next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) - - # update generated ids, model inputs, and length for next step - input_ids = 
tf.concat([input_ids, next_tokens[:, None]], axis=-1)
-            model_kwargs = self._update_model_kwargs_for_generation(
-                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
-            )
-            cur_len = cur_len + 1
-
-            # if eos_token was found in one sentence, set sentence to finished
-            if eos_token_id is not None:
-                eos_in_sents = next_tokens == eos_token_id
-                # if sentence is unfinished and the token to add is eos
-                is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply(
-                    unfinished_sequences, tf.cast(eos_in_sents, tf.int32)
-                )
-
-                # unfinished_sequences is set to zero if eos in sentence
-                unfinished_sequences -= is_sents_unfinished_and_token_to_add_is_eos
-
-            # stop when each sentence is finished, or if we exceed the maximum length
-            if tf.math.reduce_max(unfinished_sequences) == 0:
-                break
-
-        if return_dict_in_generate:
-            if self.config.is_encoder_decoder:
-                return TFGreedySearchEncoderDecoderOutput(
-                    sequences=input_ids,
-                    scores=scores,
-                    encoder_attentions=encoder_attentions,
-                    encoder_hidden_states=encoder_hidden_states,
-                    decoder_attentions=decoder_attentions,
-                    cross_attentions=cross_attentions,
-                    decoder_hidden_states=decoder_hidden_states,
-                )
-            else:
-                return TFGreedySearchDecoderOnlyOutput(
-                    sequences=input_ids,
-                    scores=scores,
-                    attentions=decoder_attentions,
-                    hidden_states=decoder_hidden_states,
-                )
-        else:
-            return input_ids
-
-
-def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty):
-    # create logit penalties for already seen input_ids
-    token_penalties = np.ones(shape_list(logits))
-    prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()]
-    for i, prev_input_id in enumerate(prev_input_ids):
-        logit_penalized = logits[i].numpy()[prev_input_id]
-        logit_penalties = np.zeros(logit_penalized.shape)
-        # if previous logit score is < 0, multiply by the repetition penalty, else divide by it
-        logit_penalties[logit_penalized < 0] = repetition_penalty
-        logit_penalties[logit_penalized > 0] = 1 / repetition_penalty
-        np.put(token_penalties[i], prev_input_id, logit_penalties)
-    return tf.convert_to_tensor(token_penalties, dtype=tf.float32)
-
-
-def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len):
-    # Copied from fairseq for no_repeat_ngram in beam_search
-    if cur_len + 1 < no_repeat_ngram_size:
-        # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
-        return [[] for _ in range(num_hypos)]
-    generated_ngrams = [{} for _ in range(num_hypos)]
-    for idx in range(num_hypos):
-        gen_tokens = prev_input_ids[idx].numpy().tolist()
-        generated_ngram = generated_ngrams[idx]
-        for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]):
-            prev_ngram_tuple = tuple(ngram[:-1])
-            generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]
-
-    def _get_generated_ngrams(hypo_idx):
-        # Before decoding the next token, prevent decoding of ngrams that have already appeared
-        start_idx = cur_len + 1 - no_repeat_ngram_size
-        ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist())
-        return generated_ngrams[hypo_idx].get(ngram_idx, [])
-
-    banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)]
-    return banned_tokens
-
-
-def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids):
-    banned_tokens = []
-
-    def _tokens_match(prev_tokens, tokens):
-        if len(tokens) == 0:
-            # if the bad word is just one token, always ban it
-            return True
-        if len(tokens) > len(prev_tokens):
-            # if bad word tokens are longer 
than prev tokens, they can't be equal
-            return False
-
-        if prev_tokens[-len(tokens) :] == tokens:
-            # if tokens match
-            return True
-        else:
-            return False
-
-    for prev_input_ids_slice in prev_input_ids:
-        banned_tokens_slice = []
-
-        for banned_token_seq in bad_words_ids:
-            assert (
-                len(banned_token_seq) > 0
-            ), f"Banned words token sequences {bad_words_ids} cannot have an empty list"
-
-            if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False:
-                # if tokens do not match continue
-                continue
-
-            banned_tokens_slice.append(banned_token_seq[-1])
-
-        banned_tokens.append(banned_tokens_slice)
-
-    return banned_tokens
-
-
-def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
-    """
-    Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
-
-    Args:
-        logits: logits distribution shape (batch size, vocabulary size)
-        top_k (`int`, *optional*, defaults to 0):
-            If > 0, only keep the top k tokens with highest probability (top-k filtering)
-        top_p (`float`, *optional*, defaults to 1.0):
-            If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus
-            filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
-        min_tokens_to_keep (`int`, *optional*, defaults to 1):
-            Minimum number of tokens we keep per batch example in the output.
-
-    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
-    """
-    logits_shape = shape_list(logits)
-
-    if top_k > 0:
-        top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1])  # Safety check
-        # Remove all tokens with a probability less than the last token of the top-k
-        indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None]
-        logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value)
-
-    if top_p < 1.0:
-        sorted_indices = tf.argsort(logits, direction="DESCENDING")
-        sorted_logits = tf.gather(
-            logits, sorted_indices, axis=-1, batch_dims=1
-        )  # expects logits to be of dim (batch_size, vocab_size)
-
-        cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1)
-
-        # Remove tokens with cumulative probability above the threshold (tokens with 0 are kept)
-        sorted_indices_to_remove = cumulative_probs > top_p
-
-        if min_tokens_to_keep > 1:
-            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
-            sorted_indices_to_remove = tf.concat(
-                [
-                    tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]),
-                    sorted_indices_to_remove[:, min_tokens_to_keep:],
-                ],
-                -1,
-            )
-
-        # Shift the indices to the right to keep also the first token above the threshold
-        sorted_indices_to_remove = tf.concat(
-            [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, :-1]],
-            -1,
-        )
-        # scatter sorted tensors to original indexing
-        indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices)
-        logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value)
-    return logits
-
-
-def scatter_values_on_batch_indices(values, batch_indices):
-    shape = shape_list(batch_indices)
-    # broadcast batch dim to shape
-    broad_casted_batch_dims = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1])
-    # transform batch_indices to pair_indices
-    pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0))
-    # scatter values to pair indices
-    return 
tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape)
-
-
-def set_tensor_by_indices_to_value(tensor, indices, value):
-    # create value_tensor since tensor value assignment is not possible in TF
-    value_tensor = tf.zeros_like(tensor) + value
-    return tf.where(indices, value_tensor, tensor)
-
-
-def sample_without_replacement(logits, num_samples):
-    """
-    Categorical sampling without replacement is currently not implemented; the Gumbel-max trick will do for now. See
-    https://github.com/tensorflow/tensorflow/issues/9260 for more info
-    """
-    z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1))
-    _, indices = tf.nn.top_k(logits + z, num_samples)
-    return indices
-
-
-def shape_list(x):
-    """Deal with dynamic shape in tensorflow cleanly."""
-    static = x.shape.as_list()
-    dynamic = tf.shape(x)
-    return [dynamic[i] if s is None else s for i, s in enumerate(static)]
-
-
-class BeamHypotheses(object):
-    def __init__(self, num_beams, max_length, length_penalty, early_stopping):
-        """
-        Initialize n-best list of hypotheses.
-        """
-        self.max_length = max_length - 1  # ignoring bos_token
-        self.length_penalty = length_penalty
-        self.early_stopping = early_stopping
-        self.num_beams = num_beams
-        self.beams = []
-        self.worst_score = 1e9
-
-    def __len__(self):
-        """
-        Number of hypotheses in the list.
-        """
-        return len(self.beams)
-
-    def add(self, hyp, sum_logprobs):
-        """
-        Add a new hypothesis to the list.
-        """
-        score = sum_logprobs / len(hyp) ** self.length_penalty
-        if len(self) < self.num_beams or score > self.worst_score:
-            self.beams.append((score, hyp))
-            if len(self) > self.num_beams:
-                sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)])
-                del self.beams[sorted_scores[0][1]]
-                self.worst_score = sorted_scores[1][0]
-            else:
-                self.worst_score = min(score, self.worst_score)
-
-    def is_done(self, best_sum_logprobs, cur_len):
-        """
-        If there are enough hypotheses and none of the hypotheses being generated can become better than the worst
-        one in the heap, then we are done with this sentence.
-        """
-
-        if len(self) < self.num_beams:
-            return False
-        elif self.early_stopping:
-            return True
-        else:
-            cur_score = best_sum_logprobs / cur_len ** self.length_penalty
-            ret = self.worst_score >= cur_score
-            return ret

From 3db93ffa2e270c05952a4f00e0b77e8bf4e7b08d Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Fri, 11 Feb 2022 12:43:20 +0100
Subject: [PATCH 12/30] make gpt2 and t5 tests work

---
 src/transformers/generation_tf_utils.py | 214 +++++++++++-------------
 1 file changed, 99 insertions(+), 115 deletions(-)

diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py
index 2ce2f9168036..3e546163aaf1 100644
--- a/src/transformers/generation_tf_utils.py
+++ b/src/transformers/generation_tf_utils.py
@@ -1725,8 +1725,6 @@ def _generate(
         )  # generate sequences without allowing bad_words to be generated
         ```"""
         # 1. 
Set generation parameters if not already defined - num_beams = num_beams if num_beams is not None else self.config.num_beams - do_sample = do_sample if do_sample is not None else self.config.do_sample max_length = max_length if max_length is not None else self.config.max_length min_length = min_length if min_length is not None else self.config.min_length early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping @@ -1744,6 +1742,12 @@ def _generate( return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate ) + num_beams = num_beams if num_beams is not None else self.config.num_beams + do_sample = do_sample if do_sample is not None else self.config.do_sample + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) + if pad_token_id is None and eos_token_id is not None: logger.warning(f"Setting `pad_token_id` to {eos_token_id} (first `eos_token_id`) to generate sequence") pad_token_id = eos_token_id @@ -1751,126 +1755,49 @@ def _generate( # 2. Define model inputs # inputs_ids now has to be defined input_ids = self._prepare_model_inputs(input_ids, bos_token_id) + batch_size = input_ids.shape[0] - if input_ids is not None: - batch_size = shape_list(input_ids)[0] # overridden by the input batch_size - else: - batch_size = 1 - - # 3. Define other model kwargs - if self.config.is_encoder_decoder: - model_kwargs["encoder_attentions"] = None - model_kwargs["encoder_hidden_states"] = None - - # This block corresponds to the following line in `generation_tf_utils`: - # "input_ids = self._prepare_input_ids_for_generation(bos_token_id, model_kwargs.get("encoder_outputs"))" - # with the following differences: - # 1. In PT, `generate()`'s `model_kwargs` can accept `encoder_outputs`, but not the case in TF. - # 2. There is no shape checking in PT. - # In both PT/TF, if `input_ids` is `None`, we try to create it as it is for a text model. - if input_ids is None: - if not isinstance(bos_token_id, int) or bos_token_id < 0: - raise ValueError( - "you should either supply a context to complete as `input_ids` input " - "or a `bos_token_id` (integer >= 0) as a first token to start the generation." - ) - input_ids = tf.fill((batch_size, 1), bos_token_id) - - # create attention mask if necessary - # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 - if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): - attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) - elif attention_mask is None: - attention_mask = tf.ones_like(input_ids) + # 3. 
Prepare other model kwargs
+        model_kwargs["output_attentions"] = output_attentions
+        model_kwargs["output_hidden_states"] = output_hidden_states
+        model_kwargs["use_cache"] = use_cache
 
-        # current position and vocab size
-        cur_len = shape_list(input_ids)[1]  # unused
-        vocab_size = getattr(self.config, "vocab_size", None)
-        if vocab_size is None and self.config.is_encoder_decoder:
-            decoder_config = getattr(self.config, "decoder", None)
-            if decoder_config is not None:
-                vocab_size = getattr(self.config.decoder, "vocab_size", None)
+        requires_attention_mask = "encoder_outputs" not in model_kwargs
 
-        # set effective batch size and effective batch multiplier according to do_sample
-        if do_sample:
-            effective_batch_size = batch_size * num_return_sequences
-            effective_batch_mult = num_return_sequences
-        else:
-            effective_batch_size = batch_size
-            effective_batch_mult = 1
+        if model_kwargs.get("attention_mask", None) is None and requires_attention_mask:
+            model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(input_ids, pad_token_id)
 
         if self.config.is_encoder_decoder:
-            if decoder_start_token_id is None:
-                decoder_start_token_id = bos_token_id
-
-            assert (
-                decoder_start_token_id is not None
-            ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation"
-            assert hasattr(self, "get_encoder"), f"{self} should have a 'get_encoder' function defined"
-            assert callable(self.get_encoder), f"{self.get_encoder} should be a method"
+            # if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
+            # if the model is an encoder-decoder, `encoder_outputs` are created
+            # and added to `model_kwargs`
+            model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)
 
-            # get encoder and store encoder outputs
-            encoder = self.get_encoder()
-
-            encoder_kwargs = {
-                "attention_mask": attention_mask,
-                "output_attentions": output_attentions,
-                "output_hidden_states": output_hidden_states,
-                "return_dict": return_dict_in_generate,
-            }
-
-            # vision models don't use `attention_mask`.
-            signature = dict(inspect.signature(encoder.call).parameters)
-            if "attention_mask" not in signature:
-                encoder_kwargs.pop("attention_mask")
-
-            encoder_outputs = encoder(input_ids, **encoder_kwargs)
-            if return_dict_in_generate:
-                if output_attentions:
-                    model_kwargs["encoder_attentions"] = encoder_outputs.attentions
-                if output_hidden_states:
-                    model_kwargs["encoder_hidden_states"] = encoder_outputs.hidden_states
-
-        # TODO(Patrick) - not very clean here
-        model_kwargs["attention_mask"] = attention_mask
-        model_kwargs["past"] = encoder_outputs  # defined for encoder-decoder models, None for decoder-only models
+        # TODO(PVP) - ugly hack here which requires a bigger
+        # refactor of all generation models in TF. `past` should be
+        # optional everywhere
+        model_kwargs["past"] = model_kwargs["encoder_outputs"] if self.config.is_encoder_decoder else None
 
+        # 4. 
Prepare `input_ids` which will be used for auto-regressive generation if self.config.is_encoder_decoder: - # create empty decoder_input_ids - input_ids = ( - tf.ones( - (effective_batch_size * num_beams, 1), - dtype=tf.int32, - ) - * decoder_start_token_id - ) - cur_len = 1 - - assert ( - batch_size == encoder_outputs[0].shape[0] - ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " - - # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) - expanded_batch_idxs = tf.reshape( - tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1), - shape=(-1,), + # if encoder-decoder then `input_ids` come from `decoder_start_token_id` + input_ids = self._prepare_decoder_input_ids_for_generation( + batch_size, + decoder_start_token_id=decoder_start_token_id, + bos_token_id=bos_token_id, + model_kwargs=model_kwargs, ) - # expand encoder_outputs - encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0),) - else: - encoder_outputs = None - cur_len = shape_list(input_ids)[-1] - if cur_len >= max_length: + if input_ids.shape[-1] >= max_length: raise ValueError( - f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" + f"The context has {input_ids.shape[-1]} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" ) - # 6. determine generation mode + # 5. determine generation mode is_greedy_gen_mode = (num_beams == 1) and do_sample is False # prepare distribution pre_processing samplers - # 7. prepare distribution pre_processing samplers + # 6. prepare distribution pre_processing samplers logits_processor = self._get_logits_processor( repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, @@ -1879,14 +1806,14 @@ def _generate( eos_token_id=eos_token_id, ) - # 8. go into different generation modes + # 7. go into different generation modes if is_greedy_gen_mode: if num_return_sequences > 1: raise ValueError( f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search." ) - # 9. run greedy search + # 8. 
run greedy search return self.greedy_search( input_ids, max_length=max_length, @@ -1894,12 +1821,42 @@ def _generate( eos_token_id=eos_token_id, logits_processor=logits_processor, output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, return_dict_in_generate=return_dict_in_generate, **model_kwargs, ) + def _prepare_attention_mask_for_generation( + self, + input_ids: tf.Tensor, + pad_token_id: int, + ) -> tf.Tensor: + if (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): + return tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) + else: + return tf.ones_like(input_ids) + + def _prepare_encoder_decoder_kwargs_for_generation(self, input_ids: tf.Tensor, model_kwargs) -> Dict[str, Any]: + # get encoder and store encoder outputs + encoder = self.get_encoder() + + # prepare encoder args and encoder kwargs from model kwargs + irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"] + encoder_kwargs = { + argument: value + for argument, value in model_kwargs.items() + if not any(argument.startswith(p) for p in irrelevant_prefix) + } + + # vision models don't use `attention_mask`. + signature = dict(inspect.signature(encoder.call).parameters) + if "attention_mask" not in signature: + encoder_kwargs.pop("attention_mask") + + encoder_outputs = encoder(input_ids, **encoder_kwargs) + + model_kwargs["encoder_outputs"] = encoder_outputs + return model_kwargs + def _prepare_decoder_input_ids_for_generation( self, batch_size: int, @@ -1907,13 +1864,38 @@ def _prepare_decoder_input_ids_for_generation( bos_token_id: int = None, model_kwargs: Optional[Dict[str, tf.Tensor]] = None, ) -> tf.Tensor: - if model_kwargs is not None and "decoder_input_ids" in model_kwargs: return model_kwargs.pop("decoder_input_ids") else: decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id) return tf.ones((batch_size, 1), dtype=tf.int32) * decoder_start_token_id + def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int: + decoder_start_token_id = ( + decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id + ) + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + + if decoder_start_token_id is not None: + return decoder_start_token_id + elif ( + hasattr(self.config, "decoder") + and hasattr(self.config.decoder, "decoder_start_token_id") + and self.config.decoder.decoder_start_token_id is not None + ): + return self.config.decoder.decoder_start_token_id + elif bos_token_id is not None: + return bos_token_id + elif ( + hasattr(self.config, "decoder") + and hasattr(self.config.decoder, "bos_token_id") + and self.config.decoder.bos_token_id is not None + ): + return self.config.decoder.bos_token_id + raise ValueError( + "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation." + ) + def _prepare_model_inputs(self, inputs: Optional[tf.Tensor] = None, bos_token_id: Optional[int] = None): if inputs is None: if not isinstance(bos_token_id, int) or bos_token_id < 0: @@ -2003,8 +1985,8 @@ def greedy_search( input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. 
List of instances of class derived from [`LogitsProcessor`]
+            logits_processor (`TFLogitsProcessorList`, *optional*):
+                An instance of [`TFLogitsProcessorList`]. List of instances of class derived from [`TFLogitsProcessor`]
                 used to modify the prediction scores of the language modeling head applied at each generation step.
             max_length (`int`, *optional*, defaults to 20):
                 The maximum length of the sequence to be generated.
@@ -2085,8 +2067,10 @@ def greedy_search(
 
         # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
         if return_dict_in_generate and self.config.is_encoder_decoder:
-            encoder_attentions = model_kwargs["encoder_attentions"] if output_attentions else None
-            encoder_hidden_states = model_kwargs["encoder_hidden_states"] if output_hidden_states else None
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
 
         # keep track of which sequences are already finished
         unfinished_sequences = tf.ones_like(input_ids[:, 0])

From 7a7b7eff1240b5f361682d9555e046c15e4f664c Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Fri, 11 Feb 2022 16:49:36 +0100
Subject: [PATCH 13/30] finish logits tests

---
 .../generation_tf_logits_process.py         |  25 ++-
 src/transformers/generation_tf_utils.py     |  59 ++++--
 tests/test_generation_tf_logits_process.py  | 177 ++++++++++++++++++
 tests/test_modeling_tf_common.py            |   2 +-
 4 files changed, 236 insertions(+), 27 deletions(-)
 create mode 100644 tests/test_generation_tf_logits_process.py

diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py
index 48089989e91f..d8c38a1a2f7b 100644
--- a/src/transformers/generation_tf_logits_process.py
+++ b/src/transformers/generation_tf_logits_process.py
@@ -74,7 +74,7 @@ class TFLogitsProcessorList(list):
     """
 
     @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING)
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int, **kwargs) -> tf.Tensor:
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, **kwargs) -> tf.Tensor:
         for processor in self:
             function_args = inspect.signature(processor.__call__).parameters
-            if len(function_args) > 3:
+            if len(function_args) > 2:
@@ -83,9 +83,9 @@ def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int, **kwar
                         f"Make sure that all the required parameters: {list(function_args.keys())} for "
                         f"{processor.__class__} are passed to the logits processor."
                     )
-                scores = processor(input_ids, scores, cur_len, **kwargs)
+                scores = processor(input_ids, scores, **kwargs)
             else:
-                scores = processor(input_ids, scores, cur_len)
+                scores = processor(input_ids, scores)
         return scores
@@ -110,8 +110,9 @@ def __init__(self, min_length: int, eos_token_id: int):
         self.min_length = min_length
         self.eos_token_id = eos_token_id
 
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor:
         # create boolean flag to decide if min length penalty should be applied
+        cur_len = input_ids.shape[-1]
         apply_penalty = 1 - tf.clip_by_value(cur_len - self.min_length, 0, 1)
 
         # TODO(Matt) - this if statement has to be rewritten for XLA. Leaving it now though since
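
[Annotation] With `cur_len` removed from the standard signature, a TF logits processor's
`__call__` takes exactly two positional arguments, which is why the dispatch above must
treat anything beyond two parameters as a processor needing extra keyword arguments
(hence the `> 2` check). A minimal sketch of how the arity check behaves — the processor
class below is hypothetical, not part of the library:

    import inspect

    class TemperatureProcessor:  # hypothetical processor with one extra argument
        def __call__(self, input_ids, scores, temperature):
            return scores / temperature

    n_args = len(inspect.signature(TemperatureProcessor().__call__).parameters)
    # n_args == 3 (> 2), so TFLogitsProcessorList would forward `temperature` from **kwargs
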
@@ -141,6 +142,9 @@ def __init__(self, penalty: float):
 
     def _create_score_penalties(self, input_ids, logits):
         # create logit penalties for already seen input_ids
+        input_ids = input_ids.cpu()
+        logits = logits.cpu()
+
         token_penalties = np.ones(logits.shape)
         prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()]
         for i, prev_input_id in enumerate(prev_input_ids):
@@ -152,7 +156,7 @@ def _create_score_penalties(self, input_ids, logits):
             np.put(token_penalties[i], prev_input_id, logit_penalties)
         return tf.convert_to_tensor(token_penalties, dtype=tf.float32)
 
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor:
 
         score_penalties = self._create_score_penalties(input_ids, scores)
 
@@ -214,7 +218,7 @@ def _tokens_match(prev_tokens, tokens):
                     len(banned_token_seq) > 0
                 ), f"Banned words token sequences {self.bad_words_ids} cannot have an empty list"
 
-                if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False:
+                if _tokens_match(prev_input_ids_slice.cpu().numpy().tolist(), banned_token_seq[:-1]) is False:
                     # if tokens do not match continue
                     continue
@@ -224,7 +228,7 @@ def _tokens_match(prev_tokens, tokens):
 
         return banned_tokens
 
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor:
 
         vocab_size = scores.shape[-1]
 
@@ -266,7 +270,7 @@ def calc_banned_ngram_tokens(self, prev_input_ids, num_hypos, cur_len):
             return [[] for _ in range(num_hypos)]
         generated_ngrams = [{} for _ in range(num_hypos)]
         for idx in range(num_hypos):
-            gen_tokens = prev_input_ids[idx].numpy().tolist()
+            gen_tokens = prev_input_ids[idx].cpu().numpy().tolist()
             generated_ngram = generated_ngrams[idx]
             for ngram in zip(*[gen_tokens[i:] for i in range(self.ngram_size)]):
                 prev_ngram_tuple = tuple(ngram[:-1])
@@ -275,16 +279,17 @@ def calc_banned_ngram_tokens(self, prev_input_ids, num_hypos, cur_len):
 
         def _get_generated_ngrams(hypo_idx):
             # Before decoding the next token, prevent decoding of ngrams that have already appeared
             start_idx = cur_len + 1 - self.ngram_size
-            ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist())
+            ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].cpu().numpy().tolist())
             return generated_ngrams[hypo_idx].get(ngram_idx, [])
 
         banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)]
         return banned_tokens
 
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor:
 
         batch_size, vocab_size = scores.shape
+        cur_len = input_ids.shape[-1]
         banned_tokens = self.calc_banned_ngram_tokens(input_ids, batch_size, cur_len)
 
         # create banned_tokens boolean mask
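
[Annotation] Taken together, the signature changes above mean callers no longer thread
`cur_len` through: each processor derives it from `input_ids.shape[-1]`. A minimal
eager-mode sketch of the new calling convention — the ids and scores are dummy values:

    import tensorflow as tf

    from transformers.generation_tf_logits_process import (
        TFLogitsProcessorList,
        TFMinLengthLogitsProcessor,
    )

    input_ids = tf.constant([[5, 8, 3]], dtype=tf.int32)  # 3 tokens generated so far
    scores = tf.ones((1, 20), dtype=tf.float32) / 20.0    # uniform logits over a 20-token vocab

    processors = TFLogitsProcessorList([TFMinLengthLogitsProcessor(min_length=10, eos_token_id=0)])
    scores = processors(input_ids, scores)  # no `cur_len` argument; eos column is now -inf
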
diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py
index 3e546163aaf1..6e1d34183870 100644
--- a/src/transformers/generation_tf_utils.py
+++ b/src/transformers/generation_tf_utils.py
@@ -1753,8 +1753,8 @@ def _generate(
             pad_token_id = eos_token_id
 
         # 2. Define model inputs
-        # inputs_ids now has to be defined
         input_ids = self._prepare_model_inputs(input_ids, bos_token_id)
+        # input_ids now has to be defined and cannot be None anymore
         batch_size = input_ids.shape[0]
 
         # 3. Prepare other model kwargs
@@ -1768,15 +1768,15 @@ def _generate(
         model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(input_ids, pad_token_id)
 
         if self.config.is_encoder_decoder:
-            # if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
-            # if model is encoder decoder encoder_outputs are created
-            # and added to `model_kwargs`
-            model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)
+            # if the model is an encoder-decoder model, we create encoder_outputs and add them to `model_kwargs`
+            model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
+                input_ids, return_dict_in_generate, model_kwargs
+            )
 
-        # TODO(PVP) - ugly hack here which requires a bigger
+        # TODO(Patrick) - ugly `past`/`encoder_output` hack here which requires a bigger
         # refactor of all generation models in TF. `past` should be
-        # optional everywhere
-        model_kwargs["past"] = model_kwargs["encoder_outputs"] if self.config.is_encoder_decoder else None
+        # optional everywhere and not be set equal to encoder_outputs
+        model_kwargs["past"] = model_kwargs.get("encoder_outputs")[:1] if self.config.is_encoder_decoder else None
 
         # 4. Prepare `input_ids` which will be used for auto-regressive generation
         if self.config.is_encoder_decoder:
@@ -1794,9 +1794,9 @@ def _generate(
             )
 
         # 5. determine generation mode
+        # TODO(Matt, Joao, Patrick) - add more use cases here
         is_greedy_gen_mode = (num_beams == 1) and do_sample is False
 
-        # prepare distribution pre_processing samplers
         # 6. prepare distribution pre_processing samplers
         logits_processor = self._get_logits_processor(
             repetition_penalty=repetition_penalty,
@@ -1825,17 +1825,25 @@ def _generate(
             **model_kwargs,
         )
 
+        # TODO(Matt, Joao, Patrick) - add more sub-generation methods here
+
     def _prepare_attention_mask_for_generation(
         self,
         input_ids: tf.Tensor,
         pad_token_id: int,
    ) -> tf.Tensor:
+        # prepare `attention_mask` if not passed
         if (pad_token_id is not None) and (pad_token_id in input_ids.numpy()):
             return tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32)
         else:
             return tf.ones_like(input_ids)
 
-    def _prepare_encoder_decoder_kwargs_for_generation(self, input_ids: tf.Tensor, model_kwargs) -> Dict[str, Any]:
+    def _prepare_encoder_decoder_kwargs_for_generation(
+        self, input_ids: tf.Tensor, return_dict_in_generate, model_kwargs
+    ) -> Dict[str, Any]:
+        # TODO(Patrick) - remove the `return_dict_in_generate` flag input once `past`/`encoder_outputs`
+        # is cleaned up
+
         # get encoder and store encoder outputs
         encoder = self.get_encoder()
 
@@ -1855,6 +1863,15 @@ def _prepare_encoder_decoder_kwargs_for_generation(self, input_ids: tf.Tensor, m
         encoder_outputs = encoder(input_ids, **encoder_kwargs)
 
         model_kwargs["encoder_outputs"] = encoder_outputs
+
+        # TODO(Patrick): `encoder_outputs`, `past` hack. Currently, `encoder_attentions` and
+        # `encoder_hidden_states` have to be separated from encoder_outputs and passed
+        # under other names because of the `encoder_outputs`/`past` hack. All encoder-decoder
+        # `prepare_inputs_for_generation` methods need to be cleaned up to remove this
+        if return_dict_in_generate:
+            model_kwargs["encoder_attentions"] = encoder_outputs.get("attentions", None)
+            model_kwargs["encoder_hidden_states"] = encoder_outputs.get("hidden_states", None)
+
         return model_kwargs
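
[Annotation] The attention-mask helper shown above reduces to a single rule: if the prompt
contains the pad token, mask exactly those positions; otherwise attend everywhere. A
self-contained sketch — the pad id and the left-padded batch are made up:

    import tensorflow as tf

    pad_token_id = 0
    input_ids = tf.constant([[0, 0, 5, 6], [3, 4, 5, 6]], dtype=tf.int32)

    attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32)
    # -> [[0, 0, 1, 1],
    #     [1, 1, 1, 1]]
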
 
     def _prepare_decoder_input_ids_for_generation(
@@ -1864,6 +1881,8 @@ def _prepare_decoder_input_ids_for_generation(
         bos_token_id: int = None,
         model_kwargs: Optional[Dict[str, tf.Tensor]] = None,
     ) -> tf.Tensor:
+
+        # prepare `input_ids` for the decoder if the model is an encoder-decoder
         if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
             return model_kwargs.pop("decoder_input_ids")
         else:
@@ -1871,6 +1890,8 @@ def _prepare_decoder_input_ids_for_generation(
             return tf.ones((batch_size, 1), dtype=tf.int32) * decoder_start_token_id
 
     def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int:
+        # retrieve decoder_start_token_id for encoder-decoder models
+        # fall back to bos_token_id if necessary
         decoder_start_token_id = (
             decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id
         )
@@ -1897,7 +1918,10 @@ def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_to
         )
 
     def _prepare_model_inputs(self, inputs: Optional[tf.Tensor] = None, bos_token_id: Optional[int] = None):
+        # TODO(Patrick) - adapt this function when making `generate` more flexible
+        # for all kinds of input types
         if inputs is None:
+            # if no `inputs` are passed, create a prompt of size (1, 1) filled with the BOS token
             if not isinstance(bos_token_id, int) or bos_token_id < 0:
                 raise ValueError(
                     "you should either supply a context to complete as `input_ids` input "
@@ -1905,7 +1929,6 @@ def _prepare_model_inputs(self, inputs: Optional[tf.Tensor] = None, bos_token_id
                 )
             return tf.fill((1, 1), bos_token_id, dtype=tf.int32)
 
-        # if inputs are passed return those
         return inputs
 
     @staticmethod
@@ -2065,12 +2088,16 @@ def greedy_search(
         cross_attentions = () if (return_dict_in_generate and output_attentions) else None
         decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
 
+        # TODO(Patrick): `encoder_outputs`, `past` hack. Currently T5, Bart expect `encoder_outputs`
+        # to be wrapped into the `past` variable. This is a bad design and needs
+        # to be updated.
+        # Remove the following lines when updating all encoder-decoder models
+        encoder_outputs = model_kwargs.pop("encoder_outputs", None)
+
         # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
         if return_dict_in_generate and self.config.is_encoder_decoder:
-            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
-            encoder_hidden_states = (
-                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
-            )
+            encoder_attentions = encoder_outputs.get("attentions") if output_attentions else None
+            encoder_hidden_states = encoder_outputs.get("hidden_states") if output_hidden_states else None
 
         # keep track of which sequences are already finished
         unfinished_sequences = tf.ones_like(input_ids[:, 0])
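
[Annotation] The fallback chain in `_get_decoder_start_token_id` is easiest to read as a
standalone lookup. The sketch below mirrors the order of precedence; it uses `getattr`
chains in place of the explicit `hasattr` checks and is illustrative, not the method itself:

    def resolve_decoder_start_token_id(config, decoder_start_token_id=None, bos_token_id=None):
        # precedence: explicit argument > config > decoder sub-config > BOS equivalents
        decoder_start_token_id = (
            decoder_start_token_id if decoder_start_token_id is not None else config.decoder_start_token_id
        )
        bos_token_id = bos_token_id if bos_token_id is not None else config.bos_token_id

        if decoder_start_token_id is not None:
            return decoder_start_token_id
        if getattr(getattr(config, "decoder", None), "decoder_start_token_id", None) is not None:
            return config.decoder.decoder_start_token_id
        if bos_token_id is not None:
            return bos_token_id
        if getattr(getattr(config, "decoder", None), "bos_token_id", None) is not None:
            return config.decoder.bos_token_id
        raise ValueError("`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation.")
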
@@ -2113,7 +2140,7 @@ def greedy_search(
             )
 
             # pre-process distribution
-            next_tokens_scores = logits_processor(input_ids, next_token_logits, cur_len)
+            next_tokens_scores = logits_processor(input_ids, next_token_logits)
 
             # argmax
             next_tokens = tf.cast(tf.argmax(next_tokens_scores, axis=-1), tf.int32)
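
[Annotation] Stripped of bookkeeping, one greedy step is just "process the logits, take the
argmax, append the token". A toy sketch of the loop body — it assumes `input_ids` and a
`logits_processor` as set up in the surrounding code, and the logits are invented:

    import tensorflow as tf

    next_token_logits = tf.constant([[0.5, 2.0, -1.0]])  # dummy scores for a 3-token vocab

    next_tokens_scores = logits_processor(input_ids, next_token_logits)
    next_tokens = tf.cast(tf.argmax(next_tokens_scores, axis=-1), tf.int32)
    input_ids = tf.concat([input_ids, next_tokens[:, None]], axis=-1)  # append and loop
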
diff --git a/tests/test_generation_tf_logits_process.py b/tests/test_generation_tf_logits_process.py
new file mode 100644
index 000000000000..06ca571235c0
--- /dev/null
+++ b/tests/test_generation_tf_logits_process.py
@@ -0,0 +1,177 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers import is_tf_available
+from transformers.testing_utils import require_tf
+
+from .test_modeling_common import ids_tensor
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+    from transformers.generation_tf_logits_process import (
+        TFLogitsProcessorList,
+        TFMinLengthLogitsProcessor,
+        TFNoBadWordsLogitsProcessor,
+        TFNoRepeatNGramLogitsProcessor,
+        TFRepetitionPenaltyLogitsProcessor,
+    )
+
+
+def set_tensor_by_indices_to_value(tensor, indices, value):
+    # create value_tensor since tensor value assignment is not possible in TF
+    value_tensor = tf.zeros_like(tensor) + value
+    return tf.where(indices, value_tensor, tensor)
+
+
+@require_tf
+class TFLogitsProcessorTest(unittest.TestCase):
+    def _get_uniform_logits(self, batch_size: int, length: int):
+        scores = tf.ones((batch_size, length), dtype=tf.float32) / length
+        return scores
+
+    def test_min_length_dist_processor(self):
+        vocab_size = 20
+        batch_size = 4
+        eos_token_id = 0
+
+        min_dist_processor = TFMinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id)
+
+        # check that min length is applied at length 5
+        input_ids = ids_tensor((batch_size, 5), vocab_size=20)
+        scores = self._get_uniform_logits(batch_size, vocab_size)
+        scores_before_min_length = min_dist_processor(input_ids, scores)
+        self.assertListEqual(scores_before_min_length[:, eos_token_id].numpy().tolist(), 4 * [-float("inf")])
+
+        # check that min length is not applied anymore at length 15
+        input_ids = ids_tensor((batch_size, 15), vocab_size=20)
+        scores = self._get_uniform_logits(batch_size, vocab_size)
+        scores_before_min_length = min_dist_processor(input_ids, scores)
+        self.assertFalse(tf.math.reduce_any(tf.math.is_inf(scores_before_min_length)).numpy())
+
+    def test_repetition_penalty_dist_process(self):
+        input_ids = tf.constant([[0, 1], [5, 0]], dtype=tf.int32)
+        vocab_size = 10
+
+        scores = self._get_uniform_logits(batch_size=2, length=vocab_size)
+
+        mask = tf.cast(tf.constant([[1] + 9 * [0], 10 * [0]]), tf.bool)
+        scores = set_tensor_by_indices_to_value(scores, mask, -1 / vocab_size)
+        mask = tf.cast(tf.constant([10 * [0], 5 * [0] + [1] + 4 * [0]]), tf.bool)
+        scores = set_tensor_by_indices_to_value(scores, mask, 4 / vocab_size)
+
+        rep_penalty_proc = TFRepetitionPenaltyLogitsProcessor(penalty=2.0)
+
+        scores = rep_penalty_proc(input_ids, tf.identity(scores))
+
+        # check that values were correctly changed
+        self.assertAlmostEqual(scores[0, 0].numpy(), -(1 / vocab_size) * 2)
+        self.assertAlmostEqual(scores[0, 1].numpy(), (1 / vocab_size) / 2)
+
+        self.assertAlmostEqual(scores[1, 0].numpy(), (1 / vocab_size) / 2)
+        self.assertAlmostEqual(scores[1, 5].numpy(), (4 / vocab_size) / 2)
+
+    def test_no_repeat_ngram_dist_processor(self):
+        vocab_size = 3
+        batch_size = 2
+
+        input_ids = tf.constant([[1, 1, 2, 1], [0, 1, 0, 1]], dtype=tf.int32)
+        scores = self._get_uniform_logits(batch_size, vocab_size)
+
+        no_repeat_proc_2_gram = TFNoRepeatNGramLogitsProcessor(2)
+        no_repeat_proc_3_gram = TFNoRepeatNGramLogitsProcessor(3)
+
+        filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, tf.identity(scores))
+        filtered_scores_3_gram = no_repeat_proc_3_gram(input_ids, tf.identity(scores))
+
+        # 2-gram would forbid 2nd and 3rd token (1,2) at 1st batch and 1st token (0) at 2nd batch
+        self.assertListEqual(
+            tf.math.is_inf(filtered_scores_2_gram).numpy().tolist(), [[False, True, True], [True, False, False]]
+        )
+
+        # 3-gram would forbid no token at 1st batch and 1st token (0) at 2nd batch
+        self.assertListEqual(
tf.math.is_inf(filtered_scores_3_gram).numpy().tolist(), [[False, False, False], [True, False, False]] + ) + + def test_no_bad_words_dist_processor(self): + vocab_size = 5 + batch_size = 2 + eos_token_id = 4 + + input_ids = tf.constant([[0, 1, 3, 1], [0, 1, 0, 1]], dtype=tf.int32) + bad_word_tokens = [[1], [4], [1, 0], [0, 1, 2], [1, 3, 1, 3]] + scores = self._get_uniform_logits(batch_size, vocab_size) + + no_bad_words_dist_proc = TFNoBadWordsLogitsProcessor(bad_words_ids=bad_word_tokens, eos_token_id=eos_token_id) + + filtered_scores = no_bad_words_dist_proc(input_ids, tf.identity(scores)) + + # batch 1: 1st, 2nd, and 4th (0, 1, 3) token are forbidden + # batch 2: 1st, 2nd, and 3rd (0, 1, 2) token are forbidden + self.assertListEqual( + tf.math.is_inf(filtered_scores).numpy().tolist(), + [[True, True, False, True, True], [True, True, True, False, True]], + ) + + def test_processor_list(self): + batch_size = 4 + sequence_length = 10 + vocab_size = 15 + eos_token_id = 0 + + # dummy input_ids and scores + input_ids = ids_tensor((batch_size, sequence_length), vocab_size) + input_ids_comp = input_ids.clone() + + scores = self._get_uniform_logits(batch_size, vocab_size) + scores_comp = tf.identity(scores) + + # instantiate all dist processors + min_dist_proc = TFMinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) + rep_penalty_proc = TFRepetitionPenaltyLogitsProcessor(penalty=2.0) + no_repeat_proc = TFNoRepeatNGramLogitsProcessor(2) + no_bad_words_dist_proc = TFNoBadWordsLogitsProcessor(bad_words_ids=[[1]], eos_token_id=eos_token_id) + + # no processor list + scores = min_dist_proc(input_ids, scores) + scores = rep_penalty_proc(input_ids, scores) + scores = no_repeat_proc(input_ids, scores) + scores = no_bad_words_dist_proc(input_ids, scores) + + # with processor list + processor = TFLogitsProcessorList( + [ + min_dist_proc, + rep_penalty_proc, + no_repeat_proc, + no_bad_words_dist_proc, + ] + ) + scores_comp = processor(input_ids, scores_comp) + + # remove inf + scores = set_tensor_by_indices_to_value(scores, tf.math.is_inf(scores), -1e9) + scores_comp = set_tensor_by_indices_to_value(scores_comp, tf.math.is_inf(scores_comp), -1e9) + + # scores should be equal + tf.debugging.assert_near(scores, scores_comp, atol=1e-3) + + # input_ids should never be changed + self.assertListEqual(input_ids.cpu().numpy().tolist(), input_ids_comp.cpu().numpy().tolist()) diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index b100ac3ff924..662af0cbff5b 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -931,7 +931,7 @@ def test_lm_head_model_random_no_beam_search_generate(self): # num_return_sequences = 1 self._check_generated_ids(model.generate(do_sample=True, max_length=5)) - with self.assertRaises(AssertionError): + with self.assertRaises(ValueError): # generating multiple sequences when no beam search generation # is not allowed as it would always generate the same sequences model.generate(input_ids, do_sample=False, num_return_sequences=2) From 7b1b2cc0e5e64a30609fa86f2364a05929862ac6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 14 Feb 2022 08:46:55 +0100 Subject: [PATCH 14/30] correct logits processors --- tests/test_generation_tf_logits_process.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_generation_tf_logits_process.py b/tests/test_generation_tf_logits_process.py index 06ca571235c0..a679576bf0ae 100644 --- a/tests/test_generation_tf_logits_process.py +++ 
b/tests/test_generation_tf_logits_process.py @@ -19,7 +19,7 @@ from transformers import is_tf_available from transformers.testing_utils import require_tf -from .test_modeling_common import ids_tensor +from .test_modeling_tf_common import ids_tensor if is_tf_available(): @@ -138,7 +138,7 @@ def test_processor_list(self): # dummy input_ids and scores input_ids = ids_tensor((batch_size, sequence_length), vocab_size) - input_ids_comp = input_ids.clone() + input_ids_comp = tf.identity(input_ids) scores = self._get_uniform_logits(batch_size, vocab_size) scores_comp = tf.identity(scores) From 1a9e87027557bd31db74f749ea5bc55d11bfcbc6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 14 Feb 2022 10:17:49 +0100 Subject: [PATCH 15/30] correct past / encoder_outputs drama --- src/transformers/generation_tf_utils.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index 13e44a950a9a..a62f21d04666 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -1927,17 +1927,27 @@ def _prepare_model_inputs(self, inputs: Optional[tf.Tensor] = None, bos_token_id return inputs - @staticmethod def _update_model_kwargs_for_generation( - outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False + self, outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False ) -> Dict[str, Any]: # update past - if "past_key_values" in outputs: + if self._use_cache(outputs, model_kwargs["use_cache"]): + # TODO(Patrick): `past`/`encoder_outputs` hack. This should be + # removed when cleaning up the encoder-decoder models + # if model has past, then set the past variable to speed up decoding + # make this method static then as well + model_kwargs["past"] = outputs[1] + elif "past_key_values" in outputs: model_kwargs["past"] = outputs.past_key_values elif "mems" in outputs: model_kwargs["past"] = outputs.mems elif "past_buckets_states" in outputs: model_kwargs["past"] = outputs.past_buckets_states + elif "past" in model_kwargs: + # TODO(Patrick) `past`/`encoder_outputs` hack. + # removed when cleaning up the encoder-decoder models. + # The line should not be necessary. 
+ pass else: model_kwargs["past"] = None @@ -2100,7 +2110,7 @@ def greedy_search( cur_len = input_ids.shape[-1] while cur_len < max_length: - # TODO (Patrick): remove following line by cleaning up `prepare_inputs_for_generation` + # TODO(Patrick): remove following line by cleaning up `prepare_inputs_for_generation` # in all models model_kwargs["use_cache"] = None if "use_cache" not in model_kwargs else model_kwargs["use_cache"] From 385c24ffaba85d8378f468a5b73b5fe581408472 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 14 Feb 2022 10:34:25 +0100 Subject: [PATCH 16/30] refactor some methods --- .../generation_tf_logits_process.py | 9 +---- src/transformers/generation_tf_utils.py | 16 ++------- src/transformers/tf_utils.py | 33 +++++++++++++++++++ tests/test_generation_tf_logits_process.py | 7 +--- 4 files changed, 37 insertions(+), 28 deletions(-) create mode 100644 src/transformers/tf_utils.py diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py index d8c38a1a2f7b..92c12a364474 100644 --- a/src/transformers/generation_tf_logits_process.py +++ b/src/transformers/generation_tf_logits_process.py @@ -20,6 +20,7 @@ import numpy as np import tensorflow as tf +from .tf_utils import set_tensor_by_indices_to_value from .file_utils import add_start_docstrings from .utils.logging import get_logger @@ -47,14 +48,6 @@ """ -# TODO(Patrick) - this function is copied from `generation_tf_utils.py` -# it should be moved into a `tf_utils.py` file. -def set_tensor_by_indices_to_value(tensor, indices, value): - # create value_tensor since tensor value assignment is not possible in TF - value_tensor = tf.zeros_like(tensor) + value - return tf.where(indices, value_tensor, tensor) - - class TFLogitsProcessor(ABC): """Abstract base class for all logit processors that can be applied during generation.""" diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index a62f21d04666..34ac2e0aba02 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -22,6 +22,7 @@ import tensorflow as tf from .file_utils import ModelOutput +from .tf_utils import shape_list, set_tensor_by_indices_to_value from .generation_tf_logits_process import ( TFLogitsProcessorList, TFMinLengthLogitsProcessor, @@ -1832,7 +1833,7 @@ def _prepare_attention_mask_for_generation( if (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): return tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) else: - return tf.ones_like(input_ids) + return tf.ones(input_ids.shape[:2], dtype=tf.int32) def _prepare_encoder_decoder_kwargs_for_generation( self, input_ids: tf.Tensor, return_dict_in_generate, model_kwargs @@ -2340,12 +2341,6 @@ def scatter_values_on_batch_indices(values, batch_indices): return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape) -def set_tensor_by_indices_to_value(tensor, indices, value): - # create value_tensor since tensor value assignment is not possible in TF - value_tensor = tf.zeros_like(tensor) + value - return tf.where(indices, value_tensor, tensor) - - def sample_without_replacement(logits, num_samples): """ categorical sampling without replacement is currently not implemented the gumbel-max trick will do for now see @@ -2356,13 +2351,6 @@ def sample_without_replacement(logits, num_samples): return indices -def shape_list(x): - """Deal with dynamic shape in tensorflow cleanly.""" - static = x.shape.as_list() - dynamic = tf.shape(x) - 
return [dynamic[i] if s is None else s for i, s in enumerate(static)] - - class BeamHypotheses(object): def __init__(self, num_beams, max_length, length_penalty, early_stopping): """ diff --git a/src/transformers/tf_utils.py b/src/transformers/tf_utils.py new file mode 100644 index 000000000000..9899b8610a97 --- /dev/null +++ b/src/transformers/tf_utils.py @@ -0,0 +1,33 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tensorflow as tf + +from .utils import logging + + +logger = logging.get_logger(__name__) + + +def set_tensor_by_indices_to_value(tensor, indices, value): + # create value_tensor since tensor value assignment is not possible in TF + value_tensor = tf.zeros_like(tensor) + value + return tf.where(indices, value_tensor, tensor) + + +def shape_list(x): + """Deal with dynamic shape in tensorflow cleanly.""" + static = x.shape.as_list() + dynamic = tf.shape(x) + return [dynamic[i] if s is None else s for i, s in enumerate(static)] diff --git a/tests/test_generation_tf_logits_process.py b/tests/test_generation_tf_logits_process.py index a679576bf0ae..5beef35699e4 100644 --- a/tests/test_generation_tf_logits_process.py +++ b/tests/test_generation_tf_logits_process.py @@ -18,6 +18,7 @@ from transformers import is_tf_available from transformers.testing_utils import require_tf +from transformers.tf_utils import set_tensor_by_indices_to_value from .test_modeling_tf_common import ids_tensor @@ -34,12 +35,6 @@ ) -def set_tensor_by_indices_to_value(tensor, indices, value): - # create value_tensor since tensor value assignment is not possible in TF - value_tensor = tf.zeros_like(tensor) + value - return tf.where(indices, value_tensor, tensor) - - @require_tf class TFLogitsProcessorTest(unittest.TestCase): def _get_uniform_logits(self, batch_size: int, length: int): From bd750ffa98c96164de0402f6deaa00b5286f397c Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 14 Feb 2022 10:35:23 +0100 Subject: [PATCH 17/30] another fix --- tests/test_modeling_tf_speech_to_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_modeling_tf_speech_to_text.py b/tests/test_modeling_tf_speech_to_text.py index e34892bf126a..6253ccf9539d 100644 --- a/tests/test_modeling_tf_speech_to_text.py +++ b/tests/test_modeling_tf_speech_to_text.py @@ -474,7 +474,7 @@ def test_lm_head_model_random_no_beam_search_generate(self): # num_return_sequences = 1 self._check_generated_ids(model.generate(input_features, do_sample=True)) - with self.assertRaises(AssertionError): + with self.assertRaises(ValueError): # generating multiple sequences when no beam search generation # is not allowed as it would always generate the same sequences model.generate(input_features, do_sample=False, num_return_sequences=2) From 49e33b0b46c1f9d5e1586d25641e4acb6ececbd8 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 14 Feb 2022 10:42:02 +0100 Subject: [PATCH 18/30] refactor shape_list --- 
src/transformers/generation_tf_logits_process.py | 2 +- src/transformers/generation_tf_utils.py | 2 +- src/transformers/models/albert/modeling_tf_albert.py | 2 +- src/transformers/models/bart/modeling_tf_bart.py | 2 +- src/transformers/models/bert/modeling_tf_bert.py | 2 +- .../models/blenderbot/modeling_tf_blenderbot.py | 2 +- .../blenderbot_small/modeling_tf_blenderbot_small.py | 2 +- src/transformers/models/clip/modeling_tf_clip.py | 2 +- src/transformers/models/convbert/modeling_tf_convbert.py | 2 +- src/transformers/models/ctrl/modeling_tf_ctrl.py | 2 +- src/transformers/models/deberta/modeling_tf_deberta.py | 2 +- .../models/deberta_v2/modeling_tf_deberta_v2.py | 2 +- .../models/distilbert/modeling_tf_distilbert.py | 2 +- src/transformers/models/electra/modeling_tf_electra.py | 2 +- .../encoder_decoder/modeling_tf_encoder_decoder.py | 9 ++------- src/transformers/models/flaubert/modeling_tf_flaubert.py | 2 +- src/transformers/models/funnel/modeling_tf_funnel.py | 2 +- src/transformers/models/gpt2/modeling_tf_gpt2.py | 2 +- src/transformers/models/hubert/modeling_tf_hubert.py | 9 ++------- src/transformers/models/layoutlm/modeling_tf_layoutlm.py | 2 +- src/transformers/models/led/modeling_tf_led.py | 2 +- .../models/longformer/modeling_tf_longformer.py | 2 +- src/transformers/models/marian/modeling_tf_marian.py | 2 +- src/transformers/models/mbart/modeling_tf_mbart.py | 2 +- .../models/mobilebert/modeling_tf_mobilebert.py | 2 +- src/transformers/models/mpnet/modeling_tf_mpnet.py | 2 +- src/transformers/models/openai/modeling_tf_openai.py | 2 +- src/transformers/models/pegasus/modeling_tf_pegasus.py | 2 +- src/transformers/models/rembert/modeling_tf_rembert.py | 2 +- src/transformers/models/roberta/modeling_tf_roberta.py | 2 +- src/transformers/models/roformer/modeling_tf_roformer.py | 2 +- .../models/speech_to_text/modeling_tf_speech_to_text.py | 2 +- src/transformers/models/t5/modeling_tf_t5.py | 2 +- src/transformers/models/tapas/modeling_tf_tapas.py | 2 +- .../models/transfo_xl/modeling_tf_transfo_xl.py | 2 +- .../modeling_tf_vision_encoder_decoder.py | 9 ++------- src/transformers/models/vit/modeling_tf_vit.py | 2 +- src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py | 9 ++------- src/transformers/models/xlm/modeling_tf_xlm.py | 2 +- src/transformers/models/xlnet/modeling_tf_xlnet.py | 2 +- .../modeling_tf_{{cookiecutter.lowercase_modelname}}.py | 4 ++-- tests/test_modeling_tf_gpt2.py | 2 +- 42 files changed, 47 insertions(+), 67 deletions(-) diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py index 92c12a364474..9d5aa3899eeb 100644 --- a/src/transformers/generation_tf_logits_process.py +++ b/src/transformers/generation_tf_logits_process.py @@ -20,8 +20,8 @@ import numpy as np import tensorflow as tf -from .tf_utils import set_tensor_by_indices_to_value from .file_utils import add_start_docstrings +from .tf_utils import set_tensor_by_indices_to_value from .utils.logging import get_logger diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index 34ac2e0aba02..9e81188b5ab8 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -22,7 +22,6 @@ import tensorflow as tf from .file_utils import ModelOutput -from .tf_utils import shape_list, set_tensor_by_indices_to_value from .generation_tf_logits_process import ( TFLogitsProcessorList, TFMinLengthLogitsProcessor, @@ -30,6 +29,7 @@ TFNoRepeatNGramLogitsProcessor, 
TFRepetitionPenaltyLogitsProcessor, ) +from .tf_utils import set_tensor_by_indices_to_value, shape_list from .utils import logging diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index f2659e817a95..42f1e5b34dff 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -51,8 +51,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_albert import AlbertConfig diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index b9abc647abd6..058fdb99f298 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -44,8 +44,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_bart import BartConfig diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 7d7d431c7ec2..bf5ddb365b87 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -57,8 +57,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_bert import BertConfig diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py index 6d50492062f2..65135a1d0796 100644 --- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -46,8 +46,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_blenderbot import BlenderbotConfig diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py index fdf0c63c0ae7..0243030a4301 100644 --- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -44,8 +44,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_blenderbot_small import BlenderbotSmallConfig diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py index 3a1621ba9d9c..4902248b2567 100644 --- a/src/transformers/models/clip/modeling_tf_clip.py +++ b/src/transformers/models/clip/modeling_tf_clip.py @@ -39,8 +39,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py index 84967b5fba1c..0c4d265dcd75 100644 --- a/src/transformers/models/convbert/modeling_tf_convbert.py +++ b/src/transformers/models/convbert/modeling_tf_convbert.py @@ -43,8 +43,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import 
shape_list from ...utils import logging from .configuration_convbert import ConvBertConfig diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py index acfce53c8a75..c72448310a85 100644 --- a/src/transformers/models/ctrl/modeling_tf_ctrl.py +++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py @@ -30,8 +30,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_ctrl import CTRLConfig diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py index 25a6c07d42cc..0d36de4895a8 100644 --- a/src/transformers/models/deberta/modeling_tf_deberta.py +++ b/src/transformers/models/deberta/modeling_tf_deberta.py @@ -39,8 +39,8 @@ TFTokenClassificationLoss, get_initializer, input_processing, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_deberta import DebertaConfig diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index 1a8f8c94ba95..445cb76256bb 100644 --- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -38,8 +38,8 @@ TFTokenClassificationLoss, get_initializer, input_processing, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_deberta_v2 import DebertaV2Config diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py index 05da8b306179..86a814a749bd 100644 --- a/src/transformers/models/distilbert/modeling_tf_distilbert.py +++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -45,8 +45,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_distilbert import DistilBertConfig diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py index f24b003b6012..68c639de91be 100644 --- a/src/transformers/models/electra/modeling_tf_electra.py +++ b/src/transformers/models/electra/modeling_tf_electra.py @@ -50,8 +50,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_electra import ElectraConfig diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py index 8ba4ae31b83a..a2668b75b117 100644 --- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py @@ -30,13 +30,8 @@ replace_return_docstrings, ) from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput -from ...modeling_tf_utils import ( - TFCausalLanguageModelingLoss, - TFPreTrainedModel, - get_initializer, - input_processing, - shape_list, -) +from ...modeling_tf_utils import TFCausalLanguageModelingLoss, TFPreTrainedModel, get_initializer, input_processing +from ...tf_utils import shape_list from ...utils import logging from ..auto.configuration_auto import AutoConfig from ..auto.modeling_tf_auto import TFAutoModel, TFAutoModelForCausalLM diff --git 
a/src/transformers/models/flaubert/modeling_tf_flaubert.py b/src/transformers/models/flaubert/modeling_tf_flaubert.py index 87c1c7e6b042..c681277a8076 100644 --- a/src/transformers/models/flaubert/modeling_tf_flaubert.py +++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py @@ -38,8 +38,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from ..xlm.modeling_tf_xlm import ( TFXLMForMultipleChoice, diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index b3d9a8506eb7..9b4b6e7083ca 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -47,8 +47,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_funnel import FunnelConfig diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py index ab32cc0e8314..d4939594d5ea 100644 --- a/src/transformers/models/gpt2/modeling_tf_gpt2.py +++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py @@ -44,8 +44,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_gpt2 import GPT2Config diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index 548ea5e3856e..936f2ab0dc22 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -28,13 +28,8 @@ replace_return_docstrings, ) from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput -from ...modeling_tf_utils import ( - TFPreTrainedModel, - booleans_processing, - get_initializer, - keras_serializable, - shape_list, -) +from ...modeling_tf_utils import TFPreTrainedModel, booleans_processing, get_initializer, keras_serializable +from ...tf_utils import shape_list from ...tokenization_utils_base import BatchEncoding from ...utils import logging from .configuration_hubert import HubertConfig diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py index dbc9b21b0bda..6f308835007e 100644 --- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py @@ -39,8 +39,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_layoutlm import LayoutLMConfig diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index 924a62f7d99f..e282db0e811f 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -39,8 +39,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_led import LEDConfig diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index da34d11b80b1..458133a9b463 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -38,8 +38,8 @@ get_initializer, 
input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_longformer import LongformerConfig diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py index be9be08fb171..ba094d6a0a7e 100644 --- a/src/transformers/models/marian/modeling_tf_marian.py +++ b/src/transformers/models/marian/modeling_tf_marian.py @@ -45,8 +45,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_marian import MarianConfig diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py index f98408f8e1e7..59e41bd69489 100644 --- a/src/transformers/models/mbart/modeling_tf_mbart.py +++ b/src/transformers/models/mbart/modeling_tf_mbart.py @@ -44,8 +44,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_mbart import MBartConfig diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index 928e7e8b1619..9b16c79f18e6 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -51,8 +51,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_mobilebert import MobileBertConfig diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index 0ed54a2ab1ca..196a47b1fb83 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -47,8 +47,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_mpnet import MPNetConfig diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index a924fb40231c..cb680603a1df 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -39,8 +39,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_openai import OpenAIGPTConfig diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py index 86f922e7bbc6..cb1468740624 100644 --- a/src/transformers/models/pegasus/modeling_tf_pegasus.py +++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py @@ -45,8 +45,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_pegasus import PegasusConfig diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py index 9bf6ba6edeeb..24a6387cd7c3 100644 --- a/src/transformers/models/rembert/modeling_tf_rembert.py +++ b/src/transformers/models/rembert/modeling_tf_rembert.py @@ -51,8 +51,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from 
.configuration_rembert import RemBertConfig diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index 9aeb0a1eef58..b74863fb2079 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -52,8 +52,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_roberta import RobertaConfig diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index 57a40a290587..393114df01ff 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -51,8 +51,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_roformer import RoFormerConfig diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index 7c69684e0611..0eba94521d25 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -39,8 +39,8 @@ TFSharedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_speech_to_text import Speech2TextConfig diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py index 5b030342ff56..ca307df70ebc 100644 --- a/src/transformers/models/t5/modeling_tf_t5.py +++ b/src/transformers/models/t5/modeling_tf_t5.py @@ -44,8 +44,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_t5 import T5Config diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py index cdb7e8c113e9..46baba262798 100644 --- a/src/transformers/models/tapas/modeling_tf_tapas.py +++ b/src/transformers/models/tapas/modeling_tf_tapas.py @@ -45,8 +45,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_tapas import TapasConfig diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py index ab8fb6f11b73..f1e23f77ec61 100644 --- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py @@ -34,8 +34,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_transfo_xl import TransfoXLConfig from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask diff --git a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py index 06bcbf7c4b97..244c836b8c3f 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py @@ -30,13 +30,8 @@ replace_return_docstrings, ) from 
...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput -from ...modeling_tf_utils import ( - TFCausalLanguageModelingLoss, - TFPreTrainedModel, - get_initializer, - input_processing, - shape_list, -) +from ...modeling_tf_utils import TFCausalLanguageModelingLoss, TFPreTrainedModel, get_initializer, input_processing +from ...tf_utils import shape_list from ...utils import logging from ..auto.configuration_auto import AutoConfig from ..auto.modeling_tf_auto import TFAutoModel, TFAutoModelForCausalLM diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py index b1e027c96482..9a7025c662d7 100644 --- a/src/transformers/models/vit/modeling_tf_vit.py +++ b/src/transformers/models/vit/modeling_tf_vit.py @@ -32,8 +32,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_vit import ViTConfig diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index 6c079fcbf268..6ef3a3f98d02 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -30,13 +30,8 @@ replace_return_docstrings, ) from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput -from ...modeling_tf_utils import ( - TFPreTrainedModel, - booleans_processing, - get_initializer, - keras_serializable, - shape_list, -) +from ...modeling_tf_utils import TFPreTrainedModel, booleans_processing, get_initializer, keras_serializable +from ...tf_utils import shape_list from ...tokenization_utils_base import BatchEncoding from ...utils import logging from .configuration_wav2vec2 import Wav2Vec2Config diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py index 6d6ff088ec0e..1554fa3103b2 100644 --- a/src/transformers/models/xlm/modeling_tf_xlm.py +++ b/src/transformers/models/xlm/modeling_tf_xlm.py @@ -50,8 +50,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_xlm import XLMConfig diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py index c31b82d7862c..ea0f6b6baf84 100644 --- a/src/transformers/models/xlnet/modeling_tf_xlnet.py +++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py @@ -44,8 +44,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_xlnet import XLNetConfig diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 37b62d5772c8..5bcc5df54c5f 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -53,7 +53,7 @@ get_initializer, input_processing, keras_serializable, - shape_list, +); from ...tf_utils import (shape_list, ) from ...utils import logging from .configuration_{{cookiecutter.lowercase_modelname}} import 
{{cookiecutter.camelcase_modelname}}Config @@ -1803,7 +1803,7 @@ def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAn TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, +); from ...tf_utils import (shape_list, ) from ...utils import logging from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config diff --git a/tests/test_modeling_tf_gpt2.py b/tests/test_modeling_tf_gpt2.py index dea63a786ba6..a3c2be89ebc3 100644 --- a/tests/test_modeling_tf_gpt2.py +++ b/tests/test_modeling_tf_gpt2.py @@ -33,8 +33,8 @@ TFGPT2ForSequenceClassification, TFGPT2LMHeadModel, TFGPT2Model, - shape_list, ) + from transformers.tf_utils import shape_list class TFGPT2ModelTester: From 4b2460d86cc8ea4c30778fca30058b409f7af590 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 14 Feb 2022 10:49:38 +0100 Subject: [PATCH 19/30] fix more shape list --- src/transformers/modeling_tf_utils.py | 23 --------------- .../modeling_tf_transfo_xl_utilities.py | 2 +- src/transformers/tf_utils.py | 29 +++++++++++++++---- ...tf_{{cookiecutter.lowercase_modelname}}.py | 2 +- tests/test_modeling_tf_longformer.py | 9 +----- 5 files changed, 27 insertions(+), 38 deletions(-) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 428af92b2b3a..23071a02971d 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -2047,29 +2047,6 @@ def register_for_auto_class(cls, auto_class="TFAutoModel"): cls._auto_class = auto_class -def shape_list(tensor: Union[tf.Tensor, np.ndarray]) -> List[int]: - """ - Deal with dynamic shape in tensorflow cleanly. - - Args: - tensor (`tf.Tensor` or `np.ndarray`): The tensor we want the shape of. - - Returns: - `List[int]`: The shape of the tensor as a list. - """ - if isinstance(tensor, np.ndarray): - return list(tensor.shape) - - dynamic = tf.shape(tensor) - - if tensor.shape == tf.TensorShape(None): - return dynamic - - static = tensor.shape.as_list() - - return [dynamic[i] if s is None else s for i, s in enumerate(static)] - - def get_initializer(initializer_range: float = 0.02) -> tf.initializers.TruncatedNormal: """ Creates a `tf.initializers.TruncatedNormal` with the given range. diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py index 53eb8239a5a4..af95f348ec28 100644 --- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py +++ b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py @@ -20,7 +20,7 @@ import tensorflow as tf -from ...modeling_tf_utils import shape_list +from ...tf_utils import shape_list class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): diff --git a/src/transformers/tf_utils.py b/src/transformers/tf_utils.py index 9899b8610a97..7b086fbaed64 100644 --- a/src/transformers/tf_utils.py +++ b/src/transformers/tf_utils.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import List, Union
+
+import numpy as np
 import tensorflow as tf
 
 from .utils import logging
@@ -20,14 +23,30 @@
 logger = logging.get_logger(__name__)
 
 
-def set_tensor_by_indices_to_value(tensor, indices, value):
+def set_tensor_by_indices_to_value(tensor: tf.Tensor, indices: tf.Tensor, value: Union[tf.Tensor, int, float]):
     # create value_tensor since tensor value assignment is not possible in TF
     value_tensor = tf.zeros_like(tensor) + value
     return tf.where(indices, value_tensor, tensor)
 
 
-def shape_list(x):
-    """Deal with dynamic shape in tensorflow cleanly."""
-    static = x.shape.as_list()
-    dynamic = tf.shape(x)
+def shape_list(tensor: Union[tf.Tensor, np.ndarray]) -> List[int]:
+    """
+    Deal with dynamic shape in tensorflow cleanly.
+
+    Args:
+        tensor (`tf.Tensor` or `np.ndarray`): The tensor we want the shape of.
+
+    Returns:
+        `List[int]`: The shape of the tensor as a list.
+    """
+    if isinstance(tensor, np.ndarray):
+        return list(tensor.shape)
+
+    dynamic = tf.shape(tensor)
+
+    if tensor.shape == tf.TensorShape(None):
+        return dynamic
+
+    static = tensor.shape.as_list()
+
     return [dynamic[i] if s is None else s for i, s in enumerate(static)]
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
index 5bcc5df54c5f..3dbe073e683d 100644
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
@@ -53,8 +53,8 @@
     get_initializer,
     input_processing,
     keras_serializable,
-); from ...tf_utils import (shape_list, )
+)
+from ...tf_utils import shape_list
 from ...utils import logging
 from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config
diff --git a/tests/test_modeling_tf_longformer.py b/tests/test_modeling_tf_longformer.py
index b88437a1373f..be96de22afa0 100644
--- a/tests/test_modeling_tf_longformer.py
+++ b/tests/test_modeling_tf_longformer.py
@@ -36,14 +36,7 @@
         TFLongformerModel,
         TFLongformerSelfAttention,
     )
-
-    def shape_list(x):
-        """
-        copied from transformers.modeling_tf_utils
-        """
-        static = x.shape.as_list()
-        dynamic = tf.shape(x)
-        return [dynamic[i] if s is None else s for i, s in enumerate(static)]
+    from transformers.tf_utils import shape_list
 
 
 class TFLongformerModelTester:

From ed5f2ff5250ed37dd431425fba1e3e28891e71e7 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Mon, 14 Feb 2022 11:01:10 +0100
Subject: [PATCH 20/30] import shape_list

---
 src/transformers/generation_tf_logits_process.py | 6 +++---
 src/transformers/modeling_tf_utils.py            | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py
index 9d5aa3899eeb..3ef96b61cf0e 100644
--- a/src/transformers/generation_tf_logits_process.py
+++ b/src/transformers/generation_tf_logits_process.py
@@ -119,7 +119,7 @@ def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor:
 
 class TFRepetitionPenaltyLogitsProcessor(TFLogitsProcessor):
     r"""
-    [`LogitsProcessor`] enforcing an exponential penalty on repeated sequences.
+    [`TFLogitsProcessor`] enforcing an exponential penalty on repeated sequences.
Args: repetition_penalty (`float`): @@ -160,7 +160,7 @@ def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: class TFNoBadWordsLogitsProcessor(TFLogitsProcessor): """ - [`LogitsProcessor`] that enforces that specified sequences will never be sampled. + [`TFLogitsProcessor`] that enforces that specified sequences will never be sampled. Args: bad_words_ids (`List[List[int]]`): @@ -243,7 +243,7 @@ def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: class TFNoRepeatNGramLogitsProcessor(TFLogitsProcessor): r""" - [`LogitsProcessor`] that enforces no repetition of n-grams. See + [`TFLogitsProcessor`] that enforces no repetition of n-grams. See [Fairseq](https://github.com/pytf/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345). Args: diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 23071a02971d..eeea20679730 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -54,6 +54,7 @@ ) from .generation_tf_utils import TFGenerationMixin from .modeling_tf_outputs import TFSeq2SeqLMOutput +from .tf_utils import shape_list from .tokenization_utils_base import BatchEncoding from .utils import logging From dd1c21409fec1783a94ce2cf97accfa8a5fb407b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 14 Feb 2022 11:16:10 +0100 Subject: [PATCH 21/30] finish docs --- docs/source/internal/generation_utils.mdx | 18 ++++++++++++++ src/transformers/__init__.py | 17 +++++++++++++ src/transformers/utils/dummy_tf_objects.py | 28 ++++++++++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/docs/source/internal/generation_utils.mdx b/docs/source/internal/generation_utils.mdx index 9eb4abe06d34..5ee321c0a442 100644 --- a/docs/source/internal/generation_utils.mdx +++ b/docs/source/internal/generation_utils.mdx @@ -148,6 +148,24 @@ generation. 
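
For readers skimming the autodoc entries added below: the new TF processors are callables over `(input_ids, scores)` that rewrite next-token logits, and they are meant to be chained through `TFLogitsProcessorList`. A hedged usage sketch follows — the constructor arguments are assumed to mirror the PyTorch counterparts and are not spelled out in this diff:

import tensorflow as tf

from transformers import (
    TFLogitsProcessorList,
    TFMinLengthLogitsProcessor,
    TFRepetitionPenaltyLogitsProcessor,
)

input_ids = tf.constant([[5, 3], [7, 7]], dtype=tf.int32)  # two sequences of length 2
scores = tf.random.normal((2, 10))  # next-token logits over a 10-token vocabulary

processors = TFLogitsProcessorList(
    [
        TFMinLengthLogitsProcessor(5, 0),  # assumed signature: min_length=5, eos_token_id=0
        TFRepetitionPenaltyLogitsProcessor(1.3),  # assumed signature: penalty=1.3
    ]
)

# Each processor rewrites the scores in turn before sampling/argmax.
scores = processors(input_ids, scores)
print(scores[:, 0])  # the EOS column stays masked while cur_len < min_length
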
[[autodoc]] InfNanRemoveLogitsProcessor - __call__ +[[autodoc]] TFLogitsProcessor + - __call__ + +[[autodoc]] TFLogitsProcessorList + - __call__ + +[[autodoc]] TFMinLengthLogitsProcessor + - __call__ + +[[autodoc]] TFNoBadWordsLogitsProcessor + - __call__ + +[[autodoc]] TFNoRepeatNGramLogitsProcessor + - __call__ + +[[autodoc]] TFRepetitionPenaltyLogitsProcessor + - __call__ + [[autodoc]] FlaxLogitsProcessor - __call__ diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index f4b0e2908b61..0bd62c72af53 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1592,6 +1592,14 @@ _import_structure["activations_tf"] = [] _import_structure["benchmark.benchmark_args_tf"] = ["TensorFlowBenchmarkArguments"] _import_structure["benchmark.benchmark_tf"] = ["TensorFlowBenchmark"] + _import_structure["generation_tf_logits_process"] = [ + "TFLogitsProcessor", + "TFLogitsProcessorList", + "TFMinLengthLogitsProcessor", + "TFNoBadWordsLogitsProcessor", + "TFNoRepeatNGramLogitsProcessor", + "TFRepetitionPenaltyLogitsProcessor", + ] _import_structure["generation_tf_utils"] = ["tf_top_k_top_p_filtering"] _import_structure["keras_callbacks"] = ["KerasMetricCallback", "PushToHubCallback"] _import_structure["modeling_tf_outputs"] = [] @@ -2264,6 +2272,7 @@ "FlaxXGLMPreTrainedModel", ] ) + _import_structure["tf_utils"] = [] else: from .utils import dummy_flax_objects @@ -3572,6 +3581,14 @@ # Benchmarks from .benchmark.benchmark_tf import TensorFlowBenchmark + from .generation_tf_logits_process import ( + TFLogitsProcessor, + TFLogitsProcessorList, + TFMinLengthLogitsProcessor, + TFNoBadWordsLogitsProcessor, + TFNoRepeatNGramLogitsProcessor, + TFRepetitionPenaltyLogitsProcessor, + ) from .generation_tf_utils import tf_top_k_top_p_filtering from .keras_callbacks import KerasMetricCallback, PushToHubCallback from .modeling_tf_layoutlm import ( diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 02b401ef394e..6a766d93b27c 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -17,6 +17,34 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) +class TFMinLengthLogitsProcessor(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFNoBadWordsLogitsProcessor(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFNoRepeatNGramLogitsProcessor(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFRepetitionPenaltyLogitsProcessor(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + def tf_top_k_top_p_filtering(*args, **kwargs): requires_backends(tf_top_k_top_p_filtering, ["tf"]) From 0c7d049e4da76718989c569262054d18b3f8b4cb Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 14 Feb 2022 11:24:46 +0100 Subject: [PATCH 22/30] fix imports --- src/transformers/generation_tf_utils.py | 2 +- tests/test_generation_tf_logits_process.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index 9e81188b5ab8..98c755e6fc44 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -1924,7 +1924,7 @@ def 
_prepare_model_inputs(self, inputs: Optional[tf.Tensor] = None, bos_token_id "you should either supply a context to complete as `input_ids` input " "or a `bos_token_id` (integer >= 0) as a first token to start the generation." ) - return tf.fill((1, 1), bos_token_id, dtype=tf.int32) + return tf.cast(tf.fill((1, 1), bos_token_id), dtype=tf.int32) return inputs diff --git a/tests/test_generation_tf_logits_process.py b/tests/test_generation_tf_logits_process.py index 5beef35699e4..73aca4ca8a50 100644 --- a/tests/test_generation_tf_logits_process.py +++ b/tests/test_generation_tf_logits_process.py @@ -18,9 +18,6 @@ from transformers import is_tf_available from transformers.testing_utils import require_tf -from transformers.tf_utils import set_tensor_by_indices_to_value - -from .test_modeling_tf_common import ids_tensor if is_tf_available(): @@ -33,6 +30,9 @@ TFNoRepeatNGramLogitsProcessor, TFRepetitionPenaltyLogitsProcessor, ) + from transformers.tf_utils import set_tensor_by_indices_to_value + + from .test_modeling_tf_common import ids_tensor @require_tf From 726355e6cbcb91ce0ab11bf99766469eff474527 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 14 Feb 2022 11:25:25 +0100 Subject: [PATCH 23/30] make style --- src/transformers/utils/dummy_tf_objects.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 6a766d93b27c..6bba825a8897 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -17,6 +17,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) +class TFLogitsProcessor(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFLogitsProcessorList(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + class TFMinLengthLogitsProcessor(metaclass=DummyObject): _backends = ["tf"] From 6293862619adbff6ec1b0d4cbb2f22c23840a4c7 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 14 Feb 2022 11:35:21 +0100 Subject: [PATCH 24/30] correct tf utils --- src/transformers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0bd62c72af53..ad05486104ee 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2054,6 +2054,7 @@ ] ) _import_structure["optimization_tf"] = ["AdamWeightDecay", "GradientAccumulator", "WarmUp", "create_optimizer"] + _import_structure["tf_utils"] = [] _import_structure["trainer_tf"] = ["TFTrainer"] else: @@ -2272,7 +2273,6 @@ "FlaxXGLMPreTrainedModel", ] ) - _import_structure["tf_utils"] = [] else: from .utils import dummy_flax_objects From b2934ee1ad636deecb1d4d2b52b50332b69f143e Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 14 Feb 2022 11:37:30 +0000 Subject: [PATCH 25/30] Fix TFRag as well --- .../models/rag/modeling_tf_rag.py | 41 +++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py index 4059b09cd80b..7ea2d3521b61 100644 --- a/src/transformers/models/rag/modeling_tf_rag.py +++ b/src/transformers/models/rag/modeling_tf_rag.py @@ -1269,6 +1269,8 @@ def generate( ) if return_dict_in_generate: + # TODO(Patrick): `encoder_outputs`, `past` hack. 
+            # Remove after cleaning encoder-decoder outputs
             if output_attentions:
                 model_kwargs["encoder_attentions"] = encoder_outputs.attentions
             if output_hidden_states:
@@ -1350,28 +1352,35 @@ def extend_enc_output(tensor, num_beams=None):
                 **model_kwargs,  # encoder_outputs is here as in Pytorch's version
             )
         else:
-            return self._generate_no_beam_search(
-                decoder_input_ids,
-                cur_len=cur_len,
-                max_length=max_length,
-                min_length=min_length,
-                do_sample=do_sample,
-                temperature=temperature,
-                top_k=top_k,
-                top_p=top_p,
+            pre_processor = self._get_logits_processor(
                 repetition_penalty=repetition_penalty,
                 no_repeat_ngram_size=no_repeat_ngram_size,
                 bad_words_ids=bad_words_ids,
+                min_length=min_length,
+                eos_token_id=eos_token_id,
+            )
+            # TODO(Patrick) clean-up once generate is fully cleaned up
+            model_kwargs["attention_mask"] = context_attention_mask
+            # TODO(Patrick) remove once generate is fully cleaned up
+            model_kwargs.pop("output_hidden_states", None)
+            model_kwargs.pop("output_attentions", None)
+            model_kwargs.pop("output_scores", None)
+
+            # TODO(Patrick): `encoder_outputs`, `past` hack.
+            # Remove after cleaning encoder-decoder outputs
+            model_kwargs["past"] = encoder_outputs
+
+            return self.greedy_search(
+                input_ids=decoder_input_ids,
+                max_length=max_length,
                 pad_token_id=pad_token_id,
                 eos_token_id=eos_token_id,
-                batch_size=batch_size,
-                vocab_size=vocab_size,
-                attention_mask=context_attention_mask,
-                use_cache=use_cache,
-                forced_bos_token_id=None,
-                forced_eos_token_id=None,
+                logits_processor=pre_processor,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                output_scores=output_scores,
                 return_dict_in_generate=return_dict_in_generate,
-                **model_kwargs,  # encoder_outputs is here as in Pytorch's version
+                **model_kwargs,
             )
 
     def get_input_embeddings(self):

From 4f6d927b095b8a0cad3cee41754f57327fdaf0da Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 15 Feb 2022 12:59:33 +0100
Subject: [PATCH 26/30] Apply Lysandre's and Sylvain's suggestions

---
 .../generation_flax_logits_process.py         |  5 +-
 src/transformers/generation_logits_process.py |  5 +-
 .../generation_tf_logits_process.py           |  5 +-
 src/transformers/generation_tf_utils.py       | 70 +++++++++----------
 tests/test_modeling_tf_gpt2.py                | 56 +++------------
 5 files changed, 49 insertions(+), 92 deletions(-)

diff --git a/src/transformers/generation_flax_logits_process.py b/src/transformers/generation_flax_logits_process.py
index 1d6695341343..76a09ed012dd 100644
--- a/src/transformers/generation_flax_logits_process.py
+++ b/src/transformers/generation_flax_logits_process.py
@@ -14,7 +14,6 @@
 # limitations under the License.
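
A note on the design choice applied in the hunks below: dropping `ABC` means the base processor classes can now be instantiated, but the abstract contract is still enforced at call time because the base `__call__` raises. A minimal sketch of the resulting pattern (class names here are illustrative, not from the diff):

class BaseProcessorSketch:
    def __call__(self, input_ids, scores):
        raise NotImplementedError(f"{self.__class__} is an abstract class. Only classes inheriting this class can be called.")

class DoubleScores(BaseProcessorSketch):
    def __call__(self, input_ids, scores):
        return [2 * s for s in scores]  # concrete override

print(DoubleScores()([1, 2], [0.5, 1.5]))  # [1.0, 3.0]

try:
    BaseProcessorSketch()([1, 2], [0.5, 1.5])  # instantiation succeeds; the call is what raises
except NotImplementedError as err:
    print(err)
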
import inspect -from abc import ABC import jax import jax.lax as lax @@ -48,7 +47,7 @@ """ -class FlaxLogitsProcessor(ABC): +class FlaxLogitsProcessor: """Abstract base class for all logit processors that can be applied during generation.""" @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) @@ -59,7 +58,7 @@ def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray) -> jnp.ndarray: ) -class FlaxLogitsWarper(ABC): +class FlaxLogitsWarper: """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling.""" @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py index ad79273502e9..573ecd1a0a29 100644 --- a/src/transformers/generation_logits_process.py +++ b/src/transformers/generation_logits_process.py @@ -15,7 +15,6 @@ import inspect import math -from abc import ABC from typing import Callable, Iterable, List, Optional import numpy as np @@ -49,7 +48,7 @@ """ -class LogitsProcessor(ABC): +class LogitsProcessor: """Abstract base class for all logit processors that can be applied during generation.""" @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) @@ -60,7 +59,7 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to ) -class LogitsWarper(ABC): +class LogitsWarper: """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling.""" @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py index 3ef96b61cf0e..56eed92184bf 100644 --- a/src/transformers/generation_tf_logits_process.py +++ b/src/transformers/generation_tf_logits_process.py @@ -14,7 +14,6 @@ # limitations under the License. import inspect -from abc import ABC from typing import List import numpy as np @@ -48,7 +47,7 @@ """ -class TFLogitsProcessor(ABC): +class TFLogitsProcessor: """Abstract base class for all logit processors that can be applied during generation.""" @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING) @@ -244,7 +243,7 @@ def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: class TFNoRepeatNGramLogitsProcessor(TFLogitsProcessor): r""" [`TFLogitsProcessor`] that enforces no repetition of n-grams. See - [Fairseq](https://github.com/pytf/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345). + [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345). Args: ngram_size (`int`): diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index 98c755e6fc44..b8d4746fe2e6 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -1621,7 +1621,7 @@ def _generate( [What are attention masks?](../glossary#attention-mask) decoder_start_token_id (`int`, *optional*): If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token. - use_cache: (`bool`, *optional*, defaults to `True`): + use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding. 
output_attentions (`bool`, *optional*, defaults to `False`): @@ -1667,59 +1667,50 @@ def _generate( ```python tokenizer = AutoTokenizer.from_pretrained("distilgpt2") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "distilgpt2" - ) # Download model and configuration from huggingface.co and cache. - outputs = model.generate(max_length=40) # do greedy decoding + model = TFAutoModelWithLMHead.from_pretrained("distilgpt2") + # Greedy decoding + outputs = model.generate(max_length=40) print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}") - tokenizer = AutoTokenizer.from_pretrained("openai-gpt") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "openai-gpt" - ) # Download model and configuration from huggingface.co and cache. + tokenizer = AutoTokenizer.from_pretrained("openai-gpt") + model = TFAutoModelWithLMHead.from_pretrained("openai-gpt") input_context = "The dog" input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context - outputs = model.generate( - input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5 - ) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' - for i in range(3): # 3 output sequences were generated + # Generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' + outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) + # 3 output sequences were generated + for i in range(3): print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") - tokenizer = AutoTokenizer.from_pretrained("distilgpt2") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "distilgpt2" - ) # Download model and configuration from huggingface.co and cache. + tokenizer = AutoTokenizer.from_pretrained("distilgpt2") + model = TFAutoModelWithLMHead.from_pretrained("distilgpt2") input_context = "The dog" - input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context + input_ids = tokenizer.encode(input_context, return_tensors="tf") + # Generate 3 candidates using sampling outputs = model.generate( input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True - ) # generate 3 candidates using sampling - for i in range(3): # 3 output sequences were generated + ) + # 3 output sequences were generated + for i in range(3): print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") - tokenizer = AutoTokenizer.from_pretrained("ctrl") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "ctrl" - ) # Download model and configuration from huggingface.co and cache. 
- input_context = "Legal My neighbor is" # "Legal" is one of the control codes for ctrl - input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context - outputs = model.generate( - input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2 - ) # generate sequences + tokenizer = AutoTokenizer.from_pretrained("ctrl") + model = TFAutoModelWithLMHead.from_pretrained("ctrl") + # "Legal" is one of the control codes for ctrl + input_context = "Legal My neighbor is" + input_ids = tokenizer.encode(input_context, return_tensors="tf") + outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}") - tokenizer = AutoTokenizer.from_pretrained("gpt2") # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained( - "gpt2" - ) # Download model and configuration from huggingface.co and cache. + tokenizer = AutoTokenizer.from_pretrained("gpt2") + model = TFAutoModelWithLMHead.from_pretrained("gpt2") input_context = "My cute dog" bad_words_ids = [ tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ["idiot", "stupid", "shut up"] ] - input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context - outputs = model.generate( - input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids - ) # generate sequences without allowing bad_words to be generated + input_ids = tokenizer.encode(input_context, return_tensors="tf") + # generate sequences without allowing bad_words to be generated + outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) ```""" # 1. Set generation parameters if not already defined max_length = max_length if max_length is not None else self.config.max_length @@ -1787,7 +1778,10 @@ def _generate( if input_ids.shape[-1] >= max_length: raise ValueError( - f"The context has {input_ids.shape[-1]} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" + f"The context has {input_ids.shape[-1]} number of tokens, " + f"but `max_length` is only {max_length}. " + "Please make sure that `max_length` is bigger than the number of tokens, " + "by setting either `generate(max_length=...,...)` or `config.max_length = ...`" ) # 5. 
determine generation mode diff --git a/tests/test_modeling_tf_gpt2.py b/tests/test_modeling_tf_gpt2.py index a3c2be89ebc3..4f66ec89f4e2 100644 --- a/tests/test_modeling_tf_gpt2.py +++ b/tests/test_modeling_tf_gpt2.py @@ -432,28 +432,11 @@ class TFGPT2ModelLanguageGenerationTest(unittest.TestCase): def test_lm_generate_distilgpt2(self): model = TFGPT2LMHeadModel.from_pretrained("distilgpt2") input_ids = tf.convert_to_tensor([[464, 1893]], dtype=tf.int32) # The president - expected_output_ids = [ - 464, - 1893, - 286, - 262, - 1578, - 1829, - 11, - 290, - 262, - 1893, - 286, - 262, - 1578, - 7526, - 11, - 423, - 587, - 287, - 262, - 2635, - ] # The president of the United States, and the president of the United Kingdom, have been in the White + + # The president of the United States, and the president of the United Kingdom, have been in the White + # fmt: off + expected_output_ids = [464, 1893, 286, 262, 1578, 1829, 11, 290, 262, 1893, 286, 262, 1578, 7526, 11, 423, 587, 287, 262, 2635] + # fmt: on output_ids = model.generate(input_ids, do_sample=False) self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) @@ -486,30 +469,13 @@ def test_lm_generate_distilgpt2_batch_special(self): self.assertListEqual(output_strings, expected_output_string) @slow - def a_test_lm_generate_gpt2(self): + def test_lm_generate_gpt2(self): model = TFGPT2LMHeadModel.from_pretrained("gpt2") input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32) # The dog - expected_output_ids = [ - 464, - 3290, - 373, - 1043, - 287, - 257, - 2214, - 1474, - 262, - 16246, - 286, - 2688, - 290, - 2688, - 27262, - 13, - 198, - 198, - 464, - 3290, - ] # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog + + # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog + # fmt: off + expected_output_ids = [464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290] + # fmt: on output_ids = model.generate(input_ids, do_sample=False) self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) From 39c0b650ff39be57a91aee9388fb45a672bc3a6f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 15 Feb 2022 16:59:16 +0100 Subject: [PATCH 27/30] Update tests/test_generation_tf_logits_process.py Co-authored-by: Matt --- tests/test_generation_tf_logits_process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_generation_tf_logits_process.py b/tests/test_generation_tf_logits_process.py index 73aca4ca8a50..48b81b6a3b5d 100644 --- a/tests/test_generation_tf_logits_process.py +++ b/tests/test_generation_tf_logits_process.py @@ -41,7 +41,7 @@ def _get_uniform_logits(self, batch_size: int, length: int): scores = tf.ones((batch_size, length), dtype=tf.float32) / length return scores - def test_min_lenght_dist_processor(self): + def test_min_length_dist_processor(self): vocab_size = 20 batch_size = 4 eos_token_id = 0 From 920a9910d113ff87e7d7d43198d13070cabc6182 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 15 Feb 2022 17:04:36 +0100 Subject: [PATCH 28/30] Update src/transformers/tf_utils.py Co-authored-by: Matt --- src/transformers/tf_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/tf_utils.py b/src/transformers/tf_utils.py index 7b086fbaed64..42c744be7a67 100644 --- a/src/transformers/tf_utils.py +++ b/src/transformers/tf_utils.py @@ -25,8 +25,7 @@ def set_tensor_by_indices_to_value(tensor: tf.Tensor, 
indices: tf.Tensor, value: Union[tf.Tensor, int, float]): # create value_tensor since tensor value assignment is not possible in TF - value_tensor = tf.zeros_like(tensor) + value - return tf.where(indices, value_tensor, tensor) + return tf.where(indices, value, tensor) def shape_list(tensor: Union[tf.Tensor, np.ndarray]) -> List[int]: From 4b7d994e6bb5474fff7ca1dc2b40356bfb04b430 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 15 Feb 2022 17:11:08 +0100 Subject: [PATCH 29/30] remove cpu according to gante --- src/transformers/generation_tf_logits_process.py | 9 +++------ tests/test_generation_tf_logits_process.py | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py index 56eed92184bf..e2c0e2c4e393 100644 --- a/src/transformers/generation_tf_logits_process.py +++ b/src/transformers/generation_tf_logits_process.py @@ -134,9 +134,6 @@ def __init__(self, penalty: float): def _create_score_penalties(self, input_ids, logits): # create logit penalties for already seen input_ids - input_ids = input_ids.cpu() - logits = logits.cpu() - token_penalties = np.ones(logits.shape) prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()] for i, prev_input_id in enumerate(prev_input_ids): @@ -210,7 +207,7 @@ def _tokens_match(prev_tokens, tokens): len(banned_token_seq) > 0 ), f"Banned words token sequences {self.bad_words_ids} cannot have an empty list" - if _tokens_match(prev_input_ids_slice.cpu().numpy().tolist(), banned_token_seq[:-1]) is False: + if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False: # if tokens do not match continue continue @@ -262,7 +259,7 @@ def calc_banned_ngram_tokens(self, prev_input_ids, num_hypos, cur_len): return [[] for _ in range(num_hypos)] generated_ngrams = [{} for _ in range(num_hypos)] for idx in range(num_hypos): - gen_tokens = prev_input_ids[idx].cpu().numpy().tolist() + gen_tokens = prev_input_ids[idx].numpy().tolist() generated_ngram = generated_ngrams[idx] for ngram in zip(*[gen_tokens[i:] for i in range(self.ngram_size)]): prev_ngram_tuple = tuple(ngram[:-1]) @@ -271,7 +268,7 @@ def calc_banned_ngram_tokens(self, prev_input_ids, num_hypos, cur_len): def _get_generated_ngrams(hypo_idx): # Before decoding the next token, prevent decoding of ngrams that have already appeared start_idx = cur_len + 1 - self.ngram_size - ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].cpu().numpy().tolist()) + ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist()) return generated_ngrams[hypo_idx].get(ngram_idx, []) banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] diff --git a/tests/test_generation_tf_logits_process.py b/tests/test_generation_tf_logits_process.py index 48b81b6a3b5d..fb9eb086e451 100644 --- a/tests/test_generation_tf_logits_process.py +++ b/tests/test_generation_tf_logits_process.py @@ -169,4 +169,4 @@ def test_processor_list(self): tf.debugging.assert_near(scores, scores_comp, atol=1e-3) # input_ids should never be changed - self.assertListEqual(input_ids.cpu().numpy().tolist(), input_ids_comp.cpu().numpy().tolist()) + self.assertListEqual(input_ids.numpy().tolist(), input_ids_comp.numpy().tolist()) From 3fbe55b961fe7b2529ccb3efee494f2dfda8f1a8 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 15 Feb 2022 16:17:36 +0000 Subject: [PATCH 30/30] correct logit processor --- 
src/transformers/generation_tf_logits_process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py index e2c0e2c4e393..74a617685667 100644 --- a/src/transformers/generation_tf_logits_process.py +++ b/src/transformers/generation_tf_logits_process.py @@ -69,7 +69,7 @@ class TFLogitsProcessorList(list): def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, **kwargs) -> tf.Tensor: for processor in self: function_args = inspect.signature(processor.__call__).parameters - if len(function_args) > 3: + if len(function_args) > 2: if not all(arg in kwargs for arg in list(function_args.keys())[2:]): raise ValueError( f"Make sure that all the required parameters: {list(function_args.keys())} for "
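
A closing note on this last fix: `inspect.signature` on a bound method already excludes `self`, so a processor whose `__call__` takes extra keyword arguments exposes more than two parameters, not three — the old `> 3` check silently skipped the kwargs validation. A small check that reproduces the reasoning (the `cur_len` processor is hypothetical, for illustration only):

import inspect

class CurLenProcessorSketch:
    # a processor whose __call__ requires an extra `cur_len` keyword argument
    def __call__(self, input_ids, scores, cur_len):
        return scores

params = inspect.signature(CurLenProcessorSketch().__call__).parameters
print(list(params))     # ['input_ids', 'scores', 'cur_len'] -- no `self`
print(len(params) > 2)  # True: the list must forward `cur_len` via **kwargs
print(len(params) > 3)  # False: the old check would never have run the validation
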