diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md
index 0221622c4080..936e4bfb95da 100644
--- a/docs/source/en/internal/generation_utils.md
+++ b/docs/source/en/internal/generation_utils.md
@@ -140,9 +140,6 @@ generation.
 [[autodoc]] ForcedEOSTokenLogitsProcessor
     - __call__
 
-[[autodoc]] ForceTokensLogitsProcessor
-    - __call__
-
 [[autodoc]] HammingDiversityLogitsProcessor
     - __call__
 
diff --git a/docs/source/ja/internal/generation_utils.md b/docs/source/ja/internal/generation_utils.md
index 9e3ce7799543..1a5cc1dec079 100644
--- a/docs/source/ja/internal/generation_utils.md
+++ b/docs/source/ja/internal/generation_utils.md
@@ -139,9 +139,6 @@ generation_output[:2]
 [[autodoc]] ForcedEOSTokenLogitsProcessor
     - __call__
 
-[[autodoc]] ForceTokensLogitsProcessor
-    - __call__
-
 [[autodoc]] HammingDiversityLogitsProcessor
     - __call__
 
diff --git a/docs/source/zh/internal/generation_utils.md b/docs/source/zh/internal/generation_utils.md
index 75f28c233ee0..084e2a29dc8c 100644
--- a/docs/source/zh/internal/generation_utils.md
+++ b/docs/source/zh/internal/generation_utils.md
@@ -133,9 +133,6 @@ generation_output[:2]
 [[autodoc]] ForcedEOSTokenLogitsProcessor
     - __call__
 
-[[autodoc]] ForceTokensLogitsProcessor
-    - __call__
-
 [[autodoc]] HammingDiversityLogitsProcessor
     - __call__
 
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 1d36e7f8c746..ced2b9997366 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1276,7 +1276,6 @@
             "ExponentialDecayLengthPenalty",
             "ForcedBOSTokenLogitsProcessor",
             "ForcedEOSTokenLogitsProcessor",
-            "ForceTokensLogitsProcessor",
             "GenerationMixin",
             "HammingDiversityLogitsProcessor",
             "InfNanRemoveLogitsProcessor",
@@ -6059,7 +6058,6 @@
             ExponentialDecayLengthPenalty,
             ForcedBOSTokenLogitsProcessor,
             ForcedEOSTokenLogitsProcessor,
-            ForceTokensLogitsProcessor,
             GenerationMixin,
             HammingDiversityLogitsProcessor,
             InfNanRemoveLogitsProcessor,
diff --git a/src/transformers/commands/pt_to_tf.py b/src/transformers/commands/pt_to_tf.py
index 4002b5e0eb85..ad0dbd14e15b 100644
--- a/src/transformers/commands/pt_to_tf.py
+++ b/src/transformers/commands/pt_to_tf.py
@@ -12,45 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import inspect
 import os
 from argparse import ArgumentParser, Namespace
-from importlib import import_module
 
-import huggingface_hub
-import numpy as np
-from packaging import version
-
-from .. import (
-    FEATURE_EXTRACTOR_MAPPING,
-    IMAGE_PROCESSOR_MAPPING,
-    PROCESSOR_MAPPING,
-    TOKENIZER_MAPPING,
-    AutoConfig,
-    AutoFeatureExtractor,
-    AutoImageProcessor,
-    AutoProcessor,
-    AutoTokenizer,
-    is_datasets_available,
-    is_tf_available,
-    is_torch_available,
-)
-from ..utils import TF2_WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, logging
+from ..utils import logging
 from . import BaseTransformersCLICommand
 
 
-if is_tf_available():
-    import tensorflow as tf
-
-    tf.config.experimental.enable_tensor_float_32_execution(False)
-
-if is_torch_available():
-    import torch
-
-if is_datasets_available():
-    from datasets import load_dataset
-
-
 MAX_ERROR = 5e-5  # larger error tolerance than in our internal tests, to avoid flaky user-facing errors
 
 
@@ -136,44 +104,6 @@ def register_subcommand(parser: ArgumentParser):
         )
         train_parser.set_defaults(func=convert_command_factory)
 
-    @staticmethod
-    def find_pt_tf_differences(pt_outputs, tf_outputs):
-        """
-        Compares the TensorFlow and PyTorch outputs, returning a dictionary with all tensor differences.
-        """
-        # 1. All output attributes must be the same
-        pt_out_attrs = set(pt_outputs.keys())
-        tf_out_attrs = set(tf_outputs.keys())
-        if pt_out_attrs != tf_out_attrs:
-            raise ValueError(
-                f"The model outputs have different attributes, aborting. (Pytorch: {pt_out_attrs}, TensorFlow:"
-                f" {tf_out_attrs})"
-            )
-
-        # 2. For each output attribute, computes the difference
-        def _find_pt_tf_differences(pt_out, tf_out, differences, attr_name=""):
-            # If the current attribute is a tensor, it is a leaf and we make the comparison. Otherwise, we will dig in
-            # recursivelly, keeping the name of the attribute.
-            if isinstance(pt_out, torch.Tensor):
-                tensor_difference = np.max(np.abs(pt_out.numpy() - tf_out.numpy()))
-                differences[attr_name] = tensor_difference
-            else:
-                root_name = attr_name
-                for i, pt_item in enumerate(pt_out):
-                    # If it is a named attribute, we keep the name. Otherwise, just its index.
-                    if isinstance(pt_item, str):
-                        branch_name = root_name + pt_item
-                        tf_item = tf_out[pt_item]
-                        pt_item = pt_out[pt_item]
-                    else:
-                        branch_name = root_name + f"[{i}]"
-                        tf_item = tf_out[i]
-                    differences = _find_pt_tf_differences(pt_item, tf_item, differences, branch_name)
-
-            return differences
-
-        return _find_pt_tf_differences(pt_outputs, tf_outputs, {})
-
     def __init__(
         self,
         model_name: str,
@@ -196,237 +126,12 @@ def __init__(
         self._extra_commit_description = extra_commit_description
         self._override_model_class = override_model_class
 
-    def get_inputs(self, pt_model, tf_dummy_inputs, config):
-        """
-        Returns the right inputs for the model, based on its signature.
-        """
-
-        def _get_audio_input():
-            ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-            speech_samples = ds.sort("id").select(range(2))[:2]["audio"]
-            raw_samples = [x["array"] for x in speech_samples]
-            return raw_samples
-
-        model_config_class = type(pt_model.config)
-        if model_config_class in PROCESSOR_MAPPING:
-            processor = AutoProcessor.from_pretrained(self._local_dir)
-            if model_config_class in TOKENIZER_MAPPING and processor.tokenizer.pad_token is None:
-                processor.tokenizer.pad_token = processor.tokenizer.eos_token
-        elif model_config_class in IMAGE_PROCESSOR_MAPPING:
-            processor = AutoImageProcessor.from_pretrained(self._local_dir)
-        elif model_config_class in FEATURE_EXTRACTOR_MAPPING:
-            processor = AutoFeatureExtractor.from_pretrained(self._local_dir)
-        elif model_config_class in TOKENIZER_MAPPING:
-            processor = AutoTokenizer.from_pretrained(self._local_dir)
-            if processor.pad_token is None:
-                processor.pad_token = processor.eos_token
-        else:
-            raise ValueError(f"Unknown data processing type (model config type: {model_config_class})")
-
-        model_forward_signature = set(inspect.signature(pt_model.forward).parameters.keys())
-        processor_inputs = {}
-        if "input_ids" in model_forward_signature:
-            processor_inputs.update(
-                {
-                    "text": ["Hi there!", "I am a batch with more than one row and different input lengths."],
-                    "padding": True,
-                    "truncation": True,
-                }
-            )
-        if "pixel_values" in model_forward_signature:
-            sample_images = load_dataset("uoft-cs/cifar10", "plain_text", split="test")[:2]["img"]  # no-script
-            processor_inputs.update({"images": sample_images})
-        if "input_features" in model_forward_signature:
-            feature_extractor_signature = inspect.signature(processor.feature_extractor).parameters
-            # Pad to the largest input length by default but take feature extractor default
-            # padding value if it exists e.g. "max_length" and is not False or None
-            if "padding" in feature_extractor_signature:
-                default_strategy = feature_extractor_signature["padding"].default
-                if default_strategy is not False and default_strategy is not None:
-                    padding_strategy = default_strategy
-                else:
-                    padding_strategy = True
-            else:
-                padding_strategy = True
-            processor_inputs.update({"audio": _get_audio_input(), "padding": padding_strategy})
-        if "input_values" in model_forward_signature:  # Wav2Vec2 audio input
-            processor_inputs.update({"audio": _get_audio_input(), "padding": True})
-        pt_input = processor(**processor_inputs, return_tensors="pt")
-        tf_input = processor(**processor_inputs, return_tensors="tf")
-
-        # Extra input requirements, in addition to the input modality
-        if (
-            config.is_encoder_decoder
-            or (hasattr(pt_model, "encoder") and hasattr(pt_model, "decoder"))
-            or "decoder_input_ids" in tf_dummy_inputs
-        ):
-            decoder_input_ids = np.asarray([[1], [1]], dtype=int) * (pt_model.config.decoder_start_token_id or 0)
-            pt_input.update({"decoder_input_ids": torch.tensor(decoder_input_ids)})
-            tf_input.update({"decoder_input_ids": tf.convert_to_tensor(decoder_input_ids)})
-
-        return pt_input, tf_input
-
     def run(self):
-        self._logger.warning(
-            "\n\nConverting PyTorch weights to TensorFlow is deprecated and will be removed in v4.43. "
+        # TODO (joao): delete file in v4.47
+        raise NotImplementedError(
+            "\n\nConverting PyTorch weights to TensorFlow weights was removed in v4.43. "
             "Instead, we recommend that you convert PyTorch weights to Safetensors, an improved "
             "format that can be loaded by any framework, including TensorFlow. For more information, "
             "please see the Safetensors conversion guide: "
             "https://huggingface.co/docs/safetensors/en/convert-weights\n\n"
         )
-        # hub version 0.9.0 introduced the possibility of programmatically opening PRs with normal write tokens.
-        if version.parse(huggingface_hub.__version__) < version.parse("0.9.0"):
-            raise ImportError(
-                "The huggingface_hub version must be >= 0.9.0 to use this command. Please update your huggingface_hub"
-                " installation."
-            )
-        else:
-            from huggingface_hub import Repository, create_commit
-            from huggingface_hub._commit_api import CommitOperationAdd
-
-        # Fetch remote data
-        repo = Repository(local_dir=self._local_dir, clone_from=self._model_name)
-
-        # Load config and get the appropriate architecture -- the latter is needed to convert the head's weights
-        config = AutoConfig.from_pretrained(self._local_dir)
-        architectures = config.architectures
-        if self._override_model_class is not None:
-            if self._override_model_class.startswith("TF"):
-                architectures = [self._override_model_class[2:]]
-            else:
-                architectures = [self._override_model_class]
-            try:
-                pt_class = getattr(import_module("transformers"), architectures[0])
-            except AttributeError:
-                raise ValueError(f"Model class {self._override_model_class} not found in transformers.")
-            try:
-                tf_class = getattr(import_module("transformers"), "TF" + architectures[0])
-            except AttributeError:
-                raise ValueError(f"TF model class TF{self._override_model_class} not found in transformers.")
-        elif architectures is None:  # No architecture defined -- use auto classes
-            pt_class = getattr(import_module("transformers"), "AutoModel")
-            tf_class = getattr(import_module("transformers"), "TFAutoModel")
-            self._logger.warning("No detected architecture, using AutoModel/TFAutoModel")
-        else:  # Architecture defined -- use it
-            if len(architectures) > 1:
-                raise ValueError(f"More than one architecture was found, aborting. (architectures = {architectures})")
-            self._logger.warning(f"Detected architecture: {architectures[0]}")
-            pt_class = getattr(import_module("transformers"), architectures[0])
-            try:
-                tf_class = getattr(import_module("transformers"), "TF" + architectures[0])
-            except AttributeError:
-                raise AttributeError(f"The TensorFlow equivalent of {architectures[0]} doesn't exist in transformers.")
-
-        # Check the TF dummy inputs to see what keys we need in the forward pass
-        tf_from_pt_model = tf_class.from_config(config)
-        tf_dummy_inputs = tf_from_pt_model.dummy_inputs
-
-        del tf_from_pt_model  # Try to keep only one model in memory at a time
-
-        # Load the model and get some basic inputs
-        pt_model = pt_class.from_pretrained(self._local_dir)
-        pt_model.eval()
-
-        pt_input, tf_input = self.get_inputs(pt_model, tf_dummy_inputs, config)
-
-        with torch.no_grad():
-            pt_outputs = pt_model(**pt_input, output_hidden_states=True)
-        del pt_model  # will no longer be used, and may have a large memory footprint
-
-        tf_from_pt_model = tf_class.from_pretrained(self._local_dir, from_pt=True)
-        tf_from_pt_outputs = tf_from_pt_model(**tf_input, output_hidden_states=True, training=False)
-
-        # Confirms that cross loading PT weights into TF worked.
-        crossload_differences = self.find_pt_tf_differences(pt_outputs, tf_from_pt_outputs)
-        output_differences = {k: v for k, v in crossload_differences.items() if "hidden" not in k}
-        hidden_differences = {k: v for k, v in crossload_differences.items() if "hidden" in k}
-        if len(output_differences) == 0 and architectures is not None:
-            raise ValueError(
-                f"Something went wrong -- the config file has architectures ({architectures}), but no model head"
-                " output was found. All outputs start with 'hidden'"
-            )
-        max_crossload_output_diff = max(output_differences.values()) if output_differences else 0.0
-        max_crossload_hidden_diff = max(hidden_differences.values())
-        if max_crossload_output_diff > self._max_error or max_crossload_hidden_diff > self._max_error:
-            raise ValueError(
-                "The cross-loaded TensorFlow model has different outputs, something went wrong!\n"
-                + f"\nList of maximum output differences above the threshold ({self._max_error}):\n"
-                + "\n".join([f"{k}: {v:.3e}" for k, v in output_differences.items() if v > self._max_error])
-                + f"\n\nList of maximum hidden layer differences above the threshold ({self._max_error}):\n"
-                + "\n".join([f"{k}: {v:.3e}" for k, v in hidden_differences.items() if v > self._max_error])
-            )
-
-        # Save the weights in a TF format (if needed) and confirms that the results are still good
-        tf_weights_path = os.path.join(self._local_dir, TF2_WEIGHTS_NAME)
-        tf_weights_index_path = os.path.join(self._local_dir, TF2_WEIGHTS_INDEX_NAME)
-        if (not os.path.exists(tf_weights_path) and not os.path.exists(tf_weights_index_path)) or self._new_weights:
-            tf_from_pt_model.save_pretrained(self._local_dir)
-        del tf_from_pt_model  # will no longer be used, and may have a large memory footprint
-
-        tf_model = tf_class.from_pretrained(self._local_dir)
-        tf_outputs = tf_model(**tf_input, output_hidden_states=True)
-
-        conversion_differences = self.find_pt_tf_differences(pt_outputs, tf_outputs)
-        output_differences = {k: v for k, v in conversion_differences.items() if "hidden" not in k}
-        hidden_differences = {k: v for k, v in conversion_differences.items() if "hidden" in k}
-        if len(output_differences) == 0 and architectures is not None:
-            raise ValueError(
-                f"Something went wrong -- the config file has architectures ({architectures}), but no model head"
-                " output was found. All outputs start with 'hidden'"
-            )
-        max_conversion_output_diff = max(output_differences.values()) if output_differences else 0.0
-        max_conversion_hidden_diff = max(hidden_differences.values())
-        if max_conversion_output_diff > self._max_error or max_conversion_hidden_diff > self._max_error:
-            raise ValueError(
-                "The converted TensorFlow model has different outputs, something went wrong!\n"
-                + f"\nList of maximum output differences above the threshold ({self._max_error}):\n"
-                + "\n".join([f"{k}: {v:.3e}" for k, v in output_differences.items() if v > self._max_error])
-                + f"\n\nList of maximum hidden layer differences above the threshold ({self._max_error}):\n"
-                + "\n".join([f"{k}: {v:.3e}" for k, v in hidden_differences.items() if v > self._max_error])
-            )
-
-        commit_message = "Update TF weights" if self._new_weights else "Add TF weights"
-        if self._push:
-            repo.git_add(auto_lfs_track=True)
-            repo.git_commit(commit_message)
-            repo.git_push(blocking=True)  # this prints a progress bar with the upload
-            self._logger.warning(f"TF weights pushed into {self._model_name}")
-        elif not self._no_pr:
-            self._logger.warning("Uploading the weights into a new PR...")
-            commit_descrition = (
-                "Model converted by the [`transformers`' `pt_to_tf`"
-                " CLI](https://github.com/huggingface/transformers/blob/main/src/transformers/commands/pt_to_tf.py). "
-                "All converted model outputs and hidden layers were validated against its PyTorch counterpart.\n\n"
-                f"Maximum crossload output difference={max_crossload_output_diff:.3e}; "
-                f"Maximum crossload hidden layer difference={max_crossload_hidden_diff:.3e};\n"
-                f"Maximum conversion output difference={max_conversion_output_diff:.3e}; "
-                f"Maximum conversion hidden layer difference={max_conversion_hidden_diff:.3e};\n"
-            )
-            if self._max_error > MAX_ERROR:
-                commit_descrition += (
-                    f"\n\nCAUTION: The maximum admissible error was manually increased to {self._max_error}!"
-                )
-            if self._extra_commit_description:
-                commit_descrition += "\n\n" + self._extra_commit_description
-
-            # sharded model -> adds all related files (index and .h5 shards)
-            if os.path.exists(tf_weights_index_path):
-                operations = [
-                    CommitOperationAdd(path_in_repo=TF2_WEIGHTS_INDEX_NAME, path_or_fileobj=tf_weights_index_path)
-                ]
-                for shard_path in tf.io.gfile.glob(self._local_dir + "/tf_model-*.h5"):
-                    operations += [
-                        CommitOperationAdd(path_in_repo=os.path.basename(shard_path), path_or_fileobj=shard_path)
-                    ]
-            else:
-                operations = [CommitOperationAdd(path_in_repo=TF2_WEIGHTS_NAME, path_or_fileobj=tf_weights_path)]
-
-            hub_pr_url = create_commit(
-                repo_id=self._model_name,
-                operations=operations,
-                commit_message=commit_message,
-                commit_description=commit_descrition,
-                repo_type="model",
-                create_pr=True,
-            ).pr_url
-            self._logger.warning(f"PR open in {hub_pr_url}")
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 2f84bc29aee2..c6e3d90b9f0c 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -81,6 +81,15 @@ class PretrainedConfig(PushToHubMixin):
       model.
     - **num_hidden_layers** (`int`) -- The number of blocks in the model.
 
+    <Tip warning={true}>
+
+    Setting parameters for sequence generation in the model config is deprecated. For backward compatibility, loading
+    some of them will still be possible, but attempting to overwrite them will throw an exception -- you should set
+    them in a [`~transformers.GenerationConfig`]. Check the documentation of [`~transformers.GenerationConfig`] for
+    more information about the individual parameters.
+
+    </Tip>
+
     Arg:
         name_or_path (`str`, *optional*, defaults to `""`):
             Store the string that was passed to [`PreTrainedModel.from_pretrained`] or
@@ -117,77 +126,6 @@ class PretrainedConfig(PushToHubMixin):
             sequence_length embeddings at a time. For more information on feed forward chunking, see [How does Feed
             Forward Chunking work?](../glossary.html#feed-forward-chunking).
 
-        > Parameters for sequence generation
-
-        max_length (`int`, *optional*, defaults to 20):
-            Maximum length that will be used by default in the `generate` method of the model.
-        min_length (`int`, *optional*, defaults to 0):
-            Minimum length that will be used by default in the `generate` method of the model.
-        do_sample (`bool`, *optional*, defaults to `False`):
-            Flag that will be used by default in the `generate` method of the model. Whether or not to use sampling ;
-            use greedy decoding otherwise.
-        early_stopping (`bool`, *optional*, defaults to `False`):
-            Flag that will be used by default in the `generate` method of the model. Whether to stop the beam search
-            when at least `num_beams` sentences are finished per batch or not.
-        num_beams (`int`, *optional*, defaults to 1):
-            Number of beams for beam search that will be used by default in the `generate` method of the model. 1 means
-            no beam search.
-        num_beam_groups (`int`, *optional*, defaults to 1):
-            Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams
-            that will be used by default in the `generate` method of the model. 1 means no group beam search.
-        diversity_penalty (`float`, *optional*, defaults to 0.0):
-            Value to control diversity for group beam search. that will be used by default in the `generate` method of
-            the model. 0 means no diversity penalty. The higher the penalty, the more diverse are the outputs.
-        temperature (`float`, *optional*, defaults to 1.0):
-            The value used to module the next token probabilities that will be used by default in the `generate` method
-            of the model. Must be strictly positive.
-        top_k (`int`, *optional*, defaults to 50):
-            Number of highest probability vocabulary tokens to keep for top-k-filtering that will be used by default in
-            the `generate` method of the model.
-        top_p (`float`, *optional*, defaults to 1):
-            Value that will be used by default in the `generate` method of the model for `top_p`. If set to float < 1,
-            only the most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.
-        typical_p (`float`, *optional*, defaults to 1):
-            Local typicality measures how similar the conditional probability of predicting a target token next is to
-            the expected conditional probability of predicting a random token next, given the partial text already
-            generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that
-            add up to `typical_p` or higher are kept for generation. See [this
-            paper](https://arxiv.org/pdf/2202.00666.pdf) for more details.
-        repetition_penalty (`float`, *optional*, defaults to 1):
-            Parameter for repetition penalty that will be used by default in the `generate` method of the model. 1.0
-            means no penalty.
-        length_penalty (`float`, *optional*, defaults to 1):
-            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
-            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
-            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
-            `length_penalty` < 0.0 encourages shorter sequences.
-        no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by default in the
-            `generate` method of the model for `no_repeat_ngram_size`. If set to int > 0, all ngrams of that size can
-            only occur once.
-        encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by
-            default in the `generate` method of the model for `encoder_no_repeat_ngram_size`. If set to int > 0, all
-            ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.
-        bad_words_ids (`List[int]`, *optional*):
-            List of token ids that are not allowed to be generated that will be used by default in the `generate`
-            method of the model. In order to get the tokens of the words that should not appear in the generated text,
-            use `tokenizer.encode(bad_word, add_prefix_space=True)`.
-        num_return_sequences (`int`, *optional*, defaults to 1):
-            Number of independently computed returned sequences for each element in the batch that will be used by
-            default in the `generate` method of the model.
-        output_scores (`bool`, *optional*, defaults to `False`):
-            Whether the model should return the logits when used for generation.
-        return_dict_in_generate (`bool`, *optional*, defaults to `False`):
-            Whether the model should return a [`~transformers.utils.ModelOutput`] instead of a `torch.LongTensor`.
-        forced_bos_token_id (`int`, *optional*):
-            The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for
-            multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target
-            language token.
-        forced_eos_token_id (`int`, *optional*):
-            The id of the token to force as the last generated token when `max_length` is reached.
-        remove_invalid_values (`bool`, *optional*):
-            Whether to remove possible _nan_ and _inf_ outputs of the model to prevent the generation method to crash.
-            Note that using `remove_invalid_values` can slow down generation.
-
         > Parameters for fine-tuning tasks
 
         architectures (`List[str]`, *optional*):
@@ -287,7 +225,7 @@ def __init__(self, **kwargs):
 
         # Retrocompatibility: Parameters for sequence generation. While we will keep the ability to load these
         # parameters, saving them will be deprecated. In a distant future, we won't need to load them.
-        for parameter_name, default_value in self._get_generation_defaults().items():
+        for parameter_name, default_value in self._get_global_generation_defaults().items():
             setattr(self, parameter_name, kwargs.pop(parameter_name, default_value))
 
         # Fine-tuning task arguments
@@ -440,16 +378,13 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub:
         if os.path.isfile(save_directory):
             raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
 
-        non_default_generation_parameters = {}
-        for parameter_name, default_value in self._get_generation_defaults().items():
-            if hasattr(self, parameter_name) and getattr(self, parameter_name) != default_value:
-                non_default_generation_parameters[parameter_name] = getattr(self, parameter_name)
+        non_default_generation_parameters = self._get_non_default_generation_parameters()
         if len(non_default_generation_parameters) > 0:
-            logger.warning(
-                "Some non-default generation parameters are set in the model config. These should go into a "
-                "GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) "
-                "instead. This warning will be raised to an exception in v4.41.\n"
-                f"Non-default generation parameters: {str(non_default_generation_parameters)}"
+            raise ValueError(
+                "Some non-default generation parameters are set in the model config. These should go into either a) "
+                "`model.generation_config` (as opposed to `model.config`); OR b) a GenerationConfig file "
+                "(https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) "
+                f"\nNon-default generation parameters: {str(non_default_generation_parameters)}"
             )
 
         os.makedirs(save_directory, exist_ok=True)
@@ -1049,7 +984,7 @@ def register_for_auto_class(cls, auto_class="AutoConfig"):
         cls._auto_class = auto_class
 
     @staticmethod
-    def _get_generation_defaults() -> Dict[str, Any]:
+    def _get_global_generation_defaults() -> Dict[str, Any]:
         return {
             "max_length": 20,
             "min_length": 0,
@@ -1078,14 +1013,60 @@ def _get_generation_defaults() -> Dict[str, Any]:
             "begin_suppress_tokens": None,
         }
 
-    def _has_non_default_generation_parameters(self) -> bool:
+    def _get_non_default_generation_parameters(self) -> Dict[str, Any]:
         """
-        Whether or not this instance holds non-default generation parameters.
+        Gets the non-default generation parameters on the PretrainedConfig instance.
+
+        Parameters set to `None` are skipped. The remaining ones are compared against a default instance of this
+        config class. If building that instance raises a `ValueError` (e.g. composite models), the first of the
+        `decoder`, `generator` or `text_config` sub-configs found is inspected instead, against a default instance of
+        its class. If no such sub-config exists, this config is compared against the global generation defaults.
+
+        Returns:
+            `Dict[str, Any]`: The generation parameters holding a non-default value, keyed by parameter name.
         """
-        for parameter_name, default_value in self._get_generation_defaults().items():
-            if hasattr(self, parameter_name) and getattr(self, parameter_name) != default_value:
-                return True
-        return False
+        non_default_generation_parameters = {}
+        decoder_attribute_name = None
+        default_config = None
+
+        # Composite models don't have a default config, use their decoder config as a fallback for default values
+        # If no known pattern is matched, then `default_config = None` -> check against the global generation defaults
+        try:
+            default_config = self.__class__()
+        except ValueError:
+            # Only record the attribute name on a match. Looping directly over `decoder_attribute_name` would
+            # leave it set to "text_config" when nothing matches, making the `getattr` below raise.
+            for candidate_attribute_name in ("decoder", "generator", "text_config"):
+                if hasattr(self, candidate_attribute_name):
+                    decoder_attribute_name = candidate_attribute_name
+                    default_config = getattr(self, decoder_attribute_name).__class__()
+                    break
+
+        # If it is a composite model, we want to check the subconfig that will be used for generation
+        self_decoder_config = self if decoder_attribute_name is None else getattr(self, decoder_attribute_name)
+
+        for parameter_name, default_global_value in self._get_global_generation_defaults().items():
+            if hasattr(self_decoder_config, parameter_name):
+                is_default_in_config = is_default_generation_value = None
+                parameter_value = getattr(self_decoder_config, parameter_name)
+                # Three cases in which is okay for the model config to hold generation config parameters:
+                # 1. The parameter is set to `None`, effectively delegating its value to the generation config
+                if parameter_value is None:
+                    continue
+                # 2. If we have a default config, then the instance should hold the same generation defaults
+                if default_config is not None:
+                    is_default_in_config = parameter_value == getattr(default_config, parameter_name)
+                # 3. if we don't have a default config, then the instance should hold the global generation defaults
+                else:
+                    is_default_generation_value = parameter_value == default_global_value
+
+                is_non_default = (is_default_in_config is False) or (
+                    is_default_in_config is None and is_default_generation_value is False
+                )
+                if is_non_default:
+                    non_default_generation_parameters[parameter_name] = parameter_value
+
+        return non_default_generation_parameters
 
 
 def get_configuration_file(configuration_files: List[str]) -> str:
diff --git a/src/transformers/generation/__init__.py b/src/transformers/generation/__init__.py
index 6880321d6326..faf5266b84ae 100644
--- a/src/transformers/generation/__init__.py
+++ b/src/transformers/generation/__init__.py
@@ -55,7 +55,6 @@
         "ExponentialDecayLengthPenalty",
         "ForcedBOSTokenLogitsProcessor",
         "ForcedEOSTokenLogitsProcessor",
-        "ForceTokensLogitsProcessor",
         "HammingDiversityLogitsProcessor",
         "InfNanRemoveLogitsProcessor",
         "LogitNormalization",
@@ -201,7 +200,6 @@
             ExponentialDecayLengthPenalty,
             ForcedBOSTokenLogitsProcessor,
             ForcedEOSTokenLogitsProcessor,
-            ForceTokensLogitsProcessor,
             HammingDiversityLogitsProcessor,
             InfNanRemoveLogitsProcessor,
             LogitNormalization,
diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py
index 7f89e239245b..e9ba45606829 100644
--- a/src/transformers/generation/logits_process.py
+++ b/src/transformers/generation/logits_process.py
@@ -15,7 +15,6 @@
 
 import inspect
 import math
-import warnings
 from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
@@ -1844,34 +1843,6 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
         return scores
 
 
-class ForceTokensLogitsProcessor(LogitsProcessor):
-    r"""
-    This processor takes a list of pairs of integers which indicates a mapping from generation indices to token
-    indices that will be forced before generation. The processor will set their log probs to `inf` so that they are
-    sampled at their corresponding index. Originally created for
-    [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper).
-    """
-
-    def __init__(self, force_token_map: List[List[int]], _has_warned: Optional[bool] = False):
-        self.force_token_map = dict(force_token_map)
-        if not _has_warned:
-            # TODO(Sanchit): remove this processor entirely in v4.40
-            warnings.warn(
-                "This `ForceTokensLogitsProcessor` has been deprecated and will be removed in v4.40. Should you need to provide prompt ids for generation, specify `input_ids` to the generate method for decoder-only models, or `decoder_input_ids` for encoder-decoder models.",
-                FutureWarning,
-            )
-
-    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        generation_idx = input_ids.shape[-1]
-        current_token = self.force_token_map.get(generation_idx, None)
-        scores_processed = scores
-        if current_token is not None:
-            scores_processed = torch.full_like(scores, -float("inf"))
-            scores_processed[:, current_token] = 0
-        return scores_processed
-
-
 class WhisperTimeStampLogitsProcessor(LogitsProcessor):
     r"""
 
diff --git a/src/transformers/generation/stopping_criteria.py b/src/transformers/generation/stopping_criteria.py
index f8e94f6f86a0..961b6d6f5e43 100644
--- a/src/transformers/generation/stopping_criteria.py
+++ b/src/transformers/generation/stopping_criteria.py
@@ -85,36 +85,6 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwa
         return torch.full((input_ids.shape[0],), is_done, device=input_ids.device, dtype=torch.bool)
 
 
-class MaxNewTokensCriteria(StoppingCriteria):
-    """
-    This class can be used to stop generation whenever the generated number of tokens exceeds `max_new_tokens`. Keep in
-    mind for decoder-only type of transformers, this will **not** include the initial prompted tokens. This is very
-    close to `MaxLengthCriteria` but ignores the number of initial tokens.
-
-    Args:
-        start_length (`int`):
-            The number of initial tokens.
-        max_new_tokens (`int`):
-            The maximum number of tokens to generate.
-    """
-
-    def __init__(self, start_length: int, max_new_tokens: int):
-        warnings.warn(
-            "The class `MaxNewTokensCriteria` is deprecated and will be removed in v4.43. "
-            f"Please use `MaxLengthCriteria(max_length={start_length + max_new_tokens})` "
-            "with `max_length = start_length + max_new_tokens` instead.",
-            FutureWarning,
-        )
-        self.start_length = start_length
-        self.max_new_tokens = max_new_tokens
-        self.max_length = start_length + max_new_tokens
-
-    @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
-        is_done = input_ids.shape[-1] >= self.max_length
-        return torch.full((input_ids.shape[0],), is_done, device=input_ids.device, dtype=torch.bool)
-
-
 class MaxTimeCriteria(StoppingCriteria):
     """
     This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the
@@ -516,8 +486,6 @@ def max_length(self) -> Optional[int]:
         for stopping_criterium in self:
             if isinstance(stopping_criterium, MaxLengthCriteria):
                 return stopping_criterium.max_length
-            elif isinstance(stopping_criterium, MaxNewTokensCriteria):
-                return stopping_criterium.max_length
         return None
 
 
diff --git a/src/transformers/generation/tf_logits_process.py b/src/transformers/generation/tf_logits_process.py
index fc9799b7ab39..58824b7b0071 100644
--- a/src/transformers/generation/tf_logits_process.py
+++ b/src/transformers/generation/tf_logits_process.py
@@ -520,15 +520,21 @@ def __init__(self, begin_suppress_tokens, begin_index):
         self.begin_index = begin_index
 
     def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        scores = tf.cond(
-            tf.equal(cur_len, self.begin_index),
-            lambda: tf.tensor_scatter_nd_update(
-                scores,
-                indices=[[i, token] for i in range(scores.shape[0]) for token in self.begin_suppress_tokens],
-                updates=[-float("inf") for _ in range(scores.shape[0] * len(self.begin_suppress_tokens))],
-            ),
-            lambda: scores,
-        )
+        suppressed_indices = []
+        for token in self.begin_suppress_tokens:
+            if token < scores.shape[-1]:  # to ensure we don't go beyond the vocab size
+                suppressed_indices.extend([[i, token] for i in range(scores.shape[0])])
+
+        if len(suppressed_indices) > 0:
+            scores = tf.cond(
+                tf.equal(cur_len, self.begin_index),
+                lambda: tf.tensor_scatter_nd_update(
+                    scores,
+                    indices=suppressed_indices,
+                    updates=[-float("inf")] * len(suppressed_indices),  # one update per kept index
+                ),
+                lambda: scores,
+            )
         return scores
 
 
@@ -540,11 +546,17 @@ def __init__(self, suppress_tokens):
         self.suppress_tokens = list(suppress_tokens)
 
     def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        scores = tf.tensor_scatter_nd_update(
-            scores,
-            indices=[[i, token] for i in range(scores.shape[0]) for token in self.suppress_tokens],
-            updates=[-float("inf") for _ in range(scores.shape[0] * len(self.suppress_tokens))],
-        )
+        suppressed_indices = []
+        for token in self.suppress_tokens:
+            if token < scores.shape[-1]:  # to ensure we don't go beyond the vocab size
+                suppressed_indices.extend([[i, token] for i in range(scores.shape[0])])
+
+        if len(suppressed_indices) > 0:
+            scores = tf.tensor_scatter_nd_update(
+                scores,
+                indices=suppressed_indices,  # only the in-vocab (row, token) pairs collected above
+                updates=[-float("inf")] * len(suppressed_indices),  # one update per kept index
+            )
         return scores
 
 
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index a9ebdcdd4775..07251a5c4e50 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -76,7 +76,6 @@
     ExponentialDecayLengthPenalty,
     ForcedBOSTokenLogitsProcessor,
     ForcedEOSTokenLogitsProcessor,
-    ForceTokensLogitsProcessor,
     HammingDiversityLogitsProcessor,
     InfNanRemoveLogitsProcessor,
     LogitNormalization,
@@ -865,9 +864,6 @@ def _get_logits_processor(
                 if (input_ids_seq_length > 1 or generation_config.forced_bos_token_id is None)
                 else begin_index + 1
             )
-            if generation_config.forced_decoder_ids is not None:
-                # generation starts after the last token that is forced
-                begin_index += generation_config.forced_decoder_ids[-1][0]
             processors.append(
                 SuppressTokensAtBeginLogitsProcessor(
                     generation_config.begin_suppress_tokens,
@@ -876,12 +872,11 @@ def _get_logits_processor(
                 )
             )
         if generation_config.forced_decoder_ids is not None:
-            # TODO(Sanchit): deprecate in v4.40 by removing this logic
-            warnings.warn(
-                "You have explicitly specified `forced_decoder_ids`. This functionality has been deprecated and will throw an error in v4.40. Please remove the `forced_decoder_ids` argument in favour of `input_ids` or `decoder_input_ids` respectively.",
-                FutureWarning,
+            # TODO (sanchit): move this exception to GenerationConfig.validate() when TF & FLAX are aligned with PT
+            raise ValueError(
+                "You have explicitly specified `forced_decoder_ids`. Please remove the `forced_decoder_ids` argument "
+                "in favour of `input_ids` or `decoder_input_ids` respectively.",
             )
-            processors.append(ForceTokensLogitsProcessor(generation_config.forced_decoder_ids, _has_warned=True))
         if generation_config.watermarking_config is not None:
             processors.append(
                 WatermarkLogitsProcessor(
@@ -1344,19 +1339,18 @@ def _prepare_generation_config(
         using_model_generation_config = False
         if generation_config is None:
             # legacy: users may modify the model configuration to control generation. To trigger this legacy behavior,
-            # three conditions must be met
+            # the following conditions must be met
             # 1) the generation config must have been created from the model config (`_from_model_config` field);
             # 2) the generation config must have seen no modification since its creation (the hash is the same);
             # 3) the user must have set generation parameters in the model config.
             # NOTE: `torch.compile` can't compile `hash`, this legacy support is disabled with compilation.
             if (
                 not is_torchdynamo_compiling()
-                and self.generation_config._from_model_config
-                and self.generation_config._original_object_hash == hash(self.generation_config)
-                and self.config._has_non_default_generation_parameters()
+                and self.generation_config._from_model_config  # 1)
+                and self.generation_config._original_object_hash == hash(self.generation_config)  # 2)
             ):
                 new_generation_config = GenerationConfig.from_model_config(self.config)
-                if new_generation_config != self.generation_config:
+                if new_generation_config != self.generation_config:  # 3)
                     warnings.warn(
                         "You have modified the pretrained model configuration to control generation. This is a"
                         " deprecated strategy to control generation and will be removed soon, in a future version."
@@ -2273,13 +2267,6 @@ def heal_tokens(
 
         return input_ids
 
-    def contrastive_search(self, *args, **kwargs):
-        logger.warning_once(
-            "Calling `contrastive_search` directly is deprecated and will be removed in v4.41. Use `generate` or a "
-            "custom generation loop instead.",
-        )
-        return self._contrastive_search(*args, **kwargs)
-
     def _dola_decoding(
         self,
         input_ids: torch.LongTensor,
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index b92d4b447f19..b943b5e7989f 100755
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -2571,26 +2571,21 @@ def save_pretrained(
         # Save the config
         if is_main_process:
             if not _hf_peft_config_loaded:
+                # If the model config has set attributes that should be in the generation config, move them there.
+                misplaced_generation_parameters = model_to_save.config._get_non_default_generation_parameters()
+                if self.can_generate() and len(misplaced_generation_parameters) > 0:
+                    warnings.warn(
+                        "Moving the following attributes in the config to the generation config: "
+                        f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
+                        "generation parameters in the model config, as opposed to in the generation config.",
+                        UserWarning,
+                    )
+                    for param_name, param_value in misplaced_generation_parameters.items():
+                        setattr(model_to_save.generation_config, param_name, param_value)
+                        setattr(model_to_save.config, param_name, None)
+
                 model_to_save.config.save_pretrained(save_directory)
             if self.can_generate():
-                # generation config built from the model config + the model config holds generation kwargs -> generate
-                # may revert to legacy behavior if the two don't match
-                if (
-                    model_to_save.generation_config._from_model_config
-                    and model_to_save.config._has_non_default_generation_parameters()
-                ):
-                    new_generation_config = GenerationConfig.from_model_config(model_to_save.config)
-                    if new_generation_config != model_to_save.generation_config:
-                        logger.warning(
-                            "Your generation config was originally created from the model config, but the model "
-                            "config has changed since then. Unless you pass the `generation_config` argument to this "
-                            "model's `generate` calls, they will revert to the legacy behavior where the base "
-                            "`generate` parameterization is loaded from the model config instead. "
-                            "To avoid this behavior and this warning, we recommend you to overwrite the generation "
-                            "config model attribute before calling the model's `save_pretrained`, preferably also "
-                            "removing any generation kwargs from the model config. This warning will be raised to an "
-                            "exception in v4.41."
-                        )
                 model_to_save.generation_config.save_pretrained(save_directory)
 
             if _hf_peft_config_loaded:
diff --git a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py
index 9e1d995dc291..7980667a68d7 100644
--- a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py
@@ -14,6 +14,8 @@
 # limitations under the License.
 """Audio Spectogram Transformer (AST) model configuration"""
 
+from typing import Any, Dict
+
 from ...configuration_utils import PretrainedConfig
 from ...utils import logging
 
@@ -118,3 +120,9 @@ def __init__(
         self.time_stride = time_stride
         self.max_length = max_length
         self.num_mel_bins = num_mel_bins
+
+    # Overwritten from the parent class: AST is not compatible with `generate`, but has a config parameter sharing the
+    # same name (`max_length`). Sharing the same name triggers checks regarding the config -> generation_config
+    # generative parameters deprecation cycle, overwriting this function prevents this from happening.
+    def _get_non_default_generation_parameters(self) -> Dict[str, Any]:
+        return {}
diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py
index c1caae6c6857..e365744f8b9e 100644
--- a/src/transformers/models/bloom/modeling_bloom.py
+++ b/src/transformers/models/bloom/modeling_bloom.py
@@ -693,7 +693,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "Using `past_key_values` as a tuple is deprecated and will be removed in v4.45. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         batch_size, seq_length, _ = inputs_embeds.shape
diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py
index 1920f350f559..e668a0dc0631 100644
--- a/src/transformers/models/codegen/modeling_codegen.py
+++ b/src/transformers/models/codegen/modeling_codegen.py
@@ -532,7 +532,7 @@ def forward(
             if not self.training:
                 logger.warning_once(
                     "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
                 )
 
         seq_length = inputs_embeds.shape[1]
diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py
index 3486d5ed3ab0..1d8f011c3f6a 100644
--- a/src/transformers/models/dbrx/modeling_dbrx.py
+++ b/src/transformers/models/dbrx/modeling_dbrx.py
@@ -1066,7 +1066,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         if cache_position is None:
diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
index 8c0ae2771e81..ab5d49b32fea 100644
--- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
@@ -74,9 +74,11 @@ class EncoderDecoderConfig(PretrainedConfig):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        assert (
-            "encoder" in kwargs and "decoder" in kwargs
-        ), "Config has to be initialized with encoder and decoder config"
+        if "encoder" not in kwargs or "decoder" not in kwargs:
+            raise ValueError(
+                f"A configuration of type {self.model_type} cannot be instantiated because "
+                f"both `encoder` and `decoder` sub-configurations were not passed, only {kwargs}"
+            )
         encoder_config = kwargs.pop("encoder")
         encoder_model_type = encoder_config.pop("model_type")
         decoder_config = kwargs.pop("decoder")
diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py
index edaef78f9286..a9acd171c3ae 100644
--- a/src/transformers/models/falcon/modeling_falcon.py
+++ b/src/transformers/models/falcon/modeling_falcon.py
@@ -1029,7 +1029,7 @@ def forward(
             if not self.training:
                 logger.warning_once(
                     "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
                 )
 
         alibi = None
diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py
index a05d2c059e21..c648dee8fb56 100644
--- a/src/transformers/models/gemma/modeling_gemma.py
+++ b/src/transformers/models/gemma/modeling_gemma.py
@@ -861,7 +861,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         # decoder layers
diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py
index 581f2b3947b4..4807289c927c 100644
--- a/src/transformers/models/git/modeling_git.py
+++ b/src/transformers/models/git/modeling_git.py
@@ -423,7 +423,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         all_hidden_states = () if output_hidden_states else None
diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
index 3a606c37b31c..65144ad0c0f1 100755
--- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -747,7 +747,7 @@ def forward(
             if not self.training:
                 logger.warning_once(
                     "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
                 )
 
         seq_length = inputs_embeds.shape[1]
diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
index 22fbb0429f59..5d21f2d2a725 100755
--- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
@@ -928,7 +928,7 @@ def forward(
             if not self.training:
                 logger.warning_once(
                     "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
                 )
 
         seq_length = inputs_embeds.shape[1]
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py
index 82540fe98ec7..ba0f319791e4 100644
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -819,7 +819,7 @@ def forward(
             if not self.training:
                 logger.warning_once(
                     "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
                 )
 
         seq_length = inputs_embeds.shape[1]
diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py
index 3532219f3d6c..b4c24a46bb68 100644
--- a/src/transformers/models/idefics/modeling_idefics.py
+++ b/src/transformers/models/idefics/modeling_idefics.py
@@ -1243,7 +1243,7 @@ def forward(
             if not self.training:
                 logger.warning_once(
                     "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
                 )
             return_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
index 8716d27f5481..293ce1dd7f6b 100644
--- a/src/transformers/models/llama/modeling_llama.py
+++ b/src/transformers/models/llama/modeling_llama.py
@@ -950,7 +950,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         if cache_position is None:
diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py
index 1a2b732e85e4..7ee8a12e74cb 100644
--- a/src/transformers/models/mistral/modeling_mistral.py
+++ b/src/transformers/models/mistral/modeling_mistral.py
@@ -766,7 +766,7 @@ def forward(
             return_legacy_cache = True
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         if cache_position is None:
diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py
index 522b6db7bcc7..d9ca3be7b0d4 100644
--- a/src/transformers/models/mixtral/modeling_mixtral.py
+++ b/src/transformers/models/mixtral/modeling_mixtral.py
@@ -1022,7 +1022,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         if inputs_embeds is None:
diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py
index 1940660f61b5..c185112f318c 100644
--- a/src/transformers/models/olmo/modeling_olmo.py
+++ b/src/transformers/models/olmo/modeling_olmo.py
@@ -872,7 +872,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         if cache_position is None:
diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py
index 1e4f56c0674d..e6f3f491d309 100644
--- a/src/transformers/models/persimmon/modeling_persimmon.py
+++ b/src/transformers/models/persimmon/modeling_persimmon.py
@@ -682,7 +682,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         if inputs_embeds is None:
diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py
index 6d63c0ea7e8e..f53ae35ca4ce 100644
--- a/src/transformers/models/phi/modeling_phi.py
+++ b/src/transformers/models/phi/modeling_phi.py
@@ -966,7 +966,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         if inputs_embeds is None:
diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py
index 08417fcabfaa..d6788c5cc350 100644
--- a/src/transformers/models/phi3/modeling_phi3.py
+++ b/src/transformers/models/phi3/modeling_phi3.py
@@ -1007,7 +1007,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         if inputs_embeds is None:
diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py
index 28b414b1901b..59fe54819df1 100644
--- a/src/transformers/models/qwen2/modeling_qwen2.py
+++ b/src/transformers/models/qwen2/modeling_qwen2.py
@@ -871,7 +871,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         if inputs_embeds is None:
diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
index 12ebe26e058d..f1f8ca3ff532 100644
--- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
+++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
@@ -1033,7 +1033,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         if inputs_embeds is None:
diff --git a/src/transformers/models/rag/configuration_rag.py b/src/transformers/models/rag/configuration_rag.py
index 5dd4d12c5e74..98de7382a456 100644
--- a/src/transformers/models/rag/configuration_rag.py
+++ b/src/transformers/models/rag/configuration_rag.py
@@ -124,9 +124,11 @@ def __init__(
             vocab_size=vocab_size,
             **kwargs,
         )
-        assert (
-            "question_encoder" in kwargs and "generator" in kwargs
-        ), "Config has to be initialized with question_encoder and generator config"
+        if "question_encoder" not in kwargs or "generator" not in kwargs:
+            raise ValueError(
+                f"A configuration of type {self.model_type} cannot be instantiated because "
+                f"both `question_encoder` and `generator` sub-configurations were not passed, only {kwargs}"
+            )
         question_encoder_config = kwargs.pop("question_encoder")
         question_encoder_model_type = question_encoder_config.pop("model_type")
         decoder_config = kwargs.pop("generator")
diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py
index 988948a9a827..ae317af37875 100755
--- a/src/transformers/models/stablelm/modeling_stablelm.py
+++ b/src/transformers/models/stablelm/modeling_stablelm.py
@@ -959,7 +959,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         if inputs_embeds is None:
diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py
index d51077b04254..21469e9d2223 100644
--- a/src/transformers/models/starcoder2/modeling_starcoder2.py
+++ b/src/transformers/models/starcoder2/modeling_starcoder2.py
@@ -844,7 +844,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
                 "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
         if inputs_embeds is None:
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 4732ecea8611..18df9ad6193f 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -289,13 +289,6 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class ForceTokensLogitsProcessor(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
 class GenerationMixin(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/tests/generation/test_stopping_criteria.py b/tests/generation/test_stopping_criteria.py
index ddf9a1c9379e..a04dac96169e 100644
--- a/tests/generation/test_stopping_criteria.py
+++ b/tests/generation/test_stopping_criteria.py
@@ -28,7 +28,6 @@
     from transformers.generation import (
         EosTokenCriteria,
         MaxLengthCriteria,
-        MaxNewTokensCriteria,
         MaxTimeCriteria,
         StoppingCriteriaList,
         StopStringCriteria,
@@ -76,21 +75,6 @@ def test_max_length_criteria(self):
         input_ids, scores = self._get_tensors(10)
         self.assertTrue(all(criteria(input_ids, scores)))
 
-    def test_max_new_tokens_criteria(self):
-        criteria = MaxNewTokensCriteria(start_length=5, max_new_tokens=5)
-
-        input_ids, scores = self._get_tensors(5)
-        self.assertFalse(all(criteria(input_ids, scores)))
-
-        input_ids, scores = self._get_tensors(9)
-        self.assertFalse(all(criteria(input_ids, scores)))
-
-        input_ids, scores = self._get_tensors(10)
-        self.assertTrue(all(criteria(input_ids, scores)))
-
-        criteria_list = StoppingCriteriaList([criteria])
-        self.assertEqual(criteria_list.max_length, 10)
-
     def test_max_time_criteria(self):
         input_ids, scores = self._get_tensors(5)
 
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index ae52f6c67404..db2bb7989dea 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -1576,6 +1576,11 @@ def test_generate_continue_from_past_key_values(self):
             # 3. ignore `token_type_ids` for simplicity
             # 4. ignore `forced_eos_token_id`, which requires further manipulation of the continuation inputs and is
             #    active by default on some models
+            # 5. ignore `encoder_no_repeat_ngram_size`, which is set by default in some encoder-decoder models. When
+            #    we use their decoder as a stand-alone model, `encoder_no_repeat_ngram_size` actually prevents
+            #    repetition exclusively from the prompt. This test compares one call against two calls with
+            #    cache, and what is considered a prompt differs between the two cases.
+
             if "token_type_ids" in inputs:
                 del inputs["token_type_ids"]
 
@@ -1583,6 +1588,7 @@ def test_generate_continue_from_past_key_values(self):
             model.eval()
             model.generation_config.pad_token_id = model.generation_config.eos_token_id = -1
             model.generation_config.forced_eos_token_id = None
+            model.generation_config.encoder_no_repeat_ngram_size = 0
             model.generation_config.use_cache = True
 
             # If "past_key_values" is not returned, skip the test (e.g. RWKV uses a different cache name and format)
@@ -2790,7 +2796,7 @@ def forward(self, input_ids, **kwargs):
     def test_default_max_length_warning(self):
         model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
         tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
-        model.config.pad_token_id = tokenizer.eos_token_id
+        model.generation_config.pad_token_id = tokenizer.eos_token_id
 
         text = "Hello world"
         tokenized_inputs = tokenizer([text], return_tensors="pt")
@@ -2817,8 +2823,8 @@ def test_length_warning_assisted_generation(self):
         model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
         assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
         tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
-        model.config.pad_token_id = tokenizer.eos_token_id
-        assistant.config.pad_token_id = tokenizer.eos_token_id
+        model.generation_config.pad_token_id = tokenizer.eos_token_id
+        assistant.generation_config.pad_token_id = tokenizer.eos_token_id
 
         text = "Hello world"
         tokenized_inputs = tokenizer([text], return_tensors="pt")
@@ -2839,8 +2845,8 @@ def test_generated_length_assisted_generation(self):
         model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
         assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
         tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
-        model.config.pad_token_id = tokenizer.eos_token_id
-        assistant.config.pad_token_id = tokenizer.eos_token_id
+        model.generation_config.pad_token_id = tokenizer.eos_token_id
+        assistant.generation_config.pad_token_id = tokenizer.eos_token_id
 
         text = "Hello world"
         tokenized_inputs = tokenizer([text], return_tensors="pt")
@@ -2866,7 +2872,7 @@ def test_model_kwarg_assisted_decoding_decoder_only(self):
         # PT-only test: TF doesn't support assisted decoding yet.
         model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
         tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
-        model.config.pad_token_id = tokenizer.eos_token_id
+        model.generation_config.pad_token_id = tokenizer.eos_token_id
 
         text = "Hello world"
         tokenized_inputs = tokenizer([text], return_tensors="pt")
diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py
index dd0cb5bf4c0b..eda51d21199f 100644
--- a/tests/models/bart/test_modeling_bart.py
+++ b/tests/models/bart/test_modeling_bart.py
@@ -123,12 +123,6 @@ def __init__(
         self.pad_token_id = pad_token_id
         self.bos_token_id = bos_token_id
 
-        # forcing a certain token to be generated, sets all other tokens to -inf
-        # if however the token to be generated is already at -inf then it can lead token
-        # `nan` values and thus break generation
-        self.forced_bos_token_id = None
-        self.forced_eos_token_id = None
-
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
@@ -158,8 +152,6 @@ def get_config(self):
             eos_token_id=self.eos_token_id,
             bos_token_id=self.bos_token_id,
             pad_token_id=self.pad_token_id,
-            forced_bos_token_id=self.forced_bos_token_id,
-            forced_eos_token_id=self.forced_eos_token_id,
         )
 
     def get_pipeline_config(self):
diff --git a/tests/models/blenderbot/test_modeling_blenderbot.py b/tests/models/blenderbot/test_modeling_blenderbot.py
index fa0797cbeed8..cecedb8a9071 100644
--- a/tests/models/blenderbot/test_modeling_blenderbot.py
+++ b/tests/models/blenderbot/test_modeling_blenderbot.py
@@ -116,12 +116,6 @@ def __init__(
         self.pad_token_id = pad_token_id
         self.bos_token_id = bos_token_id
 
-        # forcing a certain token to be generated, sets all other tokens to -inf
-        # if however the token to be generated is already at -inf then it can lead token
-        # `nan` values and thus break generation
-        self.forced_bos_token_id = None
-        self.forced_eos_token_id = None
-
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
             3,
@@ -150,8 +144,6 @@ def get_config(self):
             eos_token_id=self.eos_token_id,
             bos_token_id=self.bos_token_id,
             pad_token_id=self.pad_token_id,
-            forced_bos_token_id=self.forced_bos_token_id,
-            forced_eos_token_id=self.forced_eos_token_id,
         )
 
     def get_pipeline_config(self):
@@ -368,7 +360,6 @@ def __init__(
         decoder_attention_heads=4,
         max_position_embeddings=30,
         is_encoder_decoder=False,
-        encoder_no_repeat_ngram_size=0,
         pad_token_id=0,
         bos_token_id=1,
         eos_token_id=2,
@@ -399,7 +390,6 @@ def __init__(
         self.use_cache = use_cache
         self.max_position_embeddings = max_position_embeddings
         self.is_encoder_decoder = is_encoder_decoder
-        self.encoder_no_repeat_ngram_size = encoder_no_repeat_ngram_size
 
         self.scope = None
         self.decoder_key_length = decoder_seq_length
@@ -431,7 +421,6 @@ def prepare_config_and_inputs(self):
             decoder_start_token_id=self.decoder_start_token_id,
             max_position_embeddings=self.max_position_embeddings,
             is_encoder_decoder=self.is_encoder_decoder,
-            encoder_no_repeat_ngram_size=self.encoder_no_repeat_ngram_size,
         )
 
         return (
diff --git a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py
index 6be86a66b98e..59f68b547547 100644
--- a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py
+++ b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py
@@ -113,12 +113,6 @@ def __init__(
         self.pad_token_id = pad_token_id
         self.bos_token_id = bos_token_id
 
-        # forcing a certain token to be generated, sets all other tokens to -inf
-        # if however the token to be generated is already at -inf then it can lead token
-        # `nan` values and thus break generation
-        self.forced_bos_token_id = None
-        self.forced_eos_token_id = None
-
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
             3,
@@ -147,8 +141,6 @@ def get_config(self):
             eos_token_id=self.eos_token_id,
             bos_token_id=self.bos_token_id,
             pad_token_id=self.pad_token_id,
-            forced_bos_token_id=self.forced_bos_token_id,
-            forced_eos_token_id=self.forced_eos_token_id,
         )
 
     def prepare_config_and_inputs_for_common(self):
diff --git a/tests/models/decision_transformer/test_modeling_decision_transformer.py b/tests/models/decision_transformer/test_modeling_decision_transformer.py
index 27d1598167e6..0c95e6291c50 100644
--- a/tests/models/decision_transformer/test_modeling_decision_transformer.py
+++ b/tests/models/decision_transformer/test_modeling_decision_transformer.py
@@ -41,7 +41,6 @@ def __init__(
         act_dim=6,
         state_dim=17,
         hidden_size=23,
-        max_length=11,
         is_training=True,
     ):
         self.parent = parent
@@ -50,7 +49,6 @@ def __init__(
         self.act_dim = act_dim
         self.state_dim = state_dim
         self.hidden_size = hidden_size
-        self.max_length = max_length
         self.is_training = is_training
 
     def prepare_config_and_inputs(self):
@@ -80,7 +78,6 @@ def get_config(self):
             act_dim=self.act_dim,
             state_dim=self.state_dim,
             hidden_size=self.hidden_size,
-            max_length=self.max_length,
         )
 
     def create_and_check_model(
diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py
index 4f4fa36622d1..aed5381fcc70 100644
--- a/tests/models/marian/test_modeling_marian.py
+++ b/tests/models/marian/test_modeling_marian.py
@@ -132,12 +132,6 @@ def __init__(
         self.bos_token_id = bos_token_id
         self.decoder_start_token_id = decoder_start_token_id
 
-        # forcing a certain token to be generated, sets all other tokens to -inf
-        # if however the token to be generated is already at -inf then it can lead token
-        # `nan` values and thus break generation
-        self.forced_bos_token_id = None
-        self.forced_eos_token_id = None
-
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
             3,
@@ -167,8 +161,6 @@ def get_config(self):
             bos_token_id=self.bos_token_id,
             pad_token_id=self.pad_token_id,
             decoder_start_token_id=self.decoder_start_token_id,
-            forced_bos_token_id=self.forced_bos_token_id,
-            forced_eos_token_id=self.forced_eos_token_id,
         )
 
     def prepare_config_and_inputs_for_common(self):
diff --git a/tests/models/mbart/test_modeling_mbart.py b/tests/models/mbart/test_modeling_mbart.py
index 5a8263e11969..9401d892daa3 100644
--- a/tests/models/mbart/test_modeling_mbart.py
+++ b/tests/models/mbart/test_modeling_mbart.py
@@ -120,12 +120,6 @@ def __init__(
         self.pad_token_id = pad_token_id
         self.bos_token_id = bos_token_id
 
-        # forcing a certain token to be generated, sets all other tokens to -inf
-        # if however the token to be generated is already at -inf then it can lead token
-        # `nan` values and thus break generation
-        self.forced_bos_token_id = None
-        self.forced_eos_token_id = None
-
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
@@ -155,8 +149,6 @@ def get_config(self):
             eos_token_id=self.eos_token_id,
             bos_token_id=self.bos_token_id,
             pad_token_id=self.pad_token_id,
-            forced_bos_token_id=self.forced_bos_token_id,
-            forced_eos_token_id=self.forced_eos_token_id,
         )
 
     def prepare_config_and_inputs_for_common(self):
diff --git a/tests/models/mobilevit/test_modeling_mobilevit.py b/tests/models/mobilevit/test_modeling_mobilevit.py
index 9eb5878500d5..cd4cfa68e5dc 100644
--- a/tests/models/mobilevit/test_modeling_mobilevit.py
+++ b/tests/models/mobilevit/test_modeling_mobilevit.py
@@ -17,7 +17,7 @@
 import unittest
 
 from transformers import MobileViTConfig
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
@@ -274,6 +274,10 @@ def test_model_from_pretrained(self):
         model = MobileViTModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
+    @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+
 
 # We will verify our results on an image of cute cats
 def prepare_img():
diff --git a/tests/models/pegasus/test_modeling_pegasus.py b/tests/models/pegasus/test_modeling_pegasus.py
index f7de1258847d..2bd102b904e3 100644
--- a/tests/models/pegasus/test_modeling_pegasus.py
+++ b/tests/models/pegasus/test_modeling_pegasus.py
@@ -112,12 +112,6 @@ def __init__(
         self.pad_token_id = pad_token_id
         self.bos_token_id = bos_token_id
 
-        # forcing a certain token to be generated, sets all other tokens to -inf
-        # if however the token to be generated is already at -inf then it can lead token
-        # `nan` values and thus break generation
-        self.forced_bos_token_id = None
-        self.forced_eos_token_id = None
-
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
@@ -165,8 +159,6 @@ def get_config(self):
             eos_token_id=self.eos_token_id,
             bos_token_id=self.bos_token_id,
             pad_token_id=self.pad_token_id,
-            forced_bos_token_id=self.forced_bos_token_id,
-            forced_eos_token_id=self.forced_eos_token_id,
         )
 
     def prepare_config_and_inputs_for_common(self):
diff --git a/tests/models/whisper/test_modeling_flax_whisper.py b/tests/models/whisper/test_modeling_flax_whisper.py
index 4b8092e800ad..065c6536481d 100644
--- a/tests/models/whisper/test_modeling_flax_whisper.py
+++ b/tests/models/whisper/test_modeling_flax_whisper.py
@@ -84,7 +84,6 @@ def __init__(
         decoder_start_token_id=85,
         num_conv_layers=1,
         suppress_tokens=None,
-        begin_suppress_tokens=None,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -118,7 +117,6 @@ def __init__(
         self.decoder_start_token_id = decoder_start_token_id
         self.num_conv_layers = num_conv_layers
         self.suppress_tokens = suppress_tokens
-        self.begin_suppress_tokens = begin_suppress_tokens
 
     def prepare_config_and_inputs_for_common(self):
         input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)
@@ -147,7 +145,6 @@ def prepare_config_and_inputs_for_common(self):
             encoder_ffn_dim=self.encoder_ffn_dim,
             encoder_layers=self.encoder_layers,
             suppress_tokens=self.suppress_tokens,
-            begin_suppress_tokens=self.begin_suppress_tokens,
         )
         inputs_dict = prepare_whisper_inputs_dict(config, input_features, decoder_input_ids)
         return config, inputs_dict
@@ -741,7 +738,6 @@ def __init__(
         num_mel_bins=80,
         num_conv_layers=1,
         suppress_tokens=None,
-        begin_suppress_tokens=None,
         classifier_proj_size=4,
         num_labels=2,
         is_encoder_decoder=False,
@@ -764,7 +760,6 @@ def __init__(
         self.max_source_positions = max_source_positions
         self.num_conv_layers = num_conv_layers
         self.suppress_tokens = suppress_tokens
-        self.begin_suppress_tokens = begin_suppress_tokens
         self.classifier_proj_size = classifier_proj_size
         self.num_labels = num_labels
         self.is_encoder_decoder = is_encoder_decoder
@@ -785,7 +780,6 @@ def get_config(self):
             decoder_ffn_dim=self.hidden_size,
             encoder_ffn_dim=self.hidden_size,
             suppress_tokens=self.suppress_tokens,
-            begin_suppress_tokens=self.begin_suppress_tokens,
             classifier_proj_size=self.classifier_proj_size,
             num_labels=self.num_labels,
             is_encoder_decoder=self.is_encoder_decoder,
diff --git a/tests/models/whisper/test_modeling_tf_whisper.py b/tests/models/whisper/test_modeling_tf_whisper.py
index b200671e048f..be311486267d 100644
--- a/tests/models/whisper/test_modeling_tf_whisper.py
+++ b/tests/models/whisper/test_modeling_tf_whisper.py
@@ -104,7 +104,6 @@ def __init__(
         decoder_start_token_id=85,
         num_conv_layers=1,
         suppress_tokens=None,
-        begin_suppress_tokens=None,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -129,7 +128,6 @@ def __init__(
         self.decoder_start_token_id = decoder_start_token_id
         self.num_conv_layers = num_conv_layers
         self.suppress_tokens = suppress_tokens
-        self.begin_suppress_tokens = begin_suppress_tokens
 
     def prepare_config_and_inputs(self):
         input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)
@@ -166,7 +164,6 @@ def get_config(self):
             encoder_ffn_dim=self.hidden_size,
             decoder_start_token_id=self.decoder_start_token_id,
             suppress_tokens=self.suppress_tokens,
-            begin_suppress_tokens=self.begin_suppress_tokens,
         )
 
     def prepare_config_and_inputs_for_common(self):
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 6deebf552b91..f3d191b4d3c4 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -218,7 +218,6 @@ def __init__(
         decoder_start_token_id=85,
         num_conv_layers=1,
         suppress_tokens=None,
-        begin_suppress_tokens=None,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -243,7 +242,6 @@ def __init__(
         self.decoder_start_token_id = decoder_start_token_id
         self.num_conv_layers = num_conv_layers
         self.suppress_tokens = suppress_tokens
-        self.begin_suppress_tokens = begin_suppress_tokens
 
     def prepare_config_and_inputs(self):
         input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)
@@ -280,7 +278,6 @@ def get_config(self):
             encoder_ffn_dim=self.hidden_size,
             decoder_start_token_id=self.decoder_start_token_id,
             suppress_tokens=self.suppress_tokens,
-            begin_suppress_tokens=self.begin_suppress_tokens,
         )
 
     def prepare_config_and_inputs_for_common(self):
@@ -3309,7 +3306,6 @@ def __init__(
         num_mel_bins=80,
         num_conv_layers=1,
         suppress_tokens=None,
-        begin_suppress_tokens=None,
         classifier_proj_size=4,
         num_labels=2,
         is_encoder_decoder=False,
@@ -3332,7 +3328,6 @@ def __init__(
         self.max_source_positions = max_source_positions
         self.num_conv_layers = num_conv_layers
         self.suppress_tokens = suppress_tokens
-        self.begin_suppress_tokens = begin_suppress_tokens
         self.classifier_proj_size = classifier_proj_size
         self.num_labels = num_labels
         self.is_encoder_decoder = is_encoder_decoder
@@ -3353,7 +3348,6 @@ def get_config(self):
             decoder_ffn_dim=self.hidden_size,
             encoder_ffn_dim=self.hidden_size,
             suppress_tokens=self.suppress_tokens,
-            begin_suppress_tokens=self.begin_suppress_tokens,
             classifier_proj_size=self.classifier_proj_size,
             num_labels=self.num_labels,
             is_encoder_decoder=self.is_encoder_decoder,
@@ -3685,7 +3679,6 @@ def __init__(
         decoder_start_token_id=85,
         num_conv_layers=1,
         suppress_tokens=None,
-        begin_suppress_tokens=None,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -3709,7 +3702,6 @@ def __init__(
         self.decoder_start_token_id = decoder_start_token_id
         self.num_conv_layers = num_conv_layers
         self.suppress_tokens = suppress_tokens
-        self.begin_suppress_tokens = begin_suppress_tokens
 
     def prepare_config_and_inputs(self):
         input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)
@@ -3765,7 +3757,6 @@ def get_config(self):
             encoder_ffn_dim=self.hidden_size,
             decoder_start_token_id=self.decoder_start_token_id,
             suppress_tokens=self.suppress_tokens,
-            begin_suppress_tokens=self.begin_suppress_tokens,
         )
 
     def prepare_config_and_inputs_for_common(self):
diff --git a/tests/utils/test_cli.py b/tests/utils/test_cli.py
index b208ff19f1a4..148f091c2794 100644
--- a/tests/utils/test_cli.py
+++ b/tests/utils/test_cli.py
@@ -18,7 +18,7 @@
 import unittest
 from unittest.mock import patch
 
-from transformers.testing_utils import CaptureStd, is_pt_tf_cross_test, require_torch
+from transformers.testing_utils import CaptureStd, require_torch
 
 
 class CLITest(unittest.TestCase):
@@ -33,18 +33,6 @@ def test_cli_env(self):
         self.assertIn("Platform", cs.out)
         self.assertIn("Using distributed or parallel set-up in script?", cs.out)
 
-    @is_pt_tf_cross_test
-    @patch(
-        "sys.argv", ["fakeprogrampath", "pt-to-tf", "--model-name", "hf-internal-testing/tiny-random-gptj", "--no-pr"]
-    )
-    def test_cli_pt_to_tf(self):
-        import transformers.commands.transformers_cli
-
-        shutil.rmtree("/tmp/hf-internal-testing/tiny-random-gptj", ignore_errors=True)  # cleans potential past runs
-        transformers.commands.transformers_cli.main()
-
-        self.assertTrue(os.path.exists("/tmp/hf-internal-testing/tiny-random-gptj/tf_model.h5"))
-
     @require_torch
     @patch("sys.argv", ["fakeprogrampath", "download", "hf-internal-testing/tiny-random-gptj", "--cache-dir", "/tmp"])
     def test_cli_download(self):
diff --git a/tests/utils/test_configuration_utils.py b/tests/utils/test_configuration_utils.py
index 934e11a78797..6b684867eb87 100644
--- a/tests/utils/test_configuration_utils.py
+++ b/tests/utils/test_configuration_utils.py
@@ -315,21 +315,19 @@ def test_repo_versioning_before(self):
         old_configuration = old_transformers.models.auto.AutoConfig.from_pretrained(repo)
         self.assertEqual(old_configuration.hidden_size, 768)
 
-    def test_saving_config_with_custom_generation_kwargs_raises_warning(self):
+    def test_saving_config_with_custom_generation_kwargs_raises_exception(self):
         config = BertConfig(min_length=3)  # `min_length = 3` is a non-default generation kwarg
         with tempfile.TemporaryDirectory() as tmp_dir:
-            with self.assertLogs("transformers.configuration_utils", level="WARNING") as logs:
+            with self.assertRaises(ValueError):
                 config.save_pretrained(tmp_dir)
-            self.assertEqual(len(logs.output), 1)
-            self.assertIn("min_length", logs.output[0])
 
-    def test_has_non_default_generation_parameters(self):
+    def test_get_non_default_generation_parameters(self):
         config = BertConfig()
-        self.assertFalse(config._has_non_default_generation_parameters())
+        self.assertFalse(len(config._get_non_default_generation_parameters()) > 0)
         config = BertConfig(min_length=3)
-        self.assertTrue(config._has_non_default_generation_parameters())
+        self.assertTrue(len(config._get_non_default_generation_parameters()) > 0)
         config = BertConfig(min_length=0)  # `min_length = 0` is a default generation kwarg
-        self.assertFalse(config._has_non_default_generation_parameters())
+        self.assertFalse(len(config._get_non_default_generation_parameters()) > 0)
 
     def test_loading_config_do_not_raise_future_warnings(self):
         """Regression test for https://github.com/huggingface/transformers/issues/31002."""
diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
index 5dec17f1f477..238a9a1fe4d6 100644
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -23,6 +23,7 @@
 import unittest
 import unittest.mock as mock
 import uuid
+import warnings
 from pathlib import Path
 
 import requests
@@ -1599,14 +1600,30 @@ def test_safetensors_torch_from_torch_sharded(self):
         for p1, p2 in zip(model.parameters(), new_model.parameters()):
             self.assertTrue(torch.equal(p1, p2))
 
-    def test_modifying_model_config_causes_warning_saving_generation_config(self):
+    def test_modifying_model_config_gets_moved_to_generation_config(self):
+        """
+        Calling `model.save_pretrained` should move the changes made to `generate` parameterization in the model config
+        to the generation config.
+        """
         model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-        model.config.top_k = 1
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            with self.assertLogs("transformers.modeling_utils", level="WARNING") as logs:
+        # Initially, the repetition penalty has its default value in `model.config`. The `model.generation_config` will
+        # have the exact same default
+        self.assertTrue(model.config.repetition_penalty == 1.0)
+        self.assertTrue(model.generation_config.repetition_penalty == 1.0)
+        # If the user attempts to save a custom generation parameter:
+        model.config.repetition_penalty = 3.0
+        with warnings.catch_warnings(record=True) as warning_list:
+            with tempfile.TemporaryDirectory() as tmp_dir:
                 model.save_pretrained(tmp_dir)
-            self.assertEqual(len(logs.output), 1)
-            self.assertIn("Your generation config was originally created from the model config", logs.output[0])
+                # 1 - That parameter will be removed from `model.config`. We don't want to use `model.config` to store
+                # generative parameters, and the old default (1.0) would no longer reflect the user's wishes.
+                self.assertTrue(model.config.repetition_penalty is None)
+                # 2 - That parameter will be set in `model.generation_config` instead.
+                self.assertTrue(model.generation_config.repetition_penalty == 3.0)
+        # 3 - The user will see a warning regarding the custom parameter that has been moved.
+        self.assertTrue(len(warning_list) == 1)
+        self.assertTrue("Moving the following attributes" in str(warning_list[0].message))
+        self.assertTrue("repetition_penalty" in str(warning_list[0].message))
 
     @require_safetensors
     def test_model_from_pretrained_from_mlx(self):