From 2d2306a55c1a4f6909e3086aa1d7265c691d9944 Mon Sep 17 00:00:00 2001
From: Behrooz <ermiaazarkhalili@gmail.com>
Date: Wed, 5 Nov 2025 16:59:07 -0800
Subject: [PATCH 1/8] Move ORPOTrainer and ORPOConfig to experimental

- Move ORPOTrainer and ORPOConfig to trl.experimental.orpo
- Add deprecation warnings in trl.trainer with removal planned for TRL 0.29.0
- Update imports in tests, examples, and documentation
- Maintain backward compatibility through deprecation stubs

Fixes #4465
---
 docs/source/_toctree.yml              |    8 +-
 docs/source/orpo_trainer.md           |    2 +-
 examples/scripts/orpo.py              |    3 +-
 tests/test_orpo_trainer.py            |    2 +-
 trl/experimental/orpo/__init__.py     |   19 +
 trl/experimental/orpo/orpo_config.py  |  169 ++++
 trl/experimental/orpo/orpo_trainer.py | 1050 +++++++++++++++++++++++++
 trl/trainer/__init__.py               |    4 -
 trl/trainer/orpo_config.py            |  162 +---
 trl/trainer/orpo_trainer.py           | 1009 +-----------------------
 10 files changed, 1290 insertions(+), 1138 deletions(-)
 create mode 100644 trl/experimental/orpo/__init__.py
 create mode 100644 trl/experimental/orpo/orpo_config.py
 create mode 100644 trl/experimental/orpo/orpo_trainer.py

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 74f79544b33..a7555a1e7ed 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -70,8 +70,6 @@
       title: KTO
     - local: nash_md_trainer
       title: Nash-MD
-    - local: orpo_trainer
-      title: ORPO
     - local: ppo_trainer
       title: PPO
     - local: prm_trainer
@@ -117,8 +115,10 @@
     title: GRPO With Replay Buffer
   - local: gspo_token
     title: GSPO-token
-  - local: papo_trainer
-    title: PAPO
   - local: openenv
     title: OpenEnv Integration
+  - local: orpo_trainer
+    title: ORPO
+  - local: papo_trainer
+    title: PAPO
   title: Experimental
\ No newline at end of file
diff --git a/docs/source/orpo_trainer.md b/docs/source/orpo_trainer.md
index 3092de2d2ae..d3428313bb8 100644
--- a/docs/source/orpo_trainer.md
+++ b/docs/source/orpo_trainer.md
@@ -34,7 +34,7 @@ Below is the script to train the model:
 ```python
 # train_orpo.py
 from datasets import load_dataset
-from trl import ORPOConfig, ORPOTrainer
+from trl.experimental.orpo import ORPOConfig, ORPOTrainer
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
diff --git a/examples/scripts/orpo.py b/examples/scripts/orpo.py
index e256a4277ad..36eb32a565b 100644
--- a/examples/scripts/orpo.py
+++ b/examples/scripts/orpo.py
@@ -63,7 +63,8 @@
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
 
-from trl import ModelConfig, ORPOConfig, ORPOTrainer, ScriptArguments, get_peft_config
+from trl import ModelConfig, ScriptArguments, get_peft_config
+from trl.experimental.orpo import ORPOConfig, ORPOTrainer
 
 
 # Enable logging in a Hugging Face Space
diff --git a/tests/test_orpo_trainer.py b/tests/test_orpo_trainer.py
index 70f087ac948..554938293ca 100644
--- a/tests/test_orpo_trainer.py
+++ b/tests/test_orpo_trainer.py
@@ -17,7 +17,7 @@
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
 
-from trl import ORPOConfig, ORPOTrainer
+from trl.experimental.orpo import ORPOConfig, ORPOTrainer
 
 from .testing_utils import TrlTestCase, require_peft
 
diff --git a/trl/experimental/orpo/__init__.py b/trl/experimental/orpo/__init__.py
new file mode 100644
index 00000000000..17960ce5b18
--- /dev/null
+++ b/trl/experimental/orpo/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .orpo_config import ORPOConfig
+from .orpo_trainer import ORPOTrainer
+
+
+__all__ = ["ORPOConfig", "ORPOTrainer"]
diff --git a/trl/experimental/orpo/orpo_config.py b/trl/experimental/orpo/orpo_config.py
new file mode 100644
index 00000000000..523beeab934
--- /dev/null
+++ b/trl/experimental/orpo/orpo_config.py
@@ -0,0 +1,169 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import TrainingArguments
+
+
+@dataclass
+class ORPOConfig(TrainingArguments):
+    r"""
+    Configuration class for the [`ORPOTrainer`].
+
+    This class includes only the parameters that are specific to ORPO training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
+    differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters:
+        max_length (`int` or `None`, *optional*, defaults to `1024`):
+            Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
+            to use the default data collator.
+        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
+            Maximum length of the prompt. This argument is required if you want to use the default data collator.
+        max_completion_length (`int`, *optional*):
+            Maximum length of the completion. This argument is required if you want to use the default data collator
+            and your model is an encoder-decoder.
+        beta (`float`, *optional*, defaults to `0.1`):
+            Parameter controlling the relative ratio loss weight in the ORPO loss. In the
+            [paper](https://huggingface.co/papers/2403.07691), it is denoted by λ. In the
+            [code](https://github.com/xfactlab/orpo), it is denoted by `alpha`.
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model.
+        label_pad_token_id (`int`, *optional*, defaults to `-100`):
+            Label pad token id. This argument is required if you want to use the default data collator.
+        padding_value (`int`, *optional*):
+            Padding value to use. If `None`, the padding value of the tokenizer is used.
+        truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
+            Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
+            This argument is required if you want to use the default data collator.
+        generate_during_eval (`bool`, *optional*, defaults to `False`):
+            If `True`, generates and logs completions from the model to W&B or Comet during evaluation.
+        is_encoder_decoder (`bool`, *optional*):
+            When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
+            you need to specify if the model returned by the callable is an encoder-decoder model.
+        model_init_kwargs (`dict[str, Any]`, *optional*):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
+            string.
+        dataset_num_proc (`int`, *optional*):
+            Number of processes to use for processing the dataset.
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
+
+    # Parameters whose default values are overridden from TrainingArguments
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+
+    max_length: int | None = field(
+        default=1024,
+        metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
+    )
+    max_prompt_length: int | None = field(
+        default=512,
+        metadata={
+            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    max_completion_length: int | None = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    beta: float = field(
+        default=0.1,
+        metadata={
+            "help": "Parameter controlling the relative ratio loss weight in the ORPO loss. In the paper, it is "
+            "denoted by λ."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model."},
+    )
+    label_pad_token_id: int = field(
+        default=-100,
+        metadata={
+            "help": "Label pad token id. This argument is required if you want to use the default data collator."
+        },
+    )
+    padding_value: int | None = field(
+        default=None,
+        metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."},
+    )
+    truncation_mode: str = field(
+        default="keep_end",
+        metadata={
+            "help": "Truncation mode to use when the prompt is too long.",
+            "choices": ["keep_end", "keep_start"],
+        },
+    )
+    generate_during_eval: bool = field(
+        default=False,
+        metadata={"help": "If `True`, generates and logs completions from the model to W&B during evaluation."},
+    )
+    is_encoder_decoder: bool | None = field(
+        default=None,
+        metadata={
+            "help": "When using the `model_init` argument (callable) to instantiate the model instead of the `model` "
+            "argument, you need to specify if the model returned by the callable is an encoder-decoder model."
+        },
+    )
+    model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
+            "from a string."
+        },
+    )
+    dataset_num_proc: int | None = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+
+        super().__post_init__()
diff --git a/trl/experimental/orpo/orpo_trainer.py b/trl/experimental/orpo/orpo_trainer.py
new file mode 100644
index 00000000000..8ac3b0b6ed2
--- /dev/null
+++ b/trl/experimental/orpo/orpo_trainer.py
@@ -0,0 +1,1050 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import os
+import random
+import textwrap
+import warnings
+from collections import defaultdict
+from collections.abc import Callable
+from contextlib import nullcontext
+from pathlib import Path
+from typing import Any, Literal
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from accelerate import PartialState, logging
+from datasets import Dataset
+from torch import autocast
+from torch.utils.data import DataLoader
+from transformers import (
+    AutoModelForCausalLM,
+    BaseImageProcessor,
+    DataCollator,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    is_comet_available,
+    is_torch_xla_available,
+    is_wandb_available,
+)
+from transformers.trainer_callback import TrainerCallback
+from transformers.trainer_utils import EvalLoopOutput
+from transformers.utils import is_peft_available, is_torch_fx_proxy
+
+from ...data_utils import maybe_apply_chat_template, maybe_extract_prompt
+from ..base_trainer import BaseTrainer
+from .orpo_config import ORPOConfig
+from ..utils import (
+    DPODataCollatorWithPadding,
+    add_bos_token_if_needed,
+    add_eos_token_if_needed,
+    disable_dropout_in_model,
+    log_table_to_comet_experiment,
+    pad_to_length,
+    peft_module_casting_to_bf16,
+    selective_log_softmax,
+)
+
+
+if is_peft_available():
+    from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training
+
+
+if is_wandb_available():
+    import wandb
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+
+logger = logging.get_logger(__name__)
+
+
+class ORPOTrainer(BaseTrainer):
+    r"""
+    Initialize ORPOTrainer.
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]):
+            The model to train, preferably an [`~transformers.AutoModelForSequenceClassification`].
+        args ([`ORPOConfig`]):
+            The ORPO config arguments to use for training.
+        data_collator ([`~transformers.DataCollator`]):
+            The data collator to use for training. If None is specified, the default data collator
+            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        train_dataset ([`~datasets.Dataset`]):
+            The dataset to use for training.
+        eval_dataset ([`~datasets.Dataset`]):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        model_init (`Callable[[], transformers.PreTrainedModel]`):
+            The model initializer to use for training. If None is specified, the default model initializer will be
+            used.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+        peft_config (`dict`, defaults to `None`):
+            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
+            a PEFT model.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
+            metric values.
+    """
+
+    _tag_names = ["trl", "orpo"]
+    _name = "ORPO"
+    _paper = {
+        "title": "ORPO: Monolithic Preference Optimization without Reference Model",
+        "id": "2403.07691",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @article{hong2024orpo,
+                title        = {{ORPO: Monolithic Preference Optimization without Reference Model}},
+                author       = {Jiwoo Hong and Noah Lee and James Thorne},
+                year         = 2024,
+                eprint       = {arXiv:2403.07691}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module | str | None = None,
+        args: ORPOConfig | None = None,
+        data_collator: DataCollator | None = None,
+        train_dataset: Dataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        model_init: Callable[[], PreTrainedModel] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
+        peft_config: dict | None = None,
+        compute_metrics: Callable[[EvalLoopOutput], dict] | None = None,
+    ):
+        if args.model_init_kwargs is None:
+            model_init_kwargs = {}
+        elif not isinstance(model, str):
+            raise ValueError("You passed model_kwargs to the ORPOTrainer. But your model is already instantiated.")
+        else:
+            model_init_kwargs = args.model_init_kwargs
+            dtype = model_init_kwargs.get("dtype")
+            if dtype is not None:
+                # Convert to `torch.dtype` if an str is passed
+                if isinstance(dtype, str) and dtype != "auto":
+                    dtype = getattr(torch, dtype)
+                if dtype != "auto" and not isinstance(dtype, torch.dtype):
+                    raise ValueError(
+                        f"Invalid `dtype` passed to the ORPOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
+                    )
+                model_init_kwargs["dtype"] = dtype
+
+        if isinstance(model, str):
+            model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
+
+        # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16`
+        # has been called in order to properly call autocast if needed.
+        self._peft_has_been_casted_to_bf16 = False
+
+        if not is_peft_available() and peft_config is not None:
+            raise ValueError(
+                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
+            )
+        elif is_peft_available() and peft_config is not None:
+            # if model is a peft model and we have a peft_config, we merge and unload it first
+            if isinstance(model, PeftModel):
+                model = model.merge_and_unload()
+
+            if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False):
+                _support_gc_kwargs = hasattr(
+                    args, "gradient_checkpointing_kwargs"
+                ) and "gradient_checkpointing_kwargs" in list(
+                    inspect.signature(prepare_model_for_kbit_training).parameters
+                )
+
+                prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing}
+
+                if _support_gc_kwargs:
+                    prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs
+
+                model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
+            elif args.gradient_checkpointing:
+                # For backward compatibility with older versions of transformers
+                if hasattr(model, "enable_input_require_grads"):
+                    model.enable_input_require_grads()
+                else:
+
+                    def make_inputs_require_grad(module, input, output):
+                        output.requires_grad_(True)
+
+                    model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+            # get peft model with the given config
+            model = get_peft_model(model, peft_config)
+            if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(model)
+                # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager
+                self._peft_has_been_casted_to_bf16 = True
+
+        # For models that use gradient_checkpointing, we need to attach a hook that enables input
+        # to explicitly have `requires_grad=True`, otherwise training will either silently
+        # fail or completely fail.
+        elif args.gradient_checkpointing:
+            # For backward compatibility with older versions of transformers
+            if hasattr(model, "enable_input_require_grads"):
+                model.enable_input_require_grads()
+            else:
+
+                def make_inputs_require_grad(module, input, output):
+                    output.requires_grad_(True)
+
+                model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+        if args.generate_during_eval and not (is_wandb_available() or is_comet_available()):
+            raise ValueError(
+                "`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
+                " Please install `wandb` or `comet-ml` to resolve."
+            )
+
+        if model is not None:
+            self.is_encoder_decoder = model.config.is_encoder_decoder
+        elif args.is_encoder_decoder is None:
+            raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.")
+        else:
+            self.is_encoder_decoder = args.is_encoder_decoder
+
+        if self.is_encoder_decoder:
+            self.decoder_start_token_id = model.config.decoder_start_token_id
+            self.pad_token_id = model.config.pad_token_id
+
+        if processing_class is None:
+            raise ValueError("processing_class must be specified to tokenize a ORPO dataset.")
+        if args.max_length is None:
+            logger.warning(
+                "`max_length` is not set in the ORPOConfig's init"
+                " it will default to `512` by default, but you should do it yourself in the future.",
+            )
+            max_length = 512
+        else:
+            max_length = args.max_length
+        if args.max_prompt_length is None:
+            logger.warning(
+                "`max_prompt_length` is not set in the ORPOConfig's init"
+                " it will default to `128` by default, but you should do it yourself in the future.",
+            )
+            max_prompt_length = 128
+        else:
+            max_prompt_length = args.max_prompt_length
+
+        if args.max_completion_length is None and self.is_encoder_decoder:
+            logger.warning(
+                "When using an encoder decoder architecture, you should set `max_completion_length` in the ORPOConfig's init"
+                " it will default to `128` by default, but you should do it yourself in the future.",
+            )
+            self.max_completion_length = 128
+        else:
+            self.max_completion_length = args.max_completion_length
+
+        if data_collator is None:
+            data_collator = DPODataCollatorWithPadding(
+                pad_token_id=processing_class.pad_token_id,
+                label_pad_token_id=args.label_pad_token_id,
+                is_encoder_decoder=self.is_encoder_decoder,
+            )
+
+            if args.remove_unused_columns:
+                args.remove_unused_columns = False
+                # warn users
+                logger.warning(
+                    "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments"
+                    " we have set it for you, but you should do it yourself in the future.",
+                )
+
+            self.use_dpo_data_collator = True
+        else:
+            self.use_dpo_data_collator = False
+
+        # Disable dropout in the model and reference model
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+
+        self.max_length = max_length
+        self.generate_during_eval = args.generate_during_eval
+        self.label_pad_token_id = args.label_pad_token_id
+        self.padding_value = args.padding_value if args.padding_value is not None else processing_class.pad_token_id
+        self.max_prompt_length = max_prompt_length
+        self.truncation_mode = args.truncation_mode
+        self.processing_class = processing_class
+
+        self.beta = args.beta
+        self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
+        self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0)
+        if self.aux_loss_enabled and self.aux_loss_coef == 0.0:
+            logger.warning(
+                "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to "
+                "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value "
+                "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary "
+                "loss.",
+            )
+
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in ORPO, the sampled data does not include the
+        # "input_ids" key. Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and
+        # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens
+        # of the input, floating-point operations will not be computed." To suppress this warning, we set the
+        # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate
+        # that the warning has already been issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        # Compute that only on the main process for faster data processing.
+        # see: https://github.com/huggingface/trl/pull/1255
+        with PartialState().main_process_first():
+            # Extract the prompt if needed, and apply the chat template if needed
+            train_dataset = train_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc)
+            train_dataset = train_dataset.map(
+                maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}, num_proc=args.dataset_num_proc
+            )
+            train_dataset = train_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc)
+            if eval_dataset is not None:
+                eval_dataset = eval_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc)
+                eval_dataset = eval_dataset.map(
+                    maybe_apply_chat_template,
+                    fn_kwargs={"tokenizer": processing_class},
+                    num_proc=args.dataset_num_proc,
+                )
+                eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc)
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            model_init=model_init,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
+        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
+        # self.model_accepts_loss_kwargs to False to enable scaling.
+        self.model_accepts_loss_kwargs = False
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        if not hasattr(self, "accelerator"):
+            raise AttributeError(
+                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
+            )
+
+    def build_tokenized_answer(self, prompt, answer):
+        """
+        Llama tokenizer does satisfy `enc(a + b) = enc(a) + enc(b)`. It does ensure `enc(a + b) = enc(a) + enc(a +
+        b)[len(enc(a)):]`. Reference:
+            https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
+        """
+
+        full_tokenized = self.processing_class(prompt + answer, add_special_tokens=False)
+        prompt_input_ids = self.processing_class(prompt, add_special_tokens=False)["input_ids"]
+
+        answer_input_ids = full_tokenized["input_ids"][len(prompt_input_ids) :]
+        answer_attention_mask = full_tokenized["attention_mask"][len(prompt_input_ids) :]
+
+        # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]`
+        full_concat_input_ids = np.concatenate([prompt_input_ids, answer_input_ids])
+
+        # Prepare input tokens for token by token comparison
+        full_input_ids = np.array(full_tokenized["input_ids"])
+
+        if len(full_input_ids) != len(full_concat_input_ids):
+            raise ValueError("Prompt input ids and answer input ids should have the same length.")
+
+        # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens
+        # can be merged together when tokenizing prompt+answer. This could result
+        # on the last token from the prompt being different when tokenized on its own
+        # vs when done as prompt+answer.
+        response_token_ids_start_idx = len(prompt_input_ids)
+
+        # If tokenized prompt is different than both prompt+answer, then it means the
+        # last token has changed due to merging.
+        if prompt_input_ids != full_tokenized["input_ids"][:response_token_ids_start_idx]:
+            response_token_ids_start_idx -= 1
+
+        prompt_input_ids = full_tokenized["input_ids"][:response_token_ids_start_idx]
+        prompt_attention_mask = full_tokenized["attention_mask"][:response_token_ids_start_idx]
+
+        if len(prompt_input_ids) != len(prompt_attention_mask):
+            raise ValueError("Prompt input ids and attention mask should have the same length.")
+
+        answer_input_ids = full_tokenized["input_ids"][response_token_ids_start_idx:]
+        answer_attention_mask = full_tokenized["attention_mask"][response_token_ids_start_idx:]
+
+        return dict(
+            prompt_input_ids=prompt_input_ids,
+            prompt_attention_mask=prompt_attention_mask,
+            input_ids=answer_input_ids,
+            attention_mask=answer_attention_mask,
+        )
+
+    def tokenize_row(self, feature, model: PreTrainedModel | nn.Module | None = None) -> dict:
+        """Tokenize a single row from a ORPO specific dataset.
+
+        At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
+        chosen or prompt + rejected responses is/are too long. First we truncate the prompt; if we're still too long,
+        we truncate the chosen/rejected.
+
+        We also create the labels for the chosen/rejected responses, which are of length equal to the sum of the length
+        of the prompt and the chosen/rejected response, with label_pad_token_id for the prompt tokens.
+        """
+        batch = {}
+        prompt = feature["prompt"]
+        chosen = feature["chosen"]
+        rejected = feature["rejected"]
+
+        if not self.is_encoder_decoder:
+            # Check issues below for more details
+            #  1. https://github.com/huggingface/trl/issues/907
+            #  2. https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
+            #  3. https://github.com/LianjiaTech/BELLE/issues/337
+
+            if not isinstance(prompt, str):
+                raise ValueError(f"prompt should be an str but got {type(prompt)}")
+            prompt_tokens = self.processing_class(prompt, add_special_tokens=False)
+            prompt_tokens = {f"prompt_{k}": v for k, v in prompt_tokens.items()}
+
+            if not isinstance(chosen, str):
+                raise ValueError(f"chosen should be an str but got {type(chosen)}")
+            chosen_tokens = self.build_tokenized_answer(prompt, chosen)
+
+            if not isinstance(rejected, str):
+                raise ValueError(f"rejected should be an str but got {type(rejected)}")
+            rejected_tokens = self.build_tokenized_answer(prompt, rejected)
+
+            # Last prompt token might get merged by tokenizer and
+            # it should not be included for generation if that happens
+            prompt_len_input_ids = len(prompt_tokens["prompt_input_ids"])
+
+            chosen_prompt_len_input_ids = len(chosen_tokens["prompt_input_ids"])
+            rejected_prompt_len_input_ids = len(rejected_tokens["prompt_input_ids"])
+            prompt_len_input_ids = min(chosen_prompt_len_input_ids, rejected_prompt_len_input_ids)
+
+            for k, v in prompt_tokens.items():
+                prompt_tokens[k] = v[:prompt_len_input_ids]
+
+            # Make sure prompts only have one different token at most an
+            # and length only differs by 1 at most
+            num_diff_tokens = sum(
+                a != b
+                for a, b in zip(chosen_tokens["prompt_input_ids"], rejected_tokens["prompt_input_ids"], strict=True)
+            )
+            num_diff_len = abs(chosen_prompt_len_input_ids - rejected_prompt_len_input_ids)
+            if num_diff_tokens > 1 or num_diff_len > 1:
+                raise ValueError(
+                    "Chosen and rejected prompt_input_ids might only differ on the "
+                    "last token due to tokenizer merge ops."
+                )
+
+            # add BOS token to head of prompt. Avoid adding if it's already there
+            prompt_tokens, chosen_tokens, rejected_tokens = add_bos_token_if_needed(
+                self.processing_class.bos_token_id,
+                prompt_len_input_ids,
+                prompt_tokens,
+                chosen_prompt_len_input_ids,
+                chosen_tokens,
+                rejected_prompt_len_input_ids,
+                rejected_tokens,
+            )
+
+            # add EOS token to end of answer. Avoid adding if it's already there
+            chosen_tokens, rejected_tokens = add_eos_token_if_needed(
+                self.processing_class.eos_token_id, chosen_tokens, rejected_tokens
+            )
+
+            longer_response_length = max(len(chosen_tokens["input_ids"]), len(rejected_tokens["input_ids"]))
+
+            # if combined sequence is too long, truncate the prompt
+            for answer_tokens in [chosen_tokens, rejected_tokens, prompt_tokens]:
+                if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length:
+                    if self.truncation_mode == "keep_start":
+                        for k in ["prompt_input_ids", "prompt_attention_mask"]:
+                            answer_tokens[k] = answer_tokens[k][: self.max_prompt_length]
+                    elif self.truncation_mode == "keep_end":
+                        for k in ["prompt_input_ids", "prompt_attention_mask"]:
+                            answer_tokens[k] = answer_tokens[k][-self.max_prompt_length :]
+                    else:
+                        raise ValueError(f"Unknown truncation mode: {self.truncation_mode}")
+
+            # if that's still too long, truncate the response
+            for answer_tokens in [chosen_tokens, rejected_tokens]:
+                if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length:
+                    for k in ["input_ids", "attention_mask"]:
+                        answer_tokens[k] = answer_tokens[k][: self.max_length - self.max_prompt_length]
+
+            # Create labels
+            chosen_sequence_tokens = {
+                k: chosen_tokens[f"prompt_{k}"] + chosen_tokens[k] for k in ["input_ids", "attention_mask"]
+            }
+            rejected_sequence_tokens = {
+                k: rejected_tokens[f"prompt_{k}"] + rejected_tokens[k] for k in ["input_ids", "attention_mask"]
+            }
+            chosen_sequence_tokens["labels"] = chosen_sequence_tokens["input_ids"][:]
+            chosen_sequence_tokens["labels"][: len(chosen_tokens["prompt_input_ids"])] = [
+                self.label_pad_token_id
+            ] * len(chosen_tokens["prompt_input_ids"])
+            rejected_sequence_tokens["labels"] = rejected_sequence_tokens["input_ids"][:]
+            rejected_sequence_tokens["labels"][: len(rejected_tokens["prompt_input_ids"])] = [
+                self.label_pad_token_id
+            ] * len(rejected_tokens["prompt_input_ids"])
+
+            for k, toks in {
+                "chosen_": chosen_sequence_tokens,
+                "rejected_": rejected_sequence_tokens,
+                "": prompt_tokens,
+            }.items():
+                for type_key, tokens in toks.items():
+                    if type_key == "token_type_ids":
+                        continue
+                    batch[f"{k}{type_key}"] = tokens
+
+        else:
+            chosen_tokens = self.processing_class(
+                chosen, truncation=True, max_length=self.max_completion_length, add_special_tokens=True
+            )
+            rejected_tokens = self.processing_class(
+                rejected, truncation=True, max_length=self.max_completion_length, add_special_tokens=True
+            )
+            prompt_tokens = self.processing_class(
+                prompt, truncation=True, max_length=self.max_prompt_length, add_special_tokens=True
+            )
+
+            batch["chosen_labels"] = chosen_tokens["input_ids"]
+            batch["rejected_labels"] = rejected_tokens["input_ids"]
+            batch["prompt_input_ids"] = prompt_tokens["input_ids"]
+            batch["prompt_attention_mask"] = prompt_tokens["attention_mask"]
+
+            if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"):
+                batch["rejected_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
+                    labels=torch.tensor(batch["rejected_labels"])
+                )
+                batch["chosen_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
+                    labels=torch.tensor(batch["chosen_labels"])
+                )
+
+        if is_torch_xla_available():
+            # Pad the sequences to global max_length to avoid TorchXLA recompilation
+            for k in batch:
+                if "labels" in k or self.is_encoder_decoder:
+                    pad_value = self.label_pad_token_id
+                elif k.endswith("_input_ids"):
+                    pad_value = self.padding_value
+                elif k.endswith("_attention_mask"):
+                    pad_value = 0
+                batch[k] = batch[k] + [pad_value] * (self.max_length - len(batch[k]))
+        return batch
+
+    @staticmethod
+    def concatenated_inputs(
+        batch: dict[str, list | torch.LongTensor],
+        is_encoder_decoder: bool = False,
+        label_pad_token_id: int = -100,
+        padding_value: int = 0,
+        device: torch.device | None = None,
+    ) -> dict[str, torch.LongTensor]:
+        """Concatenate the chosen and rejected inputs into a single tensor.
+
+        Args:
+            batch:
+                A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors
+                of shape (batch_size, sequence_length).
+            is_encoder_decoder:
+                Whether the model is an encoder-decoder model.
+            label_pad_token_id:
+                The label pad token id.
+            padding_value:
+                The padding value to use for the concatenated inputs_ids.
+            device:
+                The device for the concatenated inputs.
+
+        Returns:
+            A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'.
+        """
+        concatenated_batch = {}
+
+        if is_encoder_decoder:
+            max_length = max(batch["chosen_labels"].shape[1], batch["rejected_labels"].shape[1])
+        else:
+            max_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1])
+
+        for k in batch:
+            if k.startswith("chosen") and isinstance(batch[k], torch.Tensor):
+                if "labels" in k or is_encoder_decoder:
+                    pad_value = label_pad_token_id
+                elif k.endswith("_input_ids"):
+                    pad_value = padding_value
+                elif k.endswith("_attention_mask"):
+                    pad_value = 0
+                concatenated_key = k.replace("chosen", "concatenated")
+                concatenated_batch[concatenated_key] = pad_to_length(batch[k], max_length, pad_value=pad_value)
+        for k in batch:
+            if k.startswith("rejected") and isinstance(batch[k], torch.Tensor):
+                if "labels" in k or is_encoder_decoder:
+                    pad_value = label_pad_token_id
+                elif k.endswith("_input_ids"):
+                    pad_value = padding_value
+                elif k.endswith("_attention_mask"):
+                    pad_value = 0
+                concatenated_key = k.replace("rejected", "concatenated")
+                concatenated_batch[concatenated_key] = torch.cat(
+                    (
+                        concatenated_batch[concatenated_key],
+                        pad_to_length(batch[k], max_length, pad_value=pad_value),
+                    ),
+                    dim=0,
+                ).to(device=device)
+
+        if is_encoder_decoder:
+            concatenated_batch["concatenated_input_ids"] = batch["prompt_input_ids"].repeat(2, 1).to(device=device)
+            concatenated_batch["concatenated_attention_mask"] = (
+                batch["prompt_attention_mask"].repeat(2, 1).to(device=device)
+            )
+
+        return concatenated_batch
+
+    def odds_ratio_loss(
+        self,
+        policy_chosen_logps: torch.FloatTensor,
+        policy_rejected_logps: torch.FloatTensor,
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+        """Compute ORPO's odds ratio (OR) loss for a batch of policy and reference model log probabilities.
+
+        Args:
+            policy_chosen_logps:
+                Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
+            policy_rejected_logps:
+                Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
+
+        Returns:
+            A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). The losses tensor contains the ORPO
+            loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for
+            the chosen and rejected responses, respectively. The log odds ratio of the chosen responses over the
+            rejected responses ratio for logging purposes. The `log(sigmoid(log_odds_chosen))` for logging purposes.
+        """
+
+        # Derived from Eqs. (4) and (7) from https://huggingface.co/papers/2403.07691 by using log identities and exp(log(P(y|x)) = P(y|x)
+        log_odds = (policy_chosen_logps - policy_rejected_logps) - (
+            torch.log1p(-torch.exp(policy_chosen_logps)) - torch.log1p(-torch.exp(policy_rejected_logps))
+        )
+        ratio = F.logsigmoid(log_odds)
+        losses = self.beta * ratio
+
+        chosen_rewards = self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach()
+        rejected_rewards = self.beta * (policy_rejected_logps.to(self.accelerator.device)).detach()
+
+        return losses, chosen_rewards, rejected_rewards, torch.mean(ratio), torch.mean(log_odds)
+
+    @staticmethod
+    def get_batch_logps(
+        logits: torch.FloatTensor,
+        labels: torch.LongTensor,
+        average_log_prob: bool = False,
+        label_pad_token_id: int = -100,
+        is_encoder_decoder: bool = False,
+    ) -> torch.FloatTensor:
+        """Compute the log probabilities of the given labels under the given logits.
+
+        Args:
+            logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
+            labels:
+                Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are
+                ignored. Shape: (batch_size, sequence_length)
+            average_log_prob:
+                If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the
+                log probabilities of the (non-masked) tokens.
+            label_pad_token_id: The label pad token id.
+            is_encoder_decoder: Whether the model is an encoder-decoder model.
+
+        Returns:
+            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the
+            given logits.
+        """
+        if logits.shape[:-1] != labels.shape:
+            raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.")
+
+        if not is_encoder_decoder:
+            labels = labels[:, 1:].clone()
+            logits = logits[:, :-1, :]
+        loss_mask = labels != label_pad_token_id
+
+        # dummy token; we'll ignore the losses on these tokens later
+        labels = torch.where(labels == label_pad_token_id, 0, labels)
+
+        per_token_logps = selective_log_softmax(logits, labels)
+
+        if average_log_prob:
+            return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+        else:
+            return (per_token_logps * loss_mask).sum(-1)
+
+    def concatenated_forward(
+        self, model: nn.Module, batch: dict[str, list | torch.LongTensor]
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+        """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
+
+        We do this to avoid doing two forward passes, because it's faster for FSDP.
+        """
+        concatenated_batch = self.concatenated_inputs(
+            batch,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+            padding_value=self.padding_value,
+            device=self.accelerator.device,
+        )
+        len_chosen = batch["chosen_labels"].shape[0]
+
+        model_kwargs = (
+            {
+                "decoder_input_ids": self._shift_right(concatenated_batch["concatenated_labels"]),
+            }
+            if self.is_encoder_decoder
+            else {}
+        )
+
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        outputs = model(
+            concatenated_batch["concatenated_input_ids"],
+            attention_mask=concatenated_batch["concatenated_attention_mask"],
+            use_cache=False,
+            **model_kwargs,
+        )
+        all_logits = outputs.logits
+
+        def cross_entropy_loss(logits, labels):
+            if not self.is_encoder_decoder:
+                # Shift so that tokens < n predict n
+                logits = logits[..., :-1, :].contiguous()
+                labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            logits = logits.view(-1, logits.shape[-1])
+            labels = labels.view(-1)
+            # Enable model parallelism
+            labels = labels.to(logits.device)
+            loss = loss_fct(logits, labels)
+            return loss
+
+        if self.is_encoder_decoder:
+            labels = concatenated_batch["concatenated_labels"].clone()
+        else:
+            labels = concatenated_batch["concatenated_input_ids"].clone()
+            attention_mask = concatenated_batch["concatenated_attention_mask"]
+            labels = torch.where(attention_mask == 1, labels, self.label_pad_token_id)
+        # orpo chosen nll loss is computed over the full prompt and response
+        chosen_nll_loss = cross_entropy_loss(all_logits[:len_chosen], labels[:len_chosen])
+
+        all_logps = self.get_batch_logps(
+            all_logits,
+            concatenated_batch["concatenated_labels"],
+            average_log_prob=True,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+        )
+
+        chosen_logps = all_logps[:len_chosen]
+        rejected_logps = all_logps[len_chosen:]
+
+        if not self.is_encoder_decoder:
+            chosen_logits = all_logits[:len_chosen, :-1, :]
+            rejected_logits = all_logits[len_chosen:, :-1, :]
+        else:
+            chosen_logits = all_logits[:len_chosen]
+            rejected_logits = all_logits[len_chosen:]
+
+        if self.aux_loss_enabled:
+            return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_nll_loss, outputs.aux_loss)
+
+        return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_nll_loss)
+
+    def get_batch_loss_metrics(
+        self,
+        model,
+        batch: dict[str, list | torch.LongTensor],
+        train_eval: Literal["train", "eval"] = "train",
+    ):
+        """Compute the ORPO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}
+
+        forward_output = self.concatenated_forward(model, batch)
+        (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits,
+            policy_rejected_logits,
+            policy_nll_loss,
+        ) = forward_output[:5]
+        if self.aux_loss_enabled:
+            aux_loss = forward_output[5]
+
+        losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = self.odds_ratio_loss(
+            policy_chosen_logps, policy_rejected_logps
+        )
+        # full ORPO loss
+        loss = policy_nll_loss - losses.mean()
+
+        reward_accuracies = (chosen_rewards > rejected_rewards).float()
+
+        prefix = "eval_" if train_eval == "eval" else ""
+        metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(chosen_rewards).mean()
+        metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(rejected_rewards).mean()
+        metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics(reward_accuracies).mean()
+        metrics[f"{prefix}rewards/margins"] = self.accelerator.gather_for_metrics(
+            chosen_rewards - rejected_rewards
+        ).mean()
+        metrics[f"{prefix}logps/rejected"] = self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean()
+        metrics[f"{prefix}logps/chosen"] = self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean()
+        metrics[f"{prefix}logits/rejected"] = self.accelerator.gather_for_metrics(
+            policy_rejected_logits.detach().mean()
+        ).mean()
+        metrics[f"{prefix}logits/chosen"] = self.accelerator.gather_for_metrics(
+            policy_chosen_logits.detach().mean()
+        ).mean()
+        metrics[f"{prefix}nll_loss"] = self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean()
+        metrics[f"{prefix}log_odds_ratio"] = self.accelerator.gather_for_metrics(log_odds_ratio).detach().mean()
+        metrics[f"{prefix}log_odds_chosen"] = self.accelerator.gather_for_metrics(log_odds_chosen).detach().mean()
+        if is_torch_xla_available():
+            xm.mark_step()  # needed because .item() calls
+        for k, v in metrics.items():
+            metrics[k] = v.item()
+        if self.aux_loss_enabled:
+            loss += self.aux_loss_coef * aux_loss
+
+        return loss, metrics
+
+    def compute_loss(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        return_outputs=False,
+        num_items_in_batch=None,
+    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
+        compute_loss_context_manager = (
+            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
+        )
+
+        with compute_loss_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
+
+        # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class:
+        loss = loss.to(self.args.device)
+
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="train")
+
+        if return_outputs:
+            return (loss, metrics)
+        return loss
+
+    def generate_from_model(self, model, batch: dict[str, torch.LongTensor]) -> str:
+        """Generate samples from the model and reference model for the given batch of inputs."""
+
+        # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
+        # the torch amp context manager as some hidden states are silently casted to full precision.
+        generate_context_manager = (
+            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
+        )
+
+        with generate_context_manager:
+            policy_output = model.generate(
+                input_ids=batch["prompt_input_ids"],
+                attention_mask=batch["prompt_attention_mask"],
+                max_length=self.max_length,
+                do_sample=True,
+                pad_token_id=self.processing_class.pad_token_id,
+            )
+
+        policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id)
+        policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True)
+
+        return policy_output_decoded
+
+    def prediction_step(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        prediction_loss_only: bool,
+        ignore_keys: list[str] | None = None,
+    ):
+        if not self.use_dpo_data_collator:
+            logger.warning(
+                "prediction_step is only implemented for DPODataCollatorWithPadding, and you passed a datacollator that is different than "
+                "DPODataCollatorWithPadding - you might see unexpected behavior. Alternatively, you can implement your own prediction_step method if you are using a custom data collator"
+            )
+        if ignore_keys is None:
+            if hasattr(model, "config"):
+                ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
+            else:
+                ignore_keys = []
+
+        prediction_context_manager = (
+            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
+        )
+
+        with torch.no_grad(), prediction_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="eval")
+
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="eval")
+
+        if prediction_loss_only:
+            return (loss.detach(), None, None)
+
+        # logits for the chosen and rejected samples from model
+        logits_dict = {
+            "eval_logits/chosen": metrics["eval_logits/chosen"],
+            "eval_logits/rejected": metrics["eval_logits/rejected"],
+        }
+        logits = [v for k, v in logits_dict.items() if k not in ignore_keys]
+        logits = torch.tensor(logits, device=self.accelerator.device)
+        labels = torch.zeros(logits.shape[0], device=self.accelerator.device)
+
+        return (loss.detach(), logits, labels)
+
+    def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
+        for key, value in metrics.items():
+            self._stored_metrics[train_eval][key].append(value)
+
+    def evaluation_loop(
+        self,
+        dataloader: DataLoader,
+        description: str,
+        prediction_loss_only: bool | None = None,
+        ignore_keys: list[str] | None = None,
+        metric_key_prefix: str = "eval",
+    ) -> EvalLoopOutput:
+        """
+        Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
+        `Trainer.evaluate()` and `Trainer.predict()`.
+
+        Works both with or without labels.
+        """
+
+        # Sample and save to game log if requested (for one batch to save time)
+        if self.generate_during_eval:
+            # Generate random indices within the range of the total number of samples
+            num_samples = len(dataloader.dataset)
+            random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size)
+
+            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
+            random_batch_dataset = dataloader.dataset.select(random_indices)
+            random_batch = self.data_collator(random_batch_dataset)
+            random_batch = self._prepare_inputs(random_batch)
+
+            policy_output_decoded = self.generate_from_model(self.model, random_batch)
+
+            table = pd.DataFrame(
+                columns=["Prompt", "Policy"],
+                data=[
+                    [prompt, pol[len(prompt) :]]
+                    for prompt, pol in zip(random_batch["prompt"], policy_output_decoded, strict=True)
+                ],
+            )
+            if "wandb" in self.args.report_to:
+                wandb.log({"game_log": wandb.Table(data=table)})
+
+            if "comet_ml" in self.args.report_to:
+                log_table_to_comet_experiment(
+                    name="game_log.csv",
+                    table=table,
+                )
+
+        # Base evaluation
+        initial_output = super().evaluation_loop(
+            dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix
+        )
+
+        return initial_output
+
+    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
+        """
+        Log `logs` on the various objects watching training, including stored metrics.
+
+        Args:
+            logs (`dict[str, float]`):
+                The values to log.
+            start_time (`float`, *optional*):
+                Start time of the training.
+        """
+        # logs either has 'loss' or 'eval_loss'
+        train_eval = "train" if "loss" in logs else "eval"
+        # Add averaged stored metrics to logs
+        for key, metrics in self._stored_metrics[train_eval].items():
+            logs[key] = torch.tensor(metrics).mean().item()
+        del self._stored_metrics[train_eval]
+        return super().log(logs, start_time)
+
+    def _shift_right(self, input_ids):
+        if self.decoder_start_token_id is None:
+            raise ValueError(
+                "model.config.decoder_start_token_id has to be defined. It is usually set to the pad_token_id."
+            )
+
+        # shift inputs to the right
+        if is_torch_fx_proxy(input_ids):
+            # Item assignment is not supported natively for proxies.
+            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), self.decoder_start_token_id)
+            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
+        else:
+            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+            shifted_input_ids[..., 0] = self.decoder_start_token_id
+
+        if self.pad_token_id is None:
+            raise ValueError("model.config.pad_token_id has to be defined.")
+        # replace possible -100 values in labels by `pad_token_id`
+        shifted_input_ids.masked_fill_(shifted_input_ids == -100, self.pad_token_id)
+
+        return shifted_input_ids
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/trl/trainer/__init__.py b/trl/trainer/__init__.py
index 98846bf7159..1f2e3ccec00 100644
--- a/trl/trainer/__init__.py
+++ b/trl/trainer/__init__.py
@@ -54,8 +54,6 @@
     "nash_md_trainer": ["NashMDTrainer"],
     "online_dpo_config": ["OnlineDPOConfig"],
     "online_dpo_trainer": ["OnlineDPOTrainer"],
-    "orpo_config": ["ORPOConfig"],
-    "orpo_trainer": ["ORPOTrainer"],
     "ppo_config": ["PPOConfig"],
     "ppo_trainer": ["PPOTrainer"],
     "prm_config": ["PRMConfig"],
@@ -114,8 +112,6 @@
     from .nash_md_trainer import NashMDTrainer
     from .online_dpo_config import OnlineDPOConfig
     from .online_dpo_trainer import OnlineDPOTrainer
-    from .orpo_config import ORPOConfig
-    from .orpo_trainer import ORPOTrainer
     from .ppo_config import PPOConfig
     from .ppo_trainer import PPOTrainer
     from .prm_config import PRMConfig
diff --git a/trl/trainer/orpo_config.py b/trl/trainer/orpo_config.py
index 523beeab934..73d2345e88c 100644
--- a/trl/trainer/orpo_config.py
+++ b/trl/trainer/orpo_config.py
@@ -12,158 +12,38 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass, field
-from typing import Any
+import warnings
+from dataclasses import dataclass
 
-from transformers import TrainingArguments
+from ..experimental.orpo import ORPOConfig as ExperimentalORPOConfig
 
 
 @dataclass
-class ORPOConfig(TrainingArguments):
+class ORPOConfig(ExperimentalORPOConfig):
     r"""
     Configuration class for the [`ORPOTrainer`].
 
-    This class includes only the parameters that are specific to ORPO training. For a full list of training arguments,
-    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
-    differ from those in [`~transformers.TrainingArguments`].
+    <Deprecated version="0.25.0">
 
-    Using [`~transformers.HfArgumentParser`] we can turn this class into
-    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
-    command line.
+    This class has been moved to `trl.experimental.orpo.ORPOConfig` and will be removed in TRL 0.29.0.
+    Please update your imports:
 
-    Parameters:
-        max_length (`int` or `None`, *optional*, defaults to `1024`):
-            Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
-            to use the default data collator.
-        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
-            Maximum length of the prompt. This argument is required if you want to use the default data collator.
-        max_completion_length (`int`, *optional*):
-            Maximum length of the completion. This argument is required if you want to use the default data collator
-            and your model is an encoder-decoder.
-        beta (`float`, *optional*, defaults to `0.1`):
-            Parameter controlling the relative ratio loss weight in the ORPO loss. In the
-            [paper](https://huggingface.co/papers/2403.07691), it is denoted by λ. In the
-            [code](https://github.com/xfactlab/orpo), it is denoted by `alpha`.
-        disable_dropout (`bool`, *optional*, defaults to `True`):
-            Whether to disable dropout in the model.
-        label_pad_token_id (`int`, *optional*, defaults to `-100`):
-            Label pad token id. This argument is required if you want to use the default data collator.
-        padding_value (`int`, *optional*):
-            Padding value to use. If `None`, the padding value of the tokenizer is used.
-        truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
-            Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
-            This argument is required if you want to use the default data collator.
-        generate_during_eval (`bool`, *optional*, defaults to `False`):
-            If `True`, generates and logs completions from the model to W&B or Comet during evaluation.
-        is_encoder_decoder (`bool`, *optional*):
-            When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
-            you need to specify if the model returned by the callable is an encoder-decoder model.
-        model_init_kwargs (`dict[str, Any]`, *optional*):
-            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
-            string.
-        dataset_num_proc (`int`, *optional*):
-            Number of processes to use for processing the dataset.
-    """
-
-    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
+    ```python
+    from trl.experimental.orpo import ORPOConfig
+    ```
 
-    # Parameters whose default values are overridden from TrainingArguments
-    learning_rate: float = field(
-        default=1e-6,
-        metadata={"help": "The initial learning rate for AdamW."},
-    )
-    logging_steps: float = field(
-        default=10,
-        metadata={
-            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
-            "will be interpreted as ratio of total training steps."
-        },
-    )
-    gradient_checkpointing: bool = field(
-        default=True,
-        metadata={
-            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
-        },
-    )
-    bf16: bool | None = field(
-        default=None,
-        metadata={
-            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
-            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
-            "`fp16` is not set."
-        },
-    )
+    For more details, see: https://github.com/huggingface/trl/issues/4223
 
-    max_length: int | None = field(
-        default=1024,
-        metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
-    )
-    max_prompt_length: int | None = field(
-        default=512,
-        metadata={
-            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
-            "collator and your model is an encoder-decoder."
-        },
-    )
-    max_completion_length: int | None = field(
-        default=None,
-        metadata={
-            "help": "Maximum length of the completion. This argument is required if you want to use the default data "
-            "collator and your model is an encoder-decoder."
-        },
-    )
-    beta: float = field(
-        default=0.1,
-        metadata={
-            "help": "Parameter controlling the relative ratio loss weight in the ORPO loss. In the paper, it is "
-            "denoted by λ."
-        },
-    )
-    disable_dropout: bool = field(
-        default=True,
-        metadata={"help": "Whether to disable dropout in the model."},
-    )
-    label_pad_token_id: int = field(
-        default=-100,
-        metadata={
-            "help": "Label pad token id. This argument is required if you want to use the default data collator."
-        },
-    )
-    padding_value: int | None = field(
-        default=None,
-        metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."},
-    )
-    truncation_mode: str = field(
-        default="keep_end",
-        metadata={
-            "help": "Truncation mode to use when the prompt is too long.",
-            "choices": ["keep_end", "keep_start"],
-        },
-    )
-    generate_during_eval: bool = field(
-        default=False,
-        metadata={"help": "If `True`, generates and logs completions from the model to W&B during evaluation."},
-    )
-    is_encoder_decoder: bool | None = field(
-        default=None,
-        metadata={
-            "help": "When using the `model_init` argument (callable) to instantiate the model instead of the `model` "
-            "argument, you need to specify if the model returned by the callable is an encoder-decoder model."
-        },
-    )
-    model_init_kwargs: dict[str, Any] | None = field(
-        default=None,
-        metadata={
-            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
-            "from a string."
-        },
-    )
-    dataset_num_proc: int | None = field(
-        default=None,
-        metadata={"help": "Number of processes to use for processing the dataset."},
-    )
+    </Deprecated>
+    """
 
     def __post_init__(self):
-        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
-
+        warnings.warn(
+            "ORPOConfig has been moved to trl.experimental.orpo.ORPOConfig and will be removed from "
+            "trl.trainer in TRL 0.29.0. Please update your imports to: "
+            "`from trl.experimental.orpo import ORPOConfig`. "
+            "For more details, see: https://github.com/huggingface/trl/issues/4223",
+            FutureWarning,
+            stacklevel=2,
+        )
         super().__post_init__()
diff --git a/trl/trainer/orpo_trainer.py b/trl/trainer/orpo_trainer.py
index fb905243800..1c60e06c5f5 100644
--- a/trl/trainer/orpo_trainer.py
+++ b/trl/trainer/orpo_trainer.py
@@ -12,124 +12,45 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import inspect
-import os
-import random
-import textwrap
 import warnings
-from collections import defaultdict
 from collections.abc import Callable
-from contextlib import nullcontext
-from pathlib import Path
-from typing import Any, Literal
+from typing import Any
 
-import numpy as np
-import pandas as pd
-import torch
 import torch.nn as nn
-import torch.nn.functional as F
-from accelerate import PartialState, logging
 from datasets import Dataset
-from torch import autocast
-from torch.utils.data import DataLoader
 from transformers import (
-    AutoModelForCausalLM,
     BaseImageProcessor,
     DataCollator,
     FeatureExtractionMixin,
     PreTrainedModel,
     PreTrainedTokenizerBase,
     ProcessorMixin,
-    is_comet_available,
-    is_torch_xla_available,
-    is_wandb_available,
 )
 from transformers.trainer_callback import TrainerCallback
 from transformers.trainer_utils import EvalLoopOutput
-from transformers.utils import is_peft_available, is_torch_fx_proxy
 
-from ..data_utils import maybe_apply_chat_template, maybe_extract_prompt
-from .base_trainer import BaseTrainer
+from ..experimental.orpo import ORPOTrainer as ExperimentalORPOTrainer
 from .orpo_config import ORPOConfig
-from .utils import (
-    DPODataCollatorWithPadding,
-    add_bos_token_if_needed,
-    add_eos_token_if_needed,
-    disable_dropout_in_model,
-    log_table_to_comet_experiment,
-    pad_to_length,
-    peft_module_casting_to_bf16,
-    selective_log_softmax,
-)
-
-
-if is_peft_available():
-    from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training
 
 
-if is_wandb_available():
-    import wandb
+class ORPOTrainer(ExperimentalORPOTrainer):
+    """
+    Initialize ORPOTrainer.
 
-if is_torch_xla_available():
-    import torch_xla.core.xla_model as xm
+    <Deprecated version="0.25.0">
 
+    This class has been moved to `trl.experimental.orpo.ORPOTrainer` and will be removed in TRL 0.29.0.
+    Please update your imports:
 
-logger = logging.get_logger(__name__)
+    ```python
+    from trl.experimental.orpo import ORPOTrainer
+    ```
 
+    For more details, see: https://github.com/huggingface/trl/issues/4223
 
-class ORPOTrainer(BaseTrainer):
-    r"""
-    Initialize ORPOTrainer.
-
-    Args:
-        model ([`~transformers.PreTrainedModel`]):
-            The model to train, preferably an [`~transformers.AutoModelForSequenceClassification`].
-        args ([`ORPOConfig`]):
-            The ORPO config arguments to use for training.
-        data_collator ([`~transformers.DataCollator`]):
-            The data collator to use for training. If None is specified, the default data collator
-            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
-            sequences in the batch, given a dataset of paired sequences.
-        train_dataset ([`~datasets.Dataset`]):
-            The dataset to use for training.
-        eval_dataset ([`~datasets.Dataset`]):
-            The dataset to use for evaluation.
-        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
-            Processing class used to process the data. If provided, will be used to automatically process the inputs
-            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
-            reuse the fine-tuned model.
-        model_init (`Callable[[], transformers.PreTrainedModel]`):
-            The model initializer to use for training. If None is specified, the default model initializer will be
-            used.
-        callbacks (`list[transformers.TrainerCallback]`):
-            The callbacks to use for training.
-        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
-            The optimizer and scheduler to use for training.
-        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
-            The function to use to preprocess the logits before computing the metrics.
-        peft_config (`dict`, defaults to `None`):
-            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
-            a PEFT model.
-        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
-            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
-            metric values.
+    </Deprecated>
     """
 
-    _tag_names = ["trl", "orpo"]
-    _name = "ORPO"
-    _paper = {
-        "title": "ORPO: Monolithic Preference Optimization without Reference Model",
-        "id": "2403.07691",
-        # docstyle-ignore
-        "citation": textwrap.dedent("""\
-            @article{hong2024orpo,
-                title        = {{ORPO: Monolithic Preference Optimization without Reference Model}},
-                author       = {Jiwoo Hong and Noah Lee and James Thorne},
-                year         = 2024,
-                eprint       = {arXiv:2403.07691}
-            }"""),
-    }
-
     def __init__(
         self,
         model: PreTrainedModel | nn.Module | str | None = None,
@@ -149,207 +70,14 @@ def __init__(
         peft_config: dict | None = None,
         compute_metrics: Callable[[EvalLoopOutput], dict] | None = None,
     ):
-        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
-            warnings.warn(
-                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
-                "it and want it to remain, please share your comments here: "
-                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
-                "TRL_EXPERIMENTAL_SILENCE=1."
-            )
-        if args.model_init_kwargs is None:
-            model_init_kwargs = {}
-        elif not isinstance(model, str):
-            raise ValueError("You passed model_kwargs to the ORPOTrainer. But your model is already instantiated.")
-        else:
-            model_init_kwargs = args.model_init_kwargs
-            dtype = model_init_kwargs.get("dtype")
-            if dtype is not None:
-                # Convert to `torch.dtype` if an str is passed
-                if isinstance(dtype, str) and dtype != "auto":
-                    dtype = getattr(torch, dtype)
-                if dtype != "auto" and not isinstance(dtype, torch.dtype):
-                    raise ValueError(
-                        f"Invalid `dtype` passed to the ORPOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
-                    )
-                model_init_kwargs["dtype"] = dtype
-
-        if isinstance(model, str):
-            model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
-
-        # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16`
-        # has been called in order to properly call autocast if needed.
-        self._peft_has_been_casted_to_bf16 = False
-
-        if not is_peft_available() and peft_config is not None:
-            raise ValueError(
-                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
-            )
-        elif is_peft_available() and peft_config is not None:
-            # if model is a peft model and we have a peft_config, we merge and unload it first
-            if isinstance(model, PeftModel):
-                model = model.merge_and_unload()
-
-            if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False):
-                _support_gc_kwargs = hasattr(
-                    args, "gradient_checkpointing_kwargs"
-                ) and "gradient_checkpointing_kwargs" in list(
-                    inspect.signature(prepare_model_for_kbit_training).parameters
-                )
-
-                prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing}
-
-                if _support_gc_kwargs:
-                    prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs
-
-                model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
-            elif args.gradient_checkpointing:
-                # For backward compatibility with older versions of transformers
-                if hasattr(model, "enable_input_require_grads"):
-                    model.enable_input_require_grads()
-                else:
-
-                    def make_inputs_require_grad(module, input, output):
-                        output.requires_grad_(True)
-
-                    model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
-
-            # get peft model with the given config
-            model = get_peft_model(model, peft_config)
-            if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
-                peft_module_casting_to_bf16(model)
-                # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager
-                self._peft_has_been_casted_to_bf16 = True
-
-        # For models that use gradient_checkpointing, we need to attach a hook that enables input
-        # to explicitly have `requires_grad=True`, otherwise training will either silently
-        # fail or completely fail.
-        elif args.gradient_checkpointing:
-            # For backward compatibility with older versions of transformers
-            if hasattr(model, "enable_input_require_grads"):
-                model.enable_input_require_grads()
-            else:
-
-                def make_inputs_require_grad(module, input, output):
-                    output.requires_grad_(True)
-
-                model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
-
-        if args.generate_during_eval and not (is_wandb_available() or is_comet_available()):
-            raise ValueError(
-                "`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
-                " Please install `wandb` or `comet-ml` to resolve."
-            )
-
-        if model is not None:
-            self.is_encoder_decoder = model.config.is_encoder_decoder
-        elif args.is_encoder_decoder is None:
-            raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.")
-        else:
-            self.is_encoder_decoder = args.is_encoder_decoder
-
-        if self.is_encoder_decoder:
-            self.decoder_start_token_id = model.config.decoder_start_token_id
-            self.pad_token_id = model.config.pad_token_id
-
-        if processing_class is None:
-            raise ValueError("processing_class must be specified to tokenize a ORPO dataset.")
-        if args.max_length is None:
-            logger.warning(
-                "`max_length` is not set in the ORPOConfig's init"
-                " it will default to `512` by default, but you should do it yourself in the future.",
-            )
-            max_length = 512
-        else:
-            max_length = args.max_length
-        if args.max_prompt_length is None:
-            logger.warning(
-                "`max_prompt_length` is not set in the ORPOConfig's init"
-                " it will default to `128` by default, but you should do it yourself in the future.",
-            )
-            max_prompt_length = 128
-        else:
-            max_prompt_length = args.max_prompt_length
-
-        if args.max_completion_length is None and self.is_encoder_decoder:
-            logger.warning(
-                "When using an encoder decoder architecture, you should set `max_completion_length` in the ORPOConfig's init"
-                " it will default to `128` by default, but you should do it yourself in the future.",
-            )
-            self.max_completion_length = 128
-        else:
-            self.max_completion_length = args.max_completion_length
-
-        if data_collator is None:
-            data_collator = DPODataCollatorWithPadding(
-                pad_token_id=processing_class.pad_token_id,
-                label_pad_token_id=args.label_pad_token_id,
-                is_encoder_decoder=self.is_encoder_decoder,
-            )
-
-            if args.remove_unused_columns:
-                args.remove_unused_columns = False
-                # warn users
-                logger.warning(
-                    "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments"
-                    " we have set it for you, but you should do it yourself in the future.",
-                )
-
-            self.use_dpo_data_collator = True
-        else:
-            self.use_dpo_data_collator = False
-
-        # Disable dropout in the model and reference model
-        if args.disable_dropout:
-            disable_dropout_in_model(model)
-
-        self.max_length = max_length
-        self.generate_during_eval = args.generate_during_eval
-        self.label_pad_token_id = args.label_pad_token_id
-        self.padding_value = args.padding_value if args.padding_value is not None else processing_class.pad_token_id
-        self.max_prompt_length = max_prompt_length
-        self.truncation_mode = args.truncation_mode
-        self.processing_class = processing_class
-
-        self.beta = args.beta
-        self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
-        self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0)
-        if self.aux_loss_enabled and self.aux_loss_coef == 0.0:
-            logger.warning(
-                "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to "
-                "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value "
-                "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary "
-                "loss.",
-            )
-
-        self._stored_metrics = defaultdict(lambda: defaultdict(list))
-
-        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
-        # input tensor associated with the key "input_ids". However, in ORPO, the sampled data does not include the
-        # "input_ids" key. Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and
-        # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens
-        # of the input, floating-point operations will not be computed." To suppress this warning, we set the
-        # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate
-        # that the warning has already been issued.
-        model.warnings_issued["estimate_tokens"] = True
-
-        # Compute that only on the main process for faster data processing.
-        # see: https://github.com/huggingface/trl/pull/1255
-        with PartialState().main_process_first():
-            # Extract the prompt if needed, and apply the chat template if needed
-            train_dataset = train_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc)
-            train_dataset = train_dataset.map(
-                maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}, num_proc=args.dataset_num_proc
-            )
-            train_dataset = train_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc)
-            if eval_dataset is not None:
-                eval_dataset = eval_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc)
-                eval_dataset = eval_dataset.map(
-                    maybe_apply_chat_template,
-                    fn_kwargs={"tokenizer": processing_class},
-                    num_proc=args.dataset_num_proc,
-                )
-                eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc)
-
+        warnings.warn(
+            "ORPOTrainer has been moved to trl.experimental.orpo.ORPOTrainer and will be removed from "
+            "trl.trainer in TRL 0.29.0. Please update your imports to: "
+            "`from trl.experimental.orpo import ORPOTrainer`. "
+            "For more details, see: https://github.com/huggingface/trl/issues/4223",
+            FutureWarning,
+            stacklevel=2,
+        )
         super().__init__(
             model=model,
             args=args,
@@ -358,700 +86,9 @@ def make_inputs_require_grad(module, input, output):
             eval_dataset=eval_dataset,
             processing_class=processing_class,
             model_init=model_init,
-            compute_metrics=compute_metrics,
             callbacks=callbacks,
             optimizers=optimizers,
             preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+            peft_config=peft_config,
+            compute_metrics=compute_metrics,
         )
-
-        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
-        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
-        # self.model_accepts_loss_kwargs to False to enable scaling.
-        self.model_accepts_loss_kwargs = False
-
-        # Add tags for models that have been loaded with the correct transformers version
-        if hasattr(self.model, "add_model_tags"):
-            self.model.add_model_tags(self._tag_names)
-
-        if not hasattr(self, "accelerator"):
-            raise AttributeError(
-                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
-            )
-
-    def build_tokenized_answer(self, prompt, answer):
-        """
-        Llama tokenizer does satisfy `enc(a + b) = enc(a) + enc(b)`. It does ensure `enc(a + b) = enc(a) + enc(a +
-        b)[len(enc(a)):]`. Reference:
-            https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
-        """
-
-        full_tokenized = self.processing_class(prompt + answer, add_special_tokens=False)
-        prompt_input_ids = self.processing_class(prompt, add_special_tokens=False)["input_ids"]
-
-        answer_input_ids = full_tokenized["input_ids"][len(prompt_input_ids) :]
-        answer_attention_mask = full_tokenized["attention_mask"][len(prompt_input_ids) :]
-
-        # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]`
-        full_concat_input_ids = np.concatenate([prompt_input_ids, answer_input_ids])
-
-        # Prepare input tokens for token by token comparison
-        full_input_ids = np.array(full_tokenized["input_ids"])
-
-        if len(full_input_ids) != len(full_concat_input_ids):
-            raise ValueError("Prompt input ids and answer input ids should have the same length.")
-
-        # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens
-        # can be merged together when tokenizing prompt+answer. This could result
-        # on the last token from the prompt being different when tokenized on its own
-        # vs when done as prompt+answer.
-        response_token_ids_start_idx = len(prompt_input_ids)
-
-        # If tokenized prompt is different than both prompt+answer, then it means the
-        # last token has changed due to merging.
-        if prompt_input_ids != full_tokenized["input_ids"][:response_token_ids_start_idx]:
-            response_token_ids_start_idx -= 1
-
-        prompt_input_ids = full_tokenized["input_ids"][:response_token_ids_start_idx]
-        prompt_attention_mask = full_tokenized["attention_mask"][:response_token_ids_start_idx]
-
-        if len(prompt_input_ids) != len(prompt_attention_mask):
-            raise ValueError("Prompt input ids and attention mask should have the same length.")
-
-        answer_input_ids = full_tokenized["input_ids"][response_token_ids_start_idx:]
-        answer_attention_mask = full_tokenized["attention_mask"][response_token_ids_start_idx:]
-
-        return dict(
-            prompt_input_ids=prompt_input_ids,
-            prompt_attention_mask=prompt_attention_mask,
-            input_ids=answer_input_ids,
-            attention_mask=answer_attention_mask,
-        )
-
-    def tokenize_row(self, feature, model: PreTrainedModel | nn.Module | None = None) -> dict:
-        """Tokenize a single row from a ORPO specific dataset.
-
-        At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
-        chosen or prompt + rejected responses is/are too long. First we truncate the prompt; if we're still too long,
-        we truncate the chosen/rejected.
-
-        We also create the labels for the chosen/rejected responses, which are of length equal to the sum of the length
-        of the prompt and the chosen/rejected response, with label_pad_token_id for the prompt tokens.
-        """
-        batch = {}
-        prompt = feature["prompt"]
-        chosen = feature["chosen"]
-        rejected = feature["rejected"]
-
-        if not self.is_encoder_decoder:
-            # Check issues below for more details
-            #  1. https://github.com/huggingface/trl/issues/907
-            #  2. https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
-            #  3. https://github.com/LianjiaTech/BELLE/issues/337
-
-            if not isinstance(prompt, str):
-                raise ValueError(f"prompt should be an str but got {type(prompt)}")
-            prompt_tokens = self.processing_class(prompt, add_special_tokens=False)
-            prompt_tokens = {f"prompt_{k}": v for k, v in prompt_tokens.items()}
-
-            if not isinstance(chosen, str):
-                raise ValueError(f"chosen should be an str but got {type(chosen)}")
-            chosen_tokens = self.build_tokenized_answer(prompt, chosen)
-
-            if not isinstance(rejected, str):
-                raise ValueError(f"rejected should be an str but got {type(rejected)}")
-            rejected_tokens = self.build_tokenized_answer(prompt, rejected)
-
-            # Last prompt token might get merged by tokenizer and
-            # it should not be included for generation if that happens
-            prompt_len_input_ids = len(prompt_tokens["prompt_input_ids"])
-
-            chosen_prompt_len_input_ids = len(chosen_tokens["prompt_input_ids"])
-            rejected_prompt_len_input_ids = len(rejected_tokens["prompt_input_ids"])
-            prompt_len_input_ids = min(chosen_prompt_len_input_ids, rejected_prompt_len_input_ids)
-
-            for k, v in prompt_tokens.items():
-                prompt_tokens[k] = v[:prompt_len_input_ids]
-
-            # Make sure prompts only have one different token at most an
-            # and length only differs by 1 at most
-            num_diff_tokens = sum(
-                a != b
-                for a, b in zip(chosen_tokens["prompt_input_ids"], rejected_tokens["prompt_input_ids"], strict=True)
-            )
-            num_diff_len = abs(chosen_prompt_len_input_ids - rejected_prompt_len_input_ids)
-            if num_diff_tokens > 1 or num_diff_len > 1:
-                raise ValueError(
-                    "Chosen and rejected prompt_input_ids might only differ on the "
-                    "last token due to tokenizer merge ops."
-                )
-
-            # add BOS token to head of prompt. Avoid adding if it's already there
-            prompt_tokens, chosen_tokens, rejected_tokens = add_bos_token_if_needed(
-                self.processing_class.bos_token_id,
-                prompt_len_input_ids,
-                prompt_tokens,
-                chosen_prompt_len_input_ids,
-                chosen_tokens,
-                rejected_prompt_len_input_ids,
-                rejected_tokens,
-            )
-
-            # add EOS token to end of answer. Avoid adding if it's already there
-            chosen_tokens, rejected_tokens = add_eos_token_if_needed(
-                self.processing_class.eos_token_id, chosen_tokens, rejected_tokens
-            )
-
-            longer_response_length = max(len(chosen_tokens["input_ids"]), len(rejected_tokens["input_ids"]))
-
-            # if combined sequence is too long, truncate the prompt
-            for answer_tokens in [chosen_tokens, rejected_tokens, prompt_tokens]:
-                if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length:
-                    if self.truncation_mode == "keep_start":
-                        for k in ["prompt_input_ids", "prompt_attention_mask"]:
-                            answer_tokens[k] = answer_tokens[k][: self.max_prompt_length]
-                    elif self.truncation_mode == "keep_end":
-                        for k in ["prompt_input_ids", "prompt_attention_mask"]:
-                            answer_tokens[k] = answer_tokens[k][-self.max_prompt_length :]
-                    else:
-                        raise ValueError(f"Unknown truncation mode: {self.truncation_mode}")
-
-            # if that's still too long, truncate the response
-            for answer_tokens in [chosen_tokens, rejected_tokens]:
-                if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length:
-                    for k in ["input_ids", "attention_mask"]:
-                        answer_tokens[k] = answer_tokens[k][: self.max_length - self.max_prompt_length]
-
-            # Create labels
-            chosen_sequence_tokens = {
-                k: chosen_tokens[f"prompt_{k}"] + chosen_tokens[k] for k in ["input_ids", "attention_mask"]
-            }
-            rejected_sequence_tokens = {
-                k: rejected_tokens[f"prompt_{k}"] + rejected_tokens[k] for k in ["input_ids", "attention_mask"]
-            }
-            chosen_sequence_tokens["labels"] = chosen_sequence_tokens["input_ids"][:]
-            chosen_sequence_tokens["labels"][: len(chosen_tokens["prompt_input_ids"])] = [
-                self.label_pad_token_id
-            ] * len(chosen_tokens["prompt_input_ids"])
-            rejected_sequence_tokens["labels"] = rejected_sequence_tokens["input_ids"][:]
-            rejected_sequence_tokens["labels"][: len(rejected_tokens["prompt_input_ids"])] = [
-                self.label_pad_token_id
-            ] * len(rejected_tokens["prompt_input_ids"])
-
-            for k, toks in {
-                "chosen_": chosen_sequence_tokens,
-                "rejected_": rejected_sequence_tokens,
-                "": prompt_tokens,
-            }.items():
-                for type_key, tokens in toks.items():
-                    if type_key == "token_type_ids":
-                        continue
-                    batch[f"{k}{type_key}"] = tokens
-
-        else:
-            chosen_tokens = self.processing_class(
-                chosen, truncation=True, max_length=self.max_completion_length, add_special_tokens=True
-            )
-            rejected_tokens = self.processing_class(
-                rejected, truncation=True, max_length=self.max_completion_length, add_special_tokens=True
-            )
-            prompt_tokens = self.processing_class(
-                prompt, truncation=True, max_length=self.max_prompt_length, add_special_tokens=True
-            )
-
-            batch["chosen_labels"] = chosen_tokens["input_ids"]
-            batch["rejected_labels"] = rejected_tokens["input_ids"]
-            batch["prompt_input_ids"] = prompt_tokens["input_ids"]
-            batch["prompt_attention_mask"] = prompt_tokens["attention_mask"]
-
-            if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"):
-                batch["rejected_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
-                    labels=torch.tensor(batch["rejected_labels"])
-                )
-                batch["chosen_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
-                    labels=torch.tensor(batch["chosen_labels"])
-                )
-
-        if is_torch_xla_available():
-            # Pad the sequences to global max_length to avoid TorchXLA recompilation
-            for k in batch:
-                if "labels" in k or self.is_encoder_decoder:
-                    pad_value = self.label_pad_token_id
-                elif k.endswith("_input_ids"):
-                    pad_value = self.padding_value
-                elif k.endswith("_attention_mask"):
-                    pad_value = 0
-                batch[k] = batch[k] + [pad_value] * (self.max_length - len(batch[k]))
-        return batch
-
-    @staticmethod
-    def concatenated_inputs(
-        batch: dict[str, list | torch.LongTensor],
-        is_encoder_decoder: bool = False,
-        label_pad_token_id: int = -100,
-        padding_value: int = 0,
-        device: torch.device | None = None,
-    ) -> dict[str, torch.LongTensor]:
-        """Concatenate the chosen and rejected inputs into a single tensor.
-
-        Args:
-            batch:
-                A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors
-                of shape (batch_size, sequence_length).
-            is_encoder_decoder:
-                Whether the model is an encoder-decoder model.
-            label_pad_token_id:
-                The label pad token id.
-            padding_value:
-                The padding value to use for the concatenated inputs_ids.
-            device:
-                The device for the concatenated inputs.
-
-        Returns:
-            A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'.
-        """
-        concatenated_batch = {}
-
-        if is_encoder_decoder:
-            max_length = max(batch["chosen_labels"].shape[1], batch["rejected_labels"].shape[1])
-        else:
-            max_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1])
-
-        for k in batch:
-            if k.startswith("chosen") and isinstance(batch[k], torch.Tensor):
-                if "labels" in k or is_encoder_decoder:
-                    pad_value = label_pad_token_id
-                elif k.endswith("_input_ids"):
-                    pad_value = padding_value
-                elif k.endswith("_attention_mask"):
-                    pad_value = 0
-                concatenated_key = k.replace("chosen", "concatenated")
-                concatenated_batch[concatenated_key] = pad_to_length(batch[k], max_length, pad_value=pad_value)
-        for k in batch:
-            if k.startswith("rejected") and isinstance(batch[k], torch.Tensor):
-                if "labels" in k or is_encoder_decoder:
-                    pad_value = label_pad_token_id
-                elif k.endswith("_input_ids"):
-                    pad_value = padding_value
-                elif k.endswith("_attention_mask"):
-                    pad_value = 0
-                concatenated_key = k.replace("rejected", "concatenated")
-                concatenated_batch[concatenated_key] = torch.cat(
-                    (
-                        concatenated_batch[concatenated_key],
-                        pad_to_length(batch[k], max_length, pad_value=pad_value),
-                    ),
-                    dim=0,
-                ).to(device=device)
-
-        if is_encoder_decoder:
-            concatenated_batch["concatenated_input_ids"] = batch["prompt_input_ids"].repeat(2, 1).to(device=device)
-            concatenated_batch["concatenated_attention_mask"] = (
-                batch["prompt_attention_mask"].repeat(2, 1).to(device=device)
-            )
-
-        return concatenated_batch
-
-    def odds_ratio_loss(
-        self,
-        policy_chosen_logps: torch.FloatTensor,
-        policy_rejected_logps: torch.FloatTensor,
-    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
-        """Compute ORPO's odds ratio (OR) loss for a batch of policy and reference model log probabilities.
-
-        Args:
-            policy_chosen_logps:
-                Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
-            policy_rejected_logps:
-                Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
-
-        Returns:
-            A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). The losses tensor contains the ORPO
-            loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for
-            the chosen and rejected responses, respectively. The log odds ratio of the chosen responses over the
-            rejected responses ratio for logging purposes. The `log(sigmoid(log_odds_chosen))` for logging purposes.
-        """
-
-        # Derived from Eqs. (4) and (7) from https://huggingface.co/papers/2403.07691 by using log identities and exp(log(P(y|x)) = P(y|x)
-        log_odds = (policy_chosen_logps - policy_rejected_logps) - (
-            torch.log1p(-torch.exp(policy_chosen_logps)) - torch.log1p(-torch.exp(policy_rejected_logps))
-        )
-        ratio = F.logsigmoid(log_odds)
-        losses = self.beta * ratio
-
-        chosen_rewards = self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach()
-        rejected_rewards = self.beta * (policy_rejected_logps.to(self.accelerator.device)).detach()
-
-        return losses, chosen_rewards, rejected_rewards, torch.mean(ratio), torch.mean(log_odds)
-
-    @staticmethod
-    def get_batch_logps(
-        logits: torch.FloatTensor,
-        labels: torch.LongTensor,
-        average_log_prob: bool = False,
-        label_pad_token_id: int = -100,
-        is_encoder_decoder: bool = False,
-    ) -> torch.FloatTensor:
-        """Compute the log probabilities of the given labels under the given logits.
-
-        Args:
-            logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
-            labels:
-                Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are
-                ignored. Shape: (batch_size, sequence_length)
-            average_log_prob:
-                If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the
-                log probabilities of the (non-masked) tokens.
-            label_pad_token_id: The label pad token id.
-            is_encoder_decoder: Whether the model is an encoder-decoder model.
-
-        Returns:
-            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the
-            given logits.
-        """
-        if logits.shape[:-1] != labels.shape:
-            raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.")
-
-        if not is_encoder_decoder:
-            labels = labels[:, 1:].clone()
-            logits = logits[:, :-1, :]
-        loss_mask = labels != label_pad_token_id
-
-        # dummy token; we'll ignore the losses on these tokens later
-        labels = torch.where(labels == label_pad_token_id, 0, labels)
-
-        per_token_logps = selective_log_softmax(logits, labels)
-
-        if average_log_prob:
-            return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
-        else:
-            return (per_token_logps * loss_mask).sum(-1)
-
-    def concatenated_forward(
-        self, model: nn.Module, batch: dict[str, list | torch.LongTensor]
-    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
-        """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
-
-        We do this to avoid doing two forward passes, because it's faster for FSDP.
-        """
-        concatenated_batch = self.concatenated_inputs(
-            batch,
-            is_encoder_decoder=self.is_encoder_decoder,
-            label_pad_token_id=self.label_pad_token_id,
-            padding_value=self.padding_value,
-            device=self.accelerator.device,
-        )
-        len_chosen = batch["chosen_labels"].shape[0]
-
-        model_kwargs = (
-            {
-                "decoder_input_ids": self._shift_right(concatenated_batch["concatenated_labels"]),
-            }
-            if self.is_encoder_decoder
-            else {}
-        )
-
-        if self.aux_loss_enabled:
-            model_kwargs["output_router_logits"] = True
-
-        outputs = model(
-            concatenated_batch["concatenated_input_ids"],
-            attention_mask=concatenated_batch["concatenated_attention_mask"],
-            use_cache=False,
-            **model_kwargs,
-        )
-        all_logits = outputs.logits
-
-        def cross_entropy_loss(logits, labels):
-            if not self.is_encoder_decoder:
-                # Shift so that tokens < n predict n
-                logits = logits[..., :-1, :].contiguous()
-                labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = nn.CrossEntropyLoss()
-            logits = logits.view(-1, logits.shape[-1])
-            labels = labels.view(-1)
-            # Enable model parallelism
-            labels = labels.to(logits.device)
-            loss = loss_fct(logits, labels)
-            return loss
-
-        if self.is_encoder_decoder:
-            labels = concatenated_batch["concatenated_labels"].clone()
-        else:
-            labels = concatenated_batch["concatenated_input_ids"].clone()
-            attention_mask = concatenated_batch["concatenated_attention_mask"]
-            labels = torch.where(attention_mask == 1, labels, self.label_pad_token_id)
-        # orpo chosen nll loss is computed over the full prompt and response
-        chosen_nll_loss = cross_entropy_loss(all_logits[:len_chosen], labels[:len_chosen])
-
-        all_logps = self.get_batch_logps(
-            all_logits,
-            concatenated_batch["concatenated_labels"],
-            average_log_prob=True,
-            is_encoder_decoder=self.is_encoder_decoder,
-            label_pad_token_id=self.label_pad_token_id,
-        )
-
-        chosen_logps = all_logps[:len_chosen]
-        rejected_logps = all_logps[len_chosen:]
-
-        if not self.is_encoder_decoder:
-            chosen_logits = all_logits[:len_chosen, :-1, :]
-            rejected_logits = all_logits[len_chosen:, :-1, :]
-        else:
-            chosen_logits = all_logits[:len_chosen]
-            rejected_logits = all_logits[len_chosen:]
-
-        if self.aux_loss_enabled:
-            return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_nll_loss, outputs.aux_loss)
-
-        return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_nll_loss)
-
-    def get_batch_loss_metrics(
-        self,
-        model,
-        batch: dict[str, list | torch.LongTensor],
-        train_eval: Literal["train", "eval"] = "train",
-    ):
-        """Compute the ORPO loss and other metrics for the given batch of inputs for train or test."""
-        metrics = {}
-
-        forward_output = self.concatenated_forward(model, batch)
-        (
-            policy_chosen_logps,
-            policy_rejected_logps,
-            policy_chosen_logits,
-            policy_rejected_logits,
-            policy_nll_loss,
-        ) = forward_output[:5]
-        if self.aux_loss_enabled:
-            aux_loss = forward_output[5]
-
-        losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = self.odds_ratio_loss(
-            policy_chosen_logps, policy_rejected_logps
-        )
-        # full ORPO loss
-        loss = policy_nll_loss - losses.mean()
-
-        reward_accuracies = (chosen_rewards > rejected_rewards).float()
-
-        prefix = "eval_" if train_eval == "eval" else ""
-        metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(chosen_rewards).mean()
-        metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(rejected_rewards).mean()
-        metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics(reward_accuracies).mean()
-        metrics[f"{prefix}rewards/margins"] = self.accelerator.gather_for_metrics(
-            chosen_rewards - rejected_rewards
-        ).mean()
-        metrics[f"{prefix}logps/rejected"] = self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean()
-        metrics[f"{prefix}logps/chosen"] = self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean()
-        metrics[f"{prefix}logits/rejected"] = self.accelerator.gather_for_metrics(
-            policy_rejected_logits.detach().mean()
-        ).mean()
-        metrics[f"{prefix}logits/chosen"] = self.accelerator.gather_for_metrics(
-            policy_chosen_logits.detach().mean()
-        ).mean()
-        metrics[f"{prefix}nll_loss"] = self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean()
-        metrics[f"{prefix}log_odds_ratio"] = self.accelerator.gather_for_metrics(log_odds_ratio).detach().mean()
-        metrics[f"{prefix}log_odds_chosen"] = self.accelerator.gather_for_metrics(log_odds_chosen).detach().mean()
-        if is_torch_xla_available():
-            xm.mark_step()  # needed because .item() calls
-        for k, v in metrics.items():
-            metrics[k] = v.item()
-        if self.aux_loss_enabled:
-            loss += self.aux_loss_coef * aux_loss
-
-        return loss, metrics
-
-    def compute_loss(
-        self,
-        model: PreTrainedModel | nn.Module,
-        inputs: dict[str, torch.Tensor | Any],
-        return_outputs=False,
-        num_items_in_batch=None,
-    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
-        compute_loss_context_manager = (
-            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
-        )
-
-        with compute_loss_context_manager:
-            loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
-
-        # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class:
-        loss = loss.to(self.args.device)
-
-        # force log the metrics
-        self.store_metrics(metrics, train_eval="train")
-
-        if return_outputs:
-            return (loss, metrics)
-        return loss
-
-    def generate_from_model(self, model, batch: dict[str, torch.LongTensor]) -> str:
-        """Generate samples from the model and reference model for the given batch of inputs."""
-
-        # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
-        # the torch amp context manager as some hidden states are silently casted to full precision.
-        generate_context_manager = (
-            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
-        )
-
-        with generate_context_manager:
-            policy_output = model.generate(
-                input_ids=batch["prompt_input_ids"],
-                attention_mask=batch["prompt_attention_mask"],
-                max_length=self.max_length,
-                do_sample=True,
-                pad_token_id=self.processing_class.pad_token_id,
-            )
-
-        policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id)
-        policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True)
-
-        return policy_output_decoded
-
-    def prediction_step(
-        self,
-        model: PreTrainedModel | nn.Module,
-        inputs: dict[str, torch.Tensor | Any],
-        prediction_loss_only: bool,
-        ignore_keys: list[str] | None = None,
-    ):
-        if not self.use_dpo_data_collator:
-            logger.warning(
-                "prediction_step is only implemented for DPODataCollatorWithPadding, and you passed a datacollator that is different than "
-                "DPODataCollatorWithPadding - you might see unexpected behavior. Alternatively, you can implement your own prediction_step method if you are using a custom data collator"
-            )
-        if ignore_keys is None:
-            if hasattr(model, "config"):
-                ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
-            else:
-                ignore_keys = []
-
-        prediction_context_manager = (
-            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
-        )
-
-        with torch.no_grad(), prediction_context_manager:
-            loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="eval")
-
-        # force log the metrics
-        self.store_metrics(metrics, train_eval="eval")
-
-        if prediction_loss_only:
-            return (loss.detach(), None, None)
-
-        # logits for the chosen and rejected samples from model
-        logits_dict = {
-            "eval_logits/chosen": metrics["eval_logits/chosen"],
-            "eval_logits/rejected": metrics["eval_logits/rejected"],
-        }
-        logits = [v for k, v in logits_dict.items() if k not in ignore_keys]
-        logits = torch.tensor(logits, device=self.accelerator.device)
-        labels = torch.zeros(logits.shape[0], device=self.accelerator.device)
-
-        return (loss.detach(), logits, labels)
-
-    def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
-        for key, value in metrics.items():
-            self._stored_metrics[train_eval][key].append(value)
-
-    def evaluation_loop(
-        self,
-        dataloader: DataLoader,
-        description: str,
-        prediction_loss_only: bool | None = None,
-        ignore_keys: list[str] | None = None,
-        metric_key_prefix: str = "eval",
-    ) -> EvalLoopOutput:
-        """
-        Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
-        `Trainer.evaluate()` and `Trainer.predict()`.
-
-        Works both with or without labels.
-        """
-
-        # Sample and save to game log if requested (for one batch to save time)
-        if self.generate_during_eval:
-            # Generate random indices within the range of the total number of samples
-            num_samples = len(dataloader.dataset)
-            random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size)
-
-            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
-            random_batch_dataset = dataloader.dataset.select(random_indices)
-            random_batch = self.data_collator(random_batch_dataset)
-            random_batch = self._prepare_inputs(random_batch)
-
-            policy_output_decoded = self.generate_from_model(self.model, random_batch)
-
-            table = pd.DataFrame(
-                columns=["Prompt", "Policy"],
-                data=[
-                    [prompt, pol[len(prompt) :]]
-                    for prompt, pol in zip(random_batch["prompt"], policy_output_decoded, strict=True)
-                ],
-            )
-            if "wandb" in self.args.report_to:
-                wandb.log({"game_log": wandb.Table(data=table)})
-
-            if "comet_ml" in self.args.report_to:
-                log_table_to_comet_experiment(
-                    name="game_log.csv",
-                    table=table,
-                )
-
-        # Base evaluation
-        initial_output = super().evaluation_loop(
-            dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix
-        )
-
-        return initial_output
-
-    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
-        """
-        Log `logs` on the various objects watching training, including stored metrics.
-
-        Args:
-            logs (`dict[str, float]`):
-                The values to log.
-            start_time (`float`, *optional*):
-                Start time of the training.
-        """
-        # logs either has 'loss' or 'eval_loss'
-        train_eval = "train" if "loss" in logs else "eval"
-        # Add averaged stored metrics to logs
-        for key, metrics in self._stored_metrics[train_eval].items():
-            logs[key] = torch.tensor(metrics).mean().item()
-        del self._stored_metrics[train_eval]
-        return super().log(logs, start_time)
-
-    def _shift_right(self, input_ids):
-        if self.decoder_start_token_id is None:
-            raise ValueError(
-                "model.config.decoder_start_token_id has to be defined. It is usually set to the pad_token_id."
-            )
-
-        # shift inputs to the right
-        if is_torch_fx_proxy(input_ids):
-            # Item assignment is not supported natively for proxies.
-            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), self.decoder_start_token_id)
-            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
-        else:
-            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
-            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
-            shifted_input_ids[..., 0] = self.decoder_start_token_id
-
-        if self.pad_token_id is None:
-            raise ValueError("model.config.pad_token_id has to be defined.")
-        # replace possible -100 values in labels by `pad_token_id`
-        shifted_input_ids.masked_fill_(shifted_input_ids == -100, self.pad_token_id)
-
-        return shifted_input_ids
-
-    # Ensure the model card is saved along with the checkpoint
-    def _save_checkpoint(self, model, trial):
-        if self.args.hub_model_id is None:
-            model_name = Path(self.args.output_dir).name
-        else:
-            model_name = self.args.hub_model_id.split("/")[-1]
-        self.create_model_card(model_name=model_name)
-        super()._save_checkpoint(model, trial)

From 18040a40375720aec6f7d57aaa7eae7e8373482b Mon Sep 17 00:00:00 2001
From: Behrooz <ermiaazarkhalili@gmail.com>
Date: Wed, 5 Nov 2025 18:36:39 -0800
Subject: [PATCH 2/8] Address reviewer feedback on ORPO experimental migration

- Restore ORPO imports in trl/trainer/__init__.py for backward compatibility
- Fix deprecation stub naming from ExperimentalORPOTrainer to _ORPOTrainer
- Add torch import to deprecation stub for type hints
- Fix relative import paths in trl/experimental/orpo/orpo_trainer.py
- Update autodoc references to experimental.orpo.ORPOTrainer
- Update all documentation references to use experimental namespace
- Move ORPO test from test_trainers_args.py to experimental/test_trainers_args.py
---
 docs/source/community_tutorials.md            |  2 +-
 docs/source/dataset_formats.md                |  2 +-
 docs/source/example_overview.md               |  2 +-
 docs/source/index.md                          |  2 +-
 docs/source/orpo_trainer.md                   |  8 ++---
 tests/{ => experimental}/test_orpo_trainer.py |  0
 tests/experimental/test_trainers_args.py      | 28 ++++++++++++++++++
 tests/test_trainers_args.py                   | 29 -------------------
 trl/experimental/orpo/orpo_trainer.py         |  4 +--
 trl/trainer/__init__.py                       |  4 +++
 trl/trainer/orpo_trainer.py                   |  5 ++--
 11 files changed, 45 insertions(+), 41 deletions(-)
 rename tests/{ => experimental}/test_orpo_trainer.py (100%)

diff --git a/docs/source/community_tutorials.md b/docs/source/community_tutorials.md
index 333aa973b5a..a412ee7d917 100644
--- a/docs/source/community_tutorials.md
+++ b/docs/source/community_tutorials.md
@@ -15,7 +15,7 @@ Community tutorials are made by active members of the Hugging Face community who
 | Instruction tuning | [`SFTTrainer`] | Fine-tuning Google Gemma LLMs using ChatML format with QLoRA | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/fine-tune-google-gemma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/gemma-lora-example.ipynb) |
 | Structured Generation | [`SFTTrainer`] | Fine-tuning Llama-2-7B to generate Persian product catalogs in JSON using QLoRA and PEFT | [Mohammadreza Esmaeilian](https://huggingface.co/Mohammadreza) | [Link](https://huggingface.co/learn/cookbook/en/fine_tuning_llm_to_generate_persian_product_catalogs_in_json_format) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_llm_to_generate_persian_product_catalogs_in_json_format.ipynb) |
 | Preference Optimization | [`DPOTrainer`] | Align Mistral-7b using Direct Preference Optimization for human preference alignment | [Maxime Labonne](https://huggingface.co/mlabonne) | [Link](https://mlabonne.github.io/blog/posts/Fine_tune_Mistral_7b_with_DPO.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mlabonne/llm-course/blob/main/Fine_tune_a_Mistral_7b_model_with_DPO.ipynb) |
-| Preference Optimization | [`ORPOTrainer`] | Fine-tuning Llama 3 with ORPO combining instruction tuning and preference alignment | [Maxime Labonne](https://huggingface.co/mlabonne) | [Link](https://mlabonne.github.io/blog/posts/2024-04-19_Fine_tune_Llama_3_with_ORPO.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eHNWg9gnaXErdAa8_mcvjMupbSS6rDvi) |
+| Preference Optimization | [`experimental.orpo.ORPOTrainer`] | Fine-tuning Llama 3 with ORPO combining instruction tuning and preference alignment | [Maxime Labonne](https://huggingface.co/mlabonne) | [Link](https://mlabonne.github.io/blog/posts/2024-04-19_Fine_tune_Llama_3_with_ORPO.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eHNWg9gnaXErdAa8_mcvjMupbSS6rDvi) |
 | Instruction tuning | [`SFTTrainer`] | How to fine-tune open LLMs in 2025 with Hugging Face | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/fine-tune-llms-in-2025) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/fine-tune-llms-in-2025.ipynb) |
 
 ### Videos
diff --git a/docs/source/dataset_formats.md b/docs/source/dataset_formats.md
index 958dfb3af52..7d98b8abe7b 100644
--- a/docs/source/dataset_formats.md
+++ b/docs/source/dataset_formats.md
@@ -395,7 +395,7 @@ Choosing the right dataset type depends on the task you are working on and the s
 | [`KTOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
 | [`NashMDTrainer`] | [Prompt-only](#prompt-only) |
 | [`OnlineDPOTrainer`] | [Prompt-only](#prompt-only) |
-| [`ORPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
+| [`experimental.orpo.ORPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
 | [`PPOTrainer`] | Tokenized language modeling |
 | [`PRMTrainer`] | [Stepwise supervision](#stepwise-supervision) |
 | [`RewardTrainer`] | [Preference (implicit prompt recommended)](#preference) |
diff --git a/docs/source/example_overview.md b/docs/source/example_overview.md
index 0f12f5ba1a0..d9cba0b9114 100644
--- a/docs/source/example_overview.md
+++ b/docs/source/example_overview.md
@@ -54,7 +54,7 @@ Scripts are maintained in the [`trl/scripts`](https://github.com/huggingface/trl
 | [`examples/scripts/nash_md.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/nash_md.py) | This script shows how to use the [`NashMDTrainer`] to fine-tune a model. |
 | [`examples/scripts/online_dpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/online_dpo.py) | This script shows how to use the [`OnlineDPOTrainer`] to fine-tune a model. |
 | [`examples/scripts/online_dpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/online_dpo_vlm.py) | This script shows how to use the [`OnlineDPOTrainer`] to fine-tune a a Vision Language Model. |
-| [`examples/scripts/orpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/orpo.py) | This script shows how to use the [`ORPOTrainer`] to fine-tune a model to increase helpfulness and harmlessness using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset. |
+| [`examples/scripts/orpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/orpo.py) | This script shows how to use the [`experimental.orpo.ORPOTrainer`] to fine-tune a model to increase helpfulness and harmlessness using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset. |
 | [`examples/scripts/ppo/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo.py) | This script shows how to use the [`PPOTrainer`] to fine-tune a model to improve its ability to continue text with positive sentiment or physically descriptive language. |
 | [`examples/scripts/ppo/ppo_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo_tldr.py) | This script shows how to use the [`PPOTrainer`] to fine-tune a model to improve its ability to generate TL;DR summaries. |
 | [`examples/scripts/prm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/prm.py) | This script shows how to use the [`PRMTrainer`] to fine-tune a Process-supervised Reward Model (PRM). |
diff --git a/docs/source/index.md b/docs/source/index.md
index 9d6584cc2b8..e0268d51868 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -41,8 +41,8 @@ Below is the current list of TRL trainers, organized by method type (⚡️ = vL
 
 - [`SFTTrainer`]
 - [`DPOTrainer`]
-- [`ORPOTrainer`]
 - [`experimental.bco.BCOTrainer`] 🧪
+- [`experimental.orpo.ORPOTrainer`] 🧪
 - [`CPOTrainer`]
 - [`KTOTrainer`]
 
diff --git a/docs/source/orpo_trainer.md b/docs/source/orpo_trainer.md
index d3428313bb8..555f0858316 100644
--- a/docs/source/orpo_trainer.md
+++ b/docs/source/orpo_trainer.md
@@ -79,9 +79,9 @@ Here are some other factors to consider when choosing a programming language for
 
 ## Expected dataset type
 
-ORPO requires a [preference dataset](dataset_formats#preference). The [`ORPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
+ORPO requires a [preference dataset](dataset_formats#preference). The [`experimental.orpo.ORPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
 
-Although the [`ORPOTrainer`] supports both explicit and implicit prompts, we recommend using explicit prompts. If provided with an implicit prompt dataset, the trainer will automatically extract the prompt from the `"chosen"` and `"rejected"` columns. For more information, refer to the [preference style](dataset_formats#preference) section.
+Although the [`experimental.orpo.ORPOTrainer`] supports both explicit and implicit prompts, we recommend using explicit prompts. If provided with an implicit prompt dataset, the trainer will automatically extract the prompt from the `"chosen"` and `"rejected"` columns. For more information, refer to the [preference style](dataset_formats#preference) section.
 
 ## Example script
 
@@ -121,11 +121,11 @@ While training and evaluating, we record the following reward metrics:
 
 ## ORPOTrainer
 
-[[autodoc]] ORPOTrainer
+[[autodoc]] experimental.orpo.ORPOTrainer
     - train
     - save_model
     - push_to_hub
 
 ## ORPOConfig
 
-[[autodoc]] ORPOConfig
+[[autodoc]] experimental.orpo.ORPOConfig
diff --git a/tests/test_orpo_trainer.py b/tests/experimental/test_orpo_trainer.py
similarity index 100%
rename from tests/test_orpo_trainer.py
rename to tests/experimental/test_orpo_trainer.py
diff --git a/tests/experimental/test_trainers_args.py b/tests/experimental/test_trainers_args.py
index bd86bb61b5d..6b3e1bbb0f1 100644
--- a/tests/experimental/test_trainers_args.py
+++ b/tests/experimental/test_trainers_args.py
@@ -16,6 +16,7 @@
 from transformers import AutoTokenizer
 
 from trl.experimental.bco import BCOConfig, BCOTrainer
+from trl.experimental.orpo import ORPOConfig, ORPOTrainer
 
 from ..testing_utils import TrlTestCase, require_sklearn
 
@@ -68,3 +69,30 @@ def test_bco(self):
         assert trainer.args.prompt_sample_size == 512
         assert trainer.args.min_density_ratio == 0.2
         assert trainer.args.max_density_ratio == 20.0
+
+    def test_orpo(self):
+        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
+        training_args = ORPOConfig(
+            self.tmp_dir,
+            max_length=256,
+            max_prompt_length=64,
+            max_completion_length=64,
+            beta=0.5,
+            disable_dropout=False,
+            label_pad_token_id=-99,
+            padding_value=-99,
+            truncation_mode="keep_start",
+            # generate_during_eval=True, # ignore this one, it requires wandb
+            is_encoder_decoder=True,
+            model_init_kwargs={"trust_remote_code": True},
+            dataset_num_proc=4,
+        )
+        trainer = ORPOTrainer(model=model_id, args=training_args, train_dataset=dataset, processing_class=tokenizer)
+        assert trainer.args.max_length == 256
+        assert trainer.args.max_prompt_length == 64
+        assert trainer.args.max_completion_length == 64
+        assert trainer.args.beta == 0.5
+        assert not trainer.args.disable_dropout
+        assert trainer.args.label_pad_token_id == -99
diff --git a/tests/test_trainers_args.py b/tests/test_trainers_args.py
index 014ec6ac5da..1a6c8171c3f 100644
--- a/tests/test_trainers_args.py
+++ b/tests/test_trainers_args.py
@@ -28,8 +28,6 @@
     NashMDTrainer,
     OnlineDPOConfig,
     OnlineDPOTrainer,
-    ORPOConfig,
-    ORPOTrainer,
     RewardConfig,
     RewardTrainer,
     SFTConfig,
@@ -248,33 +246,6 @@ def test_online_dpo(self, beta_list):
         assert trainer.args.beta == (0.6 if not beta_list else [0.6, 0.7])
         assert trainer.args.loss_type == "hinge"
 
-    def test_orpo(self):
-        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
-        training_args = ORPOConfig(
-            self.tmp_dir,
-            max_length=256,
-            max_prompt_length=64,
-            max_completion_length=64,
-            beta=0.5,
-            disable_dropout=False,
-            label_pad_token_id=-99,
-            padding_value=-99,
-            truncation_mode="keep_start",
-            # generate_during_eval=True, # ignore this one, it requires wandb
-            is_encoder_decoder=True,
-            model_init_kwargs={"trust_remote_code": True},
-            dataset_num_proc=4,
-        )
-        trainer = ORPOTrainer(model=model_id, args=training_args, train_dataset=dataset, processing_class=tokenizer)
-        assert trainer.args.max_length == 256
-        assert trainer.args.max_prompt_length == 64
-        assert trainer.args.max_completion_length == 64
-        assert trainer.args.beta == 0.5
-        assert not trainer.args.disable_dropout
-        assert trainer.args.label_pad_token_id == -99
-
     def test_reward(self):
         model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
         tokenizer = AutoTokenizer.from_pretrained(model_id)
diff --git a/trl/experimental/orpo/orpo_trainer.py b/trl/experimental/orpo/orpo_trainer.py
index 8ac3b0b6ed2..b490cdf6951 100644
--- a/trl/experimental/orpo/orpo_trainer.py
+++ b/trl/experimental/orpo/orpo_trainer.py
@@ -49,9 +49,9 @@
 from transformers.utils import is_peft_available, is_torch_fx_proxy
 
 from ...data_utils import maybe_apply_chat_template, maybe_extract_prompt
-from ..base_trainer import BaseTrainer
+from ...trainer.base_trainer import BaseTrainer
 from .orpo_config import ORPOConfig
-from ..utils import (
+from ...trainer.utils import (
     DPODataCollatorWithPadding,
     add_bos_token_if_needed,
     add_eos_token_if_needed,
diff --git a/trl/trainer/__init__.py b/trl/trainer/__init__.py
index 1f2e3ccec00..98846bf7159 100644
--- a/trl/trainer/__init__.py
+++ b/trl/trainer/__init__.py
@@ -54,6 +54,8 @@
     "nash_md_trainer": ["NashMDTrainer"],
     "online_dpo_config": ["OnlineDPOConfig"],
     "online_dpo_trainer": ["OnlineDPOTrainer"],
+    "orpo_config": ["ORPOConfig"],
+    "orpo_trainer": ["ORPOTrainer"],
     "ppo_config": ["PPOConfig"],
     "ppo_trainer": ["PPOTrainer"],
     "prm_config": ["PRMConfig"],
@@ -112,6 +114,8 @@
     from .nash_md_trainer import NashMDTrainer
     from .online_dpo_config import OnlineDPOConfig
     from .online_dpo_trainer import OnlineDPOTrainer
+    from .orpo_config import ORPOConfig
+    from .orpo_trainer import ORPOTrainer
     from .ppo_config import PPOConfig
     from .ppo_trainer import PPOTrainer
     from .prm_config import PRMConfig
diff --git a/trl/trainer/orpo_trainer.py b/trl/trainer/orpo_trainer.py
index 1c60e06c5f5..a89535ca615 100644
--- a/trl/trainer/orpo_trainer.py
+++ b/trl/trainer/orpo_trainer.py
@@ -16,6 +16,7 @@
 from collections.abc import Callable
 from typing import Any
 
+import torch
 import torch.nn as nn
 from datasets import Dataset
 from transformers import (
@@ -29,11 +30,11 @@
 from transformers.trainer_callback import TrainerCallback
 from transformers.trainer_utils import EvalLoopOutput
 
-from ..experimental.orpo import ORPOTrainer as ExperimentalORPOTrainer
+from ..experimental.orpo import ORPOTrainer as _ORPOTrainer
 from .orpo_config import ORPOConfig
 
 
-class ORPOTrainer(ExperimentalORPOTrainer):
+class ORPOTrainer(_ORPOTrainer):
     """
     Initialize ORPOTrainer.
 

From 9d7c53c447645cb9f5edb6ebc066de624e3137c6 Mon Sep 17 00:00:00 2001
From: Behrooz <ermiaazarkhalili@gmail.com>
Date: Wed, 5 Nov 2025 18:41:15 -0800
Subject: [PATCH 3/8] Fix ruff linting errors - remove unused imports

- Remove unused 'import os' and 'import warnings' from trl/experimental/orpo/orpo_trainer.py
- Remove unused 'from typing import Any' from trl/trainer/orpo_trainer.py
---
 trl/experimental/orpo/orpo_trainer.py | 2 --
 trl/trainer/orpo_trainer.py           | 1 -
 2 files changed, 3 deletions(-)

diff --git a/trl/experimental/orpo/orpo_trainer.py b/trl/experimental/orpo/orpo_trainer.py
index b490cdf6951..fd68515a900 100644
--- a/trl/experimental/orpo/orpo_trainer.py
+++ b/trl/experimental/orpo/orpo_trainer.py
@@ -13,10 +13,8 @@
 # limitations under the License.
 
 import inspect
-import os
 import random
 import textwrap
-import warnings
 from collections import defaultdict
 from collections.abc import Callable
 from contextlib import nullcontext
diff --git a/trl/trainer/orpo_trainer.py b/trl/trainer/orpo_trainer.py
index a89535ca615..7f5feb34489 100644
--- a/trl/trainer/orpo_trainer.py
+++ b/trl/trainer/orpo_trainer.py
@@ -14,7 +14,6 @@
 
 import warnings
 from collections.abc import Callable
-from typing import Any
 
 import torch
 import torch.nn as nn

From 92e218b933e0b1aa70b62c3ff3a74851242be2fb Mon Sep 17 00:00:00 2001
From: Behrooz <ermiaazarkhalili@gmail.com>
Date: Wed, 5 Nov 2025 19:58:55 -0800
Subject: [PATCH 4/8] Fix import path for testing_utils in ORPO test file

---
 tests/experimental/test_orpo_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/experimental/test_orpo_trainer.py b/tests/experimental/test_orpo_trainer.py
index 554938293ca..95a82234909 100644
--- a/tests/experimental/test_orpo_trainer.py
+++ b/tests/experimental/test_orpo_trainer.py
@@ -19,7 +19,7 @@
 
 from trl.experimental.orpo import ORPOConfig, ORPOTrainer
 
-from .testing_utils import TrlTestCase, require_peft
+from ..testing_utils import TrlTestCase, require_peft
 
 
 class TestORPOTrainer(TrlTestCase):

From c2db59638331ef5085867fb2406a5872060ad3fc Mon Sep 17 00:00:00 2001
From: Behrooz <ermiaazarkhalili@gmail.com>
Date: Wed, 5 Nov 2025 20:02:23 -0800
Subject: [PATCH 5/8] Fix import ordering in ORPO trainer

---
 trl/experimental/orpo/orpo_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trl/experimental/orpo/orpo_trainer.py b/trl/experimental/orpo/orpo_trainer.py
index fd68515a900..29af144bb57 100644
--- a/trl/experimental/orpo/orpo_trainer.py
+++ b/trl/experimental/orpo/orpo_trainer.py
@@ -48,7 +48,6 @@
 
 from ...data_utils import maybe_apply_chat_template, maybe_extract_prompt
 from ...trainer.base_trainer import BaseTrainer
-from .orpo_config import ORPOConfig
 from ...trainer.utils import (
     DPODataCollatorWithPadding,
     add_bos_token_if_needed,
@@ -59,6 +58,7 @@
     peft_module_casting_to_bf16,
     selective_log_softmax,
 )
+from .orpo_config import ORPOConfig
 
 
 if is_peft_available():

From b6815e3f80a937d5001530e1214da74cbe40932e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <gallouedec.quentin@gmail.com>
Date: Fri, 21 Nov 2025 05:43:23 +0000
Subject: [PATCH 6/8] Squashed commit of the following:
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 52ed4df2c09bfa9f04242923ad72c067ea13be94
Author: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Date:   Thu Nov 20 21:41:23 2025 +0000

    Fix style OpenEnv example

commit a2639462fac330b0ac06c36dfa06bd840f305b61
Author: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Date:   Thu Nov 20 14:44:15 2025 +0100

    Update OpenEnv guide with latest details (#4552)

    Co-authored-by: burtenshaw <ben.burtenshaw@gmail.com>

commit 1a9ff522308331ac32e3bd7076a09f3c1a922c1e
Author: Kashif Rasul <kashif.rasul@gmail.com>
Date:   Wed Nov 19 15:34:25 2025 +0100

    [OpenEnv] browsergym example script (#4539)

    Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>

commit 6cbcd9413440ec4663a90c8b1cafd71b394f0711
Author: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Date:   Wed Nov 19 14:39:44 2025 +0100

    Update OpenEnv example scripts (#4547)

commit 85105890c185be95bf5d9fcdc030b18cecf2f302
Author: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Date:   Wed Nov 19 14:39:20 2025 +0100

    Add OpenEnv Script examples to docs (#4533)

commit e622196097109080b73584d598d4162e64fc6bea
Author: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Date:   Mon Nov 17 03:12:30 2025 -0700

    [Doc] Drop dummy reward and dataset for DeepMath-103K and accuracy reward (#4524)

commit 1b1242cc6522feb4eb063feb20097a79b11b127a
Author: Kashif Rasul <kashif.rasul@gmail.com>
Date:   Fri Nov 14 20:51:41 2025 +0100

    [OpenEnv] add vllm colocate mode to openenv scripts (#4510)

    Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
    Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>

commit f39d18a05d002df953f6cd6609415048548c5f85
Author: Fabio Milentiansen Sim <sim.fabio.fms@gmail.com>
Date:   Fri Nov 14 23:39:02 2025 +0700

    fix(GOLDTrainer): Resolve incorrect attribute access and VLLMClient.generate() output type (#4526)

commit d45eaab3af6ab8c80a7c5b65df607a5152ed0f77
Author: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Date:   Fri Nov 14 12:12:09 2025 +0100

    Add vLLM quantization option for colocate (#4496)

    Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>

commit a91d4b379a7e0af48bd879a7268f4337f3e22f36
Author: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Date:   Fri Nov 14 02:19:08 2025 +0100

    Prevent upcasting norm layers in `prepare_model_for_kbit_training` (#4457)

    Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>

commit 121318e281c33deb5c6df8c399af0c5cdf15506c
Author: Behrooz Azarkhalili <80390531+behroozazarkhalili@users.noreply.github.com>
Date:   Thu Nov 13 17:13:16 2025 -0800

    docs: Extend CLI basic usage examples to all supported CLIs (#4425)

    Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
    Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
    Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>

commit 79183203fab0faede45356d0242f45f40b55289e
Author: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Date:   Thu Nov 13 13:20:52 2025 -0700

    Remove test trainer args (#4517)

commit 102dc4184c86a3c6d890b790a30dcd026e599071
Author: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Date:   Thu Nov 13 12:36:43 2025 -0700

    Rename `flash-attn` to `flash-attn2` (#4514)

    Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>

commit 5de62b07e380762b4fd0ed24e9449385c4d820ee
Author: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Date:   Thu Nov 13 12:05:48 2025 -0700

    Add step time metric to GRPO Trainer for performance tracking (#4516)

    Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>

commit f1e6377e4f301cafb0d8dc29b9afe8da930facfe
Author: Behrooz Azarkhalili <80390531+behroozazarkhalili@users.noreply.github.com>
Date:   Thu Nov 13 11:01:19 2025 -0800

    Move PPOTrainer to trl.experimental.ppo (#4482)

    Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
    Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>

commit 01f497e2e11350f81d2777eaa53f2282e675201e
Author: Behrooz Azarkhalili <80390531+behroozazarkhalili@users.noreply.github.com>
Date:   Thu Nov 13 10:14:58 2025 -0800

    Move NashMDTrainer to experimental module (#4477)

    Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
    Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>

commit b6c838aa24c3b58930bd4faba9704c0223ee0590
Author: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Date:   Thu Nov 13 16:53:26 2025 +0000

    `aws-general-8-plus` runner for Docker build

commit ed5c7bb5b07845b18f40627d69fe133884f72f39
Author: YangKai0616 <kai.yang@intel.com>
Date:   Fri Nov 14 00:42:48 2025 +0800

    [Bug Fix] OnlineDPOTrainer with vLLM Server Mode (#4500)

commit ded9bc6164f3bdbe1df35adb77eb8be4594f94b3
Author: lewtun <lewis.c.tunstall@gmail.com>
Date:   Thu Nov 13 17:33:59 2025 +0100

    Fix Docker images for Liger (#4522)

commit fd04760f594e9262cbf9abaccfef2bad05569775
Author: Pramodith Ballapuram <16939722+pramodith@users.noreply.github.com>
Date:   Thu Nov 13 11:31:10 2025 +0000

    Paper Index: Change `num_completions` to `num_generations` (#4515)

commit b7918c0f3bdba2e327bad7abf01aa1becfab3565
Author: Behrooz Azarkhalili <80390531+behroozazarkhalili@users.noreply.github.com>
Date:   Wed Nov 12 20:35:44 2025 -0800

    Move GKDTrainer to experimental module (#4474)

    Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
    Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>

commit 07b5011b7864d4a27b71e30963a0a4ca610da3fd
Author: Tamoghno Kandar <55907205+tamoghnokandar@users.noreply.github.com>
Date:   Wed Nov 12 20:07:33 2025 -0800

    Replace flash attention2 with kernels-community/flash-attn2 (#4426)

    Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
    Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>

commit 7a57fd41e41b33c7fd03a24abb45dd1cdebcbc49
Author: Yuxian Gu <guyx21@mails.tsinghua.edu.cn>
Date:   Thu Nov 13 11:16:20 2025 +0800

    MiniLLM: Fix arguments in config & add to documentation index (#4518)

commit a145eaf81ac664d62c63d7d088f4d5c7d261f5b2
Author: Behrooz Azarkhalili <80390531+behroozazarkhalili@users.noreply.github.com>
Date:   Wed Nov 12 16:35:46 2025 -0800

    refactor: Move CPOTrainer to experimental module (#4470)

commit d2dc717e03062d129e60865eca3e85ed8fa73538
Author: Taha Yassine <40228615+taha-yassine@users.noreply.github.com>
Date:   Thu Nov 13 00:56:47 2025 +0100

    Replace `wandb_log_unique_prompts` with `log_unique_prompts` (#4508)

    Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
    Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>

commit 799b39b86408bc8d356cd0fbd398741becdfd059
Author: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Date:   Wed Nov 12 16:21:05 2025 -0700

    `device_map` and `dtype` to `"auto"` by default (#4509)

    Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>

commit a6a2beb937377df7078537a3454483a01000868b
Author: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Date:   Wed Nov 12 09:42:31 2025 -0700

    Add temporary workaround for `lr_scheduler_kwargs` dtype issue in Transformers 4.57.0 (#4513)

commit 346701ae6e5cf4b1797734732fc8040bbbec9e55
Author: lewtun <lewis.c.tunstall@gmail.com>
Date:   Wed Nov 12 17:42:18 2025 +0100

    Replace accelerate logging with stdlib in CLI (#4512)

commit 4db63af98b6437b18208a34455b6b10692086800
Author: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Date:   Wed Nov 12 02:19:51 2025 +0000

    Fix GRPO unsqueeze advantages

commit ecb2811535daf0aabcd3cb88d89909cf08fb89ad
Author: Yuxian Gu <guyx21@mails.tsinghua.edu.cn>
Date:   Wed Nov 12 10:17:22 2025 +0800

    Add MiniLLM Trainer (#4504)

    Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>

commit 89e46883a1c6b2feb36eff99b937095b39de77da
Author: Taha Yassine <40228615+taha-yassine@users.noreply.github.com>
Date:   Tue Nov 11 20:36:23 2025 +0100

    Add support for images inside tables with Trackio completions logging (#4505)

commit 2d3279c2c2eac73c1f84a3efd3bf414913146b90
Author: lewtun <lewis.c.tunstall@gmail.com>
Date:   Tue Nov 11 19:22:25 2025 +0100

    Tweak description for vLLM sleep mode (#4506)

    Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>

commit 02a34777c36173ea53c8ce9979db50f13c1aaac8
Author: Luke Hinds <lukehinds@gmail.com>
Date:   Mon Nov 10 16:41:51 2025 +0000

    Fix link to OpenEnv docs (#4502)

    Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>

commit aaed6c1600ff4f3e0ccc6b3b8183c98d26390491
Author: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Date:   Sat Nov 8 08:20:48 2025 -0700

    Consistency regarding relative imports (#4498)

commit 20760ba3ac7092432f67a5e3e9b2c624f168fdb0
Author: burtenshaw <ben.burtenshaw@gmail.com>
Date:   Fri Nov 7 10:50:50 2025 +0100

    [DOCS] update and fix openenv (#4490)

    Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
    Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>

commit 64cfca42297311e1b1c41a32d0867b16619d577c
Author: Behrooz Azarkhalili <80390531+behroozazarkhalili@users.noreply.github.com>
Date:   Thu Nov 6 22:47:04 2025 -0800

    Move judges to experimental submodule (#4439)

    Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
    Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>

commit 97ca1a2569367d4d1d30bedf8d29377b3d4d1ec4
Author: Pramodith Ballapuram <16939722+pramodith@users.noreply.github.com>
Date:   Fri Nov 7 00:20:15 2025 +0000

    Fix bugs in CISPO conditions (#4499)

commit ffb3dd5d2e9e4d3b866527b861313be63981e024
Author: Behrooz Azarkhalili <80390531+behroozazarkhalili@users.noreply.github.com>
Date:   Thu Nov 6 16:03:00 2025 -0800

    docs: Add PEFT subsection to reducing memory usage guide (#4430)

    Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>

commit 43b6541aa46669a5ea79327c989730b282f18076
Author: SolarWindRider <31797478+SolarWindRider@users.noreply.github.com>
Date:   Fri Nov 7 06:55:34 2025 +0800

    Support completion bootstrap for VLM in GRPO/RLOO (#4452)

    Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
    Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
    Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>

commit 642b721ee52c265835e2826c20316165b3f7a24f
Author: Pramodith Ballapuram <16939722+pramodith@users.noreply.github.com>
Date:   Thu Nov 6 22:33:00 2025 +0000

    ScaleRL: Add CISPO Loss (#4495)

commit 32e9c9fa6a4def84b128042092422534920903f1
Author: Ishita Bhattacharyya <139248026+ishitab02@users.noreply.github.com>
Date:   Fri Nov 7 03:37:43 2025 +0530

    ⛴️ Add kernels to Docker images (#4445)

    Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
    Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>

commit 1bcfc500eb548344de0567f2c9d277379b3db940
Author: Behrooz Azarkhalili <80390531+behroozazarkhalili@users.noreply.github.com>
Date:   Thu Nov 6 13:40:12 2025 -0800

    Move XPOTrainer to trl.experimental.xpo (#4485)

    Co-authored-by: Invidia19 <54266187+Invidia19@users.noreply.github.com>
    Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>

commit 37942bc19fb7d2bc8c43a839b32214ede53a0976
Author: Pramodith Ballapuram <16939722+pramodith@users.noreply.github.com>
Date:   Thu Nov 6 21:32:03 2025 +0000

    Buffer samples based on group level stds. (#4492)

commit 66cd02a6f50ae6a9dcb252bd86fe434a14299651
Author: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date:   Thu Nov 6 20:58:25 2025 +0100

    Add tiny model Qwen3VLForConditionalGeneration to CI (#4494)

commit 32febb491b386881fe75137e9990fb0f1b5cae2c
Author: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Date:   Thu Nov 6 18:21:56 2025 +0100

    Add LFM2 to SFT notebook examples (#4455)
---
 .github/workflows/docker-build.yml            |    6 +-
 README.md                                     |   13 +-
 docker/trl-dev/Dockerfile                     |    5 +-
 docker/trl/Dockerfile                         |    6 +-
 docs/source/_toctree.yml                      |   30 +-
 docs/source/clis.md                           |  335 ++++-
 docs/source/cpo_trainer.md                    |   22 +-
 docs/source/dataset_formats.md                |   14 +-
 docs/source/dpo_trainer.md                    |    2 +-
 docs/source/example_overview.md               |   20 +-
 docs/source/gkd_trainer.md                    |   17 +-
 docs/source/gold_trainer.md                   |    4 +-
 docs/source/grpo_trainer.md                   |   38 +-
 docs/source/index.md                          |   13 +-
 docs/source/judges.md                         |   26 +-
 docs/source/kernels_hub.md                    |    8 +-
 docs/source/liger_kernel_integration.md       |    2 +-
 docs/source/minillm.md                        |   67 +
 docs/source/nash_md_trainer.md                |   19 +-
 docs/source/online_dpo_trainer.md             |    7 +-
 docs/source/openenv.md                        |  355 ++++--
 docs/source/paper_index.md                    |   60 +-
 docs/source/peft_integration.md               |    3 +-
 docs/source/ppo_trainer.md                    |   10 +-
 docs/source/quickstart.md                     |    9 +-
 docs/source/reducing_memory_usage.md          |   37 +-
 docs/source/rloo_trainer.md                   |   20 +-
 docs/source/vllm_integration.md               |  104 +-
 docs/source/xpo_trainer.md                    |   22 +-
 examples/datasets/deepmath_103k.py            |   98 ++
 examples/notebooks/sft_trl_lora_qlora.ipynb   |    5 +-
 examples/scripts/cpo.py                       |    3 +-
 examples/scripts/evals/judge_tldr.py          |    2 +-
 examples/scripts/gkd.py                       |    3 +-
 examples/scripts/nash_md.py                   |    7 +-
 examples/scripts/online_dpo.py                |    4 +-
 examples/scripts/openenv/browsergym.py        |  572 +++++++++
 examples/scripts/openenv/catch.py             |  176 +--
 examples/scripts/openenv/echo.py              |  120 +-
 examples/scripts/openenv/wordle.py            |  309 ++---
 examples/scripts/ppo/ppo.py                   |   11 +-
 examples/scripts/ppo/ppo_tldr.py              |   11 +-
 examples/scripts/xpo.py                       |    7 +-
 pyproject.toml                                |    5 +
 scripts/generate_tiny_models.py               |   13 +
 tests/{ => experimental}/test_cpo_trainer.py  |    4 +-
 tests/{ => experimental}/test_gkd_trainer.py  |    4 +-
 .../test_grpo_with_replay_buffer_trainer.py   |    4 +-
 tests/{ => experimental}/test_judges.py       |    4 +-
 tests/experimental/test_minillm_trainer.py    |   57 +
 .../test_nash_md_trainer.py                   |    4 +-
 tests/{ => experimental}/test_ppo_trainer.py  |    6 +-
 tests/experimental/test_trainers_args.py      |   98 --
 tests/{ => experimental}/test_xpo_trainer.py  |    5 +-
 tests/test_callbacks.py                       |    2 +-
 tests/test_grpo_trainer.py                    |   10 +-
 tests/test_online_dpo_trainer.py              |   18 +-
 tests/test_sft_trainer.py                     |   25 +-
 tests/test_trainers_args.py                   |  315 -----
 tests/testing_utils.py                        |   23 +-
 trl/cli.py                                    |   16 +-
 trl/experimental/bco/bco_config.py            |   10 +
 trl/experimental/bco/bco_trainer.py           |    6 +-
 trl/experimental/cpo/__init__.py              |   19 +
 trl/experimental/cpo/cpo_config.py            |  228 ++++
 trl/experimental/cpo/cpo_trainer.py           | 1089 +++++++++++++++++
 trl/experimental/gfpo/gfpo_trainer.py         |    3 +
 trl/experimental/gkd/__init__.py              |   19 +
 trl/experimental/gkd/gkd_config.py            |  112 ++
 trl/experimental/gkd/gkd_trainer.py           |  440 +++++++
 trl/experimental/gold/gold_config.py          |    7 +-
 trl/experimental/gold/gold_trainer.py         |   19 +-
 .../grpo_with_replay_buffer_config.py         |    2 +-
 .../grpo_with_replay_buffer_trainer.py        |   24 +-
 trl/experimental/gspo_token/grpo_trainer.py   |    3 +-
 trl/experimental/judges/__init__.py           |   36 +
 trl/experimental/judges/judges.py             |  457 +++++++
 trl/experimental/minillm/__init__.py          |   19 +
 trl/experimental/minillm/minillm_config.py    |  145 +++
 trl/experimental/minillm/minillm_trainer.py   |  396 ++++++
 trl/experimental/nash_md/__init__.py          |   19 +
 trl/experimental/nash_md/nash_md_config.py    |   46 +
 trl/experimental/nash_md/nash_md_trainer.py   |  489 ++++++++
 trl/experimental/openenv/__init__.py          |   18 +
 trl/experimental/openenv/utils.py             |  137 +++
 trl/experimental/ppo/__init__.py              |   19 +
 trl/experimental/ppo/ppo_config.py            |  135 ++
 trl/experimental/ppo/ppo_trainer.py           |  836 +++++++++++++
 trl/experimental/xpo/__init__.py              |   19 +
 trl/experimental/xpo/xpo_config.py            |   44 +
 trl/experimental/xpo/xpo_trainer.py           |  538 ++++++++
 trl/extras/vllm_client.py                     |    3 +
 trl/mergekit_utils.py                         |    2 +-
 trl/models/utils.py                           |   10 +-
 trl/rewards/accuracy_rewards.py               |   20 +-
 trl/scripts/grpo.py                           |   18 +
 trl/trainer/callbacks.py                      |    5 +-
 trl/trainer/cpo_config.py                     |  207 +---
 trl/trainer/cpo_trainer.py                    | 1088 +---------------
 trl/trainer/dpo_config.py                     |   10 +
 trl/trainer/gkd_config.py                     |  101 +-
 trl/trainer/gkd_trainer.py                    |  440 +------
 trl/trainer/grpo_config.py                    |   70 +-
 trl/trainer/grpo_trainer.py                   |  332 ++---
 trl/trainer/judges.py                         |  532 ++------
 trl/trainer/kto_config.py                     |   10 +
 trl/trainer/kto_trainer.py                    |    6 +-
 trl/trainer/model_config.py                   |    8 +-
 trl/trainer/nash_md_config.py                 |   35 +-
 trl/trainer/nash_md_trainer.py                |  488 +-------
 trl/trainer/online_dpo_config.py              |   10 +
 trl/trainer/online_dpo_trainer.py             |   28 +-
 trl/trainer/orpo_config.py                    |   30 +-
 trl/trainer/orpo_trainer.py                   |   79 +-
 trl/trainer/ppo_config.py                     |  128 +-
 trl/trainer/ppo_trainer.py                    |  835 +------------
 trl/trainer/prm_config.py                     |   10 +
 trl/trainer/reward_config.py                  |   10 +
 trl/trainer/reward_trainer.py                 |    3 +-
 trl/trainer/rloo_config.py                    |   57 +-
 trl/trainer/rloo_trainer.py                   |   84 +-
 trl/trainer/sft_config.py                     |   10 +
 trl/trainer/sft_trainer.py                    |    4 +-
 trl/trainer/utils.py                          |   15 +-
 trl/trainer/xpo_config.py                     |   33 +-
 trl/trainer/xpo_trainer.py                    |  529 +-------
 126 files changed, 8028 insertions(+), 5771 deletions(-)
 create mode 100644 docs/source/minillm.md
 create mode 100644 examples/datasets/deepmath_103k.py
 create mode 100644 examples/scripts/openenv/browsergym.py
 rename tests/{ => experimental}/test_cpo_trainer.py (98%)
 rename tests/{ => experimental}/test_gkd_trainer.py (99%)
 rename tests/{ => experimental}/test_judges.py (95%)
 create mode 100644 tests/experimental/test_minillm_trainer.py
 rename tests/{ => experimental}/test_nash_md_trainer.py (98%)
 rename tests/{ => experimental}/test_ppo_trainer.py (97%)
 delete mode 100644 tests/experimental/test_trainers_args.py
 rename tests/{ => experimental}/test_xpo_trainer.py (97%)
 delete mode 100644 tests/test_trainers_args.py
 create mode 100644 trl/experimental/cpo/__init__.py
 create mode 100644 trl/experimental/cpo/cpo_config.py
 create mode 100644 trl/experimental/cpo/cpo_trainer.py
 create mode 100644 trl/experimental/gkd/__init__.py
 create mode 100644 trl/experimental/gkd/gkd_config.py
 create mode 100644 trl/experimental/gkd/gkd_trainer.py
 create mode 100644 trl/experimental/judges/__init__.py
 create mode 100644 trl/experimental/judges/judges.py
 create mode 100644 trl/experimental/minillm/__init__.py
 create mode 100644 trl/experimental/minillm/minillm_config.py
 create mode 100644 trl/experimental/minillm/minillm_trainer.py
 create mode 100644 trl/experimental/nash_md/__init__.py
 create mode 100644 trl/experimental/nash_md/nash_md_config.py
 create mode 100644 trl/experimental/nash_md/nash_md_trainer.py
 create mode 100644 trl/experimental/openenv/__init__.py
 create mode 100644 trl/experimental/openenv/utils.py
 create mode 100644 trl/experimental/ppo/__init__.py
 create mode 100644 trl/experimental/ppo/ppo_config.py
 create mode 100644 trl/experimental/ppo/ppo_trainer.py
 create mode 100644 trl/experimental/xpo/__init__.py
 create mode 100644 trl/experimental/xpo/xpo_config.py
 create mode 100644 trl/experimental/xpo/xpo_trainer.py

diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
index 5a5fae6bf4c..2fc2192cafb 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -13,7 +13,8 @@ concurrency:
 jobs:
   trl:
     name: "Build and push TRL Docker image"
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -52,7 +53,8 @@ jobs:
 
   trl-dev:
     name: "Build and push TRL Dev Docker image"
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
diff --git a/README.md b/README.md
index 437b4b54cb4..c0a49208262 100644
--- a/README.md
+++ b/README.md
@@ -21,11 +21,11 @@
 
 **OpenEnv Integration:** TRL now supports **[OpenEnv](https://huggingface.co/blog/openenv)**, the open-source framework from Meta for defining, deploying, and interacting with environments in reinforcement learning and agentic workflows.
 
-Explore how to seamlessly integrate TRL with OpenEnv in our [dedicated documentation](openenv).
+Explore how to seamlessly integrate TRL with OpenEnv in our [dedicated documentation](https://huggingface.co/docs/trl/openenv).
 
 ## Overview
 
-TRL is a cutting-edge library designed for post-training foundation models using advanced techniques like Supervised Fine-Tuning (SFT), Proximal Policy Optimization (PPO), and Direct Preference Optimization (DPO). Built on top of the [🤗 Transformers](https://github.com/huggingface/transformers) ecosystem, TRL supports a variety of model architectures and modalities, and can be scaled-up across various hardware setups.
+TRL is a cutting-edge library designed for post-training foundation models using advanced techniques like Supervised Fine-Tuning (SFT), Group Realtive Policy Optimization (GRPO), and Direct Preference Optimization (DPO). Built on top of the [🤗 Transformers](https://github.com/huggingface/transformers) ecosystem, TRL supports a variety of model architectures and modalities, and can be scaled-up across various hardware setups.
 
 ## Highlights
 
@@ -92,16 +92,13 @@ trainer.train()
 ```python
 from datasets import load_dataset
 from trl import GRPOTrainer
+from trl.rewards import accuracy_reward
 
-dataset = load_dataset("trl-lib/tldr", split="train")
-
-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")
 
 trainer = GRPOTrainer(
     model="Qwen/Qwen2-0.5B-Instruct",
-    reward_funcs=reward_num_unique_chars,
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
 trainer.train()
diff --git a/docker/trl-dev/Dockerfile b/docker/trl-dev/Dockerfile
index c8557048d7c..9a756a8821d 100644
--- a/docker/trl-dev/Dockerfile
+++ b/docker/trl-dev/Dockerfile
@@ -1,6 +1,5 @@
-FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-runtime
+FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
 RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 RUN pip install --upgrade pip uv
 RUN uv pip install --system --no-cache "git+https://github.com/huggingface/trl.git#egg=trl[liger,peft,vlm]"
-RUN uv pip install --system hf_transfer liger_kernel trackio peft
-RUN uv pip install --system https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
\ No newline at end of file
+RUN uv pip install --system kernels liger_kernel peft trackio
\ No newline at end of file
diff --git a/docker/trl/Dockerfile b/docker/trl/Dockerfile
index 61a2b0dd278..8b6e2842a38 100644
--- a/docker/trl/Dockerfile
+++ b/docker/trl/Dockerfile
@@ -1,4 +1,4 @@
-FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-runtime
+FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 RUN pip install --upgrade pip uv
-RUN uv pip install --system trl[liger,peft,vlm] hf_transfer trackio
-RUN uv pip install --system https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
\ No newline at end of file
+RUN uv pip install --system trl[liger,peft,vlm] kernels trackio
\ No newline at end of file
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index a7555a1e7ed..6fd438151ab 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -56,22 +56,14 @@
   title: Examples
 - sections:
   - sections: # Sorted alphabetically
-    - local: cpo_trainer
-      title: CPO
     - local: dpo_trainer
       title: DPO
     - local: online_dpo_trainer
       title: Online DPO
-    - local: gkd_trainer
-      title: GKD
     - local: grpo_trainer
       title: GRPO
     - local: kto_trainer
       title: KTO
-    - local: nash_md_trainer
-      title: Nash-MD
-    - local: ppo_trainer
-      title: PPO
     - local: prm_trainer
       title: PRM
     - local: reward_trainer
@@ -80,15 +72,11 @@
       title: RLOO
     - local: sft_trainer
       title: SFT
-    - local: xpo_trainer
-      title: XPO
     title: Trainers
   - local: models
     title: Model Classes
   - local: model_utils
     title: Model Utilities
-  - local: judges
-    title: Judges
   - local: callbacks
     title: Callbacks
   - local: data_utils
@@ -107,14 +95,32 @@
     title: BEMA for Reference Model
   - local: bco_trainer
     title: BCO
+  - local: cpo_trainer
+    title: CPO
   - local: gfpo
     title: GFPO
+  - local: gkd_trainer
+    title: GKD
   - local: gold_trainer
     title: GOLD
   - local: grpo_with_replay_buffer
     title: GRPO With Replay Buffer
   - local: gspo_token
     title: GSPO-token
+  - local: judges
+    title: Judges
+  - local: minillm
+    title: MiniLLM
+  - local: nash_md_trainer
+    title: Nash-MD
+  - local: orpo_trainer
+    title: ORPO
+  - local: papo_trainer
+    title: PAPO
+  - local: ppo_trainer
+    title: PPO
+  - local: xpo_trainer
+    title: XPO
   - local: openenv
     title: OpenEnv Integration
   - local: orpo_trainer
diff --git a/docs/source/clis.md b/docs/source/clis.md
index 666584decf4..54c8c1055f9 100644
--- a/docs/source/clis.md
+++ b/docs/source/clis.md
@@ -26,7 +26,7 @@ Currently supported commands are:
 
 You can launch training directly from the CLI by specifying required arguments like the model and dataset:
 
-<hfoptions id="command_line">
+<hfoptions id="trainer">
 <hfoption id="SFT">
 
 ```bash
@@ -53,6 +53,35 @@ trl reward \
   --dataset_name trl-lib/ultrafeedback_binarized
 ```
 
+</hfoption>
+<hfoption id="GRPO">
+
+```bash
+trl grpo \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name HuggingFaceH4/Polaris-Dataset-53K \
+  --reward_funcs accuracy_reward
+```
+
+</hfoption>
+<hfoption id="RLOO">
+
+```bash
+trl rloo \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name HuggingFaceH4/Polaris-Dataset-53K \
+  --reward_funcs accuracy_reward
+```
+
+</hfoption>
+<hfoption id="KTO">
+
+```bash
+trl kto \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name trl-lib/kto-mix-14k
+```
+
 </hfoption>
 </hfoptions>
 
@@ -60,7 +89,7 @@ trl reward \
 
 To keep your CLI commands clean and reproducible, you can define all training arguments in a YAML configuration file:
 
-<hfoptions id="config_file">
+<hfoptions id="trainer">
 <hfoption id="SFT">
 
 ```yaml
@@ -105,6 +134,55 @@ Launch with:
 trl reward --config reward_config.yaml
 ```
 
+</hfoption>
+<hfoption id="GRPO">
+
+```yaml
+# grpo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: HuggingFaceH4/Polaris-Dataset-53K
+reward_funcs:
+  - accuracy_reward
+```
+
+Launch with:
+
+```bash
+trl grpo --config grpo_config.yaml
+```
+
+</hfoption>
+<hfoption id="RLOO">
+
+```yaml
+# rloo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: HuggingFaceH4/Polaris-Dataset-53K
+reward_funcs:
+  - accuracy_reward
+```
+
+Launch with:
+
+```bash
+trl rloo --config rloo_config.yaml
+```
+
+</hfoption>
+<hfoption id="KTO">
+
+```yaml
+# kto_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: trl-lib/kto-mix-14k
+```
+
+Launch with:
+
+```bash
+trl kto --config kto_config.yaml
+```
+
 </hfoption>
 </hfoptions>
 
@@ -114,8 +192,8 @@ TRL CLI natively supports [🤗 Accelerate](https://huggingface.co/docs/accelera
 
 You can pass any `accelerate launch` arguments directly to `trl`, such as `--num_processes`. For more information see [Using accelerate launch](https://huggingface.co/docs/accelerate/en/basic_tutorials/launch#using-accelerate-launch).
 
-<hfoptions id="launch_args">
-<hfoption id="SFT inline">
+<hfoptions id="trainer">
+<hfoption id="SFT">
 
 ```bash
 trl sft \
@@ -124,8 +202,7 @@ trl sft \
   --num_processes 4
 ```
 
-</hfoption>
-<hfoption id="SFT w/ config file">
+or, with a config file:
 
 ```yaml
 # sft_config.yaml
@@ -141,7 +218,7 @@ trl sft --config sft_config.yaml
 ```
 
 </hfoption>
-<hfoption id="DPO inline">
+<hfoption id="DPO">
 
 ```bash
 trl dpo \
@@ -150,8 +227,7 @@ trl dpo \
   --num_processes 4
 ```
 
-</hfoption>
-<hfoption id="DPO w/ config file">
+or, with a config file:
 
 ```yaml
 # dpo_config.yaml
@@ -167,7 +243,7 @@ trl dpo --config dpo_config.yaml
 ```
 
 </hfoption>
-<hfoption id="Reward inline">
+<hfoption id="Reward">
 
 ```bash
 trl reward \
@@ -176,8 +252,7 @@ trl reward \
   --num_processes 4
 ```
 
-</hfoption>
-<hfoption id="Reward w/ config file">
+or, with a config file:
 
 ```yaml
 # reward_config.yaml
@@ -192,6 +267,87 @@ Launch with:
 trl reward --config reward_config.yaml
 ```
 
+</hfoption>
+<hfoption id="GRPO">
+
+```bash
+trl grpo \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name HuggingFaceH4/Polaris-Dataset-53K \
+  --reward_funcs accuracy_reward \
+  --num_processes 4
+```
+
+or, with a config file:
+
+```yaml
+# grpo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: HuggingFaceH4/Polaris-Dataset-53K
+reward_funcs:
+  - accuracy_reward
+num_processes: 4
+```
+
+Launch with:
+
+```bash
+trl grpo --config grpo_config.yaml
+```
+
+</hfoption>
+<hfoption id="RLOO">
+
+```bash
+trl rloo \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name HuggingFaceH4/Polaris-Dataset-53K \
+  --reward_funcs accuracy_reward \
+  --num_processes 4
+```
+
+or, with a config file:
+
+```yaml
+# rloo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: HuggingFaceH4/Polaris-Dataset-53K
+reward_funcs:
+  - accuracy_reward
+num_processes: 4
+```
+
+Launch with:
+
+```bash
+trl rloo --config rloo_config.yaml
+```
+
+</hfoption>
+<hfoption id="KTO">
+
+```bash
+trl kto \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name trl-lib/kto-mix-14k \
+  --num_processes 4
+```
+
+or, with a config file:
+
+```yaml
+# kto_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: trl-lib/kto-mix-14k
+num_processes: 4
+```
+
+Launch with:
+
+```bash
+trl kto --config kto_config.yaml
+```
+
 </hfoption>
 </hfoptions>
 
@@ -220,8 +376,8 @@ To use one of these, just pass the name to `--accelerate_config`. TRL will autom
 
 #### Example Usage
 
-<hfoptions id="accelerate_config">
-<hfoption id="SFT inline">
+<hfoptions id="trainer">
+<hfoption id="SFT">
 
 ```bash
 trl sft \
@@ -230,8 +386,7 @@ trl sft \
   --accelerate_config zero2  # or path/to/my/accelerate/config.yaml
 ```
 
-</hfoption>
-<hfoption id="SFT w/ config file">
+or, with a config file:
 
 ```yaml
 # sft_config.yaml
@@ -247,7 +402,7 @@ trl sft --config sft_config.yaml
 ```
 
 </hfoption>
-<hfoption id="DPO inline">
+<hfoption id="DPO">
 
 ```bash
 trl dpo \
@@ -256,8 +411,7 @@ trl dpo \
   --accelerate_config zero2  # or path/to/my/accelerate/config.yaml
 ```
 
-</hfoption>
-<hfoption id="DPO w/ config file">
+or, with a config file:
 
 ```yaml
 # dpo_config.yaml
@@ -273,7 +427,7 @@ trl dpo --config dpo_config.yaml
 ```
 
 </hfoption>
-<hfoption id="Reward inline">
+<hfoption id="Reward">
 
 ```bash
 trl reward \
@@ -282,8 +436,7 @@ trl reward \
   --accelerate_config zero2  # or path/to/my/accelerate/config.yaml
 ```
 
-</hfoption>
-<hfoption id="Reward w/ config file">
+or, with a config file:
 
 ```yaml
 # reward_config.yaml
@@ -298,6 +451,87 @@ Launch with:
 trl reward --config reward_config.yaml
 ```
 
+</hfoption>
+<hfoption id="GRPO">
+
+```bash
+trl grpo \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name HuggingFaceH4/Polaris-Dataset-53K \
+  --reward_funcs accuracy_reward \
+  --accelerate_config zero2  # or path/to/my/accelerate/config.yaml
+```
+
+or, with a config file:
+
+```yaml
+# grpo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: HuggingFaceH4/Polaris-Dataset-53K
+reward_funcs:
+  - accuracy_reward
+accelerate_config: zero2  # or path/to/my/accelerate/config.yaml
+```
+
+Launch with:
+
+```bash
+trl grpo --config grpo_config.yaml
+```
+
+</hfoption>
+<hfoption id="RLOO">
+
+```bash
+trl rloo \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name HuggingFaceH4/Polaris-Dataset-53K \
+  --reward_funcs accuracy_reward \
+  --accelerate_config zero2  # or path/to/my/accelerate/config.yaml
+```
+
+or, with a config file:
+
+```yaml
+# rloo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: HuggingFaceH4/Polaris-Dataset-53K
+reward_funcs:
+  - accuracy_reward
+accelerate_config: zero2  # or path/to/my/accelerate/config.yaml
+```
+
+Launch with:
+
+```bash
+trl rloo --config rloo_config.yaml
+```
+
+</hfoption>
+<hfoption id="KTO">
+
+```bash
+trl kto \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name trl-lib/kto-mix-14k \
+  --accelerate_config zero2  # or path/to/my/accelerate/config.yaml
+```
+
+or, with a config file:
+
+```yaml
+# kto_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: trl-lib/kto-mix-14k
+accelerate_config: zero2  # or path/to/my/accelerate/config.yaml
+```
+
+Launch with:
+
+```bash
+trl kto --config kto_config.yaml
+```
+
 </hfoption>
 </hfoptions>
 
@@ -305,7 +539,7 @@ trl reward --config reward_config.yaml
 
 You can use dataset mixtures to combine multiple datasets into a single training dataset. This is useful for training on diverse data sources or when you want to mix different types of data.
 
-<hfoptions id="dataset_mixtures">
+<hfoptions id="trainer">
 <hfoption id="SFT">
 
 ```yaml
@@ -356,6 +590,61 @@ Launch with:
 trl reward --config reward_config.yaml
 ```
 
+</hfoption>
+<hfoption id="GRPO">
+
+```yaml
+# grpo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+datasets:
+  - path: HuggingFaceH4/Polaris-Dataset-53K
+  - path: trl-lib/DeepMath-103K
+reward_funcs:
+  - accuracy_reward
+```
+
+Launch with:
+
+```bash
+trl grpo --config grpo_config.yaml
+```
+
+</hfoption>
+<hfoption id="RLOO">
+
+```yaml
+# rloo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+datasets:
+  - path: HuggingFaceH4/Polaris-Dataset-53K
+  - path: trl-lib/DeepMath-103K
+reward_funcs:
+  - accuracy_reward
+```
+
+Launch with:
+
+```bash
+trl rloo --config rloo_config.yaml
+```
+
+</hfoption>
+<hfoption id="KTO">
+
+```yaml
+# kto_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+datasets:
+  - path: trl-lib/kto-mix-14k
+  - path: argilla/ultrafeedback-binarized-preferences-cleaned
+```
+
+Launch with:
+
+```bash
+trl kto --config kto_config.yaml
+```
+
 </hfoption>
 </hfoptions>
 
diff --git a/docs/source/cpo_trainer.md b/docs/source/cpo_trainer.md
index 3dcdb0e11cd..e1ff2a198a4 100644
--- a/docs/source/cpo_trainer.md
+++ b/docs/source/cpo_trainer.md
@@ -24,7 +24,7 @@ Below is the script to train the model:
 ```python
 # train_cpo.py
 from datasets import load_dataset
-from trl import CPOConfig, CPOTrainer
+from trl.experimental.cpo import CPOConfig, CPOTrainer
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
@@ -44,7 +44,7 @@ accelerate launch train_cpo.py
 
 ## Expected dataset type
 
-CPO requires a [preference dataset](dataset_formats#preference). The [`CPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
+CPO requires a [preference dataset](dataset_formats#preference). The [`experimental.cpo.CPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
 
 ## Example script
 
@@ -80,31 +80,31 @@ The abstract from the paper is the following:
 
 > Direct Preference Optimization (DPO) is a widely used offline preference optimization algorithm that reparameterizes reward functions in reinforcement learning from human feedback (RLHF) to enhance simplicity and training stability. In this work, we propose SimPO, a simpler yet more effective approach. The effectiveness of SimPO is attributed to a key design: using the average log probability of a sequence as the implicit reward. This reward formulation better aligns with model generation and eliminates the need for a reference model, making it more compute and memory efficient. Additionally, we introduce a target reward margin to the Bradley-Terry objective to encourage a larger margin between the winning and losing responses, further enhancing the algorithm's performance. We compare SimPO to DPO and its latest variants across various state-of-the-art training setups, including both base and instruction-tuned models like Mistral and Llama3. We evaluated on extensive instruction-following benchmarks, including AlpacaEval 2, MT-Bench, and the recent challenging Arena-Hard benchmark. Our results demonstrate that SimPO consistently and significantly outperforms existing approaches without substantially increasing response length. Specifically, SimPO outperforms DPO by up to 6.4 points on AlpacaEval 2 and by up to 7.5 points on Arena-Hard. Our top-performing model, built on Llama3-8B-Instruct, achieves a remarkable 44.7 length-controlled win rate on AlpacaEval 2 -- surpassing Claude 3 Opus on the leaderboard, and a 33.8 win rate on Arena-Hard -- making it the strongest 8B open-source model.
 
-The SimPO loss is integrated in the [`CPOTrainer`], as it's an alternative loss that adds a reward margin, allows for length normalization, and does not use BC regularization. To use this loss, just turn on `loss_type="simpo"` and `cpo_alpha=0.0` in the [`CPOConfig`] and set the `simpo_gamma` to a recommended value.
+The SimPO loss is integrated in the [`experimental.cpo.CPOTrainer`], as it's an alternative loss that adds a reward margin, allows for length normalization, and does not use BC regularization. To use this loss, just turn on `loss_type="simpo"` and `cpo_alpha=0.0` in the [`experimental.cpo.CPOConfig`] and set the `simpo_gamma` to a recommended value.
 
 ### CPO-SimPO
 
-We also offer the combined use of CPO and SimPO, which enables more stable training and improved performance. Learn more details at [CPO-SimPO GitHub](https://github.com/fe1ixxu/CPO_SIMPO). To use this method, simply enable SimPO by setting `loss_type="simpo"` and a non-zero `cpo_alpha` in the [`CPOConfig`].
+We also offer the combined use of CPO and SimPO, which enables more stable training and improved performance. Learn more details at [CPO-SimPO GitHub](https://github.com/fe1ixxu/CPO_SIMPO). To use this method, simply enable SimPO by setting `loss_type="simpo"` and a non-zero `cpo_alpha` in the [`experimental.cpo.CPOConfig`].
 
 ### AlphaPO
 
-The [AlphaPO -- Reward shape matters for LLM alignment](https://huggingface.co/papers/2501.03884) (AlphaPO) method by Aman Gupta, Shao Tang, Qingquan Song, Sirou Zhu, [Jiwoo Hong](https://huggingface.co/JW17), Ankan Saha, Viral Gupta, Noah Lee, Eunki Kim, Jason Zhu, Natesh Pillai, and S. Sathiya Keerthi is also implemented in the [`CPOTrainer`]. AlphaPO is an alternative method that applies a transformation to the reward function shape in the context of SimPO loss. The abstract from the paper is the following:
+The [AlphaPO -- Reward shape matters for LLM alignment](https://huggingface.co/papers/2501.03884) (AlphaPO) method by Aman Gupta, Shao Tang, Qingquan Song, Sirou Zhu, [Jiwoo Hong](https://huggingface.co/JW17), Ankan Saha, Viral Gupta, Noah Lee, Eunki Kim, Jason Zhu, Natesh Pillai, and S. Sathiya Keerthi is also implemented in the [`experimental.cpo.CPOTrainer`]. AlphaPO is an alternative method that applies a transformation to the reward function shape in the context of SimPO loss. The abstract from the paper is the following:
 
 > Reinforcement Learning with Human Feedback (RLHF) and its variants have made huge strides toward the effective alignment of large language models (LLMs) to follow instructions and reflect human values. More recently, Direct Alignment Algorithms (DAAs) have emerged in which the reward modeling stage of RLHF is skipped by characterizing the reward directly as a function of the policy being learned. Some popular examples of DAAs include Direct Preference Optimization (DPO) and Simple Preference Optimization (SimPO). These methods often suffer from likelihood displacement, a phenomenon by which the probabilities of preferred responses are often reduced undesirably. In this paper, we argue that, for DAAs the reward (function) shape matters. We introduce AlphaPO, a new DAA method that leverages an α-parameter to help change the shape of the reward function beyond the standard log reward. AlphaPO helps maintain fine-grained control over likelihood displacement and overoptimization. Compared to SimPO, one of the best performing DAAs, AlphaPO leads to about 7% to 10% relative improvement in alignment performance for the instruct versions of Mistral-7B and Llama3-8B while achieving 15% to 50% relative improvement over DPO on the same models. The analysis and results presented highlight the importance of the reward shape and how one can systematically change it to affect training dynamics, as well as improve alignment performance.
 
-To use this loss as described in the paper, we can set the `loss_type="alphapo"` which automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`, together with `alpha` and `simpo_gamma` to recommended values in the [`CPOConfig`]. Alternatively, you can manually set `loss_type="simpo"`, `cpo_alpha=0.0`, together with `alpha` and `simpo_gamma` to recommended values. Other variants of this method are also possible, such as setting `loss_type="ipo"` and `alpha` to any non-zero value.
+To use this loss as described in the paper, we can set the `loss_type="alphapo"` which automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`, together with `alpha` and `simpo_gamma` to recommended values in the [`experimental.cpo.CPOConfig`]. Alternatively, you can manually set `loss_type="simpo"`, `cpo_alpha=0.0`, together with `alpha` and `simpo_gamma` to recommended values. Other variants of this method are also possible, such as setting `loss_type="ipo"` and `alpha` to any non-zero value.
 
 ## Loss functions
 
-The CPO algorithm supports several loss functions. The loss function can be set using the `loss_type` parameter in the [`CPOConfig`]. The following loss functions are supported:
+The CPO algorithm supports several loss functions. The loss function can be set using the `loss_type` parameter in the [`experimental.cpo.CPOConfig`]. The following loss functions are supported:
 
 | `loss_type=` | Description |
 | --- | --- |
 | `"sigmoid"` (default) | Given the preference data, we can fit a binary classifier according to the Bradley-Terry model, and in fact, the [DPO](https://huggingface.co/papers/2305.18290) authors propose the sigmoid loss on the normalized likelihood via the `logsigmoid` to fit a logistic regression. |
 | `"hinge"` | The [RSO](https://huggingface.co/papers/2309.06657) authors propose to use a hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. In this case, the `beta` is the reciprocal of the margin. |
 | `"ipo"` | The [IPO](https://huggingface.co/papers/2310.12036) authors provide a deeper theoretical understanding of the DPO algorithms and identify an issue with overfitting and propose an alternative loss. In this case, the `beta` is the reciprocal of the gap between the log-likelihood ratios of the chosen vs the rejected completion pair, and thus the smaller the `beta`, the larger this gap is. As per the paper, the loss is averaged over log-likelihoods of the completion (unlike DPO, which is summed only). |
-| `"simpo"` | The [SimPO](https://huggingface.co/papers/2405.14734) method is also implemented in the [`CPOTrainer`]. SimPO is an alternative loss that adds a reward margin, allows for length normalization, and does not use BC regularization. To use this loss, simply set `loss_type="simpo"` and `cpo_alpha=0.0` in the [`CPOConfig`] and `simpo_gamma` to a recommended value. |
-| `"alphapo"` | The [AlphaPO](https://huggingface.co/papers/2501.03884) method is also implemented in the [`CPOTrainer`]. This is syntactic sugar that automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`. AlphaPO applies a transformation to the reward function shape in the context of SimPO loss when the `alpha` parameter is non-zero. |
+| `"simpo"` | The [SimPO](https://huggingface.co/papers/2405.14734) method is also implemented in the [`experimental.cpo.CPOTrainer`]. SimPO is an alternative loss that adds a reward margin, allows for length normalization, and does not use BC regularization. To use this loss, simply set `loss_type="simpo"` and `cpo_alpha=0.0` in the [`experimental.cpo.CPOConfig`] and `simpo_gamma` to a recommended value. |
+| `"alphapo"` | The [AlphaPO](https://huggingface.co/papers/2501.03884) method is also implemented in the [`experimental.cpo.CPOTrainer`]. This is syntactic sugar that automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`. AlphaPO applies a transformation to the reward function shape in the context of SimPO loss when the `alpha` parameter is non-zero. |
 
 ### For Mixture of Experts Models: Enabling the auxiliary loss
 
@@ -116,11 +116,11 @@ To scale how much the auxiliary loss contributes to the total loss, use the hype
 
 ## CPOTrainer
 
-[[autodoc]] CPOTrainer
+[[autodoc]] experimental.cpo.CPOTrainer
     - train
     - save_model
     - push_to_hub
 
 ## CPOConfig
 
-[[autodoc]] CPOConfig
+[[autodoc]] experimental.cpo.CPOConfig
diff --git a/docs/source/dataset_formats.md b/docs/source/dataset_formats.md
index 7d98b8abe7b..5f86257875d 100644
--- a/docs/source/dataset_formats.md
+++ b/docs/source/dataset_formats.md
@@ -387,21 +387,21 @@ Choosing the right dataset type depends on the task you are working on and the s
 
 | Trainer | Expected dataset type |
 | --- | --- |
-| [`experimental.bco.BCOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
-| [`CPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
 | [`DPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
-| [`GKDTrainer`] | [Prompt-completion](#prompt-completion) |
 | [`GRPOTrainer`] | [Prompt-only](#prompt-only) |
 | [`KTOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
-| [`NashMDTrainer`] | [Prompt-only](#prompt-only) |
 | [`OnlineDPOTrainer`] | [Prompt-only](#prompt-only) |
-| [`experimental.orpo.ORPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
-| [`PPOTrainer`] | Tokenized language modeling |
 | [`PRMTrainer`] | [Stepwise supervision](#stepwise-supervision) |
 | [`RewardTrainer`] | [Preference (implicit prompt recommended)](#preference) |
 | [`RLOOTrainer`] | [Prompt-only](#prompt-only) |
 | [`SFTTrainer`] | [Language modeling](#language-modeling) or [Prompt-completion](#prompt-completion) |
-| [`XPOTrainer`] | [Prompt-only](#prompt-only) |
+| [`experimental.bco.BCOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
+| [`experimental.cpo.CPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
+| [`experimental.gkd.GKDTrainer`] | [Prompt-completion](#prompt-completion) |
+| [`experimental.nash_md.NashMDTrainer`] | [Prompt-only](#prompt-only) |
+| [`experimental.orpo.ORPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
+| [`experimental.ppo.PPOTrainer`] | Tokenized language modeling |
+| [`experimental.xpo.XPOTrainer`] | [Prompt-only](#prompt-only) |
 
 ## Using any dataset with TRL: preprocessing and conversion
 
diff --git a/docs/source/dpo_trainer.md b/docs/source/dpo_trainer.md
index 8e9f0ac41b5..9e524c6a940 100644
--- a/docs/source/dpo_trainer.md
+++ b/docs/source/dpo_trainer.md
@@ -253,7 +253,7 @@ model = AutoModelForCausalLM.from_pretrained(
     "mistralai/mixtral-8x7b-v0.1",
     load_in_4bit=True,
     quantization_config=bnb_config,
-    attn_implementation="flash_attention_2",
+    attn_implementation="kernels-community/flash-attn2",
     dtype=torch.bfloat16,
     device_map="auto",
 )
diff --git a/docs/source/example_overview.md b/docs/source/example_overview.md
index d9cba0b9114..598f8150bdd 100644
--- a/docs/source/example_overview.md
+++ b/docs/source/example_overview.md
@@ -37,26 +37,30 @@ These notebooks are easier to run and are designed for quick experimentation wit
 
 Scripts are maintained in the [`trl/scripts`](https://github.com/huggingface/trl/blob/main/trl/scripts) and [`examples/scripts`](https://github.com/huggingface/trl/blob/main/examples/scripts) directories. They show how to use different trainers such as `SFTTrainer`, `PPOTrainer`, `DPOTrainer`, `GRPOTrainer`, and more.
 
- File | Description |
+| File | Description |
 | --- | --- |
 | [`examples/scripts/bco.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/bco.py) | This script shows how to use the [`KTOTrainer`] with the BCO loss to fine-tune a model to increase instruction-following, truthfulness, honesty, and helpfulness using the [openbmb/UltraFeedback](https://huggingface.co/datasets/openbmb/UltraFeedback) dataset. |
-| [`examples/scripts/cpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/cpo.py) | This script shows how to use the [`CPOTrainer`] to fine-tune a model to increase helpfulness and harmlessness using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset. |
+| [`examples/scripts/cpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/cpo.py) | This script shows how to use the [`experimental.cpo.CPOTrainer`] to fine-tune a model to increase helpfulness and harmlessness using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset. |
 | [`trl/scripts/dpo.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/dpo.py) | This script shows how to use the [`DPOTrainer`] to fine-tune a model. |
 | [`examples/scripts/dpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo_vlm.py) | This script shows how to use the [`DPOTrainer`] to fine-tune a Vision Language Model to reduce hallucinations using the [openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset) dataset. |
-| [`examples/scripts/evals/judge_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/evals/judge_tldr.py) | This script shows how to use [`HfPairwiseJudge`] or [`OpenAIPairwiseJudge`] to judge model generations. |
-| [`examples/scripts/gkd.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gkd.py) | This script shows how to use the [`GKDTrainer`] to fine-tune a model. |
+| [`examples/scripts/evals/judge_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/evals/judge_tldr.py) | This script shows how to use [`experimental.judges.HfPairwiseJudge`] or [`experimental.judges.OpenAIPairwiseJudge`] to judge model generations. |
+| [`examples/scripts/gkd.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gkd.py) | This script shows how to use the [`experimental.gkd.GKDTrainer`] to fine-tune a model. |
 | [`trl/scripts/grpo.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/grpo.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a model. |
 | [`examples/scripts/grpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/grpo_vlm.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset. |
 | [`examples/scripts/gspo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune model for reasoning using the [AI-MO/NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR) dataset. |
 | [`examples/scripts/gspo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo_vlm.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset. |
 | [`examples/scripts/kto.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/kto.py) | This script shows how to use the [`KTOTrainer`] to fine-tune a model. |
 | [`examples/scripts/mpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/mpo_vlm.py) | This script shows how to use MPO via the [`DPOTrainer`] to align a model based on preferences using the [HuggingFaceH4/rlaif-v_formatted](https://huggingface.co/datasets/HuggingFaceH4/rlaif-v_formatted) dataset and a set of loss weights with weights. |
-| [`examples/scripts/nash_md.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/nash_md.py) | This script shows how to use the [`NashMDTrainer`] to fine-tune a model. |
+| [`examples/scripts/nash_md.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/nash_md.py) | This script shows how to use the [`experimental.nash_md.NashMDTrainer`] to fine-tune a model. |
 | [`examples/scripts/online_dpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/online_dpo.py) | This script shows how to use the [`OnlineDPOTrainer`] to fine-tune a model. |
 | [`examples/scripts/online_dpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/online_dpo_vlm.py) | This script shows how to use the [`OnlineDPOTrainer`] to fine-tune a a Vision Language Model. |
+| [`examples/scripts/openenv/browsergym.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/browsergym.py) | Simple script to run GRPO training via the [`GRPOTrainer`] with OpenEnv's BrowserGym environment and vLLM |
+| [`examples/scripts/openenv/catch.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/catch.py) | Simple script to run GRPO training via the [`GRPOTrainer`] with OpenEnv's Catch environment (OpenSpiel) and vLLM |
+| [`examples/scripts/openenv/echo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/echo.py) | Simple script to run GRPO training via the [`GRPOTrainer`] with OpenEnv's Echo environment and vLLM. |
+| [`examples/scripts/openenv/wordle.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/wordle.py) | Simple script to run GRPO training via the [`GRPOTrainer`] with OpenEnv's Wordle environment and vLLM. |
 | [`examples/scripts/orpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/orpo.py) | This script shows how to use the [`experimental.orpo.ORPOTrainer`] to fine-tune a model to increase helpfulness and harmlessness using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset. |
-| [`examples/scripts/ppo/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo.py) | This script shows how to use the [`PPOTrainer`] to fine-tune a model to improve its ability to continue text with positive sentiment or physically descriptive language. |
-| [`examples/scripts/ppo/ppo_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo_tldr.py) | This script shows how to use the [`PPOTrainer`] to fine-tune a model to improve its ability to generate TL;DR summaries. |
+| [`examples/scripts/ppo/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo.py) | This script shows how to use the [`experimental.ppo.PPOTrainer`] to fine-tune a model to improve its ability to continue text with positive sentiment or physically descriptive language. |
+| [`examples/scripts/ppo/ppo_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo_tldr.py) | This script shows how to use the [`experimental.ppo.PPOTrainer`] to fine-tune a model to improve its ability to generate TL;DR summaries. |
 | [`examples/scripts/prm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/prm.py) | This script shows how to use the [`PRMTrainer`] to fine-tune a Process-supervised Reward Model (PRM). |
 | [`examples/scripts/reward_modeling.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/reward_modeling.py) | This script shows how to use the [`RewardTrainer`] to train an Outcome Reward Model (ORM) on your own dataset. |
 | [`examples/scripts/rloo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/rloo.py) | This script shows how to use the [`RLOOTrainer`] to fine-tune a model to improve its ability to solve math questions. |
@@ -66,7 +70,7 @@ Scripts are maintained in the [`trl/scripts`](https://github.com/huggingface/trl
 | [`examples/scripts/sft_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_vlm.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a Vision Language Model in a chat setting. The script has only been tested with [LLaVA 1.5](https://huggingface.co/llava-hf/llava-1.5-7b-hf), [LLaVA 1.6](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf), and [Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) models, so users may see unexpected behaviour in other model architectures. |
 | [`examples/scripts/sft_vlm_gemma3.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_vlm_gemma3.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a Gemma 3 model on vision to text tasks. |
 | [`examples/scripts/sft_vlm_smol_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_vlm_smol_vlm.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a SmolVLM model. |
-| [`examples/scripts/xpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/xpo.py) | This script shows how to use the [`XPOTrainer`] to fine-tune a model. |
+| [`examples/scripts/xpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/xpo.py) | This script shows how to use the [`experimental.xpo.XPOTrainer`] to fine-tune a model. |
 
 ## Distributed Training (for scripts)
 
diff --git a/docs/source/gkd_trainer.md b/docs/source/gkd_trainer.md
index 73be330c637..b703a1712b9 100644
--- a/docs/source/gkd_trainer.md
+++ b/docs/source/gkd_trainer.md
@@ -19,26 +19,23 @@ This post-training method was contributed by [Kashif Rasul](https://huggingface.
 
 ## Usage tips
 
-The [`GKDTrainer`] is a wrapper around the [`SFTTrainer`] class that takes in a teacher model argument. It needs three parameters to be set via the [`GKDConfig`] namely:
+The [`experimental.gkd.GKDTrainer`] is a wrapper around the [`SFTTrainer`] class that takes in a teacher model argument. It needs three parameters to be set via the [`experimental.gkd.GKDConfig`] namely:
 
 * `lmbda`:  controls the student data fraction, i.e., the proportion of on-policy student-generated outputs. When `lmbda=0.0`, the loss reduces to supervised JSD where the student is trained with the token-level probabilities of the teacher. When `lmbda=1.0`, the loss reduces to on-policy JSD, where the student generates output sequences and token-specific feedback on these sequences from the teacher. For values in between [0, 1] it is random between the two based on the `lmbda` value for each batch.
-* `seq_kd`:  controls whether to perform Sequence-Level KD (can be viewed as supervised FT on teacher-generated out). When `seq_kd=True` and `lmbda=0.0`, the loss reduces to supervised JSD, where the teacher generates output sequences and the student receives token-specific feedback on these sequences from the teacher. 
+* `seq_kd`:  controls whether to perform Sequence-Level KD (can be viewed as supervised FT on teacher-generated out). When `seq_kd=True` and `lmbda=0.0`, the loss reduces to supervised JSD, where the teacher generates output sequences and the student receives token-specific feedback on these sequences from the teacher.
 * `beta`: controls the interpolation in the generalized Jensen-Shannon Divergence.  When `beta=0.0` the loss approximates forward KL divergence, while for `beta=1.0` the loss approximates reverse KL divergence. For values in between [0, 1] it interpolates between the two.
 
 The authors find that on-policy data (high `lmbda`) performs better and the optimal `beta` varied depending on the task and evaluation method.
 
 > [!WARNING]
-> Make sure that `attn_implementation="flash_attention_2"` when training [Gemma models](https://huggingface.co/models?other=gemma2). Otherwise you will encounter NaNs in the logits due to the [soft capping technique](https://huggingface.co/blog/gemma2#soft-capping-and-attention-implementations) adopted by this architecture.
+> Make sure that `attn_implementation="kernels-community/flash-attn2"` when training [Gemma models](https://huggingface.co/models?other=gemma2). Otherwise you will encounter NaNs in the logits due to the [soft capping technique](https://huggingface.co/blog/gemma2#soft-capping-and-attention-implementations) adopted by this architecture.
 
 The basic API is as follows:
 
 ```python
 from datasets import Dataset
-from trl import GKDConfig, GKDTrainer
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-)
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl.experimental.gkd import GKDConfig, GKDTrainer
 
 NUM_DUMMY_SAMPLES = 100
 
@@ -92,11 +89,11 @@ The dataset should be formatted as a list of "messages" where each message is a
 
 ## GKDTrainer
 
-[[autodoc]] GKDTrainer
+[[autodoc]] experimental.gkd.GKDTrainer
     - train
     - save_model
     - push_to_hub
 
 ## GKDConfig
 
-[[autodoc]] GKDConfig
+[[autodoc]] experimental.gkd.GKDConfig
diff --git a/docs/source/gold_trainer.md b/docs/source/gold_trainer.md
index 61f68b2029a..ae2591bb3bd 100644
--- a/docs/source/gold_trainer.md
+++ b/docs/source/gold_trainer.md
@@ -13,7 +13,7 @@ Key capabilities:
 
 1. **Cross-tokenizer alignment** – GOLD incrementally decodes the student and teacher tokens, groups passages with the same visible text, and merges probabilities inside each group. This guarantees loss terms are computed over the full completion even when token boundaries differ.
 2. **Hybrid ULD loss** – when `uld_use_hybrid_loss` is enabled, GOLD compares exact vocabulary matches directly and falls back to the original sorted-probability ULD loss for unmatched tokens. This improves stability for students whose vocabularies only partially overlap with the teacher.
-3. **Seamless integration with GKD** – GOLD inherits the on-policy vs. off-policy scheduling from the [`GKDTrainer`](./gkd_trainer.md), so you can combine sequence-level KD, generalized JSD, and cross-tokenizer distillation in a single training run.
+3. **Seamless integration with GKD** – GOLD inherits the on-policy vs. off-policy scheduling from the [`experimental.gkd.GKDTrainer`], so you can combine sequence-level KD, generalized JSD, and cross-tokenizer distillation in a single training run.
 
 > [!NOTE]
 > GOLD is currently part of the `trl.experimental` namespace. APIs may change without notice while the feature is iterated on.
@@ -27,7 +27,7 @@ messages). Important configuration flags on [`GOLDConfig`] include:
 * `teacher_tokenizer_name_or_path` – required when `use_uld_loss=True`; GOLD uses the teacher tokenizer to align tokens.
 * `uld_use_hybrid_loss`, `uld_hybrid_matched_weight`, `uld_hybrid_unmatched_weight` – enables and weights the hybrid
   matched/unmatched loss.
-* `beta`, `lmbda`, `seq_kd` – inherited from `GKDConfig`, controlling the generalized JSD interpolation and on-policy
+* `beta`, `lmbda`, `seq_kd` – inherited from [`experimental.gkd.GKDConfig`], controlling the generalized JSD interpolation and on-policy
   sampling ratio.
 
 A minimal end-to-end example:
diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index a3d99953706..92a40b009d2 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -14,10 +14,10 @@ This post-training method was contributed by [Quentin Gallouédec](https://huggi
 
 ## Quick start
 
-This example demonstrates how to train a model using the GRPO method. We train a [Qwen 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [UltraFeedback prompts dataset](https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt). You can view the data in the dataset here:
+This example demonstrates how to train a model using the GRPO method. We train a [Qwen 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [DeepMath-103K dataset](https://huggingface.co/datasets/trl-lib/DeepMath-103K). You can view the data in the dataset here:
 
 <iframe
-  src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
+  src="https://huggingface.co/datasets/trl-lib/DeepMath-103K/embed/viewer/default/train?row=0"
   frameborder="0"
   width="100%"
   height="560px"
@@ -28,21 +28,14 @@ Below is the script to train the model.
 ```python
 # train_grpo.py
 from datasets import load_dataset
-from trl import GRPOConfig, GRPOTrainer
-
-dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")
+from trl import GRPOTrainer
+from trl.rewards import accuracy_reward
 
-# Dummy reward function for demonstration purposes
-def reward_num_unique_letters(completions, **kwargs):
-    """Reward function that rewards completions with more unique letters."""
-    completion_contents = [completion[0]["content"] for completion in completions]
-    return [float(len(set(content))) for content in completion_contents]
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")
 
-training_args = GRPOConfig(output_dir="Qwen2-0.5B-GRPO")
 trainer = GRPOTrainer(
     model="Qwen/Qwen2-0.5B-Instruct",
-    reward_funcs=reward_num_unique_letters,
-    args=training_args,
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
 trainer.train()
@@ -149,6 +142,7 @@ This constant is recommended to be the maximum completion length. To use this fo
 While training and evaluating, we record the following reward metrics:
 
 - `num_tokens`: The total number of tokens processed so far, including both prompts and completions.
+- `step_time`: The average time (in seconds) taken per training step (including generation).
 - `completions/mean_length`: The average length of generated completions.
 - `completions/min_length`: The minimum length of generated completions.
 - `completions/max_length`: The maximum length of generated completions.
@@ -289,29 +283,27 @@ import argparse
 
 from datasets import load_dataset
 from trl import GRPOTrainer, GRPOConfig
+from trl.rewards import accuracy_reward
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--vllm_server_host", type=str, default="", help="The server IP")
     args = parser.parse_args()
 
-    # Example dataset from TLDR
-    dataset = load_dataset("trl-lib/tldr", split="train")
-
-    # Dummy reward function: count the number of unique characters in the completions
-    def reward_num_unique_chars(completions, **kwargs):
-        return [len(set(c)) for c in completions]
+    dataset = load_dataset("trl-lib/DeepMath-103K", split="train")
 
     training_args = GRPOConfig(
-        output_dir="Qwen2.5-72B-GRPO",
         per_device_train_batch_size=4,
-        bf16=True,
-        gradient_checkpointing=True,
         use_vllm=True,
         vllm_server_host=args.vllm_server_host.replace("ip-", "").replace("-", "."),  # from ip-X-X-X-X to X.X.X.X
     )
 
-    trainer = GRPOTrainer(model="Qwen/Qwen2.5-72B", args=training_args, reward_funcs=reward_num_unique_chars, train_dataset=dataset)
+    trainer = GRPOTrainer(
+        model="Qwen/Qwen2.5-72B",
+        args=training_args,
+        reward_funcs=accuracy_reward,
+        train_dataset=dataset
+    )
     trainer.train()
 
 if __name__=="__main__":
diff --git a/docs/source/index.md b/docs/source/index.md
index e0268d51868..01f20cf5c0b 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -25,9 +25,9 @@ Below is the current list of TRL trainers, organized by method type (⚡️ = vL
 - [`GRPOTrainer`] ⚡️
 - [`RLOOTrainer`] ⚡️
 - [`OnlineDPOTrainer`] ⚡️
-- [`NashMDTrainer`] ⚡️
-- [`XPOTrainer`] ⚡️
-- [`PPOTrainer`]
+- [`experimental.nash_md.NashMDTrainer`] 🧪 ⚡️
+- [`experimental.ppo.PPOTrainer`] 🧪
+- [`experimental.xpo.XPOTrainer`] 🧪 ⚡️
 
 ### Reward modeling
 
@@ -41,14 +41,15 @@ Below is the current list of TRL trainers, organized by method type (⚡️ = vL
 
 - [`SFTTrainer`]
 - [`DPOTrainer`]
+- [`KTOTrainer`]
 - [`experimental.bco.BCOTrainer`] 🧪
+- [`experimental.cpo.CPOTrainer`] 🧪
 - [`experimental.orpo.ORPOTrainer`] 🧪
-- [`CPOTrainer`]
-- [`KTOTrainer`]
 
 ### Knowledge distillation
 
-- [`GKDTrainer`]
+- [`experimental.gkd.GKDTrainer`] 🧪
+- [`experimental.minillm.MiniLLMTrainer`] 🧪
 
 </div>
 </div>
diff --git a/docs/source/judges.md b/docs/source/judges.md
index be7fc140cee..11ad824a776 100644
--- a/docs/source/judges.md
+++ b/docs/source/judges.md
@@ -1,7 +1,7 @@
 # Judges
 
 > [!WARNING]
-> TRL Judges is an experimental API which is subject to change at any time.
+> TRL Judges is an experimental API which is subject to change at any time. As of TRL v1.0, judges have been moved to the `trl.experimental.judges` module.
 
 TRL provides judges to easily compare two completions.
 
@@ -13,10 +13,10 @@ pip install trl[judges]
 
 ## Using the provided judges
 
-TRL provides several judges out of the box. For example, you can use the [`HfPairwiseJudge`] to compare two completions using a pre-trained model from the Hugging Face model hub:
+TRL provides several judges out of the box. For example, you can use the [`experimental.judges.HfPairwiseJudge`] to compare two completions using a pre-trained model from the Hugging Face model hub:
 
 ```python
-from trl import HfPairwiseJudge
+from trl.experimental.judges import HfPairwiseJudge
 
 judge = HfPairwiseJudge()
 judge.judge(
@@ -27,12 +27,12 @@ judge.judge(
 
 ## Define your own judge
 
-To define your own judge, we provide several base classes that you can subclass. For rank-based judges, you need to subclass [`BaseRankJudge`] and implement the [`BaseRankJudge.judge`] method. For pairwise judges, you need to subclass [`BasePairJudge`] and implement the [`BasePairJudge.judge`] method. If you want to define a judge that doesn't fit into these categories, you need to subclass [`BaseJudge`] and implement the [`BaseJudge.judge`] method.
+To define your own judge, we provide several base classes that you can subclass. For rank-based judges, you need to subclass [`experimental.judges.BaseRankJudge`] and implement the [`experimental.judges.BaseRankJudge.judge`] method. For pairwise judges, you need to subclass [`experimental.judges.BasePairJudge`] and implement the [`experimental.judges.BasePairJudge.judge`] method. If you want to define a judge that doesn't fit into these categories, you need to subclass [`experimental.judges.BaseJudge`] and implement the [`experimental.judges.BaseJudge.judge`] method.
 
 As an example, let's define a pairwise judge that prefers shorter completions:
 
 ```python
-from trl import BasePairwiseJudge
+from trl.experimental.judges import BasePairwiseJudge
 
 class PrefersShorterJudge(BasePairwiseJudge):
     def judge(self, prompts, completions, shuffle_order=False):
@@ -53,34 +53,34 @@ judge.judge(
 
 ### PairRMJudge
 
-[[autodoc]] PairRMJudge
+[[autodoc]] trl.experimental.judges.PairRMJudge
 
 ### HfPairwiseJudge
 
-[[autodoc]] HfPairwiseJudge
+[[autodoc]] trl.experimental.judges.HfPairwiseJudge
 
 ### OpenAIPairwiseJudge
 
-[[autodoc]] OpenAIPairwiseJudge
+[[autodoc]] trl.experimental.judges.OpenAIPairwiseJudge
 
 ### AllTrueJudge
 
-[[autodoc]] AllTrueJudge
+[[autodoc]] trl.experimental.judges.AllTrueJudge
 
 ## Base classes
 
 ### BaseJudge
 
-[[autodoc]] BaseJudge
+[[autodoc]] trl.experimental.judges.BaseJudge
 
 ### BaseBinaryJudge
 
-[[autodoc]] BaseBinaryJudge
+[[autodoc]] trl.experimental.judges.BaseBinaryJudge
 
 ### BaseRankJudge
 
-[[autodoc]] BaseRankJudge
+[[autodoc]] trl.experimental.judges.BaseRankJudge
 
 ### BasePairwiseJudge
 
-[[autodoc]] BasePairwiseJudge
+[[autodoc]] trl.experimental.judges.BasePairwiseJudge
diff --git a/docs/source/kernels_hub.md b/docs/source/kernels_hub.md
index a4c4a651557..f3d7ee124ba 100644
--- a/docs/source/kernels_hub.md
+++ b/docs/source/kernels_hub.md
@@ -27,20 +27,20 @@ from transformers import AutoModelForCausalLM
 
 model = AutoModelForCausalLM.from_pretrained(
     "your-model-name",
-    attn_implementation="kernels-community/flash-attn"  # other options: kernels-community/vllm-flash-attn3, kernels-community/paged-attention
+    attn_implementation="kernels-community/flash-attn2"  # other options: kernels-community/vllm-flash-attn3, kernels-community/paged-attention
 )
 ```
 
 Or when running a TRL training script:
 
 ```bash
-python sft.py ... --attn_implementation kernels-community/flash-attn
+python sft.py ... --attn_implementation kernels-community/flash-attn2
 ```
 
 Or using the TRL CLI:
 
 ```bash
-trl sft ... --attn_implementation kernels-community/flash-attn
+trl sft ... --attn_implementation kernels-community/flash-attn2
 ```
 
 > [!TIP]
@@ -84,7 +84,7 @@ from trl import SFTConfig
 
 model = AutoModelForCausalLM.from_pretrained(
     "your-model-name",
-    attn_implementation="kernels-community/flash-attn"  # choose the desired FlashAttention variant
+    attn_implementation="kernels-community/flash-attn2"  # choose the desired FlashAttention variant
 )
 
 training_args = SFTConfig(
diff --git a/docs/source/liger_kernel_integration.md b/docs/source/liger_kernel_integration.md
index 0a0a95eb0f1..7a387c813fd 100644
--- a/docs/source/liger_kernel_integration.md
+++ b/docs/source/liger_kernel_integration.md
@@ -67,7 +67,7 @@ training_args = KTOConfig(..., use_liger_kernel=True)
 <hfoption id="GKD">
 
 ```python
-from trl import GKDConfig
+from trl.experimental.gkd import GKDConfig
 
 training_args = GKDConfig(..., use_liger_kernel=True)
 ```
diff --git a/docs/source/minillm.md b/docs/source/minillm.md
new file mode 100644
index 00000000000..6db88955dc3
--- /dev/null
+++ b/docs/source/minillm.md
@@ -0,0 +1,67 @@
+# MiniLLM Trainer
+
+[![All_models-MiniLLM-blue](https://img.shields.io/badge/All_models-MiniLLM-blue)](https://huggingface.co/models?other=minillm,trl)
+
+## Overview
+
+TRL supports the MiniLLM Trainer for distilling large language models into smaller ones using reverse KLD for better precision, quality, and performance, as described in the paper [Knowledge Distillation of Large Language Models](https://huggingface.co/papers/2306.08543) by [Yuxian Gu](https://huggingface.co/t1101675), [Li Dong](https://huggingface.co/unilm), [Furu Wei](https://huggingface.co/thegenerality), and Minlie Huang.
+The abstract from the paper is the following:
+
+> Knowledge Distillation (KD) is a promising technique for reducing the high computational demand of large language models (LLMs). However, previous KD methods are primarily applied to white-box classification models or training small models to imitate black-box model APIs like ChatGPT. How to effectively distill the knowledge from white-box generative LLMs is still under-explored, which becomes more and more important with the prosperity of LLMs. In this work, we propose MiniLLM that distills smaller language models from generative larger language models. We first replace the forward Kullback-Leibler divergence (KLD) objective in the standard KD approaches with reverse KLD, which is more suitable for KD on generative language models, to prevent the student model from overestimating the low-probability regions of the teacher distribution. Then, we derive an effective optimization approach to learn this objective. Extensive experiments in the instruction-following setting show that the MiniLLM models generate more precise responses with the higher overall quality, lower exposure bias, better calibration, and higher long-text generation performance. Our method is also scalable for different model families with 120M to 13B parameters. We will release our code and model checkpoints at https://aka.ms/MiniLLM.
+
+This post-training method was contributed by [Yuxian Gu](https://huggingface.co/t1101675).
+
+It is a generalized version of [Think Machine Lab's On-Policy Distillation](https://thinkingmachines.ai/blog/on-policy-distillation/), with the option to add distribution-level single-step distillation signals (like GKD when `beta=1`) and long-context reverse KLD signals.
+
+$$
+\begin{align}
+L_{\text{MiniLLM}}&=\alpha_1\mathbb{E}_{x\sim \pi_{\theta}}\sum_{t'=t}^{|x|}\frac{\gamma^{t'-t}}{\sum_{t'}\gamma^{t'-t}}\left[\log \frac{\pi_{\theta}(x_{t'+1}|x_{1..t'})}{\pi_{\text{teacher}}(x_{t'+1}|x_{1..t'})}\right] \\
+&+ \alpha_2\mathbb{E}_{x\sim \pi_{\theta}} \text{KL}\left[\pi_\theta(\cdot|x_{1..t})||\pi_{\text{teacher}}(\cdot | x_{1..t})\right].
+\end{align}
+$$
+
+When  \\( \alpha_1=1 \\), \\( \alpha_2=0 \\), \\( \gamma=0 \\), which corresponds to
+
+```python
+from trl.experimental.minillm import MiniLLMConfig
+
+training_args = MiniLLMConfig(
+    rkl_advantage=True,
+    single_step_decomposition=False,
+    gamma=False
+)
+```
+
+\\( L_{\text{MiniLLM}} \\) becomes the on-policy KD implemented in [Tinker](https://github.com/thinking-machines-lab/tinker-cookbook/blob/5d08be6d130596b7bedd02197861c41fa81ea436/tinker_cookbook/distillation/train_on_policy.py#L88):
+
+$$
+L_{\text{tinker}}=\mathbb{E}_{x\sim \pi_{\theta}}\left[\log \frac{\pi_{\theta}(x_{t'+1}|x_{1..t'})}{\pi_{\text{teacher}}(x_{t'+1}|x_{1..t'})}\right].
+$$
+
+When \\( \alpha_1=0 \\), \\( \alpha_2=1 \\), which corresponds to
+
+```python
+from trl.experimental.minillm import MiniLLMConfig
+
+training_args = MiniLLMConfig(
+    rkl_advantage=False,
+    single_step_decomposition=True
+)
+```
+
+\\( L_{\text{MiniLLM}} \\) becomes the reverse KLD version of the GKD loss as in [GKD Trainer](./gkd.md):
+
+$$
+L_{\text{GKD-RKL}}=\mathbb{E}_{x\sim \pi_{\theta}} \text{KL}\left[\pi_\theta(\cdot|x_{1..t})||\pi_{\text{teacher}}(\cdot | x_{1..t})\right].
+$$
+
+## MiniLLMTrainer
+
+[[autodoc]] experimental.minillm.MiniLLMTrainer
+    - train
+    - save_model
+    - push_to_hub
+
+## MiniLLMConfig
+
+[[autodoc]] experimental.minillm.MiniLLMConfig
diff --git a/docs/source/nash_md_trainer.md b/docs/source/nash_md_trainer.md
index 8b87c2a8b31..e86592769a5 100644
--- a/docs/source/nash_md_trainer.md
+++ b/docs/source/nash_md_trainer.md
@@ -14,7 +14,7 @@ This post-training method was contributed by [Kashif Rasul](https://huggingface.
 
 ## Quick start
 
-This example demonstrates how to train a model using the Nash-MD method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and [`PairRMJudge`] as a judge. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
+This example demonstrates how to train a model using the Nash-MD method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and [`experimental.judges.PairRMJudge`] as a judge. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
 
 <iframe
   src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
@@ -28,7 +28,8 @@ Below is the script to train the model:
 ```python
 # train_nash_md.py
 from datasets import load_dataset
-from trl import NashMDConfig, NashMDTrainer, PairRMJudge
+from trl.experimental.judges import PairRMJudge
+from trl.experimental.nash_md import NashMDConfig, NashMDTrainer
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
@@ -63,7 +64,7 @@ The best programming language depends on personal preference, the complexity of
 
 ## Expected dataset type
 
-Nash-MD requires a [prompt-only dataset](dataset_formats#prompt-only). The [`NashMDTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
+Nash-MD requires a [prompt-only dataset](dataset_formats#prompt-only). The [`experimental.nash_md.NashMDTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
 
 ## Usage tips
 
@@ -72,7 +73,7 @@ Nash-MD requires a [prompt-only dataset](dataset_formats#prompt-only). The [`Nas
 Instead of a judge, you can chose to use a reward model -- see [Reward Bench](https://huggingface.co/spaces/allenai/reward-bench) for a leaderboard of public models you can use. Below is a code example showing how to replace a judge with the [trl-lib/Qwen2-0.5B-Reward](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward) model:
 
 ```diff
-- from trl import PairRMJudge
+- from trl.experimental.judges import PairRMJudge
 + from transformers import AutoModelForSequenceClassification
 
 - judge = PairRMJudge()
@@ -90,7 +91,7 @@ Instead of a judge, you can chose to use a reward model -- see [Reward Bench](ht
 
 ### Encourage EOS token generation
 
-We may want the model to generate completions within a given length. During training, the model will generate completions up to the maximum length specified in the `max_new_tokens` argument of [`NashMDConfig`]. If you want to penalize the model for not generating an EOS token before reaching the maximum length, you can use the `missing_eos_penalty` argument of [`NashMDConfig`]:
+We may want the model to generate completions within a given length. During training, the model will generate completions up to the maximum length specified in the `max_new_tokens` argument of [`experimental.nash_md.NashMDConfig`]. If you want to penalize the model for not generating an EOS token before reaching the maximum length, you can use the `missing_eos_penalty` argument of [`experimental.nash_md.NashMDConfig`]:
 
 ```python
 training_args = NashMDConfig(..., max_new_tokens=128, missing_eos_penalty=1.0)
@@ -143,16 +144,16 @@ While training and evaluating, we record the following reward metrics:
 * `logps/rejected`: The mean log probabilities of the reference completions.
 * `val/model_contain_eos_token`: The amount of times the model's output contains the eos token.
 * `val/ref_contain_eos_token`: The amount of times the mixture's output contains the eos token.
-* `beta`: The parameter that controls the weight of the loss term representing the deviation from the reference model. Typically fixed, but can be made dynamic by passing a list to [`NashMDConfig`].
-* `mixture_coef`: Logit mixture coefficient for the model and reference model. Typically fixed, but can be made dynamic by passing a list to [`NashMDConfig`].
+* `beta`: The parameter that controls the weight of the loss term representing the deviation from the reference model. Typically fixed, but can be made dynamic by passing a list to [`experimental.nash_md.NashMDConfig`].
+* `mixture_coef`: Logit mixture coefficient for the model and reference model. Typically fixed, but can be made dynamic by passing a list to [`experimental.nash_md.NashMDConfig`].
 
 ## NashMDTrainer
 
-[[autodoc]] NashMDTrainer
+[[autodoc]] experimental.nash_md.NashMDTrainer
     - train
     - save_model
     - push_to_hub
 
 ## NashMDConfig
 
-[[autodoc]] NashMDConfig
+[[autodoc]] experimental.nash_md.NashMDConfig
diff --git a/docs/source/online_dpo_trainer.md b/docs/source/online_dpo_trainer.md
index 9c7660f9b7b..f84731a3d7b 100644
--- a/docs/source/online_dpo_trainer.md
+++ b/docs/source/online_dpo_trainer.md
@@ -14,7 +14,7 @@ This post-training method was contributed by [Michael Noukhovitch](https://huggi
 
 ## Quick start
 
-This example demonstrates how to train a model using the online DPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and [`PairRMJudge`] as a judge. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
+This example demonstrates how to train a model using the online DPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and [`experimental.judges.PairRMJudge`] as a judge. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
 
 <iframe
   src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
@@ -28,7 +28,8 @@ Below is the script to train the model:
 ```python
 # train_online_dpo.py
 from datasets import load_dataset
-from trl import OnlineDPOConfig, OnlineDPOTrainer, PairRMJudge
+from trl import OnlineDPOConfig, OnlineDPOTrainer
+from trl.experimental.judges import PairRMJudge
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
@@ -74,7 +75,7 @@ Online DPO only requires a [prompt-only dataset](dataset_formats#prompt-only) (u
 Instead of a judge, you can chose to use a reward model -- see [Reward Bench](https://huggingface.co/spaces/allenai/reward-bench) for a leaderboard of public models you can use. Below is a code example showing how to replace a judge with the [trl-lib/Qwen2-0.5B-Reward](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward) model:
 
 ```diff
-- from trl import PairRMJudge
+- from trl.experimental.judges import PairRMJudge
 + from transformers import AutoModelForSequenceClassification
 
 - judge = PairRMJudge()
diff --git a/docs/source/openenv.md b/docs/source/openenv.md
index 146bd5db7c6..b8924ce3ed3 100644
--- a/docs/source/openenv.md
+++ b/docs/source/openenv.md
@@ -1,11 +1,15 @@
 # OpenEnv Integration for Training LLMs with Environments
 
-## Overview
-
-[OpenEnv](https://github.com/meta-pytorch/OpenEnv) is an open-source framework from Meta's PyTorch team for defining, deploying, and interacting with environments in reinforcement learning (RL) and agentic workflows. It offers [Gymnasium-style APIs](https://gymnasium.farama.org) (e.g., `reset()` and `step()`) to interface with environments in a standard manner, and supports running these environments as backend servers (for example via HTTP or containerised execution). You can find a collection of ready-to-use OpenEnv environments on the [Hugging Face Hub](https://huggingface.co/collections/openenv/environment-hub).
+[OpenEnv](https://github.com/meta-pytorch/OpenEnv) is an open-source framework from Meta's PyTorch team for defining, deploying, and interacting with environments in reinforcement learning (RL) and agentic workflows. It offers [Gymnasium-style APIs](https://gymnasium.farama.org) (e.g., `reset()` and `step()`) to interface with environments in a standard manner, and supports running these environments as backend servers (for example, via HTTP or containerised execution). You can find a collection of ready-to-use OpenEnv environments on the [Hugging Face Hub](https://huggingface.co/collections/openenv/environment-hub).
 
 In this guide, we’ll focus on **how to integrate OpenEnv with TRL**, but feel free to explore the links above to dive deeper into OpenEnv itself.
 
+> [!NOTE]
+> You can explore ready-to-use example scripts in the [`examples/scripts/openenv/`](https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/) directory.
+
+> [!NOTE]
+> Explore the [OpenEnv docs](https://meta-pytorch.org/OpenEnv/) for more details.
+
 ## Installation
 
 To use OpenEnv with TRL, install the framework:
@@ -25,16 +29,14 @@ A rollout function must have the following signature:
 ```python
 def rollout_func(
     prompts: list[str],
-    args: GRPOConfig,
-    processing_class
+    trainer: GRPOTrainer,
 ) -> dict[str, list]:
     """
     Custom rollout function for generation and reward computation.
 
     Args:
-        prompts: List of prompts to generate from
-        args: GRPOConfig containing sampling parameters (temperature, top_p, etc.)
-        processing_class: Tokenizer/processor for encoding/decoding
+        prompts: List of prompts routed to the current process
+        trainer: Active GRPOTrainer (gives access to tokenizer, config and helper utilities)
 
     Returns:
         Dictionary containing:
@@ -53,8 +55,8 @@ def rollout_func(
 
 The typical pattern when combining OpenEnv with TRL looks like this:
 
-1. Start or connect to an OpenEnv environment (e.g., an HTTP endpoint or Dockerized env).
-2. Generate completions from your model — for example, via a vLLM inference server (`use_vllm=True`, `vllm_mode="server"`).
+1. Start or connect to an OpenEnv environment (e.g., a Dockerized env or HTTP endpoint).
+2. Generate completions from your model — either via `trl.experimental.openenv.generate_rollout_completions` when using colocated vLLM, or by hitting your inference server when using vLLM in server mode.
 3. Step through the environment using each completion to compute rewards or metrics.
 4. Add environment results (e.g., `env_reward`) to the rollout result dict.
 5. Access those rewards inside your reward function via `**kwargs`.
@@ -65,43 +67,157 @@ By using OpenEnv in this loop, you can:
 * Plug in custom simulators, web APIs, or evaluators as environments.
 * Pass structured reward signals back into RL training seamlessly.
 
+### vLLM Modes
+
+TRL supports two vLLM execution modes for generation:
+
+- **`colocate` mode** (default): vLLM runs in the same process as training. Requires 1 GPU. Use `trl.experimental.openenv.generate_rollout_completions` for generation.
+- **`server` mode**: vLLM runs as a separate server process. Requires at least 2 GPUs (one for vLLM server, one for training), but is highly scalable:
+  - You can allocate multiple GPUs to the vLLM server for tensor parallelism (faster inference)
+  - You can run multiple training processes that share the same vLLM server
+  - You can use different GPU types for inference vs training (e.g., A100 for vLLM, H100 for training)
+  - The vLLM server can serve multiple experiments simultaneously
+  - Use `trl.experimental.openenv.generate_rollout_completions` which will communicate with the server via `vllm_server_url`
+
+Configure the mode via `GRPOConfig`:
+
+```python
+# Colocate mode (1 GPU)
+args = GRPOConfig(
+    use_vllm=True,
+    vllm_mode="colocate",
+    # ... other args
+)
+
+# Server mode (2+ GPUs, scalable)
+args = GRPOConfig(
+    use_vllm=True,
+    vllm_mode="server",
+    vllm_server_base_url="http://localhost:8000",
+    # ... other args
+)
+
+# Example: Start vLLM server with multiple GPUs for tensor parallelism
+# CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model Qwen/Qwen3-1.7B --tensor-parallel-size 4
+```
+
 ## Running the Environments
 
-You can run OpenEnv environments in three different ways:
-
-1. **Local Docker container** *(recommended)*
-
-   To start a Docker container:
-   * Open the environment on the Hugging Face Hub.
-   * Click the **⋮ (three dots)** menu.
-   * Select **“Run locally.”**
-   * Copy and execute the provided command in your terminal.
-
-   Example:
-   ```bash
-   docker run -d -p 8001:8001 registry.hf.space/openenv-echo-env:latest
-    ```
-    ![open_env_launch_docker](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/open_env_launch_docker.png)
-2. **Local Python process**: Launch the environment directly using Uvicorn.
-   You can start the server manually as a local process. For more details about the available environments, refer to the [OpenEnv repository](https://github.com/meta-pytorch/OpenEnv/tree/main/src/envs).
-   ```bash
-   python -m uvicorn envs.echo_env.server.app:app --host 0.0.0.0 --port 8001
-   ```
-3. **Hugging Face Spaces**: Connect to a hosted environment running on the Hugging Face Hub.
-   To find the connection URL, open the Space page, click the **⋮ (three dots)** menu, and select **“Embed this Space.”**
-   You can then use that URL to connect directly from your client.
-   Keep in mind that public Spaces may have rate limits or temporarily go offline if inactive.
+You can run OpenEnv environments in three different ways: 
+
+- We can load the environment from the Hugging Face Hub and execute it as a Docker container.
+- We can connect to a hosted environment running on the Hugging Face Hub.
+- We can launch the environment directly using Uvicorn in Python.
+
+<hfoptions id="env_mode">
+
+<hfoption id="docker">
+
+**Load from Hugging Face Hub** *(recommended)*
+
+We can use the [`from_hub`](https://meta-pytorch.org/OpenEnv/core/#core.http_env_client.HTTPEnvClient.from_hub) method to load the environment from the hub. This method will automatically start a Docker container for the environment on your local machine. [`openenv/echo-env`](https://huggingface.co/spaces/openenv/echo_env) is the repo_id of the space on the hub.
+
+```python
+env = EchoEnv.from_hub("openenv/echo-env")
+```
+
+If you want to launch the environment manually, you can use the following command to pull and run the Docker container:
+
+```bash
+docker run -d -p 8001:8000 --platform linux/amd64  registry.hf.space/openenv-echo-env:latest
+```
+
+And then you can connect to the environment using the following code:
+
+```python
+env = EchoEnv(base_url="http://0.0.0.0:8001")
+```
+
+Here, we map the ports from 8001 to 8000 to make space for a vLLM server, but you will need to manage the ports for your local machine.
+
+> [!NOTE]
+> You can find the Docker container for any space on the hub.
+>
+> * Open the space page on the hub.
+> * Click the **⋮ (three dots)** menu.
+> * Select **“Run locally.”**
+> * Copy and execute the provided command in your terminal.
+>
+> ![open_env_launch_docker](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/open_env_launch_docker.png)
+
+> [!NOTE]
+> You can also use the **Docker option** with `from_docker_image` by providing the image name..
+> For more details, refer to the official [OpenEnv documentation](https://meta-pytorch.org/OpenEnv/core/).
+
+</hfoption>
+<hfoption id="space">
+
+**Connect to a remote Hugging Face Space**
+
+You can connect to a hosted environment running on the Hugging Face Hub by passing the URL of the space to the `base_url` parameter of the environment class.
+
+```python
+env = EchoEnv(base_url="https://openenv-echo-env.hf.space")
+```
+
+> [!NOTE]
+> You can find the connection URL of any space on the hub.
+>
+> * Open the space page on the hub.
+> * Click the **⋮ (three dots)** menu.
+> * Select **“Embed this Space.”**
+> * Copy the connection URL.
+
+> [!WARNING]
+> **Currently**, it is recommended to **duplicate the Space to your own account** to avoid potential concurrency issues.  
+
+</hfoption>
+
+<hfoption id="local">
+
+**Local Python process**
+
+You can start the server manually as a local Python process. For more details about the available environments, refer to the [OpenEnv catalog](https://meta-pytorch.org/OpenEnv/environments/).
+   
+```bash
+hf download openenv/echo_env --repo-type=space --local-dir=echo_env
+python -m uvicorn echo_env.src.envs.echo_env.server.app:app --host 0.0.0.0 --port 8001
+```
+
+And then you can connect to the environment using the following code:
+
+```python
+env = EchoEnv(base_url="http://0.0.0.0:8001")
+```
+
+</hfoption>
+
+</hfoptions>
+
+## Environments Catalog
+
+Environment development is active and evolving.
+The best way to explore the **current catalog of maintained environments** is by visiting the official OpenEnv [catalog](https://huggingface.co/collections/openenv/environment-hub).
+
+Custom environments are also supported. To learn how to create your own, check out the guide on [Building Your Own Environment with OpenEnv](https://meta-pytorch.org/OpenEnv/environment-builder/).
+
+Environments are tightly integrated with the Hub, allowing you to **push new environments directly** so the community can easily pull, reuse, and adapt them for their own use cases.
 
 ## A simple example
 
-The [echo.py](https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/echo.py) script demonstrates a minimal, end-to-end integration between TRL and OpenEnv. In this example, the Echo environment rewards completions based on their text length, encouraging the model to generate longer outputs. This pattern can be extended to any custom environment that provides structured feedback or task-based rewards:
+> [!NOTE]
+> You can explore more ready-to-use example scripts in the [`examples/scripts/openenv/`](https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/) directory.
+
+The [echo.py](https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/echo.py) script demonstrates a minimal, end-to-end integration between TRL and OpenEnv. In this example, the [Echo environment](https://meta-pytorch.org/OpenEnv/environments/echo/) rewards completions based on their text length, encouraging the model to generate longer outputs. This pattern can be extended to any custom environment that provides structured feedback or task-based rewards:
 
 ```python
 from envs.echo_env import EchoEnv, EchoAction
 from trl import GRPOConfig, GRPOTrainer
+from trl.experimental.openenv import generate_rollout_completions
 
 # Create HTTP client for Echo Environment
-client = EchoEnv.from_docker_image("echo-env:latest")
+client = EchoEnv.from_hub("openenv/echo-env")
+
 """
 Alternatively, you can start the environment manually with Docker and connect to it:
 
@@ -112,21 +228,13 @@ docker run -d -p 8001:8001 registry.hf.space/openenv-echo-env:latest
 client = EchoEnv(base_url="http://0.0.0.0:8001")
 """
 
-def rollout_func(prompts, args, processing_class):
-    # 1. Generate completions via vLLM inference server (running on port 8000)
-    payload = {
-        "prompts": prompts,
-        "n": args.num_generations,
-        "temperature": args.temperature,
-        "max_tokens": args.max_completion_length,
-    }
-    response = requests.post("http://0.0.0.0:8000/generate/", json=payload)
-    result = response.json()
-
-    completions_text = processing_class.batch_decode(
-        result["completion_ids"],
-        skip_special_tokens=True
-    )
+def rollout_func(prompts: list[str], trainer: GRPOTrainer):
+    # 1. Generate completions using TRL's helper (works for colocated vLLM)
+    outputs = generate_rollout_completions(trainer, prompts)
+    tokenizer = trainer.processing_class
+    completions_text = [
+        tokenizer.decode(out["completion_ids"], skip_special_tokens=True) for out in outputs
+    ]
 
     # 2. Step through the environment to get rewards
     client.reset()
@@ -136,8 +244,12 @@ def rollout_func(prompts, args, processing_class):
         env_rewards.append(env_result.reward)
 
     # 3. Add environment rewards as extra field
-    result["env_reward"] = env_rewards
-    return result
+    return {
+        "prompt_ids": [out["prompt_ids"] for out in outputs],
+        "completion_ids": [out["completion_ids"] for out in outputs],
+        "logprobs": [out["logprobs"] for out in outputs],
+        "env_reward": env_rewards,
+    }
 
 def reward_from_env(completions, **kwargs):
     """Extract environment rewards passed via rollout_func kwargs."""
@@ -153,8 +265,8 @@ trainer = GRPOTrainer(
     train_dataset=dataset,
     rollout_func=rollout_func,  # Use custom rollout
     args=GRPOConfig(
-        vllm_mode="server",
         use_vllm=True,
+        vllm_mode="colocate",  # Use colocate mode (default)
         num_train_epochs=1,
         num_generations=8,
         max_completion_length=2048,
@@ -165,28 +277,55 @@ trainer = GRPOTrainer(
 trainer.train()
 ```
 
-That's it! Now that you’ve seen the full example, let’s unpack how the main pieces fit together.
+That's it! Now that you've seen the full example, let's unpack how the main pieces fit together.
 
-1. **Environment Client:** `EchoEnv` implements an HTTP interface to interact with the environment server.  
-2. **Custom rollout:** The `rollout_func` generates completions and steps through the environment to collect rewards.  
-3. **Extra fields:** The rollout adds `env_reward` to the result dictionary, which is automatically passed to reward functions.  
+1. **Environment Client:** `EchoEnv` implements an HTTP interface to interact with the environment server.
+2. **Custom rollout:** The `rollout_func` generates completions and steps through the environment to collect rewards.
+3. **Extra fields:** The rollout adds `env_reward` to the result dictionary, which is automatically passed to reward functions.
 4. **Reward function:** Extracts `env_reward` from `kwargs` to apply environment-computed rewards during training.
 
-> [!WARNING]
-> The `rollout_func` is currently only supported when using vLLM in server mode (`use_vllm=True`, `vllm_mode="server"`).
+> [!TIP]
+> The trainer-aware rollout hook works in both vLLM server and colocate modes. Use `trl.experimental.openenv.generate_rollout_completions` so you reuse TRL's sampling configuration automatically.
 
 ### Running the Example
 
-The example requires two GPUs:
+You can run the example in either colocate mode (1 GPU) or server mode (2 GPUs):
+
+<hfoptions id="vllm_mode">
+
+<hfoption id="colocate">
+
+**Colocate mode (1 GPU, recommended)**
+
+```bash
+python examples/scripts/openenv/echo.py --vllm-mode colocate
+```
+
+This runs vLLM in the same process as training, requiring only a single GPU.
+
+</hfoption>
+
+<hfoption id="server">
+
+**Server mode (2+ GPUs, scalable)**
 
 ```bash
 # Terminal 1: Start vLLM inference server
 CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-0.5B-Instruct --host 0.0.0.0 --port 8000
 
 # Terminal 2: Run GRPO training with OpenEnv
-CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/echo.py
+CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/echo.py --vllm-mode server --vllm-server-url http://localhost:8000
 ```
 
+This runs vLLM as a separate server process, useful when you want to:
+- Share the inference server across multiple training jobs
+- Use multiple GPUs for the vLLM server (via `--tensor-parallel-size`)
+- Scale up training to many GPUs while sharing a single inference endpoint
+
+</hfoption>
+
+</hfoptions>
+
 Alternatively, you can manually start the Echo environment in a Docker container before running the training:
 
 ```bash
@@ -206,26 +345,24 @@ Below is the reward curve from training:
 
 <iframe src="https://trl-lib-trackio.hf.space?project=openenv&metrics=train/rewards/reward_from_env/mean&runs=qgallouedec-1761202871&sidebar=hidden&navbar=hidden" style="width:600px; height:500px; border:0;"></iframe>
 
-To learn more about how to create custom environments, see the [OpenEnv documentation](https://github.com/meta-pytorch/OpenEnv/blob/main/src/envs/README.md).
-
 ## Advanced Example
 
-Let's level this up a bit by training a model to interact with a more complex environment. We'll use the game word guessing game [wordle](https://www.nytimes.com/games/wordle/index.html) from the `textarena` environment. 
+Let's level this up a bit by training a model to interact with a more complex environment. We'll use the game word guessing game [wordle](https://www.nytimes.com/games/wordle/index.html) from the [`TextArena`](https://meta-pytorch.org/OpenEnv/environments/textarena/) environment.
 
 ### The TextArena Environment
 
 [TextArena](https://huggingface.co/papers/2504.11442) is an open-source collection of competitive text-based games designed to evaluate reasoning skills in LLMs using textual games like Wordle, Snake, Tic-Tac-Toe, and more. Research has shown that such games improve model performance on reasoning tasks.
 
-![image of textarena](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/text_arena_evals.png)
+![image of TextArena](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/text_arena_evals.png)
 
-We will use the `textarena` environment to train a model to play Wordle. The environment is a simple text based response environment that allows the model to interact with the game by making guesses and receive feedback on them.
+We will use the `TextArena` environment to train a model to play Wordle. The environment is a simple text based response environment that allows the model to interact with the game by making guesses and receive feedback on them.
 
 ### Wordle
 
 Wordle is a useful game to train a model on because it requires the model to reason about the word and the feedback provided by the environment. Also, it is a purely language based game that requires no external tools or knowledge. Furthermore, we found that models from 1 billion parameters and up are able to improve on wordle and only require 8 tokens to generate a guess, which makes the game a good benchmark to experiment with Reinforcement Learning environments without significant compute requirements.
 
 > [!NOTE] How does Wordle work?
-> Wordle is a word guessing game where the player has to guess a 5-letter word. The player can make 6 guesses, and for each guess, the environment will provide feedback on the correctness of the guess. The player wins if they guess the word in 6 guesses or less. It challenges the model to generate words that are likely to be correct, and to learn from the feedback provided by the environment. 
+> Wordle is a word guessing game where the player has to guess a 5-letter word. The player can make 6 guesses, and for each guess, the environment will provide feedback on the correctness of the guess. The player wins if they guess the word in 6 guesses or fewer. It challenges the model to generate words that are likely to be correct, and to learn from the feedback provided by the environment. 
 > 
 > For example, if the wordle environment returns the following feedback:
 >
@@ -233,9 +370,9 @@ Wordle is a useful game to train a model on because it requires the model to rea
 > G U E S S
 > X G Y X X
 > ```
-> The model has guessed the word "GUESS" and the environment has provided feedback as the letters X, G, and Y. Referring to colors in the original game blank, green, and yellow. From this feedback, the model should learn that the word is "GUESS" is incorrect. The letter "E" is in the word, but in the wrong position. The letter "U" is correct and in the correct position.
+> The model has guessed the word "GUESS" and the environment has provided feedback as the letters X, G, and Y. Referring to colors in the original game as blank, green, and yellow. From this feedback, the model should learn that the word "GUESS" is incorrect. The letter "E" is in the word, but in the wrong position. The letter "U" is correct and in the correct position.
  
-In the TextArena environment, reward is only given when the model wins the game. The reward is 1.0 if the model wins, and 0.0 otherwise. This is not a very efficient reward signal for the model, so we have added a number of custom reward functions to the script to help the model learn to play the game. The extensible nature of `reward_funcs` and `rollout_func` allows you to add any custom reward function you want to the script.  
+In the TextArena environment, a reward is only given when the model wins the game. The reward is 1.0 if the model wins, and 0.0 otherwise. This is not a very efficient reward signal for the model, so we have added a number of custom reward functions to the script to help the model learn to play the game. The extensible nature of `reward_funcs` and `rollout_func` allows you to add any custom reward function you want to the script.  
 
 ### Rollout Function
 
@@ -243,12 +380,12 @@ The rollout function runs one full Wordle episode, prompting the model for a gue
 
 ```python
 def rollout_once(
+    trainer: GRPOTrainer,
     env: TextArenaEnv,
     tokenizer: AutoTokenizer,
-    args: GRPOConfig,
     dataset_prompt: str,
-    cli_args: argparse.Namespace,
     system_prompt: str,
+    max_turns: int,
 ) -> dict[str, list]:
     result = env.reset()
     observation = result.observation
@@ -263,7 +400,7 @@ def rollout_once(
     correct_scores: list[float] = []
     guess_counts: dict[str, int] = {}
 
-    for _turn in range(cli_args.max_turns):
+    for _turn in range(max_turns):
         # when the game is over the environment will return a done=True
         if result.done:
             break
@@ -282,20 +419,15 @@ def rollout_once(
             enable_thinking=False,
         )
 
-        # generate the completion from the model using vLLM
-        vllm_result = request_vllm_completion(
-            prompt_text,
-            args,
-            endpoint=cli_args.vllm_endpoint,
-            timeout=cli_args.request_timeout,
-            fallback=cli_args,
-        )
-        prompt_ids.extend(vllm_result["prompt_ids"])
-        completion_ids.extend(vllm_result["completion_ids"])
-        logprobs.extend(vllm_result["logprobs"])
-        completion_text = vllm_result.get("text") or tokenizer.decode(
-            vllm_result["completion_ids"], skip_special_tokens=True
+        # Generate completion using trainer (works for both colocate and server modes)
+        rollout_outputs = generate_rollout_completions(trainer, [prompt_text])[0]
+        prompt_ids.extend(rollout_outputs["prompt_ids"])
+        completion_ids.extend(rollout_outputs["completion_ids"])
+        logprobs.extend(rollout_outputs["logprobs"])
+        completion_text = rollout_outputs.get("text") or tokenizer.decode(
+            rollout_outputs["completion_ids"], skip_special_tokens=True
         )
+
         # extract the guess from the completion
         guess = extract_guess(completion_text)
 
@@ -307,9 +439,9 @@ def rollout_once(
         feedback = extract_wordle_feedback(observation)
 
         # Update guess counts
-        previous_occurrences = guess_counts[guess]
+        previous_occurrences = guess_counts.get(guess, 0)
         repetition_score = scale_repetition_score(previous_occurrences, len(guess_counts))
-        guess_counts[guess] += 1
+        guess_counts[guess] = previous_occurrences + 1
 
         # calculate custom reward signals from the feedback
         if not feedback:
@@ -391,11 +523,11 @@ trainer = GRPOTrainer(
     ],
     train_dataset=dataset,
     args=grpo_config,
-    rollout_func=lambda prompts, args, processing_class: rollout_func(
+    rollout_func=lambda prompts, trainer: rollout_func(
         env=env,
         tokenizer=tokenizer,
         prompts=prompts,
-        args=args,
+        trainer=trainer,
         cli_args=cli_args,
         system_prompt=system_prompt,
     ),
@@ -405,31 +537,56 @@ trainer.train()
 
 ### Running the Advanced Example
 
-The example requires two GPUs:
+You can run the Wordle example in either colocate mode (1 GPU) or server mode (2 GPUs):
+
+<hfoptions id="wordle_vllm_mode">
+
+<hfoption id="colocate">
+
+**Colocate mode (1 GPU, recommended)**
+
+```bash
+python examples/scripts/openenv/wordle.py --vllm-mode colocate
+```
+
+This runs vLLM in the same process as training, requiring only a single GPU.
+
+</hfoption>
+
+<hfoption id="server">
+
+**Server mode (2+ GPUs, scalable)**
 
 ```bash
 # Terminal 1: Start vLLM inference server
 CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen3-1.7B --host 0.0.0.0 --port 8000
 
 # Terminal 2: Run GRPO training with OpenEnv
-CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/wordle.py
+CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/wordle.py --vllm-mode server --vllm-server-url http://localhost:8000
 ```
 
-Again, you can manually start the TextArena environment in a Docker container before running the training.
-In this case, initialize the client with
-`client = TextArenaEnv(base_url="http://0.0.0.0:8001")`
-instead of
-`client = TextArenaEnv.from_docker_image("registry.hf.space/burtenshaw-textarena:latest")`:
+This runs vLLM as a separate server process, useful when you want to:
+- Share the inference server across multiple training jobs
+- Use multiple GPUs for the vLLM server (via `--tensor-parallel-size`)
+- Scale up training to many GPUs while sharing a single inference endpoint
+
+</hfoption>
+
+</hfoptions>
+
+You can also manually start the TextArena environment in a Docker container before running the training:
 
 ```bash
 # Launch the TextArena environment
 docker run -d -p 8001:8001 registry.hf.space/burtenshaw-textarena:latest
 ```
 
+Then connect to it using `--env-mode docker-local--env-host localhost --env-port 8001`.
+
 ### Results
 
-The resulting model improves it's performance on the game, both by reducing the number of repetitions and by increasing the number of correct guesses. However, the the Qwen3-1.7B model we trained is not able to consistently win the game. The following reward curve shows the coverage of the model's guesses and the coverage of correct Y and G letters.
+The resulting model improves its performance on the game, both by reducing the number of repetitions and by increasing the number of correct guesses. However, the Qwen3-1.7B model we trained is not able to consistently win the game. The following reward curve shows the coverage of the model's guesses and the coverage of correct Y and G letters.
 
 <iframe src="https://burtenshaw-wordle-grpo.hf.space?project=group-Qwen-Qwen3-17B&metrics=reward&runs=run-2025-10-26_09-39-49,run-2025-10-26_08-04-49&sidebar=hidden&navbar=hidden" style="width:1600px; height:500px; border:0;"></iframe>
 
-We experimented larger models like `gpt-oss-20b` and found that model was able to consistently win the game. However, this requires a lot of compute to train and the model. Why not try this out yourself?
\ No newline at end of file
+We experimented with larger models like `gpt-oss-20b` and found that the model was able to consistently win the game. However, this requires a lot of compute to train the model. Why not try this out yourself?
diff --git a/docs/source/paper_index.md b/docs/source/paper_index.md
index 6467548d8ea..bdc41263013 100644
--- a/docs/source/paper_index.md
+++ b/docs/source/paper_index.md
@@ -142,7 +142,7 @@ training_args = GRPOConfig(
     top_p=0.99,
     top_k=100,
     temperature=0.99,
-    num_completions=8, # = num_return_sequences in the paper
+    num_generations=8, # = num_return_sequences in the paper
     num_iterations=1,  # = ppo_epochs in the paper
     per_device_train_batch_size=4,
     gradient_accumulation_steps=32,
@@ -232,6 +232,28 @@ trainer = PAPOTrainer(
 )
 ```
 
+### The Art of Scaling Reinforcement Learning
+
+**📜 Paper**: https://huggingface.co/papers/2510.13786
+
+A systematic study that defines a framework for analyzing and predicting reinforcement learning scaling in large language models, identifies key design choices that affect compute efficiency and propose a best-practice recipe called ScaleRL.
+
+You can partially reproduce the ScaleRL recipe using the [`GRPOTrainer`] with the following configs:
+
+```python
+from trl import GRPOConfig
+
+config = GRPOConfig(
+    loss_type="cispo",
+    epsilon_high=5.0,
+    num_generations=16,
+    scale_rewards="batch",
+    cast_lm_head_to_fp32=True
+)
+```
+
+
+
 ## Direct Policy Optimization
 
 Papers relating to the [`DPOTrainer`]
@@ -534,7 +556,7 @@ training_args = RLOOConfig(
 
 ## Contrastive Preference Optimization
 
-Papers relating to the [`CPOTrainer`]
+Papers relating to the [`experimental.cpo.CPOTrainer`]
 
 ### AlphaPO -- Reward shape matters for LLM alignment
 
@@ -543,7 +565,7 @@ Papers relating to the [`CPOTrainer`]
 AlphaPO is a new Direct Alignment Algorithms (DAAs) method that leverages an alpha-parameter to help change the shape of the reward function beyond the standard log reward. AlphaPO helps maintain fine-grained control over likelihood displacement and over-optimization. To reproduce the paper's setting, use this configuration:
 
 ```python
-from trl import CPOConfig
+from trl.experimental.cpo import CPOConfig
 
 # Mistral-Instruct from Table 3 of the paper
 training_args = CPOConfig(
@@ -624,12 +646,12 @@ On-Policy Distillation has been shown to outperform SFT, GRPO and can be used to
 
 Additionally on-policy distillation is more compute efficient and is less prone to overfitting when trained with limited data.
 
-To train a model with on-policy distillation using TRL, you can use the following configuration, with the [`GKDTrainer`] and [`GKDConfig`]:
+To train a model with on-policy distillation using TRL, you can use the following configuration, with the [`experimental.gkd.GKDTrainer`] and [`experimental.gkd.GKDConfig`]:
 
 ```python
-from trl import GKDConfig
+from trl.experimental.gkd import GKDConfig
 
-config = GKDConfig(
+training_args = GKDConfig(
     lmbda=1.0, # student produces rollouts for all batches
     beta=1.0, # to ensure reverse-kl as the loss function
     teacher_model_name_or_path="teacher-model", # specify the teacher model
@@ -649,3 +671,29 @@ config = GOLDConfig(
 
 )
 ```
+
+### Knowledge Distillation of Large Language Models
+
+**📜 Paper**: https://huggingface.co/papers/2306.08543
+
+MiniLLM is the first on-policy knowledge distillation method, which minimizes the sequence-level reverse KLD between the teacher and the student model and is optimized by reinforcement learning.
+
+It is a generalized version of [Think Machine Lab's On-Policy Distillation](https://thinkingmachines.ai/blog/on-policy-distillation/), with the option to add distribution-level single-step distillation signals (like GKD when `beta=1`) and long-context reverse KLD signals.
+
+Alternatively, you can use the [`experimental.MiniLLMTrainer`] and [`experimental.MiniLLMConfig`] to perform MiniLLM distillation as follows:
+
+```python
+from datasets import load_dataset
+from trl.experimental.minillm import MiniLLMTrainer
+
+dataset = load_dataset("trl-lib/tldr", split="train")
+
+trainer = MiniLLMTrainer(
+    model="Qwen/Qwen3-0.6B",
+    teacher_model="Qwen/Qwen3-1.7B",
+    train_dataset=dataset,
+)
+trainer.train()
+```
+
+For more details, see the [MiniLLM Trainer documentation](minillm) documentation.
diff --git a/docs/source/peft_integration.md b/docs/source/peft_integration.md
index bd196dd99bf..221d9b7071b 100644
--- a/docs/source/peft_integration.md
+++ b/docs/source/peft_integration.md
@@ -146,7 +146,8 @@ After training your reward adapter and pushing it to the Hub:
 
 ```python
 from peft import LoraConfig
-from trl import AutoModelForCausalLMWithValueHead, PPOTrainer
+from trl import AutoModelForCausalLMWithValueHead
+from trl.experimental.ppo import PPOTrainer
 
 model_name = "huggyllama/llama-7b"
 rm_adapter_id = "trl-lib/llama-7b-hh-rm-adapter"
diff --git a/docs/source/ppo_trainer.md b/docs/source/ppo_trainer.md
index 1dabbc4177c..3f7ea2ee73f 100644
--- a/docs/source/ppo_trainer.md
+++ b/docs/source/ppo_trainer.md
@@ -1,5 +1,11 @@
 # PPO Trainer
 
+<Tip warning={true}>
+
+**Deprecation Notice**: PPOTrainer and PPOConfig have been moved to `trl.experimental.ppo` and will be removed from `trl.trainer` in TRL 0.29.0. Please update your imports to use `from trl.experimental.ppo import PPOConfig, PPOTrainer` instead. See [issue #4466](https://github.com/huggingface/trl/issues/4466) for more information.
+
+</Tip>
+
 [![model badge](https://img.shields.io/badge/All_models-PPO-blue)](https://huggingface.co/models?other=ppo,trl)
 
 TRL supports training LLMs with [Proximal Policy Optimization (PPO)](https://huggingface.co/papers/1707.06347).
@@ -228,11 +234,11 @@ python -m openrlbenchmark.rlops_multi_metrics \
 
 ## PPOTrainer
 
-[[autodoc]] PPOTrainer
+[[autodoc]] experimental.ppo.PPOTrainer
     - train
     - save_model
     - push_to_hub
 
 ## PPOConfig
 
-[[autodoc]] PPOConfig
+[[autodoc]] experimental.ppo.PPOConfig
diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md
index 3a89cf55120..6661762af93 100644
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -24,15 +24,12 @@ trainer.train()
 ```python
 from trl import GRPOTrainer
 from datasets import load_dataset
-
-# Define a simple reward function (count unique chars as example)
-def reward_function(completions, **kwargs):
-    return [len(set(completion.lower())) for completion in completions]
+from trl.rewards import accuracy_reward
 
 trainer = GRPOTrainer(
     model="Qwen/Qwen2.5-0.5B-Instruct",  # Start from SFT model
-    train_dataset=load_dataset("trl-lib/tldr", split="train"),
-    reward_funcs=reward_function,
+    train_dataset=load_dataset("trl-lib/DeepMath-103K", split="train"),
+    reward_funcs=accuracy_reward,
 )
 trainer.train()
 ```
diff --git a/docs/source/reducing_memory_usage.md b/docs/source/reducing_memory_usage.md
index f258c0a20f8..f92ebb29edb 100644
--- a/docs/source/reducing_memory_usage.md
+++ b/docs/source/reducing_memory_usage.md
@@ -90,6 +90,33 @@ from trl import SFTConfig
 training_args = SFTConfig(..., packing=True, max_length=512)
 ```
 
+## PEFT for parameter-efficient fine-tuning
+
+Parameter-Efficient Fine-Tuning (PEFT) methods like LoRA are among the most effective techniques for reducing memory usage during training. Instead of training all model parameters, PEFT methods train only a small number of adapter parameters, significantly reducing memory requirements and enabling fine-tuning of larger models on limited hardware.
+
+For comprehensive details on using PEFT with TRL, including various adapter methods, quantization options, and advanced configurations, see [PEFT Integration](peft_integration).
+
+To use PEFT for reducing memory usage:
+
+```python
+from datasets import load_dataset
+from peft import LoraConfig
+from trl import SFTTrainer
+
+dataset = load_dataset("trl-lib/Capybara", split="train")
+
+peft_config = LoraConfig()
+
+trainer = SFTTrainer(
+    model="Qwen/Qwen2.5-0.5B",
+    train_dataset=dataset,
+    peft_config=peft_config,
+)
+```
+
+PEFT can be combined with other memory reduction techniques such as quantization (4-bit or 8-bit) for even greater memory savings. See [PEFT Integration](peft_integration) for quantization examples.
+
+
 ## Liger for reducing peak memory usage
 
 > [Liger Kernel](https://github.com/linkedin/Liger-Kernel) is a collection of Triton kernels designed specifically for LLM training. It can effectively increase multi-GPU training throughput by 20% and reduce memory usage by 60%.
@@ -138,7 +165,7 @@ training_args = KTOConfig(..., use_liger_kernel=True)
 <hfoption id="GKD">
 
 ```python
-from trl import GKDConfig
+from trl.experimental.gkd import GKDConfig
 
 training_args = GKDConfig(..., use_liger_kernel=True)
 ```
@@ -161,7 +188,7 @@ Padding-free batching is an alternative approach for reducing memory usage. In t
 ```python
 from trl import DPOConfig
 
-training_args = DPOConfig(..., padding_free=True, model_init_kwargs={"attn_implementation": "flash_attention_2"})
+training_args = DPOConfig(..., padding_free=True, model_init_kwargs={"attn_implementation": "kernels-community/flash-attn2"})
 ```
 
 </hfoption>
@@ -170,7 +197,7 @@ training_args = DPOConfig(..., padding_free=True, model_init_kwargs={"attn_imple
 ```python
 from trl import SFTConfig
 
-training_args = SFTConfig(..., padding_free=True, model_init_kwargs={"attn_implementation": "flash_attention_2"})
+training_args = SFTConfig(..., padding_free=True, model_init_kwargs={"attn_implementation": "kernels-community/flash-attn2"})
 ```
 
 </hfoption>
@@ -247,7 +274,7 @@ training_args = OnlineDPOConfig(..., ds3_gather_for_generation=False)
 <hfoption id="PPO">
 
 ```python
-from trl import PPOConfig
+from trl.experimental.ppo import PPOConfig
 
 training_args = PPOConfig(..., ds3_gather_for_generation=False)
 ```
@@ -290,3 +317,5 @@ training_args = RLOOConfig(..., vllm_enable_sleep_mode=True)
 
 </hfoption>
 </hfoptions>
+
+Offloading the vLLM weights and cache helps keep GPU memory usage low, which can be particularly beneficial when training large models or using limited GPU resources. However, waking the vLLM engine from sleep mode introduces some host–device transfer latency, which may slightly impact training speed.
diff --git a/docs/source/rloo_trainer.md b/docs/source/rloo_trainer.md
index 36d315e678d..68173d218da 100644
--- a/docs/source/rloo_trainer.md
+++ b/docs/source/rloo_trainer.md
@@ -15,10 +15,10 @@ This post-training method was contributed by [Costa Huang](https://github.com/vw
 
 ## Quick start
 
-This example demonstrates how to train a model using the RLOO method. We train a [Qwen 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [UltraFeedback prompts dataset](https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt). You can view the data in the dataset here:
+This example demonstrates how to train a model using the RLOO method. We train a [Qwen 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [DeepMath-103K dataset](https://huggingface.co/datasets/trl-lib/DeepMath-103K). You can view the data in the dataset here:
 
 <iframe
-  src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
+  src="https://huggingface.co/datasets/trl-lib/DeepMath-103K/embed/viewer/default/train?row=0"
   frameborder="0"
   width="100%"
   height="560px"
@@ -29,21 +29,14 @@ Below is the script to train the model.
 ```python
 # train_rloo.py
 from datasets import load_dataset
-from trl import RLOOConfig, RLOOTrainer
-
-dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")
+from trl import RLOOTrainer
+from trl.rewards import accuracy_reward
 
-# Dummy reward function for demonstration purposes
-def reward_num_unique_letters(completions, **kwargs):
-    """Reward function that rewards completions with more unique letters."""
-    completion_contents = [completion[0]["content"] for completion in completions]
-    return [float(len(set(content))) for content in completion_contents]
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")
 
-training_args = RLOOConfig(output_dir="Qwen2-0.5B-RLOO")
 trainer = RLOOTrainer(
     model="Qwen/Qwen2-0.5B-Instruct",
-    reward_funcs=reward_num_unique_letters,
-    args=training_args,
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
 trainer.train()
@@ -135,6 +128,7 @@ In a fully online, single-step setting (default),  \\( \frac{\pi_\theta(o_i \mid
 While training and evaluating, we record the following reward metrics:
 
 - `num_tokens`: The total number of tokens processed so far, including both prompts and completions.
+- `step_time`: The average time (in seconds) taken per training step (including generation).
 - `completions/mean_length`: The average length of generated completions.
 - `completions/min_length`: The minimum length of generated completions.
 - `completions/max_length`: The maximum length of generated completions.
diff --git a/docs/source/vllm_integration.md b/docs/source/vllm_integration.md
index d9a0f6a1fd8..482b1925db6 100644
--- a/docs/source/vllm_integration.md
+++ b/docs/source/vllm_integration.md
@@ -10,9 +10,9 @@ This document will guide you through the process of using vLLM with TRL for fast
 >
 > - [`GRPOTrainer`]
 > - [`OnlineDPOTrainer`]
-> - [`NashMDTrainer`]
-> - [`XPOTrainer`]
 > - [`RLOOTrainer`]
+> - [`experimental.nash_md.NashMDTrainer`]
+> - [`experimental.xpo.XPOTrainer`]
 
 ## 🚀 How can I use vLLM with TRL to speed up training?
 
@@ -46,24 +46,14 @@ Sample of a simple `train.py` script:
 ```python
 from datasets import load_dataset
 from trl import GRPOTrainer, GRPOConfig
+from trl.rewards import accuracy_reward
 
-dataset = load_dataset("trl-lib/tldr", split="train")
-
-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
-
-training_args = GRPOConfig(
-    output_dir="my_test",
-    use_vllm=True,
-    bf16=True,
-    gradient_checkpointing=True,
-)
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")
 
 trainer = GRPOTrainer(
     model="Qwen/Qwen2.5-7B",
-    args=training_args,
-    reward_funcs=reward_num_unique_chars,
+    args=GRPOConfig(use_vllm=True),
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
 
@@ -76,24 +66,14 @@ trainer.train()
 ```python
 from datasets import load_dataset
 from trl import OnlineDPOTrainer, OnlineDPOConfig
+from trl.rewards import accuracy_reward
 
-dataset = load_dataset("trl-lib/tldr", split="train")
-
-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
-
-training_args = OnlineDPOConfig(
-    output_dir="my_test",
-    use_vllm=True,
-    bf16=True,
-    gradient_checkpointing=True,
-)
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")
 
 trainer = OnlineDPOTrainer(
     model="Qwen/Qwen2.5-7B",
-    args=training_args,
-    reward_funcs=reward_num_unique_chars,
+    args=OnlineDPOConfig(use_vllm=True),
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
 
@@ -105,25 +85,15 @@ trainer.train()
 
 ```python
 from datasets import load_dataset
-from trl import NashMDTrainer, NashMDConfig
+from trl.experimental.nash_md import NashMDConfig, NashMDTrainer
+from trl.rewards import accuracy_reward
 
-dataset = load_dataset("trl-lib/tldr", split="train")
-
-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
-
-training_args = NashMDConfig(
-    output_dir="my_test",
-    use_vllm=True,
-    bf16=True,
-    gradient_checkpointing=True,
-)
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")
 
 trainer = NashMDTrainer(
     model="Qwen/Qwen2.5-7B",
-    args=training_args,
-    reward_funcs=reward_num_unique_chars,
+    args=NashMDConfig(use_vllm=True),
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
 
@@ -136,24 +106,14 @@ trainer.train()
 ```python
 from datasets import load_dataset
 from trl import XPOTrainer, XPOConfig
+from trl.rewards import accuracy_reward
 
-dataset = load_dataset("trl-lib/tldr", split="train")
-
-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
-
-training_args = XPOConfig(
-    output_dir="my_test",
-    use_vllm=True,
-    bf16=True,
-    gradient_checkpointing=True,
-)
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")
 
 trainer = XPOTrainer(
     model="Qwen/Qwen2.5-7B",
-    args=training_args,
-    reward_funcs=reward_num_unique_chars,
+    args=XPOConfig(use_vllm=True),
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
 
@@ -166,24 +126,14 @@ trainer.train()
 ```python
 from datasets import load_dataset
 from trl import RLOOTrainer, RLOOConfig
+from trl.rewards import accuracy_reward
 
-dataset = load_dataset("trl-lib/tldr", split="train")
-
-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
-
-training_args = RLOOConfig(
-    output_dir="my_test",
-    use_vllm=True,
-    bf16=True,
-    gradient_checkpointing=True,
-)
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")
 
 trainer = RLOOTrainer(
     model="Qwen/Qwen2.5-7B",
-    args=training_args,
-    reward_funcs=reward_num_unique_chars,
+    args=RLOOConfig(use_vllm=True),
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
 
@@ -379,7 +329,7 @@ training_args = OnlineDPOConfig(
 <hfoption id="NashMD">
 
 ```python
-from trl import NashMDConfig
+from trl.experimental.nash_md import NashMDConfig
 
 training_args = NashMDConfig(
     ...,
@@ -392,7 +342,7 @@ training_args = NashMDConfig(
 <hfoption id="XPO">
 
 ```python
-from trl import XPOConfig
+from trl.experimental.xpo import XPOConfig
 
 training_args = XPOConfig(
     ...,
@@ -454,7 +404,7 @@ training_args = OnlineDPOConfig(
 <hfoption id="NashMD">
 
 ```python
-from trl import NashMDConfig
+from trl.experimental.nash_md import NashMDConfig
 
 training_args = NashMDConfig(
     ...,
@@ -467,7 +417,7 @@ training_args = NashMDConfig(
 <hfoption id="XPO">
 
 ```python
-from trl import XPOConfig
+from trl.experimental.xpo import XPOConfig
 
 training_args = XPOConfig(
     ...,
diff --git a/docs/source/xpo_trainer.md b/docs/source/xpo_trainer.md
index c4f9372b6b8..5817c9fa54c 100644
--- a/docs/source/xpo_trainer.md
+++ b/docs/source/xpo_trainer.md
@@ -12,9 +12,12 @@ The abstract from the paper is the following:
 
 This post-training method was contributed by [Kashif Rasul](https://huggingface.co/kashif),  [Quentin Gallouédec](https://huggingface.co/qgallouedec) and [Lewis Tunstall](https://huggingface.co/lewtun).
 
+> [!NOTE]
+> XPO is currently experimental. The API may change without notice while the feature is iterated on.
+
 ## Quick start
 
-This example demonstrates how to train a model using the XPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and [`PairRMJudge`] as a judge. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
+This example demonstrates how to train a model using the XPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and [`experimental.judges.PairRMJudge`] as a judge. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
 <iframe
   src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
   frameborder="0"
@@ -27,7 +30,8 @@ Below is the script to train the model:
 ```python
 # train_xpo.py
 from datasets import load_dataset
-from trl import PairRMJudge, XPOConfig, XPOTrainer
+from trl.experimental.judges import PairRMJudge
+from trl.experimental.xpo import XPOConfig, XPOTrainer
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
@@ -62,7 +66,7 @@ The best programming language depends on individual preferences and familiarity
 
 ## Expected dataset type
 
-XPO requires a [prompt-only dataset](dataset_formats#prompt-only). The [`XPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
+XPO requires a [prompt-only dataset](dataset_formats#prompt-only). The [`experimental.xpo.XPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
 
 ## Usage tips
 
@@ -71,7 +75,7 @@ XPO requires a [prompt-only dataset](dataset_formats#prompt-only). The [`XPOTrai
 Instead of a judge, you can chose to use a reward model -- see [Reward Bench](https://huggingface.co/spaces/allenai/reward-bench) for a leaderboard of public models you can use. Below is a code example showing how to replace a judge with the [trl-lib/Qwen2-0.5B-Reward](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward) model:
 
 ```diff
-- from trl import PairRMJudge
+- from trl.experimental.judges import PairRMJudge
 + from transformers import AutoModelForSequenceClassification
 
 - judge = PairRMJudge()
@@ -89,7 +93,7 @@ Instead of a judge, you can chose to use a reward model -- see [Reward Bench](ht
 
 ### Encourage EOS token generation
 
-When using a reward model, we may want the model to generate completions within a given length. During training, the model will generate completions up to the maximum length specified in the `max_new_tokens` argument of [`XPOConfig`]. If you want to penalize the model for not generating an EOS token before reaching the maximum length, you can use the `missing_eos_penalty` argument of [`XPOConfig`]:
+When using a reward model, we may want the model to generate completions within a given length. During training, the model will generate completions up to the maximum length specified in the `max_new_tokens` argument of [`experimental.xpo.XPOConfig`]. If you want to penalize the model for not generating an EOS token before reaching the maximum length, you can use the `missing_eos_penalty` argument of [`experimental.xpo.XPOConfig`]:
 
 ```python
 training_args = XPOConfig(..., max_new_tokens=128, missing_eos_penalty=1.0)
@@ -145,16 +149,16 @@ While training and evaluating we record the following reward metrics:
 * `logps/rejected`: The mean log probabilities of the rejected completions.
 * `val/model_contain_eos_token`: The amount of times the model's output contains the eos token.
 * `val/ref_contain_eos_token`: The amount of times the reference's output contains the eos token.
-* `alpha`: The weight of the XPO loss term. Typically fixed, but can be made dynamic by passing a list to [`XPOConfig`].
-* `beta`: The parameter that controls the weight of the loss term representing the deviation from the reference model. Typically fixed, but can be made dynamic by passing a list to [`XPOConfig`].
+* `alpha`: The weight of the XPO loss term. Typically fixed, but can be made dynamic by passing a list to [`experimental.xpo.XPOConfig`].
+* `beta`: The parameter that controls the weight of the loss term representing the deviation from the reference model. Typically fixed, but can be made dynamic by passing a list to [`experimental.xpo.XPOConfig`].
 
 ## XPOTrainer
 
-[[autodoc]] XPOTrainer
+[[autodoc]] experimental.xpo.XPOTrainer
     - train
     - save_model
     - push_to_hub
 
 ## XPOConfig
 
-[[autodoc]] XPOConfig
+[[autodoc]] experimental.xpo.XPOConfig
diff --git a/examples/datasets/deepmath_103k.py b/examples/datasets/deepmath_103k.py
new file mode 100644
index 00000000000..3976d23fbeb
--- /dev/null
+++ b/examples/datasets/deepmath_103k.py
@@ -0,0 +1,98 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from datasets import load_dataset
+from huggingface_hub import ModelCard
+from transformers import HfArgumentParser
+
+
+@dataclass
+class ScriptArguments:
+    r"""
+    Arguments for the script.
+
+    Args:
+        push_to_hub (`bool`, *optional*, defaults to `False`):
+            Whether to push the dataset to the Hugging Face Hub.
+        repo_id (`str`, *optional*, defaults to `"trl-lib/DeepMath-103K"`):
+            Hugging Face repository ID to push the dataset to.
+        dataset_num_proc (`int`, *optional*):
+            Number of workers to use for dataset processing.
+    """
+
+    push_to_hub: bool = field(
+        default=False,
+        metadata={"help": "Whether to push the dataset to the Hugging Face Hub."},
+    )
+    repo_id: str = field(
+        default="trl-lib/DeepMath-103K",
+        metadata={"help": "Hugging Face repository ID to push the dataset to."},
+    )
+    dataset_num_proc: int | None = field(
+        default=None,
+        metadata={"help": "Number of workers to use for dataset processing."},
+    )
+
+
+def process_example(example):
+    solution = example["final_answer"]
+    if solution not in ["True", "False", "Yes", "No"]:
+        solution = f"${solution}$"
+    prompt = [{"role": "user", "content": example["question"]}]
+    return {"prompt": prompt, "solution": solution}
+
+
+model_card = ModelCard("""
+---
+tags: [trl]
+---
+
+# DeepMath-103K Dataset
+
+## Summary
+
+[DeepMath-103K](https://huggingface.co/datasets/zwhe99/DeepMath-103K) is meticulously curated to push the boundaries of mathematical reasoning in language models.
+
+## Data Structure
+
+- **Format**: [Conversational](https://huggingface.co/docs/trl/main/dataset_formats#conversational)
+- **Type**: [Prompt-only](https://huggingface.co/docs/trl/main/dataset_formats#prompt-only)
+
+Column:
+- `"prompt"`: The input question.
+- `"solution"`: The solution to the math problem.
+
+## Generation script
+
+The script used to generate this dataset can be found [here](https://github.com/huggingface/trl/blob/main/examples/datasets/deepmath_103k.py).
+""")
+
+if __name__ == "__main__":
+    parser = HfArgumentParser(ScriptArguments)
+    script_args = parser.parse_args_into_dataclasses()[0]
+
+    dataset = load_dataset("zwhe99/DeepMath-103K", split="train")
+
+    dataset = dataset.map(
+        process_example,
+        remove_columns=dataset.column_names,
+        num_proc=script_args.dataset_num_proc,
+    )
+    dataset = dataset.train_test_split(test_size=0.05, seed=42)
+
+    if script_args.push_to_hub:
+        dataset.push_to_hub(script_args.repo_id)
+        model_card.push_to_hub(script_args.repo_id, repo_type="dataset")
diff --git a/examples/notebooks/sft_trl_lora_qlora.ipynb b/examples/notebooks/sft_trl_lora_qlora.ipynb
index 21e98f841b6..12552bb9d32 100644
--- a/examples/notebooks/sft_trl_lora_qlora.ipynb
+++ b/examples/notebooks/sft_trl_lora_qlora.ipynb
@@ -305,7 +305,10 @@
         "# model_id, output_dir = \"google/gemma-3-4b-it\", \"gemma-3-4b-it\"                                  # ⚠️ ~6.8 GB VRAM\n",
         "\n",
         "## Granite\n",
-        "#model_id, output_dir = \"ibm-granite/granite-4.0-micro\", \"granite-4.0-micro\"                      # ✅ ~3.3 GB VRAM"
+        "#model_id, output_dir = \"ibm-granite/granite-4.0-micro\", \"granite-4.0-micro\"                      # ✅ ~3.3 GB VRAM\n",
+        "\n",
+        "## LFM2\n",
+        "#model_id, output_dir = \"LiquidAI/LFM2-2.6B\", \"LFM2-2.6B-SFT\"                                     # ✅ ~5.89 GB VRAM"
       ]
     },
     {
diff --git a/examples/scripts/cpo.py b/examples/scripts/cpo.py
index fef9cdf1247..5d621d9eae1 100644
--- a/examples/scripts/cpo.py
+++ b/examples/scripts/cpo.py
@@ -63,7 +63,8 @@
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
 
-from trl import CPOConfig, CPOTrainer, ModelConfig, ScriptArguments, get_peft_config
+from trl import ModelConfig, ScriptArguments, get_peft_config
+from trl.experimental.cpo import CPOConfig, CPOTrainer
 
 
 # Enable logging in a Hugging Face Space
diff --git a/examples/scripts/evals/judge_tldr.py b/examples/scripts/evals/judge_tldr.py
index befb0d3f884..acac8f5436b 100644
--- a/examples/scripts/evals/judge_tldr.py
+++ b/examples/scripts/evals/judge_tldr.py
@@ -24,7 +24,7 @@
 from transformers import HfArgumentParser
 from vllm import LLM, SamplingParams
 
-from trl import HfPairwiseJudge, OpenAIPairwiseJudge
+from trl.experimental.judges import HfPairwiseJudge, OpenAIPairwiseJudge
 
 
 """
diff --git a/examples/scripts/gkd.py b/examples/scripts/gkd.py
index 52f826957f8..f33eb2d834d 100644
--- a/examples/scripts/gkd.py
+++ b/examples/scripts/gkd.py
@@ -58,8 +58,6 @@
 from transformers import AutoTokenizer, GenerationConfig
 
 from trl import (
-    GKDConfig,
-    GKDTrainer,
     LogCompletionsCallback,
     ModelConfig,
     ScriptArguments,
@@ -68,6 +66,7 @@
     get_peft_config,
     get_quantization_config,
 )
+from trl.experimental.gkd import GKDConfig, GKDTrainer
 
 
 # Enable logging in a Hugging Face Space
diff --git a/examples/scripts/nash_md.py b/examples/scripts/nash_md.py
index fdb8ca09a3e..ac461b1802a 100644
--- a/examples/scripts/nash_md.py
+++ b/examples/scripts/nash_md.py
@@ -61,18 +61,15 @@
 from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, GenerationConfig
 
 from trl import (
-    HfPairwiseJudge,
     LogCompletionsCallback,
     ModelConfig,
-    NashMDConfig,
-    NashMDTrainer,
-    OpenAIPairwiseJudge,
-    PairRMJudge,
     ScriptArguments,
     TrlParser,
     get_kbit_device_map,
     get_quantization_config,
 )
+from trl.experimental.judges import HfPairwiseJudge, OpenAIPairwiseJudge, PairRMJudge
+from trl.experimental.nash_md import NashMDConfig, NashMDTrainer
 
 
 # Enable logging in a Hugging Face Space
diff --git a/examples/scripts/online_dpo.py b/examples/scripts/online_dpo.py
index 4ed7afe884d..3a550e98168 100644
--- a/examples/scripts/online_dpo.py
+++ b/examples/scripts/online_dpo.py
@@ -56,19 +56,17 @@
 from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, GenerationConfig
 
 from trl import (
-    HfPairwiseJudge,
     LogCompletionsCallback,
     ModelConfig,
     OnlineDPOConfig,
     OnlineDPOTrainer,
-    OpenAIPairwiseJudge,
-    PairRMJudge,
     ScriptArguments,
     TrlParser,
     get_kbit_device_map,
     get_peft_config,
     get_quantization_config,
 )
+from trl.experimental.judges import HfPairwiseJudge, OpenAIPairwiseJudge, PairRMJudge
 
 
 # Enable logging in a Hugging Face Space
diff --git a/examples/scripts/openenv/browsergym.py b/examples/scripts/openenv/browsergym.py
new file mode 100644
index 00000000000..8962518ec69
--- /dev/null
+++ b/examples/scripts/openenv/browsergym.py
@@ -0,0 +1,572 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Simple script to run GRPO training with OpenEnv's BrowserGym environment and vLLM.
+
+This example automatically detects and uses vision capabilities when VLM models are used.
+Screenshots from BrowserGym are collected and passed to the model during training. The GRPO
+trainer auto-detects multimodal support by checking for images in the rollout data.
+
+Setup:
+
+```sh
+uv pip install git+https://github.com/meta-pytorch/OpenEnv.git
+```
+
+Usage:
+
+# Build and start the environment only if using --env-mode docker-local; In other modes, the env is automatically managed by the script.
+# ```sh
+cd OpenEnv
+docker build -t openenv-base:latest -f src/core/containers/images/Dockerfile .
+docker build -t browsergym-env:latest -f src/envs/browsergym_env/server/Dockerfile .
+docker run -d -p 8000:8000 \
+  -e BROWSERGYM_BENCHMARK="miniwob" \
+  -e BROWSERGYM_TASK_NAME="click-test" \
+  browsergym-env:latest
+```
+
+# Option 1: Colocated vLLM (1 GPU required)
+```sh
+python examples/scripts/openenv/browsergym.py --vllm-mode colocate
+```
+
+# Option 2: Separate vLLM server (2 GPUs required)
+
+# Spin up vLLM server (Terminal 1)
+```sh
+CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen3-VL-2B-Instruct --host 0.0.0.0 --port 8001
+```
+
+# Run training (Terminal 2)
+```sh
+CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/browsergym.py --vllm-mode server --vllm-server-url http://localhost:8001
+```
+"""
+
+from __future__ import annotations
+
+import argparse
+from datetime import datetime
+from pathlib import Path
+
+import numpy as np
+from datasets import Dataset
+from envs.browsergym_env import BrowserGymAction, BrowserGymEnv
+from PIL import Image
+from transformers import AutoTokenizer
+
+from trl import GRPOConfig, GRPOTrainer
+from trl.experimental.openenv import generate_rollout_completions
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Run GRPO training for BrowserGym MiniWoB using OpenEnv environment.")
+    parser.add_argument(
+        "--tokenizer-id",
+        default="Qwen/Qwen3-VL-2B-Instruct",
+        help="Model identifier used to load the tokenizer.",
+    )
+    parser.add_argument(
+        "--model-id",
+        default="Qwen/Qwen3-VL-2B-Instruct",
+        help="Model identifier passed to GRPOTrainer for fine-tuning.",
+    )
+    parser.add_argument("--env-host", type=str, default="0.0.0.0", help="Host for the Echo environment.")
+    parser.add_argument("--env-port", type=int, default=8001, help="Port for the Echo environment.")
+    parser.add_argument(
+        "--env-mode",
+        choices=["docker-local", "docker-image", "docker-hub", "space"],
+        default="docker-image",
+        help="Where to run the environment: 'local' to launch it, 'docker-local' if already running locally, 'docker-image' to run from a Docker image, 'docker-hub' to run from Docker Hub, or 'space' to use a remote Space URL.",
+    )
+    parser.add_argument(
+        "--env-image", type=str, default="openspiel-env:latest", help="Docker image for the OpenSpiel environment."
+    )
+    parser.add_argument(
+        "--benchmark",
+        default="miniwob",
+        help="BrowserGym benchmark to use (miniwob, webarena, etc.).",
+    )
+    parser.add_argument(
+        "--task-name",
+        default="click-test",
+        help="Specific task within the benchmark (e.g., click-test, click-button).",
+    )
+    parser.add_argument(
+        "--dataset-prompt",
+        default="Complete the web task successfully.",
+        help="Prompt text used to seed the training dataset.",
+    )
+    parser.add_argument(
+        "--dataset-size",
+        type=int,
+        default=1000,
+        help="Number of entries to include in the synthetic training dataset.",
+    )
+    parser.add_argument(
+        "--max-steps",
+        type=int,
+        default=10,
+        help="Maximum number of steps per episode.",
+    )
+    parser.add_argument(
+        "--max-new-tokens",
+        type=int,
+        default=32,
+        help="Maximum number of new tokens to request from vLLM for each action.",
+    )
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=0.7,
+        help="Sampling temperature used during rollout generation.",
+    )
+    parser.add_argument(
+        "--top-k",
+        type=int,
+        default=50,
+        help="Top-k sampling parameter forwarded to vLLM.",
+    )
+    parser.add_argument(
+        "--top-p",
+        type=float,
+        default=None,
+        help="Optional top-p sampling parameter forwarded to vLLM.",
+    )
+    parser.add_argument(
+        "--image-size",
+        type=int,
+        default=512,
+        help="Resize screenshots to this size (preserving aspect ratio) to reduce memory usage. Set to 0 to disable resizing.",
+    )
+    parser.add_argument(
+        "--learning-rate",
+        type=float,
+        default=5e-6,
+        help="Learning rate for GRPO training.",
+    )
+    parser.add_argument(
+        "--weight-decay",
+        type=float,
+        default=0.0,
+        help="Weight decay applied during optimization.",
+    )
+    parser.add_argument(
+        "--gradient-accumulation-steps",
+        type=int,
+        default=32,
+        help="Gradient accumulation steps for GRPO training.",
+    )
+    parser.add_argument(
+        "--warmup-steps",
+        type=int,
+        default=10,
+        help="Warmup steps for the scheduler.",
+    )
+    parser.add_argument(
+        "--per-device-batch-size",
+        type=int,
+        default=1,
+        help="Per-device train batch size.",
+    )
+    parser.add_argument(
+        "--num-generations",
+        type=int,
+        default=4,
+        help="Number of rollout generations per dataset prompt.",
+    )
+    parser.add_argument(
+        "--num-epochs",
+        type=int,
+        default=1,
+        help="Number of training epochs.",
+    )
+    parser.add_argument(
+        "--save-interval",
+        type=int,
+        default=50,
+        help="Interval (in steps) between checkpoint saves.",
+    )
+    parser.add_argument(
+        "--save-total-limit",
+        type=int,
+        default=None,
+        help="Maximum number of checkpoints to keep.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        default=None,
+        help="Directory where training outputs and checkpoints are stored.",
+    )
+    parser.add_argument(
+        "--run-name",
+        default=None,
+        help="Optional run name for logging systems.",
+    )
+    parser.add_argument(
+        "--project",
+        default=None,
+        help="Optional project identifier for logging systems.",
+    )
+    parser.add_argument(
+        "--vllm-mode",
+        choices=("colocate", "server"),
+        default="colocate",
+        help="vLLM execution mode: 'colocate' or 'server'.",
+    )
+    parser.add_argument(
+        "--vllm-server-url",
+        type=str,
+        default="http://localhost:8001",
+        help="URL for the vLLM server (only used when --vllm-mode=server).",
+    )
+    parser.add_argument(
+        "--logging-steps",
+        type=int,
+        default=1,
+        help="Frequency of logging steps for GRPO training.",
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        default=False,
+        help="Enable verbose debugging output during rollouts.",
+    )
+    return parser.parse_args()
+
+
+def sanitize_name(name: str) -> str:
+    return name.replace("/", "-")
+
+
+# ---------------------------------------------------------------------------
+# System Prompt
+# ---------------------------------------------------------------------------
+
+SYSTEM_PROMPT = """You control a web browser through BrowserGym actions.
+You must complete the given web task by interacting with the page.
+
+Available actions:
+- noop() - Do nothing
+- click(bid) - Click element with BrowserGym ID
+- fill(bid, text) - Fill input field
+- send_keys(text) - Send keyboard input
+- scroll(direction) - Scroll up/down
+
+Reply with exactly ONE action on a single line, e.g.:
+click('123')
+fill('456', 'text')
+noop()
+
+Do not include explanations or multiple actions."""
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def make_user_prompt(goal: str, step_num: int, axtree: str, error: str = "") -> str:
+    """Create user prompt from observation."""
+    prompt_parts = [f"Step {step_num + 1}"]
+
+    if goal:
+        prompt_parts.append(f"Goal: {goal}")
+
+    if error:
+        prompt_parts.append(f"Previous action error: {error}")
+
+    # Include accessibility tree (truncated for context)
+    if axtree:
+        max_len = 2000
+        axtree_truncated = axtree[:max_len] + "..." if len(axtree) > max_len else axtree
+        prompt_parts.append(f"Page structure:\n{axtree_truncated}")
+
+    prompt_parts.append("What action do you take?")
+
+    return "\n\n".join(prompt_parts)
+
+
+def parse_action(response_text: str) -> str:
+    """Parse BrowserGym action from model response."""
+    # Extract first line that looks like an action
+    for line in response_text.strip().split("\n"):
+        line = line.strip()
+        if "(" in line and ")" in line:
+            return line
+
+    # Fallback to noop if no valid action found
+    return "noop()"
+
+
+def rollout_once(
+    trainer: GRPOTrainer,
+    env: BrowserGymEnv,
+    tokenizer: AutoTokenizer,
+    dataset_prompt: str,
+    max_steps: int,
+    image_size: int = 0,
+    debug: bool = False,
+) -> dict[str, list]:
+    """Run one episode and collect training data."""
+    result = env.reset()
+    observation = result.observation
+
+    prompt_ids: list[int] = []
+    completion_ids: list[int] = []
+    logprobs: list[float] = []
+    step_rewards: list[float] = []
+    completion_rewards: list[float] = []
+    images: list[Image.Image] = []  # Collect screenshots for VLM
+
+    for step_num in range(max_steps):
+        if result.done:
+            break
+
+        # Create prompt from observation
+        goal = observation.goal or dataset_prompt
+        axtree = observation.axtree_txt or ""
+        error = observation.error if observation.last_action_error else ""
+
+        # Collect screenshot if available (for VLM support)
+        if observation.screenshot is not None:
+            screenshot_array = np.array(observation.screenshot, dtype=np.uint8)
+            screenshot_image = Image.fromarray(screenshot_array)
+
+            # Resize to reduce memory if image_size > 0
+            if image_size > 0:
+                # Preserve aspect ratio while resizing
+                screenshot_image.thumbnail((image_size, image_size), Image.LANCZOS)
+                print(
+                    f"[DEBUG] Step {step_num + 1}: Collected and resized screenshot from {screenshot_array.shape} to {screenshot_image.size}"
+                )
+            else:
+                print(f"[DEBUG] Step {step_num + 1}: Collected screenshot, shape={screenshot_array.shape}")
+
+            images.append(screenshot_image)
+        else:
+            print(f"[DEBUG] Step {step_num + 1}: No screenshot available")
+
+        user_prompt = make_user_prompt(goal, step_num, axtree, error)
+        messages = [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": user_prompt},
+        ]
+        prompt_text = tokenizer.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+
+        # Generate action with vLLM
+        rollout_outputs = generate_rollout_completions(trainer, [prompt_text])[0]
+        prompt_ids.extend(rollout_outputs["prompt_ids"])
+        completion_ids.extend(rollout_outputs["completion_ids"])
+        logprobs.extend(rollout_outputs["logprobs"])
+
+        completion_text = rollout_outputs.get("text") or tokenizer.decode(
+            rollout_outputs["completion_ids"], skip_special_tokens=True
+        )
+
+        # Parse and execute action
+        action_str = parse_action(completion_text)
+
+        if debug:
+            print(f"Step {step_num + 1}: {action_str}")
+
+        # Take action in environment
+        result = env.step(BrowserGymAction(action_str=action_str))
+        observation = result.observation
+
+        # Track rewards
+        step_reward = float(result.reward or 0.0)
+        step_rewards.append(step_reward)
+
+        # Reward shaping: success is most important
+        if result.done and step_reward > 0:
+            completion_rewards.append(1.0)  # Task completed successfully
+        elif result.done and step_reward == 0:
+            completion_rewards.append(0.0)  # Task failed
+        else:
+            completion_rewards.append(step_reward)  # Intermediate reward
+
+    # Final reward is based on task completion
+    final_reward = completion_rewards[-1] if completion_rewards else 0.0
+
+    result_dict = {
+        "prompt_ids": prompt_ids,
+        "completion_ids": completion_ids,
+        "logprobs": logprobs,
+        "step_rewards": step_rewards,
+        "completion_reward": final_reward,
+    }
+
+    # Include images if available (GRPO trainer will auto-detect VLM support)
+    if images:
+        result_dict["images"] = images
+
+    return result_dict
+
+
+# ---------------------------------------------------------------------------
+# Rewards
+# ---------------------------------------------------------------------------
+
+
+def reward_completion(completions: list[str], **kwargs) -> list[float]:
+    """Reward for task completion."""
+    rewards = kwargs.get("completion_reward") if kwargs else None
+    if rewards is None:
+        return [0.0 for _ in completions]
+    return [float(r) for r in rewards]
+
+
+# ---------------------------------------------------------------------------
+# Main entrypoint
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+    args = parse_args()
+
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_id)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # Select environment mode
+    if args.env_mode == "docker-local":
+        env_url = f"http://{args.env_host}:{args.env_port}"
+        client = BrowserGymEnv(base_url=env_url)
+        print(f"🌍 Using existing BrowserGym Environment (Docker) at: {env_url}")
+    elif args.env_mode == "docker-image":
+        client = BrowserGymEnv.from_docker_image(args.env_image)
+        print("🌍 Using BrowserGym Environment (Docker) from local Image")
+    elif args.env_mode == "docker-hub":
+        client = BrowserGymEnv.from_hub(args.env_image)
+        print("🌍 Using existing BrowserGym Environment (Docker) from Hub Image")
+    elif args.env_mode == "space":
+        env_url = args.env_host
+        print(f"🌍 Using Hugging Face Space environment at: {env_url}")
+    else:
+        raise ValueError(f"Unknown environment mode: {args.env_mode}")
+
+    dataset = Dataset.from_dict({"prompt": [args.dataset_prompt] * args.dataset_size})
+
+    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    default_output_dir = Path("outputs") / f"browsergym-grpo-{sanitize_name(args.model_id)}-{timestamp}"
+    output_dir = Path(args.output_dir or default_output_dir)
+
+    grpo_config = GRPOConfig(
+        use_vllm=True,
+        vllm_mode=args.vllm_mode,
+        vllm_server_base_url=args.vllm_server_url if args.vllm_mode == "server" else None,
+        vllm_gpu_memory_utilization=0.4,
+        output_dir=str(output_dir),
+        num_train_epochs=args.num_epochs,
+        learning_rate=args.learning_rate,
+        weight_decay=args.weight_decay,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        per_device_train_batch_size=args.per_device_batch_size,
+        warmup_steps=args.warmup_steps,
+        num_generations=args.num_generations,
+        generation_batch_size=args.num_generations,  # Must be divisible by num_generations
+        max_completion_length=args.max_new_tokens,
+        logging_steps=args.logging_steps,
+        save_strategy="steps",
+        save_steps=args.save_interval,
+        save_total_limit=args.save_total_limit,
+        temperature=args.temperature,
+        top_k=args.top_k,
+        top_p=args.top_p,
+    )
+
+    grpo_config.run_name = args.run_name or f"run-{timestamp}"
+    grpo_config.project = args.project or f"group-{sanitize_name(args.model_id)}"
+
+    def rollout_func(prompts: list[str], trainer: GRPOTrainer) -> dict[str, list]:
+        episode_prompt_ids: list[list[int]] = []
+        episode_completion_ids: list[list[int]] = []
+        episode_logprobs: list[list[float]] = []
+        completion_rewards: list[float] = []
+        episode_images: list[list[Image.Image]] = []
+
+        print(f"\n[DEBUG] rollout_func called with {len(prompts)} prompts")
+
+        for i, prompt_text in enumerate(prompts):
+            print(f"[DEBUG] Processing prompt {i + 1}/{len(prompts)}")
+            episode = rollout_once(
+                trainer=trainer,
+                env=client,
+                tokenizer=tokenizer,
+                dataset_prompt=prompt_text,
+                max_steps=args.max_steps,
+                image_size=args.image_size,
+                debug=args.debug,
+            )
+            episode_prompt_ids.append(episode["prompt_ids"])
+            episode_completion_ids.append(episode["completion_ids"])
+            episode_logprobs.append(episode["logprobs"])
+            completion_rewards.append(episode["completion_reward"])
+
+            # Collect images if available (for VLM support)
+            if "images" in episode:
+                print(f"[DEBUG] Episode {i + 1} has {len(episode['images'])} images")
+                episode_images.append(episode["images"])
+            else:
+                print(f"[DEBUG] Episode {i + 1} has NO images")
+
+        result = {
+            "prompt_ids": episode_prompt_ids,
+            "completion_ids": episode_completion_ids,
+            "logprobs": episode_logprobs,
+            "completion_reward": completion_rewards,
+        }
+
+        # Include images if any episode had screenshots (GRPO trainer auto-detects VLM)
+        if episode_images:
+            result["images"] = episode_images
+            print(f"[DEBUG] rollout_func returning with images: {len(episode_images)} episodes")
+        else:
+            print("[DEBUG] rollout_func returning WITHOUT images")
+
+        return result
+
+    trainer = GRPOTrainer(
+        model=args.model_id,
+        processing_class=tokenizer,
+        reward_funcs=[reward_completion],
+        train_dataset=dataset,
+        args=grpo_config,
+        rollout_func=rollout_func,
+    )
+
+    print("=" * 80)
+    print("Starting GRPO training with BrowserGym environment")
+    print(f"Benchmark: {args.benchmark}")
+    print(f"Task: {args.task_name}")
+    print(f"Model: {args.model_id}")
+    print(f"Using {args.num_generations} rollouts per dataset prompt")
+    print(f"Output directory: {output_dir}")
+    print("=" * 80)
+
+    try:
+        trainer.train()
+        print("\nTraining completed successfully!")
+    finally:
+        client.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/scripts/openenv/catch.py b/examples/scripts/openenv/catch.py
index f7263d9d3d1..e1d9e730162 100644
--- a/examples/scripts/openenv/catch.py
+++ b/examples/scripts/openenv/catch.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 """
-Simple script to run GRPO training with OpenEnv's Catch environment (OpenSpiel) and a vLLM server. The reward function
+Simple script to run GRPO training with OpenEnv's Catch environment (OpenSpiel) and vLLM. The reward function
 is based on the catch game where the agent tries to catch falling balls.
 
 Setup:
@@ -22,23 +22,28 @@
 uv pip install git+https://github.com/meta-pytorch/OpenEnv.git
 ```
 
-Usage (2 GPUs required):
+Usage:
 
-# Start the docker container for the Catch environment (recommended). Alternatively, you can run it locally or directly from a HF Space.
+# Start the environment only if using --env-mode docker-local; In other modes, the env is automatically managed by the script.
 ```sh
 docker run -d -p 8001:8001 registry.hf.space/openenv-openspiel-env:latest
 ```
 
-# Spin up vLLM server
+# Option 1: Colocated vLLM (1 GPU required)
+```sh
+python examples/scripts/openenv/catch.py --vllm-mode colocate
+```
+
+# Option 2: Separate vLLM server (2 GPUs required)
 
+# Spin up vLLM server (Terminal 1)
 ```sh
 CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-0.5B-Instruct --host 0.0.0.0 --port 8000
 ```
 
-# Run training
-
+# Run training (Terminal 2)
 ```sh
-CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/catch.py
+CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/catch.py --vllm-mode server --vllm-server-url http://localhost:8000
 ```
 """
 
@@ -57,6 +62,7 @@
 from envs.openspiel_env.models import OpenSpielAction
 
 from trl import GRPOConfig, GRPOTrainer, RichProgressCallback, apply_chat_template
+from trl.experimental.openenv import generate_rollout_completions
 
 
 def parse_args():
@@ -67,17 +73,11 @@ def parse_args():
     parser.add_argument("--env-port", type=int, default=8001, help="Port for the environment server.")
     parser.add_argument(
         "--env-mode",
-        choices=["local", "docker", "space"],
-        default="docker",
-        help="Where to run the environment: 'local', 'docker', or 'space'.",
+        choices=["local", "docker-local", "docker-image", "docker-hub", "space"],
+        default="docker-image",
+        help="Where to run the environment: 'local' to launch it, 'docker-local' if already running locally, 'docker-image' to run from a Docker image, 'docker-hub' to run from Docker Hub, or 'space' to use a remote Space URL.",
     )
     # --- Generation and model config ---
-    parser.add_argument(
-        "--gen-url",
-        type=str,
-        default="http://0.0.0.0:8000/generate/",
-        help="vLLM generation endpoint URL.",
-    )
     parser.add_argument(
         "--model",
         type=str,
@@ -90,6 +90,21 @@ def parse_args():
         default=1000,
         help="Number of prompts to use for training dataset.",
     )
+    parser.add_argument(
+        "--env-image", type=str, default="openspiel-env:latest", help="Docker image for the OpenSpiel environment."
+    )
+    parser.add_argument(
+        "--vllm-mode",
+        choices=["colocate", "server"],
+        default="colocate",
+        help="vLLM execution mode: 'colocate' or 'server'.",
+    )
+    parser.add_argument(
+        "--vllm-server-url",
+        type=str,
+        default="http://localhost:8000",
+        help="URL for the vLLM server (only used when --vllm-mode=server).",
+    )
 
     return parser.parse_args()
 
@@ -159,62 +174,6 @@ def start_env_server(env_host: str, env_port: int):
 """
 
 
-def rollout_func(
-    prompts: list[str], args: GRPOConfig, processing_class, client: OpenSpielEnv, gen_url: str
-) -> dict[str, list]:
-    """Generate completions via vLLM and compute environment rewards."""
-    env_rewards = []
-    all_prompt_ids, all_completion_ids, all_logprobs = [], [], []
-
-    for base_prompt in prompts:
-        for _ in range(args.num_generations):
-            env_result = client.reset()
-            obs = env_result.observation
-            total_reward = 0.0
-
-            episode_prompt_ids, episode_completion_ids, episode_logprobs = [], [], []
-
-            while not obs.done:
-                episode_msg = {"prompt": [{"role": "user", "content": f"{base_prompt}\n\n{obs.info_state}\n"}]}
-                episode_prompt = apply_chat_template(episode_msg, processing_class)
-
-                payload = {
-                    "prompts": [episode_prompt["prompt"]],
-                    "n": 1,
-                    "temperature": args.temperature,
-                    "top_p": args.top_p,
-                    "max_tokens": args.max_completion_length,
-                }
-                response = requests.post(gen_url, json=payload)
-                response.raise_for_status()
-                result = response.json()
-
-                episode_prompt_ids.extend(result["prompt_ids"][0])
-                episode_completion_ids.extend(result["completion_ids"][0])
-                episode_logprobs.extend(result["logprobs"][0])
-
-                completion_text = processing_class.batch_decode(result["completion_ids"], skip_special_tokens=True)[0]
-
-                numbers = re.findall(r"\b([0-2])\b", completion_text)
-                action_id = int(numbers[0]) if numbers else obs.legal_actions[0]
-
-                env_result = client.step(OpenSpielAction(action_id=action_id, game_name="catch"))
-                total_reward += env_result.reward or 0.0
-                obs = env_result.observation
-
-            env_rewards.append(total_reward)
-            all_prompt_ids.append(episode_prompt_ids)
-            all_completion_ids.append(episode_completion_ids)
-            all_logprobs.append(episode_logprobs)
-
-    return {
-        "prompt_ids": all_prompt_ids,
-        "completion_ids": all_completion_ids,
-        "logprobs": all_logprobs,
-        "env_reward": env_rewards,
-    }
-
-
 def reward_from_env(completions, **kwargs):
     rewards = kwargs.get("env_reward", [])
     return [float(r) for r in rewards] if rewards else [0.0] * len(completions)
@@ -227,25 +186,34 @@ def main():
     if args.env_mode == "local":
         env_url = f"http://{args.env_host}:{args.env_port}"
         server_process = start_env_server(args.env_host, args.env_port)
-    elif args.env_mode == "docker":
+    elif args.env_mode == "docker-local":
         env_url = f"http://{args.env_host}:{args.env_port}"
         server_process = None
-        print(f"🌍 Using existing Docker environment at {env_url}")
+        print(f"🌍 Using existing OpenSpiel Environment (Docker) at: {env_url}")
+    elif args.env_mode == "docker-image":
+        client = OpenSpielEnv.from_docker_image(args.env_image)
+        server_process = None
+        print("🌍 Using OpenSpiel Environment (Docker) from local Image")
+    elif args.env_mode == "docker-hub":
+        client = OpenSpielEnv.from_hub(args.env_image)
+        server_process = None
+        print("🌍 Using existing OpenSpiel Environment (Docker) from Hub Image")
     elif args.env_mode == "space":
         env_url = args.env_host
         server_process = None
-        print(f"🚀 Using Hugging Face Space environment at {env_url}")
+        print(f"🌍 Using Hugging Face Space environment at: {env_url}")
     else:
-        raise ValueError(f"Unknown env mode: {args.env_mode}")
+        raise ValueError(f"Unknown environment mode: {args.env_mode}")
 
-    gen_url = args.gen_url
-    client = OpenSpielEnv(base_url=env_url)
+    if args.env_mode != "docker-hub" and args.env_mode != "docker-image":
+        client = OpenSpielEnv(base_url=env_url)
     dataset = Dataset.from_dict({"prompt": [BASE_PROMPT] * args.dataset_size})
 
     training_args = GRPOConfig(
         output_dir=f"{args.model.split('/')[-1]}-GRPO-Catch",
-        vllm_mode="server",
         use_vllm=True,
+        vllm_mode=args.vllm_mode,
+        vllm_server_base_url=args.vllm_server_url if args.vllm_mode == "server" else None,
         logging_steps=1,
         report_to="trackio",
         num_train_epochs=1,
@@ -253,12 +221,60 @@ def main():
         gradient_accumulation_steps=4,
     )
 
+    def rollout_func(prompts: list[str], trainer: GRPOTrainer) -> dict[str, list]:
+        """Generate completions via vLLM (colocated or server) and compute environment rewards."""
+        env_rewards: list[float] = []
+        all_prompt_ids: list[list[int]] = []
+        all_completion_ids: list[list[int]] = []
+        all_logprobs: list[list[float]] = []
+        tokenizer = trainer.processing_class
+
+        for base_prompt in prompts:
+            env_result = client.reset()
+            obs = env_result.observation
+            total_reward = 0.0
+
+            episode_prompt_ids: list[int] = []
+            episode_completion_ids: list[int] = []
+            episode_logprobs: list[float] = []
+
+            while not obs.done:
+                episode_msg = {"prompt": [{"role": "user", "content": f"{base_prompt}\n\n{obs.info_state}\n"}]}
+                episode_prompt = apply_chat_template(episode_msg, tokenizer)
+                rollout_output = generate_rollout_completions(trainer, [episode_prompt["prompt"]])[0]
+
+                episode_prompt_ids.extend(rollout_output["prompt_ids"])
+                episode_completion_ids.extend(rollout_output["completion_ids"])
+                episode_logprobs.extend(rollout_output["logprobs"])
+
+                completion_text = tokenizer.batch_decode([rollout_output["completion_ids"]], skip_special_tokens=True)[
+                    0
+                ]
+                numbers = re.findall(r"\b([0-2])\b", completion_text)
+                action_id = int(numbers[0]) if numbers else obs.legal_actions[0]
+
+                env_result = client.step(OpenSpielAction(action_id=action_id, game_name="catch"))
+                total_reward += env_result.reward or 0.0
+                obs = env_result.observation
+
+            env_rewards.append(total_reward)
+            all_prompt_ids.append(episode_prompt_ids)
+            all_completion_ids.append(episode_completion_ids)
+            all_logprobs.append(episode_logprobs)
+
+        return {
+            "prompt_ids": all_prompt_ids,
+            "completion_ids": all_completion_ids,
+            "logprobs": all_logprobs,
+            "env_reward": env_rewards,
+        }
+
     trainer = GRPOTrainer(
         model=args.model,
         reward_funcs=reward_from_env,
         args=training_args,
         train_dataset=dataset,
-        rollout_func=lambda p, a, pc: rollout_func(p, a, pc, client, gen_url),
+        rollout_func=rollout_func,
         callbacks=[RichProgressCallback()],
     )
 
diff --git a/examples/scripts/openenv/echo.py b/examples/scripts/openenv/echo.py
index b5cdb724bd7..ba77c447a11 100644
--- a/examples/scripts/openenv/echo.py
+++ b/examples/scripts/openenv/echo.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 """
-Simple script to run GRPO training with OpenEnv's Echo environment and a vLLM server. The reward function encourages
+Simple script to run GRPO training with OpenEnv's Echo environment and vLLM. The reward function encourages
 longer completions.
 
 Setup:
@@ -22,23 +22,28 @@
 uv pip install git+https://github.com/meta-pytorch/OpenEnv.git
 ```
 
-Usage (2 GPUs required):
+Usage:
 
-# Start the docker container for the Echo environment (recommended). Alternatively, you can run it locally or directly from a HF Space.
+# Start the environment only if using --env-mode docker-local; In other modes, the env is automatically managed by the script.
 ```sh
 docker run -d -p 8001:8001 registry.hf.space/openenv-echo-env:latest
 ```
 
-# Spin up server
+# Option 1: Colocated vLLM (1 GPU required)
+```sh
+python examples/scripts/openenv/echo.py --vllm-mode colocate
+```
 
+# Option 2: Separate vLLM server (2 GPUs required)
+
+# Spin up vLLM server (Terminal 1)
 ```sh
 CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-0.5B-Instruct --host 0.0.0.0 --port 8000
 ```
 
-# Run training
-
+# Run training (Terminal 2)
 ```sh
-CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/echo.py
+CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/echo.py --vllm-mode server --vllm-server-url http://localhost:8000
 ```
 """
 
@@ -56,6 +61,7 @@
 from envs.echo_env.models import EchoAction
 
 from trl import GRPOConfig, GRPOTrainer, RichProgressCallback
+from trl.experimental.openenv import generate_rollout_completions
 
 
 def parse_args():
@@ -65,15 +71,9 @@ def parse_args():
     parser.add_argument("--env-port", type=int, default=8001, help="Port for the Echo environment.")
     parser.add_argument(
         "--env-mode",
-        choices=["local", "docker", "space"],
-        default="docker",
-        help="Where to run the Echo environment: 'local' to launch it, 'docker' if already running, or 'space' to use a remote Space URL.",
-    )
-    parser.add_argument(
-        "--gen-url",
-        type=str,
-        default="http://0.0.0.0:8000/generate/",
-        help="Base URL for the vLLM generation endpoint.",
+        choices=["local", "docker-local", "docker-image", "docker-hub", "space"],
+        default="docker-image",
+        help="Where to run the Echo environment: 'local' to launch it, 'docker-local' if already running locally, 'docker-image' to run from a Docker image, 'docker-hub' to run from Docker Hub, or 'space' to use a remote Space URL.",
     )
     parser.add_argument(
         "--model",
@@ -87,6 +87,21 @@ def parse_args():
         default="trl-lib/ultrafeedback-prompt",
         help="Dataset to use for training.",
     )
+    parser.add_argument(
+        "--env-image", type=str, default="echo-env:latest", help="Docker image for the Echo environment."
+    )
+    parser.add_argument(
+        "--vllm-mode",
+        choices=["colocate", "server"],
+        default="colocate",
+        help="vLLM execution mode: 'colocate' or 'server'.",
+    )
+    parser.add_argument(
+        "--vllm-server-url",
+        type=str,
+        default="http://localhost:8000",
+        help="URL for the vLLM server (only used when --vllm-mode=server).",
+    )
 
     return parser.parse_args()
 
@@ -121,39 +136,6 @@ def start_env_server(env_host: str, env_port: int):
     return process
 
 
-def rollout_func(
-    prompts: list[str], args: GRPOConfig, processing_class, client: EchoEnv, gen_url: str
-) -> dict[str, list]:
-    """Generate completions via vLLM and compute environment rewards."""
-    payload = {
-        "prompts": prompts,
-        "n": args.num_generations,
-        "temperature": args.temperature,
-        "top_p": args.top_p,
-        "top_k": -1 if args.top_k is None else args.top_k,
-        "min_p": 0.0 if args.min_p is None else args.min_p,
-        "max_tokens": args.max_completion_length,
-        "repetition_penalty": args.repetition_penalty,
-    }
-
-    response = requests.post(gen_url, json=payload)
-    if response.status_code != 200:
-        print(f"Error response: {response.text}")
-    response.raise_for_status()
-
-    result = response.json()
-    completions_text = processing_class.batch_decode(result["completion_ids"], skip_special_tokens=True)
-
-    env_result = client.reset()
-    env_rewards = []
-    for msg in completions_text:
-        env_result = client.step(EchoAction(message=msg))
-        env_rewards.append(env_result.reward)
-
-    result["env_reward"] = env_rewards
-    return result
-
-
 def reward_from_env(completions, **kwargs):
     """Extract environment rewards for training."""
     env_rewards = kwargs.get("env_reward", [])
@@ -167,25 +149,34 @@ def main():
     if args.env_mode == "local":
         env_url = f"http://{args.env_host}:{args.env_port}"
         server_process = start_env_server(args.env_host, args.env_port)
-    elif args.env_mode == "docker":
+    elif args.env_mode == "docker-local":
         env_url = f"http://{args.env_host}:{args.env_port}"
         server_process = None
         print(f"🌍 Using existing Echo Environment (Docker) at: {env_url}")
+    elif args.env_mode == "docker-image":
+        client = EchoEnv.from_docker_image(args.env_image)
+        server_process = None
+        print("🌍 Using Echo Environment (Docker) from local Image")
+    elif args.env_mode == "docker-hub":
+        client = EchoEnv.from_hub(args.env_image)
+        server_process = None
+        print("🌍 Using existing Echo Environment (Docker) from Hub Image")
     elif args.env_mode == "space":
         env_url = args.env_host
         server_process = None
-        print(f"🚀 Using Hugging Face Space environment at: {env_url}")
+        print(f"🌍 Using Hugging Face Space environment at: {env_url}")
     else:
         raise ValueError(f"Unknown environment mode: {args.env_mode}")
 
-    gen_url = args.gen_url
-    client = EchoEnv(base_url=env_url)
+    if args.env_mode != "docker-hub" and args.env_mode != "docker-image":
+        client = EchoEnv(base_url=env_url)
     dataset = load_dataset(args.dataset, split="train[:1000]")
 
     training_args = GRPOConfig(
         output_dir=f"{args.model.split('/')[-1]}-GRPO-Rollout",
-        vllm_mode="server",
         use_vllm=True,
+        vllm_mode=args.vllm_mode,
+        vllm_server_base_url=args.vllm_server_url if args.vllm_mode == "server" else None,
         logging_steps=1,
         report_to="trackio",
         num_train_epochs=1,
@@ -193,12 +184,31 @@ def main():
         gradient_accumulation_steps=4,
     )
 
+    def rollout_func(prompts: list[str], trainer: GRPOTrainer) -> dict[str, list]:
+        outputs = generate_rollout_completions(trainer, prompts)
+        tokenizer = trainer.processing_class
+
+        completions_text = [tokenizer.decode(output["completion_ids"], skip_special_tokens=True) for output in outputs]
+
+        env_result = client.reset()
+        env_rewards: list[float] = []
+        for message in completions_text:
+            env_result = client.step(EchoAction(message=message))
+            env_rewards.append(env_result.reward)
+
+        return {
+            "prompt_ids": [output["prompt_ids"] for output in outputs],
+            "completion_ids": [output["completion_ids"] for output in outputs],
+            "logprobs": [output["logprobs"] for output in outputs],
+            "env_reward": env_rewards,
+        }
+
     trainer = GRPOTrainer(
         model=args.model,
         reward_funcs=reward_from_env,
         args=training_args,
         train_dataset=dataset,
-        rollout_func=lambda p, a, pc: rollout_func(p, a, pc, client, gen_url),
+        rollout_func=rollout_func,
         callbacks=[RichProgressCallback()],
     )
 
diff --git a/examples/scripts/openenv/wordle.py b/examples/scripts/openenv/wordle.py
index 2683bcdacfa..7cad1b6571a 100644
--- a/examples/scripts/openenv/wordle.py
+++ b/examples/scripts/openenv/wordle.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 """
-Simple script to run GRPO training with OpenEnv's Wordle environment and a vLLM server.
+Simple script to run GRPO training with OpenEnv's Wordle environment and vLLM.
 
 Setup:
 
@@ -21,41 +21,46 @@
 uv pip install git+https://github.com/meta-pytorch/OpenEnv.git
 ```
 
-Usage (2 GPUs required):
+Usage:
 
-# Start the docker container for the Wordle environment (recommended). Alternatively, you can run it locally or directly from a HF Space.
+# Start the environment only if using --env-mode docker-local; In other modes, the env is automatically managed by the script.
 ```sh
 docker run -d -p 8001:8001 registry.hf.space/burtenshaw-textarena:latest
 # or TEXTARENA_ENV_ID=Wordle-v0 TEXTARENA_NUM_PLAYERS=1 python -m src.envs.textarena_env.server.app
 ```
 
-# Spin up vLLM server
+# Option 1: Colocated vLLM (1 GPU required)
+```sh
+python examples/scripts/openenv/wordle.py --vllm-mode colocate
+```
+
+# Option 2: Separate vLLM server (2 GPUs required)
 
+# Spin up vLLM server (Terminal 1)
 ```sh
 CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen3-1.7B --host 0.0.0.0 --port 8000
 ```
 
-# Run training
-
+# Run training (Terminal 2)
 ```sh
-CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/wordle.py
+CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/wordle.py --vllm-mode server --vllm-server-url http://localhost:8000
 ```
 """
 
 from __future__ import annotations
 
 import argparse
-import os
 import sys
+from collections import defaultdict
 from collections.abc import Iterable
 from datetime import datetime
 from pathlib import Path
 
-import requests
 from datasets import Dataset
 from transformers import AutoTokenizer
 
 from trl import GRPOConfig, GRPOTrainer
+from trl.experimental.openenv import generate_rollout_completions
 
 
 # Ensure src/ is on the path
@@ -63,11 +68,7 @@
 
 from envs.textarena_env import TextArenaAction, TextArenaEnv
 from envs.textarena_env.models import TextArenaMessage
-from envs.textarena_env.rewards import (
-    extract_feedback_counts,
-    extract_guess,
-    extract_wordle_feedback,
-)
+from envs.textarena_env.rewards import extract_feedback_counts, extract_guess, extract_wordle_feedback
 
 
 def parse_args() -> argparse.Namespace:
@@ -84,10 +85,16 @@ def parse_args() -> argparse.Namespace:
         default="Qwen/Qwen3-1.7B",
         help="Model identifier passed to GRPOTrainer for fine-tuning.",
     )
+    parser.add_argument("--env-host", type=str, default="0.0.0.0", help="Host for the environment server.")
+    parser.add_argument("--env-port", type=int, default=8001, help="Port for the environment server.")
+    parser.add_argument(
+        "--env-mode",
+        choices=["docker-local", "docker-image", "docker-hub", "space"],
+        default="docker-image",
+        help="Where to run the environment: 'docker-local' if already running locally, 'docker-image' to run from a Docker image, 'docker-hub' to run from Docker Hub, or 'space' to use a remote Space URL.",
+    )
     parser.add_argument(
-        "--env-url",
-        default="https://0.0.0.0:8001",  # default="https://burtenshaw-textarena.hf.space"
-        help="Base URL for the TextArena Wordle environment.",
+        "--env-image", type=str, default="textarena-env:latest", help="Docker image for the TextArena environment."
     )
     parser.add_argument(
         "--system-prompt-path",
@@ -108,7 +115,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--max-turns",
         type=int,
-        default=5,
+        default=6,
         help="Maximum number of turns to play in the Wordle environment per episode.",
     )
     parser.add_argument(
@@ -210,15 +217,16 @@ def parse_args() -> argparse.Namespace:
         help="TrackIO space identifier.",
     )
     parser.add_argument(
-        "--vllm-endpoint",
-        default=os.getenv("VLLM_ENDPOINT", "http://localhost:8000/generate/"),
-        help="Endpoint for the vLLM server.",
+        "--vllm-mode",
+        choices=("colocate", "server"),
+        default="colocate",
+        help="vLLM execution mode: 'colocate' or 'server'.",
     )
     parser.add_argument(
-        "--request-timeout",
-        type=int,
-        default=60,
-        help="Timeout (in seconds) for vLLM HTTP requests.",
+        "--vllm-server-url",
+        type=str,
+        default="http://localhost:8000",
+        help="URL for the vLLM server (only used when --vllm-mode=server).",
     )
     parser.add_argument(
         "--logging-steps",
@@ -273,54 +281,6 @@ def make_user_prompt(prompt_text: str, messages: Iterable[TextArenaMessage]) ->
     )
 
 
-def request_vllm_completion(
-    prompt: str,
-    trainer_args: GRPOConfig,
-    endpoint: str,
-    timeout: int,
-    fallback: argparse.Namespace,
-) -> dict[str, list]:
-    payload: dict[str, object] = {
-        "prompts": [prompt],
-        "n": 1,
-        "temperature": getattr(trainer_args, "temperature", fallback.temperature),
-        "max_tokens": getattr(trainer_args, "max_completion_length", fallback.max_new_tokens),
-        "logprobs": True,
-    }
-
-    top_k = getattr(trainer_args, "top_k", fallback.top_k)
-    if top_k is not None:
-        payload["top_k"] = top_k
-
-    top_p = getattr(trainer_args, "top_p", fallback.top_p)
-    if top_p is not None:
-        payload["top_p"] = top_p
-
-    min_p = getattr(trainer_args, "min_p", None)
-    if min_p is not None:
-        payload["min_p"] = min_p
-
-    repetition_penalty = getattr(trainer_args, "repetition_penalty", None)
-    if repetition_penalty is not None:
-        payload["repetition_penalty"] = repetition_penalty
-
-    response = requests.post(endpoint, json=payload, timeout=timeout)
-    response.raise_for_status()
-    data = response.json()
-
-    prompt_ids = data.get("prompt_ids") or data.get("prompt_token_ids") or [[]]
-    completion_ids = data.get("completion_ids") or data.get("completion_token_ids") or [[]]
-    logprobs = data.get("logprobs") or data.get("completion_logprobs") or [[]]
-    texts = data.get("completions") or data.get("completion_texts") or data.get("texts")
-
-    return {
-        "prompt_ids": prompt_ids[0] if prompt_ids else [],
-        "completion_ids": completion_ids[0] if completion_ids else [],
-        "logprobs": [float(lp) for lp in (logprobs[0] if logprobs else [])],
-        "text": (texts[0] if texts else None),
-    }
-
-
 def scale_repetition_score(previous_occurrences: int, max_occurrences: int) -> float:
     """Scale the repetition score based on the number of previous occurrences from 0 to 1"""
     if max_occurrences == 0:
@@ -329,12 +289,12 @@ def scale_repetition_score(previous_occurrences: int, max_occurrences: int) -> f
 
 
 def rollout_once(
+    trainer: GRPOTrainer,
     env: TextArenaEnv,
     tokenizer: AutoTokenizer,
-    args: GRPOConfig,
     dataset_prompt: str,
-    cli_args: argparse.Namespace,
     system_prompt: str,
+    max_turns: int,
 ) -> dict[str, list]:
     result = env.reset()
     observation = result.observation
@@ -347,9 +307,9 @@ def rollout_once(
     yellow_scores: list[float] = []
     repetition_scores: list[float] = []
     correct_scores: list[float] = []
-    guess_counts: dict[str, int] = {}
+    guess_counts: defaultdict[str, int] = defaultdict(int)
 
-    for _turn in range(cli_args.max_turns):
+    for _turn in range(max_turns):
         # when the game is over the environment will return a done=True
         if result.done:
             break
@@ -368,19 +328,12 @@ def rollout_once(
             enable_thinking=False,
         )
 
-        # generate the completion from the model using vLLM
-        vllm_result = request_vllm_completion(
-            prompt_text,
-            args,
-            endpoint=cli_args.vllm_endpoint,
-            timeout=cli_args.request_timeout,
-            fallback=cli_args,
-        )
-        prompt_ids.extend(vllm_result["prompt_ids"])
-        completion_ids.extend(vllm_result["completion_ids"])
-        logprobs.extend(vllm_result["logprobs"])
-        completion_text = vllm_result.get("text") or tokenizer.decode(
-            vllm_result["completion_ids"], skip_special_tokens=True
+        rollout_outputs = generate_rollout_completions(trainer, [prompt_text])[0]
+        prompt_ids.extend(rollout_outputs["prompt_ids"])
+        completion_ids.extend(rollout_outputs["completion_ids"])
+        logprobs.extend(rollout_outputs["logprobs"])
+        completion_text = rollout_outputs.get("text") or tokenizer.decode(
+            rollout_outputs["completion_ids"], skip_special_tokens=True
         )
         # extract the guess from the completion
         guess = extract_guess(completion_text)
@@ -425,57 +378,6 @@ def rollout_once(
     }
 
 
-# ---------------------------------------------------------------------------
-# Rollout function
-# ---------------------------------------------------------------------------
-
-
-def rollout_func(
-    env: TextArenaEnv,
-    tokenizer: AutoTokenizer,
-    prompts: list[str],
-    args: GRPOConfig,
-    cli_args: argparse.Namespace,
-    system_prompt: str,
-) -> dict[str, list]:
-    all_prompt_ids: list[list[int]] = []
-    all_completion_ids: list[list[int]] = []
-    all_logprobs: list[list[float]] = []
-    correctness_rewards: list[float] = []
-    green_rewards: list[float] = []
-    yellow_rewards: list[float] = []
-    repetition_rewards: list[float] = []
-    num_generations = args.num_generations or cli_args.num_generations
-
-    for _ in range(num_generations):
-        for prompt_text in prompts:
-            rollout_stats = rollout_once(
-                env=env,
-                tokenizer=tokenizer,
-                args=args,
-                dataset_prompt=prompt_text,
-                cli_args=cli_args,
-                system_prompt=system_prompt,
-            )
-            all_prompt_ids.append(rollout_stats["prompt_ids"])
-            all_completion_ids.append(rollout_stats["completion_ids"])
-            all_logprobs.append(rollout_stats["logprobs"])
-            correctness_rewards.append(rollout_stats["correct_reward"])
-            green_rewards.append(rollout_stats["green_reward"])
-            yellow_rewards.append(rollout_stats["yellow_reward"])
-            repetition_rewards.append(rollout_stats["repetition_reward"])
-
-    return {
-        "prompt_ids": all_prompt_ids,
-        "completion_ids": all_completion_ids,
-        "logprobs": all_logprobs,
-        "correct_reward": correctness_rewards,
-        "green_reward": green_rewards,
-        "yellow_reward": yellow_rewards,
-        "repetition_reward": repetition_rewards,
-    }
-
-
 # ---------------------------------------------------------------------------
 # Rewards
 # ---------------------------------------------------------------------------
@@ -515,55 +417,100 @@ def reward_repetition(completions: list[str], **kwargs) -> list[float]:
 
 
 def main() -> None:
-    cli_args = parse_args()
+    args = parse_args()
 
-    tokenizer = AutoTokenizer.from_pretrained(cli_args.tokenizer_id)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_id)
     tokenizer.pad_token = tokenizer.eos_token
 
-    env = TextArenaEnv(base_url=cli_args.env_url)
-
-    system_prompt = resolve_system_prompt(cli_args.system_prompt_path)
-
-    dataset = Dataset.from_dict({"prompt": [cli_args.dataset_prompt] * cli_args.dataset_size})
+    # Select environment mode
+    if args.env_mode == "docker-local":
+        env_url = f"http://{args.env_host}:{args.env_port}"
+        client = TextArenaEnv(base_url=env_url)
+        print(f"🌍 Using existing TextArena Environment (Docker) at: {env_url}")
+    elif args.env_mode == "docker-image":
+        client = TextArenaEnv.from_docker_image(args.env_image)
+        print("🌍 Using TextArena Environment (Docker) from local Image")
+    elif args.env_mode == "docker-hub":
+        client = TextArenaEnv.from_hub(args.env_image)
+        print("🌍 Using existing TextArena Environment (Docker) from Hub Image")
+    elif args.env_mode == "space":
+        env_url = args.env_host
+        print(f"🌍 Using Hugging Face Space environment at: {env_url}")
+    else:
+        raise ValueError(f"Unknown environment mode: {args.env_mode}")
+
+    system_prompt = resolve_system_prompt(args.system_prompt_path)
+
+    dataset = Dataset.from_dict({"prompt": [args.dataset_prompt] * args.dataset_size})
 
     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-    default_output_dir = Path("outputs") / f"wordle-grpo-{sanitize_name(cli_args.model_id)}-{timestamp}"
-    output_dir = Path(cli_args.output_dir or default_output_dir)
+    default_output_dir = Path("outputs") / f"wordle-grpo-{sanitize_name(args.model_id)}-{timestamp}"
+    output_dir = Path(args.output_dir or default_output_dir)
 
     grpo_config = GRPOConfig(
-        vllm_mode="server",
         use_vllm=True,
+        vllm_mode=args.vllm_mode,
+        vllm_server_base_url=args.vllm_server_url if args.vllm_mode == "server" else None,
         output_dir=str(output_dir),
-        num_train_epochs=cli_args.num_epochs,
-        learning_rate=cli_args.learning_rate,
-        weight_decay=cli_args.weight_decay,
-        gradient_accumulation_steps=cli_args.gradient_accumulation_steps,
-        per_device_train_batch_size=cli_args.per_device_batch_size,
-        warmup_steps=cli_args.warmup_steps,
-        num_generations=cli_args.num_generations,
-        max_completion_length=cli_args.max_new_tokens,
-        logging_steps=cli_args.logging_steps,
+        num_train_epochs=args.num_epochs,
+        learning_rate=args.learning_rate,
+        weight_decay=args.weight_decay,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        per_device_train_batch_size=args.per_device_batch_size,
+        warmup_steps=args.warmup_steps,
+        num_generations=args.num_generations,
+        max_completion_length=args.max_new_tokens,
+        logging_steps=args.logging_steps,
         save_strategy="steps",
-        save_steps=cli_args.save_interval,
-        save_total_limit=cli_args.save_total_limit,
-    )
-
-    grpo_config.run_name = cli_args.run_name or f"run-{timestamp}"
-    grpo_config.project = cli_args.project or f"group-{sanitize_name(cli_args.model_id)}"
-    grpo_config.trackio_space_id = cli_args.trackio_space_id
-
-    def wrapped_rollout(prompts: list[str], args: GRPOConfig, processing_class) -> dict[str, list]:
-        return rollout_func(
-            env=env,
-            tokenizer=tokenizer,
-            prompts=prompts,
-            args=args,
-            cli_args=cli_args,
-            system_prompt=system_prompt,
-        )
+        save_steps=args.save_interval,
+        save_total_limit=args.save_total_limit,
+        temperature=args.temperature,
+        top_k=args.top_k,
+        top_p=args.top_p,
+    )
+
+    grpo_config.run_name = args.run_name or f"run-{timestamp}"
+    grpo_config.project = args.project or f"group-{sanitize_name(args.model_id)}"
+    grpo_config.trackio_space_id = args.trackio_space_id
+
+    def rollout_func(prompts: list[str], trainer: GRPOTrainer) -> dict[str, list]:
+        episode_prompt_ids: list[list[int]] = []
+        episode_completion_ids: list[list[int]] = []
+        episode_logprobs: list[list[float]] = []
+        correctness_rewards: list[float] = []
+        green_rewards: list[float] = []
+        yellow_rewards: list[float] = []
+        repetition_rewards: list[float] = []
+
+        for prompt_text in prompts:
+            episode = rollout_once(
+                trainer=trainer,
+                env=client,
+                tokenizer=tokenizer,
+                dataset_prompt=prompt_text,
+                system_prompt=system_prompt,
+                max_turns=args.max_turns,
+            )
+            episode_prompt_ids.append(episode["prompt_ids"])
+            episode_completion_ids.append(episode["completion_ids"])
+            episode_logprobs.append(episode["logprobs"])
+            correctness_rewards.append(episode["correct_reward"])
+            green_rewards.append(episode["green_reward"])
+            yellow_rewards.append(episode["yellow_reward"])
+            repetition_rewards.append(episode["repetition_reward"])
+
+        return {
+            "prompt_ids": episode_prompt_ids,
+            "completion_ids": episode_completion_ids,
+            "logprobs": episode_logprobs,
+            "correct_reward": correctness_rewards,
+            "green_reward": green_rewards,
+            "yellow_reward": yellow_rewards,
+            "repetition_reward": repetition_rewards,
+        }
 
     trainer = GRPOTrainer(
-        model=cli_args.model_id,
+        model=args.model_id,
         processing_class=tokenizer,
         reward_funcs=[
             reward_correct,
@@ -573,16 +520,16 @@ def wrapped_rollout(prompts: list[str], args: GRPOConfig, processing_class) -> d
         ],
         train_dataset=dataset,
         args=grpo_config,
-        rollout_func=wrapped_rollout,
+        rollout_func=rollout_func,
     )
 
     print("Starting GRPO training with Wordle environment...")
-    print(f"Using {cli_args.num_generations} rollouts per dataset prompt")
+    print(f"Using {args.num_generations} rollouts per dataset prompt")
 
     try:
         trainer.train()
     finally:
-        env.close()
+        client.close()
 
 
 if __name__ == "__main__":
diff --git a/examples/scripts/ppo/ppo.py b/examples/scripts/ppo/ppo.py
index 2f5471996c2..b77f30ad457 100644
--- a/examples/scripts/ppo/ppo.py
+++ b/examples/scripts/ppo/ppo.py
@@ -34,15 +34,8 @@
     HfArgumentParser,
 )
 
-from trl import (
-    ModelConfig,
-    PPOConfig,
-    PPOTrainer,
-    ScriptArguments,
-    get_kbit_device_map,
-    get_peft_config,
-    get_quantization_config,
-)
+from trl import ModelConfig, ScriptArguments, get_kbit_device_map, get_peft_config, get_quantization_config
+from trl.experimental.ppo import PPOConfig, PPOTrainer
 
 
 # Enable logging in a Hugging Face Space
diff --git a/examples/scripts/ppo/ppo_tldr.py b/examples/scripts/ppo/ppo_tldr.py
index 7962758ec40..bf4f487823b 100644
--- a/examples/scripts/ppo/ppo_tldr.py
+++ b/examples/scripts/ppo/ppo_tldr.py
@@ -34,15 +34,8 @@
     HfArgumentParser,
 )
 
-from trl import (
-    ModelConfig,
-    PPOConfig,
-    PPOTrainer,
-    ScriptArguments,
-    get_kbit_device_map,
-    get_peft_config,
-    get_quantization_config,
-)
+from trl import ModelConfig, ScriptArguments, get_kbit_device_map, get_peft_config, get_quantization_config
+from trl.experimental.ppo import PPOConfig, PPOTrainer
 
 
 # Enable logging in a Hugging Face Space
diff --git a/examples/scripts/xpo.py b/examples/scripts/xpo.py
index 70c13226c5d..0d320e45c6b 100644
--- a/examples/scripts/xpo.py
+++ b/examples/scripts/xpo.py
@@ -45,18 +45,15 @@
 from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, GenerationConfig
 
 from trl import (
-    HfPairwiseJudge,
     LogCompletionsCallback,
     ModelConfig,
-    OpenAIPairwiseJudge,
-    PairRMJudge,
     ScriptArguments,
     TrlParser,
-    XPOConfig,
-    XPOTrainer,
     get_kbit_device_map,
     get_quantization_config,
 )
+from trl.experimental.judges import HfPairwiseJudge, OpenAIPairwiseJudge, PairRMJudge
+from trl.experimental.xpo import XPOConfig, XPOTrainer
 
 
 # Enable logging in a Hugging Face Space
diff --git a/pyproject.toml b/pyproject.toml
index 84e3f13debf..1b84ff50d7f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,9 @@ judges = [
     "openai>=1.23.2",
     "llm-blender>=0.0.2"
 ]
+kernels = [
+    "kernels"
+]
 liger = [
     "liger-kernel>=0.6.2"
 ]
@@ -98,6 +101,8 @@ dev = [
     # judges
     "openai>=1.23.2",
     "llm-blender>=0.0.2",
+    # kernels
+    "kernels",
     # liger
     "liger-kernel>=0.6.2",
     # peft
diff --git a/scripts/generate_tiny_models.py b/scripts/generate_tiny_models.py
index 6dec8705c4d..f33fc52678b 100644
--- a/scripts/generate_tiny_models.py
+++ b/scripts/generate_tiny_models.py
@@ -75,6 +75,8 @@
     Qwen3MoeConfig,
     Qwen3MoeForCausalLM,
     Qwen3MoeForSequenceClassification,
+    Qwen3VLConfig,
+    Qwen3VLForConditionalGeneration,
     SmolVLMForConditionalGeneration,
     T5ForConditionalGeneration,
 )
@@ -313,6 +315,7 @@ def init_weights_tiny_model(model):
     ("OpenGVLab/InternVL3-8B-hf", InternVLForConditionalGeneration),
     ("Qwen/Qwen2-VL-2B-Instruct", Qwen2VLForConditionalGeneration),
     ("Qwen/Qwen2.5-VL-3B-Instruct", Qwen2_5_VLForConditionalGeneration),
+    ("Qwen/Qwen3-VL-2B-Instruct", Qwen3VLForConditionalGeneration),
 ]:
     processor = AutoProcessor.from_pretrained(model_id)
 
@@ -350,6 +353,16 @@ def init_weights_tiny_model(model):
     if issubclass(model_class.config_class, Idefics2Config):
         kwargs["perceiver_config"] = {"hidden_size": 16}
 
+    if issubclass(model_class.config_class, Qwen3VLConfig):
+        # So hasattr(config, "layer_types") is False
+        # See: https://github.com/huggingface/transformers/blob/fe5ca9ddaa07fac2872407e75c7a7661216ac956/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L420
+        del text_config["layer_types"]
+        # "mrope_section" needs 3 elements: for dim, offset in enumerate((1, 2), start=1): mrope_section[dim]
+        # See: https://github.com/huggingface/transformers/blob/fe5ca9ddaa07fac2872407e75c7a7661216ac956/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L361
+        text_config["rope_scaling"] = {"mrope_interleaved": True, "mrope_section": [2, 2, 2], "rope_type": "default"}
+        vision_config["depth"] = 2
+        vision_config["out_hidden_size"] = 16
+
     config = AutoConfig.from_pretrained(model_id, text_config=text_config, vision_config=vision_config, **kwargs)
     model = model_class(config).to(dtype=torch.bfloat16)
     push_to_hub(model, processor, "tiny")
diff --git a/tests/test_cpo_trainer.py b/tests/experimental/test_cpo_trainer.py
similarity index 98%
rename from tests/test_cpo_trainer.py
rename to tests/experimental/test_cpo_trainer.py
index a346277c07f..b699c58bbc3 100644
--- a/tests/test_cpo_trainer.py
+++ b/tests/experimental/test_cpo_trainer.py
@@ -17,9 +17,9 @@
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
 
-from trl import CPOConfig, CPOTrainer
+from trl.experimental.cpo import CPOConfig, CPOTrainer
 
-from .testing_utils import TrlTestCase, require_peft
+from ..testing_utils import TrlTestCase, require_peft
 
 
 class TestCPOTrainer(TrlTestCase):
diff --git a/tests/test_gkd_trainer.py b/tests/experimental/test_gkd_trainer.py
similarity index 99%
rename from tests/test_gkd_trainer.py
rename to tests/experimental/test_gkd_trainer.py
index 44a0fa9e1ea..6ea7a20094f 100644
--- a/tests/test_gkd_trainer.py
+++ b/tests/experimental/test_gkd_trainer.py
@@ -20,9 +20,9 @@
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 
-from trl import GKDConfig, GKDTrainer
+from trl.experimental.gkd import GKDConfig, GKDTrainer
 
-from .testing_utils import TrlTestCase, require_liger_kernel
+from ..testing_utils import TrlTestCase, require_liger_kernel
 
 
 class TestGKDTrainerGenerateOnPolicy(TrlTestCase):
diff --git a/tests/experimental/test_grpo_with_replay_buffer_trainer.py b/tests/experimental/test_grpo_with_replay_buffer_trainer.py
index 6ab0fdb2887..181f7204793 100644
--- a/tests/experimental/test_grpo_with_replay_buffer_trainer.py
+++ b/tests/experimental/test_grpo_with_replay_buffer_trainer.py
@@ -250,8 +250,9 @@ def test_update_with_inputs_different_seq_len(self):
 
 
 @pytest.mark.low_priority
+@pytest.mark.parametrize("scale_rewards", ["batch", "group"])
 class TestGRPOWithReplayBufferTrainer(TrlTestCase):
-    def test_training_with_replay_buffer(self):
+    def test_training_with_replay_buffer(self, scale_rewards):
         dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
 
         # Guarantee that some rewards have 0 std
@@ -269,6 +270,7 @@ def custom_reward_func(completions, **kwargs):
             max_completion_length=8,  # reduce the completion length to reduce memory usage
             replay_buffer_size=8,
             report_to="none",
+            scale_rewards=scale_rewards,
         )
         trainer = GRPOWithReplayBufferTrainer(
             model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
diff --git a/tests/test_judges.py b/tests/experimental/test_judges.py
similarity index 95%
rename from tests/test_judges.py
rename to tests/experimental/test_judges.py
index a1e79ca79f5..9e9738ffbb8 100644
--- a/tests/test_judges.py
+++ b/tests/experimental/test_judges.py
@@ -17,9 +17,9 @@
 
 import pytest
 
-from trl import AllTrueJudge, HfPairwiseJudge, PairRMJudge
+from trl.experimental.judges import AllTrueJudge, HfPairwiseJudge, PairRMJudge
 
-from .testing_utils import RandomBinaryJudge, TrlTestCase, require_llm_blender
+from ..testing_utils import RandomBinaryJudge, TrlTestCase, require_llm_blender
 
 
 class TestJudges(TrlTestCase):
diff --git a/tests/experimental/test_minillm_trainer.py b/tests/experimental/test_minillm_trainer.py
new file mode 100644
index 00000000000..8dc2ae7abbe
--- /dev/null
+++ b/tests/experimental/test_minillm_trainer.py
@@ -0,0 +1,57 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+from datasets import load_dataset
+
+from trl.experimental.minillm import MiniLLMConfig, MiniLLMTrainer
+
+from ..testing_utils import TrlTestCase
+
+
+@pytest.mark.low_priority
+class TestMiniLLMTrainer(TrlTestCase):
+    def test_train(self):
+        # Get the dataset
+        dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+
+        # Initialize the trainer
+        training_args = MiniLLMConfig(
+            output_dir=self.tmp_dir,
+            per_device_train_batch_size=3,  # reduce the batch size to reduce memory usage
+            num_generations=3,  # reduce the number of generations to reduce memory usage
+            max_completion_length=32,  # reduce the completion length to reduce memory usage
+            report_to="none",
+        )
+        trainer = MiniLLMTrainer(
+            model="trl-internal-testing/small-Qwen3ForCausalLM",
+            teacher_model="trl-internal-testing/tiny-Qwen3ForCausalLM",
+            args=training_args,
+            train_dataset=dataset,
+        )
+
+        # Save the initial parameters to compare them later
+        previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+        # Train the model
+        trainer.train()
+
+        # Check that the training loss is not None
+        assert trainer.state.log_history[-1]["train_loss"] is not None
+
+        # Check the params have changed
+        for n, param in previous_trainable_params.items():
+            new_param = trainer.model.get_parameter(n)
+            assert not torch.allclose(param, new_param), f"Parameter {n} has not changed"
diff --git a/tests/test_nash_md_trainer.py b/tests/experimental/test_nash_md_trainer.py
similarity index 98%
rename from tests/test_nash_md_trainer.py
rename to tests/experimental/test_nash_md_trainer.py
index 7e7449e0fe5..1f4ec1d255b 100644
--- a/tests/test_nash_md_trainer.py
+++ b/tests/experimental/test_nash_md_trainer.py
@@ -17,9 +17,9 @@
 from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
 from transformers.utils import is_peft_available
 
-from trl import NashMDConfig, NashMDTrainer
+from trl.experimental.nash_md import NashMDConfig, NashMDTrainer
 
-from .testing_utils import RandomPairwiseJudge, TrlTestCase, require_llm_blender, require_peft
+from ..testing_utils import RandomPairwiseJudge, TrlTestCase, require_llm_blender, require_peft
 
 
 if is_peft_available():
diff --git a/tests/test_ppo_trainer.py b/tests/experimental/test_ppo_trainer.py
similarity index 97%
rename from tests/test_ppo_trainer.py
rename to tests/experimental/test_ppo_trainer.py
index 78531316440..979d80e518b 100644
--- a/tests/test_ppo_trainer.py
+++ b/tests/experimental/test_ppo_trainer.py
@@ -17,10 +17,10 @@
 from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
 from transformers.utils import is_peft_available
 
-from trl import PPOConfig, PPOTrainer
-from trl.trainer.ppo_trainer import masked_mean, masked_var, masked_whiten
+from trl.experimental.ppo import PPOConfig, PPOTrainer
+from trl.experimental.ppo.ppo_trainer import masked_mean, masked_var, masked_whiten
 
-from .testing_utils import TrlTestCase, require_peft
+from ..testing_utils import TrlTestCase, require_peft
 
 
 if is_peft_available():
diff --git a/tests/experimental/test_trainers_args.py b/tests/experimental/test_trainers_args.py
deleted file mode 100644
index 6b3e1bbb0f1..00000000000
--- a/tests/experimental/test_trainers_args.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from datasets import load_dataset
-from transformers import AutoTokenizer
-
-from trl.experimental.bco import BCOConfig, BCOTrainer
-from trl.experimental.orpo import ORPOConfig, ORPOTrainer
-
-from ..testing_utils import TrlTestCase, require_sklearn
-
-
-class TestTrainerArg(TrlTestCase):
-    @require_sklearn
-    def test_bco(self):
-        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference", split="train")
-        training_args = BCOConfig(
-            self.tmp_dir,
-            max_length=256,
-            max_prompt_length=64,
-            max_completion_length=64,
-            beta=0.5,
-            label_pad_token_id=-99,
-            padding_value=-99,
-            truncation_mode="keep_start",
-            # generate_during_eval=True, # ignore this one, it requires wandb
-            is_encoder_decoder=True,
-            precompute_ref_log_probs=True,
-            model_init_kwargs={"trust_remote_code": True},
-            ref_model_init_kwargs={"trust_remote_code": True},
-            dataset_num_proc=4,
-            prompt_sample_size=512,
-            min_density_ratio=0.2,
-            max_density_ratio=20.0,
-        )
-        trainer = BCOTrainer(
-            model=model_id,
-            ref_model=model_id,
-            args=training_args,
-            train_dataset=dataset,
-            processing_class=tokenizer,
-        )
-        assert trainer.args.max_length == 256
-        assert trainer.args.max_prompt_length == 64
-        assert trainer.args.max_completion_length == 64
-        assert trainer.args.beta == 0.5
-        assert trainer.args.label_pad_token_id == -99
-        assert trainer.args.padding_value == -99
-        assert trainer.args.truncation_mode == "keep_start"
-        # self.assertEqual(trainer.args.generate_during_eval, True)
-        assert trainer.args.is_encoder_decoder
-        assert trainer.args.precompute_ref_log_probs
-        assert trainer.args.model_init_kwargs == {"trust_remote_code": True}
-        assert trainer.args.ref_model_init_kwargs == {"trust_remote_code": True}
-        assert trainer.args.dataset_num_proc == 4
-        assert trainer.args.prompt_sample_size == 512
-        assert trainer.args.min_density_ratio == 0.2
-        assert trainer.args.max_density_ratio == 20.0
-
-    def test_orpo(self):
-        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
-        training_args = ORPOConfig(
-            self.tmp_dir,
-            max_length=256,
-            max_prompt_length=64,
-            max_completion_length=64,
-            beta=0.5,
-            disable_dropout=False,
-            label_pad_token_id=-99,
-            padding_value=-99,
-            truncation_mode="keep_start",
-            # generate_during_eval=True, # ignore this one, it requires wandb
-            is_encoder_decoder=True,
-            model_init_kwargs={"trust_remote_code": True},
-            dataset_num_proc=4,
-        )
-        trainer = ORPOTrainer(model=model_id, args=training_args, train_dataset=dataset, processing_class=tokenizer)
-        assert trainer.args.max_length == 256
-        assert trainer.args.max_prompt_length == 64
-        assert trainer.args.max_completion_length == 64
-        assert trainer.args.beta == 0.5
-        assert not trainer.args.disable_dropout
-        assert trainer.args.label_pad_token_id == -99
diff --git a/tests/test_xpo_trainer.py b/tests/experimental/test_xpo_trainer.py
similarity index 97%
rename from tests/test_xpo_trainer.py
rename to tests/experimental/test_xpo_trainer.py
index b11643bbca9..05f7e563360 100644
--- a/tests/test_xpo_trainer.py
+++ b/tests/experimental/test_xpo_trainer.py
@@ -17,15 +17,16 @@
 from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
 from transformers.utils import is_peft_available
 
-from trl import XPOConfig, XPOTrainer
+from trl.experimental.xpo import XPOConfig, XPOTrainer
 
-from .testing_utils import RandomPairwiseJudge, TrlTestCase, require_llm_blender, require_peft
+from ..testing_utils import RandomPairwiseJudge, TrlTestCase, require_llm_blender, require_peft
 
 
 if is_peft_available():
     from peft import LoraConfig, get_peft_model
 
 
+@pytest.mark.low_priority
 class TestXPOTrainer(TrlTestCase):
     def setup_method(self):
         self.model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py
index b5323e9292a..27faa82b52a 100644
--- a/tests/test_callbacks.py
+++ b/tests/test_callbacks.py
@@ -22,7 +22,6 @@
 from transformers.utils import is_peft_available
 
 from trl import (
-    BasePairwiseJudge,
     BEMACallback,
     DPOConfig,
     DPOTrainer,
@@ -30,6 +29,7 @@
     MergeModelCallback,
     WinRateCallback,
 )
+from trl.experimental.judges import BasePairwiseJudge
 from trl.mergekit_utils import MergeConfig
 
 from .testing_utils import TrlTestCase, require_comet, require_mergekit, require_peft, require_wandb
diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
index 88d2579a69d..b3844a399c1 100644
--- a/tests/test_grpo_trainer.py
+++ b/tests/test_grpo_trainer.py
@@ -41,8 +41,9 @@
 
 from .testing_utils import (
     TrlTestCase,
+    require_ampere_or_newer,
     require_bitsandbytes,
-    require_flash_attn,
+    require_kernels,
     require_liger_kernel,
     require_peft,
     require_torch_accelerator,
@@ -167,7 +168,7 @@ def test_training(self, config_name):
             new_param = trainer.model.get_parameter(n)
             assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
 
-    @pytest.mark.parametrize("loss_type", ["bnpo", "dr_grpo", "dapo"])
+    @pytest.mark.parametrize("loss_type", ["bnpo", "dr_grpo", "dapo", "cispo"])
     def test_training_loss_types(self, loss_type):
         dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
 
@@ -1987,7 +1988,8 @@ def test_training_with_transformers_paged(self, model_name):
             "HuggingFaceTB/SmolVLM-Instruct",  # Only test the smaller model to avoid OOM
         ],
     )
-    @require_flash_attn
+    @require_kernels
+    @require_ampere_or_newer  # Flash attention 2 requires Ampere or newer GPUs
     @require_bitsandbytes
     @require_peft
     def test_vlm_training(self, model_name):
@@ -2040,7 +2042,7 @@ def data_gen(num_samples):
         )
         model = AutoModelForImageTextToText.from_pretrained(
             model_name,
-            attn_implementation="flash_attention_2",
+            attn_implementation="kernels-community/flash-attn2",
             dtype="bfloat16",
             device_map=get_kbit_device_map(),
             quantization_config=quantization_config,
diff --git a/tests/test_online_dpo_trainer.py b/tests/test_online_dpo_trainer.py
index 09f986ad558..53c6524db52 100644
--- a/tests/test_online_dpo_trainer.py
+++ b/tests/test_online_dpo_trainer.py
@@ -274,6 +274,14 @@ def test_training_with_judge(self, config_name):
     @require_vllm
     @pytest.mark.slow
     def test_training_with_vllm(self, config_name):
+        def cleanup_vllm_communicator(trainer):
+            """Clean up vLLM communicator to avoid conflicts between test runs"""
+            try:
+                if hasattr(trainer, "vllm_client") and trainer.vllm_client is not None:
+                    trainer.vllm_client.close_communicator()
+            except Exception:
+                pass  # Continue if cleanup fails
+
         model_id = "trl-internal-testing/small-Qwen2ForCausalLM-2.5"  # We need a bigger model
         model = AutoModelForCausalLM.from_pretrained(model_id)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -295,10 +303,14 @@ def test_training_with_vllm(self, config_name):
             processing_class=tokenizer,
             reward_processing_classes=self.reward_tokenizer,
         )
-        trainer.train()
 
-        # Check if training loss is available
-        assert "train_loss" in trainer.state.log_history[-1]
+        # Ensure cleanup of vLLM communicator after the test
+        try:
+            trainer.train()
+            # Check if training loss is available
+            assert "train_loss" in trainer.state.log_history[-1]
+        finally:
+            cleanup_vllm_communicator(trainer)
 
     @require_vllm
     def test_training_with_vllm_colocate(self):
diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py
index e3429809160..874d5304f2f 100644
--- a/tests/test_sft_trainer.py
+++ b/tests/test_sft_trainer.py
@@ -21,6 +21,7 @@
 import transformers
 from accelerate.utils.memory import release_memory
 from datasets import load_dataset
+from packaging.version import Version
 from packaging.version import parse as parse_version
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from transformers.testing_utils import backend_empty_cache, torch_device
@@ -32,8 +33,9 @@
 from .testing_utils import (
     TrlTestCase,
     ignore_warnings,
+    require_ampere_or_newer,
     require_bitsandbytes,
-    require_flash_attn,
+    require_kernels,
     require_liger_kernel,
     require_peft,
     require_torch_accelerator,
@@ -43,6 +45,7 @@
 
 
 if is_peft_available():
+    import peft
     from peft import (
         LoraConfig,
         PeftModel,
@@ -536,6 +539,11 @@ def test_train_with_peft_config_prompt_tuning(self, peft_type):
                 tokenizer_name_or_path="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
             )
         elif peft_type == "prefix_tuning":
+            if parse_version(peft.__version__) <= Version("0.17.1"):
+                pytest.xfail(
+                    "Prefix tuning with device_map='auto' is broken in peft 0.17.1 and below. See "
+                    "https://github.com/huggingface/peft/issues/2821"
+                )
             peft_config = PrefixTuningConfig(
                 task_type=TaskType.CAUSAL_LM,
                 num_virtual_tokens=4,
@@ -863,7 +871,8 @@ def test_train_with_iterable_dataset(self):
             new_param = trainer.model.get_parameter(n)
             assert not torch.allclose(param, new_param), f"Parameter {n} has not changed"
 
-    @require_flash_attn
+    @require_kernels
+    @require_ampere_or_newer  # Flash attention 2 requires Ampere or newer GPUs
     def test_train_padding_free(self):
         # Get the dataset
         dataset = load_dataset("trl-internal-testing/zen", "standard_language_modeling", split="train")
@@ -872,7 +881,7 @@ def test_train_padding_free(self):
         training_args = SFTConfig(
             output_dir=self.tmp_dir,
             padding_free=True,
-            model_init_kwargs={"attn_implementation": "flash_attention_2"},
+            model_init_kwargs={"attn_implementation": "kernels-community/flash-attn2"},
             bf16=True,  # flash_attention_2 only supports bf16 and fp16
             report_to="none",
         )
@@ -1345,6 +1354,13 @@ def test_tag_added_peft(self):
             "trl-internal-testing/tiny-Qwen2VLForConditionalGeneration",
             "trl-internal-testing/tiny-Qwen2_5_VLForConditionalGeneration",
             # "trl-internal-testing/tiny-SmolVLMForConditionalGeneration",  device issue from transformers, see https://github.com/huggingface/transformers/pull/39975
+            pytest.param(
+                "trl-internal-testing/tiny-Qwen3VLForConditionalGeneration",
+                marks=pytest.mark.skipif(
+                    Version(transformers.__version__) < Version("4.57.0"),
+                    reason="Qwen3-VL series were introduced in transformers-4.57.0",
+                ),
+            ),
         ],
     )
     @require_vision
@@ -1380,7 +1396,8 @@ def test_train_vlm(self, model_id):
                 model_id == "trl-internal-testing/tiny-LlavaForConditionalGeneration" and "model.vision_tower.vision_model.post_layernorm" in n or
                 model_id == "trl-internal-testing/tiny-LlavaForConditionalGeneration" and "vision_tower.vision_model.encoder.layers.1" in n or
                 model_id == "trl-internal-testing/tiny-LlavaNextForConditionalGeneration" and "model.vision_tower.vision_model.post_layernorm" in n or
-                model_id == "trl-internal-testing/tiny-LlavaNextForConditionalGeneration" and "vision_tower.vision_model.encoder.layers.1" in n
+                model_id == "trl-internal-testing/tiny-LlavaNextForConditionalGeneration" and "vision_tower.vision_model.encoder.layers.1" in n or
+                model_id == "trl-internal-testing/tiny-Qwen3VLForConditionalGeneration" and "model.visual.deepstack_merger_list" in n
             ):
             # fmt: on
                 continue
diff --git a/tests/test_trainers_args.py b/tests/test_trainers_args.py
deleted file mode 100644
index 1a6c8171c3f..00000000000
--- a/tests/test_trainers_args.py
+++ /dev/null
@@ -1,315 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
-
-from trl import (
-    CPOConfig,
-    CPOTrainer,
-    DPOConfig,
-    DPOTrainer,
-    FDivergenceType,
-    KTOConfig,
-    KTOTrainer,
-    NashMDConfig,
-    NashMDTrainer,
-    OnlineDPOConfig,
-    OnlineDPOTrainer,
-    RewardConfig,
-    RewardTrainer,
-    SFTConfig,
-    SFTTrainer,
-    XPOConfig,
-    XPOTrainer,
-)
-
-from .testing_utils import TrlTestCase
-
-
-class TestTrainerArg(TrlTestCase):
-    def test_cpo(self):
-        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
-        training_args = CPOConfig(
-            self.tmp_dir,
-            max_length=256,
-            max_prompt_length=64,
-            max_completion_length=64,
-            beta=0.5,
-            label_smoothing=0.5,
-            loss_type="hinge",
-            disable_dropout=False,
-            cpo_alpha=0.5,
-            simpo_gamma=0.2,
-            label_pad_token_id=-99,
-            padding_value=-99,
-            truncation_mode="keep_start",
-            # generate_during_eval=True, # ignore this one, it requires wandb
-            is_encoder_decoder=True,
-            model_init_kwargs={"trust_remote_code": True},
-            dataset_num_proc=4,
-        )
-        trainer = CPOTrainer(model=model_id, args=training_args, train_dataset=dataset, processing_class=tokenizer)
-        assert trainer.args.max_length == 256
-        assert trainer.args.max_prompt_length == 64
-        assert trainer.args.max_completion_length == 64
-        assert trainer.args.beta == 0.5
-        assert trainer.args.label_smoothing == 0.5
-        assert trainer.args.loss_type == "hinge"
-        assert not trainer.args.disable_dropout
-        assert trainer.args.cpo_alpha == 0.5
-        assert trainer.args.simpo_gamma == 0.2
-        assert trainer.args.label_pad_token_id == -99
-        assert trainer.args.padding_value == -99
-        assert trainer.args.truncation_mode == "keep_start"
-        # self.assertEqual(trainer.args.generate_during_eval, True)
-        assert trainer.args.is_encoder_decoder
-        assert trainer.args.model_init_kwargs == {"trust_remote_code": True}
-        assert trainer.args.dataset_num_proc == 4
-
-    def test_dpo(self):
-        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
-        training_args = DPOConfig(
-            self.tmp_dir,
-            beta=0.5,
-            label_smoothing=0.5,
-            loss_type="hinge",
-            label_pad_token_id=-99,
-            pad_token=".",
-            truncation_mode="keep_start",
-            max_length=256,
-            max_prompt_length=64,
-            max_completion_length=64,
-            disable_dropout=False,
-            # generate_during_eval=True, # ignore this one, it requires wandb
-            precompute_ref_log_probs=True,
-            dataset_num_proc=4,
-            model_init_kwargs={"trust_remote_code": True},
-            ref_model_init_kwargs={"trust_remote_code": True},
-            model_adapter_name="dummy_adapter",
-            ref_adapter_name="dummy_adapter",
-            reference_free=True,
-            force_use_ref_model=True,
-            f_divergence_type="js_divergence",
-            f_alpha_divergence_coef=0.5,
-            # sync_ref_model=True, # cannot be True when precompute_ref_log_probs=True. Don't test this.
-            ref_model_mixup_alpha=0.5,
-            ref_model_sync_steps=32,
-            rpo_alpha=0.5,
-            discopop_tau=0.1,
-        )
-        trainer = DPOTrainer(
-            model=model_id,
-            ref_model=model_id,
-            args=training_args,
-            train_dataset=dataset,
-            processing_class=tokenizer,
-        )
-        assert trainer.args.beta == 0.5
-        assert trainer.args.label_smoothing == 0.5
-        assert trainer.args.loss_type == "hinge"
-        assert trainer.args.label_pad_token_id == -99
-        assert trainer.args.pad_token == "."
-        assert trainer.args.truncation_mode == "keep_start"
-        assert trainer.args.max_length == 256
-        assert trainer.args.max_prompt_length == 64
-        assert trainer.args.max_completion_length == 64
-        assert not trainer.args.disable_dropout
-        # self.assertEqual(trainer.args.generate_during_eval, True)
-        assert trainer.args.precompute_ref_log_probs
-        assert trainer.args.dataset_num_proc == 4
-        assert trainer.args.model_init_kwargs == {"trust_remote_code": True}
-        assert trainer.args.ref_model_init_kwargs == {"trust_remote_code": True}
-        assert trainer.args.model_adapter_name == "dummy_adapter"
-        assert trainer.args.ref_adapter_name == "dummy_adapter"
-        assert trainer.args.reference_free
-        assert trainer.args.force_use_ref_model
-        assert trainer.args.f_divergence_type == FDivergenceType.JS_DIVERGENCE
-        assert trainer.args.f_alpha_divergence_coef == 0.5
-        # self.assertEqual(trainer.args.sync_ref_model, True)
-        assert trainer.args.ref_model_mixup_alpha == 0.5
-        assert trainer.args.ref_model_sync_steps == 32
-        assert trainer.args.rpo_alpha == 0.5
-        assert trainer.args.discopop_tau == 0.1
-
-    def test_kto(self):
-        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference", split="train")
-        training_args = KTOConfig(
-            self.tmp_dir,
-            max_length=256,
-            max_prompt_length=64,
-            max_completion_length=64,
-            beta=0.5,
-            desirable_weight=0.5,
-            undesirable_weight=0.5,
-            label_pad_token_id=-99,
-            padding_value=-99,
-            truncation_mode="keep_start",
-            # generate_during_eval=True, # ignore this one, it requires wandb
-            is_encoder_decoder=True,
-            precompute_ref_log_probs=True,
-            model_init_kwargs={"trust_remote_code": True},
-            ref_model_init_kwargs={"trust_remote_code": True},
-            dataset_num_proc=4,
-        )
-        trainer = KTOTrainer(
-            model=model_id,
-            ref_model=model_id,
-            args=training_args,
-            train_dataset=dataset,
-            processing_class=tokenizer,
-        )
-        assert trainer.args.max_length == 256
-        assert trainer.args.max_prompt_length == 64
-        assert trainer.args.max_completion_length == 64
-        assert trainer.args.beta == 0.5
-        assert trainer.args.desirable_weight == 0.5
-        assert trainer.args.undesirable_weight == 0.5
-        assert trainer.args.label_pad_token_id == -99
-        assert trainer.args.padding_value == -99
-        assert trainer.args.truncation_mode == "keep_start"
-        # self.assertEqual(trainer.args.generate_during_eval, True)
-        assert trainer.args.is_encoder_decoder
-        assert trainer.args.precompute_ref_log_probs
-        assert trainer.args.model_init_kwargs == {"trust_remote_code": True}
-        assert trainer.args.ref_model_init_kwargs == {"trust_remote_code": True}
-        assert trainer.args.dataset_num_proc == 4
-
-    @pytest.mark.parametrize("mixtures_coef_list", [False, True])
-    def test_nash_md(self, mixtures_coef_list):
-        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        model = AutoModelForCausalLM.from_pretrained(model_id)
-        ref_model = AutoModelForCausalLM.from_pretrained(model_id)
-        reward_model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1)
-        dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
-        training_args = NashMDConfig(
-            self.tmp_dir,
-            mixture_coef=0.5 if not mixtures_coef_list else [0.5, 0.6],
-        )
-        trainer = NashMDTrainer(
-            args=training_args,
-            processing_class=tokenizer,
-            model=model,
-            ref_model=ref_model,
-            reward_funcs=reward_model,
-            train_dataset=dataset,
-        )
-        assert trainer.args.mixture_coef == (0.5 if not mixtures_coef_list else [0.5, 0.6])
-
-    @pytest.mark.parametrize("beta_list", [False, True])
-    def test_online_dpo(self, beta_list):
-        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        model = AutoModelForCausalLM.from_pretrained(model_id)
-        ref_model = AutoModelForCausalLM.from_pretrained(model_id)
-        reward_model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1)
-        dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
-        training_args = OnlineDPOConfig(
-            self.tmp_dir,
-            max_new_tokens=42,
-            temperature=0.5,
-            missing_eos_penalty=0.33,
-            beta=0.6 if not beta_list else [0.6, 0.7],
-            loss_type="hinge",
-        )
-        trainer = OnlineDPOTrainer(
-            model=model,
-            ref_model=ref_model,
-            reward_funcs=reward_model,
-            args=training_args,
-            train_dataset=dataset,
-            processing_class=tokenizer,
-            reward_processing_classes=tokenizer,
-        )
-        assert trainer.args.max_new_tokens == 42
-        assert trainer.args.temperature == 0.5
-        assert trainer.args.missing_eos_penalty == 0.33
-        assert trainer.args.beta == (0.6 if not beta_list else [0.6, 0.7])
-        assert trainer.args.loss_type == "hinge"
-
-    def test_reward(self):
-        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        model = AutoModelForCausalLM.from_pretrained(model_id)
-        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
-        training_args = RewardConfig(
-            self.tmp_dir,
-            max_length=256,
-            dataset_num_proc=4,
-            center_rewards_coefficient=0.1,
-        )
-        trainer = RewardTrainer(
-            model=model,
-            args=training_args,
-            train_dataset=dataset,
-            processing_class=tokenizer,
-        )
-        assert trainer.args.max_length == 256
-        assert trainer.args.dataset_num_proc == 4
-        assert trainer.args.center_rewards_coefficient == 0.1
-
-    def test_sft(self):
-        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        dataset = load_dataset("trl-internal-testing/zen", "standard_language_modeling", split="train")
-        training_args = SFTConfig(
-            self.tmp_dir,
-            dataset_text_field="dummy_text_field",
-            packing=True,
-            max_length=256,
-            dataset_num_proc=4,
-            neftune_noise_alpha=0.1,
-            model_init_kwargs={"trust_remote_code": True},
-            dataset_kwargs={"append_concat_token": True, "skip_prepare_dataset": True},
-            eval_packing=True,
-        )
-        trainer = SFTTrainer(model_id, args=training_args, train_dataset=dataset)
-        assert trainer.args.dataset_text_field == "dummy_text_field"
-        assert trainer.args.packing
-        assert trainer.args.max_length == 256
-        assert trainer.args.dataset_num_proc == 4
-        assert trainer.args.neftune_noise_alpha == 0.1
-        assert trainer.args.model_init_kwargs == {"trust_remote_code": True}
-        assert "append_concat_token" in trainer.args.dataset_kwargs
-        assert trainer.args.dataset_kwargs["append_concat_token"]
-        assert trainer.args.eval_packing
-
-    @pytest.mark.parametrize("alpha_list", [False, True])
-    def test_xpo(self, alpha_list):
-        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        model = AutoModelForCausalLM.from_pretrained(model_id)
-        ref_model = AutoModelForCausalLM.from_pretrained(model_id)
-        reward_model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1)
-        dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
-        training_args = XPOConfig(
-            self.tmp_dir,
-            alpha=0.5 if not alpha_list else [0.5, 0.6],
-        )
-        trainer = XPOTrainer(
-            args=training_args,
-            processing_class=tokenizer,
-            model=model,
-            ref_model=ref_model,
-            reward_funcs=reward_model,
-            train_dataset=dataset,
-        )
-        assert trainer.args.alpha == (0.5 if not alpha_list else [0.5, 0.6])
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
index 99a6e661f5c..8f558bd5491 100644
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@@ -24,7 +24,6 @@
 from transformers import is_bitsandbytes_available, is_comet_available, is_sklearn_available, is_wandb_available
 from transformers.testing_utils import backend_device_count, torch_device
 from transformers.utils import (
-    is_flash_attn_2_available,
     is_kernels_available,
     is_peft_available,
     is_rich_available,
@@ -32,7 +31,7 @@
     is_vision_available,
 )
 
-from trl import BaseBinaryJudge, BasePairwiseJudge
+from trl.experimental.judges import BaseBinaryJudge, BasePairwiseJudge
 from trl.import_utils import (
     is_joblib_available,
     is_liger_kernel_available,
@@ -45,6 +44,7 @@
 
 require_bitsandbytes = pytest.mark.skipif(not is_bitsandbytes_available(), reason="test requires bitsandbytes")
 require_comet = pytest.mark.skipif(not is_comet_available(), reason="test requires comet_ml")
+require_kernels = pytest.mark.skipif(not is_kernels_available(), reason="test requires kernels")
 require_liger_kernel = pytest.mark.skipif(not is_liger_kernel_available(), reason="test requires liger-kernel")
 require_llm_blender = pytest.mark.skipif(not is_llm_blender_available(), reason="test requires llm-blender")
 require_math_latex = pytest.mark.skipif(not is_math_verify_available(), reason="test requires math_verify")
@@ -85,21 +85,16 @@ def is_bitsandbytes_multi_backend_available() -> bool:
 )
 
 
-def is_flash_attn_available():
-    flash_attn_available = is_flash_attn_2_available()
-    kernels_available = is_kernels_available()
-    try:
-        from kernels import get_kernel
-
-        get_kernel("kernels-community/flash-attn")
-    except Exception:
-        kernels_available = False
+def is_ampere_or_newer(device_index=0):
+    if not torch.cuda.is_available():
+        return False
 
-    return kernels_available or flash_attn_available
+    major, minor = torch.cuda.get_device_capability(device_index)
+    # Ampere starts at compute capability 8.0 (e.g., A100 = 8.0, RTX 30xx = 8.6)
+    return (major, minor) >= (8, 0)
 
 
-# Function ported from transformers.testing_utils
-require_flash_attn = pytest.mark.skipif(not is_flash_attn_available(), reason="test requires Flash Attention")
+require_ampere_or_newer = pytest.mark.skipif(not is_ampere_or_newer(), reason="test requires Ampere or newer GPU")
 
 
 class RandomBinaryJudge(BaseBinaryJudge):
diff --git a/trl/cli.py b/trl/cli.py
index 199b1c26703..f33725e7dc8 100644
--- a/trl/cli.py
+++ b/trl/cli.py
@@ -13,11 +13,10 @@
 # limitations under the License.
 
 import importlib.resources as resources
+import logging
 import os
 import sys
 
-import torch
-from accelerate import logging
 from accelerate.commands.launch import launch_command, launch_command_parser
 
 from .scripts.dpo import make_parser as make_dpo_parser
@@ -32,7 +31,7 @@
 from .scripts.vllm_serve import make_parser as make_vllm_serve_parser
 
 
-logger = logging.get_logger(__name__)
+logger = logging.getLogger(__name__)
 
 
 def main():
@@ -144,17 +143,6 @@ def main():
 
     elif args.command == "vllm-serve":
         (script_args,) = parser.parse_args_and_config()
-
-        # Known issue: Using DeepSpeed with tensor_parallel_size=1 and data_parallel_size>1 may cause a crash when
-        # launched via the CLI. Suggest running the module directly.
-        # More information: https://github.com/vllm-project/vllm/issues/17079
-        if script_args.tensor_parallel_size == 1 and script_args.data_parallel_size > 1 and torch.cuda.is_available():
-            logger.warning(
-                "Detected configuration: tensor_parallel_size=1 and data_parallel_size>1. This setup is known to "
-                "cause a crash when using the `trl vllm-serve` CLI entry point. As a workaround, please run the "
-                "server using the module path instead: `python -m trl.scripts.vllm_serve`",
-            )
-
         vllm_serve_main(script_args)
 
 
diff --git a/trl/experimental/bco/bco_config.py b/trl/experimental/bco/bco_config.py
index 39df14e7a36..b0fc0e2220c 100644
--- a/trl/experimental/bco/bco_config.py
+++ b/trl/experimental/bco/bco_config.py
@@ -101,6 +101,16 @@ class BCOConfig(TrainingArguments):
             "`fp16` is not set."
         },
     )
+    # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
+    # was fixed in https://github.com/huggingface/transformers/pull/41322, but the fix has not yet been released. We
+    # add a temporary workaround here, which can be removed once the fix is available—likely in Transformers 4.57.2.
+    lr_scheduler_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
+            "restarts."
+        },
+    )
 
     max_length: int | None = field(
         default=1024,
diff --git a/trl/experimental/bco/bco_trainer.py b/trl/experimental/bco/bco_trainer.py
index 8488827538b..86f33639938 100644
--- a/trl/experimental/bco/bco_trainer.py
+++ b/trl/experimental/bco/bco_trainer.py
@@ -393,7 +393,7 @@ def __init__(
             raise ValueError("You passed model_kwargs to the BCOTrainer. But your model is already instantiated.")
         else:
             model_init_kwargs = args.model_init_kwargs
-            dtype = model_init_kwargs.get("dtype")
+            dtype = model_init_kwargs.get("dtype", "auto")
             if dtype is not None:
                 # Convert to `torch.dtype` if an str is passed
                 if isinstance(dtype, str) and dtype != "auto":
@@ -403,6 +403,7 @@ def __init__(
                         f"Invalid `dtype` passed to the BCOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
                     )
                 model_init_kwargs["dtype"] = dtype
+            model_init_kwargs["device_map"] = model_init_kwargs.get("device_map", "auto")
 
         if args.ref_model_init_kwargs is None:
             ref_model_init_kwargs = {}
@@ -412,7 +413,7 @@ def __init__(
             )
         else:
             ref_model_init_kwargs = args.ref_model_init_kwargs
-            dtype = ref_model_init_kwargs.get("dtype")
+            dtype = ref_model_init_kwargs.get("dtype", "auto")
             if dtype is not None:
                 # Convert to `torch.dtype` if an str is passed
                 if isinstance(dtype, str) and dtype != "auto":
@@ -422,6 +423,7 @@ def __init__(
                         f"Invalid `dtype` passed to the BCOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
                     )
                 ref_model_init_kwargs["dtype"] = dtype
+            ref_model_init_kwargs["device_map"] = ref_model_init_kwargs.get("device_map", "auto")
 
         if isinstance(model, str):
             model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
diff --git a/trl/experimental/cpo/__init__.py b/trl/experimental/cpo/__init__.py
new file mode 100644
index 00000000000..e20cbbe3638
--- /dev/null
+++ b/trl/experimental/cpo/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .cpo_config import CPOConfig
+from .cpo_trainer import CPOTrainer
+
+
+__all__ = ["CPOConfig", "CPOTrainer"]
diff --git a/trl/experimental/cpo/cpo_config.py b/trl/experimental/cpo/cpo_config.py
new file mode 100644
index 00000000000..93e7015d17c
--- /dev/null
+++ b/trl/experimental/cpo/cpo_config.py
@@ -0,0 +1,228 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import TrainingArguments
+
+
+@dataclass
+class CPOConfig(TrainingArguments):
+    r"""
+    Configuration class for the [`experimental.cpo.CPOTrainer`].
+
+    This class includes only the parameters that are specific to CPO training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
+    differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters:
+        max_length (`int` or `None`, *optional*, defaults to `1024`):
+            Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
+            to use the default data collator.
+        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
+            Maximum length of the prompt. This argument is required if you want to use the default data collator.
+        max_completion_length (`int`, *optional*):
+            Maximum length of the completion. This argument is required if you want to use the default data collator
+            and your model is an encoder-decoder.
+        beta (`float`, *optional*, defaults to `0.1`):
+            Parameter controlling the deviation from the reference model. Higher β means less deviation from the
+            reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in
+            the [paper](https://huggingface.co/papers/2310.12036).
+        label_smoothing (`float`, *optional*, defaults to `0.0`):
+            Label smoothing factor. This argument is required if you want to use the default data collator.
+        loss_type (`str`, *optional*, defaults to `"sigmoid"`):
+            Type of loss to use. Possible values are:
+
+                - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
+                - `"hinge"`: hinge loss on the normalized likelihood from the
+                  [SLiC](https://huggingface.co/papers/2305.10425) paper.
+                - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.
+                - `"simpo"`: SimPO loss from the [SimPO](https://huggingface.co/papers/2405.14734) paper.
+                - `"alphapo"`: AlphaPO loss from the [AlphaPO](https://huggingface.co/papers/2501.03884) paper. This
+                  automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`.
+
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model.
+        cpo_alpha (`float`, *optional*, defaults to `1.0`):
+            Weight of the BC regularizer in CPO training.
+        simpo_gamma (`float`, *optional*, defaults to `0.5`):
+            Target reward margin for the SimPO loss, used only when the `loss_type="simpo"`.
+        alpha (`float`, *optional*, defaults to `0.0`):
+            Alpha parameter that controls reward function shape across all loss types. When alpha=0 (default), uses
+            standard log probability rewards. When `alpha != 0`, applies AlphaPO transformation: `r = (1 - p^(-alpha))
+            / alpha` from the [AlphaPO paper](https://huggingface.co/papers/2501.03884). This parameter works with all
+            loss types.
+        label_pad_token_id (`int`, *optional*, defaults to `-100`):
+            Label pad token id. This argument is required if you want to use the default data collator.
+        padding_value (`int`, *optional*):
+            Padding value to use. If `None`, the padding value of the tokenizer is used.
+        truncation_mode (`str`,*optional*,  defaults to `"keep_end"`):
+            Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
+            This argument is required if you want to use the default data collator.
+        generate_during_eval (`bool`, *optional*, defaults to `False`):
+            If `True`, generates and logs completions from the model to W&B or Comet during evaluation.
+        is_encoder_decoder (`bool`, *optional*):
+            When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
+            you need to specify if the model returned by the callable is an encoder-decoder model.
+        model_init_kwargs (`dict[str, Any]`, *optional*):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
+            string.
+        dataset_num_proc (`int`, *optional*):
+            Number of processes to use for processing the dataset.
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
+
+    # Parameters whose default values are overridden from TrainingArguments
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+    # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
+    # was fixed in https://github.com/huggingface/transformers/pull/41322, but the fix has not yet been released. We
+    # add a temporary workaround here, which can be removed once the fix is available—likely in Transformers 4.57.2.
+    lr_scheduler_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
+            "restarts."
+        },
+    )
+
+    max_length: int | None = field(
+        default=1024,
+        metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
+    )
+    max_prompt_length: int | None = field(
+        default=512,
+        metadata={
+            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    max_completion_length: int | None = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    beta: float = field(
+        default=0.1,
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. Higher β means less deviation from "
+            "the reference model."
+        },
+    )
+    label_smoothing: float = field(
+        default=0.0,
+        metadata={"help": "Label smoothing factor."},
+    )
+    loss_type: str = field(
+        default="sigmoid",
+        metadata={
+            "help": "Type of loss to use.",
+            "choices": ["sigmoid", "hinge", "ipo", "simpo", "alphapo"],
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model."},
+    )
+    cpo_alpha: float = field(
+        default=1.0,
+        metadata={"help": "Weight of the BC regularizer in CPO training."},
+    )
+    simpo_gamma: float = field(
+        default=0.5,
+        metadata={"help": "Target reward margin for the SimPO loss, used only when the `loss_type='simpo'`."},
+    )
+    alpha: float = field(
+        default=0.0,
+        metadata={
+            "help": "Alpha parameter that controls reward function shape across all loss types. When alpha=0 "
+            "(default), uses standard log probability rewards. When `alpha != 0`, applies AlphaPO transformation: "
+            "`r = (1 - p^(-alpha)) / alpha` from the AlphaPO paper. This parameter works with all loss types."
+        },
+    )
+    label_pad_token_id: int = field(
+        default=-100,
+        metadata={"help": "Label pad token id."},
+    )
+    padding_value: int | None = field(
+        default=None,
+        metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."},
+    )
+    truncation_mode: str = field(
+        default="keep_end",
+        metadata={
+            "help": "Truncation mode to use when the prompt is too long.",
+            "choices": ["keep_end", "keep_start"],
+        },
+    )
+    generate_during_eval: bool = field(
+        default=False,
+        metadata={"help": "If `True`, generates and logs completions from the model to W&B during evaluation."},
+    )
+    is_encoder_decoder: bool | None = field(
+        default=None,
+        metadata={"help": "Whether the model is an encoder-decoder model."},
+    )
+    model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
+            "from a string."
+        },
+    )
+    dataset_num_proc: int | None = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+
+        # Syntactic sugar for AlphaPO: set loss_type to "simpo" and cpo_alpha to 0.0
+        if self.loss_type == "alphapo":
+            self.loss_type = "simpo"
+            self.cpo_alpha = 0.0
+
+        super().__post_init__()
diff --git a/trl/experimental/cpo/cpo_trainer.py b/trl/experimental/cpo/cpo_trainer.py
new file mode 100644
index 00000000000..cd525b6aac4
--- /dev/null
+++ b/trl/experimental/cpo/cpo_trainer.py
@@ -0,0 +1,1089 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import random
+import textwrap
+from collections import defaultdict
+from collections.abc import Callable
+from contextlib import nullcontext
+from pathlib import Path
+from typing import Any, Literal
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from accelerate import PartialState, logging
+from datasets import Dataset
+from torch import autocast
+from torch.utils.data import DataLoader
+from transformers import (
+    AutoModelForCausalLM,
+    BaseImageProcessor,
+    DataCollator,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    is_comet_available,
+    is_wandb_available,
+)
+from transformers.trainer_callback import TrainerCallback
+from transformers.trainer_utils import EvalLoopOutput
+from transformers.utils import is_peft_available, is_torch_fx_proxy
+
+from ...data_utils import maybe_apply_chat_template, maybe_extract_prompt
+from ...trainer.base_trainer import BaseTrainer
+from ...trainer.utils import (
+    DPODataCollatorWithPadding,
+    add_bos_token_if_needed,
+    add_eos_token_if_needed,
+    disable_dropout_in_model,
+    log_table_to_comet_experiment,
+    pad_to_length,
+    peft_module_casting_to_bf16,
+    selective_log_softmax,
+)
+from .cpo_config import CPOConfig
+
+
+if is_peft_available():
+    from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training
+
+
+if is_wandb_available():
+    import wandb
+
+
+logger = logging.get_logger(__name__)
+
+
+class CPOTrainer(BaseTrainer):
+    r"""
+    Initialize CPOTrainer.
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]):
+            The model to train, preferably an [`~transformers.AutoModelForSequenceClassification`].
+        args ([`experimental.cpo.CPOConfig`]):
+            The CPO config arguments to use for training.
+        data_collator ([`~transformers.DataCollator`]):
+            The data collator to use for training. If None is specified, the default data collator
+            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        train_dataset ([`~datasets.Dataset`]):
+            The dataset to use for training.
+        eval_dataset ([`~datasets.Dataset`]):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        model_init (`Callable[[], transformers.PreTrainedModel]`):
+            The model initializer to use for training. If None is specified, the default model initializer will be
+            used.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+        peft_config (`dict`, defaults to `None`):
+            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
+            a PEFT model.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
+            metric values.
+    """
+
+    _tag_names = ["trl", "cpo"]
+    _name = "CPO"
+    _paper = {
+        "title": "Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation",
+        "id": "2401.08417",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @inproceedings{xu2024contrastive,
+                title        = {{Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation}},
+                author       = {Haoran Xu and Amr Sharaf and Yunmo Chen and Weiting Tan and Lingfeng Shen and Benjamin Van Durme and Kenton Murray and Young Jin Kim},
+                year         = 2024,
+                booktitle    = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024},
+                publisher    = {OpenReview.net},
+                url          = {https://openreview.net/forum?id=51iwkioZpn}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module | str | None = None,
+        args: CPOConfig | None = None,
+        data_collator: DataCollator | None = None,
+        train_dataset: Dataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        model_init: Callable[[], PreTrainedModel] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
+        peft_config: dict | None = None,
+        compute_metrics: Callable[[EvalLoopOutput], dict] | None = None,
+    ):
+        if args.model_init_kwargs is None:
+            model_init_kwargs = {}
+        elif not isinstance(model, str):
+            raise ValueError("You passed model_kwargs to the CPOTrainer. But your model is already instantiated.")
+        else:
+            model_init_kwargs = args.model_init_kwargs
+            dtype = model_init_kwargs.get("dtype", "auto")
+            if dtype is not None:
+                # Convert to `torch.dtype` if an str is passed
+                if isinstance(dtype, str) and dtype != "auto":
+                    dtype = getattr(torch, dtype)
+                if dtype != "auto" and not isinstance(dtype, torch.dtype):
+                    raise ValueError(
+                        f"Invalid `dtype` passed to the CPOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
+                    )
+                model_init_kwargs["dtype"] = dtype
+            model_init_kwargs["device_map"] = model_init_kwargs.get("device_map", "auto")
+
+        if isinstance(model, str):
+            model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
+
+        # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16`
+        # has been called in order to properly call autocast if needed.
+        self._peft_has_been_casted_to_bf16 = False
+
+        if not is_peft_available() and peft_config is not None:
+            raise ValueError(
+                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
+            )
+        elif is_peft_available() and peft_config is not None:
+            # if model is a peft model and we have a peft_config, we merge and unload it first
+            if isinstance(model, PeftModel):
+                model = model.merge_and_unload()
+
+            if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False):
+                _support_gc_kwargs = hasattr(
+                    args, "gradient_checkpointing_kwargs"
+                ) and "gradient_checkpointing_kwargs" in list(
+                    inspect.signature(prepare_model_for_kbit_training).parameters
+                )
+
+                prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing}
+
+                if _support_gc_kwargs:
+                    prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs
+
+                model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
+            elif args.gradient_checkpointing:
+                # For backward compatibility with older versions of transformers
+                if hasattr(model, "enable_input_require_grads"):
+                    model.enable_input_require_grads()
+                else:
+
+                    def make_inputs_require_grad(module, input, output):
+                        output.requires_grad_(True)
+
+                    model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+            # get peft model with the given config
+            model = get_peft_model(model, peft_config)
+            if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(model)
+                # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager
+                self._peft_has_been_casted_to_bf16 = True
+
+        # For models that use gradient_checkpointing, we need to attach a hook that enables input
+        # to explicitly have `requires_grad=True`, otherwise training will either silently
+        # fail or completely fail.
+        elif args.gradient_checkpointing:
+            # For backward compatibility with older versions of transformers
+            if hasattr(model, "enable_input_require_grads"):
+                model.enable_input_require_grads()
+            else:
+
+                def make_inputs_require_grad(module, input, output):
+                    output.requires_grad_(True)
+
+                model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+        if args.generate_during_eval and not (is_wandb_available() or is_comet_available()):
+            raise ValueError(
+                "`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
+                " Please install `wandb` or `comet-ml` to resolve."
+            )
+
+        if model is not None:
+            self.is_encoder_decoder = model.config.is_encoder_decoder
+        elif args.is_encoder_decoder is None:
+            raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.")
+        else:
+            self.is_encoder_decoder = args.is_encoder_decoder
+
+        if self.is_encoder_decoder:
+            self.decoder_start_token_id = model.config.decoder_start_token_id
+            self.pad_token_id = model.config.pad_token_id
+
+        if processing_class is None:
+            raise ValueError("processing_class must be specified to tokenize a CPO dataset.")
+        if args.max_length is None:
+            logger.warning(
+                "`max_length` is not set in the CPOConfig's init"
+                " it will default to `512` by default, but you should do it yourself in the future.",
+            )
+            max_length = 512
+        else:
+            max_length = args.max_length
+        if args.max_prompt_length is None:
+            logger.warning(
+                "`max_prompt_length` is not set in the CPOConfig's init"
+                " it will default to `128` by default, but you should do it yourself in the future.",
+            )
+            max_prompt_length = 128
+        else:
+            max_prompt_length = args.max_prompt_length
+
+        if not max_prompt_length < max_length:
+            raise ValueError(
+                f"max_prompt_length ({max_prompt_length}) should be strictly less than max_length ({max_length})."
+            )
+
+        if args.max_completion_length is None and self.is_encoder_decoder:
+            logger.warning(
+                "When using an encoder decoder architecture, you should set `max_completion_length` in the CPOConfig's init"
+                " it will default to `128` by default, but you should do it yourself in the future.",
+            )
+            max_completion_length = 128
+        else:
+            max_completion_length = args.max_completion_length
+
+        if data_collator is None:
+            data_collator = DPODataCollatorWithPadding(
+                pad_token_id=processing_class.pad_token_id,
+                label_pad_token_id=args.label_pad_token_id,
+                is_encoder_decoder=self.is_encoder_decoder,
+            )
+
+            if args.remove_unused_columns:
+                args.remove_unused_columns = False
+                # warn users
+                logger.warning(
+                    "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments"
+                    " we have set it for you, but you should do it yourself in the future.",
+                )
+
+            self.use_dpo_data_collator = True
+        else:
+            self.use_dpo_data_collator = False
+
+        # Disable dropout in the model
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+
+        self.max_length = max_length
+        self.generate_during_eval = args.generate_during_eval
+        self.label_pad_token_id = args.label_pad_token_id
+        self.padding_value = args.padding_value if args.padding_value is not None else processing_class.pad_token_id
+        self.max_prompt_length = max_prompt_length
+        self.truncation_mode = args.truncation_mode
+        self.max_completion_length = max_completion_length
+        self.processing_class = processing_class
+
+        if args.loss_type in ["hinge", "ipo"] and args.label_smoothing > 0:
+            logger.warning(
+                f"You are using the {args.loss_type} loss type that does not support label smoothing. The "
+                "`label_smoothing` parameter will be ignored. Set `label_smoothing` to `0.0` to remove this warning.",
+            )
+        if args.loss_type == "kto_pair":
+            raise ValueError("Support for kto_pair has been removed in CPOTrainer. Please use KTOTrainer.")
+
+        self.beta = args.beta
+        self.label_smoothing = args.label_smoothing
+        self.loss_type = args.loss_type
+        self.cpo_alpha = args.cpo_alpha
+        self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
+        self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0)
+        if self.aux_loss_enabled and self.aux_loss_coef == 0.0:
+            logger.warning(
+                "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to "
+                "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value "
+                "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary "
+                "loss.",
+            )
+
+        if args.loss_type == "simpo":
+            self.simpo_gamma = args.simpo_gamma
+
+        # AlphaPO parameter for reward shaping
+        self.alpha = args.alpha
+
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in CPO, the sampled data does not include the
+        # "input_ids" key. Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and
+        # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens
+        # of the input, floating-point operations will not be computed." To suppress this warning, we set the
+        # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate
+        # that the warning has already been issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        # Compute that only on the main process for faster data processing.
+        # see: https://github.com/huggingface/trl/pull/1255
+        with PartialState().main_process_first():
+            # Extract the prompt if needed, and apply the chat template if needed
+            train_dataset = train_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc)
+            train_dataset = train_dataset.map(
+                maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}, num_proc=args.dataset_num_proc
+            )
+            if eval_dataset is not None:
+                eval_dataset = eval_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc)
+                eval_dataset = eval_dataset.map(
+                    maybe_apply_chat_template,
+                    fn_kwargs={"tokenizer": processing_class},
+                    num_proc=args.dataset_num_proc,
+                )
+
+            # tokenize the dataset
+            train_dataset = train_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc)
+            if eval_dataset is not None:
+                eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc)
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            model_init=model_init,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
+        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
+        # self.model_accepts_loss_kwargs to False to enable scaling.
+        self.model_accepts_loss_kwargs = False
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        if not hasattr(self, "accelerator"):
+            raise AttributeError(
+                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
+            )
+
+    def build_tokenized_answer(self, prompt, answer):
+        """
+        Llama tokenizer does satisfy `enc(a + b) = enc(a) + enc(b)`. It does ensure `enc(a + b) = enc(a) + enc(a +
+        b)[len(enc(a)):]`. Reference:
+            https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
+        """
+
+        full_tokenized = self.processing_class(prompt + answer, add_special_tokens=False)
+        prompt_input_ids = self.processing_class(prompt, add_special_tokens=False)["input_ids"]
+
+        answer_input_ids = full_tokenized["input_ids"][len(prompt_input_ids) :]
+        answer_attention_mask = full_tokenized["attention_mask"][len(prompt_input_ids) :]
+
+        # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]`
+        full_concat_input_ids = np.concatenate([prompt_input_ids, answer_input_ids])
+
+        # Prepare input tokens for token by token comparison
+        full_input_ids = np.array(full_tokenized["input_ids"])
+
+        if len(full_input_ids) != len(full_concat_input_ids):
+            raise ValueError("Prompt input ids and answer input ids should have the same length.")
+
+        # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens
+        # can be merged together when tokenizing prompt+answer. This could result
+        # on the last token from the prompt being different when tokenized on its own
+        # vs when done as prompt+answer.
+        response_token_ids_start_idx = len(prompt_input_ids)
+
+        # If tokenized prompt is different than both prompt+answer, then it means the
+        # last token has changed due to merging.
+        if prompt_input_ids != full_tokenized["input_ids"][:response_token_ids_start_idx]:
+            response_token_ids_start_idx -= 1
+
+        prompt_input_ids = full_tokenized["input_ids"][:response_token_ids_start_idx]
+        prompt_attention_mask = full_tokenized["attention_mask"][:response_token_ids_start_idx]
+
+        if len(prompt_input_ids) != len(prompt_attention_mask):
+            raise ValueError("Prompt input ids and attention mask should have the same length.")
+
+        answer_input_ids = full_tokenized["input_ids"][response_token_ids_start_idx:]
+        answer_attention_mask = full_tokenized["attention_mask"][response_token_ids_start_idx:]
+
+        return dict(
+            prompt_input_ids=prompt_input_ids,
+            prompt_attention_mask=prompt_attention_mask,
+            input_ids=answer_input_ids,
+            attention_mask=answer_attention_mask,
+        )
+
+    def tokenize_row(self, feature, model: PreTrainedModel | nn.Module | None = None) -> dict:
+        """Tokenize a single row from a CPO specific dataset.
+
+        At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
+        chosen or prompt + rejected responses is/are too long. First we truncate the prompt; if we're still too long,
+        we truncate the chosen/rejected.
+
+        We also create the labels for the chosen/rejected responses, which are of length equal to the sum of the length
+        of the prompt and the chosen/rejected response, with label_pad_token_id for the prompt tokens.
+        """
+        batch = {}
+        prompt = feature["prompt"]
+        chosen = feature["chosen"]
+        rejected = feature["rejected"]
+
+        if not self.is_encoder_decoder:
+            # Check issues below for more details
+            #  1. https://github.com/huggingface/trl/issues/907
+            #  2. https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
+            #  3. https://github.com/LianjiaTech/BELLE/issues/337
+
+            if not isinstance(prompt, str):
+                raise ValueError(f"prompt should be an str but got {type(prompt)}")
+            prompt_tokens = self.processing_class(prompt, add_special_tokens=False)
+            prompt_tokens = {f"prompt_{k}": v for k, v in prompt_tokens.items()}
+
+            if not isinstance(chosen, str):
+                raise ValueError(f"chosen should be an str but got {type(chosen)}")
+            chosen_tokens = self.build_tokenized_answer(prompt, chosen)
+
+            if not isinstance(rejected, str):
+                raise ValueError(f"rejected should be an str but got {type(rejected)}")
+            rejected_tokens = self.build_tokenized_answer(prompt, rejected)
+
+            # Last prompt token might get merged by tokenizer and
+            # it should not be included for generation if that happens
+            prompt_len_input_ids = len(prompt_tokens["prompt_input_ids"])
+
+            chosen_prompt_len_input_ids = len(chosen_tokens["prompt_input_ids"])
+            rejected_prompt_len_input_ids = len(rejected_tokens["prompt_input_ids"])
+            prompt_len_input_ids = min(chosen_prompt_len_input_ids, rejected_prompt_len_input_ids)
+
+            for k, v in prompt_tokens.items():
+                prompt_tokens[k] = v[:prompt_len_input_ids]
+
+            # Make sure prompts only have one different token at most an
+            # and length only differs by 1 at most
+            num_diff_tokens = sum(
+                a != b
+                for a, b in zip(chosen_tokens["prompt_input_ids"], rejected_tokens["prompt_input_ids"], strict=True)
+            )
+            num_diff_len = abs(chosen_prompt_len_input_ids - rejected_prompt_len_input_ids)
+            if num_diff_tokens > 1 or num_diff_len > 1:
+                raise ValueError(
+                    "Chosen and rejected prompt_input_ids might only differ on the "
+                    "last token due to tokenizer merge ops."
+                )
+
+            # add BOS token to head of prompt. Avoid adding if it's already there
+            prompt_tokens, chosen_tokens, rejected_tokens = add_bos_token_if_needed(
+                self.processing_class.bos_token_id,
+                prompt_len_input_ids,
+                prompt_tokens,
+                chosen_prompt_len_input_ids,
+                chosen_tokens,
+                rejected_prompt_len_input_ids,
+                rejected_tokens,
+            )
+
+            # add EOS token to end of answer. Avoid adding if it's already there
+            chosen_tokens, rejected_tokens = add_eos_token_if_needed(
+                self.processing_class.eos_token_id, chosen_tokens, rejected_tokens
+            )
+
+            longer_response_length = max(len(chosen_tokens["input_ids"]), len(rejected_tokens["input_ids"]))
+
+            # if combined sequence is too long, truncate the prompt
+            for answer_tokens in [chosen_tokens, rejected_tokens, prompt_tokens]:
+                if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length:
+                    if self.truncation_mode == "keep_start":
+                        for k in ["prompt_input_ids", "prompt_attention_mask"]:
+                            answer_tokens[k] = answer_tokens[k][: self.max_prompt_length]
+                    elif self.truncation_mode == "keep_end":
+                        for k in ["prompt_input_ids", "prompt_attention_mask"]:
+                            answer_tokens[k] = answer_tokens[k][-self.max_prompt_length :]
+                    else:
+                        raise ValueError(f"Unknown truncation mode: {self.truncation_mode}")
+
+            # if that's still too long, truncate the response
+            for answer_tokens in [chosen_tokens, rejected_tokens]:
+                if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length:
+                    for k in ["input_ids", "attention_mask"]:
+                        answer_tokens[k] = answer_tokens[k][: self.max_length - self.max_prompt_length]
+
+            # Create labels
+            chosen_sequence_tokens = {
+                k: chosen_tokens[f"prompt_{k}"] + chosen_tokens[k] for k in ["input_ids", "attention_mask"]
+            }
+            rejected_sequence_tokens = {
+                k: rejected_tokens[f"prompt_{k}"] + rejected_tokens[k] for k in ["input_ids", "attention_mask"]
+            }
+            chosen_sequence_tokens["labels"] = chosen_sequence_tokens["input_ids"][:]
+            chosen_sequence_tokens["labels"][: len(chosen_tokens["prompt_input_ids"])] = [
+                self.label_pad_token_id
+            ] * len(chosen_tokens["prompt_input_ids"])
+            rejected_sequence_tokens["labels"] = rejected_sequence_tokens["input_ids"][:]
+            rejected_sequence_tokens["labels"][: len(rejected_tokens["prompt_input_ids"])] = [
+                self.label_pad_token_id
+            ] * len(rejected_tokens["prompt_input_ids"])
+
+            for k, toks in {
+                "chosen_": chosen_sequence_tokens,
+                "rejected_": rejected_sequence_tokens,
+                "": prompt_tokens,
+            }.items():
+                for type_key, tokens in toks.items():
+                    if type_key == "token_type_ids":
+                        continue
+                    batch[f"{k}{type_key}"] = tokens
+
+        else:
+            chosen_tokens = self.processing_class(
+                chosen, truncation=True, max_length=self.max_completion_length, add_special_tokens=True
+            )
+            rejected_tokens = self.processing_class(
+                rejected, truncation=True, max_length=self.max_completion_length, add_special_tokens=True
+            )
+            prompt_tokens = self.processing_class(
+                prompt, truncation=True, max_length=self.max_prompt_length, add_special_tokens=True
+            )
+
+            batch["chosen_labels"] = chosen_tokens["input_ids"]
+            batch["rejected_labels"] = rejected_tokens["input_ids"]
+            batch["prompt_input_ids"] = prompt_tokens["input_ids"]
+            batch["prompt_attention_mask"] = prompt_tokens["attention_mask"]
+
+            if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"):
+                batch["rejected_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
+                    labels=torch.tensor(batch["rejected_labels"])
+                )
+                batch["chosen_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
+                    labels=torch.tensor(batch["chosen_labels"])
+                )
+
+        return batch
+
+    @staticmethod
+    def concatenated_inputs(
+        batch: dict[str, list | torch.LongTensor],
+        is_encoder_decoder: bool = False,
+        label_pad_token_id: int = -100,
+        padding_value: int = 0,
+        device: torch.device | None = None,
+    ) -> dict[str, torch.LongTensor]:
+        """Concatenate the chosen and rejected inputs into a single tensor.
+
+        Args:
+            batch:
+                A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors
+                of shape (batch_size, sequence_length).
+            is_encoder_decoder:
+                Whether the model is an encoder-decoder model.
+            label_pad_token_id:
+                The label pad token id.
+            padding_value:
+                The padding value to use for the concatenated inputs_ids.
+            device:
+                The device for the concatenated inputs.
+
+        Returns:
+            A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'.
+        """
+        concatenated_batch = {}
+
+        if is_encoder_decoder:
+            max_length = max(batch["chosen_labels"].shape[1], batch["rejected_labels"].shape[1])
+        else:
+            max_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1])
+
+        for k in batch:
+            if k.startswith("chosen") and isinstance(batch[k], torch.Tensor):
+                if "labels" in k or is_encoder_decoder:
+                    pad_value = label_pad_token_id
+                elif k.endswith("_input_ids"):
+                    pad_value = padding_value
+                elif k.endswith("_attention_mask"):
+                    pad_value = 0
+                concatenated_key = k.replace("chosen", "concatenated")
+                concatenated_batch[concatenated_key] = pad_to_length(batch[k], max_length, pad_value=pad_value)
+        for k in batch:
+            if k.startswith("rejected") and isinstance(batch[k], torch.Tensor):
+                if "labels" in k or is_encoder_decoder:
+                    pad_value = label_pad_token_id
+                elif k.endswith("_input_ids"):
+                    pad_value = padding_value
+                elif k.endswith("_attention_mask"):
+                    pad_value = 0
+                concatenated_key = k.replace("rejected", "concatenated")
+                concatenated_batch[concatenated_key] = torch.cat(
+                    (
+                        concatenated_batch[concatenated_key],
+                        pad_to_length(batch[k], max_length, pad_value=pad_value),
+                    ),
+                    dim=0,
+                ).to(device=device)
+
+        if is_encoder_decoder:
+            concatenated_batch["concatenated_input_ids"] = batch["prompt_input_ids"].repeat(2, 1).to(device=device)
+            concatenated_batch["concatenated_attention_mask"] = (
+                batch["prompt_attention_mask"].repeat(2, 1).to(device=device)
+            )
+
+        return concatenated_batch
+
+    def cpo_loss(
+        self,
+        policy_chosen_logps: torch.FloatTensor,
+        policy_rejected_logps: torch.FloatTensor,
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+        """Compute the CPO loss for a batch of policy and reference model log probabilities.
+
+        Args:
+            policy_chosen_logps:
+                Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
+            policy_rejected_logps:
+                Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
+
+        Returns:
+            A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). The losses tensor contains the CPO
+            loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for
+            the chosen and rejected responses, respectively.
+        """
+        # Apply AlphaPO reward transformation if alpha != 0
+        if self.alpha != 0.0:
+            # Compute probabilities
+            chosen_probs = torch.exp(policy_chosen_logps)
+            rejected_probs = torch.exp(policy_rejected_logps)
+
+            # Apply AlphaPO transformation: r = (1 - p^(-alpha)) / alpha
+            policy_chosen_rewards = (1 - chosen_probs.pow(-self.alpha)) / self.alpha
+            policy_rejected_rewards = (1 - rejected_probs.pow(-self.alpha)) / self.alpha
+
+            logits = (policy_chosen_rewards - policy_rejected_rewards).to(self.accelerator.device)
+        else:
+            # Standard log probability rewards when alpha = 0
+            logits = (policy_chosen_logps - policy_rejected_logps).to(self.accelerator.device)
+
+        # The beta is a temperature parameter for the CPO loss, typically something in the range of 0.1 to 0.5.
+        # We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the labels and
+        # calculates a conservative CPO loss.
+
+        if self.loss_type == "simpo":
+            gamma_logratios = self.simpo_gamma / self.beta
+            logits = logits - gamma_logratios
+            # This reduces to Equation 3 from the CPO paper when label_smoothing -> 0.
+            losses = (
+                -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * logits) * self.label_smoothing
+            )
+        elif self.loss_type == "sigmoid":
+            # This reduces to Equation 3 from the CPO paper when label_smoothing -> 0.
+            losses = (
+                -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * logits) * self.label_smoothing
+            )
+        elif self.loss_type == "hinge":
+            losses = torch.relu(1 - self.beta * logits)
+        elif self.loss_type == "ipo":
+            # eqn (17) of the paper where beta is the regularization parameter for the IPO loss, denoted by tau in the paper.
+            losses = (logits - 1 / (2 * self.beta)) ** 2
+        else:
+            raise ValueError(
+                f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'simpo']"
+            )
+
+        # Calculate rewards for logging
+        if self.alpha != 0.0:
+            # When using AlphaPO transformation, use the transformed rewards
+            chosen_rewards = self.beta * policy_chosen_rewards.to(self.accelerator.device).detach()
+            rejected_rewards = self.beta * policy_rejected_rewards.to(self.accelerator.device).detach()
+        else:
+            # Standard log probability rewards
+            chosen_rewards = self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach()
+            rejected_rewards = self.beta * (policy_rejected_logps.to(self.accelerator.device)).detach()
+
+        return losses, chosen_rewards, rejected_rewards
+
+    @staticmethod
+    def get_batch_logps(
+        logits: torch.FloatTensor,
+        labels: torch.LongTensor,
+        average_log_prob: bool = False,
+        label_pad_token_id: int = -100,
+        is_encoder_decoder: bool = False,
+    ) -> torch.FloatTensor:
+        """Compute the log probabilities of the given labels under the given logits.
+
+        Args:
+            logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
+            labels:
+                Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are
+                ignored. Shape: (batch_size, sequence_length)
+            average_log_prob:
+                If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the
+                log probabilities of the (non-masked) tokens.
+            label_pad_token_id: The label pad token id.
+            is_encoder_decoder: Whether the model is an encoder-decoder model.
+
+        Returns:
+            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the
+            given logits.
+        """
+        if logits.shape[:-1] != labels.shape:
+            raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.")
+
+        if not is_encoder_decoder:
+            labels = labels[:, 1:].clone()
+            logits = logits[:, :-1, :]
+        loss_mask = labels != label_pad_token_id
+
+        # dummy token; we'll ignore the losses on these tokens later
+        labels[labels == label_pad_token_id] = 0
+
+        per_token_logps = selective_log_softmax(logits, labels)
+
+        if average_log_prob:
+            return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+        else:
+            return (per_token_logps * loss_mask).sum(-1)
+
+    def concatenated_forward(
+        self, model: nn.Module, batch: dict[str, list | torch.LongTensor]
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+        """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
+
+        We do this to avoid doing two forward passes, because it's faster for FSDP.
+        """
+        concatenated_batch = self.concatenated_inputs(
+            batch,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+            padding_value=self.padding_value,
+            device=self.accelerator.device,
+        )
+        len_chosen = batch["chosen_labels"].shape[0]
+
+        model_kwargs = (
+            {
+                "decoder_input_ids": self._shift_right(concatenated_batch["concatenated_labels"]),
+            }
+            if self.is_encoder_decoder
+            else {}
+        )
+
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        outputs = model(
+            concatenated_batch["concatenated_input_ids"],
+            attention_mask=concatenated_batch["concatenated_attention_mask"],
+            use_cache=False,
+            **model_kwargs,
+        )
+        all_logits = outputs.logits
+
+        def cross_entropy_loss(logits, labels):
+            if not self.is_encoder_decoder:
+                # Shift so that tokens < n predict n
+                logits = logits[..., :-1, :].contiguous()
+                labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            logits = logits.view(-1, logits.shape[-1])
+            labels = labels.view(-1)
+            # Enable model parallelism
+            labels = labels.to(logits.device)
+            loss = loss_fct(logits, labels)
+            return loss
+
+        labels = concatenated_batch["concatenated_labels"].clone()
+
+        if self.cpo_alpha == 0:
+            nll_loss = torch.tensor(0.0).to(self.accelerator.device)
+        else:
+            nll_loss = cross_entropy_loss(all_logits[:len_chosen], labels[:len_chosen])
+
+        all_logps = self.get_batch_logps(
+            all_logits,
+            concatenated_batch["concatenated_labels"],
+            average_log_prob=self.loss_type in ["ipo", "simpo"],
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+        )
+
+        chosen_logps = all_logps[:len_chosen]
+        rejected_logps = all_logps[len_chosen:]
+
+        chosen_logits = all_logits[:len_chosen]
+        rejected_logits = all_logits[len_chosen:]
+
+        if self.aux_loss_enabled:
+            return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, nll_loss, outputs.aux_loss)
+
+        return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, nll_loss)
+
+    def get_batch_loss_metrics(
+        self,
+        model,
+        batch: dict[str, list | torch.LongTensor],
+        train_eval: Literal["train", "eval"] = "train",
+    ):
+        """Compute the CPO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}
+
+        forward_output = self.concatenated_forward(model, batch)
+        (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits,
+            policy_rejected_logits,
+            policy_nll_loss,
+        ) = forward_output[:5]
+        if self.aux_loss_enabled:
+            aux_loss = forward_output[5]
+
+        losses, chosen_rewards, rejected_rewards = self.cpo_loss(
+            policy_chosen_logps,
+            policy_rejected_logps,
+        )
+
+        loss = losses.mean() + self.cpo_alpha * policy_nll_loss
+        reward_accuracies = (chosen_rewards > rejected_rewards).float()
+
+        prefix = "eval_" if train_eval == "eval" else ""
+        metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(chosen_rewards).mean().item()
+        metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(rejected_rewards).mean().item()
+        metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics(reward_accuracies).mean().item()
+        metrics[f"{prefix}rewards/margins"] = (
+            self.accelerator.gather_for_metrics(chosen_rewards - rejected_rewards).mean().item()
+        )
+        metrics[f"{prefix}logps/rejected"] = (
+            self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean().item()
+        )
+        metrics[f"{prefix}logps/chosen"] = (
+            self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean().item()
+        )
+        metrics[f"{prefix}logits/rejected"] = (
+            self.accelerator.gather_for_metrics(policy_rejected_logits.detach().mean()).mean().item()
+        )
+        metrics[f"{prefix}logits/chosen"] = (
+            self.accelerator.gather_for_metrics(policy_chosen_logits.detach().mean()).mean().item()
+        )
+        metrics[f"{prefix}nll_loss"] = self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean().item()
+
+        if self.aux_loss_enabled:
+            loss += self.aux_loss_coef * aux_loss
+
+        return loss, metrics
+
+    def compute_loss(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        return_outputs=False,
+        num_items_in_batch=None,
+    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
+        compute_loss_context_manager = (
+            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
+        )
+
+        with compute_loss_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
+
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="train")
+
+        if return_outputs:
+            return (loss, metrics)
+        return loss
+
+    def generate_from_model(self, model, batch: dict[str, torch.LongTensor]) -> str:
+        """Generate samples from the model and reference model for the given batch of inputs."""
+
+        # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
+        # the torch amp context manager as some hidden states are silently casted to full precision.
+        generate_context_manager = (
+            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
+        )
+
+        with generate_context_manager:
+            policy_output = model.generate(
+                input_ids=batch["prompt_input_ids"],
+                attention_mask=batch["prompt_attention_mask"],
+                max_length=self.max_length,
+                do_sample=True,
+                pad_token_id=self.processing_class.pad_token_id,
+            )
+
+        policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id)
+        policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True)
+
+        return policy_output_decoded
+
+    def prediction_step(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        prediction_loss_only: bool,
+        ignore_keys: list[str] | None = None,
+    ):
+        if ignore_keys is None:
+            if hasattr(model, "config"):
+                ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
+            else:
+                ignore_keys = []
+
+        prediction_context_manager = (
+            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
+        )
+
+        with torch.no_grad(), prediction_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="eval")
+
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="eval")
+
+        if prediction_loss_only:
+            return (loss.detach(), None, None)
+
+        # logits for the chosen and rejected samples from model
+        logits_dict = {
+            "eval_logits/chosen": metrics["eval_logits/chosen"],
+            "eval_logits/rejected": metrics["eval_logits/rejected"],
+        }
+        logits = [v for k, v in logits_dict.items() if k not in ignore_keys]
+        logits = torch.tensor(logits, device=self.accelerator.device)
+        labels = torch.zeros(logits.shape[0], device=self.accelerator.device)
+
+        return (loss.detach(), logits, labels)
+
+    def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
+        for key, value in metrics.items():
+            self._stored_metrics[train_eval][key].append(value)
+
+    def evaluation_loop(
+        self,
+        dataloader: DataLoader,
+        description: str,
+        prediction_loss_only: bool | None = None,
+        ignore_keys: list[str] | None = None,
+        metric_key_prefix: str = "eval",
+    ) -> EvalLoopOutput:
+        """
+        Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
+        `Trainer.evaluate()` and `Trainer.predict()`.
+
+        Works both with or without labels.
+        """
+
+        # Sample and save to game log if requested (for one batch to save time)
+        if self.generate_during_eval:
+            # Generate random indices within the range of the total number of samples
+            num_samples = len(dataloader.dataset)
+            random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size)
+
+            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
+            random_batch_dataset = dataloader.dataset.select(random_indices)
+            random_batch = self.data_collator(random_batch_dataset)
+            random_batch = self._prepare_inputs(random_batch)
+
+            policy_output_decoded = self.generate_from_model(self.model, random_batch)
+
+            table = pd.DataFrame(
+                columns=["Prompt", "Policy"],
+                data=[
+                    [prompt, pol[len(prompt) :]]
+                    for prompt, pol in zip(random_batch["prompt"], policy_output_decoded, strict=True)
+                ],
+            )
+            if "wandb" in self.args.report_to:
+                wandb.log({"game_log": wandb.Table(data=table)})
+
+            if "comet_ml" in self.args.report_to:
+                log_table_to_comet_experiment(
+                    name="game_log.csv",
+                    table=table,
+                )
+
+        # Base evaluation
+        initial_output = super().evaluation_loop(
+            dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix
+        )
+
+        return initial_output
+
+    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
+        """
+        Log `logs` on the various objects watching training, including stored metrics.
+
+        Args:
+            logs (`dict[str, float]`):
+                The values to log.
+            start_time (`float`, *optional*):
+                Start time of the training.
+        """
+        # logs either has 'loss' or 'eval_loss'
+        train_eval = "train" if "loss" in logs else "eval"
+        # Add averaged stored metrics to logs
+        for key, metrics in self._stored_metrics[train_eval].items():
+            logs[key] = torch.tensor(metrics).mean().item()
+        del self._stored_metrics[train_eval]
+        return super().log(logs, start_time)
+
+    def _shift_right(self, input_ids):
+        if self.decoder_start_token_id is None:
+            raise ValueError(
+                "model.config.decoder_start_token_id has to be defined. It is usually set to the pad_token_id."
+            )
+
+        # shift inputs to the right
+        if is_torch_fx_proxy(input_ids):
+            # Item assignment is not supported natively for proxies.
+            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), self.decoder_start_token_id)
+            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
+        else:
+            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+            shifted_input_ids[..., 0] = self.decoder_start_token_id
+
+        if self.pad_token_id is None:
+            raise ValueError("model.config.pad_token_id has to be defined.")
+        # replace possible -100 values in labels by `pad_token_id`
+        shifted_input_ids.masked_fill_(shifted_input_ids == -100, self.pad_token_id)
+
+        return shifted_input_ids
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/trl/experimental/gfpo/gfpo_trainer.py b/trl/experimental/gfpo/gfpo_trainer.py
index 37de2b8442b..44dcc548de1 100644
--- a/trl/experimental/gfpo/gfpo_trainer.py
+++ b/trl/experimental/gfpo/gfpo_trainer.py
@@ -206,6 +206,9 @@ def _generate_and_score_completions(self, inputs):
             completions = []
             for prompt, completion in zip(prompts, completions_text, strict=True):
                 bootstrap = prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
+                if isinstance(bootstrap, list):  # for VLM, the format might be [{"type": "text", "text": "..."}]
+                    assert len(bootstrap) == 1 and bootstrap[0]["type"] == "text"
+                    bootstrap = bootstrap[0]["text"]
                 completions.append([{"role": "assistant", "content": bootstrap + completion}])
         else:
             completions = completions_text
diff --git a/trl/experimental/gkd/__init__.py b/trl/experimental/gkd/__init__.py
new file mode 100644
index 00000000000..b2a869b8595
--- /dev/null
+++ b/trl/experimental/gkd/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .gkd_config import GKDConfig
+from .gkd_trainer import GKDTrainer
+
+
+__all__ = ["GKDConfig", "GKDTrainer"]
diff --git a/trl/experimental/gkd/gkd_config.py b/trl/experimental/gkd/gkd_config.py
new file mode 100644
index 00000000000..691aa4fdc5b
--- /dev/null
+++ b/trl/experimental/gkd/gkd_config.py
@@ -0,0 +1,112 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import TrainingArguments
+
+from ...trainer.sft_config import SFTConfig
+
+
+@dataclass
+class GKDConfig(SFTConfig):
+    """
+    Configuration class for [`experimental.gkd.GKDTrainer`].
+
+    This class includes only the parameters that are specific to GKD training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] and [`SFTConfig`] documentation.
+
+    Args:
+        temperature (`float`, *optional*, defaults to `0.9`):
+            Temperature for sampling. The higher the temperature, the more random the completions.
+        lmbda (`float`, *optional*, defaults to `0.5`):
+            Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy
+            student-generated outputs).
+        beta (`float`, *optional*, defaults to `0.5`):
+            Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence loss. When
+            beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL Divergence.
+        max_new_tokens (`int`, *optional*, defaults to `128`):
+            Maximum number of tokens to generate per completion.
+        teacher_model_name_or_path (`str`, *optional*):
+            Model name or path of the teacher model. If `None`, the teacher model will be the same as the model being
+            trained.
+        teacher_model_init_kwargs (`dict[str, Any]]`, *optional*):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
+            from a string.
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model.
+        seq_kd (`bool`, *optional*, defaults to `False`):
+            Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised FT on
+            teacher-generated output).
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["teacher_model_init_kwargs"]
+
+    temperature: float = field(
+        default=0.9,
+        metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."},
+    )
+    lmbda: float = field(
+        default=0.5,
+        metadata={
+            "help": "Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy "
+            "student-generated outputs)."
+        },
+    )
+    beta: float = field(
+        default=0.5,
+        metadata={
+            "help": "Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence "
+            "loss. When beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL "
+            "Divergence."
+        },
+    )
+    max_new_tokens: int = field(
+        default=128,
+        metadata={"help": "Maximum number of tokens to generate per completion."},
+    )
+    teacher_model_name_or_path: str | None = field(
+        default=None,
+        metadata={
+            "help": "Model name or path of the teacher model. If `None`, the teacher model will be the same as the "
+            "model being trained."
+        },
+    )
+    teacher_model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
+            "teacher model from a string."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropouts in `model`."},
+    )
+    seq_kd: bool = field(
+        default=False,
+        metadata={
+            "help": "Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised "
+            "FT on teacher-generated output)."
+        },
+    )
+
+    def __post_init__(self):
+        super().__post_init__()
+        # check lmbda and beta are in the range [0, 1]
+        if self.lmbda < 0.0 or self.lmbda > 1.0:
+            raise ValueError("lmbda must be in the range [0.0, 1.0].")
+        if self.beta < 0.0 or self.beta > 1.0:
+            raise ValueError("beta must be in the range [0.0, 1.0].")
diff --git a/trl/experimental/gkd/gkd_trainer.py b/trl/experimental/gkd/gkd_trainer.py
new file mode 100644
index 00000000000..eaee5e86dd8
--- /dev/null
+++ b/trl/experimental/gkd/gkd_trainer.py
@@ -0,0 +1,440 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import textwrap
+from collections.abc import Callable
+from typing import Any
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from datasets import Dataset
+from transformers import (
+    AutoModelForCausalLM,
+    BaseImageProcessor,
+    DataCollator,
+    FeatureExtractionMixin,
+    GenerationConfig,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+)
+from transformers.trainer_callback import TrainerCallback
+from transformers.trainer_utils import EvalPrediction
+from transformers.utils import is_liger_kernel_available, is_peft_available
+
+from ...models import prepare_deepspeed
+from ...models.utils import unwrap_model_for_generation
+from ...trainer.sft_trainer import SFTTrainer
+from ...trainer.utils import DataCollatorForChatML, disable_dropout_in_model, empty_cache
+from .gkd_config import GKDConfig
+
+
+if is_peft_available():
+    from peft import PeftConfig
+
+if is_liger_kernel_available():
+    from liger_kernel.chunked_loss import LigerFusedLinearJSDLoss
+
+
+class GKDTrainer(SFTTrainer):
+    """Trainer for Generalized Knowledge Distillation (GKD) of language models.
+
+    For details on GKD, see the paper: [On-Policy Distillation of Language Models: Learning from Self-Generated
+    Mistakes](https://huggingface.co/papers/2306.13649).
+
+    Args:
+        model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
+            Model to be trained, or the string identifier of the model to be instantiated from a pretrained model.
+        teacher_model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
+            Teacher model for knowledge distillation, or the string identifier of the model to be instantiated from a
+            pretrained model.
+        args ([`experimental.gkd.GKDConfig`], *optional*):
+            Training arguments.
+        data_collator ([`~transformers.DataCollator`], *optional*):
+            Data collator to batch samples from the dataset. It defaults to a [`DataCollatorForChatML`] using the
+            `processing_class`.
+        train_dataset ([`~datasets.Dataset`], *optional*):
+            Dataset for training.
+        eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
+            Dataset for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+           Class to process the data.
+        compute_metrics (`Callable`, *optional*):
+            Function to compute metrics at evaluation. Must take in an [`~transformers.EvalPrediction`] and return a
+            dictionary string to float.
+        callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
+            Callbacks to use during training.
+        optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
+            Tuple containing the optimizer and the learning rate scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable`, *optional*):
+            Function to preprocess the logits before computing the metrics. Must take in the `logits` and `labels` and
+            return the logits to be used for metrics computation.
+        peft_config ([`~peft.PeftConfig`], *optional*):
+            PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the `model` will be
+            wrapped with the specified PEFT adapter.
+        formatting_func (`Callable`, *optional*):
+            Function to format the dataset. Must take in an example and return an example.
+    """
+
+    _tag_names = ["trl", "gkd"]
+    _name = "GKD"
+    _paper = {
+        "title": "On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes",
+        "id": "2306.13649",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @inproceedings{agarwal2024on-policy,
+                title        = {{On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes}},
+                author       = {Rishabh Agarwal and Nino Vieillard and Yongchao Zhou and Piotr Stanczyk and Sabela Ramos Garea and Matthieu Geist and Olivier Bachem},
+                year         = 2024,
+                booktitle    = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
+                publisher    = {OpenReview.net},
+                url          = {https://openreview.net/forum?id=3zKtaqxLhW},
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module | str | None = None,
+        teacher_model: PreTrainedModel | nn.Module | str = None,
+        args: GKDConfig | None = None,
+        data_collator: DataCollator | None = None,  # type: ignore
+        train_dataset: Dataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
+        peft_config: "PeftConfig | None" = None,
+        formatting_func: Callable | None = None,
+    ):
+        # Ensure Trainer does not drop non-signature columns used by the collator (e.g., "prompts")
+        args.remove_unused_columns = False
+        # Respect a user-provided data_collator; otherwise, provide a ChatML collator that
+        if data_collator is None:
+            data_collator = DataCollatorForChatML(tokenizer=processing_class, max_length=args.max_length)
+
+        # Ensure SFTTrainer does not pre-process the dataset when using a ChatML collator,
+        # so that raw conversational fields (e.g., "messages") remain available to the collator.
+        if args.dataset_kwargs is None:
+            args.dataset_kwargs = {"skip_prepare_dataset": True}
+        else:
+            args.dataset_kwargs["skip_prepare_dataset"] = True
+
+        # Liger fused GKD loss (JSD)
+        self.use_liger_gkd_loss = False
+        if args.use_liger_kernel:
+            self.liger_jsd_loss = LigerFusedLinearJSDLoss(
+                beta=args.beta,
+                ignore_index=-100,
+                temperature=args.temperature,
+                compiled=False,
+            )
+            self.use_liger_gkd_loss = True
+
+        super().__init__(
+            model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+            peft_config=peft_config,
+            formatting_func=formatting_func,
+        )
+
+        if args.teacher_model_init_kwargs is None:
+            teacher_model_init_kwargs = {}
+        elif not isinstance(teacher_model, str):
+            raise ValueError(
+                "You passed teacher_model_init_kwargs to the GKDConfig, but your teacher_model is already instantiated."
+            )
+        else:
+            teacher_model_init_kwargs = args.teacher_model_init_kwargs
+            teacher_model_init_kwargs["dtype"] = (
+                teacher_model_init_kwargs["dtype"]
+                if teacher_model_init_kwargs["dtype"] in ["auto", None]
+                else getattr(torch, teacher_model_init_kwargs["dtype"])
+            )
+
+        if isinstance(teacher_model, str):
+            teacher_model = AutoModelForCausalLM.from_pretrained(teacher_model, **teacher_model_init_kwargs)
+
+        # Disable dropout in the model
+        if args.disable_dropout:
+            disable_dropout_in_model(self.model)
+
+        if self.is_deepspeed_enabled:
+            self.teacher_model = prepare_deepspeed(teacher_model, self.accelerator)
+        else:
+            self.teacher_model = self.accelerator.prepare_model(teacher_model, evaluation_mode=True)
+
+        self.lmbda = args.lmbda
+        self.beta = args.beta
+        self.temperature = args.temperature
+        self.seq_kd = args.seq_kd
+
+        self.generation_config = GenerationConfig(
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            do_sample=True,
+            top_k=0,
+            use_cache=False if args.gradient_checkpointing else True,
+            pad_token_id=self.processing_class.pad_token_id,
+        )
+        # Set custom EOS tokens if they are specified by the model's generation
+        # config. This is important for models with the Llama 3 chat template,
+        # which use special tokens <|eot_id|> and <|eom_id|> to mark the end of
+        # turns or messages.
+        if (
+            hasattr(self.model.generation_config, "eos_token_id")
+            and self.model.generation_config.eos_token_id is not None
+        ):
+            self.generation_config.eos_token_id = self.model.generation_config.eos_token_id
+
+    @staticmethod
+    def generalized_jsd_loss(
+        student_logits, teacher_logits, labels=None, beta=0.5, temperature=1.0, reduction="batchmean"
+    ):
+        """
+        Compute the generalized Jensen-Shannon Divergence loss for knowledge distillation using F.kl_div. See Eq. (1)
+        of https://huggingface.co/papers/2306.13649 for the definition.
+
+        Args:
+            student_logits:
+                Tensor of shape (batch_size, sequence_length, vocab_size)
+            teacher_logits:
+                Tensor of shape (batch_size, sequence_length, vocab_size)
+            labels:
+                Tensor of shape (batch_size, sequence_length) with -100 for padding tokens to ignore when computing
+                loss
+            beta:
+                Interpolation coefficient between 0 and 1 (default: 0.5)
+            temperature:
+                Softmax temperature (default: 1.0)
+            reduction:
+                Specifies the reduction to apply to the output (default: 'batchmean')
+
+        Returns:
+            loss: Scalar tensor with the generalized JSD loss
+        """
+
+        # Apply temperature scaling
+        student_logits = student_logits / temperature
+        teacher_logits = teacher_logits / temperature
+
+        # Compute log probabilities for student and probabilities for teacher
+        student_log_probs = F.log_softmax(student_logits, dim=-1)
+        teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)
+
+        if beta == 0:
+            jsd = F.kl_div(student_log_probs, teacher_log_probs, reduction="none", log_target=True)
+        elif beta == 1:
+            jsd = F.kl_div(teacher_log_probs, student_log_probs, reduction="none", log_target=True)
+        else:
+            # Compute the log of the mixture distribution
+            # log(a + b) = log(exp(log(a)) + exp(log(b))) -> for mixture
+            beta = torch.tensor(beta, dtype=student_log_probs.dtype)
+            mixture_log_probs = torch.logsumexp(
+                torch.stack([student_log_probs + torch.log(1 - beta), teacher_log_probs + torch.log(beta)]),
+                dim=0,
+            )
+
+            # Compute KL divergences using F.kl_div
+            # PyTorch differs from the standard mathematical definition, so the order of the probability distributions is swapped compared to that defined in the paper.
+            kl_teacher = F.kl_div(mixture_log_probs, teacher_log_probs, reduction="none", log_target=True)
+            kl_student = F.kl_div(mixture_log_probs, student_log_probs, reduction="none", log_target=True)
+
+            # Compute the Generalized Jensen-Shannon Divergence
+            jsd = beta * kl_teacher + (1 - beta) * kl_student
+
+        # Masking
+        if labels is not None:
+            mask = labels != -100
+            jsd = jsd[mask]
+
+        # Apply reduction
+        if reduction == "batchmean":
+            return jsd.sum() / mask.sum() if labels is not None else jsd.sum() / jsd.size(0)
+        elif reduction == "sum":
+            return jsd.sum()
+        elif reduction == "mean":
+            return jsd.mean()
+        else:
+            return jsd
+
+    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
+        if self.use_liger_gkd_loss:
+            # Forward only through the base models (avoid lm_head to save memory)
+            unwrapped_student = self.accelerator.unwrap_model(model)
+            if hasattr(unwrapped_student, "get_decoder") and unwrapped_student.get_decoder() is not None:
+                base_student = unwrapped_student.get_decoder()
+            else:
+                base_student = getattr(
+                    unwrapped_student, getattr(unwrapped_student, "base_model_prefix", "model"), unwrapped_student
+                )
+
+            student_outputs = base_student(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                use_cache=False,
+            )
+
+            self.teacher_model.eval()
+            unwrapped_teacher = self.accelerator.unwrap_model(self.teacher_model)
+            if hasattr(unwrapped_teacher, "get_decoder") and unwrapped_teacher.get_decoder() is not None:
+                base_teacher = unwrapped_teacher.get_decoder()
+            else:
+                base_teacher = getattr(
+                    unwrapped_teacher, getattr(unwrapped_teacher, "base_model_prefix", "model"), unwrapped_teacher
+                )
+            with torch.no_grad():
+                teacher_outputs = base_teacher(
+                    input_ids=inputs["input_ids"],
+                    attention_mask=inputs["attention_mask"],
+                    use_cache=False,
+                )
+
+            # hidden states (shifted)
+            student_hidden = student_outputs.last_hidden_state[:, :-1]
+            teacher_hidden = teacher_outputs.last_hidden_state[:, :-1]
+
+            # Release full outputs to free memory
+            del student_outputs, teacher_outputs
+
+            # labels mask and labels (shifted)
+            labels_mask = inputs["labels"] != -100
+            masked_input_ids = torch.where(
+                labels_mask, inputs["input_ids"], torch.full_like(inputs["input_ids"], -100)
+            )
+            true_labels = masked_input_ids[:, 1:].contiguous()
+
+            # Release intermediate tensors
+            del labels_mask, masked_input_ids
+
+            # heads
+            student_head = unwrapped_student.get_output_embeddings()
+            teacher_head = unwrapped_teacher.get_output_embeddings()
+
+            # liger fused jsd loss
+            loss = self.liger_jsd_loss(
+                student_input=student_hidden,
+                student_weight=student_head.weight,
+                teacher_input=teacher_hidden,
+                teacher_weight=teacher_head.weight,
+                true_labels=true_labels,
+                student_bias=getattr(student_head, "bias", None),
+                teacher_bias=getattr(teacher_head, "bias", None),
+            )
+
+            # Release hidden states after loss computation
+            del student_hidden, teacher_hidden, true_labels
+        else:
+            # compute student output
+            student_outputs = model(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+            )
+
+            # compute teacher output in eval mode
+            self.teacher_model.eval()
+            with torch.no_grad():
+                teacher_outputs = self.teacher_model(
+                    input_ids=inputs["input_ids"],
+                    attention_mask=inputs["attention_mask"],
+                )
+
+            # slice the logits for the generated tokens using the inputs["prompts"] lengths
+            prompt_lengths = inputs["prompts"].shape[1]
+            shifted_student_logits = student_outputs.logits[:, prompt_lengths - 1 : -1, :]
+            shifted_teacher_logits = teacher_outputs.logits[:, prompt_lengths - 1 : -1, :]
+            shifted_labels = inputs["labels"][:, prompt_lengths:]
+
+            # compute loss
+            loss = self.generalized_jsd_loss(
+                student_logits=shifted_student_logits,
+                teacher_logits=shifted_teacher_logits,
+                labels=shifted_labels,
+                beta=self.beta,
+            )
+
+        # empty cache
+        empty_cache()
+
+        # Return loss
+        return (loss, student_outputs) if return_outputs else loss
+
+    @staticmethod
+    def generate_on_policy_outputs(model, inputs, generation_config, pad_token_id=None):
+        # Generate output with respect to the prompt-only
+        generated_outputs = model.generate(
+            input_ids=inputs["prompts"],
+            attention_mask=inputs.get("prompt_attention_mask", None),
+            generation_config=generation_config,
+            return_dict_in_generate=True,
+        )
+
+        # Get the generated token IDs
+        generated_tokens = generated_outputs.sequences
+        # Calculate new attention mask
+        new_attention_mask = torch.ones_like(generated_tokens)
+        new_labels = generated_tokens.clone()
+
+        # If there's pad_token_id, set attention mask to 0 for padding tokens
+        if pad_token_id is not None:
+            new_labels[new_labels == pad_token_id] = -100
+            new_attention_mask[generated_tokens == pad_token_id] = 0
+
+        return generated_tokens, new_attention_mask, new_labels
+
+    def training_step(
+        self, model: nn.Module, inputs: dict[str, torch.Tensor | Any], num_items_in_batch: int | None = None
+    ) -> torch.Tensor:
+        """
+        Perform a training step for the Generalized Knowledge Distillation (GKD) model.
+
+        This method implements the on-policy learning approach described in the GKD paper. With probability
+        `self.lmbda`, it generates new responses using the student model, which are then used for training instead of
+        the original inputs.
+        """
+        if self.seq_kd:
+            with unwrap_model_for_generation(self.teacher_model, self.accelerator) as unwrapped_model:
+                new_input_ids, new_attention_mask, new_labels = self.generate_on_policy_outputs(
+                    unwrapped_model, inputs, self.generation_config, self.processing_class.pad_token_id
+                )
+            inputs["input_ids"] = new_input_ids
+            inputs["attention_mask"] = new_attention_mask
+            inputs["labels"] = new_labels
+        if random.random() <= self.lmbda:
+            with unwrap_model_for_generation(model, self.accelerator) as unwrapped_model:
+                new_input_ids, new_attention_mask, new_labels = self.generate_on_policy_outputs(
+                    unwrapped_model, inputs, self.generation_config, self.processing_class.pad_token_id
+                )
+            inputs["input_ids"] = new_input_ids
+            inputs["attention_mask"] = new_attention_mask
+            inputs["labels"] = new_labels
+
+        loss = super().training_step(model, inputs, num_items_in_batch)
+        return loss
diff --git a/trl/experimental/gold/gold_config.py b/trl/experimental/gold/gold_config.py
index 6c2d9c34e12..76bb23edc36 100644
--- a/trl/experimental/gold/gold_config.py
+++ b/trl/experimental/gold/gold_config.py
@@ -90,8 +90,8 @@ class GOLDConfig(SFTConfig):
             Frequency (in training steps) to synchronize student model weights to vLLM engine. Set to 1 to sync after
             every step.
         vllm_enable_sleep_mode (`bool`, *optional*, defaults to `False`):
-            Whether to enable sleep mode for the student vLLM engine. If set to `True`, the engine will enter sleep
-            mode after each training step to save resources.
+            Enable vLLM sleep mode to offload student weights/cache during the optimizer step. Keeps GPU memory usage
+            low, but waking the engine adds host–device transfer latency.
     """
 
     _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["teacher_model_init_kwargs"]
@@ -313,7 +313,8 @@ class GOLDConfig(SFTConfig):
     vllm_enable_sleep_mode: bool = field(
         default=False,
         metadata={
-            "help": "Whether to enable sleep mode for the colocated vLLM engine. When `True`, the engine sleeps during the optimizer step and wakes for weight sync and generation."
+            "help": "Enable vLLM sleep mode to offload student weights/cache during the optimizer step. Keeps GPU "
+            "memory usage low, but waking the engine adds host–device transfer latency."
         },
     )
     # Parameters that control the logging
diff --git a/trl/experimental/gold/gold_trainer.py b/trl/experimental/gold/gold_trainer.py
index b2697d94623..ccca9c4a7ce 100644
--- a/trl/experimental/gold/gold_trainer.py
+++ b/trl/experimental/gold/gold_trainer.py
@@ -29,7 +29,7 @@
 from accelerate.utils import DistributedType, broadcast_object_list, gather_object, is_peft_model
 from datasets import Dataset, IterableDataset
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, is_bitsandbytes_available
 from transformers.data.data_collator import DataCollator
 from transformers.feature_extraction_utils import FeatureExtractionMixin
 from transformers.generation.configuration_utils import GenerationConfig
@@ -84,6 +84,9 @@
     from rich.table import Table
     from rich.text import Text
 
+if is_bitsandbytes_available():
+    import bitsandbytes as bnb
+
 
 def print_prompt_completions_sample_uld(
     prompts: list[str],
@@ -941,6 +944,15 @@ def __init__(
                 os.environ["WORLD_SIZE"] = str(self.accelerator.num_processes)
                 ensure_master_addr_port()
 
+                vllm_quantization = None
+                if is_bitsandbytes_available():
+                    for _, module in model.named_modules():
+                        if isinstance(module, bnb.nn.Linear4bit):
+                            vllm_quantization = "bitsandbytes"
+                            break
+                        elif isinstance(module, bnb.nn.Linear8bitLt):
+                            raise ValueError("vLLM does not support in-flight 8-bit quantization.")
+
                 self.vllm_engine = LLM(
                     model=student_model_name_or_path,
                     revision=self.model_revision,
@@ -952,6 +964,7 @@ def __init__(
                     # Feed identical seed for tp groups to ensure sampling results are the same across workers
                     seed=self.accelerator.process_index // self.vllm_tensor_parallel_size,
                     enable_sleep_mode=self.vllm_enable_sleep_mode,
+                    quantization=vllm_quantization,
                 )
 
                 if self.vllm_enable_sleep_mode:
@@ -1649,7 +1662,7 @@ def _generate_on_policy_outputs_vllm(self, inputs, generation_config, pad_token_
         # prompts_text = [p.replace(target_system_prompt, system_prompt) for p in prompts_text]
         # Add system prompt to prompts
 
-        max_completion_length = generation_config.max_completion_length
+        max_completion_length = generation_config.max_new_tokens
         temperature = generation_config.temperature
         # vLLM uses top_k=-1 for no top_k, transformers uses 0 or None.
         top_k = generation_config.top_k if generation_config.top_k and generation_config.top_k > 0 else -1
@@ -1671,7 +1684,7 @@ def _generate_on_policy_outputs_vllm(self, inputs, generation_config, pad_token_
                     min_p=min_p,
                     max_tokens=max_completion_length,
                     guided_decoding_regex=self.vllm_guided_decoding_regex,
-                )
+                )["completion_ids"]
             else:
                 completion_ids = [None] * len(all_prompts_text)
             completion_ids = broadcast_object_list(completion_ids, from_process=0)
diff --git a/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_config.py b/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_config.py
index 37a4341f27b..6f0b0381bef 100644
--- a/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_config.py
+++ b/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_config.py
@@ -14,7 +14,7 @@
 
 from dataclasses import dataclass, field
 
-from trl.trainer.grpo_config import GRPOConfig
+from ...trainer.grpo_config import GRPOConfig
 
 
 @dataclass
diff --git a/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_trainer.py b/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_trainer.py
index 597d6218084..e5c44710123 100644
--- a/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_trainer.py
+++ b/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_trainer.py
@@ -18,14 +18,9 @@
 import torch
 from accelerate.utils import gather_object
 
-from trl.data_utils import (
-    apply_chat_template,
-    is_conversational,
-    prepare_multimodal_messages,
-)
-from trl.trainer.grpo_trainer import GRPOTrainer
-from trl.trainer.utils import nanmax, nanmin, nanstd, pad
-
+from ...data_utils import apply_chat_template, is_conversational, prepare_multimodal_messages
+from ...trainer.grpo_trainer import GRPOTrainer
+from ...trainer.utils import nanmax, nanmin, nanstd, pad
 from .grpo_with_replay_buffer_config import GRPOWithReplayBufferConfig
 
 
@@ -210,6 +205,9 @@ def _generate_and_score_completions(
             completions = []
             for prompt, completion in zip(prompts, completions_text, strict=True):
                 bootstrap = prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
+                if isinstance(bootstrap, list):  # for VLM, the format might be [{"type": "text", "text": "..."}]
+                    assert len(bootstrap) == 1 and bootstrap[0]["type"] == "text"
+                    bootstrap = bootstrap[0]["text"]
                 completions.append([{"role": "assistant", "content": bootstrap + completion}])
         else:
             completions = completions_text
@@ -238,10 +236,12 @@ def _generate_and_score_completions(
         mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
         advantages = rewards - mean_grouped_rewards
 
+        grouped_std_rewards = rewards.view(-1, self.num_generations).std(dim=1)
+        grouped_std_rewards = grouped_std_rewards.repeat_interleave(self.num_generations, dim=0)
+
         if self.scale_rewards in ["group", "none"]:
             # If self.scale_rewards = "none", we'll still log group level std
-            std_rewards = rewards.view(-1, self.num_generations).std(dim=1)
-            std_rewards = std_rewards.repeat_interleave(self.num_generations, dim=0)
+            std_rewards = grouped_std_rewards.clone()
         elif self.scale_rewards == "batch":
             # Compute global std
             std_rewards = rewards.std().expand_as(rewards)
@@ -261,7 +261,7 @@ def _generate_and_score_completions(
         )
         all_process_advantages = advantages.clone()  # keep the aggregated advantages for logging
         advantages = advantages[process_slice]
-        std_rewards = std_rewards[process_slice]
+        grouped_std_rewards = grouped_std_rewards[process_slice]
 
         # Calculate mean reward per function, but only for samples where the function was applied (non-NaN values)
         for i, reward_func_name in enumerate(self.reward_func_names):
@@ -316,7 +316,7 @@ def _generate_and_score_completions(
             )
         outputs_after_sampling_buffer = self.update_with_replay_buffer(
             advantages,
-            std_rewards,
+            grouped_std_rewards,
             prompt_ids,
             prompt_mask,
             completion_ids,
diff --git a/trl/experimental/gspo_token/grpo_trainer.py b/trl/experimental/gspo_token/grpo_trainer.py
index f267bbd4b62..62c124ab134 100644
--- a/trl/experimental/gspo_token/grpo_trainer.py
+++ b/trl/experimental/gspo_token/grpo_trainer.py
@@ -14,8 +14,7 @@
 
 import torch
 
-from trl import GRPOTrainer as _GRPOTrainer
-
+from ...trainer.grpo_trainer import GRPOTrainer as _GRPOTrainer
 from ...trainer.utils import nanmax, nanmin
 
 
diff --git a/trl/experimental/judges/__init__.py b/trl/experimental/judges/__init__.py
new file mode 100644
index 00000000000..332e949da49
--- /dev/null
+++ b/trl/experimental/judges/__init__.py
@@ -0,0 +1,36 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .judges import (
+    AllTrueJudge,
+    BaseBinaryJudge,
+    BaseJudge,
+    BasePairwiseJudge,
+    BaseRankJudge,
+    HfPairwiseJudge,
+    OpenAIPairwiseJudge,
+    PairRMJudge,
+)
+
+
+__all__ = [
+    "AllTrueJudge",
+    "BaseBinaryJudge",
+    "BaseJudge",
+    "BasePairwiseJudge",
+    "BaseRankJudge",
+    "HfPairwiseJudge",
+    "OpenAIPairwiseJudge",
+    "PairRMJudge",
+]
diff --git a/trl/experimental/judges/judges.py b/trl/experimental/judges/judges.py
new file mode 100644
index 00000000000..c5e716f31c4
--- /dev/null
+++ b/trl/experimental/judges/judges.py
@@ -0,0 +1,457 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import concurrent.futures
+import logging
+from abc import ABC, abstractmethod
+
+import numpy as np
+from accelerate import Accelerator
+from huggingface_hub import InferenceClient
+from transformers.utils import is_openai_available
+
+from ...import_utils import is_llm_blender_available
+
+
+if is_llm_blender_available():
+    import llm_blender
+
+if is_openai_available():
+    from openai import OpenAI
+
+
+DEFAULT_PAIRWISE_SYSTEM_PROMPT = '''I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
+
+## Instruction
+
+{{
+    "instruction": """{prompt}""",
+}}
+
+## Model Outputs
+
+Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
+
+{{
+    {{
+        "model_identifier": "0",
+        "output": """{response0}"""
+    }},
+    {{
+        "model_identifier": "1",
+        "output": """{response1}"""
+    }}
+}}
+
+## Task
+
+Evaluate the models on the basis of the quality and relevance of their results, and select the model that generated the best result. Reply with the identifier of the best model. Our evaluation will only take into account the first character of your answer, so make sure it contains only one of the identifiers and nothing else (no quotation marks, no spaces, no new lines, ...).
+'''
+
+
+class BaseJudge(ABC):
+    """
+    Base class for judges. The subclasses of this class should implement the `judge` method.
+    """
+
+    @abstractmethod
+    def judge(self, prompts: list[str], completions: list[str], shuffle_order: bool = True) -> list:
+        raise NotImplementedError("Judge subclasses must implement the `judge` method.")
+
+
+class BaseRankJudge(ABC):
+    """
+    Base class for LLM ranking judges.
+
+    **Example**:
+    ```python
+    class MyRankJudge(BaseRankJudge):
+        def judge(self, prompts, completions, shuffle_order=True):
+            return ...  # Your ranking logic here
+
+
+    judge = MyRankJudge()
+    judge.judge(
+        prompts=["The capital of France is", "The capital of Germany is"],
+        completions=[[" Paris", " Marseille", "Lyon"], [" Munich", " Berlin"]],
+    )  # [[0, 1, 2], [1, 0]]
+    ```
+    """
+
+    @abstractmethod
+    def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[list[int]]:
+        """
+        Judge the completion for the given prompts and return the ranks of each completion.
+
+        Args:
+            prompts (`list[str]`):
+                List of prompts.
+            completions (`list[list[str]]`):
+                List of completions list, where each element is a list of completions for the corresponding prompt.
+            shuffle_order (`bool`, *optional*, defaults to `True`):
+                Whether to shuffle the order of the completions to avoid positional bias.
+
+        Returns:
+            `list[list[int]]`:
+                List of lists of idxs, where each list contains the ranks of the completions for the corresponding
+                prompt. E.g., `[1, 2, 0]` means that the second completion (`idx=1`) is the best, followed by the
+                third, and then the first.
+        """
+        raise NotImplementedError("Judge subclasses must implement the `judge` method.")
+
+
+class BasePairwiseJudge(BaseJudge):
+    """
+    Base class for pairwise judges.
+    """
+
+    @abstractmethod
+    def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[int]:
+        """
+        Judge the completion pairs for the given prompts.
+
+        Args:
+            prompts (`list[str]`):
+                List of prompts.
+            completions (`list[list[str]]`):
+                List of completions pairs, where each element is a pair of completions for the corresponding prompt.
+            shuffle_order (`bool`, *optional*, defaults to `True`):
+                Whether to shuffle the order of the completions to avoid positional bias.
+
+        Returns:
+            `list[int]`:
+                List of idxs, where each idx is the rank of the best completion for the corresponding prompt. E.g., `1`
+                means that the second completion (`idx=1`) is the best.
+
+        Note:
+            If the judge returns `-1` for any prompt, it indicates that the inner process used to compute the
+            preference has failed. For instance, this could occur if the underlying language model returned an invalid
+            answer. In such cases, the caller should handle these invalid indices appropriately, possibly by
+            implementing fallback logic or error handling.
+        """
+        raise NotImplementedError("Judge subclasses must implement the `judge` method.")
+
+
+class BaseBinaryJudge(BaseJudge):
+    """
+    Base class for binary judges.
+    """
+
+    @abstractmethod
+    def judge(
+        self,
+        prompts: list[str],
+        completions: list[str],
+        gold_completions: list[str] | None = None,
+        shuffle_order: bool = True,
+    ) -> list[int]:
+        """
+        Judge the completion for a given prompt. Used to assess if a completion satisfies a constraint.
+
+        This base class should be used to implement binary evaluations as done in section 4.1.4 of the [CGPO
+        paper](https://huggingface.co/papers/2409.20370). It is relevant for assessing whether a prompt-completion pair
+        satisfies a specific constraint.
+
+        Args:
+            prompts (`list[str]`): List of prompts.
+            completions (`list[str]`): List of completions.
+            gold_completions (`list[str]`, `optional`): List of gold completions if it exists.
+            shuffle_order (`bool`): Whether to shuffle the order of the completions to avoid positional bias.
+
+        Returns:
+            list[int]: A list of binary labels:
+                - 1 indicates that the completion satisfies the evaluated constraint.
+                - 0 indicates that the completion does not satisfy the evaluated constraint.
+
+        Note:
+            If the judge returns -1 for any prompt, it indicates that the inner process used to compute the preference
+            has failed. For instance, this could occur if the underlying language model or rule based constraint
+            returned an invalid answer. In such cases, the caller should handle these invalid indices appropriately,
+            possibly by implementing fallback logic or error handling.
+        """
+        raise NotImplementedError("Judge subclasses must implement the `judge` method.")
+
+
+class PairRMJudge(BasePairwiseJudge):
+    # docstyle-ignore
+    """
+    LLM judge based on the PairRM model from AllenAI.
+
+    This judge uses the PairRM model to rank pairs of completions for given prompts. It's designed for pairwise
+    comparison of language model outputs. The PairRM model is loaded using the llm-blender library and runs on the
+    default Accelerator device.
+
+    **Attributes**:
+
+        blender (`llm_blender.Blender`):
+            An instance of the Blender class from llm-blender.
+
+    **Example**:
+    ```python
+    >>> pairrm_judge = PairRMJudge()
+    >>> prompts = ["Translate 'hello' to French", "What's the capital of Japan?"]
+    >>> completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]]
+    >>> results = pairrm_judge.judge(prompts, completions)
+    >>> print(results)  # [0, 1] (indicating the first completion is preferred for the first prompt and the second)
+    ```
+
+    > [!TIP]
+    > This class requires the llm-blender library to be installed. Install it with: `pip install llm-blender`.
+    """
+
+    def __init__(self):
+        if not is_llm_blender_available():
+            raise ValueError("llm-blender is not installed. Please install it with `pip install llm-blender`.")
+        self.blender = llm_blender.Blender()
+        self.blender.loadranker("llm-blender/PairRM", device=Accelerator().device)
+
+    def judge(
+        self,
+        prompts: list[str],
+        completions: list[list[str]],
+        shuffle_order: bool = True,
+        return_scores: bool = False,
+        temperature: float = 1.0,
+    ) -> list[int | float]:
+        """
+        Judge the completion pairs for the given prompts using the PairRM model.
+
+        Args:
+            prompts (`list[str]`):
+                List of prompts to judge.
+            completions (`list[list[str]]`):
+                List of completion pairs for each prompt.
+            shuffle_order (`bool`, *optional*, defaults to `True`):
+                Whether to shuffle the order of the completions to avoid positional bias.
+            return_scores (`bool`, *optional*, defaults to `False`):
+                If `True`, return probability scores of the first completion instead of ranks (i.e. a *soft-judge*).
+            temperature (`float`, *optional*, defaults to `1.0`):
+                Temperature for scaling logits if `return_scores` is True.
+
+        Returns:
+            `list[int | float]`:
+                If `return_scores` is `False`, returns a list of ranks (`0` or `1`) for each prompt, indicating which
+                completion is preferred. If `return_scores` is `True`, returns softmax probabilities for the first
+                completion.
+
+        Raises:
+            `ValueError`:
+                If the number of completions per prompt is not exactly 2.
+
+        Note:
+            Unlike llm-blender, ranks are 0-indexed (`0` means the first completion is preferred).
+        """
+
+        if len(completions[0]) != 2:
+            raise ValueError("PairRM judge requires exactly 2 completions per prompt.")
+
+        # Shuffle the order of the completions to avoid positional bias
+        if shuffle_order:
+            flip_mask = np.random.choice([True, False], size=len(prompts))
+            completions = [pair[::-1] if flip else pair for flip, pair in zip(flip_mask, completions, strict=True)]
+
+        # Rank the completions
+        ranks = self.blender.rank(prompts, completions, return_scores=return_scores, disable_tqdm=True)
+        if not return_scores:
+            ranks -= 1  # PairRM rank is 1-indexed, so we subtract 1 to make it 0-indexed
+        else:
+            # scale the logits by temperature
+            ranks /= temperature
+
+        # Flip back the ranks or scores to the original order if needed
+        if shuffle_order:
+            ranks[flip_mask] = ranks[flip_mask][:, ::-1]
+
+        # Return the ranks or score probability
+        if return_scores:
+            logit_max = np.amax(ranks, axis=-1, keepdims=True)
+            exp_logit_shifted = np.exp(ranks - logit_max)
+            probs = exp_logit_shifted / np.sum(exp_logit_shifted, axis=-1, keepdims=True)
+            return probs[:, 0].tolist()
+        else:
+            return ranks[:, 0].tolist()
+
+
+class HfPairwiseJudge(BasePairwiseJudge):
+    """
+    Pairwise judge based on the Hugging Face API with chat completion.
+
+    This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt.
+
+    Args:
+        model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3-70B-Instruct"`):
+            Model to use for the judge.
+        token (`str`, *optional*):
+            Hugging Face API token to use for the [`huggingface_hub.InferenceClient`].
+        system_prompt (`str`, *optional*):
+            The system prompt to be used for the judge. If not provided, a default prompt is used. Note that the system
+            prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the
+            inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token
+            response.
+    """
+
+    def __init__(
+        self,
+        model="meta-llama/Meta-Llama-3-70B-Instruct",
+        token: str | None = None,
+        system_prompt: str | None = None,
+    ):
+        self.client = InferenceClient(model=model, token=token)
+        self.system_prompt = system_prompt or DEFAULT_PAIRWISE_SYSTEM_PROMPT
+
+    def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[int]:
+        # Shuffle the order of the completions to avoid positional bias
+        if shuffle_order:
+            flip_mask = np.random.choice([True, False], size=len(prompts))
+            completions = [pair[::-1] if flip else pair for flip, pair in zip(flip_mask, completions, strict=True)]
+
+        # Define a function to get the rank for a single prompt, will be called concurrently
+        def get_rank(prompt, candidates):
+            content = self.system_prompt.format(prompt=prompt, response0=candidates[0], response1=candidates[1])
+            completion = self.client.chat_completion(messages=[{"role": "user", "content": content}], max_tokens=1)
+            response = completion.choices[0].message.content
+            if response in ["0", "1"]:
+                return int(response)
+            else:
+                logging.debug(f"Invalid response from the judge model: '{response}'. Returning -1.")
+                return -1
+
+        # Call the completions concurrently
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            ranks = list(executor.map(get_rank, prompts, completions))
+
+        # Flip back the ranks to the original order if needed
+        if shuffle_order:
+            ranks = [ranks[i] if not flip else 1 - ranks[i] for i, flip in enumerate(flip_mask)]
+
+        # Return the ranks
+        return ranks
+
+
+class OpenAIPairwiseJudge(BasePairwiseJudge):
+    """
+    Judge based on the OpenAI API.
+
+    This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt.
+
+    Args:
+        model (`str`, *optional*, defaults to `"gpt-4-turbo-preview"`):
+            Model to use for the judge.
+        system_prompt (`str`, *optional*):
+            System prompt to be used for the judge. If not provided, a default prompt is used. Note that the system
+            prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the
+            inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token
+            response.
+        max_requests (`int` or `None`, *optional*, defaults to `1000`):
+            Maximum number of requests to make to the OpenAI API. If set to `None`, there is no limit.
+    """
+
+    def __init__(
+        self, model="gpt-4-turbo-preview", system_prompt: str | None = None, max_requests: int | None = 1_000
+    ):
+        if not is_openai_available():
+            raise ValueError("OpenAI client is not installed. Please install it with 'pip install openai'.")
+        self.client = OpenAI()
+        self.model = model
+        self.system_prompt = system_prompt or DEFAULT_PAIRWISE_SYSTEM_PROMPT
+        self.max_requests = max_requests
+        self.num_requests = 0
+        self._warned = False
+
+    def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[int]:
+        # Check if the limit of requests is reached, if so, use random choice instead
+        if self.max_requests is not None and self.num_requests >= self.max_requests:
+            if not self._warned:  # Print the warning only once
+                logging.warning(
+                    f"Reached the maximum number of requests ({self.max_requests}). From now on, returning -1 instead. "
+                    " To increase the limit, set `max_requests` to a higher value, or to `None` for no limit."
+                )
+                self._warned = True
+            return [-1] * len(prompts)
+
+        # Shuffle the order of the completions to avoid positional bias
+        if shuffle_order:
+            flip_mask = np.random.choice([True, False], size=len(prompts))
+            completions = [pair[::-1] if flip else pair for flip, pair in zip(flip_mask, completions, strict=True)]
+
+        # Define a function to get the rank for a single prompt, will be called concurrently
+        def get_rank(prompt, candidates):
+            content = self.system_prompt.format(prompt=prompt, response0=candidates[0], response1=candidates[1])
+            messages = [{"role": "user", "content": content}]
+            completion = self.client.chat.completions.create(model=self.model, messages=messages, max_tokens=1)
+            response = completion.choices[0].message.content
+            if response in ["0", "1"]:
+                return int(response)
+            else:
+                logging.debug(f"Invalid response from the judge model: '{response}'. Returning -1.")
+                return -1
+
+        # Call the completions concurrently
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            ranks = list(executor.map(get_rank, prompts, completions))
+
+        # Flip back the ranks to the original order if needed
+        if shuffle_order:
+            ranks = [ranks[i] if not flip else 1 - ranks[i] for i, flip in enumerate(flip_mask)]
+
+        # Update the number of requests
+        self.num_requests += len(prompts)
+
+        # Return the ranks
+        return ranks
+
+
+class AllTrueJudge(BaseBinaryJudge):
+    """
+    Unify the decision of multiple [`experimental.judges.BaseBinaryJudge`] instances.
+
+    Returns `1` only if all inner binary judges return `1`. If any judge returns `0`, it returns `0`. If any judge
+    returns `-1`, indicating a failure in its process, this judge will also return `-1`.
+
+    Implements the Mixture of Judges as described in the [CGPO paper](https://huggingface.co/papers/2409.20370).
+
+    Args:
+        judges (`list[BaseBinaryJudge]`):
+            A list of [`experimental.judges.BaseBinaryJudge`] instances whose decisions will be unified.
+    """
+
+    def __init__(self, judges: list[BaseBinaryJudge]):
+        self.judges = judges
+
+    def judge(
+        self,
+        prompts: list[str],
+        completions: list[str],
+        gold_completions: list[str] | None = None,
+        shuffle_order: bool = True,
+    ) -> list[int]:
+        all_binary_judgments = [
+            judge.judge(prompts, completions, gold_completions, shuffle_order) for judge in self.judges
+        ]
+        output = []
+        for binary_judgments in zip(*all_binary_judgments, strict=True):
+            # Check that all values are in {0, 1, -1}
+            if any(binary_judgment not in {0, 1, -1} for binary_judgment in binary_judgments):
+                raise ValueError(
+                    f"Invalid binary judgment: {binary_judgments}, expected list of values in {{0, 1, -1}}."
+                )
+
+            # Unify the decision
+            if -1 in binary_judgments:
+                output.append(-1)
+            elif all(binary_judgment == 1 for binary_judgment in binary_judgments):
+                output.append(1)
+            else:
+                output.append(0)
+        return output
diff --git a/trl/experimental/minillm/__init__.py b/trl/experimental/minillm/__init__.py
new file mode 100644
index 00000000000..49b19dd946d
--- /dev/null
+++ b/trl/experimental/minillm/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .minillm_config import MiniLLMConfig
+from .minillm_trainer import MiniLLMTrainer
+
+
+__all__ = ["MiniLLMConfig", "MiniLLMTrainer"]
diff --git a/trl/experimental/minillm/minillm_config.py b/trl/experimental/minillm/minillm_config.py
new file mode 100644
index 00000000000..3eb5f56d537
--- /dev/null
+++ b/trl/experimental/minillm/minillm_config.py
@@ -0,0 +1,145 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import TrainingArguments
+
+from ...trainer.grpo_config import GRPOConfig
+
+
+@dataclass
+class MiniLLMConfig(GRPOConfig):
+    """
+    Configuration class for [`MiniLLMTrainer`].
+
+    This class includes only the parameters that are specific to MiniLLM training. For a full list of training
+    arguments, please refer to the [`~transformers.TrainingArguments`] and [`GRPOConfig`] documentation.
+
+    Args:
+        teacher_model_init_kwargs (`dict[str, Any]]`, *optional*):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
+            from a string.
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model.
+        rkl_advantage (`bool`, *optional*, defaults to `True`):
+            Whether to add the reverse KL advantage to the reward advantage.
+        single_step_decomposition (`bool`, *optional*, defaults to `True`):
+            Whether to use single-step decomposition for the KL divergence computation.
+        kd_temperature (`float`, *optional*, defaults to `1.0`):
+            Temperature for knowledge distillation. Higher temperatures produce softer probability distributions over
+            classes.
+        gamma (`float`, *optional*, defaults to `0.0`):
+            Discount factor for future rewards in reinforcement learning.
+        length_normalization (`bool`, *optional*, defaults to `True`):
+            Whether to apply length normalization to the rewards.
+    """
+
+    teacher_model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
+            "teacher model from a string."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropouts in `model`."},
+    )
+    rkl_advantage: bool = field(
+        default=True,
+        metadata={"help": "Whether to add the reverse KL advantage to the reward advantage."},
+    )
+    single_step_decomposition: bool = field(
+        default=True,
+        metadata={"help": "Whether to use single-step decomposition for the KL divergence computation."},
+    )
+    kd_temperature: float = field(
+        default=1.0,
+        metadata={
+            "help": "Temperature for knowledge distillation. Higher temperatures produce softer probability "
+            "distributions over classes."
+        },
+    )
+    gamma: float = field(
+        default=0.0,
+        metadata={"help": "Discount factor for future rewards in reinforcement learning."},
+    )
+    length_normalization: bool = field(
+        default=True,
+        metadata={"help": "Whether to apply length normalization to the rewards."},
+    )
+
+    def __post_init__(self):
+        # We do not use the post_init of GRPOConfig because:
+        # 1. num_generations can be < 2 in MiniLLMConfig. Scale_rewards must be set to "none" to avoid nan.
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+
+        TrainingArguments.__post_init__(self)
+
+        self.scale_rewards = {True: "group", False: "none"}.get(self.scale_rewards, self.scale_rewards)
+        if self.num_generations == 1:
+            self.scale_rewards = "none"
+
+        num_processes = self.world_size
+        # The current default effective batch size
+        if self.generation_batch_size is None and self.steps_per_generation is None:
+            self.steps_per_generation = self.gradient_accumulation_steps
+            self.generation_batch_size = self.per_device_train_batch_size * num_processes * self.steps_per_generation
+        elif self.generation_batch_size is not None and self.steps_per_generation is None:
+            # Just ensure the value is divisible by the global batch size
+            if self.generation_batch_size % (self.per_device_train_batch_size * num_processes) != 0:
+                raise ValueError(
+                    f"generation_batch_size ({self.generation_batch_size}) must be divisible by the global batch size "
+                    f"({self.per_device_train_batch_size * num_processes})."
+                )
+            self.steps_per_generation = self.generation_batch_size // (
+                self.per_device_train_batch_size * num_processes
+            )
+        elif self.generation_batch_size is None and self.steps_per_generation is not None:
+            self.generation_batch_size = self.per_device_train_batch_size * num_processes * self.steps_per_generation
+        else:
+            raise ValueError(
+                "'generation_batch_size' and 'steps_per_generation' can not be both configured at the same time"
+            )
+
+        if self.do_eval and self.eval_strategy != "no":
+            # Just ensure the value is divisible by the global batch size
+            if (self.per_device_eval_batch_size * num_processes) % self.num_generations != 0:
+                raise ValueError(
+                    f"The global eval batch size ({self.per_device_eval_batch_size} * {num_processes}) must be "
+                    f"divisible by num_generations ({self.num_generations})."
+                )
+
+        # The generation batch must contain full prompt groups (no partials), so it must be divisible by
+        # num_generations.
+        if self.generation_batch_size % self.num_generations != 0:
+            raise ValueError(
+                f"generation_batch_size ({self.generation_batch_size}) must be divisible by num_generations "
+                f"({self.num_generations})."
+            )
+
+        if self.use_liger_loss is not None:
+            warnings.warn(
+                "The `use_liger_loss` argument is deprecated and will be removed in version 0.28.0. Please use "
+                "`use_liger_kernel` instead.",
+                FutureWarning,
+                stacklevel=2,
+            )
+            self.use_liger_kernel = self.use_liger_loss
+
+        if self.delta is not None and self.use_liger_kernel:
+            raise ValueError("Liger kernel does not support two-sided GRPO loss yet.")
diff --git a/trl/experimental/minillm/minillm_trainer.py b/trl/experimental/minillm/minillm_trainer.py
new file mode 100644
index 00000000000..ab4e69435c9
--- /dev/null
+++ b/trl/experimental/minillm/minillm_trainer.py
@@ -0,0 +1,396 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import textwrap
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from datasets import Dataset, IterableDataset
+from transformers import AutoModelForCausalLM, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin
+from transformers.trainer_callback import TrainerCallback
+from transformers.utils import is_peft_available
+
+from ...models import prepare_deepspeed
+from ...trainer.grpo_trainer import GRPOTrainer, RewardFunc, RolloutFunc
+from ...trainer.utils import disable_dropout_in_model, empty_cache, get_config_model_id
+from .minillm_config import MiniLLMConfig
+
+
+if is_peft_available():
+    from peft import PeftConfig
+
+
+def dummy_reward_func(completions: list, **kwargs):
+    # placeholder reward function when no reward function is provided
+    return [1.0 for _ in completions]
+
+
+class MiniLLMTrainer(GRPOTrainer):
+    """
+    Trainer for the Knowledge Distillation of Language Models (MiniLLM) method. This algorithm was initially proposed
+    in the paper [Knowledge Distillation of Large Language Models](https://huggingface.co/papers/2306.08543).
+
+    Example:
+
+    ```python
+    from datasets import load_dataset
+    from trl.experimental.minillm import MiniLLMTrainer
+
+    dataset = load_dataset("trl-lib/tldr", split="train")
+
+    trainer = MiniLLMTrainer(
+        model="Qwen/Qwen3-0.6B",
+        teacher_model="Qwen/Qwen3-1.7B",
+        train_dataset=dataset,
+    )
+    trainer.train()
+    ```
+
+    Args:
+        model (`str | PreTrainedModel`):
+            Model to be trained. Can be either:
+
+            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
+              path to a *directory* containing model weights saved using
+              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+              using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
+              `args.model_init_kwargs`.
+            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
+        teacher_model (`PreTrainedModel | nn.Module | str`):
+            Teacher model used for knowledge distillation. Instantiated similarly to `model`.
+        reward_funcs (`RewardFunc | list[RewardFunc]`, *optional*):
+            Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
+            functions with the prompts and completions and sum the rewards. Can be either:
+
+            - A single reward function, such as:
+                - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
+                path to a *directory* containing model weights saved using
+                [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+                using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
+                keyword arguments in `args.model_init_kwargs`.
+                - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
+                - A custom reward function: The function is provided with the prompts and the generated completions,
+                  plus any additional columns in the dataset. It should return a list of rewards. Custom reward
+                  functions can also return `None` when the reward is not applicable to those samples. This is useful
+                  for multi-task training where different reward functions apply to different types of samples. When a
+                  reward function returns `None` for a sample, that reward function is excluded from the reward
+                  calculation for that sample. For more details, see [Using a custom reward
+                  function](#using-a-custom-reward-function).
+
+                  The trainer's state is also passed to the reward function. The trainer's state is an instance of
+                  [`~transformers.TrainerState`] and can be accessed by accessing the `trainer_state` argument to the
+                  reward function's signature.
+            - A list of reward functions, where each item can independently be any of the above types. Mixing different
+            types within the list (e.g., a string model ID and a custom reward function) is allowed.
+        args ([`experimental.minillm.MiniLLMConfig`], *optional*):
+            Configuration for this trainer. If `None`, a default configuration is used.
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset is
+            ignored. The format of the samples can be either:
+
+            - [Standard](dataset_formats#standard): Each sample contains plain text.
+            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+              and content).
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Dataset | IterableDataset]`):
+            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. The padding side must be set to "left". If `None`, the
+            processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A
+            padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token,
+            `tokenizer.eos_token` will be used as the default.
+        reward_processing_classes ([`~transformers.PreTrainedTokenizerBase`] or `list[PreTrainedTokenizerBase]`, *optional*):
+            Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
+
+            - A single processing class: Used when `reward_funcs` contains only one reward function.
+            - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
+            If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
+            `None`, the tokenizer for the model is automatically loaded using
+            [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward
+            functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes`
+            are ignored.
+        callbacks (list of [`~transformers.TrainerCallback`], *optional*):
+            List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
+            in [here](https://huggingface.co/docs/transformers/main_classes/callback).
+
+            If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
+            method.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
+            A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
+            model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
+        peft_config ([`~peft.PeftConfig`], *optional*):
+            PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
+        rollout_func (`RolloutFunc`, *optional*):
+            Function to use for generating completions. It must take prompts, args, and processing_class as parameters
+            and return a dict with `"prompt_ids"`, `"completion_ids"`, and `"logprobs"` fields. Any other fields that
+            are forwarded to the reward functions. This feature is experimental and may change or be removed at any
+            time without prior notice.
+    """
+
+    _tag_names = ["trl", "minillm"]
+    _name = "MiniLLM"
+    _paper = {
+        "title": "MiniLLM: Knowledge Distillation of Large Language Models",
+        "id": "2306.08543",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @inproceedings{
+                gu2024minillm,
+                title={{MiniLLM: Knowledge Distillation of Large Language Models}},
+                author={Yuxian Gu and Li Dong and Furu Wei and Minlie Huang},
+                booktitle={The Twelfth International Conference on Learning Representations},
+                year={2024},
+                url={https://openreview.net/forum?id=5h0qf7IBZZ}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: str | PreTrainedModel,
+        teacher_model: PreTrainedModel | nn.Module | str,
+        reward_funcs: RewardFunc | list[RewardFunc] | None = None,
+        args: MiniLLMConfig | None = None,
+        train_dataset: Dataset | IterableDataset | None = None,
+        eval_dataset: Dataset | IterableDataset | dict[str, Dataset | IterableDataset] | None = None,
+        processing_class: PreTrainedTokenizerBase | ProcessorMixin | None = None,
+        reward_processing_classes: PreTrainedTokenizerBase | list[PreTrainedTokenizerBase] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None),
+        peft_config: "PeftConfig | None" = None,
+        rollout_func: RolloutFunc | None = None,
+    ):
+        if reward_funcs is None:
+            reward_funcs = [dummy_reward_func]
+
+        # Args
+        if args is None:
+            model_name = model if isinstance(model, str) else get_config_model_id(model.config)
+            model_name = model_name.split("/")[-1]
+            args = MiniLLMConfig(f"{model_name}-MiniLLM")
+
+        super().__init__(
+            model,
+            reward_funcs,
+            args=args,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            reward_processing_classes=reward_processing_classes,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            peft_config=peft_config,
+            rollout_func=rollout_func,
+        )
+
+        if args.teacher_model_init_kwargs is None:
+            teacher_model_init_kwargs = {}
+        elif not isinstance(teacher_model, str):
+            raise ValueError(
+                "You passed teacher_model_init_kwargs to the MiniLLMConfig, but your teacher_model is already instantiated."
+            )
+        else:
+            teacher_model_init_kwargs = args.teacher_model_init_kwargs
+            teacher_model_init_kwargs["dtype"] = (
+                teacher_model_init_kwargs["dtype"]
+                if teacher_model_init_kwargs["dtype"] in ["auto", None]
+                else getattr(torch, teacher_model_init_kwargs["dtype"])
+            )
+
+        if isinstance(teacher_model, str):
+            teacher_model = AutoModelForCausalLM.from_pretrained(teacher_model, **teacher_model_init_kwargs)
+
+        # Disable dropout in the model
+        if args.disable_dropout:
+            disable_dropout_in_model(self.model)
+
+        if self.is_deepspeed_enabled:
+            self.teacher_model = prepare_deepspeed(teacher_model, self.accelerator)
+        else:
+            self.teacher_model = self.accelerator.prepare_model(teacher_model, evaluation_mode=True)
+
+        self.temperature = args.temperature
+        self.kd_temperature = args.kd_temperature
+        self.single_step_decomposition = args.single_step_decomposition
+        self.rkl_advantage = args.rkl_advantage
+        self.gamma = args.gamma
+        self.length_normalization = args.length_normalization
+
+    def _single_step_decomposition_loss(
+        self,
+        student_log_probs: torch.Tensor,
+        teacher_log_probs: torch.Tensor,
+        mask: torch.Tensor | None = None,
+        reduction: str = "batchmean",
+    ):
+        """
+        Compute the MiniLLM loss for knowledge distillation using F.kl_div. See Eq. (1) of
+        https://huggingface.co/papers/2306.08543 for the definition.
+
+        Args:
+            student_logits:
+                Tensor of shape (batch_size, sequence_length, vocab_size)
+            teacher_logits:
+                Tensor of shape (batch_size, sequence_length, vocab_size)
+            labels:
+                Tensor of shape (batch_size, sequence_length) with -100 for padding tokens to ignore when computing
+                loss
+            beta:
+                Interpolation coefficient between 0 and 1 (default: 0.5)
+            temperature:
+                Softmax temperature (default: 1.0)
+            reduction:
+                Specifies the reduction to apply to the output (default: 'batchmean')
+
+        Returns:
+            loss: Scalar tensor with the generalized JSD loss
+        """
+        reg_loss = F.kl_div(
+            teacher_log_probs, student_log_probs, reduction="none", log_target=True
+        )  # (batch_size, sequence_length)
+
+        # Masking
+        if mask is not None:
+            reg_loss = reg_loss[mask]
+
+        # Apply reduction
+        if reduction == "batchmean":
+            return reg_loss.sum() / mask.sum() if mask is not None else reg_loss.sum() / reg_loss.size(0)
+        elif reduction == "sum":
+            return reg_loss.sum()
+        elif reduction == "mean":
+            return reg_loss.mean()
+        else:
+            return reg_loss
+
+    def _compute_advantage(
+        self,
+        student_log_probs_on_labels: torch.Tensor,
+        teacher_log_probs_on_labels: torch.Tensor,
+        mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        r"""Compute the advantage for Reverse KL Divergence.
+
+        Mostly following [this
+        implementation](https://github.com/microsoft/LMOps/blob/e210d2c026b9958617887762400778ace81172e6/minillm/minillm/losses.py#L37-L49).
+
+        $$ \text{rewards}_t = \text{teacher\_log\_probs\_on\_labels}_t - \text{student\_log\_probs\_on\_labels}_t $$
+
+        If length normalization is enabled:
+
+        $$ \text{lengths}_t = \sum_{i=t}^{T} \gamma^{i-t} $$
+
+        $$ \text{advantages}_t = \frac{\sum_{i=t}^{T} \gamma^{i-t} R_i}{\text{lengths}_t} $$
+
+        Otherwise:
+
+        $$ \text{advantages}_t = \sum_{i=t}^{T} \gamma^{i-t} R_i $$
+
+        Args:
+            student_log_probs_on_labels: Log probabilities of the student model on the labels.
+                Shape: (batch_size, sequence_length)
+            teacher_log_probs_on_labels: Log probabilities of the teacher model on the labels.
+                Shape: (batch_size, sequence_length)
+            mask: Optional mask to apply to the log probabilities. Shape: (batch_size, sequence_length)
+        Returns:
+            advantage: Computed advantage. Shape: (batch_size, sequence_length)
+        """
+        response_length = student_log_probs_on_labels.size(1)
+        if mask is None:
+            mask = torch.ones_like(student_log_probs_on_labels)
+        mask = mask.float()
+        student_log_probs_on_labels = student_log_probs_on_labels * mask
+        teacher_log_probs_on_labels = teacher_log_probs_on_labels * mask
+
+        rewards = teacher_log_probs_on_labels - student_log_probs_on_labels  # (batch_size, sequence_length)
+
+        if self.gamma > 0.0:
+            gamma_pow = torch.pow(self.gamma, torch.arange(response_length, device=rewards.device))
+
+            advantages = rewards * gamma_pow
+            advantages = advantages.flip(1).cumsum(dim=1).flip(1)
+
+            if self.length_normalization:
+                mask = torch.where(mask < 0.5, 1e-4, mask)
+                lengths = mask * gamma_pow
+                lengths = lengths.flip(1).cumsum(dim=1).flip(1)
+                advantages = advantages / lengths
+        else:
+            advantages = rewards
+
+        return advantages
+
+    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
+        input_ids = torch.cat([inputs["prompt_ids"], inputs["completion_ids"]], dim=1)
+        attention_mask = torch.cat([inputs["prompt_mask"], inputs["completion_mask"]], dim=1)
+        labels = input_ids.clone()
+        labels[attention_mask == 0] = -100
+
+        # Compute student output
+        student_outputs = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False)
+
+        # Compute teacher output in eval mode
+        self.teacher_model.eval()
+        with torch.no_grad():
+            teacher_outputs = self.teacher_model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False)
+
+        # Slice the logits for the generated tokens using the inputs["prompts"] lengths
+        prompt_lengths = inputs["prompt_ids"].shape[1]
+        student_logits = student_outputs.logits[:, prompt_lengths - 1 : -1, :]
+        teacher_logits = teacher_outputs.logits[:, prompt_lengths - 1 : -1, :]
+        shifted_labels = input_ids[:, prompt_lengths:]
+
+        # Apply temperature scaling
+        student_logits = student_logits / self.kd_temperature
+        teacher_logits = teacher_logits / self.kd_temperature
+
+        # Compute log probabilities for student and probabilities for teacher
+        student_log_probs = F.log_softmax(student_logits, dim=-1)
+        teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)
+
+        student_log_probs_on_labels = torch.gather(
+            student_log_probs, dim=-1, index=shifted_labels.unsqueeze(-1)
+        ).squeeze(-1)
+        teacher_log_probs_on_labels = torch.gather(
+            teacher_log_probs, dim=-1, index=shifted_labels.unsqueeze(-1)
+        ).squeeze(-1)
+
+        mask = shifted_labels != -100
+
+        if self.rkl_advantage:
+            reverse_kl_advantage = self._compute_advantage(
+                student_log_probs_on_labels=student_log_probs_on_labels,
+                teacher_log_probs_on_labels=teacher_log_probs_on_labels,
+                mask=mask,
+            )
+
+            inputs["advantages"] = inputs["advantages"].unsqueeze(1) + reverse_kl_advantage
+
+        # Compute GRPO loss on verifiable reward
+        loss = self._compute_loss(model, inputs)
+
+        # Compute loss
+        if self.single_step_decomposition:
+            single_step_decomposition_loss = self._single_step_decomposition_loss(
+                student_log_probs=student_log_probs,
+                teacher_log_probs=teacher_log_probs,
+                mask=mask,
+            )
+
+            loss += single_step_decomposition_loss
+
+        # Empty cache
+        empty_cache()
+
+        # Return loss
+        return (loss, student_outputs) if return_outputs else loss
diff --git a/trl/experimental/nash_md/__init__.py b/trl/experimental/nash_md/__init__.py
new file mode 100644
index 00000000000..9369b5312ba
--- /dev/null
+++ b/trl/experimental/nash_md/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .nash_md_config import NashMDConfig
+from .nash_md_trainer import NashMDTrainer
+
+
+__all__ = ["NashMDConfig", "NashMDTrainer"]
diff --git a/trl/experimental/nash_md/nash_md_config.py b/trl/experimental/nash_md/nash_md_config.py
new file mode 100644
index 00000000000..0f74236eba9
--- /dev/null
+++ b/trl/experimental/nash_md/nash_md_config.py
@@ -0,0 +1,46 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from ...trainer.online_dpo_config import OnlineDPOConfig
+
+
+@dataclass
+class NashMDConfig(OnlineDPOConfig):
+    r"""
+    Configuration class for the [`experimental.nash_md.NashMDTrainer`].
+
+    Subclass of [`OnlineDPOConfig`] we can use all its arguments and add the following:
+
+    Parameters:
+        mixture_coef (`float` or `list[float]`, *optional*, defaults to `0.5`):
+            Logit mixture coefficient for the model and reference model. If a list of floats is provided then the
+            mixture coefficient is selected for each new epoch and the last coefficient is used for the rest of the
+            epochs.
+    """
+
+    mixture_coef: list[float] = field(
+        default_factory=lambda: [0.5],
+        metadata={
+            "help": "Logit mixture coefficient for the model and reference model. If a list of floats is provided "
+            "then the mixture coefficient is selected for each new epoch and the last coefficient is used for the "
+            "rest of the epochs."
+        },
+    )
+
+    def __post_init__(self):
+        super().__post_init__()
+        if hasattr(self.mixture_coef, "__len__") and len(self.mixture_coef) == 1:
+            self.mixture_coef = self.mixture_coef[0]
diff --git a/trl/experimental/nash_md/nash_md_trainer.py b/trl/experimental/nash_md/nash_md_trainer.py
new file mode 100644
index 00000000000..845354b0d69
--- /dev/null
+++ b/trl/experimental/nash_md/nash_md_trainer.py
@@ -0,0 +1,489 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import textwrap
+from collections.abc import Callable
+from typing import Any
+
+import jinja2
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from datasets import Dataset, IterableDataset
+from transformers import (
+    BaseImageProcessor,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    TrainerCallback,
+)
+from transformers.trainer_utils import EvalPrediction
+from transformers.training_args import OptimizerNames
+from transformers.utils import is_peft_available
+
+from ...data_utils import is_conversational, maybe_apply_chat_template
+from ...models.modeling_base import GeometricMixtureWrapper
+from ...models.utils import unwrap_model_for_generation
+from ...trainer.judges import BasePairwiseJudge
+from ...trainer.online_dpo_trainer import OnlineDPOTrainer
+from ...trainer.utils import SIMPLE_CHAT_TEMPLATE, empty_cache, get_reward, selective_log_softmax, truncate_right
+from .nash_md_config import NashMDConfig
+
+
+if is_peft_available():
+    from peft import PeftModel
+
+
+class NashMDTrainer(OnlineDPOTrainer):
+    """
+    Trainer for the Nash-MD method.
+
+    It is implemented as a subclass of [`OnlineDPOTrainer`].
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]):
+            The model to train, preferably an `AutoModelForCausalLM`.
+        ref_model ([`PreTrainedModelWrapper`]):
+            Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
+            and loss. If no reference model is provided, the trainer will create a reference model with the same
+            architecture as the model to be optimized.
+        reward_funcs ([`~transformers.PreTrainedModel`]):
+            The reward model to score completions with, preferably an
+            [`~transformers.AutoModelForSequenceClassification`].
+        judge ([`BasePairwiseJudge`]):
+            The judge to use for pairwise comparison of model completions.
+        args ([`experimental.nash_md.NashMDConfig`]):
+            The NashMD config arguments to use for training.
+        data_collator ([`~transformers.DataCollator`]):
+            The data collator to use for training. If None is specified, the default data collator
+            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        train_dataset ([`~datasets.Dataset`]):
+            The dataset to use for training.
+        eval_dataset ([`~datasets.Dataset`]):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        peft_config (`dict`):
+            The peft config to use for training.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
+            metric values.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+    """
+
+    _tag_names = ["trl", "nash-md"]
+    _name = "Nash-MD"
+    _paper = {
+        "title": "Nash Learning from Human Feedback",
+        "id": "2312.00886",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @inproceedings{munos2024nash,
+                title        = {{Nash Learning from Human Feedback}},
+                author       = {R{\'{e}}mi Munos and Michal Valko and Daniele Calandriello and Mohammad Gheshlaghi Azar and Mark Rowland and Zhaohan Daniel Guo and Yunhao Tang and Matthieu Geist and Thomas Mesnard and C{\\^{o}}me Fiegel and Andrea Michi and Marco Selvi and Sertan Girgin and Nikola Momchev and Olivier Bachem and Daniel J. Mankowitz and Doina Precup and Bilal Piot},
+                year         = 2024,
+                booktitle    = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024},
+                publisher    = {OpenReview.net},
+                url          = {https://openreview.net/forum?id=Y5AmNYiyCQ}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module = None,
+        ref_model: PreTrainedModel | nn.Module = None,
+        reward_funcs: PreTrainedModel | nn.Module | None = None,
+        judge: BasePairwiseJudge | None = None,
+        args: NashMDConfig | None = None,
+        data_collator: Callable | None = None,
+        train_dataset: Dataset | IterableDataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        peft_config: dict | None = None,
+        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
+    ) -> None:
+        super().__init__(
+            model=model,
+            ref_model=ref_model,
+            reward_funcs=reward_funcs,
+            judge=judge,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            reward_processing_classes=processing_class,
+            peft_config=peft_config,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        self._mixture_coef = self.args.mixture_coef
+
+        # Overwrite the stats dictionary to include NashMD specific statistics
+        self.stats = {
+            # Remove "non_score_reward", "rlhf_reward", "scores_margin"
+            # Add "mixture_coef"
+            "loss/kl": [],
+            "objective/entropy": [],
+            "loss/score": [],
+            "rewards/probabilities": [],
+            "rewards/accuracies": [],
+            "rewards/margins": [],
+            "logps/chosen": [],
+            "logps/rejected": [],
+            "val/model_contain_eos_token": [],
+            "val/ref_contain_eos_token": [],
+            "beta": [],
+            "mixture_coef": [],
+        }
+        if self.reward_funcs is not None:
+            if len(self.reward_funcs) != 1:
+                raise ValueError("NashMDTrainer only supports one reward function/model.")
+            self.reward_funcs = self.reward_funcs[0]
+            self.stats["rewards/chosen"] = []
+            self.stats["rewards/rejected"] = []
+
+    @property
+    def mixture_coef(self):
+        if isinstance(self._mixture_coef, list):
+            epoch = self.state.epoch
+            return self._mixture_coef[epoch] if epoch < len(self._mixture_coef) else self._mixture_coef[-1]
+        else:
+            return self._mixture_coef
+
+    def _generate_completions(self, model, prompts):
+        # Generate completions from the policy model.
+        with unwrap_model_for_generation(model, self.accelerator) as unwrapped_policy_for_gen_ctx:
+            model_output = unwrapped_policy_for_gen_ctx.generate(
+                input_ids=prompts["input_ids"],
+                attention_mask=prompts["attention_mask"],
+                generation_config=self.generation_config,
+            )
+
+        # Get the DDP/FSDP unwrapped version of the main model.
+        # This will be the policy model for GeometricMixtureWrapper (PEFT adapters active if PEFT is used).
+        policy_model_for_gmw = self.accelerator.unwrap_model(model)
+
+        # Determine the correct reference model for GeometricMixtureWrapper.
+        # This also needs to be DDP/FSDP unwrapped.
+        ref_model_for_gmw: torch.nn.Module
+        if self.ref_model is None:
+            # No explicit ref_model is provided.
+            # Use the base of the main `model` if it's a PEFT model.
+            # policy_model_for_gmw is already DDP-unwrapped.
+            if is_peft_available() and isinstance(policy_model_for_gmw, PeftModel):
+                ref_model_for_gmw = policy_model_for_gmw.get_base_model()
+            else:
+                # Not a PEFT model (or PEFT not available), or already a base model.
+                # Use the DDP-unwrapped policy model itself as the reference.
+                ref_model_for_gmw = policy_model_for_gmw
+        else:
+            # An explicit ref_model is provided. Unwrap it for DDP/FSDP.
+            ref_model_for_gmw = self.accelerator.unwrap_model(self.ref_model)
+
+        # Both models given to GeometricMixtureWrapper (policy_model_for_gmw and ref_model_for_gmw) are DDP-unwrapped.
+        with torch.no_grad():  # Ensure no_grad context for mixture model generation
+            mixture_model = GeometricMixtureWrapper(
+                model=policy_model_for_gmw,
+                ref_model=ref_model_for_gmw,
+                generation_config=self.generation_config,
+                mixture_coef=self.mixture_coef,
+                device=self.accelerator.device,
+            )
+
+            mixture_output = mixture_model.generate(
+                input_ids=prompts["input_ids"],
+                attention_mask=prompts["attention_mask"],
+                generation_config=self.generation_config,
+            )
+
+        return model_output, mixture_output
+
+    def _process_completions(self, model_output, mixture_output, prompts):
+        context_length = prompts["input_ids"].shape[1]
+
+        # Process model completions
+        model_completion_ids = model_output[:, context_length:]
+        model_completion_ids, model_completion_mask = truncate_right(
+            model_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id
+        )
+        model_data = {
+            "input_ids": torch.cat((prompts["input_ids"], model_completion_ids), dim=1),
+            "attention_mask": torch.cat((prompts["attention_mask"], model_completion_mask), dim=1),
+            "raw": prompts["raw"],
+        }
+
+        # Process reference model completions
+        mixture_completion_ids = mixture_output[:, context_length:]
+        mixture_completion_ids, mixture_completion_mask = truncate_right(
+            mixture_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id
+        )
+        mixture_data = {
+            "input_ids": torch.cat((prompts["input_ids"], mixture_completion_ids), dim=1),
+            "attention_mask": torch.cat((prompts["attention_mask"], mixture_completion_mask), dim=1),
+            "raw": prompts["raw"],
+        }
+
+        return model_data, mixture_data
+
+    def _compute_rewards(self, model_data, mixture_data, context_length):
+        with torch.no_grad():
+            _, model_scores, _ = get_reward(
+                self.reward_funcs, model_data["input_ids"], self.processing_class.pad_token_id, context_length
+            )
+            _, mixture_scores, _ = get_reward(
+                self.reward_funcs, mixture_data["input_ids"], self.processing_class.pad_token_id, context_length
+            )
+
+        # Apply EOS penalty if needed
+        if self.args.missing_eos_penalty is not None:
+            model_contain_eos = torch.any(model_data["input_ids"] == self.processing_class.eos_token_id, dim=-1)
+            mixture_contain_eos = torch.any(mixture_data["input_ids"] == self.processing_class.eos_token_id, dim=-1)
+            model_scores[~model_contain_eos] -= self.args.missing_eos_penalty
+            mixture_scores[~mixture_contain_eos] -= self.args.missing_eos_penalty
+
+        return model_scores, mixture_scores
+
+    def _compute_judge(self, model_data, mixture_data, context_length):
+        prompts = model_data["raw"]
+        model_data_completions = self.processing_class.batch_decode(
+            model_data["input_ids"][:, context_length:], skip_special_tokens=True
+        )
+        model_data_completions = [completion.strip() for completion in model_data_completions]
+
+        mixture_data_completions = self.processing_class.batch_decode(
+            mixture_data["input_ids"][:, context_length:], skip_special_tokens=True
+        )
+        mixture_data_completions = [completion.strip() for completion in mixture_data_completions]
+        if is_conversational({"prompt": prompts[0]}):
+            model_data_completions = [
+                [{"role": "assistant", "content": completion}] for completion in model_data_completions
+            ]
+            environment = jinja2.Environment()
+            template = environment.from_string(SIMPLE_CHAT_TEMPLATE)
+            prompts = [template.render(messages=message) for message in prompts]
+            model_data_completions = [template.render(messages=completion) for completion in model_data_completions]
+
+            mixture_data_completions = [
+                [{"role": "assistant", "content": completion}] for completion in mixture_data_completions
+            ]
+            mixture_data_completions = [
+                template.render(messages=completion) for completion in mixture_data_completions
+            ]
+
+        probability = self.judge.judge(
+            prompts,
+            list(zip(model_data_completions, mixture_data_completions, strict=True)),
+            return_scores=True,
+        )
+        return torch.tensor(probability, device=model_data["input_ids"].device)
+
+    def _compute_logprobs(self, model, model_data, context_length):
+        def compute_logprobs_for_data(m, data):
+            output = m(data["input_ids"], attention_mask=data["attention_mask"])
+            logits = output.logits[:, context_length - 1 : -1]
+            token_logprobs = selective_log_softmax(logits, data["input_ids"][:, context_length:])
+            return token_logprobs
+
+        # Compute logprobs for model completions under the model
+        model_logprobs_model_data = compute_logprobs_for_data(model, model_data)
+
+        # Compute logprobs of model completions under the reference model
+        with torch.no_grad():
+            if self.ref_model is None:
+                with model.disable_adapter():
+                    ref_logprobs_model_data = compute_logprobs_for_data(model, model_data)
+            else:
+                ref_logprobs_model_data = compute_logprobs_for_data(self.ref_model, model_data)
+
+        # Mask padding tokens
+        model_padding_mask = model_data["attention_mask"][:, context_length:] == 0
+        model_logprobs_model_data = model_logprobs_model_data.masked_fill(model_padding_mask, 0.0)
+        ref_logprobs_model_data = ref_logprobs_model_data.masked_fill(model_padding_mask, 0.0)
+
+        return (model_logprobs_model_data, ref_logprobs_model_data)
+
+    def _compute_losses(
+        self,
+        model_logprobs_model_data,
+        ref_logprobs_model_data,
+        probability,
+    ):
+        # reinforce score where 0.5 is a control variate
+        score = (probability - 0.5) * model_logprobs_model_data.sum(1)
+
+        # kl divergence via reinforce
+        with torch.no_grad():
+            log_ratio = model_logprobs_model_data - ref_logprobs_model_data
+            kl_div_log = log_ratio.sum(1)
+        kl_div_loss = (log_ratio * model_logprobs_model_data).sum(1)
+
+        # final loss
+        loss = self.beta * kl_div_loss - score
+
+        return loss.mean(), score, kl_div_log
+
+    def _log_statistics(
+        self,
+        model_data,
+        mixture_data,
+        model_logprobs_model_data,
+        ref_logprobs_model_data,
+        probability,
+        score,
+        kl_div,
+        context_length,
+        model_scores=None,
+        mixture_scores=None,
+    ):
+        # Helper function to gather and compute mean
+        def gather_mean(tensor):
+            return self.accelerator.gather_for_metrics(tensor).mean().item()
+
+        # Log score
+        self.stats["loss/score"].append(gather_mean(score))
+        # Log KL divergence
+        self.stats["loss/kl"].append(gather_mean(kl_div))
+
+        # Log logprobs
+        model_logprobs_model_data_sum = model_logprobs_model_data.sum(1)
+        ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1)
+
+        self.stats["logps/chosen"].append(gather_mean(model_logprobs_model_data_sum))
+        self.stats["logps/rejected"].append(gather_mean(ref_logprobs_model_data_sum))
+
+        # Log rewards
+        if self.reward_funcs is not None:
+            self.stats["rewards/chosen"].append(gather_mean(model_scores))
+            self.stats["rewards/rejected"].append(gather_mean(mixture_scores))
+
+        # Log probabilities
+        self.stats["rewards/probabilities"].append(gather_mean(probability))
+
+        # Calculate entropy for model data
+        entropy_model_data = -model_logprobs_model_data.sum(1)
+        self.stats["objective/entropy"].append(gather_mean(entropy_model_data))
+
+        # Calculate margins
+        margin = model_logprobs_model_data_sum - ref_logprobs_model_data_sum
+        self.stats["rewards/margins"].append(gather_mean(margin))
+
+        # Calculate accuracy
+        accuracy = (margin > 0).float()
+        self.stats["rewards/accuracies"].append(gather_mean(accuracy))
+
+        # Log EOS token statistics
+        model_eos = (model_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1)
+        mixture_eos = (mixture_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1)
+        self.stats["val/model_contain_eos_token"].append(gather_mean(model_eos.float()))
+        self.stats["val/ref_contain_eos_token"].append(gather_mean(mixture_eos.float()))
+
+        # Log beta and mixture coef
+        self.stats["beta"].append(self.beta)
+        self.stats["mixture_coef"].append(self.mixture_coef)
+
+    def training_step(
+        self, model: nn.Module, inputs: dict[str, torch.Tensor | Any], num_items_in_batch: int | None = None
+    ) -> torch.Tensor:
+        model.train()
+
+        # Apply chat template and tokenize the input
+        batch_size = len(next(iter(inputs.values())))
+        prompts = inputs["prompt"]
+        inputs = [{k: v[i] for k, v in inputs.items()} for i in range(batch_size)]
+        inputs = [maybe_apply_chat_template(x, self.processing_class) for x in inputs]
+        inputs = [self.tokenize_row(x, self.model.config.is_encoder_decoder, self.processing_class) for x in inputs]
+        inputs = self.data_collator(inputs)
+
+        # need the prompt_ only
+        inputs = self._prepare_inputs(inputs)
+        context_length = inputs["prompt_input_ids"].shape[1]
+        prompts = {
+            "input_ids": inputs["prompt_input_ids"],
+            "attention_mask": inputs["prompt_attention_mask"],
+            "raw": prompts,
+        }
+        del inputs
+
+        # Sample completions from both the model and the reference model
+        model_output, mixture_output = self._generate_completions(model, prompts)
+
+        # Process model completions
+        model_data, mixture_data = self._process_completions(model_output, mixture_output, prompts)
+
+        # Compute rewards
+        if self.reward_funcs is not None:
+            model_scores, mixture_scores = self._compute_rewards(model_data, mixture_data, context_length)
+            # probability of the model data vs the mixture data
+            probability = F.sigmoid(model_scores - mixture_scores)
+        else:
+            model_scores, mixture_scores = None, None
+            probability = self._compute_judge(model_data, mixture_data, context_length)
+
+        # Compute logprobs
+        model_logprobs_model_data, ref_logprobs_model_data = self._compute_logprobs(model, model_data, context_length)
+
+        # Compute loss
+        loss, score, kl_div = self._compute_losses(model_logprobs_model_data, ref_logprobs_model_data, probability)
+
+        # Log everything
+        self._log_statistics(
+            model_data,
+            mixture_data,
+            model_logprobs_model_data.detach(),
+            ref_logprobs_model_data,
+            probability,
+            score.detach(),
+            kl_div.detach(),
+            context_length,
+            model_scores,
+            mixture_scores,
+        )
+
+        if (
+            self.args.torch_empty_cache_steps is not None
+            and self.state.global_step % self.args.torch_empty_cache_steps == 0
+        ):
+            empty_cache()
+
+        kwargs = {}
+        # For LOMO optimizers you need to explicitly use the learning rate
+        if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
+            kwargs["learning_rate"] = self._get_learning_rate()
+
+        if self.args.n_gpu > 1:
+            loss = loss.mean()  # mean() to average on multi-gpu parallel training
+
+        self.accelerator.backward(loss, **kwargs)
+
+        return loss.detach() / self.args.gradient_accumulation_steps
diff --git a/trl/experimental/openenv/__init__.py b/trl/experimental/openenv/__init__.py
new file mode 100644
index 00000000000..39838bd2ef9
--- /dev/null
+++ b/trl/experimental/openenv/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .utils import generate_rollout_completions
+
+
+__all__ = ["generate_rollout_completions"]
diff --git a/trl/experimental/openenv/utils.py b/trl/experimental/openenv/utils.py
new file mode 100644
index 00000000000..ca26fdc796c
--- /dev/null
+++ b/trl/experimental/openenv/utils.py
@@ -0,0 +1,137 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any
+
+import torch
+
+from ...data_utils import is_conversational
+from ...extras.profiling import profiling_context
+from ...import_utils import is_vllm_available
+
+
+if is_vllm_available():
+    from vllm import SamplingParams
+    from vllm.sampling_params import GuidedDecodingParams
+
+
+def _build_colocate_sampling_params(
+    trainer,
+    overrides: dict[str, Any] | None = None,
+    *,
+    logprobs: bool = True,
+) -> SamplingParams:
+    if trainer.guided_decoding_regex:
+        guided_decoding = GuidedDecodingParams(regex=trainer.guided_decoding_regex)
+    else:
+        guided_decoding = None
+
+    generation_kwargs: dict[str, Any] = {
+        "n": 1,
+        "temperature": trainer.temperature,
+        "top_k": -1 if trainer.top_k is None else trainer.top_k,
+        "min_p": 0.0 if trainer.min_p is None else trainer.min_p,
+        "max_tokens": trainer.max_completion_length,
+        "truncate_prompt_tokens": trainer.max_prompt_length,
+        "guided_decoding": guided_decoding,
+    }
+    if trainer.repetition_penalty is not None:
+        generation_kwargs["repetition_penalty"] = trainer.repetition_penalty
+    if trainer.top_p is not None:
+        generation_kwargs["top_p"] = trainer.top_p
+    if logprobs:
+        generation_kwargs["logprobs"] = 0
+
+    if trainer.args.generation_kwargs is not None:
+        generation_kwargs.update(trainer.args.generation_kwargs)
+
+    if overrides is not None:
+        generation_kwargs.update(overrides)
+
+    generation_kwargs = {key: value for key, value in generation_kwargs.items() if value is not None}
+
+    sampling_params = SamplingParams(**generation_kwargs)
+    if sampling_params.n != 1:
+        raise ValueError("generate_rollout_completions expects n=1 when using colocated vLLM.")
+    return sampling_params
+
+
+def generate_rollout_completions(
+    trainer,
+    prompts: list[str],
+    *,
+    generation_overrides: dict[str, Any] | None = None,
+    as_chat: bool | None = None,
+) -> list[dict[str, Any]]:
+    """
+    Generate completions for custom rollouts when vLLM is running in colocate mode.
+
+    Returns one result per prompt, containing prompt and completion token ids along with per-token log probabilities
+    and the generated text.
+    """
+
+    if not prompts:
+        return []
+
+    if not trainer.use_vllm or trainer.vllm_mode != "colocate":
+        raise RuntimeError("Custom rollouts require vLLM in colocate mode to call generate_rollout_completions.")
+
+    sampling_params = _build_colocate_sampling_params(trainer, generation_overrides)
+    prompts_for_generation = prompts
+    original_size = len(prompts)
+
+    if trainer.vllm_tensor_parallel_size > 1:
+        gathered_prompts = [None for _ in range(trainer.vllm_tensor_parallel_size)]
+        torch.distributed.all_gather_object(gathered_prompts, prompts, group=trainer.tp_group)
+        prompts_for_generation = [prompt for group_prompts in gathered_prompts for prompt in group_prompts]
+
+    if as_chat is None:
+        as_chat = prompts_for_generation and is_conversational({"prompt": prompts_for_generation[0]})
+
+    if trainer.args.vllm_enable_sleep_mode:
+        trainer.llm.wake_up(tags=["kv_cache"])
+
+    with profiling_context(trainer, "vLLM.generate_rollout"):
+        if as_chat:
+            vllm_outputs = trainer.llm.chat(prompts_for_generation, sampling_params=sampling_params, use_tqdm=False)
+        else:
+            vllm_outputs = trainer.llm.generate(
+                prompts_for_generation, sampling_params=sampling_params, use_tqdm=False
+            )
+
+    results: list[dict[str, Any]] = []
+    for request in vllm_outputs:
+        if not request.outputs:
+            results.append({"prompt_ids": request.prompt_token_ids, "completion_ids": [], "logprobs": [], "text": ""})
+            continue
+        sequence = request.outputs[0]
+        logprobs = [next(iter(token_logprob.values())).logprob for token_logprob in sequence.logprobs]
+        results.append(
+            {
+                "prompt_ids": request.prompt_token_ids,
+                "completion_ids": sequence.token_ids,
+                "logprobs": logprobs,
+                "text": sequence.text,
+            }
+        )
+
+    if trainer.vllm_tensor_parallel_size > 1:
+        local_rank_in_group = torch.distributed.get_rank(group=trainer.tp_group)
+        tp_slice = slice(local_rank_in_group * original_size, (local_rank_in_group + 1) * original_size)
+        results = results[tp_slice]
+
+    if trainer.args.vllm_enable_sleep_mode:
+        trainer.llm.sleep(level=2)
+
+    return results
diff --git a/trl/experimental/ppo/__init__.py b/trl/experimental/ppo/__init__.py
new file mode 100644
index 00000000000..6a58ea42975
--- /dev/null
+++ b/trl/experimental/ppo/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .ppo_config import PPOConfig
+from .ppo_trainer import PPOTrainer
+
+
+__all__ = ["PPOConfig", "PPOTrainer"]
diff --git a/trl/experimental/ppo/ppo_config.py b/trl/experimental/ppo/ppo_config.py
new file mode 100644
index 00000000000..0d24617cff5
--- /dev/null
+++ b/trl/experimental/ppo/ppo_config.py
@@ -0,0 +1,135 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from dataclasses import dataclass, field
+from typing import Literal
+
+from ...trainer.utils import OnPolicyConfig
+
+
+@dataclass
+class PPOConfig(OnPolicyConfig):
+    r"""
+    Configuration class for the [`experimental.ppo.PPOTrainer`].
+
+    This class includes only the parameters that are specific to PPO training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default
+    values in this class may differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters:
+        exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[:-3]`):
+            Name of this experiment.
+        reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
+            Path to the reward model.
+        model_adapter_name (`str`, *optional*):
+            Name of the train target PEFT adapter, when using LoRA with multiple adapters.
+        ref_adapter_name (`str`, *optional*):
+            Name of the reference PEFT adapter, when using LoRA with multiple adapters.
+        num_ppo_epochs (`int`, *optional*, defaults to `4`):
+            Number of epochs to train.
+        whiten_rewards (`bool`, *optional*, defaults to `False`):
+            Whether to whiten the rewards.
+        kl_coef (`float`, *optional*, defaults to `0.05`):
+            KL coefficient.
+        kl_estimator (`Literal["k1", "k3"]`, *optional*, defaults to `"k1"`):
+            Which estimator for KL-Divergence to use from [Approximating KL
+            Divergence](http://joschu.net/blog/kl-approx.html). Defaults to "k1", a straightforward, unbiased
+            estimator. Can be set to "k3", an unbiased estimator with lower variance which "appears to be a strictly
+            better estimator". Cannot be set to "k2", as it is used for logging purposes.
+        cliprange (`float`, *optional*, defaults to `0.2`):
+            Clip range.
+        vf_coef (`float`, *optional*, defaults to `0.1`):
+            Value function coefficient.
+        cliprange_value (`float`, *optional*, defaults to `0.2`):
+            Clip range for the value function.
+        gamma (`float`, *optional*, defaults to `1.0`):
+            Discount factor.
+        lam (`float`, *optional*, defaults to `0.95`):
+            Lambda value for GAE.
+        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
+            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
+            improving generation speed. However, disabling this option allows training models that exceed the VRAM
+            capacity of a single GPU, albeit at the cost of slower generation.
+    """
+
+    exp_name: str = field(
+        default=os.path.basename(__file__)[:-3],
+        metadata={"help": "Name of this experiment."},
+    )
+    reward_model_path: str = field(
+        default="EleutherAI/pythia-160m",
+        metadata={"help": "Path to the reward model."},
+    )
+    model_adapter_name: str | None = field(
+        default=None,
+        metadata={"help": "Name of the train target PEFT adapter, when using LoRA with multiple adapters."},
+    )
+    ref_adapter_name: str | None = field(
+        default=None,
+        metadata={"help": "Name of the reference PEFT adapter, when using LoRA with multiple adapters."},
+    )
+    num_ppo_epochs: int = field(
+        default=4,
+        metadata={"help": "Number of epochs to train."},
+    )
+    whiten_rewards: bool = field(
+        default=False,
+        metadata={"help": "Whether to whiten the rewards."},
+    )
+    kl_coef: float = field(
+        default=0.05,
+        metadata={"help": "KL coefficient."},
+    )
+    kl_estimator: Literal["k1", "k3"] = field(
+        default="k1",
+        metadata={
+            "help": "Which estimator for KL-Divergence to use from Approximating KL Divergence "
+            "(http://joschu.net/blog/kl-approx.html). Defaults to 'k1', a straightforward, unbiased estimator. Can be "
+            "set to 'k3', an unbiased estimator with lower variance which 'appears to be a strictly better "
+            "estimator'. Cannot be set to 'k2', as it is used for logging purposes."
+        },
+    )
+    cliprange: float = field(
+        default=0.2,
+        metadata={"help": "Clip range."},
+    )
+    vf_coef: float = field(
+        default=0.1,
+        metadata={"help": "Value function coefficient."},
+    )
+    cliprange_value: float = field(
+        default=0.2,
+        metadata={"help": "Clip range for the value function."},
+    )
+    gamma: float = field(
+        default=1.0,
+        metadata={"help": "Discount factor."},
+    )
+    lam: float = field(
+        default=0.95,
+        metadata={"help": "Lambda value for GAE."},
+    )
+    ds3_gather_for_generation: bool = field(
+        default=True,
+        metadata={
+            "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for "
+            "generation, improving generation speed. However, disabling this option allows training models that "
+            "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation."
+        },
+    )
diff --git a/trl/experimental/ppo/ppo_trainer.py b/trl/experimental/ppo/ppo_trainer.py
new file mode 100644
index 00000000000..b11a245582f
--- /dev/null
+++ b/trl/experimental/ppo/ppo_trainer.py
@@ -0,0 +1,836 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import math
+import os
+import textwrap
+import time
+from collections import defaultdict
+from contextlib import contextmanager, nullcontext
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+from accelerate import Accelerator, logging
+from accelerate.utils import broadcast, gather_object
+from datasets import Dataset
+from torch.utils.data import DataLoader
+from transformers import (
+    BaseImageProcessor,
+    DataCollatorWithPadding,
+    FeatureExtractionMixin,
+    GenerationConfig,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    TrainerCallback,
+    TrainerControl,
+)
+from transformers.integrations import get_reporting_integration_callbacks
+from transformers.trainer import DEFAULT_CALLBACKS, DEFAULT_PROGRESS_CALLBACK
+from transformers.trainer_callback import CallbackHandler, ExportableState, PrinterCallback
+from transformers.utils import is_peft_available, is_rich_available
+
+from ...models import create_reference_model
+from ...models.utils import unwrap_model_for_generation
+from ...trainer.base_trainer import BaseTrainer
+from ...trainer.utils import (
+    OnlineTrainerState,
+    batch_generation,
+    disable_dropout_in_model,
+    empty_cache,
+    exact_div,
+    first_true_indices,
+    forward,
+    get_reward,
+    log_table_to_comet_experiment,
+    peft_module_casting_to_bf16,
+    prepare_deepspeed,
+    print_rich_table,
+    selective_log_softmax,
+    truncate_response,
+)
+from .ppo_config import PPOConfig
+
+
+logger = logging.get_logger(__name__)
+
+if is_peft_available():
+    from peft import PeftConfig, PeftModel, get_peft_model
+
+
+INVALID_LOGPROB = 1.0
+
+
+def masked_mean(values: torch.Tensor, mask: torch.Tensor, axis: bool | None = None) -> torch.Tensor:
+    """Compute mean of tensor with a masked values."""
+    if axis is not None:
+        return (values * mask).sum(axis=axis) / mask.sum(axis=axis)
+    else:
+        return (values * mask).sum() / mask.sum()
+
+
+def masked_var(values: torch.Tensor, mask: torch.Tensor, unbiased: bool = True) -> torch.Tensor:
+    """Compute variance of tensor with masked values."""
+    mean = masked_mean(values, mask)
+    centered_values = values - mean
+    variance = masked_mean(centered_values**2, mask)
+    if unbiased:
+        mask_sum = mask.sum()
+        if mask_sum == 0:
+            raise ValueError(
+                "The sum of the mask is zero, which can happen when `mini_batch_size=1`;"
+                "try increase the `mini_batch_size` or `gradient_accumulation_steps`"
+            )
+        # note that if mask_sum == 1, then there is a division by zero issue
+        # to avoid it you just need to use a larger minibatch_size
+        bessel_correction = mask_sum / (mask_sum - 1)
+        variance = variance * bessel_correction
+    return variance
+
+
+def masked_whiten(values: torch.Tensor, mask: torch.Tensor, shift_mean: bool = True) -> torch.Tensor:
+    """Whiten values with masked values."""
+    mean, var = masked_mean(values, mask), masked_var(values, mask)
+    whitened = (values - mean) * torch.rsqrt(var + 1e-8)
+    if not shift_mean:
+        whitened += mean
+    return whitened
+
+
+# taken from https://github.com/OpenLMLab/MOSS-RLHF/blob/40b91eb2f2b71b16919addede0341d2bef70825d/ppo/ppo_trainer.py#L29
+# we did this we can do a single `model = accelerator.prepare(model)`
+class PolicyAndValueWrapper(nn.Module):
+    def __init__(self, policy, value_model) -> None:
+        super().__init__()
+        self.policy = policy
+        self.value_model = value_model
+        self.critic_backbone = getattr(value_model, value_model.base_model_prefix)
+        self.is_gradient_checkpointing = policy.is_gradient_checkpointing
+
+    def forward(self, **kwargs):
+        output = self.critic_backbone(**kwargs)
+        logits = self.value_model.score(output.hidden_states[-1])
+        return self.policy(**kwargs), logits
+
+
+class PPOTrainer(BaseTrainer):
+    """Trainer for Proximal Policy Optimization (PPO).
+
+    For details on PPO, see the paper: [Proximal Policy Optimization
+    Algorithms](https://huggingface.co/papers/1707.06347).
+
+    Args:
+        args ([`experimental.ppo.PPOConfig`]):
+            Training arguments.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`]):
+            Class to process the data.
+        model (`torch.nn.Module`):
+            Model to be trained. This is the policy model.
+        ref_model (`torch.nn.Module`, *optional*):
+            Reference model used to compute the KL divergence. If `None`, a copy of the policy model is created.
+        reward_model (`torch.nn.Module`):
+            Reward model used to compute the rewards.
+        train_dataset ([`~datasets.Dataset`]):
+            Dataset for training.
+        value_model (`torch.nn.Module`):
+            Value model used to predict the value of a state.
+        data_collator ([`~transformers.DataCollatorWithPadding`], *optional*):
+            Data collator to batch and pad samples from the dataset. If `None`, a default data collator is created
+            using the `processing_class`.
+        eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
+            Dataset for evaluation.
+        optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
+            Tuple containing the optimizer and the learning rate scheduler to use for training. If `None`, the
+            optimizer and the learning rate scheduler are created using the
+            [`~transformers.Trainer.create_optimizer_and_scheduler`] method.
+        callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
+            Callbacks to use during training.
+        peft_config ([`~peft.PeftConfig`], *optional*):
+            PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the policy `model`
+            will be wrapped with the specified PEFT adapter.
+    """
+
+    _tag_names = ["trl", "ppo"]
+    _name = "PPO"
+    _paper = {
+        "title": "Fine-Tuning Language Models from Human Preferences",
+        "id": "1909.08593",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @article{mziegler2019fine-tuning,
+                title        = {{Fine-Tuning Language Models from Human Preferences}},
+                author       = {Daniel M. Ziegler and Nisan Stiennon and Jeffrey Wu and Tom B. Brown and Alec Radford and Dario Amodei and Paul F. Christiano and Geoffrey Irving},
+                year         = 2019,
+                eprint       = {arXiv:1909.08593}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        args: PPOConfig,
+        processing_class: PreTrainedTokenizerBase | BaseImageProcessor | FeatureExtractionMixin | ProcessorMixin,
+        model: nn.Module,
+        ref_model: nn.Module | None,
+        reward_model: nn.Module,
+        train_dataset: Dataset,
+        value_model: nn.Module,
+        data_collator: DataCollatorWithPadding | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        # less commonly used
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        callbacks: list[TrainerCallback] | None = None,
+        peft_config: "PeftConfig | None" = None,
+    ) -> None:
+        if ref_model is model:
+            raise ValueError(
+                "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
+                "same as `model`, you must make a copy of it, or `None` if you use peft."
+            )
+
+        self.args = args
+        self.processing_class = processing_class
+        self.policy_model = model
+
+        # Define the collator if not provided
+        if data_collator is None:
+            data_collator = DataCollatorWithPadding(self.processing_class)
+
+        # Handle stop token settings: update policy model's generation_config to use provided stop token
+        if args.stop_token and args.stop_token_id:
+            raise ValueError("You cannot set both `stop_token` and `stop_token_id`.")
+        elif args.stop_token:
+            if args.stop_token == "eos":
+                self.policy_model.generation_config.eos_token_id = self.stop_token_id = processing_class.eos_token_id
+            else:
+                raise ValueError(
+                    f"Unknown `stop_token` {args.stop_token}. Allowed values are: `'eos'` and `None` (no stop token)."
+                )
+        else:
+            self.policy_model.generation_config.eos_token_id = self.stop_token_id = args.stop_token_id  # None or int
+
+        # Check that the kl estimator is valid
+        if self.args.kl_estimator not in {"k1", "k3"}:
+            raise ValueError(
+                "kl_estimator must be either 'k1' (straightforward, unbiased) or 'k3' (lower variance, unbiased, "
+                "appears to be a strictly better estimator). See "
+                "[Approximating KL Divergence](http://joschu.net/blog/kl-approx.html) for details."
+            )
+
+        # peft support
+        if not is_peft_available() and peft_config is not None:
+            raise ImportError(
+                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
+            )
+        elif is_peft_available() and peft_config is not None:
+            # if model is a peft model and we have a peft_confg, we merge and unload it first
+            if isinstance(self.policy_model, PeftModel):
+                self.policy_model = self.policy_model.merge_and_unload()
+
+            # get peft model with the given config
+            self.policy_model = get_peft_model(self.policy_model, peft_config)
+            if args.bf16 and getattr(self.policy_model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(self.policy_model)
+
+        self.is_peft_model = is_peft_available() and isinstance(self.policy_model, PeftModel)
+        self.model_adapter_name = args.model_adapter_name
+        self.ref_adapter_name = args.ref_adapter_name
+
+        if ref_model:
+            self.ref_model = ref_model
+        elif self.is_peft_model:
+            self.ref_model = None
+        else:
+            self.ref_model = create_reference_model(self.policy_model)
+
+        self.reward_model = reward_model
+        self.train_dataset = train_dataset
+        self.train_dataset_len = len(train_dataset)
+        self.value_model = value_model
+        self.data_collator = data_collator
+        self.eval_dataset = eval_dataset
+        self.optimizer, self.lr_scheduler = optimizers
+        self.optimizer_cls_and_kwargs = None  # needed for transformers >= 4.47
+
+        #########
+        # calculate various batch sizes
+        #########
+        if args.total_episodes is None:  # allow the users to define episodes in terms of epochs.
+            args.total_episodes = int(args.num_train_epochs * self.train_dataset_len)
+        accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)
+        self.accelerator = accelerator
+        args.world_size = accelerator.num_processes
+        args.local_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps
+        args.micro_batch_size = int(args.per_device_train_batch_size * args.world_size)
+        args.batch_size = int(args.local_batch_size * args.world_size)
+        args.mini_batch_size = exact_div(
+            args.batch_size, args.num_mini_batches, "`batch_size` must be a multiple of `num_mini_batches`"
+        )
+        args.local_mini_batch_size = exact_div(
+            args.local_batch_size, args.num_mini_batches, "`local_batch_size` must be a multiple of `num_mini_batches`"
+        )
+        if args.whiten_rewards:
+            assert args.local_mini_batch_size >= 8, (
+                f"Per-rank minibatch size {args.local_mini_batch_size} is insufficient for whitening"
+            )
+        # `per_rank_rollout_batch_size` is our `args.local_batch_size`
+        # `per_rank_minibatch_size` is our `args.local_mini_batch_size`
+        args.num_total_batches = math.ceil(
+            args.total_episodes / args.batch_size
+        )  # we may train for more than `total_episodes`
+        time_tensor = torch.tensor(int(time.time()), device=accelerator.device)
+        time_int = broadcast(time_tensor, 0).item()  # avoid different timestamps across processes
+        args.run_name = f"{args.exp_name}__{args.seed}__{time_int}"
+        self.local_seed = args.seed + accelerator.process_index * 100003  # Prime
+        if args.num_sample_generations > 0:
+            self.sample_generations_freq = max(1, args.num_total_batches // args.num_sample_generations)
+        self.local_dataloader_batch_size = args.local_batch_size
+
+        #########
+        # setup model, optimizer, and others
+        #########
+        for module in [self.policy_model, self.ref_model, self.value_model, self.reward_model]:
+            if module is not None:
+                disable_dropout_in_model(module)
+        self.model = PolicyAndValueWrapper(self.policy_model, self.value_model)
+        self.model.config = self.policy_model.config  # needed for pushing to hub
+        self.create_optimizer_and_scheduler(
+            num_training_steps=args.num_total_batches
+        )  # note that we are calling `self.lr_scheduler.step()` manually only at the batch level
+
+        #########
+        # trainer specifics
+        #########
+        default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
+        self.callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
+        self.callback_handler = CallbackHandler(
+            self.callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler
+        )
+        self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)
+        self.control = TrainerControl()
+        self.state = OnlineTrainerState(
+            is_local_process_zero=self.is_local_process_zero(),
+            is_world_process_zero=self.is_world_process_zero(),
+            stateful_callbacks=[
+                cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
+            ],
+        )
+        self.current_flos = 0
+        self.hp_search_backend = None
+        self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None
+        self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None
+        # Create distant repo and output directory if needed
+        self.hub_model_id = None
+        if self.args.push_to_hub:
+            self.init_hf_repo()
+        if self.args.should_save:
+            os.makedirs(self.args.output_dir, exist_ok=True)
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        #########
+        # setup dataloader
+        #########
+        self.dataloader = DataLoader(
+            self.train_dataset,
+            batch_size=self.local_dataloader_batch_size,
+            shuffle=True,
+            collate_fn=self.data_collator,
+            drop_last=True,  # needed; otherwise the last batch will be of ragged shape
+        )
+        # sync random states for DataLoader(shuffle=True) before `accelerator.prepare`
+        # see https://gist.github.com/vwxyzjn/2581bff1e48e185e0b85b6dfe1def79c
+        torch.manual_seed(args.seed)
+        self.model, self.optimizer, self.dataloader = accelerator.prepare(self.model, self.optimizer, self.dataloader)
+        torch.manual_seed(self.local_seed)  # reset the local seed again
+
+        self.eval_dataloader = DataLoader(
+            self.eval_dataset,
+            batch_size=args.per_device_eval_batch_size,
+            collate_fn=self.data_collator,
+            drop_last=True,
+        )  # no need to shuffle eval dataset
+        self.eval_dataloader = accelerator.prepare(self.eval_dataloader)
+
+        if self.is_deepspeed_enabled:
+            self.reward_model = prepare_deepspeed(
+                self.reward_model, args.per_device_train_batch_size, args.fp16, args.bf16
+            )
+
+            if self.ref_model is None:
+                if not self.is_peft_model:
+                    raise ValueError("No reference model and model is not a Peft model.")
+            else:
+                self.ref_model = prepare_deepspeed(
+                    self.ref_model, args.per_device_train_batch_size, args.fp16, args.bf16
+                )
+        else:
+            if self.ref_model is None:
+                if not self.is_peft_model:
+                    raise ValueError("No reference model and model is not a Peft model.")
+            else:
+                self.ref_model = self.ref_model.to(self.accelerator.device)
+            self.reward_model = self.reward_model.to(self.accelerator.device)
+
+    def get_train_dataloader(self) -> DataLoader:
+        return self.dataloader
+
+    def get_eval_dataloader(self) -> DataLoader:
+        return self.eval_dataloader
+
+    @contextmanager
+    def null_ref_context(self):
+        """Context manager for handling null reference model (that is, peft adapter manipulation)."""
+        with (
+            self.accelerator.unwrap_model(self.model.policy).disable_adapter()
+            if self.is_peft_model and not self.ref_adapter_name
+            else nullcontext()
+        ):
+            if self.ref_adapter_name:
+                self.model.policy.set_adapter(self.ref_adapter_name)
+            yield
+            if self.ref_adapter_name:
+                self.model.policy.set_adapter(self.model_adapter_name or "default")
+
+    def save_model(self, output_dir: str | None = None, _internal_call: bool = False):
+        backup_model = self.model
+        self.model = self.model.policy  # save only the policy
+
+        if self.is_deepspeed_enabled:
+            backup_deepspeed = self.deepspeed
+            self.deepspeed = self.model
+
+        super().save_model(output_dir, _internal_call)
+
+        self.model = backup_model
+
+        if self.is_deepspeed_enabled:
+            self.deepspeed = backup_deepspeed
+
+    def train(self):
+        args = self.args
+        accelerator = self.accelerator
+        optimizer = self.optimizer
+        model = self.model
+        ref_policy = self.ref_model
+        reward_model = self.reward_model
+        processing_class = self.processing_class
+        dataloader = self.dataloader
+        device = accelerator.device
+
+        def repeat_generator():
+            while True:
+                yield from dataloader
+
+        iter_dataloader = iter(repeat_generator())
+        generation_config = GenerationConfig(
+            max_new_tokens=args.response_length,
+            temperature=(args.temperature + 1e-7),
+            top_k=0.0,
+            top_p=1.0,
+            do_sample=True,
+        )
+
+        accelerator.print("===training policy===")
+        start_time = time.time()
+        stats_shape = (args.num_ppo_epochs, args.num_mini_batches, args.gradient_accumulation_steps)
+        approxkl_stats = torch.zeros(stats_shape, device=device)
+        pg_clipfrac_stats = torch.zeros(stats_shape, device=device)
+        pg_loss_stats = torch.zeros(stats_shape, device=device)
+        vf_loss_stats = torch.zeros(stats_shape, device=device)
+        vf_clipfrac_stats = torch.zeros(stats_shape, device=device)
+        entropy_stats = torch.zeros(stats_shape, device=device)
+        ratio_stats = torch.zeros(stats_shape, device=device)
+        model.train()
+
+        # trainer state initialization
+        self.state.global_step = 0
+        self.state.episode = 0
+        self.state.max_steps = args.num_total_batches
+        self.state.num_train_epochs = args.total_episodes / self.train_dataset_len
+        # Compute absolute values for logging, eval, and save if given as ratio
+        if args.logging_steps is not None:
+            if args.logging_steps < 1:
+                self.state.logging_steps = math.ceil(self.state.max_steps * args.logging_steps)
+            else:
+                self.state.logging_steps = args.logging_steps
+        if args.eval_steps is not None:
+            if args.eval_steps < 1:
+                self.state.eval_steps = math.ceil(self.state.max_steps * args.eval_steps)
+            else:
+                self.state.eval_steps = args.eval_steps
+        if args.save_steps is not None:
+            if args.save_steps < 1:
+                self.state.save_steps = math.ceil(self.state.max_steps * args.save_steps)
+            else:
+                self.state.save_steps = args.save_steps
+        self.control = self.callback_handler.on_train_begin(args, self.state, self.control)
+
+        # backward compatibility
+        if self.is_deepspeed_enabled:
+            self.deepspeed = self.model
+            self.model_wrapped = self.model
+
+        for update in range(1, args.num_total_batches + 1):
+            self.state.episode += 1 * args.batch_size
+            data = next(iter_dataloader)
+            with torch.no_grad():
+                queries = data["input_ids"].to(device)
+                context_length = queries.shape[1]
+                responses = []
+                postprocessed_responses = []
+                logprobs = []
+                ref_logprobs = []
+                scores = []
+                sequence_lengths = []
+                values = []
+                with unwrap_model_for_generation(
+                    self.model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation
+                ) as unwrapped_model:
+                    query_responses, logitss = batch_generation(
+                        unwrapped_model.policy,
+                        queries,
+                        args.local_rollout_forward_batch_size,
+                        processing_class.pad_token_id,
+                        generation_config,
+                    )
+
+                for i in range(0, queries.shape[0], args.local_rollout_forward_batch_size):
+                    query = queries[i : i + args.local_rollout_forward_batch_size]
+                    query_response = query_responses[i : i + args.local_rollout_forward_batch_size]
+                    response = query_response[:, context_length:]
+                    logits = logitss[i : i + args.local_rollout_forward_batch_size]
+                    logprob = selective_log_softmax(logits, response)
+                    del logits
+                    empty_cache()
+
+                    if ref_policy is None:
+                        with self.null_ref_context():
+                            ref_output = forward(model.policy, query_response, processing_class.pad_token_id)
+                    else:
+                        ref_output = forward(ref_policy, query_response, processing_class.pad_token_id)
+                    ref_logits = ref_output.logits[:, context_length - 1 : -1]
+                    ref_logits /= args.temperature + 1e-7
+                    ref_logprob = selective_log_softmax(ref_logits, response)
+                    del ref_output, ref_logits
+                    empty_cache()
+
+                    # Response Processing 1. truncate response after the first occurrence of `stop_token_id`
+                    postprocessed_response = response
+                    if self.stop_token_id is not None:  # handle the edge case when stop_token_id exists but is 0
+                        postprocessed_response = truncate_response(
+                            self.stop_token_id, processing_class.pad_token_id, response
+                        )
+
+                    # Response Processing 2. run reward model on the truncated responses
+                    postprocessed_query_response = torch.cat((query, postprocessed_response), 1)
+                    sequence_length = first_true_indices(postprocessed_response == processing_class.pad_token_id) - 1
+                    unwrapped_value_model = accelerator.unwrap_model(model).value_model
+                    full_value, _, _ = get_reward(
+                        unwrapped_value_model, query_response, processing_class.pad_token_id, context_length
+                    )
+                    value = full_value[:, context_length - 1 : -1].squeeze(-1)
+                    _, score, _ = get_reward(
+                        reward_model, postprocessed_query_response, processing_class.pad_token_id, context_length
+                    )
+
+                    responses.append(response)
+                    postprocessed_responses.append(postprocessed_response)
+                    logprobs.append(logprob)
+                    ref_logprobs.append(ref_logprob)
+                    sequence_lengths.append(sequence_length)
+                    scores.append(score)
+                    values.append(value)
+                responses = torch.cat(responses, 0)
+                postprocessed_responses = torch.cat(postprocessed_responses, 0)
+                logprobs = torch.cat(logprobs, 0)
+                ref_logprobs = torch.cat(ref_logprobs, 0)
+                sequence_lengths = torch.cat(sequence_lengths, 0)
+                scores = torch.cat(scores, 0)
+                values = torch.cat(values, 0)
+                del (logprob, ref_logprob, full_value, value, score, unwrapped_model)
+                empty_cache()
+                gc.collect()
+
+                # Response Processing 3. Filter completion. Ensure that the sample contains stop_token_id
+                # Completions not passing that filter will receive a lower score.
+                contain_eos_token = torch.any(postprocessed_responses == self.processing_class.eos_token_id, dim=-1)
+                if self.args.missing_eos_penalty is not None:
+                    scores[~contain_eos_token] -= self.args.missing_eos_penalty
+                # accelerator.print(f"{scores=}, {(contain_eos_token.sum() / len(contain_eos_token))=}")
+
+                # be very careful with `padding_mask_p1`; see https://excalidraw.com/#json=LWnzG4w2k5DjF_EOL_xPt,e2w3a-hFJ_gX5vOfeyXGTw
+                response_idxs = torch.arange(responses.shape[1], device=responses.device).repeat(responses.shape[0], 1)
+                padding_mask = response_idxs > sequence_lengths.unsqueeze(1)
+                logprobs = torch.masked_fill(logprobs, padding_mask, INVALID_LOGPROB)
+                ref_logprobs = torch.masked_fill(ref_logprobs, padding_mask, INVALID_LOGPROB)
+                sequence_lengths_p1 = sequence_lengths + 1
+                padding_mask_p1 = response_idxs > (sequence_lengths_p1.unsqueeze(1))
+                values = torch.masked_fill(values, padding_mask_p1, 0)
+
+                # 4. compute rewards
+                # Formula used by http://joschu.net/blog/kl-approx.html for the k1 and k3 estimators
+                logr = ref_logprobs - logprobs
+                kl = -logr if args.kl_estimator == "k1" else (logr.exp() - 1) - logr  # Else statement is k3
+                non_score_reward = -args.kl_coef * kl
+                rewards = non_score_reward.clone()
+                actual_start = torch.arange(rewards.size(0), device=rewards.device)
+                actual_end = torch.where(sequence_lengths_p1 < rewards.size(1), sequence_lengths_p1, sequence_lengths)
+                rewards[actual_start, actual_end] += scores
+
+                # 5. whiten rewards
+                if args.whiten_rewards:
+                    rewards = masked_whiten(rewards, mask=~padding_mask_p1, shift_mean=False)
+                    rewards = torch.masked_fill(rewards, padding_mask_p1, 0)
+
+                # 6. compute advantages and returns
+                lastgaelam = 0
+                advantages_reversed = []
+                gen_length = responses.shape[1]
+                for t in reversed(range(gen_length)):
+                    nextvalues = values[:, t + 1] if t < gen_length - 1 else 0.0
+                    delta = rewards[:, t] + args.gamma * nextvalues - values[:, t]
+                    lastgaelam = delta + args.gamma * args.lam * lastgaelam
+                    advantages_reversed.append(lastgaelam)
+                advantages = torch.stack(advantages_reversed[::-1], axis=1)
+                returns = advantages + values
+                advantages = masked_whiten(advantages, ~padding_mask)
+                advantages = torch.masked_fill(advantages, padding_mask, 0)
+                empty_cache()
+
+            # Do multiple epochs of PPO training, with a fresh random shuffle in each epoch
+            for ppo_epoch_idx in range(args.num_ppo_epochs):
+                b_inds = np.random.permutation(args.local_batch_size)
+                minibatch_idx = 0
+                for mini_batch_start in range(0, args.local_batch_size, args.local_mini_batch_size):
+                    mini_batch_end = mini_batch_start + args.local_mini_batch_size
+                    mini_batch_inds = b_inds[mini_batch_start:mini_batch_end]
+                    gradient_accumulation_idx = 0
+                    for micro_batch_start in range(0, args.local_mini_batch_size, args.per_device_train_batch_size):
+                        with accelerator.accumulate(model):
+                            micro_batch_end = micro_batch_start + args.per_device_train_batch_size
+                            micro_batch_inds = mini_batch_inds[micro_batch_start:micro_batch_end]
+                            mb_advantage = advantages[micro_batch_inds]
+                            mb_responses = responses[micro_batch_inds]
+                            mb_query_responses = query_responses[micro_batch_inds]
+                            mb_logprobs = logprobs[micro_batch_inds]
+                            mb_return = returns[micro_batch_inds]
+                            mb_values = values[micro_batch_inds]
+
+                            output, vpred_temp = forward(model, mb_query_responses, processing_class.pad_token_id)
+                            logits = output.logits[:, context_length - 1 : -1]
+                            logits /= args.temperature + 1e-7
+                            new_logprobs = selective_log_softmax(logits, mb_responses)
+                            new_logprobs = torch.masked_fill(
+                                new_logprobs, padding_mask[micro_batch_inds], INVALID_LOGPROB
+                            )
+                            vpred = vpred_temp[:, context_length - 1 : -1].squeeze(-1)
+                            vpred = torch.masked_fill(vpred, padding_mask_p1[micro_batch_inds], 0)
+                            vpredclipped = torch.clamp(
+                                vpred,
+                                mb_values - args.cliprange_value,
+                                mb_values + args.cliprange_value,
+                            )
+                            vf_losses1 = torch.square(vpred - mb_return)
+                            vf_losses2 = torch.square(vpredclipped - mb_return)
+                            vf_loss_max = torch.max(vf_losses1, vf_losses2)
+                            vf_loss = 0.5 * masked_mean(vf_loss_max, ~padding_mask_p1[micro_batch_inds])
+                            vf_clipfrac = masked_mean(
+                                (vf_losses2 > vf_losses1).float(), ~padding_mask_p1[micro_batch_inds]
+                            )
+                            logprobs_diff = new_logprobs - mb_logprobs
+                            ratio = torch.exp(logprobs_diff)
+                            pg_losses = -mb_advantage * ratio
+                            pg_losses2 = -mb_advantage * torch.clamp(ratio, 1.0 - args.cliprange, 1.0 + args.cliprange)
+                            pg_loss_max = torch.max(pg_losses, pg_losses2)
+                            pg_loss = masked_mean(pg_loss_max, ~padding_mask[micro_batch_inds])
+                            loss = pg_loss + args.vf_coef * vf_loss
+                            accelerator.backward(loss)
+                            optimizer.step()
+                            optimizer.zero_grad()
+                            with torch.no_grad():
+                                pg_clipfrac = masked_mean(
+                                    (pg_losses2 > pg_losses).float(), ~padding_mask[micro_batch_inds]
+                                )
+                                prob_dist = torch.nn.functional.softmax(logits, dim=-1)
+                                entropy = torch.logsumexp(logits, dim=-1) - torch.sum(prob_dist * logits, dim=-1)
+                                approxkl = 0.5 * (logprobs_diff**2).mean()
+                                approxkl_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = approxkl
+                                pg_clipfrac_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = (
+                                    pg_clipfrac
+                                )
+                                pg_loss_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = pg_loss
+                                vf_loss_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = vf_loss
+                                vf_clipfrac_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = (
+                                    vf_clipfrac
+                                )
+                                entropy_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = entropy.mean()
+                                ratio_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = ratio.mean()
+                        gradient_accumulation_idx += 1
+                    minibatch_idx += 1
+                    # del everything and empty cache
+                    # fmt: off
+                    del (
+                        output, vpred_temp, logits, new_logprobs, vpred, vpredclipped,
+                        vf_losses1, vf_losses2, vf_loss, vf_clipfrac, logprobs_diff, ratio, pg_losses, pg_losses2, pg_loss_max,
+                        pg_loss, loss, pg_clipfrac, prob_dist, entropy, approxkl, mb_return,
+                        mb_advantage, mb_values, mb_responses, mb_query_responses, mb_logprobs,
+                    )
+                    # fmt: on
+                    empty_cache()
+            with torch.no_grad():
+                mean_kl = kl.sum(1).mean()
+                mean_entropy = (-logprobs).sum(1).mean()
+                mean_non_score_reward = non_score_reward.sum(1).mean()
+                rlhf_reward = mean_non_score_reward + scores.mean()
+                eps = int(self.state.episode / (time.time() - start_time))
+                metrics = {}
+                metrics["eps"] = eps
+                metrics["objective/kl"] = self.accelerator.gather_for_metrics(mean_kl).mean().item()
+                metrics["objective/entropy"] = self.accelerator.gather_for_metrics(mean_entropy).mean().item()
+                metrics["objective/non_score_reward"] = (
+                    self.accelerator.gather_for_metrics(mean_non_score_reward).mean().item()
+                )
+                metrics["objective/rlhf_reward"] = self.accelerator.gather_for_metrics(rlhf_reward).mean().item()
+                metrics["objective/scores"] = self.accelerator.gather_for_metrics(scores.mean()).mean().item()
+                metrics["policy/approxkl_avg"] = self.accelerator.gather_for_metrics(approxkl_stats).mean().item()
+                metrics["policy/clipfrac_avg"] = self.accelerator.gather_for_metrics(pg_clipfrac_stats).mean().item()
+                metrics["loss/policy_avg"] = self.accelerator.gather_for_metrics(pg_loss_stats).mean().item()
+                metrics["loss/value_avg"] = self.accelerator.gather_for_metrics(vf_loss_stats).mean().item()
+                metrics["val/clipfrac_avg"] = self.accelerator.gather_for_metrics(vf_clipfrac_stats).mean().item()
+                metrics["policy/entropy_avg"] = self.accelerator.gather_for_metrics(entropy_stats).mean().item()
+                metrics["val/ratio"] = self.accelerator.gather_for_metrics(ratio_stats).mean().item()
+                metrics["val/ratio_var"] = self.accelerator.gather_for_metrics(ratio_stats).var().item()
+                metrics["val/num_eos_tokens"] = (responses == processing_class.eos_token_id).sum().item()
+                metrics["lr"] = self.lr_scheduler.get_last_lr()[0]
+                metrics["episode"] = self.state.episode
+                self.state.epoch = self.state.episode / self.train_dataset_len  # used by self.log
+                self.state.global_step += 1
+                self.log(metrics)
+
+            self.lr_scheduler.step()
+            self.control = self.callback_handler.on_step_end(args, self.state, self.control)
+            if self.control.should_save:
+                self._save_checkpoint(model, trial=None)
+                self.control = self.callback_handler.on_save(self.args, self.state, self.control)
+            del kl, mean_kl, mean_entropy, mean_non_score_reward, scores, metrics, non_score_reward
+            empty_cache()
+            gc.collect()
+
+            if args.num_sample_generations > 0 and (update - 1) % self.sample_generations_freq == 0:
+                self.generate_completions(sampling=True)
+                empty_cache()
+            del (
+                query_responses,
+                responses,
+                postprocessed_responses,
+                logprobs,
+                ref_logprobs,
+                values,
+                sequence_lengths,
+                contain_eos_token,
+                sequence_lengths_p1,
+                response_idxs,
+                padding_mask,
+                padding_mask_p1,
+                rewards,
+                actual_start,
+                actual_end,
+                advantages,
+                returns,
+            )
+            empty_cache()
+
+        # HF trainer specifics
+        self.control = self.callback_handler.on_train_end(args, self.state, self.control)
+        if self.control.should_save:
+            self._save_checkpoint(model, trial=None)
+            self.control = self.callback_handler.on_save(self.args, self.state, self.control)
+
+    def generate_completions(self, sampling: bool = False):
+        args = self.args
+        processing_class = self.processing_class
+        generation_config = GenerationConfig(
+            max_new_tokens=self.args.response_length,
+            temperature=(0.01 + 1e-7),
+            top_k=0.0,
+            top_p=1.0,
+            do_sample=True,
+        )
+
+        table = defaultdict(list)
+        with unwrap_model_for_generation(
+            self.model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation
+        ) as unwrapped_model:
+            for batch in self.eval_dataloader:
+                query = batch["input_ids"]
+                with torch.no_grad():
+                    context_length = query.shape[1]
+                    query_response, _ = batch_generation(
+                        unwrapped_model.policy,
+                        query,
+                        query.shape[0],
+                        processing_class.pad_token_id,
+                        generation_config,
+                    )
+                    response = query_response[:, context_length:]
+                    postprocessed_response = response
+                    if self.stop_token_id is not None:  # handle the edge case when stop_token_id exists but is 0
+                        postprocessed_response = truncate_response(
+                            self.stop_token_id, processing_class.pad_token_id, response
+                        )
+                    table["query"].extend(
+                        gather_object(processing_class.batch_decode(query, skip_special_tokens=True))
+                    )
+                    table["model response"].extend(
+                        gather_object(processing_class.batch_decode(postprocessed_response))
+                    )
+
+                    postprocessed_query_response = torch.cat((query, postprocessed_response), 1)
+                    _, score, _ = get_reward(
+                        self.reward_model, postprocessed_query_response, processing_class.pad_token_id, context_length
+                    )
+                    table["score"].extend(self.accelerator.gather_for_metrics(score).float().cpu().numpy())
+
+                if sampling:
+                    break
+        df = pd.DataFrame(table)
+
+        if self.accelerator.is_main_process:
+            if is_rich_available():
+                print_rich_table(df.iloc[0 : 0 + 5])
+            if "wandb" in args.report_to:
+                import wandb
+
+                if wandb.run is not None:
+                    wandb.log({"completions": wandb.Table(dataframe=df)})
+
+            if "comet_ml" in args.report_to:
+                log_table_to_comet_experiment(
+                    name="completions.csv",
+                    table=df,
+                )
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/trl/experimental/xpo/__init__.py b/trl/experimental/xpo/__init__.py
new file mode 100644
index 00000000000..ca4a4a608d1
--- /dev/null
+++ b/trl/experimental/xpo/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .xpo_config import XPOConfig
+from .xpo_trainer import XPOTrainer
+
+
+__all__ = ["XPOConfig", "XPOTrainer"]
diff --git a/trl/experimental/xpo/xpo_config.py b/trl/experimental/xpo/xpo_config.py
new file mode 100644
index 00000000000..ddd38a0ecd7
--- /dev/null
+++ b/trl/experimental/xpo/xpo_config.py
@@ -0,0 +1,44 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from ...trainer.online_dpo_config import OnlineDPOConfig
+
+
+@dataclass
+class XPOConfig(OnlineDPOConfig):
+    r"""
+    Configuration class for the [`experimental.xpo.XPOTrainer`].
+
+    Subclass of [`OnlineDPOConfig`] we can use all its arguments and add the following:
+
+    Parameters:
+        alpha (`float` or `list[float]`, *optional*, defaults to `1e-5`):
+            Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each new epoch
+            and the last alpha is used for the rest of the epochs.
+    """
+
+    alpha: list[float] = field(
+        default_factory=lambda: [1e-5],
+        metadata={
+            "help": "Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each "
+            "new epoch and the last alpha is used for the rest of the epochs."
+        },
+    )
+
+    def __post_init__(self):
+        super().__post_init__()
+        if hasattr(self.alpha, "__len__") and len(self.alpha) == 1:
+            self.alpha = self.alpha[0]
diff --git a/trl/experimental/xpo/xpo_trainer.py b/trl/experimental/xpo/xpo_trainer.py
new file mode 100644
index 00000000000..bfa20d8d1fa
--- /dev/null
+++ b/trl/experimental/xpo/xpo_trainer.py
@@ -0,0 +1,538 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import textwrap
+from collections.abc import Callable
+from typing import Any
+
+import jinja2
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from datasets import Dataset, IterableDataset
+from transformers import (
+    BaseImageProcessor,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    TrainerCallback,
+)
+from transformers.trainer_utils import EvalPrediction
+from transformers.training_args import OptimizerNames
+from transformers.utils import is_peft_available
+
+from ...data_utils import is_conversational, maybe_apply_chat_template
+from ...models.utils import unwrap_model_for_generation
+from ...trainer.judges import BasePairwiseJudge
+from ...trainer.online_dpo_trainer import OnlineDPOTrainer
+from ...trainer.utils import (
+    SIMPLE_CHAT_TEMPLATE,
+    empty_cache,
+    get_reward,
+    selective_log_softmax,
+    truncate_right,
+)
+from .xpo_config import XPOConfig
+
+
+if is_peft_available():
+    from peft import PeftModel
+
+
+class XPOTrainer(OnlineDPOTrainer):
+    """
+    Trainer for Exploratory Preference Optimization (XPO).
+
+    It is implemented as a subclass of [`OnlineDPOTrainer`].
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]):
+            The model to train, preferably an `AutoModelForCausalLM`.
+        ref_model ([`PreTrainedModelWrapper`]):
+            Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
+            and loss. If no reference model is provided, the trainer will create a reference model with the same
+            architecture as the model to be optimized.
+        reward_funcs ([`~transformers.PreTrainedModel`]):
+            The reward model to score completions with, preferably an
+            [`~transformers.AutoModelForSequenceClassification`].
+        judge ([`experimental.judges.BasePairwiseJudge`]):
+            The judge to use for pairwise comparison of model completions.
+        args ([`experimental.xpo.XPOConfig`]):
+            The XPO config arguments to use for training.
+        data_collator ([`~transformers.DataCollator`]):
+            The data collator to use for training. If None is specified, the default data collator
+            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        train_dataset ([`~datasets.Dataset`]):
+            The dataset to use for training.
+        eval_dataset ([`~datasets.Dataset`]):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        peft_config (`dict`):
+            The peft config to use for training.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
+            metric values.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+    """
+
+    _tag_names = ["trl", "xpo"]
+    _name = "XPO"
+    _paper = {
+        "title": "Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF",
+        "id": "2405.21046",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @article{jung2024binary,
+                title        = {{Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF}},
+                author       = {Tengyang Xie and Dylan J. Foster and Akshay Krishnamurthy and Corby Rosset and Ahmed Awadallah and Alexander Rakhlin},
+                year         = 2024,
+                eprint       = {arXiv:2405.21046}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module = None,
+        ref_model: PreTrainedModel | nn.Module = None,
+        reward_funcs: nn.Module | None = None,
+        judge: BasePairwiseJudge | None = None,
+        args: XPOConfig | None = None,
+        data_collator: Callable | None = None,
+        train_dataset: Dataset | IterableDataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        reward_processing_classes: PreTrainedTokenizerBase | list[PreTrainedTokenizerBase] | None = None,
+        peft_config: dict | None = None,
+        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
+    ) -> None:
+        super().__init__(
+            model=model,
+            ref_model=ref_model,
+            judge=judge,
+            reward_funcs=reward_funcs,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            reward_processing_classes=reward_processing_classes,
+            peft_config=peft_config,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        self._alpha = self.args.alpha
+
+        # Overwrite the stats dictionary to include XPO specific statistics
+        self.stats = {
+            # Remove "non_score_reward", "rlhf_reward", "scores"
+            # Add "loss/dpo", "loss/xpo"
+            "loss/dpo": [],
+            "loss/xpo": [],
+            "objective/kl": [],
+            "objective/entropy": [],
+            "rewards/chosen": [],
+            "rewards/rejected": [],
+            "rewards/accuracies": [],
+            "rewards/margins": [],
+            "logps/chosen": [],
+            "logps/rejected": [],
+            # Replace "contain_eos_token" by "model_contain_eos_token" and "ref_contain_eos_token"
+            "val/model_contain_eos_token": [],
+            "val/ref_contain_eos_token": [],
+            "alpha": [],
+            "beta": [],
+        }
+        if self.reward_funcs is not None:
+            if len(self.reward_funcs) != 1:
+                raise ValueError("XPOTrainer only supports one reward function/model.")
+            self.reward_funcs = self.reward_funcs[0]
+            self.stats["objective/model_scores"] = []
+            self.stats["objective/ref_scores"] = []
+            self.stats["objective/scores_margin"] = []
+
+    @property
+    def alpha(self):
+        if isinstance(self._alpha, list):
+            epoch = self.state.epoch
+            return self._alpha[epoch] if epoch < len(self._alpha) else self._alpha[-1]
+        else:
+            return self._alpha
+
+    def _generate_completions(self, prompts, model):
+        with unwrap_model_for_generation(model, self.accelerator) as unwrapped_policy_model_for_gen:
+            model_output = unwrapped_policy_model_for_gen.generate(
+                input_ids=prompts["input_ids"],
+                attention_mask=prompts["attention_mask"],
+                generation_config=self.generation_config,
+            )
+
+        actual_model_for_ref_generation: torch.nn.Module
+        if self.ref_model is None:
+            unwrapped_main_model_for_ref_logic = self.accelerator.unwrap_model(model)
+
+            if is_peft_available() and isinstance(unwrapped_main_model_for_ref_logic, PeftModel):
+                actual_model_for_ref_generation = unwrapped_main_model_for_ref_logic.get_base_model()
+            else:
+                actual_model_for_ref_generation = unwrapped_main_model_for_ref_logic
+        else:
+            actual_model_for_ref_generation = self.accelerator.unwrap_model(self.ref_model)
+
+        with unwrap_model_for_generation(actual_model_for_ref_generation, self.accelerator) as final_ref_model_for_gen:
+            ref_output = final_ref_model_for_gen.generate(
+                input_ids=prompts["input_ids"],
+                attention_mask=prompts["attention_mask"],
+                generation_config=self.generation_config,
+            )
+
+        return model_output, ref_output
+
+    def _process_completions(self, model_output, ref_output, prompts):
+        context_length = prompts["input_ids"].shape[1]
+
+        # Process model completions
+        model_completion_ids = model_output[:, context_length:]
+        model_completion_ids, model_completion_mask = truncate_right(
+            model_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id
+        )
+        model_data = {
+            "input_ids": torch.cat((prompts["input_ids"], model_completion_ids), dim=1),
+            "attention_mask": torch.cat((prompts["attention_mask"], model_completion_mask), dim=1),
+            "raw": prompts["raw"],
+        }
+
+        # Process reference model completions
+        ref_completion_ids = ref_output[:, context_length:]
+        ref_completion_ids, ref_completion_mask = truncate_right(
+            ref_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id
+        )
+        ref_data = {
+            "input_ids": torch.cat((prompts["input_ids"], ref_completion_ids), dim=1),
+            "attention_mask": torch.cat((prompts["attention_mask"], ref_completion_mask), dim=1),
+            "raw": prompts["raw"],
+        }
+
+        return model_data, ref_data
+
+    def _compute_rewards(self, model_data, ref_data, context_length):
+        with torch.no_grad():
+            _, model_scores, _ = get_reward(
+                self.reward_funcs, model_data["input_ids"], self.processing_class.pad_token_id, context_length
+            )
+            _, ref_scores, _ = get_reward(
+                self.reward_funcs, ref_data["input_ids"], self.processing_class.pad_token_id, context_length
+            )
+
+        # Apply EOS penalty if needed
+        if self.args.missing_eos_penalty is not None:
+            model_contain_eos = torch.any(model_data["input_ids"] == self.processing_class.eos_token_id, dim=-1)
+            ref_contain_eos = torch.any(ref_data["input_ids"] == self.processing_class.eos_token_id, dim=-1)
+            model_scores[~model_contain_eos] -= self.args.missing_eos_penalty
+            ref_scores[~ref_contain_eos] -= self.args.missing_eos_penalty
+
+        return model_scores, ref_scores
+
+    def _compute_judge(self, model_data, ref_data, context_length):
+        prompts = model_data["raw"]
+        model_data_completions = self.processing_class.batch_decode(
+            model_data["input_ids"][:, context_length:], skip_special_tokens=True
+        )
+        model_data_completions = [completion.strip() for completion in model_data_completions]
+
+        ref_data_completions = self.processing_class.batch_decode(
+            ref_data["input_ids"][:, context_length:], skip_special_tokens=True
+        )
+        ref_data_completions = [completion.strip() for completion in ref_data_completions]
+
+        if is_conversational({"prompt": prompts[0]}):
+            model_data_completions = [
+                [{"role": "assistant", "content": completion}] for completion in model_data_completions
+            ]
+            environment = jinja2.Environment()
+            template = environment.from_string(SIMPLE_CHAT_TEMPLATE)
+            prompts = [template.render(messages=message) for message in prompts]
+            model_data_completions = [template.render(messages=completion) for completion in model_data_completions]
+
+            ref_data_completions = [
+                [{"role": "assistant", "content": completion}] for completion in ref_data_completions
+            ]
+            ref_data_completions = [template.render(messages=completion) for completion in ref_data_completions]
+
+        ranks_of_first_completion = self.judge.judge(
+            prompts,
+            list(zip(model_data_completions, ref_data_completions, strict=True)),
+        )
+        # convert ranks to a True/False mask:
+        # when rank == 0, it means the first completion is the best
+        # when rank == 1, it means the second completion is the best
+        return torch.tensor([rank == 0 for rank in ranks_of_first_completion], device=model_data["input_ids"].device)
+
+    def _compute_logprobs(self, model, model_data, ref_data, context_length):
+        def compute_logprobs_for_data(m, data):
+            output = m(data["input_ids"], attention_mask=data["attention_mask"])
+            logits = output.logits[:, context_length - 1 : -1]
+            token_logprobs = selective_log_softmax(logits, data["input_ids"][:, context_length:])
+            return token_logprobs
+
+        # Compute logprobs for model completions
+        model_logprobs_model_data = compute_logprobs_for_data(model, model_data)
+        # Compute logprobs for model on reference completions (for XPO loss)
+        model_logprobs_ref_data = compute_logprobs_for_data(model, ref_data)
+
+        # Compute logprobs for reference model completions
+        with torch.no_grad():
+            if self.ref_model is None:
+                with model.disable_adapter():
+                    ref_logprobs_model_data = compute_logprobs_for_data(model, model_data)
+                    ref_logprobs_ref_data = compute_logprobs_for_data(model, ref_data)
+            else:
+                ref_logprobs_model_data = compute_logprobs_for_data(self.ref_model, model_data)
+                ref_logprobs_ref_data = compute_logprobs_for_data(self.ref_model, ref_data)
+
+        # Mask padding tokens
+        model_padding_mask = model_data["attention_mask"][:, context_length:] == 0
+        ref_padding_mask = ref_data["attention_mask"][:, context_length:] == 0
+        model_logprobs_model_data = model_logprobs_model_data.masked_fill(model_padding_mask, 0.0)
+        model_logprobs_ref_data = model_logprobs_ref_data.masked_fill(ref_padding_mask, 0.0)
+        ref_logprobs_ref_data = ref_logprobs_ref_data.masked_fill(ref_padding_mask, 0.0)
+        ref_logprobs_model_data = ref_logprobs_model_data.masked_fill(model_padding_mask, 0.0)
+
+        return model_logprobs_model_data, model_logprobs_ref_data, ref_logprobs_ref_data, ref_logprobs_model_data
+
+    def _compute_losses(
+        self,
+        model_logprobs_model_data,
+        model_logprobs_ref_data,
+        ref_logprobs_ref_data,
+        ref_logprobs_model_data,
+        chosen_mask,
+    ):
+        # Compute log probs
+        model_logprobs_model_data_sum = model_logprobs_model_data.sum(1)
+        model_logprobs_ref_data_sum = model_logprobs_ref_data.sum(1)
+        ref_logprobs_ref_data_sum = ref_logprobs_ref_data.sum(1)
+        ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1)
+
+        chosen_model_logprobs = torch.where(chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum)
+        chosen_ref_logprobs = torch.where(chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum)
+        chosen_log_ratios = chosen_model_logprobs - chosen_ref_logprobs
+
+        rejected_model_logprobs = torch.where(~chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum)
+        rejected_ref_logprobs = torch.where(~chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum)
+        rejected_log_ratios = rejected_model_logprobs - rejected_ref_logprobs
+
+        # Compute logits as the difference between chosen and rejected log ratios
+        logits = chosen_log_ratios - rejected_log_ratios
+
+        if self.args.loss_type == "sigmoid":
+            dpo_losses = -F.logsigmoid(self.beta * logits)
+        elif self.args.loss_type == "ipo":
+            dpo_losses = (logits - 1 / (2 * self.beta)) ** 2
+        else:
+            raise NotImplementedError(f"invalid loss type {self.args.loss_type}")
+
+        # Compute XPO specific loss
+        xpo_losses = self.alpha * model_logprobs_ref_data_sum
+
+        # Total loss
+        loss = (dpo_losses + xpo_losses).mean()
+
+        return loss, dpo_losses, xpo_losses
+
+    def _log_statistics(
+        self,
+        model_data,
+        ref_data,
+        model_logprobs_model_data,
+        model_logprobs_ref_data,
+        ref_logprobs_ref_data,
+        ref_logprobs_model_data,
+        chosen_mask,
+        dpo_losses,
+        xpo_losses,
+        context_length,
+        model_scores=None,
+        ref_scores=None,
+    ):
+        # Helper function to gather and compute mean
+        def gather_mean(tensor):
+            return self.accelerator.gather_for_metrics(tensor).mean().item()
+
+        # Log losses
+        self.stats["loss/dpo"].append(gather_mean(dpo_losses))
+        self.stats["loss/xpo"].append(gather_mean(xpo_losses))
+
+        # Log scores
+        if self.reward_funcs is not None:
+            self.stats["objective/model_scores"].append(gather_mean(model_scores))
+            self.stats["objective/ref_scores"].append(gather_mean(ref_scores))
+            self.stats["objective/scores_margin"].append(gather_mean(model_scores - ref_scores))
+
+        # Log logprobs
+        model_logprobs_model_data_sum = model_logprobs_model_data.sum(1)
+        model_logprobs_ref_data_sum = model_logprobs_ref_data.sum(1)
+        ref_logprobs_ref_data_sum = ref_logprobs_ref_data.sum(1)
+        ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1)
+
+        chosen_model_logprobs = torch.where(chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum)
+        chosen_ref_logprobs = torch.where(chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum)
+        chosen_log_ratios = chosen_model_logprobs - chosen_ref_logprobs
+
+        rejected_model_logprobs = torch.where(~chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum)
+        rejected_ref_logprobs = torch.where(~chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum)
+        rejected_log_ratios = rejected_model_logprobs - rejected_ref_logprobs
+
+        self.stats["logps/chosen"].append(gather_mean(chosen_model_logprobs.mean() + chosen_ref_logprobs.mean()))
+        self.stats["logps/rejected"].append(gather_mean(rejected_model_logprobs.mean() + rejected_ref_logprobs.mean()))
+
+        # Log rewards
+        # Compute various statistics
+        chosen_rewards = chosen_log_ratios * self.beta
+        rejected_rewards = rejected_log_ratios * self.beta
+        self.stats["rewards/chosen"].append(gather_mean(chosen_rewards.mean()))
+        self.stats["rewards/rejected"].append(gather_mean(rejected_rewards.mean()))
+
+        # Calculate KL divergence for model and ref data
+        kl_model_data = model_logprobs_model_data - ref_logprobs_model_data
+        kl_ref_data = model_logprobs_ref_data - ref_logprobs_ref_data
+        mean_kl = (kl_model_data.sum(1) + kl_ref_data.sum(1)).mean() / 2
+        self.stats["objective/kl"].append(gather_mean(mean_kl))
+
+        # Calculate entropy for model and ref data
+        entropy_model_data = -model_logprobs_model_data.sum(1)
+        entropy_ref_data = -model_logprobs_ref_data.sum(1)
+        mean_entropy = (entropy_model_data.mean() + entropy_ref_data.mean()) / 2
+        self.stats["objective/entropy"].append(gather_mean(mean_entropy))
+
+        # Calculate margins
+        margin = chosen_rewards - rejected_rewards
+        self.stats["rewards/margins"].append(gather_mean(margin.mean()))
+
+        # Calculate accuracy
+        accuracy = (margin > 0).float()
+        self.stats["rewards/accuracies"].append(gather_mean(accuracy.mean()))
+
+        # Log EOS token statistics
+        model_eos = (model_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1)
+        ref_eos = (ref_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1)
+        self.stats["val/model_contain_eos_token"].append(gather_mean(model_eos.float()))
+        self.stats["val/ref_contain_eos_token"].append(gather_mean(ref_eos.float()))
+
+        # Log alpha and beta
+        self.stats["alpha"].append(self.alpha)
+        self.stats["beta"].append(self.beta)
+
+    def training_step(
+        self, model: nn.Module, inputs: dict[str, torch.Tensor | Any], num_items_in_batch: int | None = None
+    ) -> torch.Tensor:
+        model.train()
+
+        # Apply chat template and tokenize the input
+        batch_size = len(next(iter(inputs.values())))
+        prompts = inputs["prompt"]
+        inputs = [{k: v[i] for k, v in inputs.items()} for i in range(batch_size)]
+        inputs = [maybe_apply_chat_template(x, self.processing_class) for x in inputs]
+        inputs = [self.tokenize_row(x, self.model.config.is_encoder_decoder, self.processing_class) for x in inputs]
+        inputs = self.data_collator(inputs)
+
+        # need the prompt_ only
+        inputs = self._prepare_inputs(inputs)
+        context_length = inputs["prompt_input_ids"].shape[1]
+        prompts = {
+            "input_ids": inputs["prompt_input_ids"],
+            "attention_mask": inputs["prompt_attention_mask"],
+            "raw": prompts,
+        }
+        del inputs
+
+        # Sample completions from both the model and the reference model
+        model_output, ref_output = self._generate_completions(prompts, model)
+
+        # Process model completions
+        model_data, ref_data = self._process_completions(model_output, ref_output, prompts)
+
+        # Compute rewards
+        if self.reward_funcs is not None:
+            model_scores, ref_scores = self._compute_rewards(model_data, ref_data, context_length)
+            chosen_mask = model_scores >= ref_scores
+        else:
+            model_scores, ref_scores = None, None
+            chosen_mask = self._compute_judge(model_data, ref_data, context_length)
+
+        # Compute logprobs
+        model_logprobs_model_data, model_logprobs_ref_data, ref_logprobs_ref_data, ref_logprobs_model_data = (
+            self._compute_logprobs(model, model_data, ref_data, context_length)
+        )
+
+        # Compute loss
+        loss, dpo_losses, xpo_losses = self._compute_losses(
+            model_logprobs_model_data,
+            model_logprobs_ref_data,
+            ref_logprobs_ref_data,
+            ref_logprobs_model_data,
+            chosen_mask,
+        )
+
+        # Log everything
+        self._log_statistics(
+            model_data,
+            ref_data,
+            model_logprobs_model_data.detach(),
+            model_logprobs_ref_data.detach(),
+            ref_logprobs_ref_data,
+            ref_logprobs_model_data,
+            chosen_mask,
+            dpo_losses.detach(),
+            xpo_losses.detach(),
+            context_length,
+            model_scores,
+            ref_scores,
+        )
+
+        if (
+            self.args.torch_empty_cache_steps is not None
+            and self.state.global_step % self.args.torch_empty_cache_steps == 0
+        ):
+            empty_cache()
+
+        kwargs = {}
+        # For LOMO optimizers you need to explicitly use the learning rate
+        if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
+            kwargs["learning_rate"] = self._get_learning_rate()
+
+        if self.args.n_gpu > 1:
+            loss = loss.mean()  # mean() to average on multi-gpu parallel training
+
+        self.accelerator.backward(loss, **kwargs)
+
+        return loss.detach() / self.args.gradient_accumulation_steps
diff --git a/trl/extras/vllm_client.py b/trl/extras/vllm_client.py
index 5d76d422c16..5ddc6150a59 100644
--- a/trl/extras/vllm_client.py
+++ b/trl/extras/vllm_client.py
@@ -495,6 +495,9 @@ def close_communicator(self):
             if response.status_code != 200:
                 raise Exception(f"Request failed: {response.status_code}, {response.text}")
 
+        if self.communicator is not None:
+            self.communicator = None
+
 
 # Example usage
 if __name__ == "__main__":
diff --git a/trl/mergekit_utils.py b/trl/mergekit_utils.py
index fc9787b8f6b..d070a8dd923 100644
--- a/trl/mergekit_utils.py
+++ b/trl/mergekit_utils.py
@@ -15,7 +15,7 @@
 import torch
 from huggingface_hub import HfApi
 
-from trl.import_utils import is_mergekit_available
+from .import_utils import is_mergekit_available
 
 
 if is_mergekit_available():
diff --git a/trl/models/utils.py b/trl/models/utils.py
index 450ed0c6fe3..d15bcf4ae70 100644
--- a/trl/models/utils.py
+++ b/trl/models/utils.py
@@ -485,18 +485,10 @@ def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True, grad
     if gradient_checkpointing_kwargs is None:
         gradient_checkpointing_kwargs = {}
 
-    n_upcasted = 0
-    for name, param in model.named_parameters():
+    for _, param in model.named_parameters():
         # freeze all parameters
         param.requires_grad = False
 
-        # upcast LayerNorm / Norm to float32 for numerical stability
-        if (param.dtype in [torch.float16, torch.bfloat16]) and (
-            "norm" in name.lower() or "layernorm" in name.lower()
-        ):
-            param.data = param.data.to(torch.float32)
-            n_upcasted += 1
-
     # Enable gradient checkpointing if needed
     if (loaded_in_kbit or is_quantized) and use_gradient_checkpointing:
         if hasattr(model, "enable_input_require_grads"):
diff --git a/trl/rewards/accuracy_rewards.py b/trl/rewards/accuracy_rewards.py
index 1ae7d21426d..cb02ee83a9c 100644
--- a/trl/rewards/accuracy_rewards.py
+++ b/trl/rewards/accuracy_rewards.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from trl.import_utils import is_math_verify_available
+from ..import_utils import is_math_verify_available
 
 
 if is_math_verify_available():
@@ -54,23 +54,14 @@ def accuracy_reward(completions: list[list[dict[str, str]]], solution: list[str]
     contents = [completion[0]["content"] for completion in completions]
     rewards = []
     for content, sol in zip(contents, solution, strict=True):
-        gold_parsed = parse(
-            sol,
-            extraction_mode="first_match",
-        )
+        gold_parsed = parse(sol)
         if len(gold_parsed) != 0:
             # We require the answer to be provided in correct latex (no malformed operators)
             answer_parsed = parse(
                 content,
                 extraction_config=[
                     LatexExtractionConfig(
-                        normalization_config=NormalizationConfig(
-                            nits=False,
-                            malformed_operators=False,
-                            basic_latex=True,
-                            boxed="all",
-                            units=True,
-                        ),
+                        normalization_config=NormalizationConfig(units=True),
                         # Ensures that boxed is tried first
                         boxed_match_priority=0,
                         try_extract_without_anchor=False,
@@ -79,10 +70,7 @@ def accuracy_reward(completions: list[list[dict[str, str]]], solution: list[str]
                 extraction_mode="first_match",
             )
             # Compute binary rewards if verifiable, `None` otherwise to skip this example
-            try:
-                reward = float(verify(gold_parsed, answer_parsed))
-            except Exception:
-                reward = None
+            reward = float(verify(gold_parsed, answer_parsed))
         else:
             # If the gold solution is not parseable, we assign `None` to skip this example
             reward = float(content.strip().lower() == sol.strip().lower())
diff --git a/trl/scripts/grpo.py b/trl/scripts/grpo.py
index 7e46a3858fc..ece0ae55808 100644
--- a/trl/scripts/grpo.py
+++ b/trl/scripts/grpo.py
@@ -27,6 +27,7 @@
 import sys
 from dataclasses import dataclass, field
 
+import torch
 from accelerate import logging
 from datasets import load_dataset
 
@@ -38,7 +39,9 @@
     ScriptArguments,
     TrlParser,
     get_dataset,
+    get_kbit_device_map,
     get_peft_config,
+    get_quantization_config,
 )
 from trl.rewards import accuracy_reward, get_soft_overlong_punishment, think_format_reward
 
@@ -112,6 +115,21 @@ def main(script_args, training_args, model_args, dataset_args):
                     f"Could not load reward function '{func_name}'. Expected one of "
                     f"{list(reward_funcs_registry.keys())} or a valid import path."
                 )
+    dtype = model_args.dtype if model_args.dtype in ["auto", None] else getattr(torch, model_args.dtype)
+
+    model_kwargs = dict(
+        revision=model_args.model_revision,
+        attn_implementation=model_args.attn_implementation,
+        dtype=dtype,
+    )
+    quantization_config = get_quantization_config(model_args)
+
+    if quantization_config is not None:
+        # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
+        model_kwargs["device_map"] = get_kbit_device_map()
+        model_kwargs["quantization_config"] = quantization_config
+
+    training_args.model_init_kwargs = model_kwargs
 
     # Load the dataset
     if dataset_args.datasets and script_args.dataset_name:
diff --git a/trl/trainer/callbacks.py b/trl/trainer/callbacks.py
index 252f38ef31e..013117406ab 100644
--- a/trl/trainer/callbacks.py
+++ b/trl/trainer/callbacks.py
@@ -37,7 +37,6 @@
 from ..import_utils import is_mergekit_available, is_weave_available
 from ..mergekit_utils import MergeConfig, merge_models, upload_model_to_hf
 from ..models.utils import unwrap_model_for_generation
-from .judges import BasePairwiseJudge
 from .utils import get_config_model_id, log_table_to_comet_experiment
 
 
@@ -282,7 +281,7 @@ class WinRateCallback(TrainerCallback):
     ```
 
     Args:
-        judge ([`BasePairwiseJudge`]):
+        judge ([`experimental.judges.BasePairwiseJudge`]):
             The judge to use for comparing completions.
         trainer (`Trainer`):
             Trainer to which the callback will be attached. The trainer's evaluation dataset must include a `"prompt"`
@@ -303,7 +302,7 @@ class WinRateCallback(TrainerCallback):
 
     def __init__(
         self,
-        judge: BasePairwiseJudge,
+        judge,
         trainer: Trainer,
         generation_config: GenerationConfig | None = None,
         num_prompts: int | None = None,
diff --git a/trl/trainer/cpo_config.py b/trl/trainer/cpo_config.py
index 426b00da153..91df318221d 100644
--- a/trl/trainer/cpo_config.py
+++ b/trl/trainer/cpo_config.py
@@ -12,207 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass, field
-from typing import Any
+import warnings
+from dataclasses import dataclass
 
-from transformers import TrainingArguments
+from ..experimental.cpo import CPOConfig as _CPOConfig
 
 
 @dataclass
-class CPOConfig(TrainingArguments):
-    r"""
-    Configuration class for the [`CPOTrainer`].
-
-    This class includes only the parameters that are specific to CPO training. For a full list of training arguments,
-    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
-    differ from those in [`~transformers.TrainingArguments`].
-
-    Using [`~transformers.HfArgumentParser`] we can turn this class into
-    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
-    command line.
-
-    Parameters:
-        max_length (`int` or `None`, *optional*, defaults to `1024`):
-            Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
-            to use the default data collator.
-        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
-            Maximum length of the prompt. This argument is required if you want to use the default data collator.
-        max_completion_length (`int`, *optional*):
-            Maximum length of the completion. This argument is required if you want to use the default data collator
-            and your model is an encoder-decoder.
-        beta (`float`, *optional*, defaults to `0.1`):
-            Parameter controlling the deviation from the reference model. Higher β means less deviation from the
-            reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in
-            the [paper](https://huggingface.co/papers/2310.12036).
-        label_smoothing (`float`, *optional*, defaults to `0.0`):
-            Label smoothing factor. This argument is required if you want to use the default data collator.
-        loss_type (`str`, *optional*, defaults to `"sigmoid"`):
-            Type of loss to use. Possible values are:
-
-                - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
-                - `"hinge"`: hinge loss on the normalized likelihood from the
-                  [SLiC](https://huggingface.co/papers/2305.10425) paper.
-                - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.
-                - `"simpo"`: SimPO loss from the [SimPO](https://huggingface.co/papers/2405.14734) paper.
-                - `"alphapo"`: AlphaPO loss from the [AlphaPO](https://huggingface.co/papers/2501.03884) paper. This
-                  automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`.
-
-        disable_dropout (`bool`, *optional*, defaults to `True`):
-            Whether to disable dropout in the model.
-        cpo_alpha (`float`, *optional*, defaults to `1.0`):
-            Weight of the BC regularizer in CPO training.
-        simpo_gamma (`float`, *optional*, defaults to `0.5`):
-            Target reward margin for the SimPO loss, used only when the `loss_type="simpo"`.
-        alpha (`float`, *optional*, defaults to `0.0`):
-            Alpha parameter that controls reward function shape across all loss types. When alpha=0 (default), uses
-            standard log probability rewards. When `alpha != 0`, applies AlphaPO transformation: `r = (1 - p^(-alpha))
-            / alpha` from the [AlphaPO paper](https://huggingface.co/papers/2501.03884). This parameter works with all
-            loss types.
-        label_pad_token_id (`int`, *optional*, defaults to `-100`):
-            Label pad token id. This argument is required if you want to use the default data collator.
-        padding_value (`int`, *optional*):
-            Padding value to use. If `None`, the padding value of the tokenizer is used.
-        truncation_mode (`str`,*optional*,  defaults to `"keep_end"`):
-            Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
-            This argument is required if you want to use the default data collator.
-        generate_during_eval (`bool`, *optional*, defaults to `False`):
-            If `True`, generates and logs completions from the model to W&B or Comet during evaluation.
-        is_encoder_decoder (`bool`, *optional*):
-            When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
-            you need to specify if the model returned by the callable is an encoder-decoder model.
-        model_init_kwargs (`dict[str, Any]`, *optional*):
-            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
-            string.
-        dataset_num_proc (`int`, *optional*):
-            Number of processes to use for processing the dataset.
-    """
-
-    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
-
-    # Parameters whose default values are overridden from TrainingArguments
-    learning_rate: float = field(
-        default=1e-6,
-        metadata={"help": "The initial learning rate for AdamW."},
-    )
-    logging_steps: float = field(
-        default=10,
-        metadata={
-            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
-            "will be interpreted as ratio of total training steps."
-        },
-    )
-    gradient_checkpointing: bool = field(
-        default=True,
-        metadata={
-            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
-        },
-    )
-    bf16: bool | None = field(
-        default=None,
-        metadata={
-            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
-            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
-            "`fp16` is not set."
-        },
-    )
-
-    max_length: int | None = field(
-        default=1024,
-        metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
-    )
-    max_prompt_length: int | None = field(
-        default=512,
-        metadata={
-            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
-            "collator and your model is an encoder-decoder."
-        },
-    )
-    max_completion_length: int | None = field(
-        default=None,
-        metadata={
-            "help": "Maximum length of the completion. This argument is required if you want to use the default data "
-            "collator and your model is an encoder-decoder."
-        },
-    )
-    beta: float = field(
-        default=0.1,
-        metadata={
-            "help": "Parameter controlling the deviation from the reference model. Higher β means less deviation from "
-            "the reference model."
-        },
-    )
-    label_smoothing: float = field(
-        default=0.0,
-        metadata={"help": "Label smoothing factor."},
-    )
-    loss_type: str = field(
-        default="sigmoid",
-        metadata={
-            "help": "Type of loss to use.",
-            "choices": ["sigmoid", "hinge", "ipo", "simpo", "alphapo"],
-        },
-    )
-    disable_dropout: bool = field(
-        default=True,
-        metadata={"help": "Whether to disable dropout in the model."},
-    )
-    cpo_alpha: float = field(
-        default=1.0,
-        metadata={"help": "Weight of the BC regularizer in CPO training."},
-    )
-    simpo_gamma: float = field(
-        default=0.5,
-        metadata={"help": "Target reward margin for the SimPO loss, used only when the `loss_type='simpo'`."},
-    )
-    alpha: float = field(
-        default=0.0,
-        metadata={
-            "help": "Alpha parameter that controls reward function shape across all loss types. When alpha=0 "
-            "(default), uses standard log probability rewards. When `alpha != 0`, applies AlphaPO transformation: "
-            "`r = (1 - p^(-alpha)) / alpha` from the AlphaPO paper. This parameter works with all loss types."
-        },
-    )
-    label_pad_token_id: int = field(
-        default=-100,
-        metadata={"help": "Label pad token id."},
-    )
-    padding_value: int | None = field(
-        default=None,
-        metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."},
-    )
-    truncation_mode: str = field(
-        default="keep_end",
-        metadata={
-            "help": "Truncation mode to use when the prompt is too long.",
-            "choices": ["keep_end", "keep_start"],
-        },
-    )
-    generate_during_eval: bool = field(
-        default=False,
-        metadata={"help": "If `True`, generates and logs completions from the model to W&B during evaluation."},
-    )
-    is_encoder_decoder: bool | None = field(
-        default=None,
-        metadata={"help": "Whether the model is an encoder-decoder model."},
-    )
-    model_init_kwargs: dict[str, Any] | None = field(
-        default=None,
-        metadata={
-            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
-            "from a string."
-        },
-    )
-    dataset_num_proc: int | None = field(
-        default=None,
-        metadata={"help": "Number of processes to use for processing the dataset."},
-    )
-
+class CPOConfig(_CPOConfig):
     def __post_init__(self):
-        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
-
-        # Syntactic sugar for AlphaPO: set loss_type to "simpo" and cpo_alpha to 0.0
-        if self.loss_type == "alphapo":
-            self.loss_type = "simpo"
-            self.cpo_alpha = 0.0
-
+        warnings.warn(
+            "The `CPOConfig` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.cpo import CPOConfig`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
+        )
         super().__post_init__()
diff --git a/trl/trainer/cpo_trainer.py b/trl/trainer/cpo_trainer.py
index 565983a6a10..4a89a0aa157 100644
--- a/trl/trainer/cpo_trainer.py
+++ b/trl/trainer/cpo_trainer.py
@@ -12,1086 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import inspect
-import os
-import random
-import textwrap
 import warnings
-from collections import defaultdict
-from collections.abc import Callable
-from contextlib import nullcontext
-from pathlib import Path
-from typing import Any, Literal
+from dataclasses import dataclass
 
-import numpy as np
-import pandas as pd
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from accelerate import PartialState, logging
-from datasets import Dataset
-from torch import autocast
-from torch.utils.data import DataLoader
-from transformers import (
-    AutoModelForCausalLM,
-    BaseImageProcessor,
-    DataCollator,
-    FeatureExtractionMixin,
-    PreTrainedModel,
-    PreTrainedTokenizerBase,
-    ProcessorMixin,
-    is_comet_available,
-    is_wandb_available,
-)
-from transformers.trainer_callback import TrainerCallback
-from transformers.trainer_utils import EvalLoopOutput
-from transformers.utils import is_peft_available, is_torch_fx_proxy
+from ..experimental.cpo import CPOTrainer as _CPOTrainer
 
-from ..data_utils import maybe_apply_chat_template, maybe_extract_prompt
-from .base_trainer import BaseTrainer
-from .cpo_config import CPOConfig
-from .utils import (
-    DPODataCollatorWithPadding,
-    add_bos_token_if_needed,
-    add_eos_token_if_needed,
-    disable_dropout_in_model,
-    log_table_to_comet_experiment,
-    pad_to_length,
-    peft_module_casting_to_bf16,
-    selective_log_softmax,
-)
 
-
-if is_peft_available():
-    from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training
-
-
-if is_wandb_available():
-    import wandb
-
-
-logger = logging.get_logger(__name__)
-
-
-class CPOTrainer(BaseTrainer):
-    r"""
-    Initialize CPOTrainer.
-
-    Args:
-        model ([`~transformers.PreTrainedModel`]):
-            The model to train, preferably an [`~transformers.AutoModelForSequenceClassification`].
-        args ([`CPOConfig`]):
-            The CPO config arguments to use for training.
-        data_collator ([`~transformers.DataCollator`]):
-            The data collator to use for training. If None is specified, the default data collator
-            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
-            sequences in the batch, given a dataset of paired sequences.
-        train_dataset ([`~datasets.Dataset`]):
-            The dataset to use for training.
-        eval_dataset ([`~datasets.Dataset`]):
-            The dataset to use for evaluation.
-        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
-            Processing class used to process the data. If provided, will be used to automatically process the inputs
-            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
-            reuse the fine-tuned model.
-        model_init (`Callable[[], transformers.PreTrainedModel]`):
-            The model initializer to use for training. If None is specified, the default model initializer will be
-            used.
-        callbacks (`list[transformers.TrainerCallback]`):
-            The callbacks to use for training.
-        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
-            The optimizer and scheduler to use for training.
-        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
-            The function to use to preprocess the logits before computing the metrics.
-        peft_config (`dict`, defaults to `None`):
-            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
-            a PEFT model.
-        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
-            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
-            metric values.
-    """
-
-    _tag_names = ["trl", "cpo"]
-    _name = "CPO"
-    _paper = {
-        "title": "Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation",
-        "id": "2401.08417",
-        # docstyle-ignore
-        "citation": textwrap.dedent("""\
-            @inproceedings{xu2024contrastive,
-                title        = {{Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation}},
-                author       = {Haoran Xu and Amr Sharaf and Yunmo Chen and Weiting Tan and Lingfeng Shen and Benjamin Van Durme and Kenton Murray and Young Jin Kim},
-                year         = 2024,
-                booktitle    = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024},
-                publisher    = {OpenReview.net},
-                url          = {https://openreview.net/forum?id=51iwkioZpn}
-            }"""),
-    }
-
-    def __init__(
-        self,
-        model: PreTrainedModel | nn.Module | str | None = None,
-        args: CPOConfig | None = None,
-        data_collator: DataCollator | None = None,
-        train_dataset: Dataset | None = None,
-        eval_dataset: Dataset | dict[str, Dataset] | None = None,
-        processing_class: PreTrainedTokenizerBase
-        | BaseImageProcessor
-        | FeatureExtractionMixin
-        | ProcessorMixin
-        | None = None,
-        model_init: Callable[[], PreTrainedModel] | None = None,
-        callbacks: list[TrainerCallback] | None = None,
-        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
-        preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
-        peft_config: dict | None = None,
-        compute_metrics: Callable[[EvalLoopOutput], dict] | None = None,
-    ):
-        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
-            warnings.warn(
-                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
-                "it and want it to remain, please share your comments here: "
-                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
-                "TRL_EXPERIMENTAL_SILENCE=1."
-            )
-        if args.model_init_kwargs is None:
-            model_init_kwargs = {}
-        elif not isinstance(model, str):
-            raise ValueError("You passed model_kwargs to the CPOTrainer. But your model is already instantiated.")
-        else:
-            model_init_kwargs = args.model_init_kwargs
-            dtype = model_init_kwargs.get("dtype")
-            if dtype is not None:
-                # Convert to `torch.dtype` if an str is passed
-                if isinstance(dtype, str) and dtype != "auto":
-                    dtype = getattr(torch, dtype)
-                if dtype != "auto" and not isinstance(dtype, torch.dtype):
-                    raise ValueError(
-                        f"Invalid `dtype` passed to the CPOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
-                    )
-                model_init_kwargs["dtype"] = dtype
-
-        if isinstance(model, str):
-            model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
-
-        # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16`
-        # has been called in order to properly call autocast if needed.
-        self._peft_has_been_casted_to_bf16 = False
-
-        if not is_peft_available() and peft_config is not None:
-            raise ValueError(
-                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
-            )
-        elif is_peft_available() and peft_config is not None:
-            # if model is a peft model and we have a peft_config, we merge and unload it first
-            if isinstance(model, PeftModel):
-                model = model.merge_and_unload()
-
-            if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False):
-                _support_gc_kwargs = hasattr(
-                    args, "gradient_checkpointing_kwargs"
-                ) and "gradient_checkpointing_kwargs" in list(
-                    inspect.signature(prepare_model_for_kbit_training).parameters
-                )
-
-                prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing}
-
-                if _support_gc_kwargs:
-                    prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs
-
-                model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
-            elif args.gradient_checkpointing:
-                # For backward compatibility with older versions of transformers
-                if hasattr(model, "enable_input_require_grads"):
-                    model.enable_input_require_grads()
-                else:
-
-                    def make_inputs_require_grad(module, input, output):
-                        output.requires_grad_(True)
-
-                    model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
-
-            # get peft model with the given config
-            model = get_peft_model(model, peft_config)
-            if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
-                peft_module_casting_to_bf16(model)
-                # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager
-                self._peft_has_been_casted_to_bf16 = True
-
-        # For models that use gradient_checkpointing, we need to attach a hook that enables input
-        # to explicitly have `requires_grad=True`, otherwise training will either silently
-        # fail or completely fail.
-        elif args.gradient_checkpointing:
-            # For backward compatibility with older versions of transformers
-            if hasattr(model, "enable_input_require_grads"):
-                model.enable_input_require_grads()
-            else:
-
-                def make_inputs_require_grad(module, input, output):
-                    output.requires_grad_(True)
-
-                model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
-
-        if args.generate_during_eval and not (is_wandb_available() or is_comet_available()):
-            raise ValueError(
-                "`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
-                " Please install `wandb` or `comet-ml` to resolve."
-            )
-
-        if model is not None:
-            self.is_encoder_decoder = model.config.is_encoder_decoder
-        elif args.is_encoder_decoder is None:
-            raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.")
-        else:
-            self.is_encoder_decoder = args.is_encoder_decoder
-
-        if self.is_encoder_decoder:
-            self.decoder_start_token_id = model.config.decoder_start_token_id
-            self.pad_token_id = model.config.pad_token_id
-
-        if processing_class is None:
-            raise ValueError("processing_class must be specified to tokenize a CPO dataset.")
-        if args.max_length is None:
-            logger.warning(
-                "`max_length` is not set in the CPOConfig's init"
-                " it will default to `512` by default, but you should do it yourself in the future.",
-            )
-            max_length = 512
-        else:
-            max_length = args.max_length
-        if args.max_prompt_length is None:
-            logger.warning(
-                "`max_prompt_length` is not set in the CPOConfig's init"
-                " it will default to `128` by default, but you should do it yourself in the future.",
-            )
-            max_prompt_length = 128
-        else:
-            max_prompt_length = args.max_prompt_length
-
-        if not max_prompt_length < max_length:
-            raise ValueError(
-                f"max_prompt_length ({max_prompt_length}) should be strictly less than max_length ({max_length})."
-            )
-
-        if args.max_completion_length is None and self.is_encoder_decoder:
-            logger.warning(
-                "When using an encoder decoder architecture, you should set `max_completion_length` in the CPOConfig's init"
-                " it will default to `128` by default, but you should do it yourself in the future.",
-            )
-            max_completion_length = 128
-        else:
-            max_completion_length = args.max_completion_length
-
-        if data_collator is None:
-            data_collator = DPODataCollatorWithPadding(
-                pad_token_id=processing_class.pad_token_id,
-                label_pad_token_id=args.label_pad_token_id,
-                is_encoder_decoder=self.is_encoder_decoder,
-            )
-
-            if args.remove_unused_columns:
-                args.remove_unused_columns = False
-                # warn users
-                logger.warning(
-                    "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments"
-                    " we have set it for you, but you should do it yourself in the future.",
-                )
-
-            self.use_dpo_data_collator = True
-        else:
-            self.use_dpo_data_collator = False
-
-        # Disable dropout in the model
-        if args.disable_dropout:
-            disable_dropout_in_model(model)
-
-        self.max_length = max_length
-        self.generate_during_eval = args.generate_during_eval
-        self.label_pad_token_id = args.label_pad_token_id
-        self.padding_value = args.padding_value if args.padding_value is not None else processing_class.pad_token_id
-        self.max_prompt_length = max_prompt_length
-        self.truncation_mode = args.truncation_mode
-        self.max_completion_length = max_completion_length
-        self.processing_class = processing_class
-
-        if args.loss_type in ["hinge", "ipo"] and args.label_smoothing > 0:
-            logger.warning(
-                f"You are using the {args.loss_type} loss type that does not support label smoothing. The "
-                "`label_smoothing` parameter will be ignored. Set `label_smoothing` to `0.0` to remove this warning.",
-            )
-        if args.loss_type == "kto_pair":
-            raise ValueError("Support for kto_pair has been removed in CPOTrainer. Please use KTOTrainer.")
-
-        self.beta = args.beta
-        self.label_smoothing = args.label_smoothing
-        self.loss_type = args.loss_type
-        self.cpo_alpha = args.cpo_alpha
-        self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
-        self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0)
-        if self.aux_loss_enabled and self.aux_loss_coef == 0.0:
-            logger.warning(
-                "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to "
-                "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value "
-                "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary "
-                "loss.",
-            )
-
-        if args.loss_type == "simpo":
-            self.simpo_gamma = args.simpo_gamma
-
-        # AlphaPO parameter for reward shaping
-        self.alpha = args.alpha
-
-        self._stored_metrics = defaultdict(lambda: defaultdict(list))
-
-        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
-        # input tensor associated with the key "input_ids". However, in CPO, the sampled data does not include the
-        # "input_ids" key. Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and
-        # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens
-        # of the input, floating-point operations will not be computed." To suppress this warning, we set the
-        # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate
-        # that the warning has already been issued.
-        model.warnings_issued["estimate_tokens"] = True
-
-        # Compute that only on the main process for faster data processing.
-        # see: https://github.com/huggingface/trl/pull/1255
-        with PartialState().main_process_first():
-            # Extract the prompt if needed, and apply the chat template if needed
-            train_dataset = train_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc)
-            train_dataset = train_dataset.map(
-                maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}, num_proc=args.dataset_num_proc
-            )
-            if eval_dataset is not None:
-                eval_dataset = eval_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc)
-                eval_dataset = eval_dataset.map(
-                    maybe_apply_chat_template,
-                    fn_kwargs={"tokenizer": processing_class},
-                    num_proc=args.dataset_num_proc,
-                )
-
-            # tokenize the dataset
-            train_dataset = train_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc)
-            if eval_dataset is not None:
-                eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc)
-
-        super().__init__(
-            model=model,
-            args=args,
-            data_collator=data_collator,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            processing_class=processing_class,
-            model_init=model_init,
-            compute_metrics=compute_metrics,
-            callbacks=callbacks,
-            optimizers=optimizers,
-            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+@dataclass
+class CPOTrainer(_CPOTrainer):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `CPOTrainer` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.cpo import CPOTrainer`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
         )
-
-        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
-        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
-        # self.model_accepts_loss_kwargs to False to enable scaling.
-        self.model_accepts_loss_kwargs = False
-
-        # Add tags for models that have been loaded with the correct transformers version
-        if hasattr(self.model, "add_model_tags"):
-            self.model.add_model_tags(self._tag_names)
-
-        if not hasattr(self, "accelerator"):
-            raise AttributeError(
-                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
-            )
-
-    def build_tokenized_answer(self, prompt, answer):
-        """
-        Llama tokenizer does satisfy `enc(a + b) = enc(a) + enc(b)`. It does ensure `enc(a + b) = enc(a) + enc(a +
-        b)[len(enc(a)):]`. Reference:
-            https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
-        """
-
-        full_tokenized = self.processing_class(prompt + answer, add_special_tokens=False)
-        prompt_input_ids = self.processing_class(prompt, add_special_tokens=False)["input_ids"]
-
-        answer_input_ids = full_tokenized["input_ids"][len(prompt_input_ids) :]
-        answer_attention_mask = full_tokenized["attention_mask"][len(prompt_input_ids) :]
-
-        # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]`
-        full_concat_input_ids = np.concatenate([prompt_input_ids, answer_input_ids])
-
-        # Prepare input tokens for token by token comparison
-        full_input_ids = np.array(full_tokenized["input_ids"])
-
-        if len(full_input_ids) != len(full_concat_input_ids):
-            raise ValueError("Prompt input ids and answer input ids should have the same length.")
-
-        # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens
-        # can be merged together when tokenizing prompt+answer. This could result
-        # on the last token from the prompt being different when tokenized on its own
-        # vs when done as prompt+answer.
-        response_token_ids_start_idx = len(prompt_input_ids)
-
-        # If tokenized prompt is different than both prompt+answer, then it means the
-        # last token has changed due to merging.
-        if prompt_input_ids != full_tokenized["input_ids"][:response_token_ids_start_idx]:
-            response_token_ids_start_idx -= 1
-
-        prompt_input_ids = full_tokenized["input_ids"][:response_token_ids_start_idx]
-        prompt_attention_mask = full_tokenized["attention_mask"][:response_token_ids_start_idx]
-
-        if len(prompt_input_ids) != len(prompt_attention_mask):
-            raise ValueError("Prompt input ids and attention mask should have the same length.")
-
-        answer_input_ids = full_tokenized["input_ids"][response_token_ids_start_idx:]
-        answer_attention_mask = full_tokenized["attention_mask"][response_token_ids_start_idx:]
-
-        return dict(
-            prompt_input_ids=prompt_input_ids,
-            prompt_attention_mask=prompt_attention_mask,
-            input_ids=answer_input_ids,
-            attention_mask=answer_attention_mask,
-        )
-
-    def tokenize_row(self, feature, model: PreTrainedModel | nn.Module | None = None) -> dict:
-        """Tokenize a single row from a CPO specific dataset.
-
-        At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
-        chosen or prompt + rejected responses is/are too long. First we truncate the prompt; if we're still too long,
-        we truncate the chosen/rejected.
-
-        We also create the labels for the chosen/rejected responses, which are of length equal to the sum of the length
-        of the prompt and the chosen/rejected response, with label_pad_token_id for the prompt tokens.
-        """
-        batch = {}
-        prompt = feature["prompt"]
-        chosen = feature["chosen"]
-        rejected = feature["rejected"]
-
-        if not self.is_encoder_decoder:
-            # Check issues below for more details
-            #  1. https://github.com/huggingface/trl/issues/907
-            #  2. https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
-            #  3. https://github.com/LianjiaTech/BELLE/issues/337
-
-            if not isinstance(prompt, str):
-                raise ValueError(f"prompt should be an str but got {type(prompt)}")
-            prompt_tokens = self.processing_class(prompt, add_special_tokens=False)
-            prompt_tokens = {f"prompt_{k}": v for k, v in prompt_tokens.items()}
-
-            if not isinstance(chosen, str):
-                raise ValueError(f"chosen should be an str but got {type(chosen)}")
-            chosen_tokens = self.build_tokenized_answer(prompt, chosen)
-
-            if not isinstance(rejected, str):
-                raise ValueError(f"rejected should be an str but got {type(rejected)}")
-            rejected_tokens = self.build_tokenized_answer(prompt, rejected)
-
-            # Last prompt token might get merged by tokenizer and
-            # it should not be included for generation if that happens
-            prompt_len_input_ids = len(prompt_tokens["prompt_input_ids"])
-
-            chosen_prompt_len_input_ids = len(chosen_tokens["prompt_input_ids"])
-            rejected_prompt_len_input_ids = len(rejected_tokens["prompt_input_ids"])
-            prompt_len_input_ids = min(chosen_prompt_len_input_ids, rejected_prompt_len_input_ids)
-
-            for k, v in prompt_tokens.items():
-                prompt_tokens[k] = v[:prompt_len_input_ids]
-
-            # Make sure prompts only have one different token at most an
-            # and length only differs by 1 at most
-            num_diff_tokens = sum(
-                a != b
-                for a, b in zip(chosen_tokens["prompt_input_ids"], rejected_tokens["prompt_input_ids"], strict=True)
-            )
-            num_diff_len = abs(chosen_prompt_len_input_ids - rejected_prompt_len_input_ids)
-            if num_diff_tokens > 1 or num_diff_len > 1:
-                raise ValueError(
-                    "Chosen and rejected prompt_input_ids might only differ on the "
-                    "last token due to tokenizer merge ops."
-                )
-
-            # add BOS token to head of prompt. Avoid adding if it's already there
-            prompt_tokens, chosen_tokens, rejected_tokens = add_bos_token_if_needed(
-                self.processing_class.bos_token_id,
-                prompt_len_input_ids,
-                prompt_tokens,
-                chosen_prompt_len_input_ids,
-                chosen_tokens,
-                rejected_prompt_len_input_ids,
-                rejected_tokens,
-            )
-
-            # add EOS token to end of answer. Avoid adding if it's already there
-            chosen_tokens, rejected_tokens = add_eos_token_if_needed(
-                self.processing_class.eos_token_id, chosen_tokens, rejected_tokens
-            )
-
-            longer_response_length = max(len(chosen_tokens["input_ids"]), len(rejected_tokens["input_ids"]))
-
-            # if combined sequence is too long, truncate the prompt
-            for answer_tokens in [chosen_tokens, rejected_tokens, prompt_tokens]:
-                if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length:
-                    if self.truncation_mode == "keep_start":
-                        for k in ["prompt_input_ids", "prompt_attention_mask"]:
-                            answer_tokens[k] = answer_tokens[k][: self.max_prompt_length]
-                    elif self.truncation_mode == "keep_end":
-                        for k in ["prompt_input_ids", "prompt_attention_mask"]:
-                            answer_tokens[k] = answer_tokens[k][-self.max_prompt_length :]
-                    else:
-                        raise ValueError(f"Unknown truncation mode: {self.truncation_mode}")
-
-            # if that's still too long, truncate the response
-            for answer_tokens in [chosen_tokens, rejected_tokens]:
-                if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length:
-                    for k in ["input_ids", "attention_mask"]:
-                        answer_tokens[k] = answer_tokens[k][: self.max_length - self.max_prompt_length]
-
-            # Create labels
-            chosen_sequence_tokens = {
-                k: chosen_tokens[f"prompt_{k}"] + chosen_tokens[k] for k in ["input_ids", "attention_mask"]
-            }
-            rejected_sequence_tokens = {
-                k: rejected_tokens[f"prompt_{k}"] + rejected_tokens[k] for k in ["input_ids", "attention_mask"]
-            }
-            chosen_sequence_tokens["labels"] = chosen_sequence_tokens["input_ids"][:]
-            chosen_sequence_tokens["labels"][: len(chosen_tokens["prompt_input_ids"])] = [
-                self.label_pad_token_id
-            ] * len(chosen_tokens["prompt_input_ids"])
-            rejected_sequence_tokens["labels"] = rejected_sequence_tokens["input_ids"][:]
-            rejected_sequence_tokens["labels"][: len(rejected_tokens["prompt_input_ids"])] = [
-                self.label_pad_token_id
-            ] * len(rejected_tokens["prompt_input_ids"])
-
-            for k, toks in {
-                "chosen_": chosen_sequence_tokens,
-                "rejected_": rejected_sequence_tokens,
-                "": prompt_tokens,
-            }.items():
-                for type_key, tokens in toks.items():
-                    if type_key == "token_type_ids":
-                        continue
-                    batch[f"{k}{type_key}"] = tokens
-
-        else:
-            chosen_tokens = self.processing_class(
-                chosen, truncation=True, max_length=self.max_completion_length, add_special_tokens=True
-            )
-            rejected_tokens = self.processing_class(
-                rejected, truncation=True, max_length=self.max_completion_length, add_special_tokens=True
-            )
-            prompt_tokens = self.processing_class(
-                prompt, truncation=True, max_length=self.max_prompt_length, add_special_tokens=True
-            )
-
-            batch["chosen_labels"] = chosen_tokens["input_ids"]
-            batch["rejected_labels"] = rejected_tokens["input_ids"]
-            batch["prompt_input_ids"] = prompt_tokens["input_ids"]
-            batch["prompt_attention_mask"] = prompt_tokens["attention_mask"]
-
-            if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"):
-                batch["rejected_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
-                    labels=torch.tensor(batch["rejected_labels"])
-                )
-                batch["chosen_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
-                    labels=torch.tensor(batch["chosen_labels"])
-                )
-
-        return batch
-
-    @staticmethod
-    def concatenated_inputs(
-        batch: dict[str, list | torch.LongTensor],
-        is_encoder_decoder: bool = False,
-        label_pad_token_id: int = -100,
-        padding_value: int = 0,
-        device: torch.device | None = None,
-    ) -> dict[str, torch.LongTensor]:
-        """Concatenate the chosen and rejected inputs into a single tensor.
-
-        Args:
-            batch:
-                A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors
-                of shape (batch_size, sequence_length).
-            is_encoder_decoder:
-                Whether the model is an encoder-decoder model.
-            label_pad_token_id:
-                The label pad token id.
-            padding_value:
-                The padding value to use for the concatenated inputs_ids.
-            device:
-                The device for the concatenated inputs.
-
-        Returns:
-            A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'.
-        """
-        concatenated_batch = {}
-
-        if is_encoder_decoder:
-            max_length = max(batch["chosen_labels"].shape[1], batch["rejected_labels"].shape[1])
-        else:
-            max_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1])
-
-        for k in batch:
-            if k.startswith("chosen") and isinstance(batch[k], torch.Tensor):
-                if "labels" in k or is_encoder_decoder:
-                    pad_value = label_pad_token_id
-                elif k.endswith("_input_ids"):
-                    pad_value = padding_value
-                elif k.endswith("_attention_mask"):
-                    pad_value = 0
-                concatenated_key = k.replace("chosen", "concatenated")
-                concatenated_batch[concatenated_key] = pad_to_length(batch[k], max_length, pad_value=pad_value)
-        for k in batch:
-            if k.startswith("rejected") and isinstance(batch[k], torch.Tensor):
-                if "labels" in k or is_encoder_decoder:
-                    pad_value = label_pad_token_id
-                elif k.endswith("_input_ids"):
-                    pad_value = padding_value
-                elif k.endswith("_attention_mask"):
-                    pad_value = 0
-                concatenated_key = k.replace("rejected", "concatenated")
-                concatenated_batch[concatenated_key] = torch.cat(
-                    (
-                        concatenated_batch[concatenated_key],
-                        pad_to_length(batch[k], max_length, pad_value=pad_value),
-                    ),
-                    dim=0,
-                ).to(device=device)
-
-        if is_encoder_decoder:
-            concatenated_batch["concatenated_input_ids"] = batch["prompt_input_ids"].repeat(2, 1).to(device=device)
-            concatenated_batch["concatenated_attention_mask"] = (
-                batch["prompt_attention_mask"].repeat(2, 1).to(device=device)
-            )
-
-        return concatenated_batch
-
-    def cpo_loss(
-        self,
-        policy_chosen_logps: torch.FloatTensor,
-        policy_rejected_logps: torch.FloatTensor,
-    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
-        """Compute the CPO loss for a batch of policy and reference model log probabilities.
-
-        Args:
-            policy_chosen_logps:
-                Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
-            policy_rejected_logps:
-                Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
-
-        Returns:
-            A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). The losses tensor contains the CPO
-            loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for
-            the chosen and rejected responses, respectively.
-        """
-        # Apply AlphaPO reward transformation if alpha != 0
-        if self.alpha != 0.0:
-            # Compute probabilities
-            chosen_probs = torch.exp(policy_chosen_logps)
-            rejected_probs = torch.exp(policy_rejected_logps)
-
-            # Apply AlphaPO transformation: r = (1 - p^(-alpha)) / alpha
-            policy_chosen_rewards = (1 - chosen_probs.pow(-self.alpha)) / self.alpha
-            policy_rejected_rewards = (1 - rejected_probs.pow(-self.alpha)) / self.alpha
-
-            logits = (policy_chosen_rewards - policy_rejected_rewards).to(self.accelerator.device)
-        else:
-            # Standard log probability rewards when alpha = 0
-            logits = (policy_chosen_logps - policy_rejected_logps).to(self.accelerator.device)
-
-        # The beta is a temperature parameter for the CPO loss, typically something in the range of 0.1 to 0.5.
-        # We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the labels and
-        # calculates a conservative CPO loss.
-
-        if self.loss_type == "simpo":
-            gamma_logratios = self.simpo_gamma / self.beta
-            logits = logits - gamma_logratios
-            # This reduces to Equation 3 from the CPO paper when label_smoothing -> 0.
-            losses = (
-                -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
-                - F.logsigmoid(-self.beta * logits) * self.label_smoothing
-            )
-        elif self.loss_type == "sigmoid":
-            # This reduces to Equation 3 from the CPO paper when label_smoothing -> 0.
-            losses = (
-                -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
-                - F.logsigmoid(-self.beta * logits) * self.label_smoothing
-            )
-        elif self.loss_type == "hinge":
-            losses = torch.relu(1 - self.beta * logits)
-        elif self.loss_type == "ipo":
-            # eqn (17) of the paper where beta is the regularization parameter for the IPO loss, denoted by tau in the paper.
-            losses = (logits - 1 / (2 * self.beta)) ** 2
-        else:
-            raise ValueError(
-                f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'simpo']"
-            )
-
-        # Calculate rewards for logging
-        if self.alpha != 0.0:
-            # When using AlphaPO transformation, use the transformed rewards
-            chosen_rewards = self.beta * policy_chosen_rewards.to(self.accelerator.device).detach()
-            rejected_rewards = self.beta * policy_rejected_rewards.to(self.accelerator.device).detach()
-        else:
-            # Standard log probability rewards
-            chosen_rewards = self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach()
-            rejected_rewards = self.beta * (policy_rejected_logps.to(self.accelerator.device)).detach()
-
-        return losses, chosen_rewards, rejected_rewards
-
-    @staticmethod
-    def get_batch_logps(
-        logits: torch.FloatTensor,
-        labels: torch.LongTensor,
-        average_log_prob: bool = False,
-        label_pad_token_id: int = -100,
-        is_encoder_decoder: bool = False,
-    ) -> torch.FloatTensor:
-        """Compute the log probabilities of the given labels under the given logits.
-
-        Args:
-            logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
-            labels:
-                Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are
-                ignored. Shape: (batch_size, sequence_length)
-            average_log_prob:
-                If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the
-                log probabilities of the (non-masked) tokens.
-            label_pad_token_id: The label pad token id.
-            is_encoder_decoder: Whether the model is an encoder-decoder model.
-
-        Returns:
-            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the
-            given logits.
-        """
-        if logits.shape[:-1] != labels.shape:
-            raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.")
-
-        if not is_encoder_decoder:
-            labels = labels[:, 1:].clone()
-            logits = logits[:, :-1, :]
-        loss_mask = labels != label_pad_token_id
-
-        # dummy token; we'll ignore the losses on these tokens later
-        labels[labels == label_pad_token_id] = 0
-
-        per_token_logps = selective_log_softmax(logits, labels)
-
-        if average_log_prob:
-            return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
-        else:
-            return (per_token_logps * loss_mask).sum(-1)
-
-    def concatenated_forward(
-        self, model: nn.Module, batch: dict[str, list | torch.LongTensor]
-    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
-        """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
-
-        We do this to avoid doing two forward passes, because it's faster for FSDP.
-        """
-        concatenated_batch = self.concatenated_inputs(
-            batch,
-            is_encoder_decoder=self.is_encoder_decoder,
-            label_pad_token_id=self.label_pad_token_id,
-            padding_value=self.padding_value,
-            device=self.accelerator.device,
-        )
-        len_chosen = batch["chosen_labels"].shape[0]
-
-        model_kwargs = (
-            {
-                "decoder_input_ids": self._shift_right(concatenated_batch["concatenated_labels"]),
-            }
-            if self.is_encoder_decoder
-            else {}
-        )
-
-        if self.aux_loss_enabled:
-            model_kwargs["output_router_logits"] = True
-
-        outputs = model(
-            concatenated_batch["concatenated_input_ids"],
-            attention_mask=concatenated_batch["concatenated_attention_mask"],
-            use_cache=False,
-            **model_kwargs,
-        )
-        all_logits = outputs.logits
-
-        def cross_entropy_loss(logits, labels):
-            if not self.is_encoder_decoder:
-                # Shift so that tokens < n predict n
-                logits = logits[..., :-1, :].contiguous()
-                labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = nn.CrossEntropyLoss()
-            logits = logits.view(-1, logits.shape[-1])
-            labels = labels.view(-1)
-            # Enable model parallelism
-            labels = labels.to(logits.device)
-            loss = loss_fct(logits, labels)
-            return loss
-
-        labels = concatenated_batch["concatenated_labels"].clone()
-
-        if self.cpo_alpha == 0:
-            nll_loss = torch.tensor(0.0).to(self.accelerator.device)
-        else:
-            nll_loss = cross_entropy_loss(all_logits[:len_chosen], labels[:len_chosen])
-
-        all_logps = self.get_batch_logps(
-            all_logits,
-            concatenated_batch["concatenated_labels"],
-            average_log_prob=self.loss_type in ["ipo", "simpo"],
-            is_encoder_decoder=self.is_encoder_decoder,
-            label_pad_token_id=self.label_pad_token_id,
-        )
-
-        chosen_logps = all_logps[:len_chosen]
-        rejected_logps = all_logps[len_chosen:]
-
-        chosen_logits = all_logits[:len_chosen]
-        rejected_logits = all_logits[len_chosen:]
-
-        if self.aux_loss_enabled:
-            return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, nll_loss, outputs.aux_loss)
-
-        return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, nll_loss)
-
-    def get_batch_loss_metrics(
-        self,
-        model,
-        batch: dict[str, list | torch.LongTensor],
-        train_eval: Literal["train", "eval"] = "train",
-    ):
-        """Compute the CPO loss and other metrics for the given batch of inputs for train or test."""
-        metrics = {}
-
-        forward_output = self.concatenated_forward(model, batch)
-        (
-            policy_chosen_logps,
-            policy_rejected_logps,
-            policy_chosen_logits,
-            policy_rejected_logits,
-            policy_nll_loss,
-        ) = forward_output[:5]
-        if self.aux_loss_enabled:
-            aux_loss = forward_output[5]
-
-        losses, chosen_rewards, rejected_rewards = self.cpo_loss(
-            policy_chosen_logps,
-            policy_rejected_logps,
-        )
-
-        loss = losses.mean() + self.cpo_alpha * policy_nll_loss
-        reward_accuracies = (chosen_rewards > rejected_rewards).float()
-
-        prefix = "eval_" if train_eval == "eval" else ""
-        metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(chosen_rewards).mean().item()
-        metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(rejected_rewards).mean().item()
-        metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics(reward_accuracies).mean().item()
-        metrics[f"{prefix}rewards/margins"] = (
-            self.accelerator.gather_for_metrics(chosen_rewards - rejected_rewards).mean().item()
-        )
-        metrics[f"{prefix}logps/rejected"] = (
-            self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean().item()
-        )
-        metrics[f"{prefix}logps/chosen"] = (
-            self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean().item()
-        )
-        metrics[f"{prefix}logits/rejected"] = (
-            self.accelerator.gather_for_metrics(policy_rejected_logits.detach().mean()).mean().item()
-        )
-        metrics[f"{prefix}logits/chosen"] = (
-            self.accelerator.gather_for_metrics(policy_chosen_logits.detach().mean()).mean().item()
-        )
-        metrics[f"{prefix}nll_loss"] = self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean().item()
-
-        if self.aux_loss_enabled:
-            loss += self.aux_loss_coef * aux_loss
-
-        return loss, metrics
-
-    def compute_loss(
-        self,
-        model: PreTrainedModel | nn.Module,
-        inputs: dict[str, torch.Tensor | Any],
-        return_outputs=False,
-        num_items_in_batch=None,
-    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
-        compute_loss_context_manager = (
-            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
-        )
-
-        with compute_loss_context_manager:
-            loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
-
-        # force log the metrics
-        self.store_metrics(metrics, train_eval="train")
-
-        if return_outputs:
-            return (loss, metrics)
-        return loss
-
-    def generate_from_model(self, model, batch: dict[str, torch.LongTensor]) -> str:
-        """Generate samples from the model and reference model for the given batch of inputs."""
-
-        # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
-        # the torch amp context manager as some hidden states are silently casted to full precision.
-        generate_context_manager = (
-            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
-        )
-
-        with generate_context_manager:
-            policy_output = model.generate(
-                input_ids=batch["prompt_input_ids"],
-                attention_mask=batch["prompt_attention_mask"],
-                max_length=self.max_length,
-                do_sample=True,
-                pad_token_id=self.processing_class.pad_token_id,
-            )
-
-        policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id)
-        policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True)
-
-        return policy_output_decoded
-
-    def prediction_step(
-        self,
-        model: PreTrainedModel | nn.Module,
-        inputs: dict[str, torch.Tensor | Any],
-        prediction_loss_only: bool,
-        ignore_keys: list[str] | None = None,
-    ):
-        if ignore_keys is None:
-            if hasattr(model, "config"):
-                ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
-            else:
-                ignore_keys = []
-
-        prediction_context_manager = (
-            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
-        )
-
-        with torch.no_grad(), prediction_context_manager:
-            loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="eval")
-
-        # force log the metrics
-        self.store_metrics(metrics, train_eval="eval")
-
-        if prediction_loss_only:
-            return (loss.detach(), None, None)
-
-        # logits for the chosen and rejected samples from model
-        logits_dict = {
-            "eval_logits/chosen": metrics["eval_logits/chosen"],
-            "eval_logits/rejected": metrics["eval_logits/rejected"],
-        }
-        logits = [v for k, v in logits_dict.items() if k not in ignore_keys]
-        logits = torch.tensor(logits, device=self.accelerator.device)
-        labels = torch.zeros(logits.shape[0], device=self.accelerator.device)
-
-        return (loss.detach(), logits, labels)
-
-    def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
-        for key, value in metrics.items():
-            self._stored_metrics[train_eval][key].append(value)
-
-    def evaluation_loop(
-        self,
-        dataloader: DataLoader,
-        description: str,
-        prediction_loss_only: bool | None = None,
-        ignore_keys: list[str] | None = None,
-        metric_key_prefix: str = "eval",
-    ) -> EvalLoopOutput:
-        """
-        Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
-        `Trainer.evaluate()` and `Trainer.predict()`.
-
-        Works both with or without labels.
-        """
-
-        # Sample and save to game log if requested (for one batch to save time)
-        if self.generate_during_eval:
-            # Generate random indices within the range of the total number of samples
-            num_samples = len(dataloader.dataset)
-            random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size)
-
-            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
-            random_batch_dataset = dataloader.dataset.select(random_indices)
-            random_batch = self.data_collator(random_batch_dataset)
-            random_batch = self._prepare_inputs(random_batch)
-
-            policy_output_decoded = self.generate_from_model(self.model, random_batch)
-
-            table = pd.DataFrame(
-                columns=["Prompt", "Policy"],
-                data=[
-                    [prompt, pol[len(prompt) :]]
-                    for prompt, pol in zip(random_batch["prompt"], policy_output_decoded, strict=True)
-                ],
-            )
-            if "wandb" in self.args.report_to:
-                wandb.log({"game_log": wandb.Table(data=table)})
-
-            if "comet_ml" in self.args.report_to:
-                log_table_to_comet_experiment(
-                    name="game_log.csv",
-                    table=table,
-                )
-
-        # Base evaluation
-        initial_output = super().evaluation_loop(
-            dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix
-        )
-
-        return initial_output
-
-    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
-        """
-        Log `logs` on the various objects watching training, including stored metrics.
-
-        Args:
-            logs (`dict[str, float]`):
-                The values to log.
-            start_time (`float`, *optional*):
-                Start time of the training.
-        """
-        # logs either has 'loss' or 'eval_loss'
-        train_eval = "train" if "loss" in logs else "eval"
-        # Add averaged stored metrics to logs
-        for key, metrics in self._stored_metrics[train_eval].items():
-            logs[key] = torch.tensor(metrics).mean().item()
-        del self._stored_metrics[train_eval]
-        return super().log(logs, start_time)
-
-    def _shift_right(self, input_ids):
-        if self.decoder_start_token_id is None:
-            raise ValueError(
-                "model.config.decoder_start_token_id has to be defined. It is usually set to the pad_token_id."
-            )
-
-        # shift inputs to the right
-        if is_torch_fx_proxy(input_ids):
-            # Item assignment is not supported natively for proxies.
-            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), self.decoder_start_token_id)
-            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
-        else:
-            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
-            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
-            shifted_input_ids[..., 0] = self.decoder_start_token_id
-
-        if self.pad_token_id is None:
-            raise ValueError("model.config.pad_token_id has to be defined.")
-        # replace possible -100 values in labels by `pad_token_id`
-        shifted_input_ids.masked_fill_(shifted_input_ids == -100, self.pad_token_id)
-
-        return shifted_input_ids
-
-    # Ensure the model card is saved along with the checkpoint
-    def _save_checkpoint(self, model, trial):
-        if self.args.hub_model_id is None:
-            model_name = Path(self.args.output_dir).name
-        else:
-            model_name = self.args.hub_model_id.split("/")[-1]
-        self.create_model_card(model_name=model_name)
-        super()._save_checkpoint(model, trial)
+        super().__init__(*args, **kwargs)
diff --git a/trl/trainer/dpo_config.py b/trl/trainer/dpo_config.py
index 887cd95646f..9c2fa288f2e 100644
--- a/trl/trainer/dpo_config.py
+++ b/trl/trainer/dpo_config.py
@@ -262,6 +262,16 @@ class DPOConfig(TrainingArguments):
             "`fp16` is not set."
         },
     )
+    # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
+    # was fixed in https://github.com/huggingface/transformers/pull/41322, but the fix has not yet been released. We
+    # add a temporary workaround here, which can be removed once the fix is available—likely in Transformers 4.57.2.
+    lr_scheduler_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
+            "restarts."
+        },
+    )
 
     # Parameters that control the model and reference model
     model_init_kwargs: dict[str, Any] | None = field(
diff --git a/trl/trainer/gkd_config.py b/trl/trainer/gkd_config.py
index f5a047945fb..8c7a3a46660 100644
--- a/trl/trainer/gkd_config.py
+++ b/trl/trainer/gkd_config.py
@@ -12,101 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass, field
-from typing import Any
+import warnings
+from dataclasses import dataclass
 
-from transformers import TrainingArguments
-
-from .sft_config import SFTConfig
+from ..experimental.gkd import GKDConfig as _GKDConfig
 
 
 @dataclass
-class GKDConfig(SFTConfig):
-    """
-    Configuration class for [`GKDTrainer`].
-
-    This class includes only the parameters that are specific to GKD training. For a full list of training arguments,
-    please refer to the [`~transformers.TrainingArguments`] and [`SFTConfig`] documentation.
-
-    Args:
-        temperature (`float`, *optional*, defaults to `0.9`):
-            Temperature for sampling. The higher the temperature, the more random the completions.
-        lmbda (`float`, *optional*, defaults to `0.5`):
-            Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy
-            student-generated outputs).
-        beta (`float`, *optional*, defaults to `0.5`):
-            Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence loss. When
-            beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL Divergence.
-        max_new_tokens (`int`, *optional*, defaults to `128`):
-            Maximum number of tokens to generate per completion.
-        teacher_model_name_or_path (`str`, *optional*):
-            Model name or path of the teacher model. If `None`, the teacher model will be the same as the model being
-            trained.
-        teacher_model_init_kwargs (`dict[str, Any]]`, *optional*):
-            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
-            from a string.
-        disable_dropout (`bool`, *optional*, defaults to `True`):
-            Whether to disable dropout in the model.
-        seq_kd (`bool`, *optional*, defaults to `False`):
-            Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised FT on
-            teacher-generated output).
-    """
-
-    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["teacher_model_init_kwargs"]
-
-    temperature: float = field(
-        default=0.9,
-        metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."},
-    )
-    lmbda: float = field(
-        default=0.5,
-        metadata={
-            "help": "Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy "
-            "student-generated outputs)."
-        },
-    )
-    beta: float = field(
-        default=0.5,
-        metadata={
-            "help": "Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence "
-            "loss. When beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL "
-            "Divergence."
-        },
-    )
-    max_new_tokens: int = field(
-        default=128,
-        metadata={"help": "Maximum number of tokens to generate per completion."},
-    )
-    teacher_model_name_or_path: str | None = field(
-        default=None,
-        metadata={
-            "help": "Model name or path of the teacher model. If `None`, the teacher model will be the same as the "
-            "model being trained."
-        },
-    )
-    teacher_model_init_kwargs: dict[str, Any] | None = field(
-        default=None,
-        metadata={
-            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
-            "teacher model from a string."
-        },
-    )
-    disable_dropout: bool = field(
-        default=True,
-        metadata={"help": "Whether to disable dropouts in `model`."},
-    )
-    seq_kd: bool = field(
-        default=False,
-        metadata={
-            "help": "Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised "
-            "FT on teacher-generated output)."
-        },
-    )
-
+class GKDConfig(_GKDConfig):
     def __post_init__(self):
+        warnings.warn(
+            "The `GKDConfig` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.gkd import GKDConfig`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
+        )
         super().__post_init__()
-        # check lmbda and beta are in the range [0, 1]
-        if self.lmbda < 0.0 or self.lmbda > 1.0:
-            raise ValueError("lmbda must be in the range [0.0, 1.0].")
-        if self.beta < 0.0 or self.beta > 1.0:
-            raise ValueError("beta must be in the range [0.0, 1.0].")
diff --git a/trl/trainer/gkd_trainer.py b/trl/trainer/gkd_trainer.py
index 8db258cfbb8..bf20ed68f6f 100644
--- a/trl/trainer/gkd_trainer.py
+++ b/trl/trainer/gkd_trainer.py
@@ -12,438 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-import random
-import textwrap
 import warnings
-from collections.abc import Callable
-from typing import Any
+from dataclasses import dataclass
 
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from datasets import Dataset
-from transformers import (
-    AutoModelForCausalLM,
-    BaseImageProcessor,
-    DataCollator,
-    FeatureExtractionMixin,
-    GenerationConfig,
-    PreTrainedModel,
-    PreTrainedTokenizerBase,
-    ProcessorMixin,
-)
-from transformers.trainer_callback import TrainerCallback
-from transformers.trainer_utils import EvalPrediction
-from transformers.utils import is_liger_kernel_available, is_peft_available
+from ..experimental.gkd import GKDTrainer as _GKDTrainer
 
-from ..models import prepare_deepspeed
-from ..models.utils import unwrap_model_for_generation
-from .gkd_config import GKDConfig
-from .sft_trainer import SFTTrainer
-from .utils import DataCollatorForChatML, disable_dropout_in_model, empty_cache
 
-
-if is_peft_available():
-    from peft import PeftConfig
-
-if is_liger_kernel_available():
-    from liger_kernel.chunked_loss import LigerFusedLinearJSDLoss
-
-
-class GKDTrainer(SFTTrainer):
-    """Trainer for Generalized Knowledge Distillation (GKD) of language models.
-
-    For details on GKD, see the paper: [On-Policy Distillation of Language Models: Learning from Self-Generated
-    Mistakes](https://huggingface.co/papers/2306.13649).
-
-    Args:
-        model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
-            Model to be trained, or the string identifier of the model to be instantiated from a pretrained model.
-        teacher_model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
-            Teacher model for knowledge distillation, or the string identifier of the model to be instantiated from a
-            pretrained model.
-        args ([`GKDConfig`], *optional*):
-            Training arguments.
-        data_collator ([`~transformers.DataCollator`], *optional*):
-            Data collator to batch samples from the dataset. It defaults to a [`DataCollatorForChatML`] using the
-            `processing_class`.
-        train_dataset ([`~datasets.Dataset`], *optional*):
-            Dataset for training.
-        eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
-            Dataset for evaluation.
-        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
-           Class to process the data.
-        compute_metrics (`Callable`, *optional*):
-            Function to compute metrics at evaluation. Must take in an [`~transformers.EvalPrediction`] and return a
-            dictionary string to float.
-        callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
-            Callbacks to use during training.
-        optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
-            Tuple containing the optimizer and the learning rate scheduler to use for training.
-        preprocess_logits_for_metrics (`Callable`, *optional*):
-            Function to preprocess the logits before computing the metrics. Must take in the `logits` and `labels` and
-            return the logits to be used for metrics computation.
-        peft_config ([`~peft.PeftConfig`], *optional*):
-            PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the `model` will be
-            wrapped with the specified PEFT adapter.
-        formatting_func (`Callable`, *optional*):
-            Function to format the dataset. Must take in an example and return an example.
-    """
-
-    _tag_names = ["trl", "gkd"]
-    _name = "GKD"
-    _paper = {
-        "title": "On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes",
-        "id": "2306.13649",
-        # docstyle-ignore
-        "citation": textwrap.dedent("""\
-            @inproceedings{agarwal2024on-policy,
-                title        = {{On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes}},
-                author       = {Rishabh Agarwal and Nino Vieillard and Yongchao Zhou and Piotr Stanczyk and Sabela Ramos Garea and Matthieu Geist and Olivier Bachem},
-                year         = 2024,
-                booktitle    = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
-                publisher    = {OpenReview.net},
-                url          = {https://openreview.net/forum?id=3zKtaqxLhW},
-            }"""),
-    }
-
-    def __init__(
-        self,
-        model: PreTrainedModel | nn.Module | str | None = None,
-        teacher_model: PreTrainedModel | nn.Module | str = None,
-        args: GKDConfig | None = None,
-        data_collator: DataCollator | None = None,  # type: ignore
-        train_dataset: Dataset | None = None,
-        eval_dataset: Dataset | dict[str, Dataset] | None = None,
-        processing_class: PreTrainedTokenizerBase
-        | BaseImageProcessor
-        | FeatureExtractionMixin
-        | ProcessorMixin
-        | None = None,
-        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
-        callbacks: list[TrainerCallback] | None = None,
-        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
-        preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
-        peft_config: "PeftConfig | None" = None,
-        formatting_func: Callable | None = None,
-    ):
-        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
-            warnings.warn(
-                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
-                "it and want it to remain, please share your comments here: "
-                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
-                "TRL_EXPERIMENTAL_SILENCE=1."
-            )
-        # Ensure Trainer does not drop non-signature columns used by the collator (e.g., "prompts")
-        args.remove_unused_columns = False
-        # Respect a user-provided data_collator; otherwise, provide a ChatML collator that
-        if data_collator is None:
-            data_collator = DataCollatorForChatML(tokenizer=processing_class, max_length=args.max_length)
-
-        # Ensure SFTTrainer does not pre-process the dataset when using a ChatML collator,
-        # so that raw conversational fields (e.g., "messages") remain available to the collator.
-        if args.dataset_kwargs is None:
-            args.dataset_kwargs = {"skip_prepare_dataset": True}
-        else:
-            args.dataset_kwargs["skip_prepare_dataset"] = True
-
-        # Liger fused GKD loss (JSD)
-        self.use_liger_gkd_loss = False
-        if args.use_liger_kernel:
-            self.liger_jsd_loss = LigerFusedLinearJSDLoss(
-                beta=args.beta,
-                ignore_index=-100,
-                temperature=args.temperature,
-                compiled=False,
-            )
-            self.use_liger_gkd_loss = True
-
-        super().__init__(
-            model,
-            args=args,
-            data_collator=data_collator,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            processing_class=processing_class,
-            compute_metrics=compute_metrics,
-            callbacks=callbacks,
-            optimizers=optimizers,
-            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
-            peft_config=peft_config,
-            formatting_func=formatting_func,
-        )
-
-        if args.teacher_model_init_kwargs is None:
-            teacher_model_init_kwargs = {}
-        elif not isinstance(teacher_model, str):
-            raise ValueError(
-                "You passed teacher_model_init_kwargs to the GKDConfig, but your teacher_model is already instantiated."
-            )
-        else:
-            teacher_model_init_kwargs = args.teacher_model_init_kwargs
-            teacher_model_init_kwargs["dtype"] = (
-                teacher_model_init_kwargs["dtype"]
-                if teacher_model_init_kwargs["dtype"] in ["auto", None]
-                else getattr(torch, teacher_model_init_kwargs["dtype"])
-            )
-
-        if isinstance(teacher_model, str):
-            teacher_model = AutoModelForCausalLM.from_pretrained(teacher_model, **teacher_model_init_kwargs)
-
-        # Disable dropout in the model
-        if args.disable_dropout:
-            disable_dropout_in_model(self.model)
-
-        if self.is_deepspeed_enabled:
-            self.teacher_model = prepare_deepspeed(teacher_model, self.accelerator)
-        else:
-            self.teacher_model = self.accelerator.prepare_model(teacher_model, evaluation_mode=True)
-
-        self.lmbda = args.lmbda
-        self.beta = args.beta
-        self.temperature = args.temperature
-        self.seq_kd = args.seq_kd
-
-        self.generation_config = GenerationConfig(
-            max_new_tokens=args.max_new_tokens,
-            temperature=args.temperature,
-            do_sample=True,
-            top_k=0,
-            use_cache=False if args.gradient_checkpointing else True,
-            pad_token_id=self.processing_class.pad_token_id,
+@dataclass
+class GKDTrainer(_GKDTrainer):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `GKDTrainer` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.gkd import GKDTrainer`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
         )
-        # Set custom EOS tokens if they are specified by the model's generation
-        # config. This is important for models with the Llama 3 chat template,
-        # which use special tokens <|eot_id|> and <|eom_id|> to mark the end of
-        # turns or messages.
-        if (
-            hasattr(self.model.generation_config, "eos_token_id")
-            and self.model.generation_config.eos_token_id is not None
-        ):
-            self.generation_config.eos_token_id = self.model.generation_config.eos_token_id
-
-    @staticmethod
-    def generalized_jsd_loss(
-        student_logits, teacher_logits, labels=None, beta=0.5, temperature=1.0, reduction="batchmean"
-    ):
-        """
-        Compute the generalized Jensen-Shannon Divergence loss for knowledge distillation using F.kl_div. See Eq. (1)
-        of https://huggingface.co/papers/2306.13649 for the definition.
-
-        Args:
-            student_logits:
-                Tensor of shape (batch_size, sequence_length, vocab_size)
-            teacher_logits:
-                Tensor of shape (batch_size, sequence_length, vocab_size)
-            labels:
-                Tensor of shape (batch_size, sequence_length) with -100 for padding tokens to ignore when computing
-                loss
-            beta:
-                Interpolation coefficient between 0 and 1 (default: 0.5)
-            temperature:
-                Softmax temperature (default: 1.0)
-            reduction:
-                Specifies the reduction to apply to the output (default: 'batchmean')
-
-        Returns:
-            loss: Scalar tensor with the generalized JSD loss
-        """
-
-        # Apply temperature scaling
-        student_logits = student_logits / temperature
-        teacher_logits = teacher_logits / temperature
-
-        # Compute log probabilities for student and probabilities for teacher
-        student_log_probs = F.log_softmax(student_logits, dim=-1)
-        teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)
-
-        if beta == 0:
-            jsd = F.kl_div(student_log_probs, teacher_log_probs, reduction="none", log_target=True)
-        elif beta == 1:
-            jsd = F.kl_div(teacher_log_probs, student_log_probs, reduction="none", log_target=True)
-        else:
-            # Compute the log of the mixture distribution
-            # log(a + b) = log(exp(log(a)) + exp(log(b))) -> for mixture
-            beta = torch.tensor(beta, dtype=student_log_probs.dtype)
-            mixture_log_probs = torch.logsumexp(
-                torch.stack([student_log_probs + torch.log(1 - beta), teacher_log_probs + torch.log(beta)]),
-                dim=0,
-            )
-
-            # Compute KL divergences using F.kl_div
-            # PyTorch differs from the standard mathematical definition, so the order of the probability distributions is swapped compared to that defined in the paper.
-            kl_teacher = F.kl_div(mixture_log_probs, teacher_log_probs, reduction="none", log_target=True)
-            kl_student = F.kl_div(mixture_log_probs, student_log_probs, reduction="none", log_target=True)
-
-            # Compute the Generalized Jensen-Shannon Divergence
-            jsd = beta * kl_teacher + (1 - beta) * kl_student
-
-        # Masking
-        if labels is not None:
-            mask = labels != -100
-            jsd = jsd[mask]
-
-        # Apply reduction
-        if reduction == "batchmean":
-            return jsd.sum() / mask.sum() if labels is not None else jsd.sum() / jsd.size(0)
-        elif reduction == "sum":
-            return jsd.sum()
-        elif reduction == "mean":
-            return jsd.mean()
-        else:
-            return jsd
-
-    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
-        if self.use_liger_gkd_loss:
-            # Forward only through the base models (avoid lm_head to save memory)
-            unwrapped_student = self.accelerator.unwrap_model(model)
-            if hasattr(unwrapped_student, "get_decoder") and unwrapped_student.get_decoder() is not None:
-                base_student = unwrapped_student.get_decoder()
-            else:
-                base_student = getattr(
-                    unwrapped_student, getattr(unwrapped_student, "base_model_prefix", "model"), unwrapped_student
-                )
-
-            student_outputs = base_student(
-                input_ids=inputs["input_ids"],
-                attention_mask=inputs["attention_mask"],
-                use_cache=False,
-            )
-
-            self.teacher_model.eval()
-            unwrapped_teacher = self.accelerator.unwrap_model(self.teacher_model)
-            if hasattr(unwrapped_teacher, "get_decoder") and unwrapped_teacher.get_decoder() is not None:
-                base_teacher = unwrapped_teacher.get_decoder()
-            else:
-                base_teacher = getattr(
-                    unwrapped_teacher, getattr(unwrapped_teacher, "base_model_prefix", "model"), unwrapped_teacher
-                )
-            with torch.no_grad():
-                teacher_outputs = base_teacher(
-                    input_ids=inputs["input_ids"],
-                    attention_mask=inputs["attention_mask"],
-                    use_cache=False,
-                )
-
-            # hidden states (shifted)
-            student_hidden = student_outputs.last_hidden_state[:, :-1]
-            teacher_hidden = teacher_outputs.last_hidden_state[:, :-1]
-
-            # Release full outputs to free memory
-            del student_outputs, teacher_outputs
-
-            # labels mask and labels (shifted)
-            labels_mask = inputs["labels"] != -100
-            masked_input_ids = torch.where(
-                labels_mask, inputs["input_ids"], torch.full_like(inputs["input_ids"], -100)
-            )
-            true_labels = masked_input_ids[:, 1:].contiguous()
-
-            # Release intermediate tensors
-            del labels_mask, masked_input_ids
-
-            # heads
-            student_head = unwrapped_student.get_output_embeddings()
-            teacher_head = unwrapped_teacher.get_output_embeddings()
-
-            # liger fused jsd loss
-            loss = self.liger_jsd_loss(
-                student_input=student_hidden,
-                student_weight=student_head.weight,
-                teacher_input=teacher_hidden,
-                teacher_weight=teacher_head.weight,
-                true_labels=true_labels,
-                student_bias=getattr(student_head, "bias", None),
-                teacher_bias=getattr(teacher_head, "bias", None),
-            )
-
-            # Release hidden states after loss computation
-            del student_hidden, teacher_hidden, true_labels
-        else:
-            # compute student output
-            student_outputs = model(
-                input_ids=inputs["input_ids"],
-                attention_mask=inputs["attention_mask"],
-            )
-
-            # compute teacher output in eval mode
-            self.teacher_model.eval()
-            with torch.no_grad():
-                teacher_outputs = self.teacher_model(
-                    input_ids=inputs["input_ids"],
-                    attention_mask=inputs["attention_mask"],
-                )
-
-            # slice the logits for the generated tokens using the inputs["prompts"] lengths
-            prompt_lengths = inputs["prompts"].shape[1]
-            shifted_student_logits = student_outputs.logits[:, prompt_lengths - 1 : -1, :]
-            shifted_teacher_logits = teacher_outputs.logits[:, prompt_lengths - 1 : -1, :]
-            shifted_labels = inputs["labels"][:, prompt_lengths:]
-
-            # compute loss
-            loss = self.generalized_jsd_loss(
-                student_logits=shifted_student_logits,
-                teacher_logits=shifted_teacher_logits,
-                labels=shifted_labels,
-                beta=self.beta,
-            )
-
-        # empty cache
-        empty_cache()
-
-        # Return loss
-        return (loss, student_outputs) if return_outputs else loss
-
-    @staticmethod
-    def generate_on_policy_outputs(model, inputs, generation_config, pad_token_id=None):
-        # Generate output with respect to the prompt-only
-        generated_outputs = model.generate(
-            input_ids=inputs["prompts"],
-            attention_mask=inputs.get("prompt_attention_mask", None),
-            generation_config=generation_config,
-            return_dict_in_generate=True,
-        )
-
-        # Get the generated token IDs
-        generated_tokens = generated_outputs.sequences
-        # Calculate new attention mask
-        new_attention_mask = torch.ones_like(generated_tokens)
-        new_labels = generated_tokens.clone()
-
-        # If there's pad_token_id, set attention mask to 0 for padding tokens
-        if pad_token_id is not None:
-            new_labels[new_labels == pad_token_id] = -100
-            new_attention_mask[generated_tokens == pad_token_id] = 0
-
-        return generated_tokens, new_attention_mask, new_labels
-
-    def training_step(
-        self, model: nn.Module, inputs: dict[str, torch.Tensor | Any], num_items_in_batch: int | None = None
-    ) -> torch.Tensor:
-        """
-        Perform a training step for the Generalized Knowledge Distillation (GKD) model.
-
-        This method implements the on-policy learning approach described in the GKD paper. With probability
-        `self.lmbda`, it generates new responses using the student model, which are then used for training instead of
-        the original inputs.
-        """
-        if self.seq_kd:
-            with unwrap_model_for_generation(self.teacher_model, self.accelerator) as unwrapped_model:
-                new_input_ids, new_attention_mask, new_labels = self.generate_on_policy_outputs(
-                    unwrapped_model, inputs, self.generation_config, self.processing_class.pad_token_id
-                )
-            inputs["input_ids"] = new_input_ids
-            inputs["attention_mask"] = new_attention_mask
-            inputs["labels"] = new_labels
-        if random.random() <= self.lmbda:
-            with unwrap_model_for_generation(model, self.accelerator) as unwrapped_model:
-                new_input_ids, new_attention_mask, new_labels = self.generate_on_policy_outputs(
-                    unwrapped_model, inputs, self.generation_config, self.processing_class.pad_token_id
-                )
-            inputs["input_ids"] = new_input_ids
-            inputs["attention_mask"] = new_attention_mask
-            inputs["labels"] = new_labels
-
-        loss = super().training_step(model, inputs, num_items_in_batch)
-        return loss
+        super().__init__(*args, **kwargs)
diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py
index 6001a8dc524..2d97d67bd8e 100644
--- a/trl/trainer/grpo_config.py
+++ b/trl/trainer/grpo_config.py
@@ -147,8 +147,8 @@ class GRPOConfig(TrainingArguments):
             `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
             launching the vLLM server via the `--vllm_tensor_parallel_size` flag.
         vllm_enable_sleep_mode (`bool`, *optional*, defaults to `False`):
-            Whether to enable sleep mode for vLLM. If `True`, vLLM will sleep during the optimization step and woken
-            for weight sync and generation.
+            Enable vLLM sleep mode to offload weights/cache during the optimizer step. Keeps GPU memory usage low, but
+            waking the engine adds host–device transfer latency.
 
         > Parameters that control the training
 
@@ -166,6 +166,8 @@ class GRPOConfig(TrainingArguments):
         epsilon_high (`float`, *optional*):
             Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound
             specified in argument `epsilon`. Paper [DAPO](https://huggingface.co/papers/2503.14476) recommends `0.28`.
+            When used with `loss_type='cispo'`, this corresponds to the ε_max param specified in the [ScaleRL
+            paper](https://arxiv.org/pdf/2510.13786) and the recommended value is `5.0`.
         importance_sampling_level (`str`, *optional*, defaults to `"token"`):
             Controls whether importance sampling ratios are computed at the `"token"` or `"sequence"` level. `"token"`
             keeps the raw per-token log-probability ratios (one weight per token). `"sequence"` averages the
@@ -201,6 +203,10 @@ class GRPOConfig(TrainingArguments):
               batch. Note that normalization is performed over the local batch only, so results may slightly vary
               depending on the local batch size, despite a constant effective batch size. When using
               `per_device_train_batch_size==1`, the loss is equivalent to the GRPO loss.
+            - `"cispo"`: Clips the importance sampling weights instead of the advantage scaled importance weights. The
+              clipped weights are then multiplied with the advantages and policy model's log probs. Individual token
+              losses are aggregated by normalizing with the number of active tokens in the global accumulated batch.
+              This method was introduced in the [MiniMax-M1 paper](https://huggingface.co/papers/2506.13585).
         mask_truncated_completions (`bool`, *optional*, defaults to `False`):
             When enabled, truncated completions are excluded from the loss calculation, preventing them from being
             incorrectly penalized and introducing noise during training. According to the
@@ -252,9 +258,20 @@ class GRPOConfig(TrainingArguments):
             `trackio`.
         num_completions_to_print (`int`, *optional*):
             Number of completions to print with `rich`. If `None`, all completions are logged.
-        wandb_log_unique_prompts (`bool`, *optional*, defaults to `False`):
-            Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, all prompts
-            are logged.
+        log_unique_prompts (`bool`, *optional*, defaults to `False`):
+            Whether to log unique prompts. If `True`, only unique prompts are logged. If `False`, all prompts are
+            logged.
+
+        > Deprecated arguments
+
+        wandb_log_unique_prompts (`bool`, *optional*):
+
+            <Deprecated version="0.26.0">
+
+            Parameter `wandb_log_unique_prompts` is deprecated and will be removed in version 0.27.0. Use
+            `log_unique_prompts` instead.
+
+            </Deprecated>
     """
 
     _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
@@ -285,6 +302,16 @@ class GRPOConfig(TrainingArguments):
             "`fp16` is not set."
         },
     )
+    # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
+    # was fixed in https://github.com/huggingface/transformers/pull/41322, but the fix has not yet been released. We
+    # add a temporary workaround here, which can be removed once the fix is available—likely in Transformers 4.57.2.
+    lr_scheduler_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
+            "restarts."
+        },
+    )
 
     # Parameters that control the model and reference model
     model_init_kwargs: dict | str | None = field(
@@ -454,8 +481,8 @@ class GRPOConfig(TrainingArguments):
     vllm_enable_sleep_mode: bool = field(
         default=False,
         metadata={
-            "help": "Whether to enable sleep mode for vLLM. If `True`, vLLM will sleep during the optimization step "
-            "and woken for weight sync and generation."
+            "help": "Enable vLLM sleep mode to offload weights/cache during the optimizer step. Keeps GPU memory "
+            "usage low, but waking the engine adds host–device transfer latency."
         },
     )
     vllm_guided_decoding_regex: str | None = field(
@@ -533,7 +560,9 @@ class GRPOConfig(TrainingArguments):
         default=None,
         metadata={
             "help": "Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the "
-            "lower-bound specified in argument `epsilon`. Paper DAPO recommends `0.28`."
+            "lower-bound specified in argument `epsilon`. Paper DAPO recommends `0.28`. "
+            "When used with `loss_type='cispo'`, this corresponds to the ε_max param specified in the"
+            "[ScaleRL paper]https://huggingface.co/papers/2510.13786) and the recommended value is `5.0`."
         },
     )
     importance_sampling_level: str = field(
@@ -582,6 +611,11 @@ class GRPOConfig(TrainingArguments):
             "Note that normalization is performed over the local batch only, so results may slightly vary depending "
             "on the local batch size, despite a constant effective batch size. When using "
             "`per_device_train_batch_size==1`, the loss is equivalent to the GRPO loss."
+            "'cispo': Clips the importance sampling weights instead of the advantage scaled importance weights. "
+            "The clipped weights are then multiplied with the advantages and policy model's log probs. "
+            "Individual token losses are aggregated by normalizing with the number of active tokens in "
+            "the global accumulated batch. This method was introduced in the "
+            "[MiniMax-M1 paper](https://huggingface.co/papers/2506.13585)."
         },
     )
     mask_truncated_completions: bool = field(
@@ -658,14 +692,19 @@ class GRPOConfig(TrainingArguments):
         default=None,
         metadata={"help": "Number of completions to print with `rich`. If `None`, all completions are logged."},
     )
-    wandb_log_unique_prompts: bool | None = field(
+    log_unique_prompts: bool = field(
         default=False,
         metadata={
-            "help": "Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, "
-            "all prompts are logged."
+            "help": "Whether to log unique prompts. If `True`, only unique prompts are logged. If `False`, all prompts are logged."
         },
     )
 
+    # Deprecated arguments
+    wandb_log_unique_prompts: bool | None = field(
+        default=None,
+        metadata={"help": "Deprecated, use `log_unique_prompts` instead."},
+    )
+
     def __post_init__(self):
         self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
 
@@ -728,3 +767,12 @@ def __post_init__(self):
 
         if self.delta is not None and self.use_liger_kernel:
             raise ValueError("Liger kernel does not support two-sided GRPO loss yet.")
+
+        if self.wandb_log_unique_prompts is not None:
+            warnings.warn(
+                "The `wandb_log_unique_prompts` argument is deprecated and will be removed in version 0.27.0. Please "
+                "use `log_unique_prompts` instead.",
+                FutureWarning,
+                stacklevel=2,
+            )
+            self.log_unique_prompts = self.wandb_log_unique_prompts
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index fe29c78b92a..8abe55e2011 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -15,6 +15,7 @@
 import inspect
 import os
 import textwrap
+import time
 import warnings
 from collections import defaultdict, deque
 from collections.abc import Callable
@@ -44,6 +45,7 @@
     PreTrainedTokenizerBase,
     ProcessorMixin,
     TrainerCallback,
+    is_bitsandbytes_available,
     is_trackio_available,
     is_wandb_available,
 )
@@ -100,6 +102,8 @@
 if is_trackio_available():
     import trackio
 
+if is_bitsandbytes_available():
+    import bitsandbytes as bnb
 
 logger = logging.get_logger(__name__)
 
@@ -107,10 +111,10 @@
 # rewards. When it's a string, it's a model ID, so it's loaded as a pretrained model.
 RewardFunc = str | PreTrainedModel | Callable[[list, list], list[float]]
 
-# What we call a rollout function is a callable that takes prompts (list), args (GRPOConfig), and processing_class as
-# parameters and returns a dict of generation results. Those results must include "prompt_ids", "completion_ids", and
-# "logprobs" fields. Any extra fields (per-completion) are forwarded to the reward functions.
-RolloutFunc = Callable[[list[str], Any, Any], dict[str, Any]]
+# What we call a rollout function is a callable that takes prompts (list) and the trainer instance as parameters and
+# returns a dict of generation results. Those results must include "prompt_ids", "completion_ids", and "logprobs"
+# fields. Any extra fields (per-completion) are forwarded to the reward functions.
+RolloutFunc = Callable[[list[str], "GRPOTrainer"], dict[str, Any]]
 
 
 class GRPOTrainer(BaseTrainer):
@@ -124,21 +128,15 @@ class GRPOTrainer(BaseTrainer):
     ```python
     from datasets import load_dataset
     from trl import GRPOTrainer
+    from trl.rewards import accuracy_reward
 
-    dataset = load_dataset("trl-lib/tldr", split="train")
-
-
-    def reward_func(completions, **kwargs):
-        # Dummy reward function that rewards completions with more unique letters.
-        return [float(len(set(completion))) for completion in completions]
-
+    dataset = load_dataset("trl-lib/DeepMath-103K", split="train")
 
     trainer = GRPOTrainer(
         model="Qwen/Qwen2-0.5B-Instruct",
-        reward_funcs=reward_func,
+        reward_funcs=accuracy_reward,
         train_dataset=dataset,
     )
-
     trainer.train()
     ```
 
@@ -214,10 +212,10 @@ def reward_func(completions, **kwargs):
         peft_config ([`~peft.PeftConfig`], *optional*):
             PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
         rollout_func (`RolloutFunc`, *optional*):
-            Function to use for generating completions. It must take prompts, args, and processing_class as parameters
-            and return a dict with `"prompt_ids"`, `"completion_ids"`, and `"logprobs"` fields. Any other fields that
-            are forwarded to the reward functions. This feature is experimental and may change or be removed at any
-            time without prior notice.
+            Function to use for generating completions. It receives the list of prompts allocated to the current
+            process and the trainer instance. It must return a dict with `"prompt_ids"`, `"completion_ids"`, and
+            `"logprobs"` fields. Any other fields are forwarded to the reward functions. This feature is experimental
+            and may change or be removed at any time without prior notice.
     """
 
     _tag_names = ["trl", "grpo"]
@@ -261,7 +259,7 @@ def __init__(
         model_init_kwargs = args.model_init_kwargs or {}
         if isinstance(model, str):
             model_id = model
-            dtype = model_init_kwargs.get("dtype")
+            dtype = model_init_kwargs.get("dtype", "auto")
             if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None:
                 pass  # dtype is already a torch.dtype or "auto" or None
             elif isinstance(dtype, str):  # it's a str, but not "auto"
@@ -272,7 +270,7 @@ def __init__(
                     "Invalid `dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing "
                     f"a `torch.dtype` (e.g., 'float32'), but got {dtype}."
                 )
-            # Disable caching if gradient checkpointing is enabled (not supported)
+            model_init_kwargs["device_map"] = model_init_kwargs.get("device_map", "auto")
             config = AutoConfig.from_pretrained(model_id)
             architecture = getattr(transformers, config.architectures[0])
             model = architecture.from_pretrained(model_id, **model_init_kwargs)
@@ -531,8 +529,9 @@ def cast_outputs_to_original_dtype(module, args, output):
         # Initialize the metrics
         self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)}
         self._total_train_tokens = 0
+        self._current_train_step_time = 0.0
         self.log_completions = args.log_completions
-        self.wandb_log_unique_prompts = args.wandb_log_unique_prompts
+        self.log_unique_prompts = args.log_unique_prompts
         self.num_completions_to_print = args.num_completions_to_print
         # Keep logs sized to the generation batch to record only outputs from the latest model update.
         self._logs = {
@@ -594,6 +593,15 @@ def cast_outputs_to_original_dtype(module, args, output):
                     max_model_len = self.max_prompt_length + self.max_completion_length
                 else:
                     max_model_len = None
+
+                vllm_quantization = None
+                if is_bitsandbytes_available():
+                    for _, module in model.named_modules():
+                        if isinstance(module, bnb.nn.Linear4bit):
+                            vllm_quantization = "bitsandbytes"
+                            break
+                        elif isinstance(module, bnb.nn.Linear8bitLt):
+                            raise ValueError("vLLM does not support in-flight 8-bit quantization.")
                 self.llm = LLM(
                     model=model.name_or_path,
                     tensor_parallel_size=args.vllm_tensor_parallel_size,
@@ -611,6 +619,7 @@ def cast_outputs_to_original_dtype(module, args, output):
                     enable_sleep_mode=self.args.vllm_enable_sleep_mode,
                     # Important so temperature scaling/logit tweaking affects the TIS log probs
                     logprobs_mode="processed_logprobs",
+                    quantization=vllm_quantization,
                 )
                 if self.args.vllm_enable_sleep_mode:
                     self.llm.sleep(level=2)
@@ -1044,6 +1053,17 @@ def _move_model_to_vllm(self):
         elif self.vllm_mode == "colocate":
             self.llm.reset_prefix_cache()
 
+    def training_step(self, model, inputs, num_items_in_batch):
+        time_before = time.perf_counter()
+        output = super().training_step(model, inputs, num_items_in_batch)
+        self._step += 1
+        time_after = time.perf_counter()
+        self._current_train_step_time += time_after - time_before
+        if self._step % self.current_gradient_accumulation_steps == 0:
+            self._metrics["train"]["step_time"].append(self._current_train_step_time)
+            self._current_train_step_time = 0.0
+        return output
+
     @profiling_decorator
     def _prepare_inputs(self, generation_batch: dict[str, torch.Tensor | Any]) -> dict[str, torch.Tensor | Any]:
         # Prepares inputs for model training/evaluation by managing completion generation and batch handling.
@@ -1070,7 +1090,6 @@ def _prepare_inputs(self, generation_batch: dict[str, torch.Tensor | Any]) -> di
                 generation_batches = split_tensor_dict(generation_batch, self.args.steps_per_generation)
                 self._buffered_inputs = [unsplit_pixel_values_by_grid(batch) for batch in generation_batches]
             inputs = self._buffered_inputs[self._step % self.args.steps_per_generation]
-            self._step += 1
         else:
             # In evaluation, there is neither batch grouping for generation, nor multiple iterations, hence
             # local generation batch == local eval batch
@@ -1177,18 +1196,15 @@ def _generate_single_turn(self, prompts: list):
                     }
                     with profiling_context(self, "vLLM.generate"):
                         if self.rollout_func is not None:
-                            if is_conversational({"prompt": ordered_set_of_prompts[0]}):
-                                ordered_set_of_prompts = [
+                            rollout_prompts = ordered_set_of_prompts
+                            if rollout_prompts and is_conversational({"prompt": rollout_prompts[0]}):
+                                rollout_prompts = [
                                     apply_chat_template(
                                         {"prompt": p}, self.processing_class, **self.chat_template_kwargs
                                     )["prompt"]
-                                    for p in ordered_set_of_prompts
+                                    for p in rollout_prompts
                                 ]
-                            output = self.rollout_func(
-                                ordered_set_of_prompts,
-                                self.args,
-                                self.processing_class,
-                            )
+                            output = self.rollout_func(rollout_prompts, self)
                         else:
                             if is_conversational({"prompt": ordered_set_of_prompts[0]}):
                                 output = self.vllm_client.chat(
@@ -1231,71 +1247,89 @@ def _generate_single_turn(self, prompts: list):
 
             # Generate completions using colocated vLLM instances: each device holds vLLM copy and work on their own batch of prompts
             elif self.vllm_mode == "colocate":
-                if self.guided_decoding_regex:
-                    guided_decoding = GuidedDecodingParams(regex=self.guided_decoding_regex)
-                else:
-                    guided_decoding = None
-
-                generation_kwargs = {
-                    "n": 1,  # vLLM on each GPU generates only 1 in colocate mode
-                    "repetition_penalty": self.repetition_penalty,
-                    "temperature": self.temperature,
-                    "top_p": self.top_p,
-                    "top_k": -1 if self.top_k is None else self.top_k,
-                    "min_p": 0.0 if self.min_p is None else self.min_p,
-                    "max_tokens": self.max_completion_length,
-                    "truncate_prompt_tokens": self.max_prompt_length,
-                    "guided_decoding": guided_decoding,
-                    "logprobs": 0,  # enable returning log probabilities; 0 means for the sampled tokens only
-                }
-                if self.args.generation_kwargs is not None:
-                    generation_kwargs.update(self.args.generation_kwargs)
-                sampling_params = SamplingParams(**generation_kwargs)
-
-                if self.vllm_tensor_parallel_size > 1:
-                    # Gather prompts from all ranks in the TP group and flatten.
-                    # Each rank starts with its own prompts; after gathering, all ranks see the full group set.
-                    orig_size = len(prompts)
-                    gathered_prompts = [None for _ in range(self.vllm_tensor_parallel_size)]
-                    torch.distributed.all_gather_object(gathered_prompts, prompts, group=self.tp_group)
-                    all_prompts = [p for sublist in gathered_prompts for p in sublist]
+                if self.rollout_func is not None:
+                    rollout_prompts = prompts
+                    if rollout_prompts and is_conversational({"prompt": rollout_prompts[0]}):
+                        rollout_prompts = [
+                            apply_chat_template(
+                                {"prompt": prompt}, self.processing_class, **self.chat_template_kwargs
+                            )["prompt"]
+                            for prompt in rollout_prompts
+                        ]
+                    output = self.rollout_func(rollout_prompts, self)
+                    required_keys = {"prompt_ids", "completion_ids", "logprobs"}
+                    extra_fields = {k: v for k, v in output.items() if k not in required_keys}
+                    prompt_ids = output["prompt_ids"]
+                    completion_ids = output["completion_ids"]
+                    logprobs = output["logprobs"]
                 else:
-                    all_prompts = prompts
-
-                if self.args.vllm_enable_sleep_mode:
-                    self.llm.wake_up(tags=["kv_cache"])
+                    if self.guided_decoding_regex:
+                        guided_decoding = GuidedDecodingParams(regex=self.guided_decoding_regex)
+                    else:
+                        guided_decoding = None
 
-                with profiling_context(self, "vLLM.generate"):
-                    if is_conversational({"prompt": prompts[0]}):
-                        all_outputs = self.llm.chat(all_prompts, sampling_params=sampling_params, use_tqdm=False)
+                    generation_kwargs = {
+                        "n": 1,  # vLLM on each GPU generates only 1 in colocate mode
+                        "repetition_penalty": self.repetition_penalty,
+                        "temperature": self.temperature,
+                        "top_p": self.top_p,
+                        "top_k": -1 if self.top_k is None else self.top_k,
+                        "min_p": 0.0 if self.min_p is None else self.min_p,
+                        "max_tokens": self.max_completion_length,
+                        "truncate_prompt_tokens": self.max_prompt_length,
+                        "guided_decoding": guided_decoding,
+                        "logprobs": 0,  # enable returning log probabilities; 0 means for the sampled tokens only
+                    }
+                    if self.args.generation_kwargs is not None:
+                        generation_kwargs.update(self.args.generation_kwargs)
+                    sampling_params = SamplingParams(**generation_kwargs)
+
+                    if self.vllm_tensor_parallel_size > 1:
+                        # Gather prompts from all ranks in the TP group and flatten.
+                        # Each rank starts with its own prompts; after gathering, all ranks see the full group set.
+                        orig_size = len(prompts)
+                        gathered_prompts = [None for _ in range(self.vllm_tensor_parallel_size)]
+                        torch.distributed.all_gather_object(gathered_prompts, prompts, group=self.tp_group)
+                        all_prompts = [p for sublist in gathered_prompts for p in sublist]
                     else:
-                        all_outputs = self.llm.generate(all_prompts, sampling_params=sampling_params, use_tqdm=False)
+                        all_prompts = prompts
 
-                all_prompt_ids = [output.prompt_token_ids for output in all_outputs]
-                all_completion_ids = [output.token_ids for outputs in all_outputs for output in outputs.outputs]
-                all_logprobs = [
-                    [next(iter(lp.values())).logprob for lp in output.logprobs]
-                    for outputs in all_outputs
-                    for output in outputs.outputs
-                ]
+                    if self.args.vllm_enable_sleep_mode:
+                        self.llm.wake_up(tags=["kv_cache"])
 
-                if self.vllm_tensor_parallel_size > 1:
-                    # Slice completions for this rank within its TP group.
-                    # Each rank generates all outputs — we keep only our share.
-                    local_rank_in_group = torch.distributed.get_rank(group=self.tp_group)
-                    tp_slice = slice(local_rank_in_group * orig_size, (local_rank_in_group + 1) * orig_size)
-                    prompt_ids = all_prompt_ids[tp_slice]
-                    completion_ids = all_completion_ids[tp_slice]
-                    logprobs = all_logprobs[tp_slice]
-                else:
-                    prompt_ids = all_prompt_ids
-                    completion_ids = all_completion_ids
-                    logprobs = all_logprobs
+                    with profiling_context(self, "vLLM.generate"):
+                        if is_conversational({"prompt": prompts[0]}):
+                            all_outputs = self.llm.chat(all_prompts, sampling_params=sampling_params, use_tqdm=False)
+                        else:
+                            all_outputs = self.llm.generate(
+                                all_prompts, sampling_params=sampling_params, use_tqdm=False
+                            )
 
-                extra_fields = {}  # No extra fields for colocate mode
+                    all_prompt_ids = [output.prompt_token_ids for output in all_outputs]
+                    all_completion_ids = [output.token_ids for outputs in all_outputs for output in outputs.outputs]
+                    all_logprobs = [
+                        [next(iter(lp.values())).logprob for lp in output.logprobs]
+                        for outputs in all_outputs
+                        for output in outputs.outputs
+                    ]
+
+                    if self.vllm_tensor_parallel_size > 1:
+                        # Slice completions for this rank within its TP group.
+                        # Each rank generates all outputs — we keep only our share.
+                        local_rank_in_group = torch.distributed.get_rank(group=self.tp_group)
+                        tp_slice = slice(local_rank_in_group * orig_size, (local_rank_in_group + 1) * orig_size)
+                        prompt_ids = all_prompt_ids[tp_slice]
+                        completion_ids = all_completion_ids[tp_slice]
+                        logprobs = all_logprobs[tp_slice]
+                    else:
+                        prompt_ids = all_prompt_ids
+                        completion_ids = all_completion_ids
+                        logprobs = all_logprobs
 
-                if self.args.vllm_enable_sleep_mode:
-                    self.llm.sleep(level=2)
+                    extra_fields = {}  # No extra fields for colocate mode
+
+                    if self.args.vllm_enable_sleep_mode:
+                        self.llm.sleep(level=2)
 
         elif self.use_transformers_paged:
             processor_kwargs = {
@@ -1575,6 +1609,9 @@ def _generate_and_score_completions(
             completions = []
             for prompt, completion in zip(prompts, completions_text, strict=True):
                 bootstrap = prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
+                if isinstance(bootstrap, list):  # for VLM, the format might be [{"type": "text", "text": "..."}]
+                    assert len(bootstrap) == 1 and bootstrap[0]["type"] == "text"
+                    bootstrap = bootstrap[0]["text"]
                 completions.append([{"role": "assistant", "content": bootstrap + completion}])
         else:
             completions = completions_text
@@ -1797,6 +1834,10 @@ def _compute_loss(self, model, inputs):
 
         # Compute the loss
         advantages = inputs["advantages"]
+        # In the base GRPO implementation, advantages are expected to have shape (B,). To support subclasses that
+        # provide advantages with shape (B, T) (e.g., MiniLLM), we *conditionally* unsqueeze the tensor.
+        if advantages.dim() == 1:
+            advantages = advantages.unsqueeze(1)
         # When num_iterations == 1 and steps_per_generation <= gradient_accumulation_steps,
         # old_per_token_logps == per_token_logps. In this case we can skip its computation
         # (see _generate_and_score_completions) and instead use per_token_logps.detach().
@@ -1816,19 +1857,26 @@ def _compute_loss(self, model, inputs):
                 f"Unknown importance sampling level: {self.importance_sampling_level}. Possible values are 'token' "
                 "and 'sequence'."
             )
-        # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on
-        # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1)
 
         coef_1 = torch.exp(log_importance_weights)
-        coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)
 
-        # Two-sided clipping
-        if self.args.delta is not None:
-            coef_1 = torch.clamp(coef_1, max=self.args.delta)
+        # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on
+        # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1)
+        if self.loss_type == "cispo":
+            clamped_ratios = torch.clamp(coef_1, max=self.epsilon_high).detach()
+            per_token_loss = -clamped_ratios * advantages * per_token_logps
+        elif self.loss_type in ["grpo", "bnpo", "dr_grpo", "dapo"]:
+            coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)
+            # Two-sided clipping
+            if self.args.delta is not None:
+                coef_1 = torch.clamp(coef_1, max=self.args.delta)
+
+            per_token_loss1 = coef_1 * advantages
+            per_token_loss2 = coef_2 * advantages
+            per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
+        else:
+            raise ValueError(f"Unknown loss type: {self.loss_type}")
 
-        per_token_loss1 = coef_1 * advantages.unsqueeze(1)
-        per_token_loss2 = coef_2 * advantages.unsqueeze(1)
-        per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
         if entropy_mask is not None:
             per_token_loss = per_token_loss * entropy_mask
 
@@ -1847,7 +1895,7 @@ def _compute_loss(self, model, inputs):
         elif self.loss_type == "dr_grpo":
             loss = (per_token_loss * completion_mask).sum() / (per_token_loss.size(0) * self.max_completion_length)
             loss = loss / self.current_gradient_accumulation_steps
-        elif self.loss_type == "dapo":
+        elif self.loss_type in ["cispo", "dapo"]:
             normalizer = inputs["num_items_in_batch"] / self.accelerator.num_processes
             loss = (per_token_loss * completion_mask).sum() / normalizer
         else:
@@ -1871,23 +1919,30 @@ def masked_batch_mean(x):
         mean_entropy = masked_batch_mean(entropies)
         self._metrics[mode]["entropy"].append(self.accelerator.gather(mean_entropy).nanmean().item())
 
-        # Compute the clipped probability ratios
-        is_low_clipped = (coef_1 < 1 - self.epsilon_low) & (advantages.unsqueeze(1) < 0)
-        is_high_clipped = (coef_1 > 1 + self.epsilon_high) & (advantages.unsqueeze(1) > 0)
-        is_region_clipped = is_low_clipped | is_high_clipped
-
-        low_clip = masked_batch_mean(is_low_clipped.float())
-        high_clip = masked_batch_mean(is_high_clipped.float())
-        clip_ratio = masked_batch_mean(is_region_clipped.float())
-
-        gathered_low_clip = self.accelerator.gather(low_clip)
-        self._metrics[mode]["clip_ratio/low_mean"].append(gathered_low_clip.nanmean().item())
-        self._metrics[mode]["clip_ratio/low_min"].append(nanmin(gathered_low_clip).item())
-        gathered_high_clip = self.accelerator.gather(high_clip)
-        self._metrics[mode]["clip_ratio/high_mean"].append(gathered_high_clip.nanmean().item())
-        self._metrics[mode]["clip_ratio/high_max"].append(nanmax(gathered_high_clip).item())
-        gathered_clip_ratio = self.accelerator.gather(clip_ratio)
-        self._metrics[mode]["clip_ratio/region_mean"].append(gathered_clip_ratio.nanmean().item())
+        if self.loss_type in ["grpo", "bnpo", "dr_grpo", "dapo"]:
+            # Compute the clipped probability ratios
+            is_low_clipped = (coef_1 < 1 - self.epsilon_low) & (advantages < 0)
+            is_high_clipped = (coef_1 > 1 + self.epsilon_high) & (advantages > 0)
+            is_region_clipped = is_low_clipped | is_high_clipped
+
+            low_clip = masked_batch_mean(is_low_clipped.float())
+            high_clip = masked_batch_mean(is_high_clipped.float())
+            clip_ratio = masked_batch_mean(is_region_clipped.float())
+
+            gathered_low_clip = self.accelerator.gather(low_clip)
+            self._metrics[mode]["clip_ratio/low_mean"].append(gathered_low_clip.nanmean().item())
+            self._metrics[mode]["clip_ratio/low_min"].append(nanmin(gathered_low_clip).item())
+            gathered_high_clip = self.accelerator.gather(high_clip)
+            self._metrics[mode]["clip_ratio/high_mean"].append(gathered_high_clip.nanmean().item())
+            self._metrics[mode]["clip_ratio/high_max"].append(nanmax(gathered_high_clip).item())
+            gathered_clip_ratio = self.accelerator.gather(clip_ratio)
+            self._metrics[mode]["clip_ratio/region_mean"].append(gathered_clip_ratio.nanmean().item())
+        elif self.loss_type == "cispo":
+            is_cispo_clipped = (coef_1 > self.epsilon_high) & (advantages > 0)
+            cispo_clip_ratio = masked_batch_mean(is_cispo_clipped.float())
+            gathered_cispo_clip_ratio = self.accelerator.gather(cispo_clip_ratio)
+            self._metrics[mode]["cispo_clip_ratio"].append(gathered_cispo_clip_ratio.nanmean().item())
+
         return loss
 
     def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys: list[str] | None = None):
@@ -1922,6 +1977,12 @@ def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
                     self.num_completions_to_print,
                 )
 
+            logging_backends = []
+            if self.args.report_to and "wandb" in self.args.report_to and wandb.run is not None:
+                logging_backends.append(wandb)
+            if self.args.report_to and "trackio" in self.args.report_to:
+                logging_backends.append(trackio)
+
             table = {
                 "step": [str(self.state.global_step)] * len(self._logs["prompt"]),
                 "prompt": self._logs["prompt"],
@@ -1933,34 +1994,23 @@ def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
             df_base = pd.DataFrame(table)
             images_raw = self._logs["images"] or []
 
-            for logging_backend in self.args.report_to:
-                if logging_backend == "wandb":
-                    if images_raw:
-                        images = []
-                        for image_list in self._logs["images"]:
-                            images.append([wandb.Image(image) for image in image_list])
-                        df = pd.concat([df_base, pd.Series(images, name="image")], axis=1, copy=False)
-                    else:
-                        df = df_base
-
-                    if self.wandb_log_unique_prompts:
-                        df = df.drop_duplicates(subset=["prompt"])
-
-                    wandb.log({"completions": wandb.Table(dataframe=df)})
-
-                if logging_backend == "trackio":
-                    if images_raw:
-                        # TODO: Implement once supported upstream https://github.com/gradio-app/trackio/issues/334
-                        logger.info("Skipping image logging for Trackio")
-                        df = df_base
-                        # images = []
-                        # for image_list in self._logs["images"]:
-                        #     images.append([trackio.Image(image) for image in image_list])
-                        # df = pd.concat([df_base, pd.Series(images, name="image")], axis=1, copy=False)
-                    else:
-                        df = df_base
+            for logging_backend in logging_backends:
+                if images_raw:
+                    images = []
+                    for image_list in self._logs["images"]:
+                        images.append([logging_backend.Image(image) for image in image_list])
+                    df = pd.concat(
+                        [df_base, pd.Series(images, name="image")],
+                        axis=1,
+                        copy=False,
+                    )
+                else:
+                    df = df_base
+
+                if self.log_unique_prompts:
+                    df = df.drop_duplicates(subset=["prompt"])
 
-                    trackio.log({"completions": trackio.Table(dataframe=df)})
+                logging_backend.log({"completions": logging_backend.Table(dataframe=df)})
 
     # Ensure the model card is saved along with the checkpoint
     def _save_checkpoint(self, model, trial):
diff --git a/trl/trainer/judges.py b/trl/trainer/judges.py
index 819e76cf6b8..1cb8fbab77c 100644
--- a/trl/trainer/judges.py
+++ b/trl/trainer/judges.py
@@ -12,445 +12,93 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import concurrent.futures
-import logging
-from abc import ABC, abstractmethod
-
-import numpy as np
-from accelerate import Accelerator
-from huggingface_hub import InferenceClient
-from transformers.utils import is_openai_available
-
-from ..import_utils import is_llm_blender_available
-
-
-if is_llm_blender_available():
-    import llm_blender
-
-if is_openai_available():
-    from openai import OpenAI
-
-
-DEFAULT_PAIRWISE_SYSTEM_PROMPT = '''I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
-
-## Instruction
-
-{{
-    "instruction": """{prompt}""",
-}}
-
-## Model Outputs
-
-Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
-
-{{
-    {{
-        "model_identifier": "0",
-        "output": """{response0}"""
-    }},
-    {{
-        "model_identifier": "1",
-        "output": """{response1}"""
-    }}
-}}
-
-## Task
-
-Evaluate the models on the basis of the quality and relevance of their results, and select the model that generated the best result. Reply with the identifier of the best model. Our evaluation will only take into account the first character of your answer, so make sure it contains only one of the identifiers and nothing else (no quotation marks, no spaces, no new lines, ...).
-'''
-
-
-class BaseJudge(ABC):
-    """
-    Base class for judges. The subclasses of this class should implement the `judge` method.
-    """
-
-    @abstractmethod
-    def judge(self, prompts: list[str], completions: list[str], shuffle_order: bool = True) -> list:
-        raise NotImplementedError("Judge subclasses must implement the `judge` method.")
-
-
-class BaseRankJudge(ABC):
-    """
-    Base class for LLM ranking judges.
-
-    **Example**:
-    ```python
-    class MyRankJudge(BaseRankJudge):
-        def judge(self, prompts, completions, shuffle_order=True):
-            return ...  # Your ranking logic here
-
-
-    judge = MyRankJudge()
-    judge.judge(
-        prompts=["The capital of France is", "The capital of Germany is"],
-        completions=[[" Paris", " Marseille", "Lyon"], [" Munich", " Berlin"]],
-    )  # [[0, 1, 2], [1, 0]]
-    ```
-    """
-
-    @abstractmethod
-    def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[list[int]]:
-        """
-        Judge the completion for the given prompts and return the ranks of each completion.
-
-        Args:
-            prompts (`list[str]`):
-                List of prompts.
-            completions (`list[list[str]]`):
-                List of completions list, where each element is a list of completions for the corresponding prompt.
-            shuffle_order (`bool`, *optional*, defaults to `True`):
-                Whether to shuffle the order of the completions to avoid positional bias.
-
-        Returns:
-            `list[list[int]]`:
-                List of lists of idxs, where each list contains the ranks of the completions for the corresponding
-                prompt. E.g., `[1, 2, 0]` means that the second completion (`idx=1`) is the best, followed by the
-                third, and then the first.
-        """
-        raise NotImplementedError("Judge subclasses must implement the `judge` method.")
-
-
-class BasePairwiseJudge(BaseJudge):
-    """
-    Base class for pairwise judges.
-    """
-
-    @abstractmethod
-    def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[int]:
-        """
-        Judge the completion pairs for the given prompts.
-
-        Args:
-            prompts (`list[str]`):
-                List of prompts.
-            completions (`list[list[str]]`):
-                List of completions pairs, where each element is a pair of completions for the corresponding prompt.
-            shuffle_order (`bool`, *optional*, defaults to `True`):
-                Whether to shuffle the order of the completions to avoid positional bias.
-
-        Returns:
-            `list[int]`:
-                List of idxs, where each idx is the rank of the best completion for the corresponding prompt. E.g., `1`
-                means that the second completion (`idx=1`) is the best.
-
-        Note:
-            If the judge returns `-1` for any prompt, it indicates that the inner process used to compute the
-            preference has failed. For instance, this could occur if the underlying language model returned an invalid
-            answer. In such cases, the caller should handle these invalid indices appropriately, possibly by
-            implementing fallback logic or error handling.
-        """
-        raise NotImplementedError("Judge subclasses must implement the `judge` method.")
-
-
-class BaseBinaryJudge(BaseJudge):
-    """
-    Base class for binary judges.
-    """
-
-    @abstractmethod
-    def judge(
-        self,
-        prompts: list[str],
-        completions: list[str],
-        gold_completions: list[str] | None = None,
-        shuffle_order: bool = True,
-    ) -> list[int]:
-        """
-        Judge the completion for a given prompt. Used to assess if a completion satisfies a constraint.
-
-        This base class should be used to implement binary evaluations as done in section 4.1.4 of the [CGPO
-        paper](https://huggingface.co/papers/2409.20370). It is relevant for assessing whether a prompt-completion pair
-        satisfies a specific constraint.
-
-        Args:
-            prompts (`list[str]`): List of prompts.
-            completions (`list[str]`): List of completions.
-            gold_completions (`list[str]`, `optional`): List of gold completions if it exists.
-            shuffle_order (`bool`): Whether to shuffle the order of the completions to avoid positional bias.
-
-        Returns:
-            list[int]: A list of binary labels:
-                - 1 indicates that the completion satisfies the evaluated constraint.
-                - 0 indicates that the completion does not satisfy the evaluated constraint.
-
-        Note:
-            If the judge returns -1 for any prompt, it indicates that the inner process used to compute the preference
-            has failed. For instance, this could occur if the underlying language model or rule based constraint
-            returned an invalid answer. In such cases, the caller should handle these invalid indices appropriately,
-            possibly by implementing fallback logic or error handling.
-        """
-        raise NotImplementedError("Judge subclasses must implement the `judge` method.")
-
-
-class PairRMJudge(BasePairwiseJudge):
-    # docstyle-ignore
-    """
-    LLM judge based on the PairRM model from AllenAI.
-
-    This judge uses the PairRM model to rank pairs of completions for given prompts. It's designed for pairwise
-    comparison of language model outputs. The PairRM model is loaded using the llm-blender library and runs on the
-    default Accelerator device.
-
-    **Attributes**:
-
-        blender (`llm_blender.Blender`):
-            An instance of the Blender class from llm-blender.
-
-    **Example**:
-    ```python
-    >>> pairrm_judge = PairRMJudge()
-    >>> prompts = ["Translate 'hello' to French", "What's the capital of Japan?"]
-    >>> completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]]
-    >>> results = pairrm_judge.judge(prompts, completions)
-    >>> print(results)  # [0, 1] (indicating the first completion is preferred for the first prompt and the second)
-    ```
-
-    > [!TIP]
-    > This class requires the llm-blender library to be installed. Install it with: `pip install llm-blender`.
-    """
-
-    def __init__(self):
-        if not is_llm_blender_available():
-            raise ValueError("llm-blender is not installed. Please install it with `pip install llm-blender`.")
-        self.blender = llm_blender.Blender()
-        self.blender.loadranker("llm-blender/PairRM", device=Accelerator().device)
-
-    def judge(
-        self,
-        prompts: list[str],
-        completions: list[list[str]],
-        shuffle_order: bool = True,
-        return_scores: bool = False,
-        temperature: float = 1.0,
-    ) -> list[int | float]:
-        """
-        Judge the completion pairs for the given prompts using the PairRM model.
-
-        Args:
-            prompts (`list[str]`):
-                List of prompts to judge.
-            completions (`list[list[str]]`):
-                List of completion pairs for each prompt.
-            shuffle_order (`bool`, *optional*, defaults to `True`):
-                Whether to shuffle the order of the completions to avoid positional bias.
-            return_scores (`bool`, *optional*, defaults to `False`):
-                If `True`, return probability scores of the first completion instead of ranks (i.e. a *soft-judge*).
-            temperature (`float`, *optional*, defaults to `1.0`):
-                Temperature for scaling logits if `return_scores` is True.
-
-        Returns:
-            `list[int | float]`:
-                If `return_scores` is `False`, returns a list of ranks (`0` or `1`) for each prompt, indicating which
-                completion is preferred. If `return_scores` is `True`, returns softmax probabilities for the first
-                completion.
-
-        Raises:
-            `ValueError`:
-                If the number of completions per prompt is not exactly 2.
-
-        Note:
-            Unlike llm-blender, ranks are 0-indexed (`0` means the first completion is preferred).
-        """
-
-        if len(completions[0]) != 2:
-            raise ValueError("PairRM judge requires exactly 2 completions per prompt.")
-
-        # Shuffle the order of the completions to avoid positional bias
-        if shuffle_order:
-            flip_mask = np.random.choice([True, False], size=len(prompts))
-            completions = [pair[::-1] if flip else pair for flip, pair in zip(flip_mask, completions, strict=True)]
-
-        # Rank the completions
-        ranks = self.blender.rank(prompts, completions, return_scores=return_scores, disable_tqdm=True)
-        if not return_scores:
-            ranks -= 1  # PairRM rank is 1-indexed, so we subtract 1 to make it 0-indexed
-        else:
-            # scale the logits by temperature
-            ranks /= temperature
-
-        # Flip back the ranks or scores to the original order if needed
-        if shuffle_order:
-            ranks[flip_mask] = ranks[flip_mask][:, ::-1]
-
-        # Return the ranks or score probability
-        if return_scores:
-            logit_max = np.amax(ranks, axis=-1, keepdims=True)
-            exp_logit_shifted = np.exp(ranks - logit_max)
-            probs = exp_logit_shifted / np.sum(exp_logit_shifted, axis=-1, keepdims=True)
-            return probs[:, 0].tolist()
-        else:
-            return ranks[:, 0].tolist()
-
-
-class HfPairwiseJudge(BasePairwiseJudge):
-    """
-    Pairwise judge based on the Hugging Face API with chat completion.
-
-    This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt.
-
-    Args:
-        model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3-70B-Instruct"`):
-            Model to use for the judge.
-        token (`str`, *optional*):
-            Hugging Face API token to use for the [`huggingface_hub.InferenceClient`].
-        system_prompt (`str`, *optional*):
-            The system prompt to be used for the judge. If not provided, a default prompt is used. Note that the system
-            prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the
-            inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token
-            response.
-    """
-
-    def __init__(
-        self,
-        model="meta-llama/Meta-Llama-3-70B-Instruct",
-        token: str | None = None,
-        system_prompt: str | None = None,
-    ):
-        self.client = InferenceClient(model=model, token=token)
-        self.system_prompt = system_prompt or DEFAULT_PAIRWISE_SYSTEM_PROMPT
-
-    def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[int]:
-        # Shuffle the order of the completions to avoid positional bias
-        if shuffle_order:
-            flip_mask = np.random.choice([True, False], size=len(prompts))
-            completions = [pair[::-1] if flip else pair for flip, pair in zip(flip_mask, completions, strict=True)]
-
-        # Define a function to get the rank for a single prompt, will be called concurrently
-        def get_rank(prompt, candidates):
-            content = self.system_prompt.format(prompt=prompt, response0=candidates[0], response1=candidates[1])
-            completion = self.client.chat_completion(messages=[{"role": "user", "content": content}], max_tokens=1)
-            response = completion.choices[0].message.content
-            if response in ["0", "1"]:
-                return int(response)
-            else:
-                logging.debug(f"Invalid response from the judge model: '{response}'. Returning -1.")
-                return -1
-
-        # Call the completions concurrently
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            ranks = list(executor.map(get_rank, prompts, completions))
-
-        # Flip back the ranks to the original order if needed
-        if shuffle_order:
-            ranks = [ranks[i] if not flip else 1 - ranks[i] for i, flip in enumerate(flip_mask)]
-
-        # Return the ranks
-        return ranks
-
-
-class OpenAIPairwiseJudge(BasePairwiseJudge):
-    """
-    Judge based on the OpenAI API.
-
-    This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt.
-
-    Args:
-        model (`str`, *optional*, defaults to `"gpt-4-turbo-preview"`):
-            Model to use for the judge.
-        system_prompt (`str`, *optional*):
-            System prompt to be used for the judge. If not provided, a default prompt is used. Note that the system
-            prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the
-            inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token
-            response.
-        max_requests (`int` or `None`, *optional*, defaults to `1000`):
-            Maximum number of requests to make to the OpenAI API. If set to `None`, there is no limit.
-    """
-
-    def __init__(
-        self, model="gpt-4-turbo-preview", system_prompt: str | None = None, max_requests: int | None = 1_000
-    ):
-        if not is_openai_available():
-            raise ValueError("OpenAI client is not installed. Please install it with 'pip install openai'.")
-        self.client = OpenAI()
-        self.model = model
-        self.system_prompt = system_prompt or DEFAULT_PAIRWISE_SYSTEM_PROMPT
-        self.max_requests = max_requests
-        self.num_requests = 0
-        self._warned = False
-
-    def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[int]:
-        # Check if the limit of requests is reached, if so, use random choice instead
-        if self.max_requests is not None and self.num_requests >= self.max_requests:
-            if not self._warned:  # Print the warning only once
-                logging.warning(
-                    f"Reached the maximum number of requests ({self.max_requests}). From now on, returning -1 instead. "
-                    " To increase the limit, set `max_requests` to a higher value, or to `None` for no limit."
-                )
-                self._warned = True
-            return [-1] * len(prompts)
-
-        # Shuffle the order of the completions to avoid positional bias
-        if shuffle_order:
-            flip_mask = np.random.choice([True, False], size=len(prompts))
-            completions = [pair[::-1] if flip else pair for flip, pair in zip(flip_mask, completions, strict=True)]
-
-        # Define a function to get the rank for a single prompt, will be called concurrently
-        def get_rank(prompt, candidates):
-            content = self.system_prompt.format(prompt=prompt, response0=candidates[0], response1=candidates[1])
-            messages = [{"role": "user", "content": content}]
-            completion = self.client.chat.completions.create(model=self.model, messages=messages, max_tokens=1)
-            response = completion.choices[0].message.content
-            if response in ["0", "1"]:
-                return int(response)
-            else:
-                logging.debug(f"Invalid response from the judge model: '{response}'. Returning -1.")
-                return -1
-
-        # Call the completions concurrently
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            ranks = list(executor.map(get_rank, prompts, completions))
-
-        # Flip back the ranks to the original order if needed
-        if shuffle_order:
-            ranks = [ranks[i] if not flip else 1 - ranks[i] for i, flip in enumerate(flip_mask)]
-
-        # Update the number of requests
-        self.num_requests += len(prompts)
-
-        # Return the ranks
-        return ranks
-
-
-class AllTrueJudge(BaseBinaryJudge):
-    """
-    Unify the decision of multiple [`BaseBinaryJudge`] instances.
-
-    Returns `1` only if all inner binary judges return `1`. If any judge returns `0`, it returns `0`. If any judge
-    returns `-1`, indicating a failure in its process, this judge will also return `-1`.
-
-    Implements the Mixture of Judges as described in the [CGPO paper](https://huggingface.co/papers/2409.20370).
-
-    Args:
-        judges (`list[BaseBinaryJudge]`): A list of [`BaseBinaryJudge`] instances whose decisions will be unified.
-    """
-
-    def __init__(self, judges: list[BaseBinaryJudge]):
-        self.judges = judges
-
-    def judge(
-        self,
-        prompts: list[str],
-        completions: list[str],
-        gold_completions: list[str] | None = None,
-        shuffle_order: bool = True,
-    ) -> list[int]:
-        all_binary_judgments = [
-            judge.judge(prompts, completions, gold_completions, shuffle_order) for judge in self.judges
-        ]
-        output = []
-        for binary_judgments in zip(*all_binary_judgments, strict=True):
-            # Check that all values are in {0, 1, -1}
-            if any(binary_judgment not in {0, 1, -1} for binary_judgment in binary_judgments):
-                raise ValueError(
-                    f"Invalid binary judgment: {binary_judgments}, expected list of values in {{0, 1, -1}}."
-                )
-
-            # Unify the decision
-            if -1 in binary_judgments:
-                output.append(-1)
-            elif all(binary_judgment == 1 for binary_judgment in binary_judgments):
-                output.append(1)
-            else:
-                output.append(0)
-        return output
+import warnings
+
+from ..experimental.judges import AllTrueJudge as _AllTrueJudge
+from ..experimental.judges import BaseBinaryJudge as _BaseBinaryJudge
+from ..experimental.judges import BaseJudge as _BaseJudge
+from ..experimental.judges import BasePairwiseJudge as _BasePairwiseJudge
+from ..experimental.judges import BaseRankJudge as _BaseRankJudge
+from ..experimental.judges import HfPairwiseJudge as _HfPairwiseJudge
+from ..experimental.judges import OpenAIPairwiseJudge as _OpenAIPairwiseJudge
+from ..experimental.judges import PairRMJudge as _PairRMJudge
+
+
+class AllTrueJudge(_AllTrueJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `AllTrueJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import AllTrueJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class BaseBinaryJudge(_BaseBinaryJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `BaseBinaryJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import BaseBinaryJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class BaseJudge(_BaseJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `BaseJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import BaseJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class BasePairwiseJudge(_BasePairwiseJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `BasePairwiseJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import BasePairwiseJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class BaseRankJudge(_BaseRankJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `BaseRankJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import BaseRankJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class HfPairwiseJudge(_HfPairwiseJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `HfPairwiseJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import HfPairwiseJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class OpenAIPairwiseJudge(_OpenAIPairwiseJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `OpenAIPairwiseJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import OpenAIPairwiseJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class PairRMJudge(_PairRMJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `PairRMJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import PairRMJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
diff --git a/trl/trainer/kto_config.py b/trl/trainer/kto_config.py
index 96d8e64dfc9..ee830f76919 100644
--- a/trl/trainer/kto_config.py
+++ b/trl/trainer/kto_config.py
@@ -125,6 +125,16 @@ class KTOConfig(TrainingArguments):
             "`fp16` is not set."
         },
     )
+    # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
+    # was fixed in https://github.com/huggingface/transformers/pull/41322, but the fix has not yet been released. We
+    # add a temporary workaround here, which can be removed once the fix is available—likely in Transformers 4.57.2.
+    lr_scheduler_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
+            "restarts."
+        },
+    )
 
     max_length: int | None = field(
         default=1024,
diff --git a/trl/trainer/kto_trainer.py b/trl/trainer/kto_trainer.py
index 8a7bd5f9a2f..aa615fdc4b6 100644
--- a/trl/trainer/kto_trainer.py
+++ b/trl/trainer/kto_trainer.py
@@ -382,7 +382,7 @@ def __init__(
             raise ValueError("You passed model_kwargs to the KTOTrainer. But your model is already instantiated.")
         else:
             model_init_kwargs = args.model_init_kwargs
-            dtype = model_init_kwargs.get("dtype")
+            dtype = model_init_kwargs.get("dtype", "auto")
             if dtype is not None:
                 # Convert to `torch.dtype` if an str is passed
                 if isinstance(dtype, str) and dtype != "auto":
@@ -392,6 +392,7 @@ def __init__(
                         f"Invalid `dtype` passed to the KTOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
                     )
                 model_init_kwargs["dtype"] = dtype
+            model_init_kwargs["device_map"] = model_init_kwargs.get("device_map", "auto")
 
         if args.ref_model_init_kwargs is None:
             ref_model_init_kwargs = {}
@@ -401,7 +402,7 @@ def __init__(
             )
         else:
             ref_model_init_kwargs = args.ref_model_init_kwargs
-            dtype = ref_model_init_kwargs.get("dtype")
+            dtype = ref_model_init_kwargs.get("dtype", "auto")
             if dtype is not None:
                 # Convert to `torch.dtype` if an str is passed
                 if isinstance(dtype, str) and dtype != "auto":
@@ -411,6 +412,7 @@ def __init__(
                         f"Invalid `dtype` passed to the KTOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
                     )
                 ref_model_init_kwargs["dtype"] = dtype
+            ref_model_init_kwargs["device_map"] = ref_model_init_kwargs.get("device_map", "auto")
 
         if isinstance(model, str):
             model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
diff --git a/trl/trainer/model_config.py b/trl/trainer/model_config.py
index 9e3d5fe8021..84bb1450b6a 100644
--- a/trl/trainer/model_config.py
+++ b/trl/trainer/model_config.py
@@ -43,8 +43,8 @@ class ModelConfig:
             be set to `True` for repositories you trust and in which you have read the code, as it will execute code
             present on the Hub on your local machine.
         attn_implementation (`str`, *optional*):
-            Which attention implementation to use. You can run `--attn_implementation=flash_attention_2`, in which case
-            you must install this manually by running `pip install flash-attn --no-build-isolation`.
+            Which attention implementation to use. More information in the [Kernels Hub Integrations
+            Guide](kernels_hub).
         use_peft (`bool`, *optional*, defaults to `False`):
             Whether to use PEFT for training.
         lora_r (`int`, *optional*, defaults to `16`):
@@ -176,6 +176,10 @@ class ModelConfig:
         default=False,
         metadata={"help": "Whether to use nested quantization."},
     )
+    bnb_4bit_quant_storage: str | None = field(
+        default=None,
+        metadata={"help": "Quantization storage dtype"},
+    )
     # Deprecated params
     torch_dtype: str | None = field(
         default=None,
diff --git a/trl/trainer/nash_md_config.py b/trl/trainer/nash_md_config.py
index 07d8152f4fa..9da8247f59f 100644
--- a/trl/trainer/nash_md_config.py
+++ b/trl/trainer/nash_md_config.py
@@ -12,35 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass, field
+import warnings
+from dataclasses import dataclass
 
-from trl.trainer.online_dpo_config import OnlineDPOConfig
+from ..experimental.nash_md import NashMDConfig as _NashMDConfig
 
 
 @dataclass
-class NashMDConfig(OnlineDPOConfig):
-    r"""
-    Configuration class for the [`NashMDTrainer`].
-
-    Subclass of [`OnlineDPOConfig`] we can use all its arguments and add the following:
-
-    Parameters:
-        mixture_coef (`float` or `list[float]`, *optional*, defaults to `0.5`):
-            Logit mixture coefficient for the model and reference model. If a list of floats is provided then the
-            mixture coefficient is selected for each new epoch and the last coefficient is used for the rest of the
-            epochs.
-    """
-
-    mixture_coef: list[float] = field(
-        default_factory=lambda: [0.5],
-        metadata={
-            "help": "Logit mixture coefficient for the model and reference model. If a list of floats is provided "
-            "then the mixture coefficient is selected for each new epoch and the last coefficient is used for the "
-            "rest of the epochs."
-        },
-    )
-
+class NashMDConfig(_NashMDConfig):
     def __post_init__(self):
+        warnings.warn(
+            "The `NashMDConfig` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.nash_md import NashMDConfig`. The current import path will be removed and no "
+            "longer supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
+        )
         super().__post_init__()
-        if hasattr(self.mixture_coef, "__len__") and len(self.mixture_coef) == 1:
-            self.mixture_coef = self.mixture_coef[0]
diff --git a/trl/trainer/nash_md_trainer.py b/trl/trainer/nash_md_trainer.py
index ea9bb74cf86..23aae32b9e5 100644
--- a/trl/trainer/nash_md_trainer.py
+++ b/trl/trainer/nash_md_trainer.py
@@ -12,484 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import textwrap
-from collections.abc import Callable
-from typing import Any
+import warnings
+from dataclasses import dataclass
 
-import jinja2
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from datasets import Dataset, IterableDataset
-from transformers import (
-    BaseImageProcessor,
-    FeatureExtractionMixin,
-    PreTrainedModel,
-    PreTrainedTokenizerBase,
-    ProcessorMixin,
-    TrainerCallback,
-)
-from transformers.trainer_utils import EvalPrediction
-from transformers.training_args import OptimizerNames
-from transformers.utils import is_peft_available
+from ..experimental.nash_md import NashMDTrainer as _NashMDTrainer
 
-from ..data_utils import is_conversational, maybe_apply_chat_template
-from ..models.modeling_base import GeometricMixtureWrapper
-from ..models.utils import unwrap_model_for_generation
-from .judges import BasePairwiseJudge
-from .nash_md_config import NashMDConfig
-from .online_dpo_trainer import OnlineDPOTrainer
-from .utils import (
-    SIMPLE_CHAT_TEMPLATE,
-    empty_cache,
-    get_reward,
-    selective_log_softmax,
-    truncate_right,
-)
 
-
-if is_peft_available():
-    from peft import PeftModel
-
-
-class NashMDTrainer(OnlineDPOTrainer):
-    """
-    Trainer for the Nash-MD method.
-
-    It is implemented as a subclass of [`OnlineDPOTrainer`].
-
-    Args:
-        model ([`~transformers.PreTrainedModel`]):
-            The model to train, preferably an `AutoModelForCausalLM`.
-        ref_model ([`PreTrainedModelWrapper`]):
-            Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
-            and loss. If no reference model is provided, the trainer will create a reference model with the same
-            architecture as the model to be optimized.
-        reward_funcs ([`~transformers.PreTrainedModel`]):
-            The reward model to score completions with, preferably an
-            [`~transformers.AutoModelForSequenceClassification`].
-        judge ([`BasePairwiseJudge`]):
-            The judge to use for pairwise comparison of model completions.
-        args ([`NashMDConfig`]):
-            The NashMD config arguments to use for training.
-        data_collator ([`~transformers.DataCollator`]):
-            The data collator to use for training. If None is specified, the default data collator
-            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
-            sequences in the batch, given a dataset of paired sequences.
-        train_dataset ([`~datasets.Dataset`]):
-            The dataset to use for training.
-        eval_dataset ([`~datasets.Dataset`]):
-            The dataset to use for evaluation.
-        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
-            Processing class used to process the data. If provided, will be used to automatically process the inputs
-            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
-            reuse the fine-tuned model.
-        peft_config (`dict`):
-            The peft config to use for training.
-        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
-            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
-            metric values.
-        callbacks (`list[transformers.TrainerCallback]`):
-            The callbacks to use for training.
-        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
-            The optimizer and scheduler to use for training.
-        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
-            The function to use to preprocess the logits before computing the metrics.
-    """
-
-    _tag_names = ["trl", "nash-md"]
-    _name = "Nash-MD"
-    _paper = {
-        "title": "Nash Learning from Human Feedback",
-        "id": "2312.00886",
-        # docstyle-ignore
-        "citation": textwrap.dedent("""\
-            @inproceedings{munos2024nash,
-                title        = {{Nash Learning from Human Feedback}},
-                author       = {R{\'{e}}mi Munos and Michal Valko and Daniele Calandriello and Mohammad Gheshlaghi Azar and Mark Rowland and Zhaohan Daniel Guo and Yunhao Tang and Matthieu Geist and Thomas Mesnard and C{\\^{o}}me Fiegel and Andrea Michi and Marco Selvi and Sertan Girgin and Nikola Momchev and Olivier Bachem and Daniel J. Mankowitz and Doina Precup and Bilal Piot},
-                year         = 2024,
-                booktitle    = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024},
-                publisher    = {OpenReview.net},
-                url          = {https://openreview.net/forum?id=Y5AmNYiyCQ}
-            }"""),
-    }
-
-    def __init__(
-        self,
-        model: PreTrainedModel | nn.Module = None,
-        ref_model: PreTrainedModel | nn.Module = None,
-        reward_funcs: PreTrainedModel | nn.Module | None = None,
-        judge: BasePairwiseJudge | None = None,
-        args: NashMDConfig | None = None,
-        data_collator: Callable | None = None,
-        train_dataset: Dataset | IterableDataset | None = None,
-        eval_dataset: Dataset | dict[str, Dataset] | None = None,
-        processing_class: PreTrainedTokenizerBase
-        | BaseImageProcessor
-        | FeatureExtractionMixin
-        | ProcessorMixin
-        | None = None,
-        peft_config: dict | None = None,
-        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
-        callbacks: list[TrainerCallback] | None = None,
-        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
-        preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
-    ) -> None:
-        super().__init__(
-            model=model,
-            ref_model=ref_model,
-            reward_funcs=reward_funcs,
-            judge=judge,
-            args=args,
-            data_collator=data_collator,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            processing_class=processing_class,
-            reward_processing_classes=processing_class,
-            peft_config=peft_config,
-            compute_metrics=compute_metrics,
-            callbacks=callbacks,
-            optimizers=optimizers,
-            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
-        )
-
-        self._mixture_coef = self.args.mixture_coef
-
-        # Overwrite the stats dictionary to include NashMD specific statistics
-        self.stats = {
-            # Remove "non_score_reward", "rlhf_reward", "scores_margin"
-            # Add "mixture_coef"
-            "loss/kl": [],
-            "objective/entropy": [],
-            "loss/score": [],
-            "rewards/probabilities": [],
-            "rewards/accuracies": [],
-            "rewards/margins": [],
-            "logps/chosen": [],
-            "logps/rejected": [],
-            "val/model_contain_eos_token": [],
-            "val/ref_contain_eos_token": [],
-            "beta": [],
-            "mixture_coef": [],
-        }
-        if self.reward_funcs is not None:
-            if len(self.reward_funcs) != 1:
-                raise ValueError("NashMDTrainer only supports one reward function/model.")
-            self.reward_funcs = self.reward_funcs[0]
-            self.stats["rewards/chosen"] = []
-            self.stats["rewards/rejected"] = []
-
-    @property
-    def mixture_coef(self):
-        if isinstance(self._mixture_coef, list):
-            epoch = self.state.epoch
-            return self._mixture_coef[epoch] if epoch < len(self._mixture_coef) else self._mixture_coef[-1]
-        else:
-            return self._mixture_coef
-
-    def _generate_completions(self, model, prompts):
-        # Generate completions from the policy model.
-        with unwrap_model_for_generation(model, self.accelerator) as unwrapped_policy_for_gen_ctx:
-            model_output = unwrapped_policy_for_gen_ctx.generate(
-                input_ids=prompts["input_ids"],
-                attention_mask=prompts["attention_mask"],
-                generation_config=self.generation_config,
-            )
-
-        # Get the DDP/FSDP unwrapped version of the main model.
-        # This will be the policy model for GeometricMixtureWrapper (PEFT adapters active if PEFT is used).
-        policy_model_for_gmw = self.accelerator.unwrap_model(model)
-
-        # Determine the correct reference model for GeometricMixtureWrapper.
-        # This also needs to be DDP/FSDP unwrapped.
-        ref_model_for_gmw: torch.nn.Module
-        if self.ref_model is None:
-            # No explicit ref_model is provided.
-            # Use the base of the main `model` if it's a PEFT model.
-            # policy_model_for_gmw is already DDP-unwrapped.
-            if is_peft_available() and isinstance(policy_model_for_gmw, PeftModel):
-                ref_model_for_gmw = policy_model_for_gmw.get_base_model()
-            else:
-                # Not a PEFT model (or PEFT not available), or already a base model.
-                # Use the DDP-unwrapped policy model itself as the reference.
-                ref_model_for_gmw = policy_model_for_gmw
-        else:
-            # An explicit ref_model is provided. Unwrap it for DDP/FSDP.
-            ref_model_for_gmw = self.accelerator.unwrap_model(self.ref_model)
-
-        # Both models given to GeometricMixtureWrapper (policy_model_for_gmw and ref_model_for_gmw) are DDP-unwrapped.
-        with torch.no_grad():  # Ensure no_grad context for mixture model generation
-            mixture_model = GeometricMixtureWrapper(
-                model=policy_model_for_gmw,
-                ref_model=ref_model_for_gmw,
-                generation_config=self.generation_config,
-                mixture_coef=self.mixture_coef,
-                device=self.accelerator.device,
-            )
-
-            mixture_output = mixture_model.generate(
-                input_ids=prompts["input_ids"],
-                attention_mask=prompts["attention_mask"],
-                generation_config=self.generation_config,
-            )
-
-        return model_output, mixture_output
-
-    def _process_completions(self, model_output, mixture_output, prompts):
-        context_length = prompts["input_ids"].shape[1]
-
-        # Process model completions
-        model_completion_ids = model_output[:, context_length:]
-        model_completion_ids, model_completion_mask = truncate_right(
-            model_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id
+@dataclass
+class NashMDTrainer(_NashMDTrainer):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `NashMDTrainer` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.nash_md import NashMDTrainer`. The current import path will be removed and no "
+            "longer supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
         )
-        model_data = {
-            "input_ids": torch.cat((prompts["input_ids"], model_completion_ids), dim=1),
-            "attention_mask": torch.cat((prompts["attention_mask"], model_completion_mask), dim=1),
-            "raw": prompts["raw"],
-        }
-
-        # Process reference model completions
-        mixture_completion_ids = mixture_output[:, context_length:]
-        mixture_completion_ids, mixture_completion_mask = truncate_right(
-            mixture_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id
-        )
-        mixture_data = {
-            "input_ids": torch.cat((prompts["input_ids"], mixture_completion_ids), dim=1),
-            "attention_mask": torch.cat((prompts["attention_mask"], mixture_completion_mask), dim=1),
-            "raw": prompts["raw"],
-        }
-
-        return model_data, mixture_data
-
-    def _compute_rewards(self, model_data, mixture_data, context_length):
-        with torch.no_grad():
-            _, model_scores, _ = get_reward(
-                self.reward_funcs, model_data["input_ids"], self.processing_class.pad_token_id, context_length
-            )
-            _, mixture_scores, _ = get_reward(
-                self.reward_funcs, mixture_data["input_ids"], self.processing_class.pad_token_id, context_length
-            )
-
-        # Apply EOS penalty if needed
-        if self.args.missing_eos_penalty is not None:
-            model_contain_eos = torch.any(model_data["input_ids"] == self.processing_class.eos_token_id, dim=-1)
-            mixture_contain_eos = torch.any(mixture_data["input_ids"] == self.processing_class.eos_token_id, dim=-1)
-            model_scores[~model_contain_eos] -= self.args.missing_eos_penalty
-            mixture_scores[~mixture_contain_eos] -= self.args.missing_eos_penalty
-
-        return model_scores, mixture_scores
-
-    def _compute_judge(self, model_data, mixture_data, context_length):
-        prompts = model_data["raw"]
-        model_data_completions = self.processing_class.batch_decode(
-            model_data["input_ids"][:, context_length:], skip_special_tokens=True
-        )
-        model_data_completions = [completion.strip() for completion in model_data_completions]
-
-        mixture_data_completions = self.processing_class.batch_decode(
-            mixture_data["input_ids"][:, context_length:], skip_special_tokens=True
-        )
-        mixture_data_completions = [completion.strip() for completion in mixture_data_completions]
-        if is_conversational({"prompt": prompts[0]}):
-            model_data_completions = [
-                [{"role": "assistant", "content": completion}] for completion in model_data_completions
-            ]
-            environment = jinja2.Environment()
-            template = environment.from_string(SIMPLE_CHAT_TEMPLATE)
-            prompts = [template.render(messages=message) for message in prompts]
-            model_data_completions = [template.render(messages=completion) for completion in model_data_completions]
-
-            mixture_data_completions = [
-                [{"role": "assistant", "content": completion}] for completion in mixture_data_completions
-            ]
-            mixture_data_completions = [
-                template.render(messages=completion) for completion in mixture_data_completions
-            ]
-
-        probability = self.judge.judge(
-            prompts,
-            list(zip(model_data_completions, mixture_data_completions, strict=True)),
-            return_scores=True,
-        )
-        return torch.tensor(probability, device=model_data["input_ids"].device)
-
-    def _compute_logprobs(self, model, model_data, context_length):
-        def compute_logprobs_for_data(m, data):
-            output = m(data["input_ids"], attention_mask=data["attention_mask"])
-            logits = output.logits[:, context_length - 1 : -1]
-            token_logprobs = selective_log_softmax(logits, data["input_ids"][:, context_length:])
-            return token_logprobs
-
-        # Compute logprobs for model completions under the model
-        model_logprobs_model_data = compute_logprobs_for_data(model, model_data)
-
-        # Compute logprobs of model completions under the reference model
-        with torch.no_grad():
-            if self.ref_model is None:
-                with model.disable_adapter():
-                    ref_logprobs_model_data = compute_logprobs_for_data(model, model_data)
-            else:
-                ref_logprobs_model_data = compute_logprobs_for_data(self.ref_model, model_data)
-
-        # Mask padding tokens
-        model_padding_mask = model_data["attention_mask"][:, context_length:] == 0
-        model_logprobs_model_data = model_logprobs_model_data.masked_fill(model_padding_mask, 0.0)
-        ref_logprobs_model_data = ref_logprobs_model_data.masked_fill(model_padding_mask, 0.0)
-
-        return (model_logprobs_model_data, ref_logprobs_model_data)
-
-    def _compute_losses(
-        self,
-        model_logprobs_model_data,
-        ref_logprobs_model_data,
-        probability,
-    ):
-        # reinforce score where 0.5 is a control variate
-        score = (probability - 0.5) * model_logprobs_model_data.sum(1)
-
-        # kl divergence via reinforce
-        with torch.no_grad():
-            log_ratio = model_logprobs_model_data - ref_logprobs_model_data
-            kl_div_log = log_ratio.sum(1)
-        kl_div_loss = (log_ratio * model_logprobs_model_data).sum(1)
-
-        # final loss
-        loss = self.beta * kl_div_loss - score
-
-        return loss.mean(), score, kl_div_log
-
-    def _log_statistics(
-        self,
-        model_data,
-        mixture_data,
-        model_logprobs_model_data,
-        ref_logprobs_model_data,
-        probability,
-        score,
-        kl_div,
-        context_length,
-        model_scores=None,
-        mixture_scores=None,
-    ):
-        # Helper function to gather and compute mean
-        def gather_mean(tensor):
-            return self.accelerator.gather_for_metrics(tensor).mean().item()
-
-        # Log score
-        self.stats["loss/score"].append(gather_mean(score))
-        # Log KL divergence
-        self.stats["loss/kl"].append(gather_mean(kl_div))
-
-        # Log logprobs
-        model_logprobs_model_data_sum = model_logprobs_model_data.sum(1)
-        ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1)
-
-        self.stats["logps/chosen"].append(gather_mean(model_logprobs_model_data_sum))
-        self.stats["logps/rejected"].append(gather_mean(ref_logprobs_model_data_sum))
-
-        # Log rewards
-        if self.reward_funcs is not None:
-            self.stats["rewards/chosen"].append(gather_mean(model_scores))
-            self.stats["rewards/rejected"].append(gather_mean(mixture_scores))
-
-        # Log probabilities
-        self.stats["rewards/probabilities"].append(gather_mean(probability))
-
-        # Calculate entropy for model data
-        entropy_model_data = -model_logprobs_model_data.sum(1)
-        self.stats["objective/entropy"].append(gather_mean(entropy_model_data))
-
-        # Calculate margins
-        margin = model_logprobs_model_data_sum - ref_logprobs_model_data_sum
-        self.stats["rewards/margins"].append(gather_mean(margin))
-
-        # Calculate accuracy
-        accuracy = (margin > 0).float()
-        self.stats["rewards/accuracies"].append(gather_mean(accuracy))
-
-        # Log EOS token statistics
-        model_eos = (model_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1)
-        mixture_eos = (mixture_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1)
-        self.stats["val/model_contain_eos_token"].append(gather_mean(model_eos.float()))
-        self.stats["val/ref_contain_eos_token"].append(gather_mean(mixture_eos.float()))
-
-        # Log beta and mixture coef
-        self.stats["beta"].append(self.beta)
-        self.stats["mixture_coef"].append(self.mixture_coef)
-
-    def training_step(
-        self, model: nn.Module, inputs: dict[str, torch.Tensor | Any], num_items_in_batch: int | None = None
-    ) -> torch.Tensor:
-        model.train()
-
-        # Apply chat template and tokenize the input
-        batch_size = len(next(iter(inputs.values())))
-        prompts = inputs["prompt"]
-        inputs = [{k: v[i] for k, v in inputs.items()} for i in range(batch_size)]
-        inputs = [maybe_apply_chat_template(x, self.processing_class) for x in inputs]
-        inputs = [self.tokenize_row(x, self.model.config.is_encoder_decoder, self.processing_class) for x in inputs]
-        inputs = self.data_collator(inputs)
-
-        # need the prompt_ only
-        inputs = self._prepare_inputs(inputs)
-        context_length = inputs["prompt_input_ids"].shape[1]
-        prompts = {
-            "input_ids": inputs["prompt_input_ids"],
-            "attention_mask": inputs["prompt_attention_mask"],
-            "raw": prompts,
-        }
-        del inputs
-
-        # Sample completions from both the model and the reference model
-        model_output, mixture_output = self._generate_completions(model, prompts)
-
-        # Process model completions
-        model_data, mixture_data = self._process_completions(model_output, mixture_output, prompts)
-
-        # Compute rewards
-        if self.reward_funcs is not None:
-            model_scores, mixture_scores = self._compute_rewards(model_data, mixture_data, context_length)
-            # probability of the model data vs the mixture data
-            probability = F.sigmoid(model_scores - mixture_scores)
-        else:
-            model_scores, mixture_scores = None, None
-            probability = self._compute_judge(model_data, mixture_data, context_length)
-
-        # Compute logprobs
-        model_logprobs_model_data, ref_logprobs_model_data = self._compute_logprobs(model, model_data, context_length)
-
-        # Compute loss
-        loss, score, kl_div = self._compute_losses(model_logprobs_model_data, ref_logprobs_model_data, probability)
-
-        # Log everything
-        self._log_statistics(
-            model_data,
-            mixture_data,
-            model_logprobs_model_data.detach(),
-            ref_logprobs_model_data,
-            probability,
-            score.detach(),
-            kl_div.detach(),
-            context_length,
-            model_scores,
-            mixture_scores,
-        )
-
-        if (
-            self.args.torch_empty_cache_steps is not None
-            and self.state.global_step % self.args.torch_empty_cache_steps == 0
-        ):
-            empty_cache()
-
-        kwargs = {}
-        # For LOMO optimizers you need to explicitly use the learning rate
-        if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
-            kwargs["learning_rate"] = self._get_learning_rate()
-
-        if self.args.n_gpu > 1:
-            loss = loss.mean()  # mean() to average on multi-gpu parallel training
-
-        self.accelerator.backward(loss, **kwargs)
-
-        return loss.detach() / self.args.gradient_accumulation_steps
+        super().__init__(*args, **kwargs)
diff --git a/trl/trainer/online_dpo_config.py b/trl/trainer/online_dpo_config.py
index 680aeb922f1..d2d21e3c3ee 100644
--- a/trl/trainer/online_dpo_config.py
+++ b/trl/trainer/online_dpo_config.py
@@ -171,6 +171,16 @@ class may differ from those in [`~transformers.TrainingArguments`].
             "`fp16` is not set."
         },
     )
+    # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
+    # was fixed in https://github.com/huggingface/transformers/pull/41322, but the fix has not yet been released. We
+    # add a temporary workaround here, which can be removed once the fix is available—likely in Transformers 4.57.2.
+    lr_scheduler_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
+            "restarts."
+        },
+    )
 
     reward_model_path: str | None = field(
         default=None,
diff --git a/trl/trainer/online_dpo_trainer.py b/trl/trainer/online_dpo_trainer.py
index bbf5aea469c..1b4e487c861 100644
--- a/trl/trainer/online_dpo_trainer.py
+++ b/trl/trainer/online_dpo_trainer.py
@@ -45,6 +45,7 @@
     ProcessorMixin,
     Trainer,
     TrainerCallback,
+    is_bitsandbytes_available,
 )
 from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
 from transformers.trainer_utils import EvalPrediction, seed_worker
@@ -67,7 +68,6 @@
     unwrap_model_for_generation,
 )
 from .base_trainer import BaseTrainer
-from .judges import BasePairwiseJudge
 from .online_dpo_config import OnlineDPOConfig
 from .utils import (
     SIMPLE_CHAT_TEMPLATE,
@@ -98,6 +98,8 @@
     from vllm import LLM, SamplingParams
     from vllm.sampling_params import GuidedDecodingParams
 
+if is_bitsandbytes_available():
+    import bitsandbytes as bnb
 
 logger = logging.get_logger(__name__)
 
@@ -123,7 +125,7 @@ class OnlineDPOTrainer(BaseTrainer):
         ref_model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `None`):
             The reference model to use for training. If None is specified, the reference model will be created from the
             model.
-        judge ([`BasePairwiseJudge`]):
+        judge ([`experimental.judges.BasePairwiseJudge`]):
             The judge to use for pairwise comparison of model completions.
         reward_funcs (`RewardFunc | list[RewardFunc]`, *optional*):
             Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
@@ -189,7 +191,7 @@ def __init__(
         model: PreTrainedModel | nn.Module | str,
         ref_model: PreTrainedModel | nn.Module | None = None,
         reward_funcs: RewardFunc | list[RewardFunc] | None = None,
-        judge: BasePairwiseJudge | None = None,
+        judge=None,
         args: OnlineDPOConfig | None = None,
         data_collator: DataCollator | None = None,
         train_dataset: Dataset | IterableDataset | None = None,
@@ -305,7 +307,7 @@ def __init__(
             model_id = model
 
             # Handle dtype in model_init_kwargs
-            dtype = model_init_kwargs.get("dtype")
+            dtype = model_init_kwargs.get("dtype", "auto")
             if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None:
                 pass
             elif isinstance(dtype, str):
@@ -316,6 +318,7 @@ def __init__(
                     "Invalid `dtype` passed to `OnlineDPOConfig`. Expected either 'auto' or a string "
                     f"representing a `torch.dtype` (e.g., 'float32'), but got {dtype}."
                 )
+            model_init_kwargs["device_map"] = model_init_kwargs.get("device_map", "auto")
 
             model = AutoModelForCausalLM.from_pretrained(model_id, **model_init_kwargs)
         else:
@@ -463,7 +466,11 @@ def __init__(
                     else:
                         base_url = f"http://{args.vllm_server_host}:{args.vllm_server_port}"
                     self.vllm_client = VLLMClient(base_url=base_url, connection_timeout=args.vllm_server_timeout)
-                    self.vllm_client.init_communicator(device=torch.cuda.current_device())
+
+                    # Determine device type (supports cuda, xpu, etc.)
+                    accelerator_type = torch.accelerator.current_accelerator().type
+                    current_device = getattr(torch, accelerator_type).current_device()
+                    self.vllm_client.init_communicator(device=current_device)
                 else:
                     self.vllm_client = None
             elif self.vllm_mode == "colocate":
@@ -473,6 +480,14 @@ def __init__(
                 # after the first optimizer step and remain in GPU memory throughout training. So we must reserve enough
                 # space for them.
                 # Configure vLLM parameters
+                vllm_quantization = None
+                if is_bitsandbytes_available():
+                    for _, module in model.named_modules():
+                        if isinstance(module, bnb.nn.Linear4bit):
+                            vllm_quantization = "bitsandbytes"
+                            break
+                        elif isinstance(module, bnb.nn.Linear8bitLt):
+                            raise ValueError("vLLM does not support in-flight 8-bit quantization.")
                 vllm_kwargs = {
                     "model": model.name_or_path,
                     "tensor_parallel_size": self.vllm_tensor_parallel_size,
@@ -485,6 +500,7 @@ def __init__(
                     "seed": self.accelerator.process_index // self.vllm_tensor_parallel_size,
                     # Latest vLLM v1 memory profiler is misled by the high default value (i.e., 32768)
                     "max_num_batched_tokens": 4096,
+                    "quantization": vllm_quantization,
                 }
 
                 # vLLM requires the environment variables to be set for distributed training.
@@ -755,7 +771,7 @@ def _generate_vllm_server(self, prompts, images=None):
                 max_tokens=self.generation_config.max_tokens,
                 guided_decoding_regex=self.guided_decoding_regex if hasattr(self, "guided_decoding_regex") else None,
                 generation_kwargs=self.args.generation_kwargs,
-            )
+            )["completion_ids"]
             # Flatten: each prompt generates 2 completions
             completion_ids = [[comp_id] for prompt_completions in completion_ids for comp_id in prompt_completions]
         else:
diff --git a/trl/trainer/orpo_config.py b/trl/trainer/orpo_config.py
index 73d2345e88c..e5f30575fd7 100644
--- a/trl/trainer/orpo_config.py
+++ b/trl/trainer/orpo_config.py
@@ -15,35 +15,15 @@
 import warnings
 from dataclasses import dataclass
 
-from ..experimental.orpo import ORPOConfig as ExperimentalORPOConfig
+from ..experimental.orpo import ORPOConfig as _ORPOConfig
 
 
 @dataclass
-class ORPOConfig(ExperimentalORPOConfig):
-    r"""
-    Configuration class for the [`ORPOTrainer`].
-
-    <Deprecated version="0.25.0">
-
-    This class has been moved to `trl.experimental.orpo.ORPOConfig` and will be removed in TRL 0.29.0.
-    Please update your imports:
-
-    ```python
-    from trl.experimental.orpo import ORPOConfig
-    ```
-
-    For more details, see: https://github.com/huggingface/trl/issues/4223
-
-    </Deprecated>
-    """
-
+class ORPOConfig(_ORPOConfig):
     def __post_init__(self):
         warnings.warn(
-            "ORPOConfig has been moved to trl.experimental.orpo.ORPOConfig and will be removed from "
-            "trl.trainer in TRL 0.29.0. Please update your imports to: "
-            "`from trl.experimental.orpo import ORPOConfig`. "
-            "For more details, see: https://github.com/huggingface/trl/issues/4223",
-            FutureWarning,
-            stacklevel=2,
+            "The `ORPOConfig` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.orpo import ORPOConfig`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
         )
         super().__post_init__()
diff --git a/trl/trainer/orpo_trainer.py b/trl/trainer/orpo_trainer.py
index 7f5feb34489..b91a59eef35 100644
--- a/trl/trainer/orpo_trainer.py
+++ b/trl/trainer/orpo_trainer.py
@@ -13,82 +13,17 @@
 # limitations under the License.
 
 import warnings
-from collections.abc import Callable
-
-import torch
-import torch.nn as nn
-from datasets import Dataset
-from transformers import (
-    BaseImageProcessor,
-    DataCollator,
-    FeatureExtractionMixin,
-    PreTrainedModel,
-    PreTrainedTokenizerBase,
-    ProcessorMixin,
-)
-from transformers.trainer_callback import TrainerCallback
-from transformers.trainer_utils import EvalLoopOutput
+from dataclasses import dataclass
 
 from ..experimental.orpo import ORPOTrainer as _ORPOTrainer
-from .orpo_config import ORPOConfig
 
 
+@dataclass
 class ORPOTrainer(_ORPOTrainer):
-    """
-    Initialize ORPOTrainer.
-
-    <Deprecated version="0.25.0">
-
-    This class has been moved to `trl.experimental.orpo.ORPOTrainer` and will be removed in TRL 0.29.0.
-    Please update your imports:
-
-    ```python
-    from trl.experimental.orpo import ORPOTrainer
-    ```
-
-    For more details, see: https://github.com/huggingface/trl/issues/4223
-
-    </Deprecated>
-    """
-
-    def __init__(
-        self,
-        model: PreTrainedModel | nn.Module | str | None = None,
-        args: ORPOConfig | None = None,
-        data_collator: DataCollator | None = None,
-        train_dataset: Dataset | None = None,
-        eval_dataset: Dataset | dict[str, Dataset] | None = None,
-        processing_class: PreTrainedTokenizerBase
-        | BaseImageProcessor
-        | FeatureExtractionMixin
-        | ProcessorMixin
-        | None = None,
-        model_init: Callable[[], PreTrainedModel] | None = None,
-        callbacks: list[TrainerCallback] | None = None,
-        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
-        preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
-        peft_config: dict | None = None,
-        compute_metrics: Callable[[EvalLoopOutput], dict] | None = None,
-    ):
+    def __init__(self, *args, **kwargs):
         warnings.warn(
-            "ORPOTrainer has been moved to trl.experimental.orpo.ORPOTrainer and will be removed from "
-            "trl.trainer in TRL 0.29.0. Please update your imports to: "
-            "`from trl.experimental.orpo import ORPOTrainer`. "
-            "For more details, see: https://github.com/huggingface/trl/issues/4223",
-            FutureWarning,
-            stacklevel=2,
-        )
-        super().__init__(
-            model=model,
-            args=args,
-            data_collator=data_collator,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            processing_class=processing_class,
-            model_init=model_init,
-            callbacks=callbacks,
-            optimizers=optimizers,
-            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
-            peft_config=peft_config,
-            compute_metrics=compute_metrics,
+            "The `ORPOTrainer` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.orpo import ORPOTrainer`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
         )
+        super().__init__(*args, **kwargs)
diff --git a/trl/trainer/ppo_config.py b/trl/trainer/ppo_config.py
index 40d48b82dbf..e38cd190fe8 100644
--- a/trl/trainer/ppo_config.py
+++ b/trl/trainer/ppo_config.py
@@ -12,124 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-from dataclasses import dataclass, field
-from typing import Literal
+import warnings
+from dataclasses import dataclass
 
-from ..trainer.utils import OnPolicyConfig
+from ..experimental.ppo import PPOConfig as _PPOConfig
 
 
 @dataclass
-class PPOConfig(OnPolicyConfig):
-    r"""
-    Configuration class for the [`PPOTrainer`].
-
-    This class includes only the parameters that are specific to PPO training. For a full list of training arguments,
-    please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default
-    values in this class may differ from those in [`~transformers.TrainingArguments`].
-
-    Using [`~transformers.HfArgumentParser`] we can turn this class into
-    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
-    command line.
-
-    Parameters:
-        exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[:-3]`):
-            Name of this experiment.
-        reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
-            Path to the reward model.
-        model_adapter_name (`str`, *optional*):
-            Name of the train target PEFT adapter, when using LoRA with multiple adapters.
-        ref_adapter_name (`str`, *optional*):
-            Name of the reference PEFT adapter, when using LoRA with multiple adapters.
-        num_ppo_epochs (`int`, *optional*, defaults to `4`):
-            Number of epochs to train.
-        whiten_rewards (`bool`, *optional*, defaults to `False`):
-            Whether to whiten the rewards.
-        kl_coef (`float`, *optional*, defaults to `0.05`):
-            KL coefficient.
-        kl_estimator (`Literal["k1", "k3"]`, *optional*, defaults to `"k1"`):
-            Which estimator for KL-Divergence to use from [Approximating KL
-            Divergence](http://joschu.net/blog/kl-approx.html). Defaults to "k1", a straightforward, unbiased
-            estimator. Can be set to "k3", an unbiased estimator with lower variance which "appears to be a strictly
-            better estimator". Cannot be set to "k2", as it is used for logging purposes.
-        cliprange (`float`, *optional*, defaults to `0.2`):
-            Clip range.
-        vf_coef (`float`, *optional*, defaults to `0.1`):
-            Value function coefficient.
-        cliprange_value (`float`, *optional*, defaults to `0.2`):
-            Clip range for the value function.
-        gamma (`float`, *optional*, defaults to `1.0`):
-            Discount factor.
-        lam (`float`, *optional*, defaults to `0.95`):
-            Lambda value for GAE.
-        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
-            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
-            improving generation speed. However, disabling this option allows training models that exceed the VRAM
-            capacity of a single GPU, albeit at the cost of slower generation.
-    """
-
-    exp_name: str = field(
-        default=os.path.basename(__file__)[:-3],
-        metadata={"help": "Name of this experiment."},
-    )
-    reward_model_path: str = field(
-        default="EleutherAI/pythia-160m",
-        metadata={"help": "Path to the reward model."},
-    )
-    model_adapter_name: str | None = field(
-        default=None,
-        metadata={"help": "Name of the train target PEFT adapter, when using LoRA with multiple adapters."},
-    )
-    ref_adapter_name: str | None = field(
-        default=None,
-        metadata={"help": "Name of the reference PEFT adapter, when using LoRA with multiple adapters."},
-    )
-    num_ppo_epochs: int = field(
-        default=4,
-        metadata={"help": "Number of epochs to train."},
-    )
-    whiten_rewards: bool = field(
-        default=False,
-        metadata={"help": "Whether to whiten the rewards."},
-    )
-    kl_coef: float = field(
-        default=0.05,
-        metadata={"help": "KL coefficient."},
-    )
-    kl_estimator: Literal["k1", "k3"] = field(
-        default="k1",
-        metadata={
-            "help": "Which estimator for KL-Divergence to use from Approximating KL Divergence "
-            "(http://joschu.net/blog/kl-approx.html). Defaults to 'k1', a straightforward, unbiased estimator. Can be "
-            "set to 'k3', an unbiased estimator with lower variance which 'appears to be a strictly better "
-            "estimator'. Cannot be set to 'k2', as it is used for logging purposes."
-        },
-    )
-    cliprange: float = field(
-        default=0.2,
-        metadata={"help": "Clip range."},
-    )
-    vf_coef: float = field(
-        default=0.1,
-        metadata={"help": "Value function coefficient."},
-    )
-    cliprange_value: float = field(
-        default=0.2,
-        metadata={"help": "Clip range for the value function."},
-    )
-    gamma: float = field(
-        default=1.0,
-        metadata={"help": "Discount factor."},
-    )
-    lam: float = field(
-        default=0.95,
-        metadata={"help": "Lambda value for GAE."},
-    )
-    ds3_gather_for_generation: bool = field(
-        default=True,
-        metadata={
-            "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for "
-            "generation, improving generation speed. However, disabling this option allows training models that "
-            "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation."
-        },
-    )
+class PPOConfig(_PPOConfig):
+    def __post_init__(self):
+        warnings.warn(
+            "The `PPOConfig` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.ppo import PPOConfig`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
+        )
+        super().__post_init__()
diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index c4e5182d48f..fb2b94dfa37 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -12,833 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import gc
-import math
-import os
-import textwrap
-import time
 import warnings
-from collections import defaultdict
-from contextlib import contextmanager, nullcontext
-from pathlib import Path
+from dataclasses import dataclass
 
-import numpy as np
-import pandas as pd
-import torch
-import torch.nn as nn
-from accelerate import Accelerator, logging
-from accelerate.utils import broadcast, gather_object
-from datasets import Dataset
-from torch.utils.data import DataLoader
-from transformers import (
-    BaseImageProcessor,
-    DataCollatorWithPadding,
-    FeatureExtractionMixin,
-    GenerationConfig,
-    PreTrainedTokenizerBase,
-    ProcessorMixin,
-    TrainerCallback,
-    TrainerControl,
-)
-from transformers.integrations import get_reporting_integration_callbacks
-from transformers.trainer import DEFAULT_CALLBACKS, DEFAULT_PROGRESS_CALLBACK
-from transformers.trainer_callback import CallbackHandler, ExportableState, PrinterCallback
-from transformers.utils import is_peft_available, is_rich_available
+from ..experimental.ppo import PPOTrainer as _PPOTrainer
 
-from ..models import create_reference_model
-from ..models.utils import unwrap_model_for_generation
-from .base_trainer import BaseTrainer
-from .ppo_config import PPOConfig
-from .utils import (
-    OnlineTrainerState,
-    batch_generation,
-    disable_dropout_in_model,
-    empty_cache,
-    exact_div,
-    first_true_indices,
-    forward,
-    get_reward,
-    log_table_to_comet_experiment,
-    peft_module_casting_to_bf16,
-    prepare_deepspeed,
-    print_rich_table,
-    selective_log_softmax,
-    truncate_response,
-)
 
-
-logger = logging.get_logger(__name__)
-
-if is_peft_available():
-    from peft import PeftConfig, PeftModel, get_peft_model
-
-
-INVALID_LOGPROB = 1.0
-
-
-def masked_mean(values: torch.Tensor, mask: torch.Tensor, axis: bool | None = None) -> torch.Tensor:
-    """Compute mean of tensor with a masked values."""
-    if axis is not None:
-        return (values * mask).sum(axis=axis) / mask.sum(axis=axis)
-    else:
-        return (values * mask).sum() / mask.sum()
-
-
-def masked_var(values: torch.Tensor, mask: torch.Tensor, unbiased: bool = True) -> torch.Tensor:
-    """Compute variance of tensor with masked values."""
-    mean = masked_mean(values, mask)
-    centered_values = values - mean
-    variance = masked_mean(centered_values**2, mask)
-    if unbiased:
-        mask_sum = mask.sum()
-        if mask_sum == 0:
-            raise ValueError(
-                "The sum of the mask is zero, which can happen when `mini_batch_size=1`;"
-                "try increase the `mini_batch_size` or `gradient_accumulation_steps`"
-            )
-        # note that if mask_sum == 1, then there is a division by zero issue
-        # to avoid it you just need to use a larger minibatch_size
-        bessel_correction = mask_sum / (mask_sum - 1)
-        variance = variance * bessel_correction
-    return variance
-
-
-def masked_whiten(values: torch.Tensor, mask: torch.Tensor, shift_mean: bool = True) -> torch.Tensor:
-    """Whiten values with masked values."""
-    mean, var = masked_mean(values, mask), masked_var(values, mask)
-    whitened = (values - mean) * torch.rsqrt(var + 1e-8)
-    if not shift_mean:
-        whitened += mean
-    return whitened
-
-
-# taken from https://github.com/OpenLMLab/MOSS-RLHF/blob/40b91eb2f2b71b16919addede0341d2bef70825d/ppo/ppo_trainer.py#L29
-# we did this we can do a single `model = accelerator.prepare(model)`
-class PolicyAndValueWrapper(nn.Module):
-    def __init__(self, policy, value_model) -> None:
-        super().__init__()
-        self.policy = policy
-        self.value_model = value_model
-        self.critic_backbone = getattr(value_model, value_model.base_model_prefix)
-        self.is_gradient_checkpointing = policy.is_gradient_checkpointing
-
-    def forward(self, **kwargs):
-        output = self.critic_backbone(**kwargs)
-        logits = self.value_model.score(output.hidden_states[-1])
-        return self.policy(**kwargs), logits
-
-
-class PPOTrainer(BaseTrainer):
-    """Trainer for Proximal Policy Optimization (PPO).
-
-    For details on PPO, see the paper: [Proximal Policy Optimization
-    Algorithms](https://huggingface.co/papers/1707.06347).
-
-    Args:
-        args ([`PPOConfig`]):
-            Training arguments.
-        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`]):
-            Class to process the data.
-        model (`torch.nn.Module`):
-            Model to be trained. This is the policy model.
-        ref_model (`torch.nn.Module`, *optional*):
-            Reference model used to compute the KL divergence. If `None`, a copy of the policy model is created.
-        reward_model (`torch.nn.Module`):
-            Reward model used to compute the rewards.
-        train_dataset ([`~datasets.Dataset`]):
-            Dataset for training.
-        value_model (`torch.nn.Module`):
-            Value model used to predict the value of a state.
-        data_collator ([`~transformers.DataCollatorWithPadding`], *optional*):
-            Data collator to batch and pad samples from the dataset. If `None`, a default data collator is created
-            using the `processing_class`.
-        eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
-            Dataset for evaluation.
-        optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
-            Tuple containing the optimizer and the learning rate scheduler to use for training. If `None`, the
-            optimizer and the learning rate scheduler are created using the
-            [`~transformers.Trainer.create_optimizer_and_scheduler`] method.
-        callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
-            Callbacks to use during training.
-        peft_config ([`~peft.PeftConfig`], *optional*):
-            PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the policy `model`
-            will be wrapped with the specified PEFT adapter.
-    """
-
-    _tag_names = ["trl", "ppo"]
-    _name = "PPO"
-    _paper = {
-        "title": "Fine-Tuning Language Models from Human Preferences",
-        "id": "1909.08593",
-        # docstyle-ignore
-        "citation": textwrap.dedent("""\
-            @article{mziegler2019fine-tuning,
-                title        = {{Fine-Tuning Language Models from Human Preferences}},
-                author       = {Daniel M. Ziegler and Nisan Stiennon and Jeffrey Wu and Tom B. Brown and Alec Radford and Dario Amodei and Paul F. Christiano and Geoffrey Irving},
-                year         = 2019,
-                eprint       = {arXiv:1909.08593}
-            }"""),
-    }
-
-    def __init__(
-        self,
-        args: PPOConfig,
-        processing_class: PreTrainedTokenizerBase | BaseImageProcessor | FeatureExtractionMixin | ProcessorMixin,
-        model: nn.Module,
-        ref_model: nn.Module | None,
-        reward_model: nn.Module,
-        train_dataset: Dataset,
-        value_model: nn.Module,
-        data_collator: DataCollatorWithPadding | None = None,
-        eval_dataset: Dataset | dict[str, Dataset] | None = None,
-        # less commonly used
-        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
-        callbacks: list[TrainerCallback] | None = None,
-        peft_config: "PeftConfig | None" = None,
-    ) -> None:
-        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
-            warnings.warn(
-                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
-                "it and want it to remain, please share your comments here: "
-                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
-                "TRL_EXPERIMENTAL_SILENCE=1."
-            )
-        if ref_model is model:
-            raise ValueError(
-                "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
-                "same as `model`, you must make a copy of it, or `None` if you use peft."
-            )
-
-        self.args = args
-        self.processing_class = processing_class
-        self.policy_model = model
-
-        # Define the collator if not provided
-        if data_collator is None:
-            data_collator = DataCollatorWithPadding(self.processing_class)
-
-        # Handle stop token settings: update policy model's generation_config to use provided stop token
-        if args.stop_token and args.stop_token_id:
-            raise ValueError("You cannot set both `stop_token` and `stop_token_id`.")
-        elif args.stop_token:
-            if args.stop_token == "eos":
-                self.policy_model.generation_config.eos_token_id = self.stop_token_id = processing_class.eos_token_id
-            else:
-                raise ValueError(
-                    f"Unknown `stop_token` {args.stop_token}. Allowed values are: `'eos'` and `None` (no stop token)."
-                )
-        else:
-            self.policy_model.generation_config.eos_token_id = self.stop_token_id = args.stop_token_id  # None or int
-
-        # Check that the kl estimator is valid
-        if self.args.kl_estimator not in {"k1", "k3"}:
-            raise ValueError(
-                "kl_estimator must be either 'k1' (straightforward, unbiased) or 'k3' (lower variance, unbiased, "
-                "appears to be a strictly better estimator). See "
-                "[Approximating KL Divergence](http://joschu.net/blog/kl-approx.html) for details."
-            )
-
-        # peft support
-        if not is_peft_available() and peft_config is not None:
-            raise ImportError(
-                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
-            )
-        elif is_peft_available() and peft_config is not None:
-            # if model is a peft model and we have a peft_confg, we merge and unload it first
-            if isinstance(self.policy_model, PeftModel):
-                self.policy_model = self.policy_model.merge_and_unload()
-
-            # get peft model with the given config
-            self.policy_model = get_peft_model(self.policy_model, peft_config)
-            if args.bf16 and getattr(self.policy_model, "is_loaded_in_4bit", False):
-                peft_module_casting_to_bf16(self.policy_model)
-
-        self.is_peft_model = is_peft_available() and isinstance(self.policy_model, PeftModel)
-        self.model_adapter_name = args.model_adapter_name
-        self.ref_adapter_name = args.ref_adapter_name
-
-        if ref_model:
-            self.ref_model = ref_model
-        elif self.is_peft_model:
-            self.ref_model = None
-        else:
-            self.ref_model = create_reference_model(self.policy_model)
-
-        self.reward_model = reward_model
-        self.train_dataset = train_dataset
-        self.train_dataset_len = len(train_dataset)
-        self.value_model = value_model
-        self.data_collator = data_collator
-        self.eval_dataset = eval_dataset
-        self.optimizer, self.lr_scheduler = optimizers
-        self.optimizer_cls_and_kwargs = None  # needed for transformers >= 4.47
-
-        #########
-        # calculate various batch sizes
-        #########
-        if args.total_episodes is None:  # allow the users to define episodes in terms of epochs.
-            args.total_episodes = int(args.num_train_epochs * self.train_dataset_len)
-        accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)
-        self.accelerator = accelerator
-        args.world_size = accelerator.num_processes
-        args.local_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps
-        args.micro_batch_size = int(args.per_device_train_batch_size * args.world_size)
-        args.batch_size = int(args.local_batch_size * args.world_size)
-        args.mini_batch_size = exact_div(
-            args.batch_size, args.num_mini_batches, "`batch_size` must be a multiple of `num_mini_batches`"
-        )
-        args.local_mini_batch_size = exact_div(
-            args.local_batch_size, args.num_mini_batches, "`local_batch_size` must be a multiple of `num_mini_batches`"
-        )
-        if args.whiten_rewards:
-            assert args.local_mini_batch_size >= 8, (
-                f"Per-rank minibatch size {args.local_mini_batch_size} is insufficient for whitening"
-            )
-        # `per_rank_rollout_batch_size` is our `args.local_batch_size`
-        # `per_rank_minibatch_size` is our `args.local_mini_batch_size`
-        args.num_total_batches = math.ceil(
-            args.total_episodes / args.batch_size
-        )  # we may train for more than `total_episodes`
-        time_tensor = torch.tensor(int(time.time()), device=accelerator.device)
-        time_int = broadcast(time_tensor, 0).item()  # avoid different timestamps across processes
-        args.run_name = f"{args.exp_name}__{args.seed}__{time_int}"
-        self.local_seed = args.seed + accelerator.process_index * 100003  # Prime
-        if args.num_sample_generations > 0:
-            self.sample_generations_freq = max(1, args.num_total_batches // args.num_sample_generations)
-        self.local_dataloader_batch_size = args.local_batch_size
-
-        #########
-        # setup model, optimizer, and others
-        #########
-        for module in [self.policy_model, self.ref_model, self.value_model, self.reward_model]:
-            if module is not None:
-                disable_dropout_in_model(module)
-        self.model = PolicyAndValueWrapper(self.policy_model, self.value_model)
-        self.model.config = self.policy_model.config  # needed for pushing to hub
-        self.create_optimizer_and_scheduler(
-            num_training_steps=args.num_total_batches
-        )  # note that we are calling `self.lr_scheduler.step()` manually only at the batch level
-
-        #########
-        # trainer specifics
-        #########
-        default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
-        self.callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
-        self.callback_handler = CallbackHandler(
-            self.callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler
-        )
-        self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)
-        self.control = TrainerControl()
-        self.state = OnlineTrainerState(
-            is_local_process_zero=self.is_local_process_zero(),
-            is_world_process_zero=self.is_world_process_zero(),
-            stateful_callbacks=[
-                cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
-            ],
-        )
-        self.current_flos = 0
-        self.hp_search_backend = None
-        self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None
-        self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None
-        # Create distant repo and output directory if needed
-        self.hub_model_id = None
-        if self.args.push_to_hub:
-            self.init_hf_repo()
-        if self.args.should_save:
-            os.makedirs(self.args.output_dir, exist_ok=True)
-
-        # Add tags for models that have been loaded with the correct transformers version
-        if hasattr(self.model, "add_model_tags"):
-            self.model.add_model_tags(self._tag_names)
-
-        #########
-        # setup dataloader
-        #########
-        self.dataloader = DataLoader(
-            self.train_dataset,
-            batch_size=self.local_dataloader_batch_size,
-            shuffle=True,
-            collate_fn=self.data_collator,
-            drop_last=True,  # needed; otherwise the last batch will be of ragged shape
-        )
-        # sync random states for DataLoader(shuffle=True) before `accelerator.prepare`
-        # see https://gist.github.com/vwxyzjn/2581bff1e48e185e0b85b6dfe1def79c
-        torch.manual_seed(args.seed)
-        self.model, self.optimizer, self.dataloader = accelerator.prepare(self.model, self.optimizer, self.dataloader)
-        torch.manual_seed(self.local_seed)  # reset the local seed again
-
-        self.eval_dataloader = DataLoader(
-            self.eval_dataset,
-            batch_size=args.per_device_eval_batch_size,
-            collate_fn=self.data_collator,
-            drop_last=True,
-        )  # no need to shuffle eval dataset
-        self.eval_dataloader = accelerator.prepare(self.eval_dataloader)
-
-        if self.is_deepspeed_enabled:
-            self.reward_model = prepare_deepspeed(
-                self.reward_model, args.per_device_train_batch_size, args.fp16, args.bf16
-            )
-
-            if self.ref_model is None:
-                if not self.is_peft_model:
-                    raise ValueError("No reference model and model is not a Peft model.")
-            else:
-                self.ref_model = prepare_deepspeed(
-                    self.ref_model, args.per_device_train_batch_size, args.fp16, args.bf16
-                )
-        else:
-            if self.ref_model is None:
-                if not self.is_peft_model:
-                    raise ValueError("No reference model and model is not a Peft model.")
-            else:
-                self.ref_model = self.ref_model.to(self.accelerator.device)
-            self.reward_model = self.reward_model.to(self.accelerator.device)
-
-    def get_train_dataloader(self) -> DataLoader:
-        return self.dataloader
-
-    def get_eval_dataloader(self) -> DataLoader:
-        return self.eval_dataloader
-
-    @contextmanager
-    def null_ref_context(self):
-        """Context manager for handling null reference model (that is, peft adapter manipulation)."""
-        with (
-            self.accelerator.unwrap_model(self.model.policy).disable_adapter()
-            if self.is_peft_model and not self.ref_adapter_name
-            else nullcontext()
-        ):
-            if self.ref_adapter_name:
-                self.model.policy.set_adapter(self.ref_adapter_name)
-            yield
-            if self.ref_adapter_name:
-                self.model.policy.set_adapter(self.model_adapter_name or "default")
-
-    def save_model(self, output_dir: str | None = None, _internal_call: bool = False):
-        backup_model = self.model
-        self.model = self.model.policy  # save only the policy
-
-        if self.is_deepspeed_enabled:
-            backup_deepspeed = self.deepspeed
-            self.deepspeed = self.model
-
-        super().save_model(output_dir, _internal_call)
-
-        self.model = backup_model
-
-        if self.is_deepspeed_enabled:
-            self.deepspeed = backup_deepspeed
-
-    def train(self):
-        args = self.args
-        accelerator = self.accelerator
-        optimizer = self.optimizer
-        model = self.model
-        ref_policy = self.ref_model
-        reward_model = self.reward_model
-        processing_class = self.processing_class
-        dataloader = self.dataloader
-        device = accelerator.device
-
-        def repeat_generator():
-            while True:
-                yield from dataloader
-
-        iter_dataloader = iter(repeat_generator())
-        generation_config = GenerationConfig(
-            max_new_tokens=args.response_length,
-            temperature=(args.temperature + 1e-7),
-            top_k=0.0,
-            top_p=1.0,
-            do_sample=True,
-        )
-
-        accelerator.print("===training policy===")
-        start_time = time.time()
-        stats_shape = (args.num_ppo_epochs, args.num_mini_batches, args.gradient_accumulation_steps)
-        approxkl_stats = torch.zeros(stats_shape, device=device)
-        pg_clipfrac_stats = torch.zeros(stats_shape, device=device)
-        pg_loss_stats = torch.zeros(stats_shape, device=device)
-        vf_loss_stats = torch.zeros(stats_shape, device=device)
-        vf_clipfrac_stats = torch.zeros(stats_shape, device=device)
-        entropy_stats = torch.zeros(stats_shape, device=device)
-        ratio_stats = torch.zeros(stats_shape, device=device)
-        model.train()
-
-        # trainer state initialization
-        self.state.global_step = 0
-        self.state.episode = 0
-        self.state.max_steps = args.num_total_batches
-        self.state.num_train_epochs = args.total_episodes / self.train_dataset_len
-        # Compute absolute values for logging, eval, and save if given as ratio
-        if args.logging_steps is not None:
-            if args.logging_steps < 1:
-                self.state.logging_steps = math.ceil(self.state.max_steps * args.logging_steps)
-            else:
-                self.state.logging_steps = args.logging_steps
-        if args.eval_steps is not None:
-            if args.eval_steps < 1:
-                self.state.eval_steps = math.ceil(self.state.max_steps * args.eval_steps)
-            else:
-                self.state.eval_steps = args.eval_steps
-        if args.save_steps is not None:
-            if args.save_steps < 1:
-                self.state.save_steps = math.ceil(self.state.max_steps * args.save_steps)
-            else:
-                self.state.save_steps = args.save_steps
-        self.control = self.callback_handler.on_train_begin(args, self.state, self.control)
-
-        # backward compatibility
-        if self.is_deepspeed_enabled:
-            self.deepspeed = self.model
-            self.model_wrapped = self.model
-
-        for update in range(1, args.num_total_batches + 1):
-            self.state.episode += 1 * args.batch_size
-            data = next(iter_dataloader)
-            with torch.no_grad():
-                queries = data["input_ids"].to(device)
-                context_length = queries.shape[1]
-                responses = []
-                postprocessed_responses = []
-                logprobs = []
-                ref_logprobs = []
-                scores = []
-                sequence_lengths = []
-                values = []
-                with unwrap_model_for_generation(
-                    self.model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation
-                ) as unwrapped_model:
-                    query_responses, logitss = batch_generation(
-                        unwrapped_model.policy,
-                        queries,
-                        args.local_rollout_forward_batch_size,
-                        processing_class.pad_token_id,
-                        generation_config,
-                    )
-
-                for i in range(0, queries.shape[0], args.local_rollout_forward_batch_size):
-                    query = queries[i : i + args.local_rollout_forward_batch_size]
-                    query_response = query_responses[i : i + args.local_rollout_forward_batch_size]
-                    response = query_response[:, context_length:]
-                    logits = logitss[i : i + args.local_rollout_forward_batch_size]
-                    logprob = selective_log_softmax(logits, response)
-                    del logits
-                    empty_cache()
-
-                    if ref_policy is None:
-                        with self.null_ref_context():
-                            ref_output = forward(model.policy, query_response, processing_class.pad_token_id)
-                    else:
-                        ref_output = forward(ref_policy, query_response, processing_class.pad_token_id)
-                    ref_logits = ref_output.logits[:, context_length - 1 : -1]
-                    ref_logits /= args.temperature + 1e-7
-                    ref_logprob = selective_log_softmax(ref_logits, response)
-                    del ref_output, ref_logits
-                    empty_cache()
-
-                    # Response Processing 1. truncate response after the first occurrence of `stop_token_id`
-                    postprocessed_response = response
-                    if self.stop_token_id is not None:  # handle the edge case when stop_token_id exists but is 0
-                        postprocessed_response = truncate_response(
-                            self.stop_token_id, processing_class.pad_token_id, response
-                        )
-
-                    # Response Processing 2. run reward model on the truncated responses
-                    postprocessed_query_response = torch.cat((query, postprocessed_response), 1)
-                    sequence_length = first_true_indices(postprocessed_response == processing_class.pad_token_id) - 1
-                    unwrapped_value_model = accelerator.unwrap_model(model).value_model
-                    full_value, _, _ = get_reward(
-                        unwrapped_value_model, query_response, processing_class.pad_token_id, context_length
-                    )
-                    value = full_value[:, context_length - 1 : -1].squeeze(-1)
-                    _, score, _ = get_reward(
-                        reward_model, postprocessed_query_response, processing_class.pad_token_id, context_length
-                    )
-
-                    responses.append(response)
-                    postprocessed_responses.append(postprocessed_response)
-                    logprobs.append(logprob)
-                    ref_logprobs.append(ref_logprob)
-                    sequence_lengths.append(sequence_length)
-                    scores.append(score)
-                    values.append(value)
-                responses = torch.cat(responses, 0)
-                postprocessed_responses = torch.cat(postprocessed_responses, 0)
-                logprobs = torch.cat(logprobs, 0)
-                ref_logprobs = torch.cat(ref_logprobs, 0)
-                sequence_lengths = torch.cat(sequence_lengths, 0)
-                scores = torch.cat(scores, 0)
-                values = torch.cat(values, 0)
-                del (logprob, ref_logprob, full_value, value, score, unwrapped_model)
-                empty_cache()
-                gc.collect()
-
-                # Response Processing 3. Filter completion. Ensure that the sample contains stop_token_id
-                # Completions not passing that filter will receive a lower score.
-                contain_eos_token = torch.any(postprocessed_responses == self.processing_class.eos_token_id, dim=-1)
-                if self.args.missing_eos_penalty is not None:
-                    scores[~contain_eos_token] -= self.args.missing_eos_penalty
-                # accelerator.print(f"{scores=}, {(contain_eos_token.sum() / len(contain_eos_token))=}")
-
-                # be very careful with `padding_mask_p1`; see https://excalidraw.com/#json=LWnzG4w2k5DjF_EOL_xPt,e2w3a-hFJ_gX5vOfeyXGTw
-                response_idxs = torch.arange(responses.shape[1], device=responses.device).repeat(responses.shape[0], 1)
-                padding_mask = response_idxs > sequence_lengths.unsqueeze(1)
-                logprobs = torch.masked_fill(logprobs, padding_mask, INVALID_LOGPROB)
-                ref_logprobs = torch.masked_fill(ref_logprobs, padding_mask, INVALID_LOGPROB)
-                sequence_lengths_p1 = sequence_lengths + 1
-                padding_mask_p1 = response_idxs > (sequence_lengths_p1.unsqueeze(1))
-                values = torch.masked_fill(values, padding_mask_p1, 0)
-
-                # 4. compute rewards
-                # Formula used by http://joschu.net/blog/kl-approx.html for the k1 and k3 estimators
-                logr = ref_logprobs - logprobs
-                kl = -logr if args.kl_estimator == "k1" else (logr.exp() - 1) - logr  # Else statement is k3
-                non_score_reward = -args.kl_coef * kl
-                rewards = non_score_reward.clone()
-                actual_start = torch.arange(rewards.size(0), device=rewards.device)
-                actual_end = torch.where(sequence_lengths_p1 < rewards.size(1), sequence_lengths_p1, sequence_lengths)
-                rewards[actual_start, actual_end] += scores
-
-                # 5. whiten rewards
-                if args.whiten_rewards:
-                    rewards = masked_whiten(rewards, mask=~padding_mask_p1, shift_mean=False)
-                    rewards = torch.masked_fill(rewards, padding_mask_p1, 0)
-
-                # 6. compute advantages and returns
-                lastgaelam = 0
-                advantages_reversed = []
-                gen_length = responses.shape[1]
-                for t in reversed(range(gen_length)):
-                    nextvalues = values[:, t + 1] if t < gen_length - 1 else 0.0
-                    delta = rewards[:, t] + args.gamma * nextvalues - values[:, t]
-                    lastgaelam = delta + args.gamma * args.lam * lastgaelam
-                    advantages_reversed.append(lastgaelam)
-                advantages = torch.stack(advantages_reversed[::-1], axis=1)
-                returns = advantages + values
-                advantages = masked_whiten(advantages, ~padding_mask)
-                advantages = torch.masked_fill(advantages, padding_mask, 0)
-                empty_cache()
-
-            # Do multiple epochs of PPO training, with a fresh random shuffle in each epoch
-            for ppo_epoch_idx in range(args.num_ppo_epochs):
-                b_inds = np.random.permutation(args.local_batch_size)
-                minibatch_idx = 0
-                for mini_batch_start in range(0, args.local_batch_size, args.local_mini_batch_size):
-                    mini_batch_end = mini_batch_start + args.local_mini_batch_size
-                    mini_batch_inds = b_inds[mini_batch_start:mini_batch_end]
-                    gradient_accumulation_idx = 0
-                    for micro_batch_start in range(0, args.local_mini_batch_size, args.per_device_train_batch_size):
-                        with accelerator.accumulate(model):
-                            micro_batch_end = micro_batch_start + args.per_device_train_batch_size
-                            micro_batch_inds = mini_batch_inds[micro_batch_start:micro_batch_end]
-                            mb_advantage = advantages[micro_batch_inds]
-                            mb_responses = responses[micro_batch_inds]
-                            mb_query_responses = query_responses[micro_batch_inds]
-                            mb_logprobs = logprobs[micro_batch_inds]
-                            mb_return = returns[micro_batch_inds]
-                            mb_values = values[micro_batch_inds]
-
-                            output, vpred_temp = forward(model, mb_query_responses, processing_class.pad_token_id)
-                            logits = output.logits[:, context_length - 1 : -1]
-                            logits /= args.temperature + 1e-7
-                            new_logprobs = selective_log_softmax(logits, mb_responses)
-                            new_logprobs = torch.masked_fill(
-                                new_logprobs, padding_mask[micro_batch_inds], INVALID_LOGPROB
-                            )
-                            vpred = vpred_temp[:, context_length - 1 : -1].squeeze(-1)
-                            vpred = torch.masked_fill(vpred, padding_mask_p1[micro_batch_inds], 0)
-                            vpredclipped = torch.clamp(
-                                vpred,
-                                mb_values - args.cliprange_value,
-                                mb_values + args.cliprange_value,
-                            )
-                            vf_losses1 = torch.square(vpred - mb_return)
-                            vf_losses2 = torch.square(vpredclipped - mb_return)
-                            vf_loss_max = torch.max(vf_losses1, vf_losses2)
-                            vf_loss = 0.5 * masked_mean(vf_loss_max, ~padding_mask_p1[micro_batch_inds])
-                            vf_clipfrac = masked_mean(
-                                (vf_losses2 > vf_losses1).float(), ~padding_mask_p1[micro_batch_inds]
-                            )
-                            logprobs_diff = new_logprobs - mb_logprobs
-                            ratio = torch.exp(logprobs_diff)
-                            pg_losses = -mb_advantage * ratio
-                            pg_losses2 = -mb_advantage * torch.clamp(ratio, 1.0 - args.cliprange, 1.0 + args.cliprange)
-                            pg_loss_max = torch.max(pg_losses, pg_losses2)
-                            pg_loss = masked_mean(pg_loss_max, ~padding_mask[micro_batch_inds])
-                            loss = pg_loss + args.vf_coef * vf_loss
-                            accelerator.backward(loss)
-                            optimizer.step()
-                            optimizer.zero_grad()
-                            with torch.no_grad():
-                                pg_clipfrac = masked_mean(
-                                    (pg_losses2 > pg_losses).float(), ~padding_mask[micro_batch_inds]
-                                )
-                                prob_dist = torch.nn.functional.softmax(logits, dim=-1)
-                                entropy = torch.logsumexp(logits, dim=-1) - torch.sum(prob_dist * logits, dim=-1)
-                                approxkl = 0.5 * (logprobs_diff**2).mean()
-                                approxkl_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = approxkl
-                                pg_clipfrac_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = (
-                                    pg_clipfrac
-                                )
-                                pg_loss_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = pg_loss
-                                vf_loss_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = vf_loss
-                                vf_clipfrac_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = (
-                                    vf_clipfrac
-                                )
-                                entropy_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = entropy.mean()
-                                ratio_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = ratio.mean()
-                        gradient_accumulation_idx += 1
-                    minibatch_idx += 1
-                    # del everything and empty cache
-                    # fmt: off
-                    del (
-                        output, vpred_temp, logits, new_logprobs, vpred, vpredclipped,
-                        vf_losses1, vf_losses2, vf_loss, vf_clipfrac, logprobs_diff, ratio, pg_losses, pg_losses2, pg_loss_max,
-                        pg_loss, loss, pg_clipfrac, prob_dist, entropy, approxkl, mb_return,
-                        mb_advantage, mb_values, mb_responses, mb_query_responses, mb_logprobs,
-                    )
-                    # fmt: on
-                    empty_cache()
-            with torch.no_grad():
-                mean_kl = kl.sum(1).mean()
-                mean_entropy = (-logprobs).sum(1).mean()
-                mean_non_score_reward = non_score_reward.sum(1).mean()
-                rlhf_reward = mean_non_score_reward + scores.mean()
-                eps = int(self.state.episode / (time.time() - start_time))
-                metrics = {}
-                metrics["eps"] = eps
-                metrics["objective/kl"] = self.accelerator.gather_for_metrics(mean_kl).mean().item()
-                metrics["objective/entropy"] = self.accelerator.gather_for_metrics(mean_entropy).mean().item()
-                metrics["objective/non_score_reward"] = (
-                    self.accelerator.gather_for_metrics(mean_non_score_reward).mean().item()
-                )
-                metrics["objective/rlhf_reward"] = self.accelerator.gather_for_metrics(rlhf_reward).mean().item()
-                metrics["objective/scores"] = self.accelerator.gather_for_metrics(scores.mean()).mean().item()
-                metrics["policy/approxkl_avg"] = self.accelerator.gather_for_metrics(approxkl_stats).mean().item()
-                metrics["policy/clipfrac_avg"] = self.accelerator.gather_for_metrics(pg_clipfrac_stats).mean().item()
-                metrics["loss/policy_avg"] = self.accelerator.gather_for_metrics(pg_loss_stats).mean().item()
-                metrics["loss/value_avg"] = self.accelerator.gather_for_metrics(vf_loss_stats).mean().item()
-                metrics["val/clipfrac_avg"] = self.accelerator.gather_for_metrics(vf_clipfrac_stats).mean().item()
-                metrics["policy/entropy_avg"] = self.accelerator.gather_for_metrics(entropy_stats).mean().item()
-                metrics["val/ratio"] = self.accelerator.gather_for_metrics(ratio_stats).mean().item()
-                metrics["val/ratio_var"] = self.accelerator.gather_for_metrics(ratio_stats).var().item()
-                metrics["val/num_eos_tokens"] = (responses == processing_class.eos_token_id).sum().item()
-                metrics["lr"] = self.lr_scheduler.get_last_lr()[0]
-                metrics["episode"] = self.state.episode
-                self.state.epoch = self.state.episode / self.train_dataset_len  # used by self.log
-                self.state.global_step += 1
-                self.log(metrics)
-
-            self.lr_scheduler.step()
-            self.control = self.callback_handler.on_step_end(args, self.state, self.control)
-            if self.control.should_save:
-                self._save_checkpoint(model, trial=None)
-                self.control = self.callback_handler.on_save(self.args, self.state, self.control)
-            del kl, mean_kl, mean_entropy, mean_non_score_reward, scores, metrics, non_score_reward
-            empty_cache()
-            gc.collect()
-
-            if args.num_sample_generations > 0 and (update - 1) % self.sample_generations_freq == 0:
-                self.generate_completions(sampling=True)
-                empty_cache()
-            del (
-                query_responses,
-                responses,
-                postprocessed_responses,
-                logprobs,
-                ref_logprobs,
-                values,
-                sequence_lengths,
-                contain_eos_token,
-                sequence_lengths_p1,
-                response_idxs,
-                padding_mask,
-                padding_mask_p1,
-                rewards,
-                actual_start,
-                actual_end,
-                advantages,
-                returns,
-            )
-            empty_cache()
-
-        # HF trainer specifics
-        self.control = self.callback_handler.on_train_end(args, self.state, self.control)
-        if self.control.should_save:
-            self._save_checkpoint(model, trial=None)
-            self.control = self.callback_handler.on_save(self.args, self.state, self.control)
-
-    def generate_completions(self, sampling: bool = False):
-        args = self.args
-        processing_class = self.processing_class
-        generation_config = GenerationConfig(
-            max_new_tokens=self.args.response_length,
-            temperature=(0.01 + 1e-7),
-            top_k=0.0,
-            top_p=1.0,
-            do_sample=True,
+@dataclass
+class PPOTrainer(_PPOTrainer):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `PPOTrainer` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.ppo import PPOTrainer`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
         )
-
-        table = defaultdict(list)
-        with unwrap_model_for_generation(
-            self.model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation
-        ) as unwrapped_model:
-            for batch in self.eval_dataloader:
-                query = batch["input_ids"]
-                with torch.no_grad():
-                    context_length = query.shape[1]
-                    query_response, _ = batch_generation(
-                        unwrapped_model.policy,
-                        query,
-                        query.shape[0],
-                        processing_class.pad_token_id,
-                        generation_config,
-                    )
-                    response = query_response[:, context_length:]
-                    postprocessed_response = response
-                    if self.stop_token_id is not None:  # handle the edge case when stop_token_id exists but is 0
-                        postprocessed_response = truncate_response(
-                            self.stop_token_id, processing_class.pad_token_id, response
-                        )
-                    table["query"].extend(
-                        gather_object(processing_class.batch_decode(query, skip_special_tokens=True))
-                    )
-                    table["model response"].extend(
-                        gather_object(processing_class.batch_decode(postprocessed_response))
-                    )
-
-                    postprocessed_query_response = torch.cat((query, postprocessed_response), 1)
-                    _, score, _ = get_reward(
-                        self.reward_model, postprocessed_query_response, processing_class.pad_token_id, context_length
-                    )
-                    table["score"].extend(self.accelerator.gather_for_metrics(score).float().cpu().numpy())
-
-                if sampling:
-                    break
-        df = pd.DataFrame(table)
-
-        if self.accelerator.is_main_process:
-            if is_rich_available():
-                print_rich_table(df.iloc[0 : 0 + 5])
-            if "wandb" in args.report_to:
-                import wandb
-
-                if wandb.run is not None:
-                    wandb.log({"completions": wandb.Table(dataframe=df)})
-
-            if "comet_ml" in args.report_to:
-                log_table_to_comet_experiment(
-                    name="completions.csv",
-                    table=df,
-                )
-
-    # Ensure the model card is saved along with the checkpoint
-    def _save_checkpoint(self, model, trial):
-        if self.args.hub_model_id is None:
-            model_name = Path(self.args.output_dir).name
-        else:
-            model_name = self.args.hub_model_id.split("/")[-1]
-        self.create_model_card(model_name=model_name)
-        super()._save_checkpoint(model, trial)
+        super().__init__(*args, **kwargs)
diff --git a/trl/trainer/prm_config.py b/trl/trainer/prm_config.py
index 4d0da1f931c..3afeb0be751 100644
--- a/trl/trainer/prm_config.py
+++ b/trl/trainer/prm_config.py
@@ -73,6 +73,16 @@ class PRMConfig(TrainingArguments):
             "`fp16` is not set."
         },
     )
+    # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
+    # was fixed in https://github.com/huggingface/transformers/pull/41322, but the fix has not yet been released. We
+    # add a temporary workaround here, which can be removed once the fix is available—likely in Transformers 4.57.2.
+    lr_scheduler_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
+            "restarts."
+        },
+    )
 
     max_length: int | None = field(
         default=1024,
diff --git a/trl/trainer/reward_config.py b/trl/trainer/reward_config.py
index cb2a694b604..91b3bc4308a 100644
--- a/trl/trainer/reward_config.py
+++ b/trl/trainer/reward_config.py
@@ -100,6 +100,16 @@ class may differ from those in [`~transformers.TrainingArguments`].
             "`fp16` is not set."
         },
     )
+    # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
+    # was fixed in https://github.com/huggingface/transformers/pull/41322, but the fix has not yet been released. We
+    # add a temporary workaround here, which can be removed once the fix is available—likely in Transformers 4.57.2.
+    lr_scheduler_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
+            "restarts."
+        },
+    )
 
     # Parameters that control the model
     model_init_kwargs: dict[str, Any] | None = field(
diff --git a/trl/trainer/reward_trainer.py b/trl/trainer/reward_trainer.py
index 60b3f5a2544..5c567331159 100644
--- a/trl/trainer/reward_trainer.py
+++ b/trl/trainer/reward_trainer.py
@@ -282,7 +282,7 @@ def __init__(
         model_init_kwargs = args.model_init_kwargs or {}
         if isinstance(model, str):
             model_id = model
-            dtype = model_init_kwargs.get("dtype")
+            dtype = model_init_kwargs.get("dtype", "auto")
             if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None:
                 pass  # dtype is already a torch.dtype or "auto" or None
             elif isinstance(dtype, str) and dtype in ["bfloat16", "float16", "float32"]:
@@ -292,6 +292,7 @@ def __init__(
                     "Invalid `dtype` passed to `RewardConfig`. Expected either 'auto' or a string representing "
                     f"a valid `torch.dtype` (e.g., 'float32'), but got {dtype}."
                 )
+            model_init_kwargs["device_map"] = model_init_kwargs.get("device_map", "auto")
             with suppress_from_pretrained_warning(transformers.modeling_utils.logger):
                 model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1, **model_init_kwargs)
         else:
diff --git a/trl/trainer/rloo_config.py b/trl/trainer/rloo_config.py
index 678b4a57a36..eb893c604c8 100644
--- a/trl/trainer/rloo_config.py
+++ b/trl/trainer/rloo_config.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 from dataclasses import dataclass, field
 
 from transformers import TrainingArguments
@@ -142,8 +143,8 @@ class RLOOConfig(TrainingArguments):
             `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
             launching the vLLM server via the `--vllm_tensor_parallel_size` flag.
         vllm_enable_sleep_mode (`bool`, *optional*, defaults to `False`):
-            Whether to enable sleep mode for vLLM. If `True`, vLLM will sleep during the optimization step and woken
-            for weight sync and generation.
+            Enable vLLM sleep mode to offload weights/cache during the optimizer step. Keeps GPU memory usage low, but
+            waking the engine adds host–device transfer latency.
 
         > Parameters that control the training
 
@@ -191,9 +192,20 @@ class RLOOConfig(TrainingArguments):
             `trackio`.
         num_completions_to_print (`int`, *optional*):
             Number of completions to print with `rich`. If `None`, all completions are logged.
-        wandb_log_unique_prompts (`bool`, *optional*, defaults to `False`):
-            Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, all prompts
-            are logged.
+        log_unique_prompts (`bool`, *optional*, defaults to `False`):
+            Whether to log unique prompts. If `True`, only unique prompts are logged. If `False`, all prompts are
+            logged.
+
+        > Deprecated arguments
+
+        wandb_log_unique_prompts (`bool`, *optional*):
+
+            <Deprecated version="0.26.0">
+
+            Parameter `wandb_log_unique_prompts` is deprecated and will be removed in version 0.27.0. Use
+            `log_unique_prompts` instead.
+
+            </Deprecated>
     """
 
     _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
@@ -224,6 +236,16 @@ class RLOOConfig(TrainingArguments):
             "`fp16` is not set."
         },
     )
+    # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
+    # was fixed in https://github.com/huggingface/transformers/pull/41322, but the fix has not yet been released. We
+    # add a temporary workaround here, which can be removed once the fix is available—likely in Transformers 4.57.2.
+    lr_scheduler_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
+            "restarts."
+        },
+    )
 
     # Parameters that control the model and reference model
     model_init_kwargs: dict | str | None = field(
@@ -385,8 +407,8 @@ class RLOOConfig(TrainingArguments):
     vllm_enable_sleep_mode: bool = field(
         default=False,
         metadata={
-            "help": "Whether to enable sleep mode for vLLM. If `True`, vLLM will sleep during the optimization step "
-            "and woken for weight sync and generation."
+            "help": "Enable vLLM sleep mode to offload weights/cache during the optimizer step. Keeps GPU memory "
+            "usage low, but waking the engine adds host–device transfer latency."
         },
     )
     vllm_guided_decoding_regex: str | None = field(
@@ -520,14 +542,20 @@ class RLOOConfig(TrainingArguments):
         default=None,
         metadata={"help": "Number of completions to print with `rich`. If `None`, all completions are logged."},
     )
-    wandb_log_unique_prompts: bool | None = field(
+    log_unique_prompts: bool = field(
         default=False,
         metadata={
-            "help": "Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, "
-            "all prompts are logged."
+            "help": "Whether to log unique prompts. If `True`, only unique prompts are logged. If `False`, all "
+            "prompts are logged."
         },
     )
 
+    # Deprecated arguments
+    wandb_log_unique_prompts: bool | None = field(
+        default=None,
+        metadata={"help": "Deprecated, use `log_unique_prompts` instead."},
+    )
+
     def __post_init__(self):
         self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
 
@@ -576,3 +604,12 @@ def __post_init__(self):
                 "RLOO requires at least 2 generations per prompt to calculate the advantages. You provided "
                 f"{self.num_generations}, which is less than the minimum required."
             )
+
+        if self.wandb_log_unique_prompts is not None:
+            warnings.warn(
+                "The `wandb_log_unique_prompts` argument is deprecated and will be removed in version 0.27.0. Please "
+                "use `log_unique_prompts` instead.",
+                FutureWarning,
+                stacklevel=2,
+            )
+            self.log_unique_prompts = self.wandb_log_unique_prompts
diff --git a/trl/trainer/rloo_trainer.py b/trl/trainer/rloo_trainer.py
index 211ccc230f3..ca572caae7e 100644
--- a/trl/trainer/rloo_trainer.py
+++ b/trl/trainer/rloo_trainer.py
@@ -15,6 +15,7 @@
 import inspect
 import os
 import textwrap
+import time
 import warnings
 from collections import defaultdict, deque
 from collections.abc import Callable
@@ -44,6 +45,7 @@
     PreTrainedTokenizerBase,
     ProcessorMixin,
     TrainerCallback,
+    is_bitsandbytes_available,
     is_trackio_available,
     is_wandb_available,
 )
@@ -96,6 +98,9 @@
 if is_trackio_available():
     import trackio
 
+if is_bitsandbytes_available():
+    import bitsandbytes as bnb
+
 
 logger = logging.get_logger(__name__)
 
@@ -256,7 +261,7 @@ def __init__(
         model_init_kwargs = args.model_init_kwargs or {}
         if isinstance(model, str):
             model_id = model
-            dtype = model_init_kwargs.get("dtype")
+            dtype = model_init_kwargs.get("dtype", "auto")
             if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None:
                 pass  # dtype is already a torch.dtype or "auto" or None
             elif isinstance(dtype, str):  # it's a str, but not "auto"
@@ -267,7 +272,7 @@ def __init__(
                     "Invalid `dtype` passed to `RLOOConfig`. Expected either 'auto' or a string representing "
                     f"a `torch.dtype` (e.g., 'float32'), but got {dtype}."
                 )
-            # Disable caching if gradient checkpointing is enabled (not supported)
+            model_init_kwargs["device_map"] = model_init_kwargs.get("device_map", "auto")
             config = AutoConfig.from_pretrained(model_id)
             architecture = getattr(transformers, config.architectures[0])
             model = architecture.from_pretrained(model_id, **model_init_kwargs)
@@ -448,8 +453,9 @@ def __init__(
         # Initialize the metrics
         self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)}
         self._total_train_tokens = 0
+        self._current_train_step_time = 0.0
         self.log_completions = args.log_completions
-        self.wandb_log_unique_prompts = args.wandb_log_unique_prompts
+        self.log_unique_prompts = args.log_unique_prompts
         self.num_completions_to_print = args.num_completions_to_print
         # Keep logs sized to the generation batch to record only outputs from the latest model update.
         self._logs = {
@@ -511,6 +517,14 @@ def __init__(
                     max_model_len = self.max_prompt_length + self.max_completion_length
                 else:
                     max_model_len = None
+                vllm_quantization = None
+                if is_bitsandbytes_available():
+                    for _, module in model.named_modules():
+                        if isinstance(module, bnb.nn.Linear4bit):
+                            vllm_quantization = "bitsandbytes"
+                            break
+                        elif isinstance(module, bnb.nn.Linear8bitLt):
+                            raise ValueError("vLLM does not support in-flight 8-bit quantization.")
                 self.llm = LLM(
                     model=model.name_or_path,
                     tensor_parallel_size=args.vllm_tensor_parallel_size,
@@ -526,6 +540,7 @@ def __init__(
                     max_num_batched_tokens=4096,
                     model_impl=self.args.vllm_model_impl,
                     enable_sleep_mode=self.args.vllm_enable_sleep_mode,
+                    quantization=vllm_quantization,
                 )
                 if self.args.vllm_enable_sleep_mode:
                     self.llm.sleep(level=2)
@@ -878,6 +893,17 @@ def _move_model_to_vllm(self):
         elif self.vllm_mode == "colocate":
             self.llm.reset_prefix_cache()
 
+    def training_step(self, model, inputs, num_items_in_batch):
+        time_before = time.perf_counter()
+        output = super().training_step(model, inputs, num_items_in_batch)
+        self._step += 1
+        time_after = time.perf_counter()
+        self._current_train_step_time += time_after - time_before
+        if self._step % self.current_gradient_accumulation_steps == 0:
+            self._metrics["train"]["step_time"].append(self._current_train_step_time)
+            self._current_train_step_time = 0.0
+        return output
+
     @profiling_decorator
     def _prepare_inputs(self, generation_batch: dict[str, torch.Tensor | Any]) -> dict[str, torch.Tensor | Any]:
         # Prepares inputs for model training/evaluation by managing completion generation and batch handling.
@@ -1342,6 +1368,9 @@ def _generate_and_score_completions(
             completions = []
             for prompt, completion in zip(prompts, completions_text, strict=True):
                 bootstrap = prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
+                if isinstance(bootstrap, list):  # for VLM, the format might be [{"type": "text", "text": "..."}]
+                    assert len(bootstrap) == 1 and bootstrap[0]["type"] == "text"
+                    bootstrap = bootstrap[0]["text"]
                 completions.append([{"role": "assistant", "content": bootstrap + completion}])
         else:
             completions = completions_text
@@ -1531,6 +1560,12 @@ def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
                     self.num_completions_to_print,
                 )
 
+            logging_backends = []
+            if self.args.report_to and "wandb" in self.args.report_to and wandb.run is not None:
+                logging_backends.append(wandb)
+            if self.args.report_to and "trackio" in self.args.report_to:
+                logging_backends.append(trackio)
+
             table = {
                 "step": [str(self.state.global_step)] * len(self._logs["prompt"]),
                 "prompt": self._logs["prompt"],
@@ -1542,34 +1577,23 @@ def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
             df_base = pd.DataFrame(table)
             images_raw = self._logs["images"] or []
 
-            for logging_backend in self.args.report_to:
-                if logging_backend == "wandb":
-                    if images_raw:
-                        images = []
-                        for image_list in self._logs["images"]:
-                            images.append([wandb.Image(image) for image in image_list])
-                        df = pd.concat([df_base, pd.Series(images, name="image")], axis=1, copy=False)
-                    else:
-                        df = df_base
-
-                    if self.wandb_log_unique_prompts:
-                        df = df.drop_duplicates(subset=["prompt"])
-
-                    wandb.log({"completions": wandb.Table(dataframe=df)})
-
-                if logging_backend == "trackio":
-                    if images_raw:
-                        # TODO: Implement once supported upstream https://github.com/gradio-app/trackio/issues/334
-                        logger.info("Skipping image logging for Trackio")
-                        df = df_base
-                        # images = []
-                        # for image_list in self._logs["images"]:
-                        #     images.append([trackio.Image(image) for image in image_list])
-                        # df = pd.concat([df_base, pd.Series(images, name="image")], axis=1, copy=False)
-                    else:
-                        df = df_base
+            for logging_backend in logging_backends:
+                if images_raw:
+                    images = []
+                    for image_list in self._logs["images"]:
+                        images.append([logging_backend.Image(image) for image in image_list])
+                    df = pd.concat(
+                        [df_base, pd.Series(images, name="image")],
+                        axis=1,
+                        copy=False,
+                    )
+                else:
+                    df = df_base
+
+                if self.log_unique_prompts:
+                    df = df.drop_duplicates(subset=["prompt"])
 
-                    trackio.log({"completions": trackio.Table(dataframe=df)})
+                logging_backend.log({"completions": logging_backend.Table(dataframe=df)})
 
     # Ensure the model card is saved along with the checkpoint
     def _save_checkpoint(self, model, trial):
diff --git a/trl/trainer/sft_config.py b/trl/trainer/sft_config.py
index c43a7a51bf7..6345d2a4bd7 100644
--- a/trl/trainer/sft_config.py
+++ b/trl/trainer/sft_config.py
@@ -127,6 +127,16 @@ class SFTConfig(TrainingArguments):
             "`fp16` is not set."
         },
     )
+    # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
+    # was fixed in https://github.com/huggingface/transformers/pull/41322, but the fix has not yet been released. We
+    # add a temporary workaround here, which can be removed once the fix is available—likely in Transformers 4.57.2.
+    lr_scheduler_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
+            "restarts."
+        },
+    )
 
     # Parameters that control the model
     model_init_kwargs: dict[str, Any] | None = field(
diff --git a/trl/trainer/sft_trainer.py b/trl/trainer/sft_trainer.py
index f69a5f3eddb..f446efdc63c 100644
--- a/trl/trainer/sft_trainer.py
+++ b/trl/trainer/sft_trainer.py
@@ -72,9 +72,9 @@
 FLASH_ATTENTION_VARIANTS = {
     "flash_attention_2",
     "flash_attention_3",
-    "kernels-community/flash-attn",
-    "kernels-community/vllm-flash-attn3",
+    "kernels-community/flash-attn2",
     "kernels-community/flash-attn3",
+    "kernels-community/vllm-flash-attn3",
 }
 
 
diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py
index edeb50c0323..02d4cc78073 100644
--- a/trl/trainer/utils.py
+++ b/trl/trainer/utils.py
@@ -662,7 +662,7 @@ def get_quantization_config(model_args: ModelConfig) -> BitsAndBytesConfig | Non
             bnb_4bit_compute_dtype=model_args.dtype,  # For consistency with model weights, we use the same value as `dtype`
             bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
             bnb_4bit_use_double_quant=model_args.use_bnb_nested_quant,
-            bnb_4bit_quant_storage=model_args.dtype,
+            bnb_4bit_quant_storage=model_args.bnb_4bit_quant_storage,
         )
     elif model_args.load_in_8bit:
         quantization_config = BitsAndBytesConfig(
@@ -847,6 +847,16 @@ class may differ from those in [`~transformers.TrainingArguments`].
             "`fp16` is not set."
         },
     )
+    # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
+    # was fixed in https://github.com/huggingface/transformers/pull/41322, but the fix has not yet been released. We
+    # add a temporary workaround here, which can be removed once the fix is available—likely in Transformers 4.57.2.
+    lr_scheduler_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
+            "restarts."
+        },
+    )
 
     run_name: str | None = field(
         default=None,
@@ -1986,7 +1996,7 @@ def create_model_from_path(model_id: str, **kwargs) -> PreTrainedModel:
         [`~transformers.PreTrainedModel`]:
             The instantiated model.
     """
-    dtype = kwargs.get("dtype")
+    dtype = kwargs.get("dtype", "auto")
     if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None:
         pass  # dtype is already a torch.dtype or "auto" or None
     elif isinstance(dtype, str) and dtype in ["bfloat16", "float16", "float32"]:
@@ -1996,6 +2006,7 @@ def create_model_from_path(model_id: str, **kwargs) -> PreTrainedModel:
             "Invalid `dtype` passed to the config. Expected either 'auto' or a string representing "
             f"a valid `torch.dtype` (e.g., 'float32'), but got {dtype}."
         )
+    kwargs["device_map"] = kwargs.get("device_map", "auto")
     config = AutoConfig.from_pretrained(model_id)
     architecture = getattr(transformers, config.architectures[0])
     model = architecture.from_pretrained(model_id, **kwargs)
diff --git a/trl/trainer/xpo_config.py b/trl/trainer/xpo_config.py
index 78c1e562128..a05cecbfe37 100644
--- a/trl/trainer/xpo_config.py
+++ b/trl/trainer/xpo_config.py
@@ -12,33 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass, field
+import warnings
 
-from trl.trainer.online_dpo_config import OnlineDPOConfig
+from ..experimental.xpo import XPOConfig as _XPOConfig
 
 
-@dataclass
-class XPOConfig(OnlineDPOConfig):
-    r"""
-    Configuration class for the [`XPOTrainer`].
-
-    Subclass of [`OnlineDPOConfig`] we can use all its arguments and add the following:
-
-    Parameters:
-        alpha (`float` or `list[float]`, *optional*, defaults to `1e-5`):
-            Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each new epoch
-            and the last alpha is used for the rest of the epochs.
-    """
-
-    alpha: list[float] = field(
-        default_factory=lambda: [1e-5],
-        metadata={
-            "help": "Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each "
-            "new epoch and the last alpha is used for the rest of the epochs."
-        },
-    )
-
+class XPOConfig(_XPOConfig):
     def __post_init__(self):
+        warnings.warn(
+            "The `XPOConfig` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.xco import XPOConfig`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
+        )
         super().__post_init__()
-        if hasattr(self.alpha, "__len__") and len(self.alpha) == 1:
-            self.alpha = self.alpha[0]
diff --git a/trl/trainer/xpo_trainer.py b/trl/trainer/xpo_trainer.py
index 5646885e791..c537f886d41 100644
--- a/trl/trainer/xpo_trainer.py
+++ b/trl/trainer/xpo_trainer.py
@@ -12,527 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import textwrap
-from collections.abc import Callable
-from typing import Any
+import warnings
 
-import jinja2
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from datasets import Dataset, IterableDataset
-from transformers import (
-    BaseImageProcessor,
-    FeatureExtractionMixin,
-    PreTrainedModel,
-    PreTrainedTokenizerBase,
-    ProcessorMixin,
-    TrainerCallback,
-)
-from transformers.trainer_utils import EvalPrediction
-from transformers.training_args import OptimizerNames
-from transformers.utils import is_peft_available
+from ..experimental.xpo import XPOTrainer as _XPOTrainer
 
-from ..data_utils import is_conversational, maybe_apply_chat_template
-from ..models.utils import unwrap_model_for_generation
-from .judges import BasePairwiseJudge
-from .online_dpo_trainer import OnlineDPOTrainer
-from .utils import (
-    SIMPLE_CHAT_TEMPLATE,
-    empty_cache,
-    get_reward,
-    selective_log_softmax,
-    truncate_right,
-)
-from .xpo_config import XPOConfig
 
-
-if is_peft_available():
-    from peft import PeftModel
-
-
-class XPOTrainer(OnlineDPOTrainer):
-    """
-    Trainer for Exploratory Preference Optimization (XPO).
-
-    It is implemented as a subclass of [`OnlineDPOTrainer`].
-
-    Args:
-        model ([`~transformers.PreTrainedModel`]):
-            The model to train, preferably an `AutoModelForCausalLM`.
-        ref_model ([`PreTrainedModelWrapper`]):
-            Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
-            and loss. If no reference model is provided, the trainer will create a reference model with the same
-            architecture as the model to be optimized.
-        reward_funcs ([`~transformers.PreTrainedModel`]):
-            The reward model to score completions with, preferably an
-            [`~transformers.AutoModelForSequenceClassification`].
-        judge ([`BasePairwiseJudge`]):
-            The judge to use for pairwise comparison of model completions.
-        args ([`XPOConfig`]):
-            The XPO config arguments to use for training.
-        data_collator ([`~transformers.DataCollator`]):
-            The data collator to use for training. If None is specified, the default data collator
-            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
-            sequences in the batch, given a dataset of paired sequences.
-        train_dataset ([`~datasets.Dataset`]):
-            The dataset to use for training.
-        eval_dataset ([`~datasets.Dataset`]):
-            The dataset to use for evaluation.
-        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
-            Processing class used to process the data. If provided, will be used to automatically process the inputs
-            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
-            reuse the fine-tuned model.
-        peft_config (`dict`):
-            The peft config to use for training.
-        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
-            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
-            metric values.
-        callbacks (`list[transformers.TrainerCallback]`):
-            The callbacks to use for training.
-        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
-            The optimizer and scheduler to use for training.
-        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
-            The function to use to preprocess the logits before computing the metrics.
-    """
-
-    _tag_names = ["trl", "xpo"]
-    _name = "XPO"
-    _paper = {
-        "title": "Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF",
-        "id": "2405.21046",
-        # docstyle-ignore
-        "citation": textwrap.dedent("""\
-            @article{jung2024binary,
-                title        = {{Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF}},
-                author       = {Tengyang Xie and Dylan J. Foster and Akshay Krishnamurthy and Corby Rosset and Ahmed Awadallah and Alexander Rakhlin},
-                year         = 2024,
-                eprint       = {arXiv:2405.21046}
-            }"""),
-    }
-
-    def __init__(
-        self,
-        model: PreTrainedModel | nn.Module = None,
-        ref_model: PreTrainedModel | nn.Module = None,
-        reward_funcs: nn.Module | None = None,
-        judge: BasePairwiseJudge | None = None,
-        args: XPOConfig | None = None,
-        data_collator: Callable | None = None,
-        train_dataset: Dataset | IterableDataset | None = None,
-        eval_dataset: Dataset | dict[str, Dataset] | None = None,
-        processing_class: PreTrainedTokenizerBase
-        | BaseImageProcessor
-        | FeatureExtractionMixin
-        | ProcessorMixin
-        | None = None,
-        reward_processing_classes: PreTrainedTokenizerBase | list[PreTrainedTokenizerBase] | None = None,
-        peft_config: dict | None = None,
-        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
-        callbacks: list[TrainerCallback] | None = None,
-        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
-        preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
-    ) -> None:
-        super().__init__(
-            model=model,
-            ref_model=ref_model,
-            judge=judge,
-            reward_funcs=reward_funcs,
-            args=args,
-            data_collator=data_collator,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            processing_class=processing_class,
-            reward_processing_classes=reward_processing_classes,
-            peft_config=peft_config,
-            compute_metrics=compute_metrics,
-            callbacks=callbacks,
-            optimizers=optimizers,
-            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
-        )
-
-        self._alpha = self.args.alpha
-
-        # Overwrite the stats dictionary to include XPO specific statistics
-        self.stats = {
-            # Remove "non_score_reward", "rlhf_reward", "scores"
-            # Add "loss/dpo", "loss/xpo"
-            "loss/dpo": [],
-            "loss/xpo": [],
-            "objective/kl": [],
-            "objective/entropy": [],
-            "rewards/chosen": [],
-            "rewards/rejected": [],
-            "rewards/accuracies": [],
-            "rewards/margins": [],
-            "logps/chosen": [],
-            "logps/rejected": [],
-            # Replace "contain_eos_token" by "model_contain_eos_token" and "ref_contain_eos_token"
-            "val/model_contain_eos_token": [],
-            "val/ref_contain_eos_token": [],
-            "alpha": [],
-            "beta": [],
-        }
-        if self.reward_funcs is not None:
-            if len(self.reward_funcs) != 1:
-                raise ValueError("XPOTrainer only supports one reward function/model.")
-            self.reward_funcs = self.reward_funcs[0]
-            self.stats["objective/model_scores"] = []
-            self.stats["objective/ref_scores"] = []
-            self.stats["objective/scores_margin"] = []
-
-    @property
-    def alpha(self):
-        if isinstance(self._alpha, list):
-            epoch = self.state.epoch
-            return self._alpha[epoch] if epoch < len(self._alpha) else self._alpha[-1]
-        else:
-            return self._alpha
-
-    def _generate_completions(self, prompts, model):
-        with unwrap_model_for_generation(model, self.accelerator) as unwrapped_policy_model_for_gen:
-            model_output = unwrapped_policy_model_for_gen.generate(
-                input_ids=prompts["input_ids"],
-                attention_mask=prompts["attention_mask"],
-                generation_config=self.generation_config,
-            )
-
-        actual_model_for_ref_generation: torch.nn.Module
-        if self.ref_model is None:
-            unwrapped_main_model_for_ref_logic = self.accelerator.unwrap_model(model)
-
-            if is_peft_available() and isinstance(unwrapped_main_model_for_ref_logic, PeftModel):
-                actual_model_for_ref_generation = unwrapped_main_model_for_ref_logic.get_base_model()
-            else:
-                actual_model_for_ref_generation = unwrapped_main_model_for_ref_logic
-        else:
-            actual_model_for_ref_generation = self.accelerator.unwrap_model(self.ref_model)
-
-        with unwrap_model_for_generation(actual_model_for_ref_generation, self.accelerator) as final_ref_model_for_gen:
-            ref_output = final_ref_model_for_gen.generate(
-                input_ids=prompts["input_ids"],
-                attention_mask=prompts["attention_mask"],
-                generation_config=self.generation_config,
-            )
-
-        return model_output, ref_output
-
-    def _process_completions(self, model_output, ref_output, prompts):
-        context_length = prompts["input_ids"].shape[1]
-
-        # Process model completions
-        model_completion_ids = model_output[:, context_length:]
-        model_completion_ids, model_completion_mask = truncate_right(
-            model_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id
-        )
-        model_data = {
-            "input_ids": torch.cat((prompts["input_ids"], model_completion_ids), dim=1),
-            "attention_mask": torch.cat((prompts["attention_mask"], model_completion_mask), dim=1),
-            "raw": prompts["raw"],
-        }
-
-        # Process reference model completions
-        ref_completion_ids = ref_output[:, context_length:]
-        ref_completion_ids, ref_completion_mask = truncate_right(
-            ref_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id
-        )
-        ref_data = {
-            "input_ids": torch.cat((prompts["input_ids"], ref_completion_ids), dim=1),
-            "attention_mask": torch.cat((prompts["attention_mask"], ref_completion_mask), dim=1),
-            "raw": prompts["raw"],
-        }
-
-        return model_data, ref_data
-
-    def _compute_rewards(self, model_data, ref_data, context_length):
-        with torch.no_grad():
-            _, model_scores, _ = get_reward(
-                self.reward_funcs, model_data["input_ids"], self.processing_class.pad_token_id, context_length
-            )
-            _, ref_scores, _ = get_reward(
-                self.reward_funcs, ref_data["input_ids"], self.processing_class.pad_token_id, context_length
-            )
-
-        # Apply EOS penalty if needed
-        if self.args.missing_eos_penalty is not None:
-            model_contain_eos = torch.any(model_data["input_ids"] == self.processing_class.eos_token_id, dim=-1)
-            ref_contain_eos = torch.any(ref_data["input_ids"] == self.processing_class.eos_token_id, dim=-1)
-            model_scores[~model_contain_eos] -= self.args.missing_eos_penalty
-            ref_scores[~ref_contain_eos] -= self.args.missing_eos_penalty
-
-        return model_scores, ref_scores
-
-    def _compute_judge(self, model_data, ref_data, context_length):
-        prompts = model_data["raw"]
-        model_data_completions = self.processing_class.batch_decode(
-            model_data["input_ids"][:, context_length:], skip_special_tokens=True
-        )
-        model_data_completions = [completion.strip() for completion in model_data_completions]
-
-        ref_data_completions = self.processing_class.batch_decode(
-            ref_data["input_ids"][:, context_length:], skip_special_tokens=True
-        )
-        ref_data_completions = [completion.strip() for completion in ref_data_completions]
-
-        if is_conversational({"prompt": prompts[0]}):
-            model_data_completions = [
-                [{"role": "assistant", "content": completion}] for completion in model_data_completions
-            ]
-            environment = jinja2.Environment()
-            template = environment.from_string(SIMPLE_CHAT_TEMPLATE)
-            prompts = [template.render(messages=message) for message in prompts]
-            model_data_completions = [template.render(messages=completion) for completion in model_data_completions]
-
-            ref_data_completions = [
-                [{"role": "assistant", "content": completion}] for completion in ref_data_completions
-            ]
-            ref_data_completions = [template.render(messages=completion) for completion in ref_data_completions]
-
-        ranks_of_first_completion = self.judge.judge(
-            prompts,
-            list(zip(model_data_completions, ref_data_completions, strict=True)),
-        )
-        # convert ranks to a True/False mask:
-        # when rank == 0, it means the first completion is the best
-        # when rank == 1, it means the second completion is the best
-        return torch.tensor([rank == 0 for rank in ranks_of_first_completion], device=model_data["input_ids"].device)
-
-    def _compute_logprobs(self, model, model_data, ref_data, context_length):
-        def compute_logprobs_for_data(m, data):
-            output = m(data["input_ids"], attention_mask=data["attention_mask"])
-            logits = output.logits[:, context_length - 1 : -1]
-            token_logprobs = selective_log_softmax(logits, data["input_ids"][:, context_length:])
-            return token_logprobs
-
-        # Compute logprobs for model completions
-        model_logprobs_model_data = compute_logprobs_for_data(model, model_data)
-        # Compute logprobs for model on reference completions (for XPO loss)
-        model_logprobs_ref_data = compute_logprobs_for_data(model, ref_data)
-
-        # Compute logprobs for reference model completions
-        with torch.no_grad():
-            if self.ref_model is None:
-                with model.disable_adapter():
-                    ref_logprobs_model_data = compute_logprobs_for_data(model, model_data)
-                    ref_logprobs_ref_data = compute_logprobs_for_data(model, ref_data)
-            else:
-                ref_logprobs_model_data = compute_logprobs_for_data(self.ref_model, model_data)
-                ref_logprobs_ref_data = compute_logprobs_for_data(self.ref_model, ref_data)
-
-        # Mask padding tokens
-        model_padding_mask = model_data["attention_mask"][:, context_length:] == 0
-        ref_padding_mask = ref_data["attention_mask"][:, context_length:] == 0
-        model_logprobs_model_data = model_logprobs_model_data.masked_fill(model_padding_mask, 0.0)
-        model_logprobs_ref_data = model_logprobs_ref_data.masked_fill(ref_padding_mask, 0.0)
-        ref_logprobs_ref_data = ref_logprobs_ref_data.masked_fill(ref_padding_mask, 0.0)
-        ref_logprobs_model_data = ref_logprobs_model_data.masked_fill(model_padding_mask, 0.0)
-
-        return model_logprobs_model_data, model_logprobs_ref_data, ref_logprobs_ref_data, ref_logprobs_model_data
-
-    def _compute_losses(
-        self,
-        model_logprobs_model_data,
-        model_logprobs_ref_data,
-        ref_logprobs_ref_data,
-        ref_logprobs_model_data,
-        chosen_mask,
-    ):
-        # Compute log probs
-        model_logprobs_model_data_sum = model_logprobs_model_data.sum(1)
-        model_logprobs_ref_data_sum = model_logprobs_ref_data.sum(1)
-        ref_logprobs_ref_data_sum = ref_logprobs_ref_data.sum(1)
-        ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1)
-
-        chosen_model_logprobs = torch.where(chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum)
-        chosen_ref_logprobs = torch.where(chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum)
-        chosen_log_ratios = chosen_model_logprobs - chosen_ref_logprobs
-
-        rejected_model_logprobs = torch.where(~chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum)
-        rejected_ref_logprobs = torch.where(~chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum)
-        rejected_log_ratios = rejected_model_logprobs - rejected_ref_logprobs
-
-        # Compute logits as the difference between chosen and rejected log ratios
-        logits = chosen_log_ratios - rejected_log_ratios
-
-        if self.args.loss_type == "sigmoid":
-            dpo_losses = -F.logsigmoid(self.beta * logits)
-        elif self.args.loss_type == "ipo":
-            dpo_losses = (logits - 1 / (2 * self.beta)) ** 2
-        else:
-            raise NotImplementedError(f"invalid loss type {self.args.loss_type}")
-
-        # Compute XPO specific loss
-        xpo_losses = self.alpha * model_logprobs_ref_data_sum
-
-        # Total loss
-        loss = (dpo_losses + xpo_losses).mean()
-
-        return loss, dpo_losses, xpo_losses
-
-    def _log_statistics(
-        self,
-        model_data,
-        ref_data,
-        model_logprobs_model_data,
-        model_logprobs_ref_data,
-        ref_logprobs_ref_data,
-        ref_logprobs_model_data,
-        chosen_mask,
-        dpo_losses,
-        xpo_losses,
-        context_length,
-        model_scores=None,
-        ref_scores=None,
-    ):
-        # Helper function to gather and compute mean
-        def gather_mean(tensor):
-            return self.accelerator.gather_for_metrics(tensor).mean().item()
-
-        # Log losses
-        self.stats["loss/dpo"].append(gather_mean(dpo_losses))
-        self.stats["loss/xpo"].append(gather_mean(xpo_losses))
-
-        # Log scores
-        if self.reward_funcs is not None:
-            self.stats["objective/model_scores"].append(gather_mean(model_scores))
-            self.stats["objective/ref_scores"].append(gather_mean(ref_scores))
-            self.stats["objective/scores_margin"].append(gather_mean(model_scores - ref_scores))
-
-        # Log logprobs
-        model_logprobs_model_data_sum = model_logprobs_model_data.sum(1)
-        model_logprobs_ref_data_sum = model_logprobs_ref_data.sum(1)
-        ref_logprobs_ref_data_sum = ref_logprobs_ref_data.sum(1)
-        ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1)
-
-        chosen_model_logprobs = torch.where(chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum)
-        chosen_ref_logprobs = torch.where(chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum)
-        chosen_log_ratios = chosen_model_logprobs - chosen_ref_logprobs
-
-        rejected_model_logprobs = torch.where(~chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum)
-        rejected_ref_logprobs = torch.where(~chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum)
-        rejected_log_ratios = rejected_model_logprobs - rejected_ref_logprobs
-
-        self.stats["logps/chosen"].append(gather_mean(chosen_model_logprobs.mean() + chosen_ref_logprobs.mean()))
-        self.stats["logps/rejected"].append(gather_mean(rejected_model_logprobs.mean() + rejected_ref_logprobs.mean()))
-
-        # Log rewards
-        # Compute various statistics
-        chosen_rewards = chosen_log_ratios * self.beta
-        rejected_rewards = rejected_log_ratios * self.beta
-        self.stats["rewards/chosen"].append(gather_mean(chosen_rewards.mean()))
-        self.stats["rewards/rejected"].append(gather_mean(rejected_rewards.mean()))
-
-        # Calculate KL divergence for model and ref data
-        kl_model_data = model_logprobs_model_data - ref_logprobs_model_data
-        kl_ref_data = model_logprobs_ref_data - ref_logprobs_ref_data
-        mean_kl = (kl_model_data.sum(1) + kl_ref_data.sum(1)).mean() / 2
-        self.stats["objective/kl"].append(gather_mean(mean_kl))
-
-        # Calculate entropy for model and ref data
-        entropy_model_data = -model_logprobs_model_data.sum(1)
-        entropy_ref_data = -model_logprobs_ref_data.sum(1)
-        mean_entropy = (entropy_model_data.mean() + entropy_ref_data.mean()) / 2
-        self.stats["objective/entropy"].append(gather_mean(mean_entropy))
-
-        # Calculate margins
-        margin = chosen_rewards - rejected_rewards
-        self.stats["rewards/margins"].append(gather_mean(margin.mean()))
-
-        # Calculate accuracy
-        accuracy = (margin > 0).float()
-        self.stats["rewards/accuracies"].append(gather_mean(accuracy.mean()))
-
-        # Log EOS token statistics
-        model_eos = (model_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1)
-        ref_eos = (ref_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1)
-        self.stats["val/model_contain_eos_token"].append(gather_mean(model_eos.float()))
-        self.stats["val/ref_contain_eos_token"].append(gather_mean(ref_eos.float()))
-
-        # Log alpha and beta
-        self.stats["alpha"].append(self.alpha)
-        self.stats["beta"].append(self.beta)
-
-    def training_step(
-        self, model: nn.Module, inputs: dict[str, torch.Tensor | Any], num_items_in_batch: int | None = None
-    ) -> torch.Tensor:
-        model.train()
-
-        # Apply chat template and tokenize the input
-        batch_size = len(next(iter(inputs.values())))
-        prompts = inputs["prompt"]
-        inputs = [{k: v[i] for k, v in inputs.items()} for i in range(batch_size)]
-        inputs = [maybe_apply_chat_template(x, self.processing_class) for x in inputs]
-        inputs = [self.tokenize_row(x, self.model.config.is_encoder_decoder, self.processing_class) for x in inputs]
-        inputs = self.data_collator(inputs)
-
-        # need the prompt_ only
-        inputs = self._prepare_inputs(inputs)
-        context_length = inputs["prompt_input_ids"].shape[1]
-        prompts = {
-            "input_ids": inputs["prompt_input_ids"],
-            "attention_mask": inputs["prompt_attention_mask"],
-            "raw": prompts,
-        }
-        del inputs
-
-        # Sample completions from both the model and the reference model
-        model_output, ref_output = self._generate_completions(prompts, model)
-
-        # Process model completions
-        model_data, ref_data = self._process_completions(model_output, ref_output, prompts)
-
-        # Compute rewards
-        if self.reward_funcs is not None:
-            model_scores, ref_scores = self._compute_rewards(model_data, ref_data, context_length)
-            chosen_mask = model_scores >= ref_scores
-        else:
-            model_scores, ref_scores = None, None
-            chosen_mask = self._compute_judge(model_data, ref_data, context_length)
-
-        # Compute logprobs
-        model_logprobs_model_data, model_logprobs_ref_data, ref_logprobs_ref_data, ref_logprobs_model_data = (
-            self._compute_logprobs(model, model_data, ref_data, context_length)
+class XPOTrainer(_XPOTrainer):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `XPOTrainer` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.xpo import XPOTrainer`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
         )
-
-        # Compute loss
-        loss, dpo_losses, xpo_losses = self._compute_losses(
-            model_logprobs_model_data,
-            model_logprobs_ref_data,
-            ref_logprobs_ref_data,
-            ref_logprobs_model_data,
-            chosen_mask,
-        )
-
-        # Log everything
-        self._log_statistics(
-            model_data,
-            ref_data,
-            model_logprobs_model_data.detach(),
-            model_logprobs_ref_data.detach(),
-            ref_logprobs_ref_data,
-            ref_logprobs_model_data,
-            chosen_mask,
-            dpo_losses.detach(),
-            xpo_losses.detach(),
-            context_length,
-            model_scores,
-            ref_scores,
-        )
-
-        if (
-            self.args.torch_empty_cache_steps is not None
-            and self.state.global_step % self.args.torch_empty_cache_steps == 0
-        ):
-            empty_cache()
-
-        kwargs = {}
-        # For LOMO optimizers you need to explicitly use the learning rate
-        if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
-            kwargs["learning_rate"] = self._get_learning_rate()
-
-        if self.args.n_gpu > 1:
-            loss = loss.mean()  # mean() to average on multi-gpu parallel training
-
-        self.accelerator.backward(loss, **kwargs)
-
-        return loss.detach() / self.args.gradient_accumulation_steps
+        super().__init__(*args, **kwargs)

From dbf140d0a51ddb7e1176f0a4a22cbeb00f644bf3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <gallouedec.quentin@gmail.com>
Date: Fri, 21 Nov 2025 05:43:39 +0000
Subject: [PATCH 7/8] style

---
 trl/experimental/orpo/orpo_config.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/trl/experimental/orpo/orpo_config.py b/trl/experimental/orpo/orpo_config.py
index 523beeab934..1ec70c838da 100644
--- a/trl/experimental/orpo/orpo_config.py
+++ b/trl/experimental/orpo/orpo_config.py
@@ -93,6 +93,16 @@ class ORPOConfig(TrainingArguments):
             "`fp16` is not set."
         },
     )
+    # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
+    # was fixed in https://github.com/huggingface/transformers/pull/41322, but the fix has not yet been released. We
+    # add a temporary workaround here, which can be removed once the fix is available—likely in Transformers 4.57.2.
+    lr_scheduler_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
+            "restarts."
+        },
+    )
 
     max_length: int | None = field(
         default=1024,
@@ -162,8 +172,3 @@ class ORPOConfig(TrainingArguments):
         default=None,
         metadata={"help": "Number of processes to use for processing the dataset."},
     )
-
-    def __post_init__(self):
-        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
-
-        super().__post_init__()

From b13c01854df2934a2e8fc35b5345dec3fd424685 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <gallouedec.quentin@gmail.com>
Date: Fri, 21 Nov 2025 05:55:28 +0000
Subject: [PATCH 8/8] finish pr

---
 docs/source/_toctree.yml              |  4 ----
 docs/source/paper_index.md            | 24 ++++++++++++++++++++++++
 trl/experimental/orpo/orpo_config.py  |  7 ++++++-
 trl/experimental/orpo/orpo_trainer.py |  5 +++--
 4 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 6fd438151ab..7a6f7ee497c 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -123,8 +123,4 @@
     title: XPO
   - local: openenv
     title: OpenEnv Integration
-  - local: orpo_trainer
-    title: ORPO
-  - local: papo_trainer
-    title: PAPO
   title: Experimental
\ No newline at end of file
diff --git a/docs/source/paper_index.md b/docs/source/paper_index.md
index bdc41263013..662c2f034f9 100644
--- a/docs/source/paper_index.md
+++ b/docs/source/paper_index.md
@@ -697,3 +697,27 @@ trainer.train()
 ```
 
 For more details, see the [MiniLLM Trainer documentation](minillm) documentation.
+
+## Distributed Training
+
+### ZeRO: Memory Optimizations Toward Training Trillion Parameter Models
+
+**📜 Paper**: https://huggingface.co/papers/1910.02054
+
+ZeRO (Zero Redundancy Optimizer) eliminates memory redundancies in data- and model-parallel training by partitioning optimizer states, gradients, and parameters across devices while retaining low communication volume and high computational granularity. This allows for the efficient training of large models that would otherwise not fit in GPU memory.
+
+TRL supports ZeRO via the [DeepSpeed integration](deepspeed_integration). To use it, provide a DeepSpeed configuration file with your desired settings,
+
+```yaml
+# config.yaml
+distributed_type: DEEPSPEED
+num_processes: 2
+deepspeed_config:
+  zero_stage: 3
+```
+
+and launch the training script using `accelerate launch --config_file config_file`.
+
+```sh
+accelerate launch --config_file config.yaml train.py
+```
diff --git a/trl/experimental/orpo/orpo_config.py b/trl/experimental/orpo/orpo_config.py
index 1ec70c838da..1db0cf94086 100644
--- a/trl/experimental/orpo/orpo_config.py
+++ b/trl/experimental/orpo/orpo_config.py
@@ -21,7 +21,7 @@
 @dataclass
 class ORPOConfig(TrainingArguments):
     r"""
-    Configuration class for the [`ORPOTrainer`].
+    Configuration class for the [`experimental.orpo.ORPOTrainer`].
 
     This class includes only the parameters that are specific to ORPO training. For a full list of training arguments,
     please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
@@ -172,3 +172,8 @@ class ORPOConfig(TrainingArguments):
         default=None,
         metadata={"help": "Number of processes to use for processing the dataset."},
     )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+
+        super().__post_init__()
diff --git a/trl/experimental/orpo/orpo_trainer.py b/trl/experimental/orpo/orpo_trainer.py
index 29af144bb57..9e3b1987eba 100644
--- a/trl/experimental/orpo/orpo_trainer.py
+++ b/trl/experimental/orpo/orpo_trainer.py
@@ -82,7 +82,7 @@ class ORPOTrainer(BaseTrainer):
     Args:
         model ([`~transformers.PreTrainedModel`]):
             The model to train, preferably an [`~transformers.AutoModelForSequenceClassification`].
-        args ([`ORPOConfig`]):
+        args ([`experimental.orpo.ORPOConfig`]):
             The ORPO config arguments to use for training.
         data_collator ([`~transformers.DataCollator`]):
             The data collator to use for training. If None is specified, the default data collator
@@ -153,7 +153,7 @@ def __init__(
             raise ValueError("You passed model_kwargs to the ORPOTrainer. But your model is already instantiated.")
         else:
             model_init_kwargs = args.model_init_kwargs
-            dtype = model_init_kwargs.get("dtype")
+            dtype = model_init_kwargs.get("dtype", "auto")
             if dtype is not None:
                 # Convert to `torch.dtype` if an str is passed
                 if isinstance(dtype, str) and dtype != "auto":
@@ -163,6 +163,7 @@ def __init__(
                         f"Invalid `dtype` passed to the ORPOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
                     )
                 model_init_kwargs["dtype"] = dtype
+            model_init_kwargs["device_map"] = model_init_kwargs.get("device_map", "auto")
 
         if isinstance(model, str):
             model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)