258 commits
ea21e98
wip
qgallouedec Aug 17, 2025
a6941ea
Merge branch 'main' into refactor-dpo
qgallouedec Oct 7, 2025
a7aab5a
progress
qgallouedec Oct 8, 2025
6da159e
ref log p
qgallouedec Oct 8, 2025
d2f5227
precompute
qgallouedec Oct 9, 2025
a68ea0f
Merge branch 'main' into refactor-dpo
qgallouedec Oct 10, 2025
fcf62d1
hinge loss
qgallouedec Oct 10, 2025
36513d0
fix default
qgallouedec Oct 11, 2025
f023e65
fix precompute
qgallouedec Oct 11, 2025
aefc01b
disable dropout
qgallouedec Oct 11, 2025
d2aa8f3
move to exp
qgallouedec Oct 11, 2025
4b92f64
progress
qgallouedec Oct 11, 2025
ea664ba
fix hash, disable dropout and other changes
qgallouedec Oct 15, 2025
1252bb1
Merge branch 'main' into refactor-dpo
qgallouedec Nov 27, 2025
ebc7631
Merge branch 'main' into refactor-dpo
qgallouedec Dec 5, 2025
fda430d
Merge branch 'main' into refactor-dpo
qgallouedec Dec 12, 2025
c321700
progress
qgallouedec Dec 12, 2025
a8fc02c
First step
qgallouedec Dec 12, 2025
8dcb6d3
other alignments
qgallouedec Dec 12, 2025
45e67ae
same for RLOO
qgallouedec Dec 12, 2025
1d8b788
minor alignments
qgallouedec Dec 12, 2025
14b7525
style
qgallouedec Dec 13, 2025
e896cb6
test collator
qgallouedec Dec 13, 2025
aed0a88
robust
qgallouedec Dec 15, 2025
e1c46fb
fix
qgallouedec Dec 15, 2025
473cab5
fixes
qgallouedec Dec 15, 2025
c8a8b40
lr
qgallouedec Dec 15, 2025
66dbb27
ipo and exo_pair
qgallouedec Dec 15, 2025
aa8d6d7
Merge branch 'main' into refactor-dpo
qgallouedec Dec 15, 2025
7b8fd87
harmony dataset
qgallouedec Dec 15, 2025
e684742
thinking in rejected
qgallouedec Dec 15, 2025
9cc3f4e
nca_pair
qgallouedec Dec 15, 2025
7130bbf
bco pair
qgallouedec Dec 15, 2025
9cf7cc8
aot and aot_paired
qgallouedec Dec 16, 2025
dbfcdc5
apo_zero and apo down
qgallouedec Dec 16, 2025
22a6af6
discopop
qgallouedec Dec 16, 2025
2d249c1
contrib in doc
qgallouedec Dec 16, 2025
3fabfa4
sft loss + deprecated max prompt/completion length
qgallouedec Dec 16, 2025
e0cd28a
Merge branch 'main' into refactor-dpo
qgallouedec Dec 16, 2025
81ce5fe
keep start
qgallouedec Dec 16, 2025
412d8b5
some modification
qgallouedec Dec 16, 2025
29deff8
add "Parameters that need to be implemented"
qgallouedec Dec 16, 2025
cfa6f30
Merge branch 'main' into refactor-dpo
qgallouedec Dec 16, 2025
dfac706
Deprecate ref_model_init_kwargs and update DPOTrainer to use it
qgallouedec Dec 16, 2025
d757440
Deprecate generate_during_eval parameter in DPOConfig
qgallouedec Dec 17, 2025
26d6c7d
Remove ref_model_init_kwargs from DPOConfig
qgallouedec Dec 17, 2025
1a6fce0
Disallow PeftModel + peft_config in trainers
qgallouedec Dec 17, 2025
07f3a3a
remove tests
qgallouedec Dec 17, 2025
f97f2cc
remove old comments
qgallouedec Dec 17, 2025
4aeb899
Disallow passing PeftModel with peft_config in DPOTrainer
qgallouedec Dec 17, 2025
90c06e5
Merge branch 'main' into refactor-dpo
qgallouedec Dec 18, 2025
8c2c7ff
explicit ref model
qgallouedec Dec 18, 2025
1f579dc
deprecate force_use_ref_model
qgallouedec Dec 18, 2025
ae2646c
style
qgallouedec Dec 18, 2025
f83cf3c
deprecate force_use_ref_model and use_logits_to_keep parameters in DP…
qgallouedec Dec 19, 2025
66f96f8
proper peft integration
qgallouedec Dec 19, 2025
ef86117
proper peft integration
qgallouedec Dec 19, 2025
3c49e91
Merge branch 'main' into refactor-dpo
qgallouedec Dec 22, 2025
9345f5c
Deprecate model_adapter_name and ref_adapter_name parameters in DPOCo…
qgallouedec Dec 22, 2025
fd99c1e
fix type hint
qgallouedec Dec 23, 2025
8d3a61c
WPO
qgallouedec Dec 23, 2025
5a8f8fc
Merge branch 'main' into refactor-dpo
qgallouedec Dec 23, 2025
41bc3d3
Merge branch 'main' into refactor-dpo
qgallouedec Dec 23, 2025
19f21cd
style
qgallouedec Dec 23, 2025
5b7c446
Deprecate `label_pad_token_id` in `DPOConfig`
qgallouedec Dec 23, 2025
b615fca
loss weights
qgallouedec Dec 23, 2025
b6ee841
f divergence
qgallouedec Dec 23, 2025
e74ec7e
Add LD-DPO support with ld_alpha parameter in DPOConfig and DPOTrainer
qgallouedec Dec 23, 2025
408db53
some greek letter fixes
qgallouedec Dec 23, 2025
2dc1dd8
fix param name
qgallouedec Dec 23, 2025
fe48a17
deprecated tools
qgallouedec Dec 23, 2025
b341619
start liger integration
qgallouedec Dec 24, 2025
ae841e3
Deprecate reference_free parameter in DPOConfig and update related wa…
qgallouedec Dec 24, 2025
b15afbd
start liger
qgallouedec Jan 5, 2026
24e451b
Merge branch 'main' into refactor-dpo
qgallouedec Jan 6, 2026
6e9e1bf
Merge branch 'main' into refactor-dpo
qgallouedec Jan 6, 2026
ae8a03f
2026
qgallouedec Jan 6, 2026
5fbcfbc
Merge branch 'main' into refactor-dpo
qgallouedec Jan 9, 2026
40e8d51
Merge branch 'main' into refactor-dpo
qgallouedec Jan 12, 2026
0a6a209
Merge branch 'main' into refactor-dpo
qgallouedec Jan 12, 2026
90631a3
Merge branch 'main' into refactor-dpo
qgallouedec Jan 12, 2026
59763b8
Merge branch 'main' into refactor-dpo
qgallouedec Jan 13, 2026
2339198
Merge branch 'main' into refactor-dpo
qgallouedec Jan 13, 2026
7adb54e
Deprecate use_liger_loss parameter in DPOConfig and update related wa…
qgallouedec Jan 13, 2026
d4b35e5
Merge branch 'main' into refactor-dpo
qgallouedec Jan 14, 2026
a1d9d1a
Merge branch 'main' into refactor-dpo
qgallouedec Jan 14, 2026
8792365
Merge branch 'main' into refactor-dpo
qgallouedec Jan 16, 2026
de5f182
Add Iterative Reasoning Preference Optimization paper and update DPOC…
qgallouedec Jan 16, 2026
8b975ef
Merge branch 'main' into refactor-dpo
qgallouedec Jan 19, 2026
facccbc
Refactor learning rate comments and add validation for sync_ref_model…
qgallouedec Jan 20, 2026
dce26cd
Add ref_model_mixup_alpha and ref_model_sync_steps parameters to DPOC…
qgallouedec Jan 20, 2026
337ddac
update comment
qgallouedec Jan 20, 2026
d326bc2
precompute (wip)
qgallouedec Jan 20, 2026
5b11c71
qol
qgallouedec Jan 20, 2026
4d49b8c
precompute
qgallouedec Jan 23, 2026
9466af1
revert test
qgallouedec Jan 23, 2026
a334b76
is chat for processed
qgallouedec Jan 23, 2026
7a5a866
Merge branch 'main' into refactor-dpo
qgallouedec Jan 25, 2026
09d3f42
fix dtype in test
qgallouedec Jan 25, 2026
8af36f4
prediction_step for eval
qgallouedec Jan 25, 2026
164aab3
preference tool call dataset
qgallouedec Jan 25, 2026
1072de3
vlm support
qgallouedec Jan 25, 2026
cab3245
fix vlm collator
qgallouedec Jan 26, 2026
261627c
style
qgallouedec Jan 26, 2026
07b3632
fill example
qgallouedec Jan 26, 2026
2bd02e6
Merge branch 'main' into refactor-dpo
qgallouedec Jan 26, 2026
1dd9af1
doc
qgallouedec Jan 27, 2026
3d5f3ab
fix doc
qgallouedec Jan 27, 2026
f75b31d
dpo doc ready!
qgallouedec Jan 27, 2026
4380fd8
hide sidebar
qgallouedec Jan 27, 2026
01b788c
`,`
qgallouedec Jan 27, 2026
6eac594
RLAIF-V-Dataset
qgallouedec Jan 27, 2026
5b83fa8
Update documentation, scripts, and test with experimental
qgallouedec Jan 27, 2026
24c8582
Enhance DPOConfig documentation with detailed parameter descriptions …
qgallouedec Jan 27, 2026
045b853
Add precomputation options for reference model log probabilities in D…
qgallouedec Jan 27, 2026
913d757
removing unnecessary keyword argument
qgallouedec Jan 27, 2026
5a78275
Merge branch 'main' into refactor-dpo
qgallouedec Jan 27, 2026
ab79ec4
Merge branch 'main' into refactor-dpo
qgallouedec Jan 28, 2026
3e3b5c6
comment style
qgallouedec Jan 28, 2026
87e5c56
revert space
qgallouedec Jan 28, 2026
847dbbe
revert another space
qgallouedec Jan 28, 2026
20819a2
align comments
qgallouedec Jan 28, 2026
7736ec3
gradient_checkpointing=True is default
qgallouedec Jan 28, 2026
41716cd
align comments
qgallouedec Jan 28, 2026
c9bdbe2
Merge branch 'main' into refactor-dpo
qgallouedec Jan 28, 2026
3058500
Merge branch 'main' into refactor-dpo
qgallouedec Jan 28, 2026
5161288
Add tests for hash_module
qgallouedec Jan 28, 2026
86c4b9c
revert import change
qgallouedec Jan 28, 2026
375faa9
legacy tests
qgallouedec Jan 28, 2026
129b198
move experimental implementation to stable
qgallouedec Jan 28, 2026
f2b7242
fix imports
qgallouedec Jan 28, 2026
db871c9
ignore TestTokenizeRow
qgallouedec Jan 28, 2026
5c31d52
ignore ruff for legacy test + apply style
qgallouedec Jan 28, 2026
d03f191
remove `prompt_input_ids` access
qgallouedec Jan 28, 2026
7c0239d
gc kwargs
qgallouedec Jan 28, 2026
c785763
just ignore the relevant part of the test
qgallouedec Jan 28, 2026
0cd486e
ignore same length catching error
qgallouedec Jan 28, 2026
79caf4d
comment `test_dpo_loss_alpha_div_f`
qgallouedec Jan 28, 2026
cb513ad
comment test_dpo_loss_js_div_f
qgallouedec Jan 28, 2026
8a7b044
comment test_dpo_trainer_use_logits_to_keep
qgallouedec Jan 28, 2026
52377d6
logits to keep none by default
qgallouedec Jan 28, 2026
ee15d87
ValueError when ref_model is model
qgallouedec Jan 28, 2026
cde37e7
generate during eval deprecated
qgallouedec Jan 28, 2026
da9794e
Deprecate string usage for ref_model and update initialization logic
qgallouedec Jan 28, 2026
5b76884
comment access to prompt_input_ids
qgallouedec Jan 28, 2026
dd65760
don't check training_args.f_divergence_type type
qgallouedec Jan 29, 2026
046a555
support compute_metrics
qgallouedec Jan 29, 2026
a5d46dc
Implement Liger kernel compatibility checks and restrict unsupported …
qgallouedec Jan 29, 2026
4fd67bf
fp16 -> bf16 in qlora test
qgallouedec Jan 29, 2026
1d9b18a
fix link in doc
qgallouedec Jan 29, 2026
d866569
remove test_collators.py file
qgallouedec Jan 29, 2026
2f901ee
support tool in is_conversational
qgallouedec Jan 29, 2026
06473ea
Add conversational examples with tool calls to TestIsConversational
qgallouedec Jan 29, 2026
d7c1403
Add tests for training with compute metrics
qgallouedec Jan 29, 2026
db5812b
Update DPOTrainer config with higher learning rate and enable test in…
qgallouedec Jan 29, 2026
222e7d2
remove redundant cases
qgallouedec Jan 29, 2026
c79af1b
Update test parameters for DPOTrainer to address memory issues and ad…
qgallouedec Jan 29, 2026
94b09c8
align/fix test
qgallouedec Jan 29, 2026
b763453
fix dataset generation
qgallouedec Jan 29, 2026
f9ff05a
Merge branch 'main' into refactor-dpo
qgallouedec Jan 29, 2026
7a530ba
align legacy tests
qgallouedec Jan 29, 2026
7b49f02
Merge branch 'main' into refactor-dpo
qgallouedec Jan 29, 2026
256e121
Refactor test configurations in DPO and SFT trainers to improve clari…
qgallouedec Jan 29, 2026
be36e7d
fix robust
qgallouedec Jan 30, 2026
59e70b3
better names
qgallouedec Jan 30, 2026
c97e844
ipo: logits instead of delta-score
qgallouedec Jan 30, 2026
2634ec8
Revert "ipo: logits instead of delta-score"
qgallouedec Jan 30, 2026
1a97e58
Normalize IPO loss by completion length
qgallouedec Jan 30, 2026
dd1e074
Fix ipo normalization
qgallouedec Jan 30, 2026
86c5a27
Merge branch 'main' into refactor-dpo
qgallouedec Feb 3, 2026
24d0e78
Remove commented-out batch size adjustments in SFTTrainer tests
qgallouedec Feb 2, 2026
267ca1e
revert
qgallouedec Feb 3, 2026
0a27e3c
align compute_metrics tests
qgallouedec Feb 3, 2026
0f86f3f
align tests
qgallouedec Feb 3, 2026
ec87e99
remove estimat token
qgallouedec Feb 3, 2026
9788fa7
see https://github.com/huggingface/trl/pull/3950
qgallouedec Feb 3, 2026
1769d5a
Merge branch 'main' into refactor-dpo
qgallouedec Feb 3, 2026
48034d0
Update trl/trainer/dpo_trainer.py
qgallouedec Feb 3, 2026
13b8afe
Deprecate `ref_adapter_name` parameter in DPOConfig class
qgallouedec Feb 3, 2026
b61e4e3
memory efficient use_weighting
qgallouedec Feb 4, 2026
49d7faf
Update f_alpha_divergence_coef default value to 0.5 in DPOConfig and …
qgallouedec Feb 4, 2026
963d046
stable alpha_divergence
qgallouedec Feb 4, 2026
5a12cd9
Merge branch 'main' into refactor-dpo
qgallouedec Feb 4, 2026
1589da6
fix text-only vlm training
qgallouedec Feb 4, 2026
be4a3ee
Update test_dpo_trainer.py
qgallouedec Feb 4, 2026
bf53e4a
Merge branch 'main' into refactor-dpo
qgallouedec Feb 6, 2026
3b093f9
Update dataset configuration name for toolcall dataset loading
qgallouedec Feb 6, 2026
d33fb7d
Fix add_column in test_train_with_chat_template_kwargs
qgallouedec Feb 6, 2026
d659891
Remove max_prompt_length and max_completion_length parameters from DP…
qgallouedec Feb 6, 2026
c319d5a
Replace torch.allclose with torch.testing.assert_close in DPOTrainer …
qgallouedec Feb 6, 2026
7250cc3
Disallow installation of transformers 5.1.0 due to compatibility issu…
qgallouedec Feb 6, 2026
7cb4d72
try higher learning rate and smaller batch size
qgallouedec Feb 6, 2026
ecf7241
Update parameter comparison logic in DPOTrainer tests to exclude 'ref…
qgallouedec Feb 6, 2026
9ae192c
move some utils to experimental
qgallouedec Feb 6, 2026
77f18f2
fix deprecation version
qgallouedec Feb 6, 2026
c41bc67
fix max_length type in docstring
qgallouedec Feb 6, 2026
6d14396
Assert chat_template is applied in test_train_with_chat_template_kwar…
qgallouedec Feb 6, 2026
69d2f88
Merge branch 'main' into refactor-dpo
qgallouedec Feb 6, 2026
2fe89c3
Merge branch 'main' into refactor-dpo
qgallouedec Feb 6, 2026
b958bf2
Fix post_init warning stacklevel to 3 (#4993)
qgallouedec Feb 6, 2026
be82376
Pin transformers!=5.1.0 in deepspeed extra due to incompatibility (#4…
qgallouedec Feb 6, 2026
69c2c26
Merge branch 'main' into refactor-dpo
qgallouedec Feb 6, 2026
8c64814
style and merge main
qgallouedec Feb 6, 2026
76705c7
Merge branch 'main' into refactor-dpo
qgallouedec Feb 9, 2026
0c57ff3
update old tests
qgallouedec Feb 9, 2026
4691da2
remove deprecated
qgallouedec Feb 9, 2026
f52a407
remove deprecated aot_pair and use_liger_loss
qgallouedec Feb 9, 2026
39cee58
remove deprecated FDivergenceType and update related code to use stri…
qgallouedec Feb 10, 2026
3351691
style
qgallouedec Feb 10, 2026
254ed01
remove deprecated handling of ref_model as a string in DPOTrainer
qgallouedec Feb 10, 2026
d9e95b7
comment out deprecated test for DPOTrainer with tools
qgallouedec Feb 10, 2026
4748d49
revert section removal
qgallouedec Feb 10, 2026
df97352
Merge branch 'main' into refactor-dpo
qgallouedec Feb 10, 2026
ecaa25f
finish merge main
qgallouedec Feb 10, 2026
e5b9294
Remove deprecated CPO imports from trainer module
qgallouedec Feb 10, 2026
7bdbd56
Merge branch 'main' into refactor-dpo
qgallouedec Feb 10, 2026
7fbe52f
Merge branch 'main' into refactor-dpo
qgallouedec Feb 11, 2026
5eb0548
move peft_module_casting_to_bf16 to experimental
qgallouedec Feb 12, 2026
68f7975
removed from model utils
qgallouedec Feb 12, 2026
574a50d
Merge branch 'main' into refactor-dpo
qgallouedec Feb 16, 2026
b1964e4
move create_reference_model
qgallouedec Feb 12, 2026
46e05e3
sort papers
qgallouedec Feb 16, 2026
debb233
docs: update paper index to remove construction warning and clarify s…
qgallouedec Feb 16, 2026
801481f
Merge branch 'main' into refactor-dpo
qgallouedec Feb 16, 2026
d73f4eb
Replace logging with warnings for FSDP version warning in prepare_fsdp()
qgallouedec Feb 16, 2026
7596cf0
Merge branch 'main' into refactor-dpo
qgallouedec Feb 17, 2026
360a201
Merge branch 'main' into refactor-dpo
qgallouedec Feb 18, 2026
95a81a0
Merge branch 'main' into refactor-dpo
qgallouedec Feb 18, 2026
b18523b
Merge branch 'main' into refactor-dpo
qgallouedec Feb 19, 2026
51ffe84
Apply suggestions from code review
qgallouedec Feb 19, 2026
0dc5b95
Fix SFTTrainer support for single-image data
qgallouedec Feb 19, 2026
ea424ff
same for dpo trainer test
qgallouedec Feb 19, 2026
a3835fc
style
qgallouedec Feb 19, 2026
0d5202f
style
qgallouedec Feb 19, 2026
94726b2
Merge branch 'main' into refactor-dpo
qgallouedec Feb 19, 2026
226125b
Apply suggestions from code review
qgallouedec Feb 19, 2026
9d757ed
Update trl/trainer/dpo_config.py
qgallouedec Feb 19, 2026
cb10b3d
fix sync_ref_model doc
qgallouedec Feb 19, 2026
0ba8960
align
qgallouedec Feb 19, 2026
3d1cb40
Update `DataCollatorForVisionPreference` to support single image input
qgallouedec Feb 19, 2026
9bb0af5
Deprecation warning for `create_reference_model` moved to `trl.experi…
qgallouedec Feb 19, 2026
8286c92
Refactor hash_module to use hashlib for improved hashing
qgallouedec Feb 19, 2026
8287efe
Refactor input truncation logic into a separate method for improved r…
qgallouedec Feb 19, 2026
896cff3
synchronization after save
qgallouedec Feb 19, 2026
d9b6ac2
Fix error message in DataCollatorForVisionPreference
qgallouedec Feb 19, 2026
2e71cfd
fix tests
qgallouedec Feb 20, 2026
1462a8d
revert change in zen image dataset generation (other pr)
qgallouedec Feb 20, 2026
6ebfa4c
update doc
qgallouedec Feb 20, 2026
83c4d4c
Disable padding_free feature temporarily and log a warning for users
qgallouedec Feb 20, 2026
9f50918
revert changes in sft
qgallouedec Feb 20, 2026
11 changes: 3 additions & 8 deletions README.md
@@ -113,18 +113,13 @@ trainer.train()

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer
from trl import DPOTrainer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
training_args = DPOConfig(output_dir="Qwen2.5-0.5B-DPO")

trainer = DPOTrainer(
model=model,
args=training_args,
model="Qwen3/Qwen-0.6B",
train_dataset=dataset,
processing_class=tokenizer
)
trainer.train()
```
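Since the rendered diff above does not mark added and removed lines, here is a rough sketch of what the updated README snippet reads like after this change. It assumes, as the changed lines suggest, that the refactored `DPOTrainer` accepts a Hub model ID string for `model` and that an explicit `DPOConfig` is optional for a minimal run:

```python
from datasets import load_dataset
from trl import DPOTrainer

dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

trainer = DPOTrainer(
    model="Qwen3/Qwen-0.6B",  # model ID taken verbatim from the changed line in the diff above
    train_dataset=dataset,
)
trainer.train()
```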
16 changes: 3 additions & 13 deletions docs/source/bema_for_reference_model.md
@@ -7,26 +7,16 @@ This feature implements the BEMA algorithm to update the reference model during
```python
from trl.experimental.bema_for_ref_model import BEMACallback, DPOTrainer
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer


pref_dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
ref_model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")

bema_callback = BEMACallback(update_ref_model=True)

model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
tokenizer.pad_token = tokenizer.eos_token

trainer = DPOTrainer(
model=model,
ref_model=ref_model,
train_dataset=pref_dataset,
processing_class=tokenizer,
model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
train_dataset=dataset,
callbacks=[bema_callback],
)

trainer.train()
```
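Likewise, a sketch of the BEMA example after this edit, assuming the experimental `DPOTrainer` resolves the model and tokenizer from the model ID string in the same way:

```python
from datasets import load_dataset
from trl.experimental.bema_for_ref_model import BEMACallback, DPOTrainer

dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")

# Update the reference model during training using BEMA
bema_callback = BEMACallback(update_ref_model=True)

trainer = DPOTrainer(
    model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
    train_dataset=dataset,
    callbacks=[bema_callback],
)
trainer.train()
```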

16 changes: 5 additions & 11 deletions docs/source/customization.md
@@ -1,32 +1,26 @@
# Training customization

TRL is designed with modularity in mind so that users are able to efficiently customize the training loop for their needs. Below are examples on how you can apply and test different techniques.
TRL is designed with modularity in mind so that users are able to efficiently customize the training loop for their needs. Below are examples on how you can apply and test different techniques.

> [!NOTE]
> Although these examples use the [`DPOTrainer`], these customization methods apply to most (if not all) trainers in TRL.
## Use different optimizers and schedulers

By default, the `DPOTrainer` creates a `torch.optim.AdamW` optimizer. You can create and define a different optimizer and pass it to `DPOTrainer` as follows:
By default, the [`DPOTrainer`] creates a `torch.optim.AdamW` optimizer. You can create and define a different optimizer and pass it to [`DPOTrainer`] as follows:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch import optim
from trl import DPOConfig, DPOTrainer
from trl import DPOTrainer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
training_args = DPOConfig(output_dir="Qwen2.5-0.5B-DPO")

optimizer = optim.SGD(model.parameters(), lr=training_args.learning_rate)

trainer = DPOTrainer(
model=model,
args=training_args,
model="Qwen/Qwen2.5-0.5B-Instruct",
train_dataset=dataset,
tokenizer=tokenizer,
optimizers=(optimizer, None),
)
trainer.train()
@@ -50,7 +44,7 @@ trainer = DPOTrainer(..., optimizers=(optimizer, lr_scheduler))
Another tool you can use for more memory efficient fine-tuning is to share layers between the reference model and the model you want to train.

```python
from trl import create_reference_model
from trl.experimental.utils import create_reference_model

ref_model = create_reference_model(model, num_shared_layers=6)

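For context, a minimal sketch of the layer-sharing setup after the import move. The checkpoint and model loading here are placeholders; the call itself keeps the `num_shared_layers` argument shown in the diff above:

```python
from transformers import AutoModelForCausalLM
from trl.experimental.utils import create_reference_model

# Placeholder checkpoint: any causal LM works here
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# Share the first 6 layers between the trained model and its frozen reference copy,
# reducing memory compared to holding two full copies of the model.
ref_model = create_reference_model(model, num_shared_layers=6)
```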
375 changes: 187 additions & 188 deletions docs/source/dpo_trainer.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/source/example_overview.md
@@ -44,7 +44,7 @@ These notebooks are easier to run and are designed for quick experimentation wit

## Scripts

Scripts are maintained in the [`trl/scripts`](https://github.com/huggingface/trl/blob/main/trl/scripts) and [`examples/scripts`](https://github.com/huggingface/trl/blob/main/examples/scripts) directories. They show how to use different trainers such as `SFTTrainer`, `PPOTrainer`, `DPOTrainer`, `GRPOTrainer`, and more.
Scripts are maintained in the [`trl/scripts`](https://github.com/huggingface/trl/blob/main/trl/scripts) and [`examples/scripts`](https://github.com/huggingface/trl/blob/main/examples/scripts) directories. They show how to use different trainers such as [`SFTTrainer`], [`PPOTrainer`], [`DPOTrainer`], [`GRPOTrainer`], and more.

| File | Description |
| --- | --- |
1 change: 0 additions & 1 deletion docs/source/lora_without_regret.md
@@ -277,7 +277,6 @@ Here are the parameters we used to train the above models
| `--model_name_or_path` | HuggingFaceTB/SmolLM3-3B | HuggingFaceTB/SmolLM3-3B |
| `--dataset_name` | HuggingFaceH4/OpenR1-Math-220k-default-verified | HuggingFaceH4/OpenR1-Math-220k-default-verified |
| `--learning_rate` | 1.0e-5 | 1.0e-6 |
| `--max_prompt_length` | 1024 | 1024 |
| `--max_completion_length` | 4096 | 4096 |
| `--lora_r` | 1 | - |
| `--lora_alpha` | 32 | - |
4 changes: 0 additions & 4 deletions docs/source/model_utils.md
@@ -7,7 +7,3 @@
## disable_gradient_checkpointing

[[autodoc]] models.utils.disable_gradient_checkpointing

## create_reference_model

[[autodoc]] create_reference_model