From fb0a5b034a0d92942e633f4326a4411f49ffd0f4 Mon Sep 17 00:00:00 2001
From: Behrooz <ermiaazarkhalili@gmail.com>
Date: Sun, 2 Nov 2025 19:37:38 -0800
Subject: [PATCH 1/2] fix: Remove chat template setting from non-SFT trainer
 scripts

Resolves #4404

- Remove SIMPLE_CHAT_TEMPLATE import from 7 trainer scripts
- Remove chat template setting for non-SFT trainers (CPO, ORPO, PPO, Nash-MD, XPO, Online DPO)
- Chat templates only make sense for SFT (instruction tuning), not for preference optimization or reward-based training
- Scripts modified:
  - examples/scripts/online_dpo.py
  - examples/scripts/orpo.py
  - examples/scripts/cpo.py
  - examples/scripts/nash_md.py
  - examples/scripts/xpo.py
  - examples/scripts/ppo/ppo.py
  - examples/scripts/ppo/ppo_tldr.py
---
 examples/scripts/cpo.py          | 3 ---
 examples/scripts/nash_md.py      | 3 ---
 examples/scripts/online_dpo.py   | 3 ---
 examples/scripts/orpo.py         | 3 ---
 examples/scripts/ppo/ppo.py      | 3 ---
 examples/scripts/ppo/ppo_tldr.py | 3 ---
 examples/scripts/xpo.py          | 3 ---
 7 files changed, 21 deletions(-)

diff --git a/examples/scripts/cpo.py b/examples/scripts/cpo.py
index 2d9049136c6..fef9cdf1247 100644
--- a/examples/scripts/cpo.py
+++ b/examples/scripts/cpo.py
@@ -64,7 +64,6 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
 
 from trl import CPOConfig, CPOTrainer, ModelConfig, ScriptArguments, get_peft_config
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 
 # Enable logging in a Hugging Face Space
@@ -90,8 +89,6 @@
     # Dataset
     ################
     dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
-    if tokenizer.chat_template is None:
-        tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
 
     ################
     # Training
diff --git a/examples/scripts/nash_md.py b/examples/scripts/nash_md.py
index e3f1486c75d..fdb8ca09a3e 100644
--- a/examples/scripts/nash_md.py
+++ b/examples/scripts/nash_md.py
@@ -73,7 +73,6 @@
     get_kbit_device_map,
     get_quantization_config,
 )
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 
 # Enable logging in a Hugging Face Space
@@ -128,8 +127,6 @@
     )
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
-    if tokenizer.chat_template is None:
-        tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
 
     dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
 
diff --git a/examples/scripts/online_dpo.py b/examples/scripts/online_dpo.py
index 91569c8b4f1..4ed7afe884d 100644
--- a/examples/scripts/online_dpo.py
+++ b/examples/scripts/online_dpo.py
@@ -69,7 +69,6 @@
     get_peft_config,
     get_quantization_config,
 )
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 
 # Enable logging in a Hugging Face Space
@@ -131,8 +130,6 @@
         trust_remote_code=model_args.trust_remote_code,
         **model_kwargs,
     )
-    if tokenizer.chat_template is None:
-        tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
     if tokenizer.pad_token_id is None:
         tokenizer.pad_token = tokenizer.eos_token
 
diff --git a/examples/scripts/orpo.py b/examples/scripts/orpo.py
index d392bb7bf1b..e256a4277ad 100644
--- a/examples/scripts/orpo.py
+++ b/examples/scripts/orpo.py
@@ -64,7 +64,6 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
 
 from trl import ModelConfig, ORPOConfig, ORPOTrainer, ScriptArguments, get_peft_config
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 
 # Enable logging in a Hugging Face Space
@@ -91,8 +90,6 @@
     # Dataset
     ################
     dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
-    if tokenizer.chat_template is None:
-        tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
 
     ################
     # Training
diff --git a/examples/scripts/ppo/ppo.py b/examples/scripts/ppo/ppo.py
index 5dfcda55429..2f5471996c2 100644
--- a/examples/scripts/ppo/ppo.py
+++ b/examples/scripts/ppo/ppo.py
@@ -43,7 +43,6 @@
     get_peft_config,
     get_quantization_config,
 )
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 
 # Enable logging in a Hugging Face Space
@@ -106,8 +105,6 @@
         model_args.model_name_or_path, padding_side="left", trust_remote_code=model_args.trust_remote_code
     )
     tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-    if tokenizer.chat_template is None:
-        tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
     value_model = AutoModelForSequenceClassification.from_pretrained(
         training_args.reward_model_path, trust_remote_code=model_args.trust_remote_code, num_labels=1
     )
diff --git a/examples/scripts/ppo/ppo_tldr.py b/examples/scripts/ppo/ppo_tldr.py
index 4ef1cf4e7b6..7962758ec40 100644
--- a/examples/scripts/ppo/ppo_tldr.py
+++ b/examples/scripts/ppo/ppo_tldr.py
@@ -43,7 +43,6 @@
     get_peft_config,
     get_quantization_config,
 )
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 
 # Enable logging in a Hugging Face Space
@@ -113,8 +112,6 @@
         model_args.model_name_or_path, padding_side="left", trust_remote_code=model_args.trust_remote_code
     )
     tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-    if tokenizer.chat_template is None:
-        tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
     value_model = AutoModelForSequenceClassification.from_pretrained(
         training_args.reward_model_path, trust_remote_code=model_args.trust_remote_code, num_labels=1
     )
diff --git a/examples/scripts/xpo.py b/examples/scripts/xpo.py
index e4e7c6301a6..70c13226c5d 100644
--- a/examples/scripts/xpo.py
+++ b/examples/scripts/xpo.py
@@ -57,7 +57,6 @@
     get_kbit_device_map,
     get_quantization_config,
 )
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 
 # Enable logging in a Hugging Face Space
@@ -113,8 +112,6 @@
     )
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
-    if tokenizer.chat_template is None:
-        tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
 
     dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
 

From ee5827cbe5ea579b8afe5913d8c017a1d1f7b01b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <gallouedec.quentin@gmail.com>
Date: Mon, 3 Nov 2025 15:26:46 +0000
Subject: [PATCH 2/2] Remove chat template setting from tests as well

---
 tests/test_cpo_trainer.py  | 3 ---
 tests/test_gkd_trainer.py  | 5 -----
 tests/test_orpo_trainer.py | 3 ---
 tests/test_ppo_trainer.py  | 4 ----
 trl/trainer/utils.py       | 3 ---
 5 files changed, 18 deletions(-)

diff --git a/tests/test_cpo_trainer.py b/tests/test_cpo_trainer.py
index 19833a414ff..01c581eb53f 100644
--- a/tests/test_cpo_trainer.py
+++ b/tests/test_cpo_trainer.py
@@ -17,7 +17,6 @@
 from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
 
 from trl import CPOConfig, CPOTrainer
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 from .testing_utils import TrlTestCase, require_peft
 
@@ -33,7 +32,6 @@ def setup_method(self):
         model_id = "trl-internal-testing/tiny-T5ForConditionalGeneration"
         self.t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
         self.t5_tokenizer = AutoTokenizer.from_pretrained(model_id)
-        self.t5_tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
 
     @pytest.mark.parametrize(
         "name, loss_type, config_name",
@@ -41,7 +39,6 @@ def setup_method(self):
             ("qwen", "sigmoid", "standard_preference"),
             ("t5", "hinge", "standard_implicit_prompt_preference"),
             ("qwen", "ipo", "conversational_preference"),
-            ("t5", "ipo", "conversational_implicit_prompt_preference"),
             ("qwen", "simpo", "standard_preference"),
             ("t5", "simpo", "standard_implicit_prompt_preference"),
             ("qwen", "hinge", "conversational_preference"),
diff --git a/tests/test_gkd_trainer.py b/tests/test_gkd_trainer.py
index 0a13d9db12a..6516a88f94e 100644
--- a/tests/test_gkd_trainer.py
+++ b/tests/test_gkd_trainer.py
@@ -21,7 +21,6 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 
 from trl import GKDConfig, GKDTrainer
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 from .testing_utils import TrlTestCase, require_liger_kernel
 
@@ -206,10 +205,6 @@ def setup_method(self):
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
         self.tokenizer.pad_token = self.tokenizer.eos_token
 
-        # Ensure the tokenizer has a chat template
-        if not hasattr(self.tokenizer, "chat_template") or self.tokenizer.chat_template is None:
-            self.tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
-
     def test_gkd_trainer(self):
         training_args = GKDConfig(
             output_dir=self.tmp_dir,
diff --git a/tests/test_orpo_trainer.py b/tests/test_orpo_trainer.py
index f882cf756f8..d91654fc7ba 100644
--- a/tests/test_orpo_trainer.py
+++ b/tests/test_orpo_trainer.py
@@ -17,7 +17,6 @@
 from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
 
 from trl import ORPOConfig, ORPOTrainer
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 from .testing_utils import TrlTestCase, require_peft
 
@@ -33,7 +32,6 @@ def setup_method(self):
         model_id = "trl-internal-testing/tiny-T5ForConditionalGeneration"
         self.t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
         self.t5_tokenizer = AutoTokenizer.from_pretrained(model_id)
-        self.t5_tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
 
     @pytest.mark.parametrize(
         "name, config_name",
@@ -41,7 +39,6 @@ def setup_method(self):
             ("qwen", "standard_preference"),
             ("t5", "standard_implicit_prompt_preference"),
             ("qwen", "conversational_preference"),
-            ("t5", "conversational_implicit_prompt_preference"),
         ],
     )
     def test_orpo_trainer(self, name, config_name):
diff --git a/tests/test_ppo_trainer.py b/tests/test_ppo_trainer.py
index 6e62e742115..317bab51351 100644
--- a/tests/test_ppo_trainer.py
+++ b/tests/test_ppo_trainer.py
@@ -19,7 +19,6 @@
 from transformers.utils import is_peft_available
 
 from trl import PPOConfig, PPOTrainer
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 from .testing_utils import TrlTestCase, require_peft
 
@@ -37,9 +36,6 @@ def setup_method(self):
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, padding_side="left")
         self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
 
-        if self.tokenizer.chat_template is None:
-            self.tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
-
         # Add reward and value models as in ppo.py
         reward_model_id = "trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5"
         self.value_model = AutoModelForSequenceClassification.from_pretrained(reward_model_id, num_labels=1)
diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py
index e8130d694d7..f9014b61506 100644
--- a/trl/trainer/utils.py
+++ b/trl/trainer/utils.py
@@ -738,9 +738,6 @@ def print_rich_table(df: pd.DataFrame) -> None:
     console.print(table)
 
 
-SIMPLE_SFT_CHAT_TEMPLATE = "{% for message in messages %}{{' ' + message['content']}}{% endfor %}{{ eos_token }}"
-# SIMPLE_SFT_CHAT_TEMPLATE simply ends things with an EOS token, this helps the SFT model learn to end the completions with EOS tokens
-
 SIMPLE_CHAT_TEMPLATE = "{% for message in messages %}{{message['role'].capitalize() + ': ' + message['content'] + '\n\n'}}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"