Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions docs/source/_toctree.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,6 @@
title: KTO
- local: nash_md_trainer
title: Nash-MD
- local: orpo_trainer
title: ORPO
- local: ppo_trainer
title: PPO
- local: prm_trainer
Expand Down Expand Up @@ -117,8 +115,10 @@
title: GRPO With Replay Buffer
- local: gspo_token
title: GSPO-token
- local: papo_trainer
title: PAPO
- local: openenv
title: OpenEnv Integration
- local: orpo_trainer
title: ORPO
- local: papo_trainer
title: PAPO
title: Experimental
2 changes: 1 addition & 1 deletion docs/source/orpo_trainer.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ Below is the script to train the model:
```python
# train_orpo.py
from datasets import load_dataset
from trl import ORPOConfig, ORPOTrainer
from trl.experimental.orpo import ORPOConfig, ORPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
Expand Down
3 changes: 2 additions & 1 deletion examples/scripts/orpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser

from trl import ModelConfig, ORPOConfig, ORPOTrainer, ScriptArguments, get_peft_config
from trl import ModelConfig, ScriptArguments, get_peft_config
from trl.experimental.orpo import ORPOConfig, ORPOTrainer


# Enable logging in a Hugging Face Space
Expand Down
2 changes: 1 addition & 1 deletion tests/test_orpo_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer

from trl import ORPOConfig, ORPOTrainer
from trl.experimental.orpo import ORPOConfig, ORPOTrainer

from .testing_utils import TrlTestCase, require_peft

Expand Down
19 changes: 19 additions & 0 deletions trl/experimental/orpo/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .orpo_config import ORPOConfig
from .orpo_trainer import ORPOTrainer


# Public names re-exported by this package: the two classes imported above.
__all__ = ["ORPOConfig", "ORPOTrainer"]
169 changes: 169 additions & 0 deletions trl/experimental/orpo/orpo_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass, field
from typing import Any

from transformers import TrainingArguments


@dataclass
class ORPOConfig(TrainingArguments):
    r"""
    Configuration class for the [`ORPOTrainer`].

    This class includes only the parameters that are specific to ORPO training. For a full list of training arguments,
    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
    differ from those in [`~transformers.TrainingArguments`].

    Using [`~transformers.HfArgumentParser`] we can turn this class into
    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
    command line.

    Parameters:
        max_length (`int` or `None`, *optional*, defaults to `1024`):
            Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
            to use the default data collator.
        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
            Maximum length of the prompt. This argument is required if you want to use the default data collator.
        max_completion_length (`int`, *optional*):
            Maximum length of the completion. This argument is required if you want to use the default data collator
            and your model is an encoder-decoder.
        beta (`float`, *optional*, defaults to `0.1`):
            Parameter controlling the relative ratio loss weight in the ORPO loss. In the
            [paper](https://huggingface.co/papers/2403.07691), it is denoted by λ. In the
            [code](https://github.com/xfactlab/orpo), it is denoted by `alpha`.
        disable_dropout (`bool`, *optional*, defaults to `True`):
            Whether to disable dropout in the model.
        label_pad_token_id (`int`, *optional*, defaults to `-100`):
            Label pad token id. This argument is required if you want to use the default data collator.
        padding_value (`int`, *optional*):
            Padding value to use. If `None`, the padding value of the tokenizer is used.
        truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
            Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
            This argument is required if you want to use the default data collator.
        generate_during_eval (`bool`, *optional*, defaults to `False`):
            If `True`, generates and logs completions from the model to W&B or Comet during evaluation.
        is_encoder_decoder (`bool`, *optional*):
            When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
            you need to specify if the model returned by the callable is an encoder-decoder model.
        model_init_kwargs (`dict[str, Any]`, *optional*):
            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
            string.
        dataset_num_proc (`int`, *optional*):
            Number of processes to use for processing the dataset.
    """

    # Extend the parent's list with the dict-valued field declared below.
    # NOTE(review): presumably this lets `model_init_kwargs` be handled like the parent's other dict fields
    # (e.g. parsed from a string) -- confirm against `TrainingArguments`.
    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]

    # Parameters whose default values are overridden from TrainingArguments
    learning_rate: float = field(
        default=1e-6,
        metadata={"help": "The initial learning rate for AdamW."},
    )
    logging_steps: float = field(
        default=10,
        metadata={
            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
            "will be interpreted as ratio of total training steps."
        },
    )
    gradient_checkpointing: bool = field(
        default=True,
        metadata={
            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
        },
    )
    # `None` means "not set by the user"; the effective value is resolved in `__post_init__`.
    bf16: bool | None = field(
        default=None,
        metadata={
            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
            "`fp16` is not set."
        },
    )

    # Parameters declared by this class (see the `Parameters` section of the class docstring)
    max_length: int | None = field(
        default=1024,
        metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
    )
    max_prompt_length: int | None = field(
        default=512,
        metadata={
            # Kept in line with the class docstring: the requirement is not limited to encoder-decoder models.
            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
            "collator."
        },
    )
    max_completion_length: int | None = field(
        default=None,
        metadata={
            "help": "Maximum length of the completion. This argument is required if you want to use the default data "
            "collator and your model is an encoder-decoder."
        },
    )
    beta: float = field(
        default=0.1,
        metadata={
            "help": "Parameter controlling the relative ratio loss weight in the ORPO loss. In the paper, it is "
            "denoted by λ."
        },
    )
    disable_dropout: bool = field(
        default=True,
        metadata={"help": "Whether to disable dropout in the model."},
    )
    label_pad_token_id: int = field(
        default=-100,
        metadata={
            "help": "Label pad token id. This argument is required if you want to use the default data collator."
        },
    )
    padding_value: int | None = field(
        default=None,
        metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."},
    )
    truncation_mode: str = field(
        default="keep_end",
        metadata={
            "help": "Truncation mode to use when the prompt is too long.",
            "choices": ["keep_end", "keep_start"],
        },
    )
    generate_during_eval: bool = field(
        default=False,
        metadata={
            # Kept in line with the class docstring, which lists both logging backends.
            "help": "If `True`, generates and logs completions from the model to W&B or Comet during evaluation."
        },
    )
    is_encoder_decoder: bool | None = field(
        default=None,
        metadata={
            "help": "When using the `model_init` argument (callable) to instantiate the model instead of the `model` "
            "argument, you need to specify if the model returned by the callable is an encoder-decoder model."
        },
    )
    model_init_kwargs: dict[str, Any] | None = field(
        default=None,
        metadata={
            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
            "from a string."
        },
    )
    dataset_num_proc: int | None = field(
        default=None,
        metadata={"help": "Number of processes to use for processing the dataset."},
    )

    def __post_init__(self):
        """Resolve the `bf16` default, then run the parent's post-init processing.

        When `bf16` was left as `None`, it is set to the negation of `fp16`; an explicitly provided `bf16` value is
        kept unchanged.
        """
        # Must happen before the parent hook so that it sees a concrete boolean rather than `None`.
        if self.bf16 is None:
            self.bf16 = not self.fp16

        super().__post_init__()
Loading
Loading