Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,13 @@ trainer.train()
```python
from datasets import load_dataset
from trl import GRPOTrainer
from trl.rewards import accuracy_reward

dataset = load_dataset("trl-lib/tldr", split="train")

# Dummy reward function: count the number of unique characters in the completions
def reward_num_unique_chars(completions, **kwargs):
return [len(set(c)) for c in completions]
dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

trainer = GRPOTrainer(
model="Qwen/Qwen2-0.5B-Instruct",
reward_funcs=reward_num_unique_chars,
reward_funcs=accuracy_reward,
train_dataset=dataset,
)
trainer.train()
Expand Down
37 changes: 14 additions & 23 deletions docs/source/grpo_trainer.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ This post-training method was contributed by [Quentin Gallouédec](https://huggi

## Quick start

This example demonstrates how to train a model using the GRPO method. We train a [Qwen 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [UltraFeedback prompts dataset](https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt). You can view the data in the dataset here:
This example demonstrates how to train a model using the GRPO method. We train a [Qwen2 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [DeepMath-103K dataset](https://huggingface.co/datasets/trl-lib/DeepMath-103K). You can browse the dataset in the viewer below:

<iframe
src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
src="https://huggingface.co/datasets/trl-lib/DeepMath-103K/embed/viewer/default/train?row=0"
frameborder="0"
width="100%"
height="560px"
Expand All @@ -28,21 +28,14 @@ Below is the script to train the model.
```python
# train_grpo.py
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")
from trl import GRPOTrainer
from trl.rewards import accuracy_reward

# Dummy reward function for demonstration purposes
def reward_num_unique_letters(completions, **kwargs):
"""Reward function that rewards completions with more unique letters."""
completion_contents = [completion[0]["content"] for completion in completions]
return [float(len(set(content))) for content in completion_contents]
dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

training_args = GRPOConfig(output_dir="Qwen2-0.5B-GRPO")
trainer = GRPOTrainer(
model="Qwen/Qwen2-0.5B-Instruct",
reward_funcs=reward_num_unique_letters,
args=training_args,
reward_funcs=accuracy_reward,
train_dataset=dataset,
)
trainer.train()
Expand Down Expand Up @@ -290,29 +283,27 @@ import argparse

from datasets import load_dataset
from trl import GRPOTrainer, GRPOConfig
from trl.rewards import accuracy_reward

def main():
parser = argparse.ArgumentParser()
parser.add_argument("--vllm_server_host", type=str, default="", help="The server IP")
args = parser.parse_args()

# Example dataset from TLDR
dataset = load_dataset("trl-lib/tldr", split="train")

# Dummy reward function: count the number of unique characters in the completions
def reward_num_unique_chars(completions, **kwargs):
return [len(set(c)) for c in completions]
dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

training_args = GRPOConfig(
output_dir="Qwen2.5-72B-GRPO",

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`output_dir` is generated automatically, so there is no need to pass it explicitly.

per_device_train_batch_size=4,
bf16=True,
gradient_checkpointing=True,
Comment on lines -309 to -310

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`bf16=True` and `gradient_checkpointing=True` are already the default values, so they can be omitted.

use_vllm=True,
vllm_server_host=args.vllm_server_host.replace("ip-", "").replace("-", "."), # from ip-X-X-X-X to X.X.X.X
)

trainer = GRPOTrainer(model="Qwen/Qwen2.5-72B", args=training_args, reward_funcs=reward_num_unique_chars, train_dataset=dataset)
trainer = GRPOTrainer(
model="Qwen/Qwen2.5-72B",
args=training_args,
reward_funcs=accuracy_reward,
train_dataset=dataset
)
trainer.train()

if __name__=="__main__":
Expand Down
9 changes: 3 additions & 6 deletions docs/source/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,12 @@ trainer.train()
```python
from trl import GRPOTrainer
from datasets import load_dataset

# Define a simple reward function (count unique chars as example)
def reward_function(completions, **kwargs):
return [len(set(completion.lower())) for completion in completions]
from trl.rewards import accuracy_reward

trainer = GRPOTrainer(
model="Qwen/Qwen2.5-0.5B-Instruct", # Start from SFT model
train_dataset=load_dataset("trl-lib/tldr", split="train"),
reward_funcs=reward_function,
train_dataset=load_dataset("trl-lib/DeepMath-103K", split="train"),
reward_funcs=accuracy_reward,
)
trainer.train()
```
Expand Down
19 changes: 6 additions & 13 deletions docs/source/rloo_trainer.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ This post-training method was contributed by [Costa Huang](https://github.com/vw

## Quick start

This example demonstrates how to train a model using the RLOO method. We train a [Qwen 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [UltraFeedback prompts dataset](https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt). You can view the data in the dataset here:
This example demonstrates how to train a model using the RLOO method. We train a [Qwen2 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [DeepMath-103K dataset](https://huggingface.co/datasets/trl-lib/DeepMath-103K). You can browse the dataset in the viewer below:

<iframe
src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
src="https://huggingface.co/datasets/trl-lib/DeepMath-103K/embed/viewer/default/train?row=0"
frameborder="0"
width="100%"
height="560px"
Expand All @@ -29,21 +29,14 @@ Below is the script to train the model.
```python
# train_rloo.py
from datasets import load_dataset
from trl import RLOOConfig, RLOOTrainer

dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")
from trl import RLOOTrainer
from trl.rewards import accuracy_reward

# Dummy reward function for demonstration purposes
def reward_num_unique_letters(completions, **kwargs):
"""Reward function that rewards completions with more unique letters."""
completion_contents = [completion[0]["content"] for completion in completions]
return [float(len(set(content))) for content in completion_contents]
dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

training_args = RLOOConfig(output_dir="Qwen2-0.5B-RLOO")
trainer = RLOOTrainer(
model="Qwen/Qwen2-0.5B-Instruct",
reward_funcs=reward_num_unique_letters,
args=training_args,
reward_funcs=accuracy_reward,
train_dataset=dataset,
)
trainer.train()
Expand Down
92 changes: 21 additions & 71 deletions docs/source/vllm_integration.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,24 +46,14 @@ Sample of a simple `train.py` script:
```python
from datasets import load_dataset
from trl import GRPOTrainer, GRPOConfig
from trl.rewards import accuracy_reward

dataset = load_dataset("trl-lib/tldr", split="train")

# Dummy reward function: count the number of unique characters in the completions
def reward_num_unique_chars(completions, **kwargs):
return [len(set(c)) for c in completions]

training_args = GRPOConfig(
output_dir="my_test",
use_vllm=True,
bf16=True,
gradient_checkpointing=True,
)
dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

trainer = GRPOTrainer(
model="Qwen/Qwen2.5-7B",
args=training_args,
reward_funcs=reward_num_unique_chars,
args=GRPOConfig(use_vllm=True),
reward_funcs=accuracy_reward,
train_dataset=dataset,
)

Expand All @@ -76,24 +66,14 @@ trainer.train()
```python
from datasets import load_dataset
from trl import OnlineDPOTrainer, OnlineDPOConfig
from trl.rewards import accuracy_reward

dataset = load_dataset("trl-lib/tldr", split="train")

# Dummy reward function: count the number of unique characters in the completions
def reward_num_unique_chars(completions, **kwargs):
return [len(set(c)) for c in completions]

training_args = OnlineDPOConfig(
output_dir="my_test",
use_vllm=True,
bf16=True,
gradient_checkpointing=True,
)
dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

trainer = OnlineDPOTrainer(
model="Qwen/Qwen2.5-7B",
args=training_args,
reward_funcs=reward_num_unique_chars,
args=OnlineDPOConfig(use_vllm=True),
reward_funcs=accuracy_reward,
train_dataset=dataset,
)

Expand All @@ -106,24 +86,14 @@ trainer.train()
```python
from datasets import load_dataset
from trl.experimental.nash_md import NashMDConfig, NashMDTrainer
from trl.rewards import accuracy_reward

dataset = load_dataset("trl-lib/tldr", split="train")

# Dummy reward function: count the number of unique characters in the completions
def reward_num_unique_chars(completions, **kwargs):
return [len(set(c)) for c in completions]

training_args = NashMDConfig(
output_dir="my_test",
use_vllm=True,
bf16=True,
gradient_checkpointing=True,
)
dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

trainer = NashMDTrainer(
model="Qwen/Qwen2.5-7B",
args=training_args,
reward_funcs=reward_num_unique_chars,
args=NashMDConfig(use_vllm=True),
reward_funcs=accuracy_reward,
train_dataset=dataset,
)

Expand All @@ -135,25 +105,15 @@ trainer.train()

```python
from datasets import load_dataset
from trl.experimental.xpo import XPOTrainer, XPOConfig

dataset = load_dataset("trl-lib/tldr", split="train")
from trl.experimental.xpo import XPOTrainer, XPOConfig
from trl.rewards import accuracy_reward

# Dummy reward function: count the number of unique characters in the completions
def reward_num_unique_chars(completions, **kwargs):
return [len(set(c)) for c in completions]

training_args = XPOConfig(
output_dir="my_test",
use_vllm=True,
bf16=True,
gradient_checkpointing=True,
)
dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

trainer = XPOTrainer(
model="Qwen/Qwen2.5-7B",
args=training_args,
reward_funcs=reward_num_unique_chars,
args=XPOConfig(use_vllm=True),
reward_funcs=accuracy_reward,
train_dataset=dataset,
)

Expand All @@ -166,24 +126,14 @@ trainer.train()
```python
from datasets import load_dataset
from trl import RLOOTrainer, RLOOConfig
from trl.rewards import accuracy_reward

dataset = load_dataset("trl-lib/tldr", split="train")

# Dummy reward function: count the number of unique characters in the completions
def reward_num_unique_chars(completions, **kwargs):
return [len(set(c)) for c in completions]

training_args = RLOOConfig(
output_dir="my_test",
use_vllm=True,
bf16=True,
gradient_checkpointing=True,
)
dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

trainer = RLOOTrainer(
model="Qwen/Qwen2.5-7B",
args=training_args,
reward_funcs=reward_num_unique_chars,
args=RLOOConfig(use_vllm=True),
reward_funcs=accuracy_reward,
train_dataset=dataset,
)

Expand Down
Loading
Loading