Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions examples/trl/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,81 @@ python run_generation.py \
--prompt "Here is my prompt"

```


## PPO pipeline

### Training

The following example is for the creation of StackLlaMa 2: a Stack exchange llama-v2-7b model.
There are three main steps to the PPO training process:
1. Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se:
```
python ../gaudi_spawn.py --world_size 8 --use_mpi sft.py \
--model_name_or_path meta-llama/Llama-2-7b-hf \
--output_dir="./sft" \
--max_steps=500 \
--logging_steps=10 \
--save_steps=100 \
--per_device_train_batch_size=4 \
--per_device_eval_batch_size=1 \
--gradient_accumulation_steps=2 \
--learning_rate=1e-4 \
--lr_scheduler_type="cosine" \
--warmup_steps=100 \
--weight_decay=0.05 \
--optim="paged_adamw_32bit" \
--lora_target_modules "q_proj" "v_proj" \
--bf16 \
--remove_unused_columns=False \
--run_name="sft_llama2" \
--report_to=none \
--use_habana \
--use_lazy_mode
```
2. Reward modeling using dialog pairs from the SE dataset on the llama-v2-7b-se to create llama-v2-7b-se-rm
```
Comment thread
sywangyi marked this conversation as resolved.
python ../gaudi_spawn.py --world_size 8 --use_mpi reward_modeling.py \
--model_name=./sft/final_merged_checkpoint \
Comment thread
sywangyi marked this conversation as resolved.
--tokenizer_name=meta-llama/Llama-2-7b-hf \
--output_dir=./rm
```
To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL:

```
python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="rm" --output_name="rm_merged_checkpoint"
```

3. RL fine-tuning of llama-v2-7b-se with the llama-v2-7b-se-rm reward model:
```
python ../gaudi_spawn.py --world_size 8 --use_mpi ppo.py \
--model_name=./sft/final_merged_checkpoint \
Comment thread
sywangyi marked this conversation as resolved.
--reward_model_name=./rm_merged_checkpoint \
--tokenizer_name=meta-llama/Llama-2-7b-hf \
--adafactor=False \
--output_max_length=128 \
--batch_size=8 \
--gradient_accumulation_steps=8 \
--batched_gen=True \
--ppo_epochs=4 \
--seed=0 \
--learning_rate=1.4e-5 \
--early_stopping=True \
--output_dir=llama-se-rl-finetune
```
To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL:

```
python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="llama-se-rl-finetune" --output_name="rl_merged_checkpoint"
```

### Running the model
We can load the PPO-trained LoRA adaptors which were saved by the PPO training step and run it through the [text-generation example](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation).

```
python run_generation.py \
--model_name_or_path ../trl/rl_merged_checkpoint/ \
--use_hpu_graphs --use_kv_cache --batch_size 1 --bf16 --max_new_tokens 100 \
--prompt "Here is my prompt"
```

312 changes: 312 additions & 0 deletions examples/trl/ppo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,312 @@
# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/rl_training.py, enable it for Gaudi2
from dataclasses import dataclass, field
from typing import List, Optional

import torch
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import Adafactor, AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser, pipeline
from trl import AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

from optimum.habana.accelerate import GaudiAccelerator
from optimum.habana.trl import GaudiPPOConfig, GaudiPPOTrainer, adapt_PreTrainedModelWrapper_to_gaudi
from optimum.habana.utils import set_seed


tqdm.pandas()


@dataclass
class ScriptArguments:
"""
The name of the Casual LM model we wish to fine with PPO
"""

# NOTE: gpt2 models use Conv1D instead of Linear layers which are not yet supported in 8 bit mode
# models like gpt-neo* models are more suitable.
model_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the model name"})
tokenizer_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the tokenizer name"})
reward_model_name: Optional[str] = field(default="", metadata={"help": "the reward model name"})
log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"})
learning_rate: Optional[float] = field(default=1.41e-5, metadata={"help": "the learning rate"})
output_max_length: Optional[int] = field(default=128, metadata={"help": "maximum output length for generation"})
input_max_length: Optional[int] = field(default=512, metadata={"help": "maximum input length for generation"})
mini_batch_size: Optional[int] = field(default=1, metadata={"help": "the PPO minibatch size"})
batch_size: Optional[int] = field(default=32, metadata={"help": "the batch size"})
ppo_epochs: Optional[int] = field(default=4, metadata={"help": "the number of ppo epochs"})
gradient_accumulation_steps: Optional[int] = field(
default=4, metadata={"help": "the number of gradient accumulation steps"}
)
adafactor: Optional[bool] = field(default=False, metadata={"help": "whether to use the adafactor optimizer"})
early_stopping: Optional[bool] = field(default=False, metadata={"help": "whether to early stop"})
target_kl: Optional[float] = field(default=0.1, metadata={"help": "kl target for early stopping"})
reward_baseline: Optional[float] = field(
default=0.0,
metadata={"help": "a baseline value that is subtracted from the reward"},
)
batched_gen: Optional[bool] = field(default=False, metadata={"help": "whether to use the batched text gen"})
save_freq: Optional[int] = field(default=None, metadata={"help": "n steps to save the model"})
output_dir: Optional[str] = field(default="runs/", metadata={"help": "n steps to save the model"})
seed: Optional[int] = field(default=0, metadata={"help": "the seed"})
steps: Optional[int] = field(default=20000, metadata={"help": "number of epochs"})
init_kl_coef: Optional[float] = field(
default=0.2,
metadata={"help": "Initial KL penalty coefficient (used for adaptive and linear control)"},
)

adap_kl_ctrl: Optional[bool] = field(default=True, metadata={"help": "Use adaptive KL control, otherwise linear"})
use_habana: Optional[bool] = field(default=True, metadata={"help": "use habana for RL training"})
lora_alpha: Optional[float] = field(default=32, metadata={"help": "the lora alpha parameter"})
lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
lora_r: Optional[int] = field(default=16, metadata={"help": "the lora r parameter"})
lora_target_modules: List[str] = field(
default_factory=lambda: None,
metadata={"help": "Target modules for the LoRA method."},
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": (
"For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
)
},
)


adapt_PreTrainedModelWrapper_to_gaudi()
parser = HfArgumentParser(ScriptArguments)
script_args: ScriptArguments = parser.parse_args_into_dataclasses()[0]
reward_model_name = script_args.reward_model_name
dataset_name = "lvwerra/stack-exchange-paired"
config = GaudiPPOConfig(
steps=script_args.steps,
model_name=script_args.model_name,
learning_rate=script_args.learning_rate,
log_with=script_args.log_with,
batch_size=script_args.batch_size,
mini_batch_size=script_args.mini_batch_size,
gradient_accumulation_steps=script_args.gradient_accumulation_steps,
optimize_cuda_cache=True,
early_stopping=script_args.early_stopping,
target_kl=script_args.target_kl,
ppo_epochs=script_args.ppo_epochs,
seed=script_args.seed,
init_kl_coef=script_args.init_kl_coef,
adap_kl_ctrl=script_args.adap_kl_ctrl,
use_habana=script_args.use_habana,
pad_max_len=script_args.input_max_length + script_args.output_max_length,
pad_max_input_len=script_args.input_max_length,
)

train_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/rl", split="train")
if script_args.max_train_samples is not None:
max_train_samples = min(len(train_dataset), script_args.max_train_samples)
train_dataset = train_dataset.select(range(max_train_samples))
original_columns = train_dataset.column_names

# We then define the arguments to pass to the sentiment analysis pipeline.
# We set `return_all_scores` to True to get the sentiment score for each token.
sent_kwargs = {
"return_all_scores": True,
"function_to_apply": "none",
"batch_size": 16,
"truncation": True,
}
if config.pad_for_acceleration:
sent_kwargs["padding"] = "max_length"
sent_kwargs["max_length"] = script_args.input_max_length + script_args.output_max_length

tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_name)
# GPT-2 tokenizer has a pad token, but it is not eos_token by default. We need to set it to eos_token.
# only for this model.

if getattr(tokenizer, "pad_token", None) is None:
tokenizer.pad_token = tokenizer.eos_token


# Below is an example function to build the dataset. In our case, we use the IMDB dataset
# from the `datasets` library. One should customize this function to train the model on
# its own dataset.
def build_dataset(
tokenizer,
dataset_name="lvwerra/stack-exchange-paired",
):
"""
Build dataset for training. This builds the dataset from `load_dataset`, one should
customize this function to train the model on its own dataset.

Args:
dataset_name (`str`):
The name of the dataset to be loaded.

Returns:
dataloader (`torch.utils.data.DataLoader`):
The dataloader for the dataset.
"""

num_proc = 24

def preprocess_function(examples):
new_examples = {
"query": [],
"input_ids": [],
}
for question in examples["question"]:
query = "Question: " + question + "\n\nAnswer: "
tokenized_question = tokenizer(query, truncation=True)
new_examples["query"].append(query)
new_examples["input_ids"].append(tokenized_question["input_ids"])

return new_examples

ds = train_dataset.map(
preprocess_function,
batched=True,
num_proc=num_proc,
remove_columns=original_columns,
)
ds = ds.filter(lambda x: len(x["input_ids"]) < 512, batched=False)

ds.set_format(type="torch")
return ds


# We retrieve the dataloader by calling the `build_dataset` function.
dataset = build_dataset(tokenizer)


def collator(data):
return {key: [d[key] for d in data] for key in data[0]}


# set seed before initializing value head for deterministic eval
set_seed(config.seed)

# Now let's build the model, the reference model, and the tokenizer.
current_device = GaudiAccelerator().local_process_index
lora_config = LoraConfig(
r=script_args.lora_r,
lora_alpha=script_args.lora_alpha,
lora_dropout=script_args.lora_dropout,
target_modules=script_args.lora_target_modules,
bias="none",
task_type="CAUSAL_LM",
)
model = AutoModelForCausalLMWithValueHead.from_pretrained(
config.model_name,
peft_config=lora_config,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
)

optimizer = None
model = model.to(torch.bfloat16)

if script_args.use_habana:
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(
config.model_name,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
)
Comment thread
sywangyi marked this conversation as resolved.
else:
ref_model = None
if script_args.adafactor:
optimizer = Adafactor(
filter(lambda p: p.requires_grad, model.parameters()),
scale_parameter=False,
relative_step=False,
warmup_init=False,
lr=config.learning_rate,
)
# We then build the PPOTrainer, passing the model, the reference model, the tokenizer
ppo_trainer = GaudiPPOTrainer(
config,
model,
ref_model=ref_model,
tokenizer=tokenizer,
dataset=dataset,
data_collator=collator,
optimizer=optimizer,
)

# We then build the sentiment analysis pipeline using our reward model, passing the
# model name and the sentiment analysis pipeline arguments. Let's also make sure to
# set the device to the same device as the PPOTrainer.
device = ppo_trainer.accelerator.device

reward_model = AutoModelForSequenceClassification.from_pretrained(
reward_model_name,
num_labels=1,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
)

if config.use_habana:
from habana_frameworks.torch.hpu import wrap_in_hpu_graph

reward_model = wrap_in_hpu_graph(reward_model)

if device.type == "hpu":
device = "hpu"

sentiment_pipe = pipeline(
"sentiment-analysis",
model=reward_model,
tokenizer=tokenizer,
return_token_type_ids=False,
device=device,
model_kwargs={
"low_cpu_mem_usage": True,
"torch_dtype": torch.bfloat16,
},
)

if sentiment_pipe.model.config.pad_token_id is None:
sentiment_pipe.model.config.pad_token_id = tokenizer.pad_token_id
# We then define the arguments to pass to the `generate` function. These arguments
# are passed to the `generate` function of the PPOTrainer, which is a wrapper around
# the `generate` function of the trained model.
generation_kwargs = {
# "min_length": -1,
"top_k": 0.0,
"top_p": 1.0,
"do_sample": True,
"pad_token_id": tokenizer.pad_token_id,
"eos_token_id": 100_000,
}
output_min_length = 32
output_max_length = script_args.output_max_length
if not config.pad_for_acceleration:
output_length_sampler = LengthSampler(output_min_length, output_max_length)
else:
output_length_sampler = LengthSampler(output_max_length, output_max_length + 1)
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
if epoch >= config.total_ppo_epochs:
break

question_tensors = batch["input_ids"]

response_tensors = ppo_trainer.generate(
question_tensors,
return_prompt=False,
length_sampler=output_length_sampler,
**generation_kwargs,
)
batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

# Compute reward score (using the sentiment analysis pipeline)
texts = [q + r for q, r in zip(batch["query"], batch["response"])]
pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
rewards = [torch.tensor(output[0]["score"] - script_args.reward_baseline) for output in pipe_outputs]

# Run PPO step
stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
ppo_trainer.log_stats(stats, batch, rewards)

if script_args.save_freq and epoch and epoch % script_args.save_freq == 0:
ppo_trainer.save_pretrained(script_args.output_dir + f"step_{epoch}")

ppo_trainer.save_pretrained(script_args.output_dir)
Loading