From 6102aa1c3a76675917f96569eab53d99bf39d2f1 Mon Sep 17 00:00:00 2001 From: "Wang, Yi A" Date: Wed, 27 Dec 2023 01:58:44 -0800 Subject: [PATCH 1/6] add PPO and stack_llama support Signed-off-by: Wang, Yi A --- examples/trl/stack_llama/README.md | 18 + .../trl/stack_llama/merge_peft_adapter.py | 50 + examples/trl/stack_llama/reward_modeling.py | 313 +++++++ examples/trl/stack_llama/rl_training.py | 294 ++++++ .../trl/stack_llama/supervised_finetuning.py | 215 +++++ optimum/habana/trl/__init__.py | 3 + optimum/habana/trl/models/__init__.py | 17 + optimum/habana/trl/models/modeling_base.py | 71 ++ optimum/habana/trl/trainer/__init__.py | 2 + optimum/habana/trl/trainer/ppo_config.py | 70 ++ optimum/habana/trl/trainer/ppo_trainer.py | 881 ++++++++++++++++++ 11 files changed, 1934 insertions(+) create mode 100644 examples/trl/stack_llama/README.md create mode 100644 examples/trl/stack_llama/merge_peft_adapter.py create mode 100644 examples/trl/stack_llama/reward_modeling.py create mode 100644 examples/trl/stack_llama/rl_training.py create mode 100644 examples/trl/stack_llama/supervised_finetuning.py create mode 100644 optimum/habana/trl/models/__init__.py create mode 100644 optimum/habana/trl/models/modeling_base.py create mode 100644 optimum/habana/trl/trainer/ppo_config.py create mode 100644 optimum/habana/trl/trainer/ppo_trainer.py diff --git a/examples/trl/stack_llama/README.md b/examples/trl/stack_llama/README.md new file mode 100644 index 0000000000..51a9728ed4 --- /dev/null +++ b/examples/trl/stack_llama/README.md @@ -0,0 +1,18 @@ +# RLHF pipeline for the creation of StackLLaMa: a Stack exchange llama-7b model. +There were three main steps to the training process: +1. Supervised fine-tuning of the base llama-7b model to create llama-7b-se: + - `torchrun --nnodes 1 --nproc_per_node 8 supervised_finetuning.py --model_path= --streaming --learning_rate 1e-5 --max_steps 5000 --bf16 --output_dir ./llama-se` +2. 
Reward modeling using dialog pairs from the SE dataset using the llama-7b-se to create llama-7b-se-rm: + - `torchrun --nnodes 1 --nproc_per_node 8 reward_modeling.py --model_name=` +3. RL fine-tuning of llama-7b-se with the llama-7b-se-rm reward model: + - `torchrun --nnodes 1 --nproc_per_node 8 rl_training.py --log_with=wandb --model_name= --reward_model_name= --adafactor=False --tokenizer_name= --save_freq=100 --output_max_length=128 --batch_size=8 --gradient_accumulation_steps=8 --batched_gen=True --ppo_epochs=4 --seed=0 --learning_rate=1.4e-5 --early_stopping=True --output_dir=llama-se-rl-finetune-128-8-8-1.4e-5_adam` + + +LoRA layers were used at all stages to reduce memory requirements. +At each stage the peft adapter layers were merged with the base model, using: +```shell +python merge_peft_adapter.py --adapter_model_name=XXX --base_model_name=YYY --output_name=ZZZ +``` +Note that this script requires `peft>=0.3.0`. + +For access to the base llama-7b model, please see Meta's [release](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) and [request form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform). diff --git a/examples/trl/stack_llama/merge_peft_adapter.py b/examples/trl/stack_llama/merge_peft_adapter.py new file mode 100644 index 0000000000..8913fc62a4 --- /dev/null +++ b/examples/trl/stack_llama/merge_peft_adapter.py @@ -0,0 +1,50 @@ +# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py. 
+# only difference is removal of model.push_to_hub +from dataclasses import dataclass, field +from typing import Optional + +import torch +from peft import PeftConfig, PeftModel +from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser + + +@dataclass +class ScriptArguments: + """ + The input names representing the Adapter and Base model fine-tuned with PEFT, and the output name representing the + merged model. + """ + + adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the adapter name"}) + base_model_name: Optional[str] = field(default=None, metadata={"help": "the base model name"}) + output_name: Optional[str] = field(default=None, metadata={"help": "the merged model name"}) + + +parser = HfArgumentParser(ScriptArguments) +script_args = parser.parse_args_into_dataclasses()[0] +assert script_args.adapter_model_name is not None, "please provide the name of the Adapter you would like to merge" +assert script_args.base_model_name is not None, "please provide the name of the Base model" +assert script_args.output_name is not None, "please provide the output name of the merged model" + +peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name) +if peft_config.task_type == "SEQ_CLS": + # The sequence classification task is used for the reward model in PPO + model = AutoModelForSequenceClassification.from_pretrained( + script_args.base_model_name, num_labels=1, torch_dtype=torch.bfloat16 + ) +else: + model = AutoModelForCausalLM.from_pretrained( + script_args.base_model_name, return_dict=True, torch_dtype=torch.bfloat16 + ) + +tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name) + +# Load the PEFT model +model = PeftModel.from_pretrained(model, script_args.adapter_model_name) +model.eval() + +model = model.merge_and_unload() + +model.save_pretrained(f"{script_args.output_name}") +tokenizer.save_pretrained(f"{script_args.output_name}") +# 
model.push_to_hub(f"{script_args.output_name}", use_temp_dir=False) diff --git a/examples/trl/stack_llama/reward_modeling.py b/examples/trl/stack_llama/reward_modeling.py new file mode 100644 index 0000000000..32ce0faf50 --- /dev/null +++ b/examples/trl/stack_llama/reward_modeling.py @@ -0,0 +1,313 @@ +# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/reward_modeling.py, enable it for Gaudi2 + +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Union + +import evaluate +import numpy as np +import torch +import torch.nn as nn +from datasets import load_dataset +from peft import LoraConfig, TaskType, get_peft_model +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + HfArgumentParser, + PreTrainedTokenizerBase, + TrainerCallback, +) +from transformers.utils import PaddingStrategy + +from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments + + +# Define and parse arguments. +@dataclass +class ScriptArguments: + """ + These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train. + """ + + local_rank: Optional[int] = field(default=-1, metadata={"help": "Used for multi-gpu"}) + resume_from_checkpoint: Optional[bool] = field( + default=False, + metadata={"help": "If you want to resume training where it left off."}, + ) + deepspeed: Optional[str] = field( + default=None, + metadata={ + "help": "Path to deepspeed config if using deepspeed. You may need this if the model that you want to train doesn't fit on a single GPU." 
+ }, + ) + per_device_train_batch_size: Optional[int] = field(default=4) + per_device_eval_batch_size: Optional[int] = field(default=1) + gradient_accumulation_steps: Optional[int] = field(default=1) + learning_rate: Optional[float] = field(default=2e-5) + weight_decay: Optional[float] = field(default=0.001) + model_name: Optional[str] = field( + default="gpt2", + metadata={ + "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc." + }, + ) + tokenizer_name: Optional[str] = field( + default=None, + metadata={ + "help": "The tokenizer for your model, if left empty will use the default for your model", + }, + ) + bf16: Optional[bool] = field( + default=True, + metadata={ + "help": "This essentially cuts the training time in half if you want to sacrifice a little precision and have a supported GPU." + }, + ) + num_train_epochs: Optional[int] = field( + default=1, + metadata={"help": "The number of training epochs for the reward model."}, + ) + train_subset: Optional[int] = field( + default=100000, + metadata={"help": "The size of the subset of the training data to use"}, + ) + eval_subset: Optional[int] = field( + default=50000, + metadata={"help": "The size of the subset of the eval data to use"}, + ) + gradient_checkpointing: Optional[bool] = field( + default=False, + metadata={"help": "Enables gradient checkpointing."}, + ) + optim: Optional[str] = field( + default="adamw_hf", + metadata={"help": "The optimizer to use."}, + ) + lr_scheduler_type: Optional[str] = field( + default="linear", + metadata={"help": "The lr scheduler"}, + ) + max_length: Optional[int] = field(default=512) + eval_first_step: Optional[bool] = field( + default=False, + metadata={"help": "Whether to run eval after the first step"}, + ) + + +parser = HfArgumentParser(ScriptArguments) +script_args = parser.parse_args_into_dataclasses()[0] + +# Load the human stack-exchange-paired dataset for tuning the reward model. 
+# Training pairs come from the "data/reward" directory of the dataset repository.
+train_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/reward", split="train")
+if script_args.train_subset > 0:
+    # Keep only the first `train_subset` rows. The size is clamped to the split length so that an
+    # oversized --train_subset does not make `select` fail on out-of-range indices.
+    train_dataset = train_dataset.select(range(min(script_args.train_subset, len(train_dataset))))
+# Evaluation pairs live in "data/evaluation"; that directory is also read through its "train" split.
+eval_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/evaluation", split="train")
+if script_args.eval_subset > 0:
+    # Same clamping as for the training subset; a value <= 0 keeps the whole split.
+    eval_dataset = eval_dataset.select(range(min(script_args.eval_subset, len(eval_dataset))))
+# Define the training args. Needs to be done before the model is loaded if you are using deepspeed.
+# The output name is derived from the last path component of the model name, the requested
+# training subset size and the learning rate; it is also the prefix of the final checkpoint directory.
+model_name_split = script_args.model_name.split("/")[-1]
+output_name = (
+    f"{model_name_split}_peft_stack-exchange-paired_rmts__{script_args.train_subset}_{script_args.learning_rate}"
+)
+
+training_args = GaudiTrainingArguments(
+    output_dir=output_name,
+    learning_rate=script_args.learning_rate,
+    per_device_train_batch_size=script_args.per_device_train_batch_size,
+    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
+    num_train_epochs=script_args.num_train_epochs,
+    weight_decay=script_args.weight_decay,
+    # Evaluate and checkpoint on the same fixed cadence of 500 steps.
+    evaluation_strategy="steps",
+    eval_steps=500,
+    save_strategy="steps",
+    save_steps=500,
+    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
+    gradient_checkpointing=script_args.gradient_checkpointing,
+    deepspeed=script_args.deepspeed,
+    local_rank=script_args.local_rank,
+    # NOTE(review): presumably required because the dataset only carries the custom *_j / *_k
+    # columns consumed by RewardDataCollatorWithPadding and has no label column -- confirm.
+    remove_unused_columns=False,
+    label_names=[],
+    bf16=script_args.bf16,
+    logging_strategy="steps",
+    logging_steps=10,
+    optim=script_args.optim,
+    lr_scheduler_type=script_args.lr_scheduler_type,
+    report_to="none",
+    # Gaudi-specific switches of GaudiTrainingArguments.
+    use_habana=True,
+    use_lazy_mode=True,
+)
+# Load the value-head model and tokenizer.
+# Fall back to the model name when no dedicated tokenizer was requested.
+tokenizer_name = script_args.tokenizer_name if script_args.tokenizer_name is not None else script_args.model_name
+tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_auth_token=True)
+# Need to do this for gpt2, because it doesn't have an official pad token.
+tokenizer.pad_token = tokenizer.eos_token
+
+
+# LoRA adapter for a sequence-classification (reward) head.
+peft_config = LoraConfig(
+    task_type=TaskType.SEQ_CLS,
+    inference_mode=False,
+    r=8,
+    lora_alpha=32,
+    lora_dropout=0.1,
+)
+# NOTE: autograd anomaly detection is deliberately not enabled here; it is a debugging aid that
+# adds checks to every backward pass and slows training down.
+# A single output label turns the classifier into a scalar reward model.
+model = AutoModelForSequenceClassification.from_pretrained(
+    script_args.model_name, num_labels=1, torch_dtype=torch.bfloat16
+)
+
+model = get_peft_model(model, peft_config)
+model.print_trainable_parameters()
+
+# The model must pad with the same token the tokenizer uses (eos, see above).
+model.config.pad_token_id = tokenizer.eos_token_id
+# The KV cache is switched off whenever gradient checkpointing is requested.
+model.config.use_cache = not script_args.gradient_checkpointing
+num_proc = 24  # Can adjust to be higher if you have more processors.
+# Raw columns to drop once the tokenized *_j / *_k columns have been produced.
+original_columns = train_dataset.column_names
+
+
+# Turn the dataset into pairs of question + answer texts, where the *_j text uses the preferred answer
+# and the *_k text uses the other one. Then tokenize the dataset.
+def preprocess_function(examples):
+    """Tokenize one batch of (question, response_j, response_k) triples.
+
+    Each question is rendered twice with the "Question: ... Answer: ..." template, once with
+    response_j and once with response_k, and both texts are tokenized with truncation enabled.
+
+    Args:
+        examples: batch mapping with the "question", "response_j" and "response_k" columns.
+
+    Returns:
+        A dict of lists holding the input ids and attention masks of the j and k variants.
+    """
+    batch = {
+        "input_ids_j": [],
+        "attention_mask_j": [],
+        "input_ids_k": [],
+        "attention_mask_k": [],
+    }
+    triples = zip(examples["question"], examples["response_j"], examples["response_k"])
+    for question, answer_j, answer_k in triples:
+        # Both variants share the same prompt prefix; only the answer differs.
+        prompt = "Question: " + question + "\n\nAnswer: "
+        encoded_j = tokenizer(prompt + answer_j, truncation=True)
+        encoded_k = tokenizer(prompt + answer_k, truncation=True)
+        for suffix, encoded in (("j", encoded_j), ("k", encoded_k)):
+            batch["input_ids_" + suffix].append(encoded["input_ids"])
+            batch["attention_mask_" + suffix].append(encoded["attention_mask"])
+
+    return batch
+
+
+def _fits_max_length(example):
+    """Return True when both tokenized variants have at most script_args.max_length tokens."""
+    limit = script_args.max_length
+    return len(example["input_ids_j"]) <= limit and len(example["input_ids_k"]) <= limit
+
+
+def _tokenize_and_filter(dataset):
+    """Tokenize a raw split in batches, drop its original columns, then drop over-long pairs."""
+    tokenized = dataset.map(
+        preprocess_function,
+        batched=True,
+        num_proc=num_proc,
+        remove_columns=original_columns,
+    )
+    return tokenized.filter(_fits_max_length)
+
+
+# preprocess the dataset and filter out QAs that are longer than script_args.max_length
+train_dataset = _tokenize_and_filter(train_dataset)
+eval_dataset = _tokenize_and_filter(eval_dataset)
+
+
+# We need to define a special data collator that batches the data in our j vs k format.
+@dataclass +class RewardDataCollatorWithPadding: + tokenizer: PreTrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + return_tensors: str = "pt" + + def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: + features_j = [] + features_k = [] + for feature in features: + features_j.append( + { + "input_ids": feature["input_ids_j"], + "attention_mask": feature["attention_mask_j"], + } + ) + features_k.append( + { + "input_ids": feature["input_ids_k"], + "attention_mask": feature["attention_mask_k"], + } + ) + batch_j = self.tokenizer.pad( + features_j, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + ) + batch_k = self.tokenizer.pad( + features_k, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + ) + batch = { + "input_ids_j": batch_j["input_ids"], + "attention_mask_j": batch_j["attention_mask"], + "input_ids_k": batch_k["input_ids"], + "attention_mask_k": batch_k["attention_mask"], + "return_loss": True, + } + return batch + + +# Define the metric that we'll use for validation. +accuracy = evaluate.load("accuracy") + + +def compute_metrics(eval_pred): + predictions, _ = eval_pred + # Here, predictions is rewards_j and rewards_k. + # We want to see how much of the time rewards_j > rewards_k. + predictions = np.argmax(predictions, axis=0) + labels = np.zeros(predictions.shape) + return accuracy.compute(predictions=predictions, references=labels) + + +class RewardTrainer(GaudiTrainer): + # Define how to compute the reward loss. 
We use the InstructGPT pairwise logloss: https://arxiv.org/abs/2203.02155 + def compute_loss(self, model, inputs, return_outputs=False): + rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0] + rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0] + loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean() + if return_outputs: + return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k} + return loss + + +gaudi_config = GaudiConfig() +gaudi_config.use_fused_adam = True +gaudi_config.use_fused_clip_norm = True + +# Train the model, woohoo. +trainer = RewardTrainer( + model=model, + gaudi_config=gaudi_config, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + compute_metrics=compute_metrics, + data_collator=RewardDataCollatorWithPadding( + tokenizer=tokenizer, max_length=script_args.max_length, padding="max_length" + ), +) + + +if script_args.eval_first_step: + + class EvaluateFirstStepCallback(TrainerCallback): + def on_step_end(self, args, state, control, **kwargs): + if state.global_step == 1: + control.should_evaluate = True + + trainer.add_callback(EvaluateFirstStepCallback()) + +trainer.train(script_args.resume_from_checkpoint) + +print("Saving last checkpoint of the model") +trainer.save_model(output_name + "_peft_last_checkpoint") diff --git a/examples/trl/stack_llama/rl_training.py b/examples/trl/stack_llama/rl_training.py new file mode 100644 index 0000000000..53ec5b7251 --- /dev/null +++ b/examples/trl/stack_llama/rl_training.py @@ -0,0 +1,294 @@ +# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/rl_training.py, enable it for Gaudi2 +from dataclasses import dataclass, field +from typing import Optional + +import torch +from datasets import load_dataset +from peft import LoraConfig +from tqdm import tqdm +from transformers import Adafactor, AutoModelForSequenceClassification, 
AutoTokenizer, HfArgumentParser, pipeline +from trl import AutoModelForCausalLMWithValueHead +from trl.core import LengthSampler + +from optimum.habana.accelerate import GaudiAccelerator +from optimum.habana.trl import GaudiPPOConfig, GaudiPPOTrainer, adapt_PreTrainedModelWrapper_to_gaudi +from optimum.habana.utils import set_seed + + +tqdm.pandas() + + +@dataclass +class ScriptArguments: + """ + The name of the Causal LM model we wish to fine-tune with PPO + """ + + # NOTE: gpt2 models use Conv1D instead of Linear layers which are not yet supported in 8 bit mode + # models like gpt-neo* models are more suitable. + model_name: Optional[str] = field(default="", metadata={"help": "the model name"}) + tokenizer_name: Optional[str] = field(default="", metadata={"help": "the tokenizer name"}) + reward_model_name: Optional[str] = field(default="", metadata={"help": "the reward model name"}) + log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"}) + learning_rate: Optional[float] = field(default=1.41e-5, metadata={"help": "the learning rate"}) + output_max_length: Optional[int] = field(default=128, metadata={"help": "maximum length for generation"}) + mini_batch_size: Optional[int] = field(default=1, metadata={"help": "the PPO minibatch size"}) + batch_size: Optional[int] = field(default=32, metadata={"help": "the batch size"}) + ppo_epochs: Optional[int] = field(default=4, metadata={"help": "the number of ppo epochs"}) + gradient_accumulation_steps: Optional[int] = field( + default=4, metadata={"help": "the number of gradient accumulation steps"} + ) + adafactor: Optional[bool] = field(default=False, metadata={"help": "whether to use the adafactor optimizer"}) + early_stopping: Optional[bool] = field(default=False, metadata={"help": "whether to early stop"}) + target_kl: Optional[float] = field(default=0.1, metadata={"help": "kl target for early stopping"}) + reward_baseline: Optional[float] = field( + default=0.0, 
metadata={"help": "a baseline value that is subtracted from the reward"}, + ) + batched_gen: Optional[bool] = field(default=False, metadata={"help": "whether to use the batched text gen"}) + save_freq: Optional[int] = field(default=None, metadata={"help": "n steps to save the model"}) + output_dir: Optional[str] = field(default="runs/", metadata={"help": "n steps to save the model"}) + seed: Optional[int] = field(default=0, metadata={"help": "the seed"}) + steps: Optional[int] = field(default=20000, metadata={"help": "number of epochs"}) + init_kl_coef: Optional[float] = field( + default=0.2, + metadata={"help": "Initial KL penalty coefficient (used for adaptive and linear control)"}, + ) + + adap_kl_ctrl: Optional[bool] = field(default=True, metadata={"help": "Use adaptive KL control, otherwise linear"}) + use_habana: Optional[bool] = field(default=True, metadata={"help": "use habana for RL training"}) + + +adapt_PreTrainedModelWrapper_to_gaudi() +parser = HfArgumentParser(ScriptArguments) +script_args: ScriptArguments = parser.parse_args_into_dataclasses()[0] +reward_model_name = script_args.reward_model_name +dataset_name = "lvwerra/stack-exchange-paired" +config = GaudiPPOConfig( + steps=script_args.steps, + model_name=script_args.model_name, + learning_rate=script_args.learning_rate, + log_with=script_args.log_with, + batch_size=script_args.batch_size, + mini_batch_size=script_args.mini_batch_size, + gradient_accumulation_steps=script_args.gradient_accumulation_steps, + optimize_cuda_cache=True, + early_stopping=script_args.early_stopping, + target_kl=script_args.target_kl, + ppo_epochs=script_args.ppo_epochs, + seed=script_args.seed, + init_kl_coef=script_args.init_kl_coef, + adap_kl_ctrl=script_args.adap_kl_ctrl, + use_habana=script_args.use_habana, + pad_for_acceleration=script_args.use_habana, + pad_max_len=512 + script_args.output_max_length, + pad_max_input_len=512, +) + +train_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/rl", 
split="train") +train_dataset = train_dataset.select(range(100000)) +original_columns = train_dataset.column_names + +# We then define the arguments to pass to the sentiment analysis pipeline. +# `return_all_scores` and `function_to_apply="none"` make it return the raw score of every label. +sent_kwargs = { + "return_all_scores": True, + "function_to_apply": "none", + "batch_size": 16, + "truncation": True, +} +if config.pad_for_acceleration: + sent_kwargs["padding"] = "max_length" + sent_kwargs["max_length"] = 512 + script_args.output_max_length + +tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_name) +# Some tokenizers (e.g. GPT-2) do not define a pad token. +# In that case we fall back to eos_token. + +if getattr(tokenizer, "pad_token", None) is None: + tokenizer.pad_token = tokenizer.eos_token + + +# Below is an example function to build the dataset. In our case, we use the +# `lvwerra/stack-exchange-paired` dataset loaded with the `datasets` library. One should customize +# this function to train the model on its own dataset. +def build_dataset( + tokenizer, + dataset_name="lvwerra/stack-exchange-paired", +): + """ + Build dataset for training. This builds the dataset from `load_dataset`, one should + customize this function to train the model on its own dataset. + + Args: + dataset_name (`str`): + The name of the dataset to be loaded. + + Returns: + ds (`datasets.Dataset`): + The tokenized dataset, restricted to prompts shorter than 512 tokens. 
+ """ + + num_proc = 24 + + def preprocess_function(examples): + new_examples = { + "query": [], + "input_ids": [], + } + for question in examples["question"]: + query = "Question: " + question + "\n\nAnswer: " + tokenized_question = tokenizer(query, truncation=True) + new_examples["query"].append(query) + new_examples["input_ids"].append(tokenized_question["input_ids"]) + + return new_examples + + ds = train_dataset.map( + preprocess_function, + batched=True, + num_proc=num_proc, + remove_columns=original_columns, + ) + ds = ds.filter(lambda x: len(x["input_ids"]) < 512, batched=False) + + ds.set_format(type="torch") + return ds + + +# We retrieve the dataloader by calling the `build_dataset` function. +dataset = build_dataset(tokenizer) + + +def collator(data): + return {key: [d[key] for d in data] for key in data[0]} + + +# set seed before initializing value head for deterministic eval +set_seed(config.seed) + +# Now let's build the model, the reference model, and the tokenizer. +current_device = GaudiAccelerator().local_process_index +lora_config = LoraConfig( + r=16, + lora_alpha=32, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) +model = AutoModelForCausalLMWithValueHead.from_pretrained( + config.model_name, + peft_config=lora_config, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, +) + +optimizer = None +model = model.to(torch.bfloat16) + +if script_args.use_habana: + ref_model = AutoModelForCausalLMWithValueHead.from_pretrained( + config.model_name, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + ) +else: + ref_model = None +if script_args.adafactor: + optimizer = Adafactor( + filter(lambda p: p.requires_grad, model.parameters()), + scale_parameter=False, + relative_step=False, + warmup_init=False, + lr=config.learning_rate, + ) +# We then build the PPOTrainer, passing the model, the reference model, the tokenizer +ppo_trainer = GaudiPPOTrainer( + config, + model, + ref_model=ref_model, + tokenizer=tokenizer, + 
dataset=dataset, + data_collator=collator, + optimizer=optimizer, +) + +# We then build the sentiment analysis pipeline using our reward model, passing the +# model name and the sentiment analysis pipeline arguments. Let's also make sure to +# set the device to the same device as the PPOTrainer. +device = ppo_trainer.accelerator.device +if ppo_trainer.accelerator.num_processes == 1 and torch.cuda.is_available(): + device = 0 + +reward_model = AutoModelForSequenceClassification.from_pretrained( + reward_model_name, + num_labels=1, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, +) + +if config.use_habana: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + reward_model = wrap_in_hpu_graph(reward_model) + +if device.type == "hpu": + device = "hpu" + +sentiment_pipe = pipeline( + "sentiment-analysis", + model=reward_model, + tokenizer=tokenizer, + return_token_type_ids=False, + device=device, + model_kwargs={ + "low_cpu_mem_usage": True, + "torch_dtype": torch.bfloat16, + }, +) + +if sentiment_pipe.model.config.pad_token_id is None: + sentiment_pipe.model.config.pad_token_id = tokenizer.pad_token_id +# We then define the arguments to pass to the `generate` function. These arguments +# are passed to the `generate` function of the PPOTrainer, which is a wrapper around +# the `generate` function of the trained model. 
+generation_kwargs = { + # "min_length": -1, + "top_k": 0.0, + "top_p": 1.0, + "do_sample": True, + "pad_token_id": tokenizer.pad_token_id, + "eos_token_id": 100_000, +} +output_min_length = 32 +output_max_length = script_args.output_max_length +if not config.pad_for_acceleration: + output_length_sampler = LengthSampler(output_min_length, output_max_length) +else: + output_length_sampler = LengthSampler(output_max_length, output_max_length + 1) + +for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)): + if epoch >= config.total_ppo_epochs: + break + + question_tensors = batch["input_ids"] + + response_tensors = ppo_trainer.generate( + question_tensors, + return_prompt=False, + length_sampler=output_length_sampler, + **generation_kwargs, + ) + batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True) + + # Compute reward score (using the sentiment analysis pipeline) + texts = [q + r for q, r in zip(batch["query"], batch["response"])] + pipe_outputs = sentiment_pipe(texts, **sent_kwargs) + rewards = [torch.tensor(output[0]["score"] - script_args.reward_baseline) for output in pipe_outputs] + + # Run PPO step + stats = ppo_trainer.step(question_tensors, response_tensors, rewards) + ppo_trainer.log_stats(stats, batch, rewards) + + if script_args.save_freq and epoch and epoch % script_args.save_freq == 0: + ppo_trainer.save_pretrained(script_args.output_dir + f"step_{epoch}") diff --git a/examples/trl/stack_llama/supervised_finetuning.py b/examples/trl/stack_llama/supervised_finetuning.py new file mode 100644 index 0000000000..a61bca6e3b --- /dev/null +++ b/examples/trl/stack_llama/supervised_finetuning.py @@ -0,0 +1,215 @@ +# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/supervised_finetuning.py, enable it for Gaudi2 + +import argparse +import os + +import torch +from datasets import load_dataset +from peft import LoraConfig +from tqdm import tqdm +from transformers import 
AutoModelForCausalLM, AutoTokenizer, logging, set_seed +from trl.trainer import ConstantLengthDataset + +from optimum.habana import GaudiConfig, GaudiTrainingArguments +from optimum.habana.trl import GaudiSFTTrainer + + +""" +Fine-Tune Llama-7b on SE paired dataset +""" + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model_path", type=str, default="") + parser.add_argument("--dataset_name", type=str, default="lvwerra/stack-exchange-paired") + parser.add_argument("--subset", type=str, default="data/finetune") + parser.add_argument("--split", type=str, default="train") + parser.add_argument("--size_valid_set", type=int, default=4000) + parser.add_argument("--streaming", action="store_true") + parser.add_argument("--shuffle_buffer", type=int, default=5000) + + parser.add_argument("--seq_length", type=int, default=1024) + parser.add_argument("--max_steps", type=int, default=10000) + parser.add_argument("--batch_size", type=int, default=4) + parser.add_argument("--gradient_accumulation_steps", type=int, default=1) + parser.add_argument("--eos_token_id", type=int, default=49152) + + parser.add_argument("--learning_rate", type=float, default=1e-4) + parser.add_argument("--lr_scheduler_type", type=str, default="cosine") + parser.add_argument("--num_warmup_steps", type=int, default=100) + parser.add_argument("--weight_decay", type=float, default=0.05) + + parser.add_argument("--local_rank", type=int, default=0) + parser.add_argument("--fp16", action="store_true", default=False) + parser.add_argument("--bf16", action="store_true", default=False) + parser.add_argument("--gradient_checkpointing", action="store_true", default=False) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--num_workers", type=int, default=None) + parser.add_argument("--output_dir", type=str, default="./checkpoints") + parser.add_argument("--log_freq", default=1, type=int) + parser.add_argument("--eval_freq", default=1000, type=int) + 
parser.add_argument("--save_freq", default=1000, type=int) + + return parser.parse_args() + + +def chars_token_ratio(dataset, tokenizer, nb_examples=400): + """ + Estimate the average number of characters per token in the dataset. + """ + total_characters, total_tokens = 0, 0 + for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples): + text = prepare_sample_text(example) + total_characters += len(text) + if tokenizer.is_fast: + total_tokens += len(tokenizer(text).tokens()) + else: + total_tokens += len(tokenizer.tokenize(text)) + + return total_characters / total_tokens + + +def print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + + +def prepare_sample_text(example): + """Prepare the text from a sample of the dataset.""" + text = f"Question: {example['question']}\n\nAnswer: {example['response_j']}" + return text + + +def create_datasets(tokenizer, args): + dataset = load_dataset( + args.dataset_name, + data_dir=args.subset, + split=args.split, + use_auth_token=True, + num_proc=args.num_workers if not args.streaming else None, + streaming=args.streaming, + ) + if args.streaming: + print("Loading the dataset in streaming mode") + valid_data = dataset.take(args.size_valid_set) + train_data = dataset.skip(args.size_valid_set) + train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=args.seed) + else: + dataset = dataset.train_test_split(test_size=0.005, seed=args.seed) + train_data = dataset["train"] + valid_data = dataset["test"] + print(f"Size of the train set: {len(train_data)}. 
Size of the validation set: {len(valid_data)}") + + chars_per_token = chars_token_ratio(train_data, tokenizer) + print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}") + + train_dataset = ConstantLengthDataset( + tokenizer, + train_data, + formatting_func=prepare_sample_text, + infinite=True, + seq_length=args.seq_length, + chars_per_token=chars_per_token, + ) + valid_dataset = ConstantLengthDataset( + tokenizer, + valid_data, + formatting_func=prepare_sample_text, + infinite=False, + seq_length=args.seq_length, + chars_per_token=chars_per_token, + ) + return train_dataset, valid_dataset + + +def run_training(args, train_data, val_data): + print("Loading the model") + + lora_config = LoraConfig( + r=16, + lora_alpha=32, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + train_data.start_iteration = 0 + + print("Starting main loop") + + training_args = GaudiTrainingArguments( + output_dir=args.output_dir, + dataloader_drop_last=True, + evaluation_strategy="steps", + max_steps=args.max_steps, + eval_steps=args.eval_freq, + save_steps=args.save_freq, + logging_steps=args.log_freq, + per_device_train_batch_size=args.batch_size, + per_device_eval_batch_size=args.batch_size, + learning_rate=args.learning_rate, + lr_scheduler_type=args.lr_scheduler_type, + warmup_steps=args.num_warmup_steps, + gradient_accumulation_steps=args.gradient_accumulation_steps, + gradient_checkpointing=args.gradient_checkpointing, + fp16=args.fp16, + bf16=args.bf16, + weight_decay=args.weight_decay, + run_name="llama-7b-finetuned", + report_to="none", + ddp_find_unused_parameters=False, + use_habana=True, + use_lazy_mode=True, + ) + model = AutoModelForCausalLM.from_pretrained(args.model_path, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16) + + gaudi_config = GaudiConfig() + gaudi_config.use_fused_adam = True + gaudi_config.use_fused_clip_norm = True + + trainer = GaudiSFTTrainer( + model=model, + gaudi_config=gaudi_config, + args=training_args, + 
train_dataset=train_data, + eval_dataset=val_data, + peft_config=lora_config, + packing=True, + ) + + print_trainable_parameters(trainer.model) + + print("Training...") + trainer.train() + + print("Saving last checkpoint of the model") + trainer.save_model(os.path.join(args.output_dir, "final_checkpoint/")) + + +def main(args): + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + train_dataset, eval_dataset = create_datasets(tokenizer, args) + run_training(args, train_dataset, eval_dataset) + + +if __name__ == "__main__": + args = get_args() + assert args.model_path != "", "Please provide the llama model path" + + set_seed(args.seed) + os.makedirs(args.output_dir, exist_ok=True) + + logging.set_verbosity_error() + + main(args) diff --git a/optimum/habana/trl/__init__.py b/optimum/habana/trl/__init__.py index e80fac8b8a..90c9624dc1 100644 --- a/optimum/habana/trl/__init__.py +++ b/optimum/habana/trl/__init__.py @@ -1,2 +1,5 @@ +from .models.modeling_base import adapt_PreTrainedModelWrapper_to_gaudi from .trainer.dpo_trainer import GaudiDPOTrainer +from .trainer.ppo_config import GaudiPPOConfig +from .trainer.ppo_trainer import GaudiPPOTrainer from .trainer.sft_trainer import GaudiSFTTrainer diff --git a/optimum/habana/trl/models/__init__.py b/optimum/habana/trl/models/__init__.py new file mode 100644 index 0000000000..22bf871003 --- /dev/null +++ b/optimum/habana/trl/models/__init__.py @@ -0,0 +1,17 @@ +# flake8: noqa + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling_base import adapt_PreTrainedModelWrapper_to_gaudi diff --git a/optimum/habana/trl/models/modeling_base.py b/optimum/habana/trl/models/modeling_base.py new file mode 100644 index 0000000000..ad02ffd1a7 --- /dev/null +++ b/optimum/habana/trl/models/modeling_base.py @@ -0,0 +1,71 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os + +import torch +from trl import PreTrainedModelWrapper +from trl.import_utils import is_npu_available, is_xpu_available + +from optimum.habana.accelerate import GaudiPartialState as PartialState +from optimum.habana.utils import to_device_dtype + + +def adapt_PreTrainedModelWrapper_to_gaudi(): + PreTrainedModelWrapper._get_current_device = gaudi_get_current_device + PreTrainedModelWrapper.save_pretrained = gaudi_save_pretrained + + +def gaudi_get_current_device(): + """ + Copied from PreTrainedModelWrapper._get_current_device: https://github.com/huggingface/trl/blob/v0.7.6/trl/models/modeling_base.py#L392 + - add hpu device + """ + state = PartialState() + if is_xpu_available(): + return f"xpu:{state.local_process_index}" + elif is_npu_available(): + return f"npu:{state.local_process_index}" + elif hasattr(torch, "hpu") and torch.hpu.is_available(): + return "hpu" + else: + return state.local_process_index if torch.cuda.is_available() else "cpu" + + +def gaudi_save_pretrained(self, *args, **kwargs): + """ + Copied from PreTrainedModelWrapper.save_pretrained: https://github.com/huggingface/trl/blob/v0.7.6/trl/models/modeling_base.py#L528 + - to cpu if model dict is in hpu + """ + state_dict = kwargs.get("state_dict") + if state_dict is None: + state_dict = self.state_dict() + kwargs["state_dict"] = state_dict + + if self.__class__._get_current_device() == "hpu": + state_dict = to_device_dtype(state_dict, target_device=torch.device("cpu")) + + # if it is a peft model only save the `v_head` state_dict and + # pop the `state_dict` from the kwargs to avoid slient bugs with `peft` + if self.is_peft_model: + save_path = args[0] + save_path = os.path.join(save_path, "pytorch_model.bin") + torch.save(state_dict, save_path) + _ = kwargs.pop("state_dict", None) + + if self.__class__._get_current_device() == "hpu": + state_dict = self.pretrained_model.state_dict() + state_dict = to_device_dtype(state_dict, target_device=torch.device("cpu")) + kwargs["state_dict"] = 
state_dict + + return self.pretrained_model.save_pretrained(*args, **kwargs) diff --git a/optimum/habana/trl/trainer/__init__.py b/optimum/habana/trl/trainer/__init__.py index 13bf554fd7..b17262fc82 100644 --- a/optimum/habana/trl/trainer/__init__.py +++ b/optimum/habana/trl/trainer/__init__.py @@ -19,3 +19,5 @@ from .sft_trainer import GaudiSFTTrainer from .dpo_trainer import GaudiDPOTrainer +from .ppo_config import GaudiPPOConfig +from .ppo_trainer import GaudiPPOTrainer diff --git a/optimum/habana/trl/trainer/ppo_config.py b/optimum/habana/trl/trainer/ppo_config.py new file mode 100644 index 0000000000..03bf06dbca --- /dev/null +++ b/optimum/habana/trl/trainer/ppo_config.py @@ -0,0 +1,70 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass + +import numpy as np +from trl import PPOConfig, is_wandb_available +from trl.trainer.utils import exact_div + + +@dataclass +class GaudiPPOConfig(PPOConfig): + """ + Configuration class for GaudiPPOTrainer + """ + + use_habana: bool = False + """Use habana. Only applicable if use_habana is True""" + pad_for_acceleration: bool = False + """Use pad_for_acceleration. Only applicable if pad_for_acceleration is True""" + pad_max_len: int = 0 + """Use pad_for_acceleration. 
Only applicable if pad_for_acceleration is True""" + pad_max_input_len: int = 0 + + def __post_init__(self): + self.backward_batch_size = self.mini_batch_size * self.gradient_accumulation_steps + exact_div( + self.batch_size, + self.backward_batch_size, + "`batch_size`", + "`mini_batch_size * gradient_accumulation_steps`", + "`batch_size` must be a multiple of `mini_batch_size * gradient_accumulation_steps`", + ) + self.total_ppo_epochs = int(np.ceil(self.steps / self.batch_size)) + + # check if wandb is installed + if self.log_with == "wandb": + # raise error if wandb is not installed + if not is_wandb_available(): + raise ImportError( + "Please install wandb to use wandb logging. You can do this by running `pip install wandb`." + ) + + if self.pad_for_acceleration: + if self.pad_max_input_len == 0: + raise AssertionError("pad_max_input_len ({self.pad_max_input_len}) must be set for pad input ") + if self.pad_max_input_len >= self.pad_max_len: + raise AssertionError( + "pad_max_input_len ({self.pad_max_input_len}) must be smaller " + " then pad_max_len ({self.pad_max_len})" + ) + + if self.use_habana: + from optimum.habana.transformers.modeling_utils import ( # pylint: disable=E0611, E0401 + adapt_transformers_to_gaudi, + ) + + adapt_transformers_to_gaudi() + + assert self.kl_penalty in ["kl", "abs", "mse", "full"] diff --git a/optimum/habana/trl/trainer/ppo_trainer.py b/optimum/habana/trl/trainer/ppo_trainer.py new file mode 100644 index 0000000000..5ef2745e1e --- /dev/null +++ b/optimum/habana/trl/trainer/ppo_trainer.py @@ -0,0 +1,881 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +import time +import typing +import warnings +from contextlib import nullcontext +from typing import Callable, List, Optional, Union + +import numpy as np +import torch +from accelerate.utils import ProjectConfiguration +from datasets import Dataset +from torch.optim import Adam +from transformers import ( + DataCollatorForLanguageModeling, + PreTrainedTokenizer, + PreTrainedTokenizerBase, + PreTrainedTokenizerFast, +) +from trl import PPOTrainer +from trl.core import ( + WANDB_PADDING, + PPODecorators, + convert_to_scalar, + logprobs_from_logits, + stack_dicts, + stats_to_np, +) +from trl.import_utils import is_npu_available, is_torch_greater_2_0, is_xpu_available +from trl.models import SUPPORTED_ARCHITECTURES, PreTrainedModelWrapper, create_reference_model +from trl.trainer import AdaptiveKLController, BaseTrainer, FixedKLController, RunningMoments + +from optimum.habana.utils import set_seed + +from . 
import GaudiPPOConfig + + +class GaudiPPOTrainer(PPOTrainer, BaseTrainer): + def __init__( + self, + config: GaudiPPOConfig = None, + model: PreTrainedModelWrapper = None, + ref_model: Optional[PreTrainedModelWrapper] = None, + tokenizer: PreTrainedTokenizerBase = None, + dataset: Optional[Union[torch.utils.data.Dataset, Dataset]] = None, + optimizer: Optional[torch.optim.Optimizer] = None, + data_collator: Optional[typing.Callable] = None, + num_shared_layers: Optional[int] = None, + lr_scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, + ): + """ + Copied from PPOTrainer.__init__: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L145 + The only differences are: + - add new args for guadi in config + - use GaudiAccelerator instead of Accelerator + """ + BaseTrainer.__init__(self, config) + + # initial seed for reproducible experiments + set_seed(config.seed) + + # Step 0: check positional arguments validity + if not isinstance(config, GaudiPPOConfig): + raise ValueError(f"config must be a PPOConfig, got {type(config)}") + if not isinstance(tokenizer, (PreTrainedTokenizerBase)): + raise ValueError( + f"tokenizer must be a PreTrainedTokenizerBase like a PreTrainedTokenizer or a PreTrainedTokenizerFast, got {type(tokenizer)}" + ) + if not isinstance(model, (SUPPORTED_ARCHITECTURES)): + raise ValueError( + f"model must be a PreTrainedModelWrapper, got {type(model)} - supported architectures are: {SUPPORTED_ARCHITECTURES}" + ) + # Step 1: Initialize Accelerator + if config.use_habana: + from optimum.habana.accelerate import GaudiAccelerator as Accelerator # pylint: disable=E0611, E0401 + else: + from accelerate import Accelerator + self.accelerator = Accelerator( + log_with=config.log_with, + gradient_accumulation_steps=config.gradient_accumulation_steps, + project_config=ProjectConfiguration(**config.project_kwargs), + **config.accelerator_kwargs, + ) + + # Step 1.1 Runtime variables filled by the accelerator + 
config.world_size = self.accelerator.num_processes + config.global_backward_batch_size = config.backward_batch_size * config.world_size + config.global_batch_size = config.batch_size * config.world_size + + self.model = model.to(self.accelerator.device.type) + self.model_params = filter(lambda p: p.requires_grad, self.model.parameters()) + self.is_encoder_decoder = hasattr(self.model, "is_encoder_decoder") + self.is_peft_model = getattr(self.model, "is_peft_model", False) + config.is_encoder_decoder = self.is_encoder_decoder + config.is_peft_model = self.is_peft_model + + is_using_tensorboard = config.log_with is not None and config.log_with == "tensorboard" + self.accelerator.init_trackers( + config.tracker_project_name, + config={"trl_ppo_trainer_config": config.to_dict()} if not is_using_tensorboard else config.to_dict(), + init_kwargs=config.tracker_kwargs, + ) + self.is_using_text_environment = getattr(config, "use_text_environment", False) + + if isinstance(ref_model, SUPPORTED_ARCHITECTURES): + self.ref_model = ref_model.to(self.accelerator.device.type) + if num_shared_layers is not None: + warnings.warn( + "num_shared_layers is ignored when ref_model is provided. 
Two different models are used for the " + "model and the reference model and no layers are shared.", + UserWarning, + ) + elif ref_model is None and not self.is_peft_model: + self.ref_model = create_reference_model(self.model, num_shared_layers=num_shared_layers) + elif self.is_peft_model: + self.ref_model = None + else: + raise ValueError( + f"ref_model must be a PreTrainedModelWrapper or `None`, got {type(ref_model)} - supported " + f"architectures are: {SUPPORTED_ARCHITECTURES} " + ) + self.optional_peft_ctx = ( + self.accelerator.unwrap_model(self.model).pretrained_model.disable_adapter + if self.is_peft_model + else nullcontext + ) + + if not (isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast)): + raise ValueError( + "tokenizer must be a transformers.PreTrainedTokenizer or transformers.PreTrainedTokenizerFast" + ) + self.tokenizer = tokenizer + + if dataset is not None and not (isinstance(dataset, torch.utils.data.Dataset) or isinstance(dataset, Dataset)): + raise ValueError("dataset must be a torch.utils.data.Dataset or datasets.Dataset") + elif dataset is None: + warnings.warn( + "No dataset is provided. Make sure to set config.batch_size to the correct value before training.", + UserWarning, + ) + self.dataset = dataset + self._signature_columns = None + if self.dataset is not None: + self.dataloader = self.prepare_dataloader(self.dataset, data_collator) + elif self.dataset is None and self.accelerator.num_processes > 1: + warnings.warn( + "No dataset is provided. In a multi-GPU setting, this will lead to an error. You should" + " prepare your dataloader yourself with `dataloader = ppo_trainer.accelerator.prepare(dataloader)`" + " and using `torch.utils.data.DataLoader`, or pass a dataset to the `PPOTrainer`. 
Please " + " refer to the documentation for more details.", + UserWarning, + ) + self.dataloader = None + else: + self.dataloader = None + + # Step 3: Initialize optimizer and data collator + self.data_collator = DataCollatorForLanguageModeling(self.tokenizer, mlm=False) + if optimizer is None: + self.optimizer = Adam( + filter(lambda p: p.requires_grad, self.model.parameters()), + lr=self.config.learning_rate, + ) + else: + self.optimizer = optimizer + + self.lr_scheduler = lr_scheduler + if self.lr_scheduler is not None: + lr_scheduler_class = ( + torch.optim.lr_scheduler._LRScheduler + if not is_torch_greater_2_0() + else torch.optim.lr_scheduler.LRScheduler + ) + + if not isinstance(self.lr_scheduler, lr_scheduler_class): + raise ValueError( + "lr_scheduler must be a torch.optim.lr_scheduler._LRScheduler or torch.optim.lr_scheduler.LRScheduler (for torch >= 2.0)" + ) + + if self.config.adap_kl_ctrl: + self.kl_ctl = AdaptiveKLController(self.config.init_kl_coef, self.config.target, self.config.horizon) + else: + self.kl_ctl = FixedKLController(self.config.init_kl_coef) + + if self.accelerator.distributed_type == "MULTI_HPU": + from accelerate.utils import DistributedDataParallelKwargs + + kwargs = {} + kwargs["find_unused_parameters"] = True + kwargs["gradient_as_bucket_view"] = True + self.accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs) + + # Safety checkers for DS integration + is_deepspeed_used = self.accelerator.distributed_type == "DEEPSPEED" and hasattr( + self.accelerator.state, "deepspeed_plugin" + ) + + ( + self.model, + self.optimizer, + self.data_collator, + self.dataloader, + self.lr_scheduler, + ) = self.accelerator.prepare( + self.model, + self.optimizer, + self.data_collator, + self.dataloader, + self.lr_scheduler, + ) + if is_deepspeed_used: + # Quantized models are already set on the correct device + if not self.is_peft_model and not ( + getattr(self.ref_model.pretrained_model, "is_loaded_in_8bit", False) + or 
getattr(self.ref_model.pretrained_model, "is_loaded_in_4bit", False) + ): + self.ref_model = self._prepare_deepspeed(self.ref_model) + else: + self.ref_model = self.accelerator.prepare(self.ref_model) + + # In a distributed setup, only logging needs to be performed on the main process + # check: https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html + # or: https://discuss.pytorch.org/t/use-distributed-data-parallel-correctly/82500/11 + self.is_distributed = self.accelerator.num_processes > 1 + + # init the current step + self.current_step = 0 + + # init variables for pushing model to hub + if config.push_to_hub_if_best_kwargs: + if "repo_id" not in config.push_to_hub_if_best_kwargs: + raise ValueError("You have to specify repo_id in order to push the model to the hub!") + self.push_to_hub_kwargs = config.push_to_hub_if_best_kwargs + self.compare_step = 0 + self.highest_reward = torch.tensor(-float("inf")) + + # post process for PP + if not getattr(self.model, "is_sequential_parallel", False): + self.current_device = self.accelerator.device + else: + if is_xpu_available(): + self.current_device = torch.device("xpu:0") + elif is_npu_available(): + self.current_device = torch.device("npu:0") + elif self.accelerator.device.type == "hpu": + self.current_device = torch.device("hpu:0") + else: + self.current_device = torch.device("cuda:0") + + PPODecorators.optimize_device_cache = self.config.optimize_device_cache + + self.running = RunningMoments(self.accelerator) + if config.use_habana: + import habana_frameworks.torch.core as htcore + + self.htcore = htcore + + def generate( + self, + query_tensor: Union[torch.Tensor, List[torch.Tensor]], + length_sampler: Callable = None, + batch_size: int = 4, + return_prompt: bool = True, + generate_ref_response: bool = False, + **generation_kwargs, + ): + """ + Copied from PPOTrainer.generate: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L433 + The only differences 
are: + - add hpu graph for acceleration + """ + if generate_ref_response: + ref_model = self.model if self.is_peft_model else self.ref_model + if isinstance(query_tensor, List): + if self.config.use_habana: + self.wrap_generation_for_hpu_graph_mode(self.model) + response = self._generate_batched( + self.model, + query_tensor, + length_sampler=length_sampler, + batch_size=batch_size, + return_prompt=return_prompt, + **generation_kwargs, + ) + if generate_ref_response: + with self.optional_peft_ctx(): + if self.config.use_habana: + self.wrap_generation_for_hpu_graph_mode(ref_model) + ref_response = self._generate_batched( + ref_model, + query_tensor, + length_sampler=length_sampler, + batch_size=batch_size, + return_prompt=return_prompt, + **generation_kwargs, + ) + + else: + if len(query_tensor.shape) == 2: + raise ValueError( + "query_tensor must be a tensor of shape (`seq_len`) or a list of tensors of shape (`seq_len`)" + ) + + if length_sampler is not None: + generation_kwargs["max_new_tokens"] = length_sampler() + if self.config.use_habana: + self.wrap_generation_for_hpu_graph_mode(self.model) + response = self.accelerator.unwrap_model(self.model).generate( + input_ids=query_tensor.unsqueeze(dim=0), **generation_kwargs + ) + if generate_ref_response: + with self.optional_peft_ctx(): + if self.config.use_habana: + self.wrap_generation_for_hpu_graph_mode(ref_model) + ref_response = ref_model.generate(input_ids=query_tensor.unsqueeze(dim=0), **generation_kwargs) + + if not return_prompt and not self.is_encoder_decoder: + response = response[:, query_tensor.shape[0] :] + if generate_ref_response: + ref_response = ref_response[:, query_tensor.shape[0] :] + + if generate_ref_response: + return response, ref_response + return response + + def _generate_batched( + self, + model: PreTrainedModelWrapper, + query_tensors: List[torch.Tensor], + length_sampler: Callable = None, + batch_size: int = 4, + return_prompt: bool = True, + pad_to_multiple_of: int = None, + 
remove_padding: bool = True, + **generation_kwargs, + ): + """ + Copied from PPOTrainer._generate_batched: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L509 + The only differences are: + - pad to pad_max_input_len to get static shape for generation acceleration + - use lazy mode and hpu_graphs for generation in hpu + """ + outputs = [] + + padding_side_default = self.tokenizer.padding_side + if not self.is_encoder_decoder: + self.tokenizer.padding_side = "left" + + # in case we have fewer examples than bs + batch_size = min(len(query_tensors), batch_size) + + for i in range(0, len(query_tensors), batch_size): + if length_sampler is not None: + generation_kwargs["max_new_tokens"] = length_sampler() + + # prevent overflow if query tensors are not even multiple of bs + end_index = min(len(query_tensors), i + batch_size) + + batch = query_tensors[i:end_index] + batch_mask = [torch.ones_like(element) for element in batch] + inputs = {"input_ids": batch, "attention_mask": batch_mask} + + if self.config.pad_for_acceleration and self.config.pad_max_input_len > 0: + padded_inputs = self.tokenizer.pad( + inputs, + padding="max_length", + max_length=self.config.pad_max_input_len, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors="pt", + ).to(self.current_device) + else: + padded_inputs = self.tokenizer.pad( + inputs, + padding=True, + max_length=None, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors="pt", + ).to(self.current_device) + + if self.config.use_habana: + generation_kwargs["ignore_eos"] = False + generation_kwargs["lazy_mode"] = True + generation_kwargs["hpu_graphs"] = True + + generations = self.accelerator.unwrap_model(model).generate(**padded_inputs, **generation_kwargs) + + for generation, mask in zip(generations, padded_inputs["attention_mask"]): + if not self.is_encoder_decoder: + output = generation[(1 - mask).sum() :] # remove padding + else: + output = generation + + if not return_prompt and not 
self.is_encoder_decoder: + output = output[(mask).sum() :] # remove prompt + + if remove_padding and self.tokenizer.eos_token_id in output: + pad_mask = output == self.tokenizer.eos_token_id + pad_start = torch.nonzero(pad_mask, as_tuple=False)[0, 0].item() + output = output[: pad_start + 1] # keep the eos token at the end + + outputs.append(output) + + self.tokenizer.padding_side = padding_side_default + return outputs + + @PPODecorators.empty_device_cache() + def step( + self, + queries: List[torch.LongTensor], + responses: List[torch.LongTensor], + scores: List[torch.FloatTensor], + response_masks: Optional[List[torch.LongTensor]] = None, + ): + """ + Copied from PPOTrainer.step: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L620 + The only differences are: + - use hpu_graphs for sampling and training + - remove duplicated padding if padding is done in prepare_model_inputs + """ + bs = self.config.batch_size + + queries, responses, scores, response_masks = self._step_safety_checker( + bs, queries, responses, scores, response_masks + ) + scores = torch.tensor(scores, device=self.current_device) + if self.config.use_score_scaling: + # Score scaling + scores_mean, scores_std = self.running.update(scores) + tensor_to_kwargs = {"dtype": scores.dtype, "device": scores.device} + score_scaling_factor = self.running.std.to(**tensor_to_kwargs) + torch.finfo(scores.dtype).eps + if self.config.use_score_norm: + scores = (scores - self.running.mean.to(**tensor_to_kwargs)) / score_scaling_factor + else: + scores /= score_scaling_factor + + if self.config.score_clip is not None: + # Score clipping + scores_dtype = scores.dtype + scores = torch.clip(scores.float(), -self.config.score_clip, self.config.score_clip).to(dtype=scores_dtype) + + # if we want to push best model to the hub + if hasattr(self, "highest_reward"): + if self.compare_step % self.config.compare_steps == 0: + curr_mean_reward = scores.mean() + # if the best reward ever seen + if 
curr_mean_reward > self.highest_reward: + self.highest_reward = curr_mean_reward + # push model to hub + self.push_to_hub(**self.push_to_hub_kwargs) + self.compare_step += 1 + + timing = {} + t0 = time.time() + + t = time.time() + + model_inputs = self.prepare_model_inputs(queries, responses) + + if self.is_distributed and not self.config.pad_for_acceleration: + pad_first = self.tokenizer.padding_side == "left" + + model_inputs["input_ids"] = self.accelerator.pad_across_processes( + model_inputs["input_ids"], + dim=1, + pad_index=self.tokenizer.pad_token_id, + pad_first=pad_first, + ) + model_inputs["attention_mask"] = self.accelerator.pad_across_processes( + model_inputs["attention_mask"], dim=1, pad_index=0, pad_first=pad_first + ) + if self.is_encoder_decoder: + model_inputs["decoder_input_ids"] = self.accelerator.pad_across_processes( + model_inputs["decoder_input_ids"], + dim=1, + pad_index=self.tokenizer.pad_token_id, + pad_first=pad_first, + ) + model_inputs["decoder_attention_mask"] = self.accelerator.pad_across_processes( + model_inputs["decoder_attention_mask"], + dim=1, + pad_index=0, + pad_first=pad_first, + ) + + model_inputs_names = list(model_inputs.keys()) + + full_kl_penalty = self.config.kl_penalty == "full" + + with torch.no_grad(): + if self.config.use_habana: + self.unwrap_generation_for_hpu_graph_mode(self.model) + self.wrap_fw_for_hpu_graph_mode(self.model) + if self.ref_model is not None: + self.unwrap_generation_for_hpu_graph_mode(self.ref_model) + self.wrap_fw_for_hpu_graph_mode(self.ref_model) + all_logprobs, logits_or_none, values, masks = self.batched_forward_pass( + self.model, + queries, + responses, + model_inputs, + response_masks=response_masks, + return_logits=full_kl_penalty, + ) + with self.optional_peft_ctx(): + ref_logprobs, ref_logits_or_none, _, _ = self.batched_forward_pass( + self.model if self.is_peft_model else self.ref_model, + queries, + responses, + model_inputs, + return_logits=full_kl_penalty, + ) + + 
timing["time/ppo/forward_pass"] = time.time() - t + + with torch.no_grad(): + t = time.time() + if full_kl_penalty: + active_full_logprobs = logprobs_from_logits(logits_or_none, None, gather=False) + ref_full_logprobs = logprobs_from_logits(ref_logits_or_none, None, gather=False) + + rewards, non_score_reward = self.compute_rewards( + scores, active_full_logprobs, ref_full_logprobs, masks + ) + else: + rewards, non_score_reward = self.compute_rewards(scores, all_logprobs, ref_logprobs, masks) + timing["time/ppo/compute_rewards"] = time.time() - t + + t = time.time() + values, advantages, returns = self.compute_advantages(values, rewards, masks) + timing["time/ppo/compute_advantages"] = time.time() - t + + # upcast to float32 to avoid dataset issues + batch_dict = { + "queries": queries, + "responses": responses, + "logprobs": all_logprobs.to(torch.float32), + "values": values.to(torch.float32), + "masks": masks, + "advantages": advantages, + "returns": returns, + } + batch_dict.update(model_inputs) + + t = time.time() + all_stats = [] + early_stop = False + if self.config.use_habana: + self.unwrap_fw_for_hpu_graph_mode(self.model) + import habana_frameworks.torch as ht # pylint: disable=E0611, E0401 + + model = self.accelerator.unwrap_model(self.model) + if not hasattr(model, "wrap_train_in_graph"): + ht.hpu.ModuleCacher()(model=model, inplace=True) + setattr(model, "wrap_train_in_graph", model.forward) + else: + model.forward = getattr(model, "wrap_train_in_graph") + + for _ in range(self.config.ppo_epochs): + if early_stop: + break + b_inds = np.random.permutation(bs) + for backward_batch_start in range(0, bs, self.config.backward_batch_size): + backward_batch_end = backward_batch_start + self.config.backward_batch_size + backward_batch_inds = b_inds[backward_batch_start:backward_batch_end] + + for mini_batch_start in range(0, self.config.backward_batch_size, self.config.mini_batch_size): + mini_batch_end = mini_batch_start + self.config.mini_batch_size + 
mini_batch_inds = backward_batch_inds[mini_batch_start:mini_batch_end] + mini_batch_dict = { + "logprobs": batch_dict["logprobs"][mini_batch_inds], + "values": batch_dict["values"][mini_batch_inds], + "masks": batch_dict["masks"][mini_batch_inds], + # hacks: the queries and responses are ragged. + "queries": [batch_dict["queries"][i] for i in mini_batch_inds], + "responses": [batch_dict["responses"][i] for i in mini_batch_inds], + "advantages": batch_dict["advantages"][mini_batch_inds], + "returns": batch_dict["returns"][mini_batch_inds], + } + for k in model_inputs_names: + mini_batch_dict[k] = batch_dict[k][mini_batch_inds] + with self.accelerator.accumulate(self.model): + model_inputs = {k: mini_batch_dict[k] for k in model_inputs_names} + + logprobs, logits, vpreds, _ = self.batched_forward_pass( + self.model, + mini_batch_dict["queries"], + mini_batch_dict["responses"], + model_inputs, + return_logits=True, + ) + train_stats = self.train_minibatch( + mini_batch_dict["logprobs"], + mini_batch_dict["values"], + logprobs, + logits, + vpreds, + mini_batch_dict["masks"], + mini_batch_dict["advantages"], + mini_batch_dict["returns"], + ) + all_stats.append(train_stats) + + # typically, early stopping is done at the epoch level + if self.config.early_stopping: + policykl = train_stats["policy/policykl"] + early_stop = self._early_stop(policykl) + if early_stop: + break + + timing["time/ppo/optimize_step"] = time.time() - t + + t = time.time() + train_stats = stack_dicts(all_stats) + + # reshape advantages/ratios such that they are not averaged. 
+ train_stats["policy/advantages"] = torch.flatten(train_stats["policy/advantages"]).unsqueeze(0) + train_stats["policy/advantages"] = torch.nan_to_num(train_stats["policy/advantages"], WANDB_PADDING) + train_stats["policy/ratio"] = torch.flatten(train_stats["policy/ratio"]).unsqueeze(0) + + stats = self.record_step_stats( + scores=scores, + logprobs=all_logprobs, + ref_logprobs=ref_logprobs, + non_score_reward=non_score_reward, + train_stats=train_stats, + kl_coef=self.kl_ctl.value, + masks=masks, + queries=queries, + responses=responses, + ) + # Gather/Reduce stats from all processes + if self.is_distributed: + stats = self.gather_stats(stats) + stats = stats_to_np(stats) + timing["time/ppo/calc_stats"] = time.time() - t + stats["ppo/learning_rate"] = self.optimizer.param_groups[0]["lr"] + + # Update the KL control - multiply the batch_size by the number of processes + self.kl_ctl.update( + stats["objective/kl"], + self.config.batch_size * self.accelerator.num_processes, + ) + + # Log the total ppo time + timing["time/ppo/total"] = time.time() - t0 + stats.update(timing) + + # post-process stats for tensorboard and other loggers + if self.config.log_with != "wandb": + stats = convert_to_scalar(stats) + + if self.lr_scheduler is not None: + self.lr_scheduler.step() + + return stats + + def prepare_model_inputs(self, queries: torch.Tensor, responses: torch.Tensor): + """ + Copied from PPOTrainer.prepare_model_inputs: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L921 + The only differences are: + - add padding to model inputs for static shape support in forward + """ + if self.is_encoder_decoder: + input_data = self.data_collator( + [{"input_ids": q, "attention_mask": torch.ones_like(q)} for q in queries] + ).to(self.current_device) + + decoder_inputs = self.data_collator( + [{"input_ids": r, "attention_mask": torch.ones_like(r)} for r in responses] + ).to(self.current_device) + + input_data["decoder_input_ids"] = 
decoder_inputs["input_ids"] + input_data["decoder_attention_mask"] = decoder_inputs["attention_mask"] + else: + input_ids = [torch.cat([q, r]) for q, r in zip(queries, responses)] + input_data = self.data_collator( + [{"input_ids": ids, "attention_mask": torch.ones_like(ids)} for ids in input_ids] + ).to(self.current_device) + + if self.config.pad_for_acceleration: + input_data["input_ids"] = torch.nn.functional.pad( + input_data["input_ids"], + (0, self.config.pad_max_len - input_data["input_ids"].shape[1]), + value=self.tokenizer.pad_token_id, + ) + input_data["attention_mask"] = torch.nn.functional.pad( + input_data["attention_mask"], + ( + 0, + self.config.pad_max_len - input_data["attention_mask"].shape[1], + ), + value=0, + ) + if self.is_encoder_decoder: + input_data["decoder_input_ids"] = torch.nn.functional.pad( + input_data["decoder_input_ids"], + ( + 0, + self.config.pad_max_len - input_data["decoder_input_ids"].shape[1], + ), + value=self.tokenizer.pad_token_id, + ) + input_data["decoder_attention_mask"] = torch.nn.functional.pad( + input_data["decoder_attention_mask"], + ( + 0, + self.config.pad_max_len - input_data["decoder_attention_mask"].shape[1], + ), + value=0, + ) + + input_data.pop("labels", None) # we don't want to compute LM losses + return input_data + + @PPODecorators.empty_device_cache() + def batched_forward_pass( + self, + model: PreTrainedModelWrapper, + queries: torch.Tensor, + responses: torch.Tensor, + model_inputs: dict, + return_logits: bool = False, + response_masks: Optional[torch.Tensor] = None, + ): + """ + Copied from PPOTrainer.batched_forward_pass: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L943 + The only differences are: + - input_kwargs/output need clone() to avoid being overridden on HPU + """ + bs = len(queries) + fbs = self.config.mini_batch_size + all_logprobs = [] + all_logits = [] + all_masks = [] + all_values = [] + + model.eval() + + for i in range(math.ceil(bs / fbs)): + input_kwargs =
{key: value[i * fbs : (i + 1) * fbs].clone() for key, value in model_inputs.items()} + query_batch = queries[i * fbs : (i + 1) * fbs] + response_batch = responses[i * fbs : (i + 1) * fbs] + if response_masks is not None: + response_masks_batch = response_masks[i * fbs : (i + 1) * fbs] + logits, _, values = model(**input_kwargs) + + if self.is_encoder_decoder: + input_ids = input_kwargs["decoder_input_ids"] + attention_mask = input_kwargs["decoder_attention_mask"] + else: + input_ids = input_kwargs["input_ids"] + attention_mask = input_kwargs["attention_mask"] + + logprobs = logprobs_from_logits(logits[:, :-1, :], input_ids[:, 1:]) + masks = torch.zeros_like(attention_mask) + masks[:, :-1] = attention_mask[:, 1:] + + for j in range(len(query_batch)): + if self.is_encoder_decoder: + # Decoder sentence starts always in the index 1 after padding in the Enc-Dec Models + start = 1 + end = attention_mask[j, :].sum() - 1 + else: + start = len(query_batch[j]) - 1 # logprobs starts from the second query token + if attention_mask[j, 0] == 0: # offset left padding + start += attention_mask[j, :].nonzero()[0] + end = start + len(response_batch[j]) + if response_masks is not None: + response_masks_batch[j] = torch.cat( + (torch.zeros_like(query_batch[j]), response_masks_batch[j]) + )[1:] + + masks[j, :start] = 0 + masks[j, end:] = 0 + if response_masks is not None: + masks[j, start:end] = masks[j, start:end] * response_masks_batch[j][start:end] + + if return_logits: + all_logits.append(logits.clone()) + else: + del logits + all_values.append(values.clone()) + all_logprobs.append(logprobs) + all_masks.append(masks) + + return ( + torch.cat(all_logprobs), + torch.cat(all_logits)[:, :-1] if return_logits else None, + torch.cat(all_values)[:, :-1], + torch.cat(all_masks)[:, :-1], + ) + + @PPODecorators.empty_device_cache() + def train_minibatch( + self, + old_logprobs: torch.FloatTensor, + values: torch.FloatTensor, + logprobs: torch.FloatTensor, + logits: torch.FloatTensor, + 
vpreds: torch.FloatTensor, + mask: torch.LongTensor, + advantages: torch.FloatTensor, + returns: torch.FloatTensor, + ): + """ + Copied from PPOTrainer.train_minibatch: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L1034 + The only differences are: + - add htcore.mark_step + """ + self.model.train() + loss_p, loss_v, train_stats = self.loss( + old_logprobs, values, logits, vpreds, logprobs, mask, advantages, returns + ) + loss = loss_p + loss_v + self.accelerator.backward(loss) + if self.config.max_grad_norm is not None: + if self.accelerator.sync_gradients: + self.accelerator.clip_grad_norm_(self.model_params, self.config.max_grad_norm) + self.optimizer.step() + if self.config.use_habana: # pragma: no cover + self.htcore.mark_step() + # we call optimizer.zero_grad() every time and let `accelerator` handle accumulation + # see https://huggingface.co/docs/accelerate/usage_guides/gradient_accumulation#the-finished-code + self.optimizer.zero_grad() + return train_stats + + def wrap_fw_for_hpu_graph_mode(self, model: PreTrainedModelWrapper): + model = self.accelerator.unwrap_model(model) + if hasattr(model, "hpu_graph_fw"): + model.forward = model.hpu_graph_fw + else: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + model.orig_fw = model.forward + model = wrap_in_hpu_graph(model) + model.hpu_graph_fw = model.forward + + def unwrap_fw_for_hpu_graph_mode(self, model: PreTrainedModelWrapper): + model = self.accelerator.unwrap_model(model) + if hasattr(model, "orig_fw"): + model.forward = model.orig_fw + + def wrap_generation_for_hpu_graph_mode(self, model: PreTrainedModelWrapper): + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + model = self.accelerator.unwrap_model(model) + if getattr(model, "is_peft_model", False): + if hasattr(model.pretrained_model.base_model.model, "hpu_graph_fw"): + model.pretrained_model.base_model.model.forward = model.pretrained_model.base_model.model.hpu_graph_fw + else: +
model.pretrained_model.base_model.model.orig_fw = model.pretrained_model.base_model.model.forward + model.pretrained_model.base_model.model = wrap_in_hpu_graph(model.pretrained_model.base_model.model) + model.pretrained_model.base_model.model.hpu_graph_fw = model.pretrained_model.base_model.model.forward + else: + if hasattr(model.pretrained_model, "hpu_graph_fw"): + model.pretrained_model.forward = model.pretrained_model.hpu_graph_fw + else: + model.pretrained_model.orig_fw = model.pretrained_model.forward + model.pretrained_model = wrap_in_hpu_graph(model.pretrained_model) + model.pretrained_model.hpu_graph_fw = model.pretrained_model.forward + + def unwrap_generation_for_hpu_graph_mode(self, model: PreTrainedModelWrapper): + model = self.accelerator.unwrap_model(model) + if getattr(model, "is_peft_model", False): + if hasattr(model.pretrained_model.base_model.model, "orig_fw"): + model.pretrained_model.base_model.model.forward = model.pretrained_model.base_model.model.orig_fw + else: + if hasattr(model.pretrained_model, "orig_fw"): + model.pretrained_model.forward = model.pretrained_model.orig_fw From e7f83d93dfbeb290670a42ce4a41cc57c76e2104 Mon Sep 17 00:00:00 2001 From: "Wang, Yi A" Date: Wed, 17 Jan 2024 03:43:20 -0800 Subject: [PATCH 2/6] refactor ppo example Signed-off-by: Wang, Yi A --- examples/trl/README.md | 79 +++++++ .../{stack_llama/rl_training.py => ppo.py} | 23 +- .../trl/{stack_llama => }/reward_modeling.py | 37 +-- examples/trl/stack_llama/README.md | 18 -- .../trl/stack_llama/merge_peft_adapter.py | 50 ---- .../trl/stack_llama/supervised_finetuning.py | 215 ------------------ optimum/habana/trl/trainer/ppo_trainer.py | 2 +- 7 files changed, 119 insertions(+), 305 deletions(-) rename examples/trl/{stack_llama/rl_training.py => ppo.py} (92%) rename examples/trl/{stack_llama => }/reward_modeling.py (89%) delete mode 100644 examples/trl/stack_llama/README.md delete mode 100644 examples/trl/stack_llama/merge_peft_adapter.py delete mode 100644 
examples/trl/stack_llama/supervised_finetuning.py diff --git a/examples/trl/README.md b/examples/trl/README.md index 7206ddbffb..8049349da6 100644 --- a/examples/trl/README.md +++ b/examples/trl/README.md @@ -72,3 +72,82 @@ python run_generation.py \ --prompt "Here is my prompt" ``` + + +## PPO pipeline + +### Training + +The following example is for the creation of StackLlaMa 2: a Stack exchange llama-v2-7b model. +There are three main steps to the PPO training process: +1. Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se: + ``` + python ../gaudi_spawn.py --world_size 8 --use_mpi sft.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --output_dir="./sft" \ + --max_steps=500 \ + --logging_steps=10 \ + --save_steps=100 \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=1 \ + --gradient_accumulation_steps=2 \ + --learning_rate=1e-4 \ + --lr_scheduler_type="cosine" \ + --warmup_steps=100 \ + --weight_decay=0.05 \ + --optim="paged_adamw_32bit" \ + --lora_target_modules "q_proj" "v_proj" \ + --bf16 \ + --remove_unused_columns=False \ + --run_name="sft_llama2" \ + --report_to=none \ + --use_habana \ + --use_lazy_mode + ``` +2. Reward modeling using dialog pairs from the SE dataset on the llama-v2-7b-se to create llama-v2-7b-se-rm + ``` + python ../gaudi_spawn.py --world_size 8 --use_mpi reward_modeling.py \ + --model_name=./sft/final_merged_checkpoint \ + --tokenizer_name=meta-llama/Llama-2-7b-hf \ + --output_dir=./rm + ``` + To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL: + + ``` + python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="rm" --output_name="rm_merged_checkpoint" + ``` + +3. 
RL fine-tuning of llama-v2-7b-se with the llama-v2-7b-se-rm reward model: + ``` + python ../gaudi_spawn.py --world_size 8 --use_mpi ppo.py \ + --log_with=wandb \ + --model_name=./sft/final_merged_checkpoint \ + --reward_model_name=./rm_merged_checkpoint \ + --tokenizer_name=meta-llama/Llama-2-7b-hf \ + --adafactor=False \ + --output_max_length=128 \ + --batch_size=8 \ + --gradient_accumulation_steps=8 \ + --batched_gen=True \ + --ppo_epochs=4 \ + --seed=0 \ + --learning_rate=1.4e-5 \ + --early_stopping=True \ + --output_dir=llama-se-rl-finetune + ``` + To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL: + + ``` + python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="llama-se-rl-finetune" --output_name="rl_merged_checkpoint" + ``` + +### Running the model +We can take the PPO-trained LoRA adaptors that were saved by the PPO training step, merged into the base model as shown above, and run the merged model through the [text-generation example](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation).
+ +``` +python run_generation.py \ +--model_name_or_path ../trl/rl_merged_checkpoint/ \ +--use_hpu_graphs --use_kv_cache --batch_size 1 --bf16 --max_new_tokens 100 \ +--prompt "Here is my prompt" +``` + diff --git a/examples/trl/stack_llama/rl_training.py b/examples/trl/ppo.py similarity index 92% rename from examples/trl/stack_llama/rl_training.py rename to examples/trl/ppo.py index 53ec5b7251..23ecbcabeb 100644 --- a/examples/trl/stack_llama/rl_training.py +++ b/examples/trl/ppo.py @@ -1,6 +1,6 @@ # copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/rl_training.py, enable it for Gaudi2 from dataclasses import dataclass, field -from typing import Optional +from typing import List, Optional import torch from datasets import load_dataset @@ -26,8 +26,8 @@ class ScriptArguments: # NOTE: gpt2 models use Conv1D instead of Linear layers which are not yet supported in 8 bit mode # models like gpt-neo* models are more suitable. - model_name: Optional[str] = field(default="", metadata={"help": "the model name"}) - tokenizer_name: Optional[str] = field(default="", metadata={"help": "the tokenizer name"}) + model_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the model name"}) + tokenizer_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the tokenizer name"}) reward_model_name: Optional[str] = field(default="", metadata={"help": "the reward model name"}) log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"}) learning_rate: Optional[float] = field(default=1.41e-5, metadata={"help": "the learning rate"}) @@ -57,6 +57,13 @@ class ScriptArguments: adap_kl_ctrl: Optional[bool] = field(default=True, metadata={"help": "Use adaptive KL control, otherwise linear"}) use_habana: Optional[bool] = field(default=True, metadata={"help": "use habana for RL training"}) + lora_alpha: Optional[float] = field(default=32, 
metadata={"help": "the lora alpha parameter"}) + lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"}) + lora_r: Optional[int] = field(default=16, metadata={"help": "the lora r parameter"}) + lora_target_modules: List[str] = field( + default_factory=lambda: None, + metadata={"help": "Target modules for the LoRA method."}, + ) adapt_PreTrainedModelWrapper_to_gaudi() @@ -170,9 +177,10 @@ def collator(data): # Now let's build the model, the reference model, and the tokenizer. current_device = GaudiAccelerator().local_process_index lora_config = LoraConfig( - r=16, - lora_alpha=32, - lora_dropout=0.05, + r=script_args.lora_r, + lora_alpha=script_args.lora_alpha, + lora_dropout=script_args.lora_dropout, + target_modules=script_args.lora_target_modules, bias="none", task_type="CAUSAL_LM", ) @@ -266,7 +274,6 @@ def collator(data): output_length_sampler = LengthSampler(output_min_length, output_max_length) else: output_length_sampler = LengthSampler(output_max_length, output_max_length + 1) - for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)): if epoch >= config.total_ppo_epochs: break @@ -292,3 +299,5 @@ def collator(data): if script_args.save_freq and epoch and epoch % script_args.save_freq == 0: ppo_trainer.save_pretrained(script_args.output_dir + f"step_{epoch}") + +ppo_trainer.save_pretrained(script_args.output_dir) diff --git a/examples/trl/stack_llama/reward_modeling.py b/examples/trl/reward_modeling.py similarity index 89% rename from examples/trl/stack_llama/reward_modeling.py rename to examples/trl/reward_modeling.py index 32ce0faf50..a57cfa575a 100644 --- a/examples/trl/stack_llama/reward_modeling.py +++ b/examples/trl/reward_modeling.py @@ -45,13 +45,13 @@ class ScriptArguments: learning_rate: Optional[float] = field(default=2e-5) weight_decay: Optional[float] = field(default=0.001) model_name: Optional[str] = field( - default="gpt2", + default="meta-llama/Llama-2-7b-hf", metadata={ "help": "The model that 
you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc." }, ) tokenizer_name: Optional[str] = field( - default=None, + default="meta-llama/Llama-2-7b-hf", metadata={ "help": "The tokenizer for your model, if left empty will use the default for your model", }, @@ -91,6 +91,17 @@ class ScriptArguments: default=False, metadata={"help": "Whether to run eval after the first step"}, ) + output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"}) + save_steps: Optional[int] = field(default=500, metadata={"help": "the saving frequency"}) + eval_steps: Optional[int] = field(default=500, metadata={"help": "the evaluation frequency"}) + logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"}) + lora_alpha: Optional[float] = field(default=32, metadata={"help": "the lora alpha parameter"}) + lora_dropout: Optional[float] = field(default=0.1, metadata={"help": "the lora dropout parameter"}) + lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"}) + lora_target_modules: List[str] = field( + default_factory=lambda: None, + metadata={"help": "Target modules for the LoRA method."}, + ) parser = HfArgumentParser(ScriptArguments) @@ -105,21 +116,18 @@ class ScriptArguments: eval_dataset = eval_dataset.select(range(script_args.eval_subset)) # Define the training args. Needs to be done before the model is loaded if you are using deepspeed. 
model_name_split = script_args.model_name.split("/")[-1] -output_name = ( - f"{model_name_split}_peft_stack-exchange-paired_rmts__{script_args.train_subset}_{script_args.learning_rate}" -) training_args = GaudiTrainingArguments( - output_dir=output_name, + output_dir=script_args.output_dir, learning_rate=script_args.learning_rate, per_device_train_batch_size=script_args.per_device_train_batch_size, per_device_eval_batch_size=script_args.per_device_eval_batch_size, num_train_epochs=script_args.num_train_epochs, weight_decay=script_args.weight_decay, evaluation_strategy="steps", - eval_steps=500, + eval_steps=script_args.eval_steps, save_strategy="steps", - save_steps=500, + save_steps=script_args.save_steps, gradient_accumulation_steps=script_args.gradient_accumulation_steps, gradient_checkpointing=script_args.gradient_checkpointing, deepspeed=script_args.deepspeed, @@ -128,7 +136,7 @@ class ScriptArguments: label_names=[], bf16=script_args.bf16, logging_strategy="steps", - logging_steps=10, + logging_steps=script_args.logging_steps, optim=script_args.optim, lr_scheduler_type=script_args.lr_scheduler_type, report_to="none", @@ -140,13 +148,14 @@ class ScriptArguments: tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_auth_token=True) tokenizer.pad_token = tokenizer.eos_token - peft_config = LoraConfig( task_type=TaskType.SEQ_CLS, inference_mode=False, - r=8, - lora_alpha=32, - lora_dropout=0.1, + r=script_args.lora_r, + lora_alpha=script_args.lora_alpha, + lora_dropout=script_args.lora_dropout, + target_modules=script_args.lora_target_modules, + bias="none", ) torch.autograd.set_detect_anomaly(True) model = AutoModelForSequenceClassification.from_pretrained( @@ -310,4 +319,4 @@ def on_step_end(self, args, state, control, **kwargs): trainer.train(script_args.resume_from_checkpoint) print("Saving last checkpoint of the model") -trainer.save_model(output_name + "_peft_last_checkpoint") +trainer.save_model(script_args.output_dir) diff --git 
a/examples/trl/stack_llama/README.md b/examples/trl/stack_llama/README.md deleted file mode 100644 index 51a9728ed4..0000000000 --- a/examples/trl/stack_llama/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# RLHF pipeline for the creation of StackLLaMa: a Stack exchange llama-7b model. -There were three main steps to the training process: -1. Supervised fine-tuning of the base llama-7b model to create llama-7b-se: - - `torchrun --nnodes 1 --nproc_per_node 8 supervised_finetuning.py --model_path= --streaming --learning_rate 1e-5 --max_steps 5000 --bf16 --output_dir ./llama-se` -2. Reward modeling using dialog pairs from the SE dataset using the llama-7b-se to create llama-7b-se-rm: - - `torchrun --nnodes 1 --nproc_per_node 8 reward_modeling.py --model_name=` -3. RL fine-tuning of llama-7b-se with the llama-7b-se-rm reward model: - - `torchrun --nnodes 1 --nproc_per_node 8 rl_training.py --log_with=wandb --model_name= --reward_model_name= --adafactor=False --tokenizer_name= --save_freq=100 --output_max_length=128 --batch_size=8 --gradient_accumulation_steps=8 --batched_gen=True --ppo_epochs=4 --seed=0 --learning_rate=1.4e-5 --early_stopping=True --output_dir=llama-se-rl-finetune-128-8-8-1.4e-5_adam` - - -LoRA layers were using at all stages to reduce memory requirements. -At each stage the peft adapter layers were merged with the base model, using: -```shell -python merge_peft_adapter.py --adapter_model_name=XXX --base_model_name=YYY --output_name=ZZZ -``` -Note that this script requires `peft>=0.3.0`. - -For access to the base llama-7b model, please see Meta's [release](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) and [request form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform). 
diff --git a/examples/trl/stack_llama/merge_peft_adapter.py b/examples/trl/stack_llama/merge_peft_adapter.py deleted file mode 100644 index 8913fc62a4..0000000000 --- a/examples/trl/stack_llama/merge_peft_adapter.py +++ /dev/null @@ -1,50 +0,0 @@ -# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py. -# only difference is removal of model.push_to_hub -from dataclasses import dataclass, field -from typing import Optional - -import torch -from peft import PeftConfig, PeftModel -from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser - - -@dataclass -class ScriptArguments: - """ - The input names representing the Adapter and Base model fine-tuned with PEFT, and the output name representing the - merged model. - """ - - adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the adapter name"}) - base_model_name: Optional[str] = field(default=None, metadata={"help": "the base model name"}) - output_name: Optional[str] = field(default=None, metadata={"help": "the merged model name"}) - - -parser = HfArgumentParser(ScriptArguments) -script_args = parser.parse_args_into_dataclasses()[0] -assert script_args.adapter_model_name is not None, "please provide the name of the Adapter you would like to merge" -assert script_args.base_model_name is not None, "please provide the name of the Base model" -assert script_args.output_name is not None, "please provide the output name of the merged model" - -peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name) -if peft_config.task_type == "SEQ_CLS": - # The sequence classification task is used for the reward model in PPO - model = AutoModelForSequenceClassification.from_pretrained( - script_args.base_model_name, num_labels=1, torch_dtype=torch.bfloat16 - ) -else: - model = AutoModelForCausalLM.from_pretrained( - script_args.base_model_name, return_dict=True, 
torch_dtype=torch.bfloat16 - ) - -tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name) - -# Load the PEFT model -model = PeftModel.from_pretrained(model, script_args.adapter_model_name) -model.eval() - -model = model.merge_and_unload() - -model.save_pretrained(f"{script_args.output_name}") -tokenizer.save_pretrained(f"{script_args.output_name}") -# model.push_to_hub(f"{script_args.output_name}", use_temp_dir=False) diff --git a/examples/trl/stack_llama/supervised_finetuning.py b/examples/trl/stack_llama/supervised_finetuning.py deleted file mode 100644 index a61bca6e3b..0000000000 --- a/examples/trl/stack_llama/supervised_finetuning.py +++ /dev/null @@ -1,215 +0,0 @@ -# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/supervised_finetuning.py, enable it for Gaudi2 - -import argparse -import os - -import torch -from datasets import load_dataset -from peft import LoraConfig -from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer, logging, set_seed -from trl.trainer import ConstantLengthDataset - -from optimum.habana import GaudiConfig, GaudiTrainingArguments -from optimum.habana.trl import GaudiSFTTrainer - - -""" -Fine-Tune Llama-7b on SE paired dataset -""" - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_path", type=str, default="") - parser.add_argument("--dataset_name", type=str, default="lvwerra/stack-exchange-paired") - parser.add_argument("--subset", type=str, default="data/finetune") - parser.add_argument("--split", type=str, default="train") - parser.add_argument("--size_valid_set", type=int, default=4000) - parser.add_argument("--streaming", action="store_true") - parser.add_argument("--shuffle_buffer", type=int, default=5000) - - parser.add_argument("--seq_length", type=int, default=1024) - parser.add_argument("--max_steps", type=int, default=10000) - parser.add_argument("--batch_size", type=int, default=4) - 
parser.add_argument("--gradient_accumulation_steps", type=int, default=1) - parser.add_argument("--eos_token_id", type=int, default=49152) - - parser.add_argument("--learning_rate", type=float, default=1e-4) - parser.add_argument("--lr_scheduler_type", type=str, default="cosine") - parser.add_argument("--num_warmup_steps", type=int, default=100) - parser.add_argument("--weight_decay", type=float, default=0.05) - - parser.add_argument("--local_rank", type=int, default=0) - parser.add_argument("--fp16", action="store_true", default=False) - parser.add_argument("--bf16", action="store_true", default=False) - parser.add_argument("--gradient_checkpointing", action="store_true", default=False) - parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--num_workers", type=int, default=None) - parser.add_argument("--output_dir", type=str, default="./checkpoints") - parser.add_argument("--log_freq", default=1, type=int) - parser.add_argument("--eval_freq", default=1000, type=int) - parser.add_argument("--save_freq", default=1000, type=int) - - return parser.parse_args() - - -def chars_token_ratio(dataset, tokenizer, nb_examples=400): - """ - Estimate the average number of characters per token in the dataset. - """ - total_characters, total_tokens = 0, 0 - for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples): - text = prepare_sample_text(example) - total_characters += len(text) - if tokenizer.is_fast: - total_tokens += len(tokenizer(text).tokens()) - else: - total_tokens += len(tokenizer.tokenize(text)) - - return total_characters / total_tokens - - -def print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. 
- """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" - ) - - -def prepare_sample_text(example): - """Prepare the text from a sample of the dataset.""" - text = f"Question: {example['question']}\n\nAnswer: {example['response_j']}" - return text - - -def create_datasets(tokenizer, args): - dataset = load_dataset( - args.dataset_name, - data_dir=args.subset, - split=args.split, - use_auth_token=True, - num_proc=args.num_workers if not args.streaming else None, - streaming=args.streaming, - ) - if args.streaming: - print("Loading the dataset in streaming mode") - valid_data = dataset.take(args.size_valid_set) - train_data = dataset.skip(args.size_valid_set) - train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=args.seed) - else: - dataset = dataset.train_test_split(test_size=0.005, seed=args.seed) - train_data = dataset["train"] - valid_data = dataset["test"] - print(f"Size of the train set: {len(train_data)}. 
Size of the validation set: {len(valid_data)}") - - chars_per_token = chars_token_ratio(train_data, tokenizer) - print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}") - - train_dataset = ConstantLengthDataset( - tokenizer, - train_data, - formatting_func=prepare_sample_text, - infinite=True, - seq_length=args.seq_length, - chars_per_token=chars_per_token, - ) - valid_dataset = ConstantLengthDataset( - tokenizer, - valid_data, - formatting_func=prepare_sample_text, - infinite=False, - seq_length=args.seq_length, - chars_per_token=chars_per_token, - ) - return train_dataset, valid_dataset - - -def run_training(args, train_data, val_data): - print("Loading the model") - - lora_config = LoraConfig( - r=16, - lora_alpha=32, - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", - ) - - train_data.start_iteration = 0 - - print("Starting main loop") - - training_args = GaudiTrainingArguments( - output_dir=args.output_dir, - dataloader_drop_last=True, - evaluation_strategy="steps", - max_steps=args.max_steps, - eval_steps=args.eval_freq, - save_steps=args.save_freq, - logging_steps=args.log_freq, - per_device_train_batch_size=args.batch_size, - per_device_eval_batch_size=args.batch_size, - learning_rate=args.learning_rate, - lr_scheduler_type=args.lr_scheduler_type, - warmup_steps=args.num_warmup_steps, - gradient_accumulation_steps=args.gradient_accumulation_steps, - gradient_checkpointing=args.gradient_checkpointing, - fp16=args.fp16, - bf16=args.bf16, - weight_decay=args.weight_decay, - run_name="llama-7b-finetuned", - report_to="none", - ddp_find_unused_parameters=False, - use_habana=True, - use_lazy_mode=True, - ) - model = AutoModelForCausalLM.from_pretrained(args.model_path, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16) - - gaudi_config = GaudiConfig() - gaudi_config.use_fused_adam = True - gaudi_config.use_fused_clip_norm = True - - trainer = GaudiSFTTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - 
train_dataset=train_data, - eval_dataset=val_data, - peft_config=lora_config, - packing=True, - ) - - print_trainable_parameters(trainer.model) - - print("Training...") - trainer.train() - - print("Saving last checkpoint of the model") - trainer.save_model(os.path.join(args.output_dir, "final_checkpoint/")) - - -def main(args): - tokenizer = AutoTokenizer.from_pretrained(args.model_path) - train_dataset, eval_dataset = create_datasets(tokenizer, args) - run_training(args, train_dataset, eval_dataset) - - -if __name__ == "__main__": - args = get_args() - assert args.model_path != "", "Please provide the llama model path" - - set_seed(args.seed) - os.makedirs(args.output_dir, exist_ok=True) - - logging.set_verbosity_error() - - main(args) diff --git a/optimum/habana/trl/trainer/ppo_trainer.py b/optimum/habana/trl/trainer/ppo_trainer.py index 5ef2745e1e..20d18d2996 100644 --- a/optimum/habana/trl/trainer/ppo_trainer.py +++ b/optimum/habana/trl/trainer/ppo_trainer.py @@ -255,7 +255,7 @@ def __init__( elif is_npu_available(): self.current_device = torch.device("npu:0") elif self.accelerator.device.type == "hpu": - self.current_device = torch.device("hpu:0") + self.current_device = torch.device("hpu") else: self.current_device = torch.device("cuda:0") From 20d38bce73c6411f23943fa68bd8a601ac840f6b Mon Sep 17 00:00:00 2001 From: "Wang, Yi A" Date: Tue, 30 Jan 2024 17:33:22 -0800 Subject: [PATCH 3/6] ppo and reward model update Signed-off-by: Wang, Yi A --- examples/trl/ppo.py | 24 ++++-- examples/trl/reward_modeling.py | 72 +---------------- optimum/habana/trl/__init__.py | 1 + optimum/habana/trl/models/__init__.py | 2 - optimum/habana/trl/models/modeling_base.py | 11 +-- optimum/habana/trl/trainer/__init__.py | 1 + optimum/habana/trl/trainer/ppo_config.py | 9 ++- optimum/habana/trl/trainer/ppo_trainer.py | 18 ++--- optimum/habana/trl/trainer/reward_trainer.py | 82 ++++++++++++++++++++ 9 files changed, 119 insertions(+), 101 deletions(-) create mode 100644 
optimum/habana/trl/trainer/reward_trainer.py diff --git a/examples/trl/ppo.py b/examples/trl/ppo.py index 23ecbcabeb..c46ef1f450 100644 --- a/examples/trl/ppo.py +++ b/examples/trl/ppo.py @@ -31,7 +31,8 @@ class ScriptArguments: reward_model_name: Optional[str] = field(default="", metadata={"help": "the reward model name"}) log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"}) learning_rate: Optional[float] = field(default=1.41e-5, metadata={"help": "the learning rate"}) - output_max_length: Optional[int] = field(default=128, metadata={"help": "maximum length for generation"}) + output_max_length: Optional[int] = field(default=128, metadata={"help": "maximum output length for generation"}) + input_max_length: Optional[int] = field(default=512, metadata={"help": "maximum input length for generation"}) mini_batch_size: Optional[int] = field(default=1, metadata={"help": "the PPO minibatch size"}) batch_size: Optional[int] = field(default=32, metadata={"help": "the batch size"}) ppo_epochs: Optional[int] = field(default=4, metadata={"help": "the number of ppo epochs"}) @@ -64,6 +65,15 @@ class ScriptArguments: default_factory=lambda: None, metadata={"help": "Target modules for the LoRA method."}, ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." 
+ ) + }, + ) adapt_PreTrainedModelWrapper_to_gaudi() @@ -88,12 +98,14 @@ class ScriptArguments: adap_kl_ctrl=script_args.adap_kl_ctrl, use_habana=script_args.use_habana, pad_for_acceleration=script_args.use_habana, - pad_max_len=512 + script_args.output_max_length, - pad_max_input_len=512, + pad_max_len=script_args.input_max_length + script_args.output_max_length, + pad_max_input_len=script_args.input_max_length, ) train_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/rl", split="train") -train_dataset = train_dataset.select(range(100000)) +if script_args.max_train_samples is not None: + max_train_samples = min(len(train_dataset), script_args.max_train_samples) + train_dataset = train_dataset.select(range(max_train_samples)) original_columns = train_dataset.column_names # We then define the arguments to pass to the sentiment analysis pipeline. @@ -106,7 +118,7 @@ class ScriptArguments: } if config.pad_for_acceleration: sent_kwargs["padding"] = "max_length" - sent_kwargs["max_length"] = 512 + script_args.output_max_length + sent_kwargs["max_length"] = script_args.input_max_length + script_args.output_max_length tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_name) # GPT-2 tokenizer has a pad token, but it is not eos_token by default. We need to set it to eos_token. @@ -225,8 +237,6 @@ def collator(data): # model name and the sentiment analysis pipeline arguments. Let's also make sure to # set the device to the same device as the PPOTrainer. 
device = ppo_trainer.accelerator.device -if ppo_trainer.accelerator.num_processes == 1 and torch.cuda.is_available(): - device = 0 reward_model = AutoModelForSequenceClassification.from_pretrained( reward_model_name, diff --git a/examples/trl/reward_modeling.py b/examples/trl/reward_modeling.py index a57cfa575a..e15d73309a 100644 --- a/examples/trl/reward_modeling.py +++ b/examples/trl/reward_modeling.py @@ -1,24 +1,22 @@ # copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/reward_modeling.py, enable it for Gaudi2 from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional import evaluate import numpy as np import torch -import torch.nn as nn from datasets import load_dataset from peft import LoraConfig, TaskType, get_peft_model from transformers import ( AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser, - PreTrainedTokenizerBase, TrainerCallback, ) -from transformers.utils import PaddingStrategy -from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments +from optimum.habana import GaudiConfig, GaudiTrainingArguments +from optimum.habana.trl import GaudiRewardTrainer, RewardDataCollatorWithPadding # Define and parse arguments. @@ -115,7 +113,6 @@ class ScriptArguments: if script_args.eval_subset > 0: eval_dataset = eval_dataset.select(range(script_args.eval_subset)) # Define the training args. Needs to be done before the model is loaded if you are using deepspeed. -model_name_split = script_args.model_name.split("/")[-1] training_args = GaudiTrainingArguments( output_dir=script_args.output_dir, @@ -215,56 +212,6 @@ def preprocess_function(examples): lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length ) - -# We need to define a special data collator that batches the data in our j vs k format. 
-@dataclass -class RewardDataCollatorWithPadding: - tokenizer: PreTrainedTokenizerBase - padding: Union[bool, str, PaddingStrategy] = True - max_length: Optional[int] = None - pad_to_multiple_of: Optional[int] = None - return_tensors: str = "pt" - - def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: - features_j = [] - features_k = [] - for feature in features: - features_j.append( - { - "input_ids": feature["input_ids_j"], - "attention_mask": feature["attention_mask_j"], - } - ) - features_k.append( - { - "input_ids": feature["input_ids_k"], - "attention_mask": feature["attention_mask_k"], - } - ) - batch_j = self.tokenizer.pad( - features_j, - padding=self.padding, - max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors=self.return_tensors, - ) - batch_k = self.tokenizer.pad( - features_k, - padding=self.padding, - max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors=self.return_tensors, - ) - batch = { - "input_ids_j": batch_j["input_ids"], - "attention_mask_j": batch_j["attention_mask"], - "input_ids_k": batch_k["input_ids"], - "attention_mask_k": batch_k["attention_mask"], - "return_loss": True, - } - return batch - - # Define the metric that we'll use for validation. accuracy = evaluate.load("accuracy") @@ -278,23 +225,12 @@ def compute_metrics(eval_pred): return accuracy.compute(predictions=predictions, references=labels) -class RewardTrainer(GaudiTrainer): - # Define how to compute the reward loss. 
We use the InstructGPT pairwise logloss: https://arxiv.org/abs/2203.02155 - def compute_loss(self, model, inputs, return_outputs=False): - rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0] - rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0] - loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean() - if return_outputs: - return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k} - return loss - - gaudi_config = GaudiConfig() gaudi_config.use_fused_adam = True gaudi_config.use_fused_clip_norm = True # Train the model, woohoo. -trainer = RewardTrainer( +trainer = GaudiRewardTrainer( model=model, gaudi_config=gaudi_config, args=training_args, diff --git a/optimum/habana/trl/__init__.py b/optimum/habana/trl/__init__.py index 90c9624dc1..838d54560a 100644 --- a/optimum/habana/trl/__init__.py +++ b/optimum/habana/trl/__init__.py @@ -2,4 +2,5 @@ from .trainer.dpo_trainer import GaudiDPOTrainer from .trainer.ppo_config import GaudiPPOConfig from .trainer.ppo_trainer import GaudiPPOTrainer +from .trainer.reward_trainer import GaudiRewardTrainer, RewardDataCollatorWithPadding from .trainer.sft_trainer import GaudiSFTTrainer diff --git a/optimum/habana/trl/models/__init__.py b/optimum/habana/trl/models/__init__.py index 22bf871003..36736572c3 100644 --- a/optimum/habana/trl/models/__init__.py +++ b/optimum/habana/trl/models/__init__.py @@ -1,5 +1,3 @@ -# flake8: noqa - # Copyright 2022 The HuggingFace Team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/optimum/habana/trl/models/modeling_base.py b/optimum/habana/trl/models/modeling_base.py index ad02ffd1a7..fcdc7ddc3f 100644 --- a/optimum/habana/trl/models/modeling_base.py +++ b/optimum/habana/trl/models/modeling_base.py @@ -15,9 +15,7 @@ import torch from trl import PreTrainedModelWrapper -from trl.import_utils import is_npu_available, is_xpu_available -from optimum.habana.accelerate import GaudiPartialState as PartialState from optimum.habana.utils import to_device_dtype @@ -31,15 +29,10 @@ def gaudi_get_current_device(): Copied from PreTrainedModelWrapper._get_current_device: https://github.com/huggingface/trl/blob/v0.7.6/trl/models/modeling_base.py#L392 - add hpu device """ - state = PartialState() - if is_xpu_available(): - return f"xpu:{state.local_process_index}" - elif is_npu_available(): - return f"npu:{state.local_process_index}" - elif hasattr(torch, "hpu") and torch.hpu.is_available(): + if hasattr(torch, "hpu") and torch.hpu.is_available(): return "hpu" else: - return state.local_process_index if torch.cuda.is_available() else "cpu" + return "cpu" def gaudi_save_pretrained(self, *args, **kwargs): diff --git a/optimum/habana/trl/trainer/__init__.py b/optimum/habana/trl/trainer/__init__.py index b17262fc82..700ea8355d 100644 --- a/optimum/habana/trl/trainer/__init__.py +++ b/optimum/habana/trl/trainer/__init__.py @@ -21,3 +21,4 @@ from .dpo_trainer import GaudiDPOTrainer from .ppo_config import GaudiPPOConfig from .ppo_trainer import GaudiPPOTrainer +from .reward_trainer import GaudiRewardTrainer, RewardDataCollatorWithPadding diff --git a/optimum/habana/trl/trainer/ppo_config.py b/optimum/habana/trl/trainer/ppo_config.py index 03bf06dbca..12a3a9185e 100644 --- a/optimum/habana/trl/trainer/ppo_config.py +++ b/optimum/habana/trl/trainer/ppo_config.py @@ -25,12 +25,13 @@ class GaudiPPOConfig(PPOConfig): """ use_habana: bool = False - """Use habana. 
Only applicable if use_habana is True""" + """Indicate if habana is used""" pad_for_acceleration: bool = False - """Use pad_for_acceleration. Only applicable if pad_for_acceleration is True""" + """Indicate if padding is used for acceleration. """ pad_max_len: int = 0 - """Use pad_for_acceleration. Only applicable if pad_for_acceleration is True""" + """max total length including padding. Only applicable if pad_for_acceleration is True""" pad_max_input_len: int = 0 + """max input length including padding. Only applicable if pad_for_acceleration is True""" def __post_init__(self): self.backward_batch_size = self.mini_batch_size * self.gradient_accumulation_steps @@ -61,7 +62,7 @@ def __post_init__(self): ) if self.use_habana: - from optimum.habana.transformers.modeling_utils import ( # pylint: disable=E0611, E0401 + from optimum.habana.transformers.modeling_utils import ( adapt_transformers_to_gaudi, ) diff --git a/optimum/habana/trl/trainer/ppo_trainer.py b/optimum/habana/trl/trainer/ppo_trainer.py index 20d18d2996..ec7fae4727 100644 --- a/optimum/habana/trl/trainer/ppo_trainer.py +++ b/optimum/habana/trl/trainer/ppo_trainer.py @@ -38,7 +38,7 @@ stack_dicts, stats_to_np, ) -from trl.import_utils import is_npu_available, is_torch_greater_2_0, is_xpu_available +from trl.import_utils import is_torch_greater_2_0 from trl.models import SUPPORTED_ARCHITECTURES, PreTrainedModelWrapper, create_reference_model from trl.trainer import AdaptiveKLController, BaseTrainer, FixedKLController, RunningMoments @@ -47,7 +47,7 @@ from . 
import GaudiPPOConfig -class GaudiPPOTrainer(PPOTrainer, BaseTrainer): +class GaudiPPOTrainer(PPOTrainer): def __init__( self, config: GaudiPPOConfig = None, @@ -63,7 +63,7 @@ def __init__( """ Copied from PPOTrainer.__init__: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L145 The only differences are: - - add new args for guadi in config + - add new args for Guadi in config - use GaudiAccelerator instead of Accelerator """ BaseTrainer.__init__(self, config) @@ -84,7 +84,7 @@ def __init__( ) # Step 1: Initialize Accelerator if config.use_habana: - from optimum.habana.accelerate import GaudiAccelerator as Accelerator # pylint: disable=E0611, E0401 + from optimum.habana.accelerate import GaudiAccelerator as Accelerator else: from accelerate import Accelerator self.accelerator = Accelerator( @@ -250,14 +250,10 @@ def __init__( if not getattr(self.model, "is_sequential_parallel", False): self.current_device = self.accelerator.device else: - if is_xpu_available(): - self.current_device = torch.device("xpu:0") - elif is_npu_available(): - self.current_device = torch.device("npu:0") - elif self.accelerator.device.type == "hpu": + if self.accelerator.device.type == "hpu": self.current_device = torch.device("hpu") else: - self.current_device = torch.device("cuda:0") + self.current_device = torch.device("cpu") PPODecorators.optimize_device_cache = self.config.optimize_device_cache @@ -558,7 +554,7 @@ def step( early_stop = False if self.config.use_habana: self.unwrap_fw_for_hpu_graph_mode(self.model) - import habana_frameworks.torch as ht # pylint: disable=E0611, E0401 + import habana_frameworks.torch as ht model = self.accelerator.unwrap_model(self.model) if not hasattr(model, "wrap_train_in_graph"): diff --git a/optimum/habana/trl/trainer/reward_trainer.py b/optimum/habana/trl/trainer/reward_trainer.py new file mode 100644 index 0000000000..04e1575a3a --- /dev/null +++ b/optimum/habana/trl/trainer/reward_trainer.py @@ -0,0 +1,82 @@ +# Copyright 
2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + +import torch.nn as nn +from transformers import ( + PreTrainedTokenizerBase, +) +from transformers.utils import PaddingStrategy + +from optimum.habana import GaudiTrainer + + +class GaudiRewardTrainer(GaudiTrainer): + # Define how to compute the reward loss. We use the InstructGPT pairwise logloss: https://arxiv.org/abs/2203.02155 + def compute_loss(self, model, inputs, return_outputs=False): + rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0] + rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0] + loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean() + if return_outputs: + return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k} + return loss + + +@dataclass +class RewardDataCollatorWithPadding: + tokenizer: PreTrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + return_tensors: str = "pt" + + def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: + features_j = [] + features_k = [] + for feature in features: + features_j.append( + { + "input_ids": feature["input_ids_j"], + "attention_mask": feature["attention_mask_j"], + } + ) + features_k.append( + { + "input_ids": 
feature["input_ids_k"], + "attention_mask": feature["attention_mask_k"], + } + ) + batch_j = self.tokenizer.pad( + features_j, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + ) + batch_k = self.tokenizer.pad( + features_k, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + ) + batch = { + "input_ids_j": batch_j["input_ids"], + "attention_mask_j": batch_j["attention_mask"], + "input_ids_k": batch_k["input_ids"], + "attention_mask_k": batch_k["attention_mask"], + "return_loss": True, + } + return batch From c058ad2f262ee362bd3c0540867a15a81d2c40a6 Mon Sep 17 00:00:00 2001 From: "Wang, Yi A" Date: Mon, 5 Feb 2024 04:53:25 -0800 Subject: [PATCH 4/6] update PPO Signed-off-by: Wang, Yi A --- examples/trl/ppo.py | 1 - optimum/habana/trl/trainer/ppo_config.py | 3 +-- optimum/habana/trl/trainer/ppo_trainer.py | 4 ++-- optimum/habana/trl/trainer/reward_trainer.py | 9 ++++++++- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/examples/trl/ppo.py b/examples/trl/ppo.py index c46ef1f450..22ea73ab03 100644 --- a/examples/trl/ppo.py +++ b/examples/trl/ppo.py @@ -97,7 +97,6 @@ class ScriptArguments: init_kl_coef=script_args.init_kl_coef, adap_kl_ctrl=script_args.adap_kl_ctrl, use_habana=script_args.use_habana, - pad_for_acceleration=script_args.use_habana, pad_max_len=script_args.input_max_length + script_args.output_max_length, pad_max_input_len=script_args.input_max_length, ) diff --git a/optimum/habana/trl/trainer/ppo_config.py b/optimum/habana/trl/trainer/ppo_config.py index 12a3a9185e..49e798be6b 100644 --- a/optimum/habana/trl/trainer/ppo_config.py +++ b/optimum/habana/trl/trainer/ppo_config.py @@ -51,10 +51,9 @@ def __post_init__(self): raise ImportError( "Please install wandb to use wandb logging. You can do this by running `pip install wandb`." 
) + self.pad_for_acceleration = (self.pad_max_len > 0) and (self.pad_max_input_len > 0) if self.pad_for_acceleration: - if self.pad_max_input_len == 0: - raise AssertionError("pad_max_input_len ({self.pad_max_input_len}) must be set for pad input ") if self.pad_max_input_len >= self.pad_max_len: raise AssertionError( "pad_max_input_len ({self.pad_max_input_len}) must be smaller " diff --git a/optimum/habana/trl/trainer/ppo_trainer.py b/optimum/habana/trl/trainer/ppo_trainer.py index ec7fae4727..16c3534332 100644 --- a/optimum/habana/trl/trainer/ppo_trainer.py +++ b/optimum/habana/trl/trainer/ppo_trainer.py @@ -63,7 +63,7 @@ def __init__( """ Copied from PPOTrainer.__init__: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L145 The only differences are: - - add new args for Guadi in config + - add new args for Gaudi in config - use GaudiAccelerator instead of Accelerator """ BaseTrainer.__init__(self, config) @@ -825,7 +825,7 @@ def train_minibatch( if self.accelerator.sync_gradients: self.accelerator.clip_grad_norm_(self.model_params, self.config.max_grad_norm) self.optimizer.step() - if self.config.use_habana: # pragma: no cover + if self.config.use_habana: self.htcore.mark_step() # we call optimizer.zero_grad() every time and let `accelerator` handle accumulation # see https://huggingface.co/docs/accelerate/usage_guides/gradient_accumulation#the-finished-code diff --git a/optimum/habana/trl/trainer/reward_trainer.py b/optimum/habana/trl/trainer/reward_trainer.py index 04e1575a3a..bbb0c761fe 100644 --- a/optimum/habana/trl/trainer/reward_trainer.py +++ b/optimum/habana/trl/trainer/reward_trainer.py @@ -24,7 +24,10 @@ class GaudiRewardTrainer(GaudiTrainer): - # Define how to compute the reward loss. 
We use the InstructGPT pairwise logloss: https://arxiv.org/abs/2203.02155 + """ + Copied from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/reward_modeling.py#L266 + """ + def compute_loss(self, model, inputs, return_outputs=False): rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0] rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0] @@ -36,6 +39,10 @@ def compute_loss(self, model, inputs, return_outputs=False): @dataclass class RewardDataCollatorWithPadding: + """ + Copied from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/reward_modeling.py#L206 + """ + tokenizer: PreTrainedTokenizerBase padding: Union[bool, str, PaddingStrategy] = True max_length: Optional[int] = None From f563ad84b0fea3d3a4a82c4c34144e27a4267e94 Mon Sep 17 00:00:00 2001 From: "Wang, Yi A" Date: Thu, 8 Feb 2024 04:33:14 -0800 Subject: [PATCH 5/6] add evaluate and scikit-learn to requirement.txt Signed-off-by: Wang, Yi A --- examples/trl/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/trl/requirements.txt b/examples/trl/requirements.txt index c980a4b30c..2c944e2168 100644 --- a/examples/trl/requirements.txt +++ b/examples/trl/requirements.txt @@ -3,3 +3,5 @@ peft == 0.6.2 datasets wandb tyro +evaluate +scikit-learn From 4a03a4b879c6a342cf7d493ef6698abfad4c96a9 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Sun, 11 Feb 2024 05:43:11 +0100 Subject: [PATCH 6/6] Remove W&B logs in example --- examples/trl/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/trl/README.md b/examples/trl/README.md index 8049349da6..537e1dbb4e 100644 --- a/examples/trl/README.md +++ b/examples/trl/README.md @@ -120,7 +120,6 @@ There are three main steps to the PPO training process: 3. 
RL fine-tuning of llama-v2-7b-se with the llama-v2-7b-se-rm reward model: ``` python ../gaudi_spawn.py --world_size 8 --use_mpi ppo.py \ - --log_with=wandb \ --model_name=./sft/final_merged_checkpoint \ --reward_model_name=./rm_merged_checkpoint \ --tokenizer_name=meta-llama/Llama-2-7b-hf \