From 6102aa1c3a76675917f96569eab53d99bf39d2f1 Mon Sep 17 00:00:00 2001
From: "Wang, Yi A" <yi.a.wang@intel.com>
Date: Wed, 27 Dec 2023 01:58:44 -0800
Subject: [PATCH 1/6] add PPO and stack_llama support

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---
 examples/trl/stack_llama/README.md            |  18 +
 .../trl/stack_llama/merge_peft_adapter.py     |  50 +
 examples/trl/stack_llama/reward_modeling.py   | 313 +++++++
 examples/trl/stack_llama/rl_training.py       | 294 ++++++
 .../trl/stack_llama/supervised_finetuning.py  | 215 +++++
 optimum/habana/trl/__init__.py                |   3 +
 optimum/habana/trl/models/__init__.py         |  17 +
 optimum/habana/trl/models/modeling_base.py    |  71 ++
 optimum/habana/trl/trainer/__init__.py        |   2 +
 optimum/habana/trl/trainer/ppo_config.py      |  70 ++
 optimum/habana/trl/trainer/ppo_trainer.py     | 881 ++++++++++++++++++
 11 files changed, 1934 insertions(+)
 create mode 100644 examples/trl/stack_llama/README.md
 create mode 100644 examples/trl/stack_llama/merge_peft_adapter.py
 create mode 100644 examples/trl/stack_llama/reward_modeling.py
 create mode 100644 examples/trl/stack_llama/rl_training.py
 create mode 100644 examples/trl/stack_llama/supervised_finetuning.py
 create mode 100644 optimum/habana/trl/models/__init__.py
 create mode 100644 optimum/habana/trl/models/modeling_base.py
 create mode 100644 optimum/habana/trl/trainer/ppo_config.py
 create mode 100644 optimum/habana/trl/trainer/ppo_trainer.py

diff --git a/examples/trl/stack_llama/README.md b/examples/trl/stack_llama/README.md
new file mode 100644
index 0000000000..51a9728ed4
--- /dev/null
+++ b/examples/trl/stack_llama/README.md
@@ -0,0 +1,18 @@
+# RLHF pipeline for the creation of StackLLaMa: a Stack exchange llama-7b model.
+There were three main steps to the training process:
+1. Supervised fine-tuning of the base llama-7b model to create llama-7b-se:
+    - `torchrun --nnodes 1  --nproc_per_node 8 supervised_finetuning.py --model_path=<LLAMA_MODEL_PATH> --streaming --learning_rate 1e-5 --max_steps 5000 --bf16 --output_dir ./llama-se`
+2. Reward modeling using dialog pairs from the SE dataset using the llama-7b-se to create llama-7b-se-rm:
+    - `torchrun --nnodes 1  --nproc_per_node 8 reward_modeling.py --model_name=<LLAMA_SE_MODEL>`
+3. RL fine-tuning of llama-7b-se with the llama-7b-se-rm reward model:
+    - `torchrun --nnodes 1  --nproc_per_node 8 rl_training.py --log_with=wandb --model_name=<LLAMA_SE_MODEL> --reward_model_name=<LLAMA_SE_RM_MODEL> --adafactor=False --tokenizer_name=<LLAMA_TOKENIZER> --save_freq=100 --output_max_length=128 --batch_size=8 --gradient_accumulation_steps=8 --batched_gen=True --ppo_epochs=4 --seed=0 --learning_rate=1.4e-5 --early_stopping=True --output_dir=llama-se-rl-finetune-128-8-8-1.4e-5_adam`
+
+
+LoRA layers were used at all stages to reduce memory requirements.
+At each stage the peft adapter layers were merged with the base model, using:
+```shell
+python merge_peft_adapter.py --adapter_model_name=XXX --base_model_name=YYY --output_name=ZZZ
+```
+Note that this script requires `peft>=0.3.0`.
+
+For access to the base llama-7b model, please see Meta's [release](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) and [request form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform).
diff --git a/examples/trl/stack_llama/merge_peft_adapter.py b/examples/trl/stack_llama/merge_peft_adapter.py
new file mode 100644
index 0000000000..8913fc62a4
--- /dev/null
+++ b/examples/trl/stack_llama/merge_peft_adapter.py
@@ -0,0 +1,50 @@
+# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py.
+# only difference is removal of model.push_to_hub
+from dataclasses import dataclass, field
+from typing import Optional
+
+import torch
+from peft import PeftConfig, PeftModel
+from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser
+
+
+@dataclass
+class ScriptArguments:
+    """
+    The input names representing the Adapter and Base model fine-tuned with PEFT, and the output name representing the
+    merged model.
+    """
+
+    adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the adapter name"})
+    base_model_name: Optional[str] = field(default=None, metadata={"help": "the base model name"})
+    output_name: Optional[str] = field(default=None, metadata={"help": "the merged model name"})
+
+
+parser = HfArgumentParser(ScriptArguments)
+script_args = parser.parse_args_into_dataclasses()[0]
+assert script_args.adapter_model_name is not None, "please provide the name of the Adapter you would like to merge"
+assert script_args.base_model_name is not None, "please provide the name of the Base model"
+assert script_args.output_name is not None, "please provide the output name of the merged model"
+
+peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name)
+if peft_config.task_type == "SEQ_CLS":
+    # The sequence classification task is used for the reward model in PPO
+    model = AutoModelForSequenceClassification.from_pretrained(
+        script_args.base_model_name, num_labels=1, torch_dtype=torch.bfloat16
+    )
+else:
+    model = AutoModelForCausalLM.from_pretrained(
+        script_args.base_model_name, return_dict=True, torch_dtype=torch.bfloat16
+    )
+
+tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name)
+
+# Load the PEFT model
+model = PeftModel.from_pretrained(model, script_args.adapter_model_name)
+model.eval()
+
+model = model.merge_and_unload()
+
+model.save_pretrained(f"{script_args.output_name}")
+tokenizer.save_pretrained(f"{script_args.output_name}")
+# model.push_to_hub(f"{script_args.output_name}", use_temp_dir=False)
diff --git a/examples/trl/stack_llama/reward_modeling.py b/examples/trl/stack_llama/reward_modeling.py
new file mode 100644
index 0000000000..32ce0faf50
--- /dev/null
+++ b/examples/trl/stack_llama/reward_modeling.py
@@ -0,0 +1,313 @@
+# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/reward_modeling.py, enable it for Gaudi2
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Union
+
+import evaluate
+import numpy as np
+import torch
+import torch.nn as nn
+from datasets import load_dataset
+from peft import LoraConfig, TaskType, get_peft_model
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    HfArgumentParser,
+    PreTrainedTokenizerBase,
+    TrainerCallback,
+)
+from transformers.utils import PaddingStrategy
+
+from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments
+
+
+# Define and parse arguments.
+@dataclass
+class ScriptArguments:
+    """
+    These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train.
+    """
+
+    local_rank: Optional[int] = field(default=-1, metadata={"help": "Used for multi-gpu"})
+    resume_from_checkpoint: Optional[bool] = field(
+        default=False,
+        metadata={"help": "If you want to resume training where it left off."},
+    )
+    deepspeed: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Path to deepspeed config if using deepspeed. You may need this if the model that you want to train doesn't fit on a single GPU."
+        },
+    )
+    per_device_train_batch_size: Optional[int] = field(default=4)
+    per_device_eval_batch_size: Optional[int] = field(default=1)
+    gradient_accumulation_steps: Optional[int] = field(default=1)
+    learning_rate: Optional[float] = field(default=2e-5)
+    weight_decay: Optional[float] = field(default=0.001)
+    model_name: Optional[str] = field(
+        default="gpt2",
+        metadata={
+            "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
+        },
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The tokenizer for your model, if left empty will use the default for your model",
+        },
+    )
+    bf16: Optional[bool] = field(
+        default=True,
+        metadata={
+            "help": "This essentially cuts the training time in half if you want to sacrifice a little precision and have a supported GPU."
+        },
+    )
+    num_train_epochs: Optional[int] = field(
+        default=1,
+        metadata={"help": "The number of training epochs for the reward model."},
+    )
+    train_subset: Optional[int] = field(
+        default=100000,
+        metadata={"help": "The size of the subset of the training data to use"},
+    )
+    eval_subset: Optional[int] = field(
+        default=50000,
+        metadata={"help": "The size of the subset of the eval data to use"},
+    )
+    gradient_checkpointing: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables gradient checkpointing."},
+    )
+    optim: Optional[str] = field(
+        default="adamw_hf",
+        metadata={"help": "The optimizer to use."},
+    )
+    lr_scheduler_type: Optional[str] = field(
+        default="linear",
+        metadata={"help": "The lr scheduler"},
+    )
+    max_length: Optional[int] = field(default=512)
+    eval_first_step: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Whether to run eval after the first step"},
+    )
+
+
+parser = HfArgumentParser(ScriptArguments)
+script_args = parser.parse_args_into_dataclasses()[0]
+
+# Load the human stack-exchange-paired dataset for tuning the reward model.
+train_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/reward", split="train")
+if script_args.train_subset > 0:
+    train_dataset = train_dataset.select(range(script_args.train_subset))
+eval_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/evaluation", split="train")
+if script_args.eval_subset > 0:
+    eval_dataset = eval_dataset.select(range(script_args.eval_subset))
+# Define the training args. Needs to be done before the model is loaded if you are using deepspeed.
+model_name_split = script_args.model_name.split("/")[-1]
+output_name = (
+    f"{model_name_split}_peft_stack-exchange-paired_rmts__{script_args.train_subset}_{script_args.learning_rate}"
+)
+
+training_args = GaudiTrainingArguments(
+    output_dir=output_name,
+    learning_rate=script_args.learning_rate,
+    per_device_train_batch_size=script_args.per_device_train_batch_size,
+    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
+    num_train_epochs=script_args.num_train_epochs,
+    weight_decay=script_args.weight_decay,
+    evaluation_strategy="steps",
+    eval_steps=500,
+    save_strategy="steps",
+    save_steps=500,
+    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
+    gradient_checkpointing=script_args.gradient_checkpointing,
+    deepspeed=script_args.deepspeed,
+    local_rank=script_args.local_rank,
+    remove_unused_columns=False,
+    label_names=[],
+    bf16=script_args.bf16,
+    logging_strategy="steps",
+    logging_steps=10,
+    optim=script_args.optim,
+    lr_scheduler_type=script_args.lr_scheduler_type,
+    report_to="none",
+    use_habana=True,
+    use_lazy_mode=True,
+)
+# Load the value-head model and tokenizer.
+tokenizer_name = script_args.tokenizer_name if script_args.tokenizer_name is not None else script_args.model_name
+tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_auth_token=True)
+tokenizer.pad_token = tokenizer.eos_token
+
+
+peft_config = LoraConfig(
+    task_type=TaskType.SEQ_CLS,  # LoRA adapter for a sequence-classification model
+    inference_mode=False,  # the adapter is going to be trained
+    r=8,
+    lora_alpha=32,
+    lora_dropout=0.1,
+)
+torch.autograd.set_detect_anomaly(True)  # NOTE(review): debugging aid that slows autograd; confirm it is required here
+model = AutoModelForSequenceClassification.from_pretrained(
+    script_args.model_name, num_labels=1, torch_dtype=torch.bfloat16  # loaded in bfloat16 with one label
+)
+
+model = get_peft_model(model, peft_config)  # wrap the base model with the LoRA configuration above
+model.print_trainable_parameters()
+
+# Need to do this for gpt2, because it doesn't have an official pad token.
+tokenizer.pad_token = tokenizer.eos_token
+model.config.pad_token_id = tokenizer.eos_token_id
+model.config.use_cache = not script_args.gradient_checkpointing
+num_proc = 24  # Can adjust to be higher if you have more processors.
+original_columns = train_dataset.column_names
+
+
+# Turn the dataset into pairs of post + summaries, where text_j is the preferred question + answer and text_k is the other.
+# Then tokenize the dataset.
+def preprocess_function(examples):
+    """Tokenize each question together with its "j" answer and with its "k" answer.
+
+    Returns a dict of four parallel lists keyed "input_ids_j", "attention_mask_j",
+    "input_ids_k" and "attention_mask_k", one entry per example in the batch.
+    """
+    batch = {f"{field}_{side}": [] for side in ("j", "k") for field in ("input_ids", "attention_mask")}
+    rows = zip(examples["question"], examples["response_j"], examples["response_k"])
+    for question, response_j, response_k in rows:
+        prompt = "Question: " + question + "\n\nAnswer: "
+        # Same prompt for both sides; only the answer text differs.
+        for side, answer in (("j", response_j), ("k", response_k)):
+            encoded = tokenizer(prompt + answer, truncation=True)
+            batch[f"input_ids_{side}"].append(encoded["input_ids"])
+            batch[f"attention_mask_{side}"].append(encoded["attention_mask"])
+
+    return batch
+
+
+# preprocess the dataset and filter out QAs that are longer than script_args.max_length
+train_dataset = train_dataset.map(
+    preprocess_function,
+    batched=True,
+    num_proc=num_proc,
+    remove_columns=original_columns,
+)
+train_dataset = train_dataset.filter(
+    lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length
+)
+
+eval_dataset = eval_dataset.map(
+    preprocess_function,
+    batched=True,
+    num_proc=num_proc,
+    remove_columns=original_columns,
+)
+eval_dataset = eval_dataset.filter(
+    lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length
+)
+
+
+# We need to define a special data collator that batches the data in our j vs k format.
+@dataclass
+class RewardDataCollatorWithPadding:
+    """Pad the "j" and "k" halves of each example separately and return them as one batch.
+
+    Every feature must provide `input_ids_j`, `attention_mask_j`, `input_ids_k` and
+    `attention_mask_k`. The fields other than `tokenizer` are forwarded unchanged to
+    `tokenizer.pad`.
+    """
+
+    tokenizer: PreTrainedTokenizerBase
+    padding: Union[bool, str, PaddingStrategy] = True
+    max_length: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
+    return_tensors: str = "pt"
+
+    def _pad_side(self, features: List[Dict[str, Any]], side: str) -> Dict[str, Any]:
+        """Collect the `input_ids_<side>` / `attention_mask_<side>` columns and pad them."""
+        rows = [
+            {
+                "input_ids": feature[f"input_ids_{side}"],
+                "attention_mask": feature[f"attention_mask_{side}"],
+            }
+            for feature in features
+        ]
+        return self.tokenizer.pad(
+            rows,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors=self.return_tensors,
+        )
+
+    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Collate a list of paired examples into one padded batch.
+
+        Returns the padded `*_j` / `*_k` entries plus `"return_loss": True`.
+        """
+        padded_j = self._pad_side(features, "j")
+        padded_k = self._pad_side(features, "k")
+        return {
+            "input_ids_j": padded_j["input_ids"],
+            "attention_mask_j": padded_j["attention_mask"],
+            "input_ids_k": padded_k["input_ids"],
+            "attention_mask_k": padded_k["attention_mask"],
+            "return_loss": True,
+        }
+
+
+# Define the metric that we'll use for validation.
+accuracy = evaluate.load("accuracy")
+
+
+def compute_metrics(eval_pred):
+    predictions, _ = eval_pred
+    # Here, predictions is rewards_j and rewards_k.
+    # We want to see how much of the time rewards_j > rewards_k.
+    predictions = np.argmax(predictions, axis=0)
+    labels = np.zeros(predictions.shape)
+    return accuracy.compute(predictions=predictions, references=labels)
+
+
+class RewardTrainer(GaudiTrainer):
+    # Define how to compute the reward loss. We use the InstructGPT pairwise logloss: https://arxiv.org/abs/2203.02155
+    def compute_loss(self, model, inputs, return_outputs=False):
+        rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0]
+        rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0]
+        loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()
+        if return_outputs:
+            return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
+        return loss
+
+
+gaudi_config = GaudiConfig()
+gaudi_config.use_fused_adam = True
+gaudi_config.use_fused_clip_norm = True
+
+# Train the model, woohoo.
+trainer = RewardTrainer(
+    model=model,
+    gaudi_config=gaudi_config,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    compute_metrics=compute_metrics,
+    data_collator=RewardDataCollatorWithPadding(
+        tokenizer=tokenizer, max_length=script_args.max_length, padding="max_length"
+    ),
+)
+
+
+if script_args.eval_first_step:
+
+    class EvaluateFirstStepCallback(TrainerCallback):
+        def on_step_end(self, args, state, control, **kwargs):
+            if state.global_step == 1:
+                control.should_evaluate = True
+
+    trainer.add_callback(EvaluateFirstStepCallback())
+
+trainer.train(script_args.resume_from_checkpoint)
+
+print("Saving last checkpoint of the model")
+trainer.save_model(output_name + "_peft_last_checkpoint")
diff --git a/examples/trl/stack_llama/rl_training.py b/examples/trl/stack_llama/rl_training.py
new file mode 100644
index 0000000000..53ec5b7251
--- /dev/null
+++ b/examples/trl/stack_llama/rl_training.py
@@ -0,0 +1,294 @@
+# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/rl_training.py, enable it for Gaudi2
+from dataclasses import dataclass, field
+from typing import Optional
+
+import torch
+from datasets import load_dataset
+from peft import LoraConfig
+from tqdm import tqdm
+from transformers import Adafactor, AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser, pipeline
+from trl import AutoModelForCausalLMWithValueHead
+from trl.core import LengthSampler
+
+from optimum.habana.accelerate import GaudiAccelerator
+from optimum.habana.trl import GaudiPPOConfig, GaudiPPOTrainer, adapt_PreTrainedModelWrapper_to_gaudi
+from optimum.habana.utils import set_seed
+
+
+tqdm.pandas()
+
+
+@dataclass
+class ScriptArguments:
+    """
+    Command-line arguments for fine-tuning a causal LM with PPO.
+    """
+
+    # NOTE: gpt2 models use Conv1D instead of Linear layers which are not yet supported in 8 bit mode
+    # models like gpt-neo* models are more suitable.
+    model_name: Optional[str] = field(default="", metadata={"help": "the model name"})
+    tokenizer_name: Optional[str] = field(default="", metadata={"help": "the tokenizer name"})
+    reward_model_name: Optional[str] = field(default="", metadata={"help": "the reward model name"})
+    log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"})
+    learning_rate: Optional[float] = field(default=1.41e-5, metadata={"help": "the learning rate"})
+    output_max_length: Optional[int] = field(default=128, metadata={"help": "maximum length for generation"})
+    mini_batch_size: Optional[int] = field(default=1, metadata={"help": "the PPO minibatch size"})
+    batch_size: Optional[int] = field(default=32, metadata={"help": "the batch size"})
+    ppo_epochs: Optional[int] = field(default=4, metadata={"help": "the number of ppo epochs"})
+    gradient_accumulation_steps: Optional[int] = field(
+        default=4, metadata={"help": "the number of gradient accumulation steps"}
+    )
+    adafactor: Optional[bool] = field(default=False, metadata={"help": "whether to use the adafactor optimizer"})
+    early_stopping: Optional[bool] = field(default=False, metadata={"help": "whether to early stop"})
+    target_kl: Optional[float] = field(default=0.1, metadata={"help": "kl target for early stopping"})
+    reward_baseline: Optional[float] = field(
+        default=0.0,
+        metadata={"help": "a baseline value that is subtracted from the reward"},
+    )
+    batched_gen: Optional[bool] = field(default=False, metadata={"help": "whether to use the batched text gen"})
+    save_freq: Optional[int] = field(default=None, metadata={"help": "n steps to save the model"})
+    output_dir: Optional[str] = field(default="runs/", metadata={"help": "path prefix for saved checkpoints"})
+    seed: Optional[int] = field(default=0, metadata={"help": "the seed"})
+    steps: Optional[int] = field(default=20000, metadata={"help": "number of training steps"})
+    init_kl_coef: Optional[float] = field(
+        default=0.2,
+        metadata={"help": "Initial KL penalty coefficient (used for adaptive and linear control)"},
+    )
+
+    adap_kl_ctrl: Optional[bool] = field(default=True, metadata={"help": "Use adaptive KL control, otherwise linear"})
+    use_habana: Optional[bool] = field(default=True, metadata={"help": "use habana for RL training"})
+
+
+adapt_PreTrainedModelWrapper_to_gaudi()
+parser = HfArgumentParser(ScriptArguments)
+script_args: ScriptArguments = parser.parse_args_into_dataclasses()[0]
+reward_model_name = script_args.reward_model_name
+dataset_name = "lvwerra/stack-exchange-paired"
+config = GaudiPPOConfig(
+    steps=script_args.steps,
+    model_name=script_args.model_name,
+    learning_rate=script_args.learning_rate,
+    log_with=script_args.log_with,
+    batch_size=script_args.batch_size,
+    mini_batch_size=script_args.mini_batch_size,
+    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
+    optimize_cuda_cache=True,
+    early_stopping=script_args.early_stopping,
+    target_kl=script_args.target_kl,
+    ppo_epochs=script_args.ppo_epochs,
+    seed=script_args.seed,
+    init_kl_coef=script_args.init_kl_coef,
+    adap_kl_ctrl=script_args.adap_kl_ctrl,
+    use_habana=script_args.use_habana,
+    pad_for_acceleration=script_args.use_habana,
+    pad_max_len=512 + script_args.output_max_length,
+    pad_max_input_len=512,
+)
+
+train_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/rl", split="train")
+train_dataset = train_dataset.select(range(100000))
+original_columns = train_dataset.column_names
+
+# We then define the arguments to pass to the reward-model pipeline (a "sentiment-analysis" pipeline).
+# `return_all_scores=True` returns the score of every label; `function_to_apply="none"` keeps the raw scores.
+sent_kwargs = {
+    "return_all_scores": True,
+    "function_to_apply": "none",
+    "batch_size": 16,
+    "truncation": True,
+}
+if config.pad_for_acceleration:
+    sent_kwargs["padding"] = "max_length"
+    sent_kwargs["max_length"] = 512 + script_args.output_max_length
+
+tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_name)
+# Some tokenizers do not define a pad token. Padding needs one, so fall back to the
+# eos_token when none is set.
+
+if getattr(tokenizer, "pad_token", None) is None:
+    tokenizer.pad_token = tokenizer.eos_token
+
+
+# Below is an example function to build the dataset. In our case, we use the
+# stack-exchange-paired dataset loaded above into `train_dataset`. One should customize
+# this function to train the model on its own dataset.
+def build_dataset(
+    tokenizer,
+    dataset_name="lvwerra/stack-exchange-paired",
+):
+    """
+    Build the PPO training set from the module-level `train_dataset`: every question is turned
+    into a "Question: ... Answer: " prompt and tokenized; prompts of 512 tokens or more are dropped.
+
+    Args:
+        tokenizer: callable used to tokenize each prompt (called with `truncation=True`).
+        dataset_name (`str`):
+            Not used by the body; the data always comes from `train_dataset`.
+
+    Returns:
+        The mapped and filtered dataset with `query` and `input_ids` columns, in torch format.
+    """
+
+    num_proc = 24
+
+    def preprocess_function(examples):
+        new_examples = {
+            "query": [],
+            "input_ids": [],
+        }
+        for question in examples["question"]:
+            query = "Question: " + question + "\n\nAnswer: "
+            tokenized_question = tokenizer(query, truncation=True)
+            new_examples["query"].append(query)
+            new_examples["input_ids"].append(tokenized_question["input_ids"])
+
+        return new_examples
+
+    ds = train_dataset.map(
+        preprocess_function,
+        batched=True,
+        num_proc=num_proc,
+        remove_columns=original_columns,
+    )
+    ds = ds.filter(lambda x: len(x["input_ids"]) < 512, batched=False)
+
+    ds.set_format(type="torch")
+    return ds
+
+
+# We retrieve the dataloader by calling the `build_dataset` function.
+dataset = build_dataset(tokenizer)
+
+
+def collator(data):
+    return {key: [d[key] for d in data] for key in data[0]}
+
+
+# set seed before initializing value head for deterministic eval
+set_seed(config.seed)
+
+# Now let's build the model, the reference model, and the tokenizer.
+current_device = GaudiAccelerator().local_process_index
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    lora_dropout=0.05,
+    bias="none",
+    task_type="CAUSAL_LM",
+)
+model = AutoModelForCausalLMWithValueHead.from_pretrained(
+    config.model_name,
+    peft_config=lora_config,
+    torch_dtype=torch.bfloat16,
+    low_cpu_mem_usage=True,
+)
+
+optimizer = None
+model = model.to(torch.bfloat16)
+
+if script_args.use_habana:
+    ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(
+        config.model_name,
+        torch_dtype=torch.bfloat16,
+        low_cpu_mem_usage=True,
+    )
+else:
+    ref_model = None
+if script_args.adafactor:
+    optimizer = Adafactor(
+        filter(lambda p: p.requires_grad, model.parameters()),
+        scale_parameter=False,
+        relative_step=False,
+        warmup_init=False,
+        lr=config.learning_rate,
+    )
+# We then build the PPOTrainer, passing the model, the reference model, the tokenizer
+ppo_trainer = GaudiPPOTrainer(
+    config,
+    model,
+    ref_model=ref_model,
+    tokenizer=tokenizer,
+    dataset=dataset,
+    data_collator=collator,
+    optimizer=optimizer,
+)
+
+# We then build the sentiment analysis pipeline using our reward model, passing the
+# reward model and the sentiment analysis pipeline arguments. Let's also make sure to
+# set the device to the same device as the PPOTrainer.
+device = ppo_trainer.accelerator.device
+if ppo_trainer.accelerator.num_processes == 1 and torch.cuda.is_available():
+    device = 0  # single-process CUDA run: a plain device index, not a torch.device
+
+reward_model = AutoModelForSequenceClassification.from_pretrained(
+    reward_model_name,
+    num_labels=1,
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.bfloat16,
+)
+
+if config.use_habana:
+    from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+    reward_model = wrap_in_hpu_graph(reward_model)
+
+if getattr(device, "type", None) == "hpu":  # `device` may be the int 0 set above, which has no `.type`
+    device = "hpu"
+
+sentiment_pipe = pipeline(
+    "sentiment-analysis",
+    model=reward_model,
+    tokenizer=tokenizer,
+    return_token_type_ids=False,
+    device=device,
+    model_kwargs={
+        "low_cpu_mem_usage": True,
+        "torch_dtype": torch.bfloat16,
+    },
+)
+
+if sentiment_pipe.model.config.pad_token_id is None:
+    sentiment_pipe.model.config.pad_token_id = tokenizer.pad_token_id
+# We then define the arguments to pass to the `generate` function. These arguments
+# are passed to the `generate` function of the PPOTrainer, which is a wrapper around
+# the `generate` function of the trained model.
+generation_kwargs = {
+    # "min_length": -1,
+    "top_k": 0.0,
+    "top_p": 1.0,
+    "do_sample": True,
+    "pad_token_id": tokenizer.pad_token_id,
+    "eos_token_id": 100_000,
+}
+output_min_length = 32
+output_max_length = script_args.output_max_length
+if not config.pad_for_acceleration:
+    output_length_sampler = LengthSampler(output_min_length, output_max_length)
+else:
+    output_length_sampler = LengthSampler(output_max_length, output_max_length + 1)
+
+for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
+    if epoch >= config.total_ppo_epochs:
+        break
+
+    question_tensors = batch["input_ids"]
+
+    response_tensors = ppo_trainer.generate(
+        question_tensors,
+        return_prompt=False,
+        length_sampler=output_length_sampler,
+        **generation_kwargs,
+    )
+    batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)
+
+    # Compute reward score (using the sentiment analysis pipeline)
+    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
+    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
+    rewards = [torch.tensor(output[0]["score"] - script_args.reward_baseline) for output in pipe_outputs]
+
+    # Run PPO step
+    stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
+    ppo_trainer.log_stats(stats, batch, rewards)
+
+    if script_args.save_freq and epoch and epoch % script_args.save_freq == 0:
+        ppo_trainer.save_pretrained(script_args.output_dir + f"step_{epoch}")
diff --git a/examples/trl/stack_llama/supervised_finetuning.py b/examples/trl/stack_llama/supervised_finetuning.py
new file mode 100644
index 0000000000..a61bca6e3b
--- /dev/null
+++ b/examples/trl/stack_llama/supervised_finetuning.py
@@ -0,0 +1,215 @@
+# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/supervised_finetuning.py, enable it for Gaudi2
+
+import argparse
+import os
+
+import torch
+from datasets import load_dataset
+from peft import LoraConfig
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer, logging, set_seed
+from trl.trainer import ConstantLengthDataset
+
+from optimum.habana import GaudiConfig, GaudiTrainingArguments
+from optimum.habana.trl import GaudiSFTTrainer
+
+
+"""
+Fine-Tune Llama-7b on SE paired dataset
+"""
+
+
+def get_args():  # parse this script's command-line options and return the argparse namespace
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", type=str, default="")  # the empty default is rejected in the __main__ guard
+    parser.add_argument("--dataset_name", type=str, default="lvwerra/stack-exchange-paired")
+    parser.add_argument("--subset", type=str, default="data/finetune")  # forwarded to load_dataset as `data_dir`
+    parser.add_argument("--split", type=str, default="train")
+    parser.add_argument("--size_valid_set", type=int, default=4000)  # only read in streaming mode
+    parser.add_argument("--streaming", action="store_true")
+    parser.add_argument("--shuffle_buffer", type=int, default=5000)  # only read in streaming mode
+
+    parser.add_argument("--seq_length", type=int, default=1024)
+    parser.add_argument("--max_steps", type=int, default=10000)
+    parser.add_argument("--batch_size", type=int, default=4)  # used for both train and eval per-device batch size
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
+    parser.add_argument("--eos_token_id", type=int, default=49152)  # NOTE(review): not referenced elsewhere in this script
+
+    parser.add_argument("--learning_rate", type=float, default=1e-4)
+    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
+    parser.add_argument("--num_warmup_steps", type=int, default=100)
+    parser.add_argument("--weight_decay", type=float, default=0.05)
+
+    parser.add_argument("--local_rank", type=int, default=0)  # NOTE(review): not referenced elsewhere in this script
+    parser.add_argument("--fp16", action="store_true", default=False)
+    parser.add_argument("--bf16", action="store_true", default=False)
+    parser.add_argument("--gradient_checkpointing", action="store_true", default=False)
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--num_workers", type=int, default=None)  # `num_proc` for the non-streaming load_dataset call
+    parser.add_argument("--output_dir", type=str, default="./checkpoints")
+    parser.add_argument("--log_freq", default=1, type=int)  # the three *_freq options are measured in training steps
+    parser.add_argument("--eval_freq", default=1000, type=int)
+    parser.add_argument("--save_freq", default=1000, type=int)
+
+    return parser.parse_args()
+
+
+def chars_token_ratio(dataset, tokenizer, nb_examples=400):
+    """
+    Return total characters / total tokens over (at most) the first `nb_examples` formatted samples of `dataset`.
+    """
+    char_count = 0
+    token_count = 0
+    samples = zip(range(nb_examples), iter(dataset))
+    for _, record in tqdm(samples, total=nb_examples):
+        sample_text = prepare_sample_text(record)
+        char_count += len(sample_text)
+        # Fast tokenizers: count the tokens of the encoding; otherwise fall back to `tokenize`.
+        tokens = tokenizer(sample_text).tokens() if tokenizer.is_fast else tokenizer.tokenize(sample_text)
+        token_count += len(tokens)
+    return char_count / token_count
+
+
+def print_trainable_parameters(model):
+    """
+    Print the trainable parameter count, the total parameter count, and the trainable percentage of `model`.
+    """
+    # One pass over the parameters, recording (element count, requires_grad) pairs.
+    counts = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
+    all_param = sum(size for size, _ in counts)
+    trainable_params = sum(size for size, needs_grad in counts if needs_grad)
+
+    print(
+        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
+    )
+
+
+
+def prepare_sample_text(example):
+    """Build the training text for one record from its `question` and `response_j` fields."""
+    question, answer = example["question"], example["response_j"]
+    return f"Question: {question}\n\nAnswer: {answer}"
+
+
+def create_datasets(tokenizer, args):  # returns (train_dataset, valid_dataset), both wrapped in ConstantLengthDataset
+    dataset = load_dataset(
+        args.dataset_name,
+        data_dir=args.subset,
+        split=args.split,
+        use_auth_token=True,
+        num_proc=args.num_workers if not args.streaming else None,  # num_proc is only passed for non-streaming loads
+        streaming=args.streaming,
+    )
+    if args.streaming:
+        print("Loading the dataset in streaming mode")
+        valid_data = dataset.take(args.size_valid_set)  # the first size_valid_set records form the validation set
+        train_data = dataset.skip(args.size_valid_set)  # the remaining records are the training stream
+        train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=args.seed)
+    else:
+        dataset = dataset.train_test_split(test_size=0.005, seed=args.seed)  # hold out 0.5% for validation
+        train_data = dataset["train"]
+        valid_data = dataset["test"]
+        print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")
+
+    chars_per_token = chars_token_ratio(train_data, tokenizer)  # estimated on the training data, reused for validation
+    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
+
+    train_dataset = ConstantLengthDataset(
+        tokenizer,
+        train_data,
+        formatting_func=prepare_sample_text,
+        infinite=True,
+        seq_length=args.seq_length,
+        chars_per_token=chars_per_token,
+    )
+    valid_dataset = ConstantLengthDataset(
+        tokenizer,
+        valid_data,
+        formatting_func=prepare_sample_text,
+        infinite=False,
+        seq_length=args.seq_length,
+        chars_per_token=chars_per_token,
+    )
+    return train_dataset, valid_dataset
+
+
+def run_training(args, train_data, val_data):  # LoRA fine-tune args.model_path with GaudiSFTTrainer, then save the final model
+    print("Loading the model")
+
+    lora_config = LoraConfig(
+        r=16,
+        lora_alpha=32,
+        lora_dropout=0.05,
+        bias="none",
+        task_type="CAUSAL_LM",
+    )
+
+    train_data.start_iteration = 0  # NOTE(review): attribute set on the dataset object; its consumer is not visible in this script
+
+    print("Starting main loop")
+
+    training_args = GaudiTrainingArguments(
+        output_dir=args.output_dir,
+        dataloader_drop_last=True,
+        evaluation_strategy="steps",
+        max_steps=args.max_steps,
+        eval_steps=args.eval_freq,
+        save_steps=args.save_freq,
+        logging_steps=args.log_freq,
+        per_device_train_batch_size=args.batch_size,
+        per_device_eval_batch_size=args.batch_size,
+        learning_rate=args.learning_rate,
+        lr_scheduler_type=args.lr_scheduler_type,
+        warmup_steps=args.num_warmup_steps,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        gradient_checkpointing=args.gradient_checkpointing,
+        fp16=args.fp16,
+        bf16=args.bf16,
+        weight_decay=args.weight_decay,
+        run_name="llama-7b-finetuned",
+        report_to="none",
+        ddp_find_unused_parameters=False,
+        use_habana=True,
+        use_lazy_mode=True,
+    )
+    model = AutoModelForCausalLM.from_pretrained(args.model_path, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16)  # always bfloat16, independent of --fp16/--bf16
+
+    gaudi_config = GaudiConfig()
+    gaudi_config.use_fused_adam = True
+    gaudi_config.use_fused_clip_norm = True
+
+    trainer = GaudiSFTTrainer(
+        model=model,
+        gaudi_config=gaudi_config,
+        args=training_args,
+        train_dataset=train_data,
+        eval_dataset=val_data,
+        peft_config=lora_config,  # the trainer receives the base model plus this LoRA config
+        packing=True,
+    )
+
+    print_trainable_parameters(trainer.model)
+
+    print("Training...")
+    trainer.train()
+
+    print("Saving last checkpoint of the model")
+    trainer.save_model(os.path.join(args.output_dir, "final_checkpoint/"))
+
+
+def main(args):
+    """Load the tokenizer for `args.model_path`, build both datasets, and run training."""
+    train_split, eval_split = create_datasets(AutoTokenizer.from_pretrained(args.model_path), args)
+    run_training(args, train_split, eval_split)
+
+
+if __name__ == "__main__":
+    args = get_args()
+    # Validate explicitly: an `assert` would be stripped under `python -O` and let an empty path through.
+    if not args.model_path:
+        raise ValueError("Please provide the llama model path")
+
+    set_seed(args.seed)
+    os.makedirs(args.output_dir, exist_ok=True)
+    logging.set_verbosity_error()
+    main(args)
diff --git a/optimum/habana/trl/__init__.py b/optimum/habana/trl/__init__.py
index e80fac8b8a..90c9624dc1 100644
--- a/optimum/habana/trl/__init__.py
+++ b/optimum/habana/trl/__init__.py
@@ -1,2 +1,5 @@
+from .models.modeling_base import adapt_PreTrainedModelWrapper_to_gaudi
 from .trainer.dpo_trainer import GaudiDPOTrainer
+from .trainer.ppo_config import GaudiPPOConfig
+from .trainer.ppo_trainer import GaudiPPOTrainer
 from .trainer.sft_trainer import GaudiSFTTrainer
diff --git a/optimum/habana/trl/models/__init__.py b/optimum/habana/trl/models/__init__.py
new file mode 100644
index 0000000000..22bf871003
--- /dev/null
+++ b/optimum/habana/trl/models/__init__.py
@@ -0,0 +1,17 @@
+# flake8: noqa
+
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .modeling_base import adapt_PreTrainedModelWrapper_to_gaudi
diff --git a/optimum/habana/trl/models/modeling_base.py b/optimum/habana/trl/models/modeling_base.py
new file mode 100644
index 0000000000..ad02ffd1a7
--- /dev/null
+++ b/optimum/habana/trl/models/modeling_base.py
@@ -0,0 +1,71 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import torch
+from trl import PreTrainedModelWrapper
+from trl.import_utils import is_npu_available, is_xpu_available
+
+from optimum.habana.accelerate import GaudiPartialState as PartialState
+from optimum.habana.utils import to_device_dtype
+
+
+def adapt_PreTrainedModelWrapper_to_gaudi():  # monkey-patch trl's PreTrainedModelWrapper with the Gaudi-aware helpers of this module
+    PreTrainedModelWrapper._get_current_device = staticmethod(gaudi_get_current_device)  # zero-arg function: staticmethod keeps it callable from the class and from instances
+    PreTrainedModelWrapper.save_pretrained = gaudi_save_pretrained
+
+
+def gaudi_get_current_device():
+    """
+    Gaudi-aware copy of PreTrainedModelWrapper._get_current_device: https://github.com/huggingface/trl/blob/v0.7.6/trl/models/modeling_base.py#L392
+    Returns "xpu:<idx>" or "npu:<idx>" when that backend is available, "hpu" when torch.hpu is available,
+    and otherwise the local process index if CUDA is available, else "cpu".
+    """
+    partial_state = PartialState()
+    if is_xpu_available():
+        return f"xpu:{partial_state.local_process_index}"
+    if is_npu_available():
+        return f"npu:{partial_state.local_process_index}"
+    if hasattr(torch, "hpu") and torch.hpu.is_available():
+        return "hpu"
+    return partial_state.local_process_index if torch.cuda.is_available() else "cpu"
+
+
+def gaudi_save_pretrained(self, *args, **kwargs):
+    """
+    Copied from PreTrainedModelWrapper.save_pretrained: https://github.com/huggingface/trl/blob/v0.7.6/trl/models/modeling_base.py#L528
+    - on "hpu", state dicts are passed through `to_device_dtype(..., target_device=cpu)` before being saved
+    """
+    state_dict = kwargs.get("state_dict")
+    if state_dict is None:
+        state_dict = self.state_dict()
+        kwargs["state_dict"] = state_dict
+
+    if self.__class__._get_current_device() == "hpu":
+        state_dict = to_device_dtype(state_dict, target_device=torch.device("cpu"))  # rebinds the local only; kwargs["state_dict"] is replaced further down
+
+    # if it is a peft model only save the `v_head` state_dict and
+    # pop the `state_dict` from the kwargs to avoid silent bugs with `peft`
+    if self.is_peft_model:
+        save_path = args[0]  # assumes the save directory is passed positionally
+        save_path = os.path.join(save_path, "pytorch_model.bin")
+        torch.save(state_dict, save_path)
+        _ = kwargs.pop("state_dict", None)
+
+    if self.__class__._get_current_device() == "hpu":  # NOTE(review): also runs for peft models and re-adds the `state_dict` popped above - confirm intended
+        state_dict = self.pretrained_model.state_dict()  # replaces any caller-supplied state_dict with the wrapped model's own
+        state_dict = to_device_dtype(state_dict, target_device=torch.device("cpu"))
+        kwargs["state_dict"] = state_dict
+
+    return self.pretrained_model.save_pretrained(*args, **kwargs)
diff --git a/optimum/habana/trl/trainer/__init__.py b/optimum/habana/trl/trainer/__init__.py
index 13bf554fd7..b17262fc82 100644
--- a/optimum/habana/trl/trainer/__init__.py
+++ b/optimum/habana/trl/trainer/__init__.py
@@ -19,3 +19,5 @@
 
 from .sft_trainer import GaudiSFTTrainer
 from .dpo_trainer import GaudiDPOTrainer
+from .ppo_config import GaudiPPOConfig
+from .ppo_trainer import GaudiPPOTrainer
diff --git a/optimum/habana/trl/trainer/ppo_config.py b/optimum/habana/trl/trainer/ppo_config.py
new file mode 100644
index 0000000000..03bf06dbca
--- /dev/null
+++ b/optimum/habana/trl/trainer/ppo_config.py
@@ -0,0 +1,70 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+
+import numpy as np
+from trl import PPOConfig, is_wandb_available
+from trl.trainer.utils import exact_div
+
+
+@dataclass
+class GaudiPPOConfig(PPOConfig):
+    """Configuration class for GaudiPPOTrainer: trl's PPOConfig plus the Gaudi-specific options below."""
+
+    use_habana: bool = False
+    """Run on Habana Gaudi. When True, `adapt_transformers_to_gaudi()` is applied in `__post_init__`"""
+    pad_for_acceleration: bool = False
+    """Pad to fixed lengths for acceleration. When True, `pad_max_input_len` and `pad_max_len` must be set"""
+    pad_max_len: int = 0
+    """Padded length limit; must be larger than `pad_max_input_len`. Only applicable if pad_for_acceleration is True"""
+    pad_max_input_len: int = 0
+    """Length the generation inputs are padded to. Only applicable if pad_for_acceleration is True"""
+
+    def __post_init__(self):
+        """Derive the batch-size fields and validate the logging, padding and `kl_penalty` settings."""
+        self.backward_batch_size = self.mini_batch_size * self.gradient_accumulation_steps
+        exact_div(
+            self.batch_size,
+            self.backward_batch_size,
+            "`batch_size`",
+            "`mini_batch_size * gradient_accumulation_steps`",
+            "`batch_size` must be a multiple of `mini_batch_size * gradient_accumulation_steps`",
+        )
+        self.total_ppo_epochs = int(np.ceil(self.steps / self.batch_size))
+
+        # check if wandb is installed
+        if self.log_with == "wandb":
+            # raise error if wandb is not installed
+            if not is_wandb_available():
+                raise ImportError(
+                    "Please install wandb to use wandb logging. You can do this by running `pip install wandb`."
+                )
+
+        if self.pad_for_acceleration:
+            if self.pad_max_input_len == 0:
+                raise AssertionError(f"pad_max_input_len ({self.pad_max_input_len}) must be set for pad input ")
+            if self.pad_max_input_len >= self.pad_max_len:
+                raise AssertionError(
+                    f"pad_max_input_len ({self.pad_max_input_len}) must be smaller "
+                    f"than pad_max_len ({self.pad_max_len})"
+                )
+
+        if self.use_habana:
+            from optimum.habana.transformers.modeling_utils import (  # pylint: disable=E0611, E0401
+                adapt_transformers_to_gaudi,
+            )
+
+            adapt_transformers_to_gaudi()
+
+        assert self.kl_penalty in ["kl", "abs", "mse", "full"]
diff --git a/optimum/habana/trl/trainer/ppo_trainer.py b/optimum/habana/trl/trainer/ppo_trainer.py
new file mode 100644
index 0000000000..5ef2745e1e
--- /dev/null
+++ b/optimum/habana/trl/trainer/ppo_trainer.py
@@ -0,0 +1,881 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import time
+import typing
+import warnings
+from contextlib import nullcontext
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+from accelerate.utils import ProjectConfiguration
+from datasets import Dataset
+from torch.optim import Adam
+from transformers import (
+    DataCollatorForLanguageModeling,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerBase,
+    PreTrainedTokenizerFast,
+)
+from trl import PPOTrainer
+from trl.core import (
+    WANDB_PADDING,
+    PPODecorators,
+    convert_to_scalar,
+    logprobs_from_logits,
+    stack_dicts,
+    stats_to_np,
+)
+from trl.import_utils import is_npu_available, is_torch_greater_2_0, is_xpu_available
+from trl.models import SUPPORTED_ARCHITECTURES, PreTrainedModelWrapper, create_reference_model
+from trl.trainer import AdaptiveKLController, BaseTrainer, FixedKLController, RunningMoments
+
+from optimum.habana.utils import set_seed
+
+from . import GaudiPPOConfig
+
+
+class GaudiPPOTrainer(PPOTrainer, BaseTrainer):
+    def __init__(
+        self,
+        config: GaudiPPOConfig = None,
+        model: PreTrainedModelWrapper = None,
+        ref_model: Optional[PreTrainedModelWrapper] = None,
+        tokenizer: PreTrainedTokenizerBase = None,
+        dataset: Optional[Union[torch.utils.data.Dataset, Dataset]] = None,
+        optimizer: Optional[torch.optim.Optimizer] = None,
+        data_collator: Optional[typing.Callable] = None,
+        num_shared_layers: Optional[int] = None,
+        lr_scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
+    ):
+        """
+        Copied from PPOTrainer.__init__: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L145
+        The only differences are:
+        - add new args for gaudi in config
+        - use GaudiAccelerator instead of Accelerator
+        """
+        BaseTrainer.__init__(self, config)
+
+        # initial seed for reproducible experiments
+        set_seed(config.seed)  # NOTE: runs before the type checks below, so `config` must already expose `.seed`
+
+        # Step 0: check positional arguments validity
+        if not isinstance(config, GaudiPPOConfig):
+            raise ValueError(f"config must be a PPOConfig, got {type(config)}")
+        if not isinstance(tokenizer, (PreTrainedTokenizerBase)):
+            raise ValueError(
+                f"tokenizer must be a PreTrainedTokenizerBase like a PreTrainedTokenizer or a PreTrainedTokenizerFast, got {type(tokenizer)}"
+            )
+        if not isinstance(model, (SUPPORTED_ARCHITECTURES)):
+            raise ValueError(
+                f"model must be a PreTrainedModelWrapper, got {type(model)} - supported architectures are: {SUPPORTED_ARCHITECTURES}"
+            )
+        # Step 1: Initialize Accelerator
+        if config.use_habana:
+            from optimum.habana.accelerate import GaudiAccelerator as Accelerator  # pylint: disable=E0611, E0401
+        else:
+            from accelerate import Accelerator
+        self.accelerator = Accelerator(
+            log_with=config.log_with,
+            gradient_accumulation_steps=config.gradient_accumulation_steps,
+            project_config=ProjectConfiguration(**config.project_kwargs),
+            **config.accelerator_kwargs,
+        )
+
+        # Step 1.1 Runtime variables filled by the accelerator
+        config.world_size = self.accelerator.num_processes
+        config.global_backward_batch_size = config.backward_batch_size * config.world_size
+        config.global_batch_size = config.batch_size * config.world_size
+
+        self.model = model.to(self.accelerator.device.type)
+        self.model_params = filter(lambda p: p.requires_grad, self.model.parameters())  # `filter` yields a one-shot iterator
+        self.is_encoder_decoder = hasattr(self.model, "is_encoder_decoder")  # True whenever the attribute exists, whatever its value
+        self.is_peft_model = getattr(self.model, "is_peft_model", False)
+        config.is_encoder_decoder = self.is_encoder_decoder
+        config.is_peft_model = self.is_peft_model
+
+        is_using_tensorboard = config.log_with is not None and config.log_with == "tensorboard"
+        self.accelerator.init_trackers(
+            config.tracker_project_name,
+            config={"trl_ppo_trainer_config": config.to_dict()} if not is_using_tensorboard else config.to_dict(),
+            init_kwargs=config.tracker_kwargs,
+        )
+        self.is_using_text_environment = getattr(config, "use_text_environment", False)
+
+        if isinstance(ref_model, SUPPORTED_ARCHITECTURES):
+            self.ref_model = ref_model.to(self.accelerator.device.type)
+            if num_shared_layers is not None:
+                warnings.warn(
+                    "num_shared_layers is ignored when ref_model is provided. Two different models are used for the "
+                    "model and the reference model and no layers are shared.",
+                    UserWarning,
+                )
+        elif ref_model is None and not self.is_peft_model:
+            self.ref_model = create_reference_model(self.model, num_shared_layers=num_shared_layers)
+        elif self.is_peft_model:
+            self.ref_model = None  # PEFT: no separate reference model is kept; see `optional_peft_ctx` below
+        else:
+            raise ValueError(
+                f"ref_model must be a PreTrainedModelWrapper or `None`, got {type(ref_model)} - supported "
+                f"architectures are: {SUPPORTED_ARCHITECTURES} "
+            )
+        # Context-manager factory: the PEFT model's `disable_adapter` for PEFT models, a no-op `nullcontext` otherwise
+        self.optional_peft_ctx = (
+            self.accelerator.unwrap_model(self.model).pretrained_model.disable_adapter
+            if self.is_peft_model
+            else nullcontext
+        )
+
+        if not (isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast)):
+            raise ValueError(
+                "tokenizer must be a transformers.PreTrainedTokenizer or transformers.PreTrainedTokenizerFast"
+            )
+        self.tokenizer = tokenizer
+
+        if dataset is not None and not (isinstance(dataset, torch.utils.data.Dataset) or isinstance(dataset, Dataset)):
+            raise ValueError("dataset must be a torch.utils.data.Dataset or datasets.Dataset")
+        elif dataset is None:
+            warnings.warn(
+                "No dataset is provided. Make sure to set config.batch_size to the correct value before training.",
+                UserWarning,
+            )
+        self.dataset = dataset
+        self._signature_columns = None
+        if self.dataset is not None:
+            self.dataloader = self.prepare_dataloader(self.dataset, data_collator)
+        elif self.dataset is None and self.accelerator.num_processes > 1:
+            warnings.warn(
+                "No dataset is provided. In a multi-GPU setting, this will lead to an error. You should"
+                " prepare your dataloader yourself with `dataloader = ppo_trainer.accelerator.prepare(dataloader)`"
+                " and using `torch.utils.data.DataLoader`, or pass a dataset to the `PPOTrainer`. Please "
+                " refer to the documentation for more details.",
+                UserWarning,
+            )
+            self.dataloader = None
+        else:
+            self.dataloader = None
+
+        # Step 3: Initialize optimizer and data collator
+        self.data_collator = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)  # the `data_collator` argument is only used for the dataloader above
+        if optimizer is None:
+            self.optimizer = Adam(
+                filter(lambda p: p.requires_grad, self.model.parameters()),
+                lr=self.config.learning_rate,
+            )
+        else:
+            self.optimizer = optimizer
+
+        self.lr_scheduler = lr_scheduler
+        if self.lr_scheduler is not None:
+            lr_scheduler_class = (
+                torch.optim.lr_scheduler._LRScheduler
+                if not is_torch_greater_2_0()
+                else torch.optim.lr_scheduler.LRScheduler
+            )
+
+            if not isinstance(self.lr_scheduler, lr_scheduler_class):
+                raise ValueError(
+                    "lr_scheduler must be a torch.optim.lr_scheduler._LRScheduler or torch.optim.lr_scheduler.LRScheduler (for torch >= 2.0)"
+                )
+
+        if self.config.adap_kl_ctrl:
+            self.kl_ctl = AdaptiveKLController(self.config.init_kl_coef, self.config.target, self.config.horizon)
+        else:
+            self.kl_ctl = FixedKLController(self.config.init_kl_coef)
+
+        if self.accelerator.distributed_type == "MULTI_HPU":  # multi-HPU runs install their own DDP kwargs before `prepare`
+            from accelerate.utils import DistributedDataParallelKwargs
+
+            kwargs = {}
+            kwargs["find_unused_parameters"] = True
+            kwargs["gradient_as_bucket_view"] = True
+            self.accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs)
+
+        # Safety checkers for DS integration
+        is_deepspeed_used = self.accelerator.distributed_type == "DEEPSPEED" and hasattr(
+            self.accelerator.state, "deepspeed_plugin"
+        )
+
+        (
+            self.model,
+            self.optimizer,
+            self.data_collator,
+            self.dataloader,
+            self.lr_scheduler,
+        ) = self.accelerator.prepare(
+            self.model,
+            self.optimizer,
+            self.data_collator,
+            self.dataloader,
+            self.lr_scheduler,
+        )
+        if is_deepspeed_used:
+            # Quantized models are already set on the correct device
+            if not self.is_peft_model and not (
+                getattr(self.ref_model.pretrained_model, "is_loaded_in_8bit", False)
+                or getattr(self.ref_model.pretrained_model, "is_loaded_in_4bit", False)
+            ):
+                self.ref_model = self._prepare_deepspeed(self.ref_model)
+        else:
+            self.ref_model = self.accelerator.prepare(self.ref_model)  # NOTE(review): ref_model is None here for PEFT models - confirm `prepare` accepts None
+
+        # In a distributed setup, only logging needs to be performed on the main process
+        # check: https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html
+        # or: https://discuss.pytorch.org/t/use-distributed-data-parallel-correctly/82500/11
+        self.is_distributed = self.accelerator.num_processes > 1
+
+        # init the current step
+        self.current_step = 0
+
+        # init variables for pushing model to hub
+        if config.push_to_hub_if_best_kwargs:
+            if "repo_id" not in config.push_to_hub_if_best_kwargs:
+                raise ValueError("You have to specify repo_id in order to push the model to the hub!")
+            self.push_to_hub_kwargs = config.push_to_hub_if_best_kwargs
+            self.compare_step = 0
+            self.highest_reward = torch.tensor(-float("inf"))
+
+        # post process for PP
+        if not getattr(self.model, "is_sequential_parallel", False):
+            self.current_device = self.accelerator.device
+        else:
+            if is_xpu_available():
+                self.current_device = torch.device("xpu:0")
+            elif is_npu_available():
+                self.current_device = torch.device("npu:0")
+            elif self.accelerator.device.type == "hpu":
+                self.current_device = torch.device("hpu:0")
+            else:
+                self.current_device = torch.device("cuda:0")
+
+        PPODecorators.optimize_device_cache = self.config.optimize_device_cache
+
+        self.running = RunningMoments(self.accelerator)
+        if config.use_habana:
+            import habana_frameworks.torch.core as htcore  # imported lazily so non-Habana runs do not need the package
+
+            self.htcore = htcore
+    def generate(
+        self,
+        query_tensor: Union[torch.Tensor, List[torch.Tensor]],
+        length_sampler: Callable = None,
+        batch_size: int = 4,
+        return_prompt: bool = True,
+        generate_ref_response: bool = False,
+        **generation_kwargs,
+    ):
+        """
+        Copied from PPOTrainer.generate: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L433
+        Takes one query tensor or a list of them; returns `response`, or `(response, ref_response)` if `generate_ref_response`.
+        Only difference: when `config.use_habana` is set, each model goes through `wrap_generation_for_hpu_graph_mode` first.
+        """
+        if generate_ref_response:
+            ref_model = self.model if self.is_peft_model else self.ref_model  # PEFT: same model, run inside `optional_peft_ctx` below
+        if isinstance(query_tensor, List):
+            if self.config.use_habana:
+                self.wrap_generation_for_hpu_graph_mode(self.model)
+            response = self._generate_batched(
+                self.model,
+                query_tensor,
+                length_sampler=length_sampler,
+                batch_size=batch_size,
+                return_prompt=return_prompt,
+                **generation_kwargs,
+            )
+            if generate_ref_response:
+                with self.optional_peft_ctx():
+                    if self.config.use_habana:
+                        self.wrap_generation_for_hpu_graph_mode(ref_model)
+                    ref_response = self._generate_batched(
+                        ref_model,
+                        query_tensor,
+                        length_sampler=length_sampler,
+                        batch_size=batch_size,
+                        return_prompt=return_prompt,
+                        **generation_kwargs,
+                    )
+
+        else:
+            if len(query_tensor.shape) == 2:  # a single query must not carry a batch dimension
+                raise ValueError(
+                    "query_tensor must be a tensor of shape (`seq_len`) or a list of tensors of shape (`seq_len`)"
+                )
+
+            if length_sampler is not None:
+                generation_kwargs["max_new_tokens"] = length_sampler()
+            if self.config.use_habana:
+                self.wrap_generation_for_hpu_graph_mode(self.model)
+            response = self.accelerator.unwrap_model(self.model).generate(
+                input_ids=query_tensor.unsqueeze(dim=0), **generation_kwargs
+            )
+            if generate_ref_response:
+                with self.optional_peft_ctx():
+                    if self.config.use_habana:
+                        self.wrap_generation_for_hpu_graph_mode(ref_model)
+                    ref_response = ref_model.generate(input_ids=query_tensor.unsqueeze(dim=0), **generation_kwargs)  # NOTE(review): not unwrapped, unlike the policy call above
+
+            if not return_prompt and not self.is_encoder_decoder:
+                response = response[:, query_tensor.shape[0] :]  # drop the leading prompt positions
+                if generate_ref_response:
+                    ref_response = ref_response[:, query_tensor.shape[0] :]
+
+        if generate_ref_response:
+            return response, ref_response
+        return response
+
+    def _generate_batched(
+        self,
+        model: PreTrainedModelWrapper,
+        query_tensors: List[torch.Tensor],
+        length_sampler: Callable = None,
+        batch_size: int = 4,
+        return_prompt: bool = True,
+        pad_to_multiple_of: int = None,
+        remove_padding: bool = True,
+        **generation_kwargs,
+    ):
+        """
+        Copied from PPOTrainer._generate_batched: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L509
+        The only differences are:
+        - pad to pad_max_input_len to get static shape for generation acceleration
+        - use lazy mode and hpu_graphs for generation in hpu
+        """
+        outputs = []
+
+        padding_side_default = self.tokenizer.padding_side
+        if not self.is_encoder_decoder:
+            self.tokenizer.padding_side = "left"
+
+        # in case we have fewer examples than bs
+        batch_size = min(len(query_tensors), batch_size)
+
+        for i in range(0, len(query_tensors), batch_size):
+            if length_sampler is not None:
+                generation_kwargs["max_new_tokens"] = length_sampler()
+
+            # prevent overflow if query tensors are not even multiple of bs
+            end_index = min(len(query_tensors), i + batch_size)
+
+            batch = query_tensors[i:end_index]
+            batch_mask = [torch.ones_like(element) for element in batch]
+            inputs = {"input_ids": batch, "attention_mask": batch_mask}
+
+            if self.config.pad_for_acceleration and self.config.pad_max_input_len > 0:
+                padded_inputs = self.tokenizer.pad(
+                    inputs,
+                    padding="max_length",
+                    max_length=self.config.pad_max_input_len,
+                    pad_to_multiple_of=pad_to_multiple_of,
+                    return_tensors="pt",
+                ).to(self.current_device)
+            else:
+                padded_inputs = self.tokenizer.pad(
+                    inputs,
+                    padding=True,
+                    max_length=None,
+                    pad_to_multiple_of=pad_to_multiple_of,
+                    return_tensors="pt",
+                ).to(self.current_device)
+
+            if self.config.use_habana:
+                generation_kwargs["ignore_eos"] = False
+                generation_kwargs["lazy_mode"] = True
+                generation_kwargs["hpu_graphs"] = True
+
+            generations = self.accelerator.unwrap_model(model).generate(**padded_inputs, **generation_kwargs)
+
+            for generation, mask in zip(generations, padded_inputs["attention_mask"]):
+                if not self.is_encoder_decoder:
+                    output = generation[(1 - mask).sum() :]  # remove padding
+                else:
+                    output = generation
+
+                if not return_prompt and not self.is_encoder_decoder:
+                    output = output[(mask).sum() :]  # remove prompt
+
+                if remove_padding and self.tokenizer.eos_token_id in output:
+                    pad_mask = output == self.tokenizer.eos_token_id
+                    pad_start = torch.nonzero(pad_mask, as_tuple=False)[0, 0].item()
+                    output = output[: pad_start + 1]  # keep the eos token at the end
+
+                outputs.append(output)
+
+        self.tokenizer.padding_side = padding_side_default
+        return outputs
+
+    @PPODecorators.empty_device_cache()
+    def step(
+        self,
+        queries: List[torch.LongTensor],
+        responses: List[torch.LongTensor],
+        scores: List[torch.FloatTensor],
+        response_masks: Optional[List[torch.LongTensor]] = None,
+    ):
+        """
+        Copied from PPOTrainer.step: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L620
+        The only differences are:
+        - use hpu_graphs for sampling and training
+        - skip pad_across_processes when prepare_model_inputs already padded the inputs (pad_for_acceleration)
+        """
+        bs = self.config.batch_size
+
+        queries, responses, scores, response_masks = self._step_safety_checker(
+            bs, queries, responses, scores, response_masks
+        )
+        scores = torch.tensor(scores, device=self.current_device)
+        if self.config.use_score_scaling:
+            # Score scaling
+            scores_mean, scores_std = self.running.update(scores)
+            tensor_to_kwargs = {"dtype": scores.dtype, "device": scores.device}
+            score_scaling_factor = self.running.std.to(**tensor_to_kwargs) + torch.finfo(scores.dtype).eps
+            if self.config.use_score_norm:
+                scores = (scores - self.running.mean.to(**tensor_to_kwargs)) / score_scaling_factor
+            else:
+                scores /= score_scaling_factor
+
+        if self.config.score_clip is not None:
+            # Score clipping
+            scores_dtype = scores.dtype
+            scores = torch.clip(scores.float(), -self.config.score_clip, self.config.score_clip).to(dtype=scores_dtype)
+
+        # if we want to push best model to the hub
+        if hasattr(self, "highest_reward"):
+            if self.compare_step % self.config.compare_steps == 0:
+                curr_mean_reward = scores.mean()
+                # if the best reward ever seen
+                if curr_mean_reward > self.highest_reward:
+                    self.highest_reward = curr_mean_reward
+                    # push model to hub
+                    self.push_to_hub(**self.push_to_hub_kwargs)
+            self.compare_step += 1
+
+        timing = {}
+        t0 = time.time()
+
+        t = time.time()
+
+        model_inputs = self.prepare_model_inputs(queries, responses)
+
+        if self.is_distributed and not self.config.pad_for_acceleration:
+            pad_first = self.tokenizer.padding_side == "left"
+
+            model_inputs["input_ids"] = self.accelerator.pad_across_processes(
+                model_inputs["input_ids"],
+                dim=1,
+                pad_index=self.tokenizer.pad_token_id,
+                pad_first=pad_first,
+            )
+            model_inputs["attention_mask"] = self.accelerator.pad_across_processes(
+                model_inputs["attention_mask"], dim=1, pad_index=0, pad_first=pad_first
+            )
+            if self.is_encoder_decoder:
+                model_inputs["decoder_input_ids"] = self.accelerator.pad_across_processes(
+                    model_inputs["decoder_input_ids"],
+                    dim=1,
+                    pad_index=self.tokenizer.pad_token_id,
+                    pad_first=pad_first,
+                )
+                model_inputs["decoder_attention_mask"] = self.accelerator.pad_across_processes(
+                    model_inputs["decoder_attention_mask"],
+                    dim=1,
+                    pad_index=0,
+                    pad_first=pad_first,
+                )
+
+        model_inputs_names = list(model_inputs.keys())
+
+        full_kl_penalty = self.config.kl_penalty == "full"
+
+        with torch.no_grad():
+            if self.config.use_habana:
+                self.unwrap_generation_for_hpu_graph_mode(self.model)
+                self.wrap_fw_for_hpu_graph_mode(self.model)
+                if self.ref_model is not None:
+                    self.unwrap_generation_for_hpu_graph_mode(self.ref_model)
+                    self.wrap_fw_for_hpu_graph_mode(self.ref_model)
+            all_logprobs, logits_or_none, values, masks = self.batched_forward_pass(
+                self.model,
+                queries,
+                responses,
+                model_inputs,
+                response_masks=response_masks,
+                return_logits=full_kl_penalty,
+            )
+            with self.optional_peft_ctx():
+                ref_logprobs, ref_logits_or_none, _, _ = self.batched_forward_pass(
+                    self.model if self.is_peft_model else self.ref_model,
+                    queries,
+                    responses,
+                    model_inputs,
+                    return_logits=full_kl_penalty,
+                )
+
+        timing["time/ppo/forward_pass"] = time.time() - t
+
+        with torch.no_grad():
+            t = time.time()
+            if full_kl_penalty:
+                active_full_logprobs = logprobs_from_logits(logits_or_none, None, gather=False)
+                ref_full_logprobs = logprobs_from_logits(ref_logits_or_none, None, gather=False)
+
+                rewards, non_score_reward = self.compute_rewards(
+                    scores, active_full_logprobs, ref_full_logprobs, masks
+                )
+            else:
+                rewards, non_score_reward = self.compute_rewards(scores, all_logprobs, ref_logprobs, masks)
+            timing["time/ppo/compute_rewards"] = time.time() - t
+
+            t = time.time()
+            values, advantages, returns = self.compute_advantages(values, rewards, masks)
+            timing["time/ppo/compute_advantages"] = time.time() - t
+
+        # upcast to float32 to avoid dataset issues
+        batch_dict = {
+            "queries": queries,
+            "responses": responses,
+            "logprobs": all_logprobs.to(torch.float32),
+            "values": values.to(torch.float32),
+            "masks": masks,
+            "advantages": advantages,
+            "returns": returns,
+        }
+        batch_dict.update(model_inputs)
+
+        t = time.time()
+        all_stats = []
+        early_stop = False
+        if self.config.use_habana:
+            self.unwrap_fw_for_hpu_graph_mode(self.model)
+            import habana_frameworks.torch as ht  # pylint: disable=E0611, E0401
+
+            model = self.accelerator.unwrap_model(self.model)
+            if not hasattr(model, "wrap_train_in_graph"):
+                ht.hpu.ModuleCacher()(model=model, inplace=True)
+                setattr(model, "wrap_train_in_graph", model.forward)
+            else:
+                model.forward = getattr(model, "wrap_train_in_graph")
+
+        for _ in range(self.config.ppo_epochs):
+            if early_stop:
+                break
+            b_inds = np.random.permutation(bs)
+            for backward_batch_start in range(0, bs, self.config.backward_batch_size):
+                backward_batch_end = backward_batch_start + self.config.backward_batch_size
+                backward_batch_inds = b_inds[backward_batch_start:backward_batch_end]
+
+                for mini_batch_start in range(0, self.config.backward_batch_size, self.config.mini_batch_size):
+                    mini_batch_end = mini_batch_start + self.config.mini_batch_size
+                    mini_batch_inds = backward_batch_inds[mini_batch_start:mini_batch_end]
+                    mini_batch_dict = {
+                        "logprobs": batch_dict["logprobs"][mini_batch_inds],
+                        "values": batch_dict["values"][mini_batch_inds],
+                        "masks": batch_dict["masks"][mini_batch_inds],
+                        # hacks: the queries and responses are ragged.
+                        "queries": [batch_dict["queries"][i] for i in mini_batch_inds],
+                        "responses": [batch_dict["responses"][i] for i in mini_batch_inds],
+                        "advantages": batch_dict["advantages"][mini_batch_inds],
+                        "returns": batch_dict["returns"][mini_batch_inds],
+                    }
+                    for k in model_inputs_names:
+                        mini_batch_dict[k] = batch_dict[k][mini_batch_inds]
+                    with self.accelerator.accumulate(self.model):
+                        model_inputs = {k: mini_batch_dict[k] for k in model_inputs_names}
+
+                        logprobs, logits, vpreds, _ = self.batched_forward_pass(
+                            self.model,
+                            mini_batch_dict["queries"],
+                            mini_batch_dict["responses"],
+                            model_inputs,
+                            return_logits=True,
+                        )
+                        train_stats = self.train_minibatch(
+                            mini_batch_dict["logprobs"],
+                            mini_batch_dict["values"],
+                            logprobs,
+                            logits,
+                            vpreds,
+                            mini_batch_dict["masks"],
+                            mini_batch_dict["advantages"],
+                            mini_batch_dict["returns"],
+                        )
+                        all_stats.append(train_stats)
+
+            # typically, early stopping is done at the epoch level
+            if self.config.early_stopping:
+                policykl = train_stats["policy/policykl"]
+                early_stop = self._early_stop(policykl)
+                if early_stop:
+                    break
+
+        timing["time/ppo/optimize_step"] = time.time() - t
+
+        t = time.time()
+        train_stats = stack_dicts(all_stats)
+
+        # reshape advantages/ratios such that they are not averaged.
+        train_stats["policy/advantages"] = torch.flatten(train_stats["policy/advantages"]).unsqueeze(0)
+        train_stats["policy/advantages"] = torch.nan_to_num(train_stats["policy/advantages"], WANDB_PADDING)
+        train_stats["policy/ratio"] = torch.flatten(train_stats["policy/ratio"]).unsqueeze(0)
+
+        stats = self.record_step_stats(
+            scores=scores,
+            logprobs=all_logprobs,
+            ref_logprobs=ref_logprobs,
+            non_score_reward=non_score_reward,
+            train_stats=train_stats,
+            kl_coef=self.kl_ctl.value,
+            masks=masks,
+            queries=queries,
+            responses=responses,
+        )
+        # Gather/Reduce stats from all processes
+        if self.is_distributed:
+            stats = self.gather_stats(stats)
+        stats = stats_to_np(stats)
+        timing["time/ppo/calc_stats"] = time.time() - t
+        stats["ppo/learning_rate"] = self.optimizer.param_groups[0]["lr"]
+
+        # Update the KL control - multiply the batch_size by the number of processes
+        self.kl_ctl.update(
+            stats["objective/kl"],
+            self.config.batch_size * self.accelerator.num_processes,
+        )
+
+        # Log the total ppo time
+        timing["time/ppo/total"] = time.time() - t0
+        stats.update(timing)
+
+        # post-process stats for tensorboard and other loggers
+        if self.config.log_with != "wandb":
+            stats = convert_to_scalar(stats)
+
+        if self.lr_scheduler is not None:
+            self.lr_scheduler.step()
+
+        return stats
+
+    def prepare_model_inputs(self, queries: torch.Tensor, responses: torch.Tensor):
+        """
+        Copied from PPOTrainer.prepare_model_inputs: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L921
+        The only differences are:
+        - add padding to model inputs for static shape support in forward
+        """
+        if self.is_encoder_decoder:
+            input_data = self.data_collator(
+                [{"input_ids": q, "attention_mask": torch.ones_like(q)} for q in queries]
+            ).to(self.current_device)
+
+            decoder_inputs = self.data_collator(
+                [{"input_ids": r, "attention_mask": torch.ones_like(r)} for r in responses]
+            ).to(self.current_device)
+
+            input_data["decoder_input_ids"] = decoder_inputs["input_ids"]
+            input_data["decoder_attention_mask"] = decoder_inputs["attention_mask"]
+        else:
+            input_ids = [torch.cat([q, r]) for q, r in zip(queries, responses)]
+            input_data = self.data_collator(
+                [{"input_ids": ids, "attention_mask": torch.ones_like(ids)} for ids in input_ids]
+            ).to(self.current_device)
+
+        if self.config.pad_for_acceleration:
+            input_data["input_ids"] = torch.nn.functional.pad(
+                input_data["input_ids"],
+                (0, self.config.pad_max_len - input_data["input_ids"].shape[1]),
+                value=self.tokenizer.pad_token_id,
+            )
+            input_data["attention_mask"] = torch.nn.functional.pad(
+                input_data["attention_mask"],
+                (
+                    0,
+                    self.config.pad_max_len - input_data["attention_mask"].shape[1],
+                ),
+                value=0,
+            )
+            if self.is_encoder_decoder:
+                input_data["decoder_input_ids"] = torch.nn.functional.pad(
+                    input_data["decoder_input_ids"],
+                    (
+                        0,
+                        self.config.pad_max_len - input_data["decoder_input_ids"].shape[1],
+                    ),
+                    value=self.tokenizer.pad_token_id,
+                )
+                input_data["decoder_attention_mask"] = torch.nn.functional.pad(
+                    input_data["decoder_attention_mask"],
+                    (
+                        0,
+                        self.config.pad_max_len - input_data["decoder_attention_mask"].shape[1],
+                    ),
+                    value=0,
+                )
+
+        input_data.pop("labels", None)  # we don't want to compute LM losses
+        return input_data
+
+    @PPODecorators.empty_device_cache()
+    def batched_forward_pass(
+        self,
+        model: PreTrainedModelWrapper,
+        queries: torch.Tensor,
+        responses: torch.Tensor,
+        model_inputs: dict,
+        return_logits: bool = False,
+        response_masks: Optional[torch.Tensor] = None,
+    ):
+        """
+        Copied from PPOTrainer.batched_forward_pass: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L943
+        The only differences are:
+        - input_kwargs/outputs need to be clone()d to avoid being overwritten on HPU
+        """
+        bs = len(queries)
+        fbs = self.config.mini_batch_size
+        all_logprobs = []
+        all_logits = []
+        all_masks = []
+        all_values = []
+
+        model.eval()
+
+        for i in range(math.ceil(bs / fbs)):
+            input_kwargs = {key: value[i * fbs : (i + 1) * fbs].clone() for key, value in model_inputs.items()}
+            query_batch = queries[i * fbs : (i + 1) * fbs]
+            response_batch = responses[i * fbs : (i + 1) * fbs]
+            if response_masks is not None:
+                response_masks_batch = response_masks[i * fbs : (i + 1) * fbs]
+            logits, _, values = model(**input_kwargs)
+
+            if self.is_encoder_decoder:
+                input_ids = input_kwargs["decoder_input_ids"]
+                attention_mask = input_kwargs["decoder_attention_mask"]
+            else:
+                input_ids = input_kwargs["input_ids"]
+                attention_mask = input_kwargs["attention_mask"]
+
+            logprobs = logprobs_from_logits(logits[:, :-1, :], input_ids[:, 1:])
+            masks = torch.zeros_like(attention_mask)
+            masks[:, :-1] = attention_mask[:, 1:]
+
+            for j in range(len(query_batch)):
+                if self.is_encoder_decoder:
+                    # In encoder-decoder models the decoder sequence always starts at index 1 after padding
+                    start = 1
+                    end = attention_mask[j, :].sum() - 1
+                else:
+                    start = len(query_batch[j]) - 1  # logprobs starts from the second query token
+                    if attention_mask[j, 0] == 0:  # offset left padding
+                        start += attention_mask[j, :].nonzero()[0]
+                    end = start + len(response_batch[j])
+                    if response_masks is not None:
+                        response_masks_batch[j] = torch.cat(
+                            (torch.zeros_like(query_batch[j]), response_masks_batch[j])
+                        )[1:]
+
+                masks[j, :start] = 0
+                masks[j, end:] = 0
+                if response_masks is not None:
+                    masks[j, start:end] = masks[j, start:end] * response_masks_batch[j][start:end]
+
+            if return_logits:
+                all_logits.append(logits.clone())
+            else:
+                del logits
+            all_values.append(values.clone())
+            all_logprobs.append(logprobs)
+            all_masks.append(masks)
+
+        return (
+            torch.cat(all_logprobs),
+            torch.cat(all_logits)[:, :-1] if return_logits else None,
+            torch.cat(all_values)[:, :-1],
+            torch.cat(all_masks)[:, :-1],
+        )
+
+    @PPODecorators.empty_device_cache()
+    def train_minibatch(
+        self,
+        old_logprobs: torch.FloatTensor,
+        values: torch.FloatTensor,
+        logprobs: torch.FloatTensor,
+        logits: torch.FloatTensor,
+        vpreds: torch.FloatTensor,
+        mask: torch.LongTensor,
+        advantages: torch.FloatTensor,
+        returns: torch.FloatTensor,
+    ):
+        """
+        Copied from PPOTrainer.train_minibatch: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L1034
+        The only differences are:
+        - add htcore.mark_step
+        """
+        self.model.train()
+        loss_p, loss_v, train_stats = self.loss(
+            old_logprobs, values, logits, vpreds, logprobs, mask, advantages, returns
+        )
+        loss = loss_p + loss_v
+        self.accelerator.backward(loss)
+        if self.config.max_grad_norm is not None:
+            if self.accelerator.sync_gradients:
+                self.accelerator.clip_grad_norm_(self.model_params, self.config.max_grad_norm)
+        self.optimizer.step()
+        if self.config.use_habana:  # pragma: no cover
+            self.htcore.mark_step()
+        # we call optimizer.zero_grad() every time and let `accelerator` handle accumulation
+        # see https://huggingface.co/docs/accelerate/usage_guides/gradient_accumulation#the-finished-code
+        self.optimizer.zero_grad()
+        return train_stats
+
+    def wrap_fw_for_hpu_graph_mode(self, model: PreTrainedModelWrapper):
+        model = self.accelerator.unwrap_model(model)
+        if hasattr(model, "hpu_graph_fw"):
+            model.forward = model.hpu_graph_fw
+        else:
+            from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+            model.orig_fw = model.forward
+            model = wrap_in_hpu_graph(model)
+            model.hpu_graph_fw = model.forward
+
+    def unwrap_fw_for_hpu_graph_mode(self, model: PreTrainedModelWrapper):
+        model = self.accelerator.unwrap_model(model)
+        if hasattr(model, "orig_fw"):
+            model.forward = model.orig_fw
+
+    def wrap_generation_for_hpu_graph_mode(self, model: PreTrainedModelWrapper):
+        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+        model = self.accelerator.unwrap_model(model)
+        if getattr(model, "is_peft_model", False):
+            if hasattr(model.pretrained_model.base_model.model, "hpu_graph_fw"):
+                model.pretrained_model.base_model.model.forward = model.pretrained_model.base_model.model.hpu_graph_fw
+            else:
+                model.pretrained_model.base_model.model.orig_fw = model.pretrained_model.base_model.model.forward
+                model.pretrained_model.base_model.model = wrap_in_hpu_graph(model.pretrained_model.base_model.model)
+                model.pretrained_model.base_model.model.hpu_graph_fw = model.pretrained_model.base_model.model.forward
+        else:
+            if hasattr(model.pretrained_model, "hpu_graph_fw"):
+                model.pretrained_model.forward = model.pretrained_model.hpu_graph_fw
+            else:
+                model.pretrained_model.orig_fw = model.pretrained_model.forward
+                model.pretrained_model = wrap_in_hpu_graph(model.pretrained_model)
+                model.pretrained_model.hpu_graph_fw = model.pretrained_model.forward
+
+    def unwrap_generation_for_hpu_graph_mode(self, model: PreTrainedModelWrapper):
+        model = self.accelerator.unwrap_model(model)
+        if getattr(model, "is_peft_model", False):
+            if hasattr(model.pretrained_model.base_model.model, "orig_fw"):
+                model.pretrained_model.base_model.model.forward = model.pretrained_model.base_model.model.orig_fw
+        else:
+            if hasattr(model.pretrained_model, "orig_fw"):
+                model.pretrained_model.forward = model.pretrained_model.orig_fw

From e7f83d93dfbeb290670a42ce4a41cc57c76e2104 Mon Sep 17 00:00:00 2001
From: "Wang, Yi A" <yi.a.wang@intel.com>
Date: Wed, 17 Jan 2024 03:43:20 -0800
Subject: [PATCH 2/6] refactor ppo example

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---
 examples/trl/README.md                        |  79 +++++++
 .../{stack_llama/rl_training.py => ppo.py}    |  23 +-
 .../trl/{stack_llama => }/reward_modeling.py  |  37 +--
 examples/trl/stack_llama/README.md            |  18 --
 .../trl/stack_llama/merge_peft_adapter.py     |  50 ----
 .../trl/stack_llama/supervised_finetuning.py  | 215 ------------------
 optimum/habana/trl/trainer/ppo_trainer.py     |   2 +-
 7 files changed, 119 insertions(+), 305 deletions(-)
 rename examples/trl/{stack_llama/rl_training.py => ppo.py} (92%)
 rename examples/trl/{stack_llama => }/reward_modeling.py (89%)
 delete mode 100644 examples/trl/stack_llama/README.md
 delete mode 100644 examples/trl/stack_llama/merge_peft_adapter.py
 delete mode 100644 examples/trl/stack_llama/supervised_finetuning.py

diff --git a/examples/trl/README.md b/examples/trl/README.md
index 7206ddbffb..8049349da6 100644
--- a/examples/trl/README.md
+++ b/examples/trl/README.md
@@ -72,3 +72,82 @@ python run_generation.py \
 --prompt "Here is my prompt"
 
 ```
+
+
+## PPO pipeline
+
+### Training
+
+The following example is for the creation of StackLlaMa 2: a Stack Exchange llama-v2-7b model.
+There are three main steps to the PPO training process:
+1. Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se:
+    ```
+    python ../gaudi_spawn.py --world_size 8 --use_mpi sft.py \
+        --model_name_or_path meta-llama/Llama-2-7b-hf \
+        --output_dir="./sft" \
+        --max_steps=500 \
+        --logging_steps=10 \
+        --save_steps=100 \
+        --per_device_train_batch_size=4 \
+        --per_device_eval_batch_size=1 \
+        --gradient_accumulation_steps=2 \
+        --learning_rate=1e-4 \
+        --lr_scheduler_type="cosine" \
+        --warmup_steps=100 \
+        --weight_decay=0.05 \
+        --optim="paged_adamw_32bit" \
+        --lora_target_modules "q_proj" "v_proj" \
+        --bf16 \
+        --remove_unused_columns=False \
+        --run_name="sft_llama2" \
+        --report_to=none \
+        --use_habana \
+        --use_lazy_mode
+    ```
+2. Reward modeling using dialog pairs from the SE dataset on llama-v2-7b-se to create llama-v2-7b-se-rm:
+    ```
+    python ../gaudi_spawn.py --world_size 8 --use_mpi reward_modeling.py \
+        --model_name=./sft/final_merged_checkpoint \
+        --tokenizer_name=meta-llama/Llama-2-7b-hf \
+        --output_dir=./rm
+    ```
+    To merge the adapters into the base model, we can use the `merge_peft_adapter.py` helper script that comes with TRL:
+
+    ```
+    python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="rm" --output_name="rm_merged_checkpoint"
+    ```
+
+3. RL fine-tuning of llama-v2-7b-se with the llama-v2-7b-se-rm reward model:
+    ```
+    python ../gaudi_spawn.py --world_size 8 --use_mpi ppo.py \
+        --log_with=wandb \
+        --model_name=./sft/final_merged_checkpoint \
+        --reward_model_name=./rm_merged_checkpoint \
+        --tokenizer_name=meta-llama/Llama-2-7b-hf \
+        --adafactor=False \
+        --output_max_length=128 \
+        --batch_size=8 \
+        --gradient_accumulation_steps=8 \
+        --batched_gen=True \
+        --ppo_epochs=4 \
+        --seed=0 \
+        --learning_rate=1.4e-5 \
+        --early_stopping=True \
+        --output_dir=llama-se-rl-finetune
+    ```
+    To merge the adapters into the base model, we can use the `merge_peft_adapter.py` helper script that comes with TRL:
+
+    ```
+    python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="llama-se-rl-finetune" --output_name="rl_merged_checkpoint"
+    ```
+
+### Running the model
+We can take the merged checkpoint built from the PPO-trained LoRA adapters (saved by the PPO training step and merged above) and run it through the [text-generation example](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation).
+
+```
+python run_generation.py \
+--model_name_or_path ../trl/rl_merged_checkpoint/ \
+--use_hpu_graphs --use_kv_cache --batch_size 1 --bf16 --max_new_tokens 100 \
+--prompt "Here is my prompt"
+```
+
diff --git a/examples/trl/stack_llama/rl_training.py b/examples/trl/ppo.py
similarity index 92%
rename from examples/trl/stack_llama/rl_training.py
rename to examples/trl/ppo.py
index 53ec5b7251..23ecbcabeb 100644
--- a/examples/trl/stack_llama/rl_training.py
+++ b/examples/trl/ppo.py
@@ -1,6 +1,6 @@
 # copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/rl_training.py, enable it for Gaudi2
 from dataclasses import dataclass, field
-from typing import Optional
+from typing import List, Optional
 
 import torch
 from datasets import load_dataset
@@ -26,8 +26,8 @@ class ScriptArguments:
 
     # NOTE: gpt2 models use Conv1D instead of Linear layers which are not yet supported in 8 bit mode
     # models like gpt-neo* models are more suitable.
-    model_name: Optional[str] = field(default="", metadata={"help": "the model name"})
-    tokenizer_name: Optional[str] = field(default="", metadata={"help": "the tokenizer name"})
+    model_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the model name"})
+    tokenizer_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the tokenizer name"})
     reward_model_name: Optional[str] = field(default="", metadata={"help": "the reward model name"})
     log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"})
     learning_rate: Optional[float] = field(default=1.41e-5, metadata={"help": "the learning rate"})
@@ -57,6 +57,13 @@ class ScriptArguments:
 
     adap_kl_ctrl: Optional[bool] = field(default=True, metadata={"help": "Use adaptive KL control, otherwise linear"})
     use_habana: Optional[bool] = field(default=True, metadata={"help": "use habana for RL training"})
+    lora_alpha: Optional[float] = field(default=32, metadata={"help": "the lora alpha parameter"})
+    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
+    lora_r: Optional[int] = field(default=16, metadata={"help": "the lora r parameter"})
+    lora_target_modules: List[str] = field(
+        default_factory=lambda: None,
+        metadata={"help": "Target modules for the LoRA method."},
+    )
 
 
 adapt_PreTrainedModelWrapper_to_gaudi()
@@ -170,9 +177,10 @@ def collator(data):
 # Now let's build the model, the reference model, and the tokenizer.
 current_device = GaudiAccelerator().local_process_index
 lora_config = LoraConfig(
-    r=16,
-    lora_alpha=32,
-    lora_dropout=0.05,
+    r=script_args.lora_r,
+    lora_alpha=script_args.lora_alpha,
+    lora_dropout=script_args.lora_dropout,
+    target_modules=script_args.lora_target_modules,
     bias="none",
     task_type="CAUSAL_LM",
 )
@@ -266,7 +274,6 @@ def collator(data):
     output_length_sampler = LengthSampler(output_min_length, output_max_length)
 else:
     output_length_sampler = LengthSampler(output_max_length, output_max_length + 1)
-
 for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
     if epoch >= config.total_ppo_epochs:
         break
@@ -292,3 +299,5 @@ def collator(data):
 
     if script_args.save_freq and epoch and epoch % script_args.save_freq == 0:
         ppo_trainer.save_pretrained(script_args.output_dir + f"step_{epoch}")
+
+ppo_trainer.save_pretrained(script_args.output_dir)
diff --git a/examples/trl/stack_llama/reward_modeling.py b/examples/trl/reward_modeling.py
similarity index 89%
rename from examples/trl/stack_llama/reward_modeling.py
rename to examples/trl/reward_modeling.py
index 32ce0faf50..a57cfa575a 100644
--- a/examples/trl/stack_llama/reward_modeling.py
+++ b/examples/trl/reward_modeling.py
@@ -45,13 +45,13 @@ class ScriptArguments:
     learning_rate: Optional[float] = field(default=2e-5)
     weight_decay: Optional[float] = field(default=0.001)
     model_name: Optional[str] = field(
-        default="gpt2",
+        default="meta-llama/Llama-2-7b-hf",
         metadata={
             "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
         },
     )
     tokenizer_name: Optional[str] = field(
-        default=None,
+        default="meta-llama/Llama-2-7b-hf",
         metadata={
             "help": "The tokenizer for your model, if left empty will use the default for your model",
         },
@@ -91,6 +91,17 @@ class ScriptArguments:
         default=False,
         metadata={"help": "Whether to run eval after the first step"},
     )
+    output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"})
+    save_steps: Optional[int] = field(default=500, metadata={"help": "the saving frequency"})
+    eval_steps: Optional[int] = field(default=500, metadata={"help": "the evaluation frequency"})
+    logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"})
+    lora_alpha: Optional[float] = field(default=32, metadata={"help": "the lora alpha parameter"})
+    lora_dropout: Optional[float] = field(default=0.1, metadata={"help": "the lora dropout parameter"})
+    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})
+    lora_target_modules: List[str] = field(
+        default_factory=lambda: None,
+        metadata={"help": "Target modules for the LoRA method."},
+    )
 
 
 parser = HfArgumentParser(ScriptArguments)
@@ -105,21 +116,18 @@ class ScriptArguments:
     eval_dataset = eval_dataset.select(range(script_args.eval_subset))
 # Define the training args. Needs to be done before the model is loaded if you are using deepspeed.
 model_name_split = script_args.model_name.split("/")[-1]
-output_name = (
-    f"{model_name_split}_peft_stack-exchange-paired_rmts__{script_args.train_subset}_{script_args.learning_rate}"
-)
 
 training_args = GaudiTrainingArguments(
-    output_dir=output_name,
+    output_dir=script_args.output_dir,
     learning_rate=script_args.learning_rate,
     per_device_train_batch_size=script_args.per_device_train_batch_size,
     per_device_eval_batch_size=script_args.per_device_eval_batch_size,
     num_train_epochs=script_args.num_train_epochs,
     weight_decay=script_args.weight_decay,
     evaluation_strategy="steps",
-    eval_steps=500,
+    eval_steps=script_args.eval_steps,
     save_strategy="steps",
-    save_steps=500,
+    save_steps=script_args.save_steps,
     gradient_accumulation_steps=script_args.gradient_accumulation_steps,
     gradient_checkpointing=script_args.gradient_checkpointing,
     deepspeed=script_args.deepspeed,
@@ -128,7 +136,7 @@ class ScriptArguments:
     label_names=[],
     bf16=script_args.bf16,
     logging_strategy="steps",
-    logging_steps=10,
+    logging_steps=script_args.logging_steps,
     optim=script_args.optim,
     lr_scheduler_type=script_args.lr_scheduler_type,
     report_to="none",
@@ -140,13 +148,14 @@ class ScriptArguments:
 tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_auth_token=True)
 tokenizer.pad_token = tokenizer.eos_token
 
-
 peft_config = LoraConfig(
     task_type=TaskType.SEQ_CLS,
     inference_mode=False,
-    r=8,
-    lora_alpha=32,
-    lora_dropout=0.1,
+    r=script_args.lora_r,
+    lora_alpha=script_args.lora_alpha,
+    lora_dropout=script_args.lora_dropout,
+    target_modules=script_args.lora_target_modules,
+    bias="none",
 )
 torch.autograd.set_detect_anomaly(True)
 model = AutoModelForSequenceClassification.from_pretrained(
@@ -310,4 +319,4 @@ def on_step_end(self, args, state, control, **kwargs):
 trainer.train(script_args.resume_from_checkpoint)
 
 print("Saving last checkpoint of the model")
-trainer.save_model(output_name + "_peft_last_checkpoint")
+trainer.save_model(script_args.output_dir)
diff --git a/examples/trl/stack_llama/README.md b/examples/trl/stack_llama/README.md
deleted file mode 100644
index 51a9728ed4..0000000000
--- a/examples/trl/stack_llama/README.md
+++ /dev/null
@@ -1,18 +0,0 @@
-# RLHF pipeline for the creation of StackLLaMa: a Stack exchange llama-7b model.
-There were three main steps to the training process:
-1. Supervised fine-tuning of the base llama-7b model to create llama-7b-se:
-    - `torchrun --nnodes 1  --nproc_per_node 8 supervised_finetuning.py --model_path=<LLAMA_MODEL_PATH> --streaming --learning_rate 1e-5 --max_steps 5000 --bf16 --output_dir ./llama-se`
-2. Reward modeling using dialog pairs from the SE dataset using the llama-7b-se to create llama-7b-se-rm:
-    - `torchrun --nnodes 1  --nproc_per_node 8 reward_modeling.py --model_name=<LLAMA_SE_MODEL>`
-3. RL fine-tuning of llama-7b-se with the llama-7b-se-rm reward model:
-    - `torchrun --nnodes 1  --nproc_per_node 8 rl_training.py --log_with=wandb --model_name=<LLAMA_SE_MODEL> --reward_model_name=<LLAMA_SE_RM_MODEL> --adafactor=False --tokenizer_name=<LLAMA_TOKENIZER> --save_freq=100 --output_max_length=128 --batch_size=8 --gradient_accumulation_steps=8 --batched_gen=True --ppo_epochs=4 --seed=0 --learning_rate=1.4e-5 --early_stopping=True --output_dir=llama-se-rl-finetune-128-8-8-1.4e-5_adam`
-
-
-LoRA layers were using at all stages to reduce memory requirements. 
-At each stage the peft adapter layers were merged with the base model, using: 
-```shell
-python merge_peft_adapter.py --adapter_model_name=XXX --base_model_name=YYY --output_name=ZZZ
-```
-Note that this script requires `peft>=0.3.0`.
-
-For access to the base llama-7b model, please see Meta's [release](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) and [request form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform).
diff --git a/examples/trl/stack_llama/merge_peft_adapter.py b/examples/trl/stack_llama/merge_peft_adapter.py
deleted file mode 100644
index 8913fc62a4..0000000000
--- a/examples/trl/stack_llama/merge_peft_adapter.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py.
-# only difference is removal of model.push_to_hub
-from dataclasses import dataclass, field
-from typing import Optional
-
-import torch
-from peft import PeftConfig, PeftModel
-from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser
-
-
-@dataclass
-class ScriptArguments:
-    """
-    The input names representing the Adapter and Base model fine-tuned with PEFT, and the output name representing the
-    merged model.
-    """
-
-    adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the adapter name"})
-    base_model_name: Optional[str] = field(default=None, metadata={"help": "the base model name"})
-    output_name: Optional[str] = field(default=None, metadata={"help": "the merged model name"})
-
-
-parser = HfArgumentParser(ScriptArguments)
-script_args = parser.parse_args_into_dataclasses()[0]
-assert script_args.adapter_model_name is not None, "please provide the name of the Adapter you would like to merge"
-assert script_args.base_model_name is not None, "please provide the name of the Base model"
-assert script_args.output_name is not None, "please provide the output name of the merged model"
-
-peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name)
-if peft_config.task_type == "SEQ_CLS":
-    # The sequence classification task is used for the reward model in PPO
-    model = AutoModelForSequenceClassification.from_pretrained(
-        script_args.base_model_name, num_labels=1, torch_dtype=torch.bfloat16
-    )
-else:
-    model = AutoModelForCausalLM.from_pretrained(
-        script_args.base_model_name, return_dict=True, torch_dtype=torch.bfloat16
-    )
-
-tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name)
-
-# Load the PEFT model
-model = PeftModel.from_pretrained(model, script_args.adapter_model_name)
-model.eval()
-
-model = model.merge_and_unload()
-
-model.save_pretrained(f"{script_args.output_name}")
-tokenizer.save_pretrained(f"{script_args.output_name}")
-# model.push_to_hub(f"{script_args.output_name}", use_temp_dir=False)
diff --git a/examples/trl/stack_llama/supervised_finetuning.py b/examples/trl/stack_llama/supervised_finetuning.py
deleted file mode 100644
index a61bca6e3b..0000000000
--- a/examples/trl/stack_llama/supervised_finetuning.py
+++ /dev/null
@@ -1,215 +0,0 @@
-# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/supervised_finetuning.py, enable it for Gaudi2
-
-import argparse
-import os
-
-import torch
-from datasets import load_dataset
-from peft import LoraConfig
-from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer, logging, set_seed
-from trl.trainer import ConstantLengthDataset
-
-from optimum.habana import GaudiConfig, GaudiTrainingArguments
-from optimum.habana.trl import GaudiSFTTrainer
-
-
-"""
-Fine-Tune Llama-7b on SE paired dataset
-"""
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_path", type=str, default="")
-    parser.add_argument("--dataset_name", type=str, default="lvwerra/stack-exchange-paired")
-    parser.add_argument("--subset", type=str, default="data/finetune")
-    parser.add_argument("--split", type=str, default="train")
-    parser.add_argument("--size_valid_set", type=int, default=4000)
-    parser.add_argument("--streaming", action="store_true")
-    parser.add_argument("--shuffle_buffer", type=int, default=5000)
-
-    parser.add_argument("--seq_length", type=int, default=1024)
-    parser.add_argument("--max_steps", type=int, default=10000)
-    parser.add_argument("--batch_size", type=int, default=4)
-    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
-    parser.add_argument("--eos_token_id", type=int, default=49152)
-
-    parser.add_argument("--learning_rate", type=float, default=1e-4)
-    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
-    parser.add_argument("--num_warmup_steps", type=int, default=100)
-    parser.add_argument("--weight_decay", type=float, default=0.05)
-
-    parser.add_argument("--local_rank", type=int, default=0)
-    parser.add_argument("--fp16", action="store_true", default=False)
-    parser.add_argument("--bf16", action="store_true", default=False)
-    parser.add_argument("--gradient_checkpointing", action="store_true", default=False)
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--num_workers", type=int, default=None)
-    parser.add_argument("--output_dir", type=str, default="./checkpoints")
-    parser.add_argument("--log_freq", default=1, type=int)
-    parser.add_argument("--eval_freq", default=1000, type=int)
-    parser.add_argument("--save_freq", default=1000, type=int)
-
-    return parser.parse_args()
-
-
-def chars_token_ratio(dataset, tokenizer, nb_examples=400):
-    """
-    Estimate the average number of characters per token in the dataset.
-    """
-    total_characters, total_tokens = 0, 0
-    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
-        text = prepare_sample_text(example)
-        total_characters += len(text)
-        if tokenizer.is_fast:
-            total_tokens += len(tokenizer(text).tokens())
-        else:
-            total_tokens += len(tokenizer.tokenize(text))
-
-    return total_characters / total_tokens
-
-
-def print_trainable_parameters(model):
-    """
-    Prints the number of trainable parameters in the model.
-    """
-    trainable_params = 0
-    all_param = 0
-    for _, param in model.named_parameters():
-        all_param += param.numel()
-        if param.requires_grad:
-            trainable_params += param.numel()
-    print(
-        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
-    )
-
-
-def prepare_sample_text(example):
-    """Prepare the text from a sample of the dataset."""
-    text = f"Question: {example['question']}\n\nAnswer: {example['response_j']}"
-    return text
-
-
-def create_datasets(tokenizer, args):
-    dataset = load_dataset(
-        args.dataset_name,
-        data_dir=args.subset,
-        split=args.split,
-        use_auth_token=True,
-        num_proc=args.num_workers if not args.streaming else None,
-        streaming=args.streaming,
-    )
-    if args.streaming:
-        print("Loading the dataset in streaming mode")
-        valid_data = dataset.take(args.size_valid_set)
-        train_data = dataset.skip(args.size_valid_set)
-        train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=args.seed)
-    else:
-        dataset = dataset.train_test_split(test_size=0.005, seed=args.seed)
-        train_data = dataset["train"]
-        valid_data = dataset["test"]
-        print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")
-
-    chars_per_token = chars_token_ratio(train_data, tokenizer)
-    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
-
-    train_dataset = ConstantLengthDataset(
-        tokenizer,
-        train_data,
-        formatting_func=prepare_sample_text,
-        infinite=True,
-        seq_length=args.seq_length,
-        chars_per_token=chars_per_token,
-    )
-    valid_dataset = ConstantLengthDataset(
-        tokenizer,
-        valid_data,
-        formatting_func=prepare_sample_text,
-        infinite=False,
-        seq_length=args.seq_length,
-        chars_per_token=chars_per_token,
-    )
-    return train_dataset, valid_dataset
-
-
-def run_training(args, train_data, val_data):
-    print("Loading the model")
-
-    lora_config = LoraConfig(
-        r=16,
-        lora_alpha=32,
-        lora_dropout=0.05,
-        bias="none",
-        task_type="CAUSAL_LM",
-    )
-
-    train_data.start_iteration = 0
-
-    print("Starting main loop")
-
-    training_args = GaudiTrainingArguments(
-        output_dir=args.output_dir,
-        dataloader_drop_last=True,
-        evaluation_strategy="steps",
-        max_steps=args.max_steps,
-        eval_steps=args.eval_freq,
-        save_steps=args.save_freq,
-        logging_steps=args.log_freq,
-        per_device_train_batch_size=args.batch_size,
-        per_device_eval_batch_size=args.batch_size,
-        learning_rate=args.learning_rate,
-        lr_scheduler_type=args.lr_scheduler_type,
-        warmup_steps=args.num_warmup_steps,
-        gradient_accumulation_steps=args.gradient_accumulation_steps,
-        gradient_checkpointing=args.gradient_checkpointing,
-        fp16=args.fp16,
-        bf16=args.bf16,
-        weight_decay=args.weight_decay,
-        run_name="llama-7b-finetuned",
-        report_to="none",
-        ddp_find_unused_parameters=False,
-        use_habana=True,
-        use_lazy_mode=True,
-    )
-    model = AutoModelForCausalLM.from_pretrained(args.model_path, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16)
-
-    gaudi_config = GaudiConfig()
-    gaudi_config.use_fused_adam = True
-    gaudi_config.use_fused_clip_norm = True
-
-    trainer = GaudiSFTTrainer(
-        model=model,
-        gaudi_config=gaudi_config,
-        args=training_args,
-        train_dataset=train_data,
-        eval_dataset=val_data,
-        peft_config=lora_config,
-        packing=True,
-    )
-
-    print_trainable_parameters(trainer.model)
-
-    print("Training...")
-    trainer.train()
-
-    print("Saving last checkpoint of the model")
-    trainer.save_model(os.path.join(args.output_dir, "final_checkpoint/"))
-
-
-def main(args):
-    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
-    train_dataset, eval_dataset = create_datasets(tokenizer, args)
-    run_training(args, train_dataset, eval_dataset)
-
-
-if __name__ == "__main__":
-    args = get_args()
-    assert args.model_path != "", "Please provide the llama model path"
-
-    set_seed(args.seed)
-    os.makedirs(args.output_dir, exist_ok=True)
-
-    logging.set_verbosity_error()
-
-    main(args)
diff --git a/optimum/habana/trl/trainer/ppo_trainer.py b/optimum/habana/trl/trainer/ppo_trainer.py
index 5ef2745e1e..20d18d2996 100644
--- a/optimum/habana/trl/trainer/ppo_trainer.py
+++ b/optimum/habana/trl/trainer/ppo_trainer.py
@@ -255,7 +255,7 @@ def __init__(
             elif is_npu_available():
                 self.current_device = torch.device("npu:0")
             elif self.accelerator.device.type == "hpu":
-                self.current_device = torch.device("hpu:0")
+                self.current_device = torch.device("hpu")
             else:
                 self.current_device = torch.device("cuda:0")
 

From 20d38bce73c6411f23943fa68bd8a601ac840f6b Mon Sep 17 00:00:00 2001
From: "Wang, Yi A" <yi.a.wang@intel.com>
Date: Tue, 30 Jan 2024 17:33:22 -0800
Subject: [PATCH 3/6] ppo and reward model update

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---
 examples/trl/ppo.py                          | 24 ++++--
 examples/trl/reward_modeling.py              | 72 +----------------
 optimum/habana/trl/__init__.py               |  1 +
 optimum/habana/trl/models/__init__.py        |  2 -
 optimum/habana/trl/models/modeling_base.py   | 11 +--
 optimum/habana/trl/trainer/__init__.py       |  1 +
 optimum/habana/trl/trainer/ppo_config.py     |  9 ++-
 optimum/habana/trl/trainer/ppo_trainer.py    | 18 ++---
 optimum/habana/trl/trainer/reward_trainer.py | 82 ++++++++++++++++++++
 9 files changed, 119 insertions(+), 101 deletions(-)
 create mode 100644 optimum/habana/trl/trainer/reward_trainer.py

diff --git a/examples/trl/ppo.py b/examples/trl/ppo.py
index 23ecbcabeb..c46ef1f450 100644
--- a/examples/trl/ppo.py
+++ b/examples/trl/ppo.py
@@ -31,7 +31,8 @@ class ScriptArguments:
     reward_model_name: Optional[str] = field(default="", metadata={"help": "the reward model name"})
     log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"})
     learning_rate: Optional[float] = field(default=1.41e-5, metadata={"help": "the learning rate"})
-    output_max_length: Optional[int] = field(default=128, metadata={"help": "maximum length for generation"})
+    output_max_length: Optional[int] = field(default=128, metadata={"help": "maximum output length for generation"})
+    input_max_length: Optional[int] = field(default=512, metadata={"help": "maximum input length for generation"})
     mini_batch_size: Optional[int] = field(default=1, metadata={"help": "the PPO minibatch size"})
     batch_size: Optional[int] = field(default=32, metadata={"help": "the batch size"})
     ppo_epochs: Optional[int] = field(default=4, metadata={"help": "the number of ppo epochs"})
@@ -64,6 +65,15 @@ class ScriptArguments:
         default_factory=lambda: None,
         metadata={"help": "Target modules for the LoRA method."},
     )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of training examples to this "
+                "value if set."
+            )
+        },
+    )
 
 
 adapt_PreTrainedModelWrapper_to_gaudi()
@@ -88,12 +98,14 @@ class ScriptArguments:
     adap_kl_ctrl=script_args.adap_kl_ctrl,
     use_habana=script_args.use_habana,
     pad_for_acceleration=script_args.use_habana,
-    pad_max_len=512 + script_args.output_max_length,
-    pad_max_input_len=512,
+    pad_max_len=script_args.input_max_length + script_args.output_max_length,
+    pad_max_input_len=script_args.input_max_length,
 )
 
 train_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/rl", split="train")
-train_dataset = train_dataset.select(range(100000))
+if script_args.max_train_samples is not None:
+    max_train_samples = min(len(train_dataset), script_args.max_train_samples)
+    train_dataset = train_dataset.select(range(max_train_samples))
 original_columns = train_dataset.column_names
 
 # We then define the arguments to pass to the sentiment analysis pipeline.
@@ -106,7 +118,7 @@ class ScriptArguments:
 }
 if config.pad_for_acceleration:
     sent_kwargs["padding"] = "max_length"
-    sent_kwargs["max_length"] = 512 + script_args.output_max_length
+    sent_kwargs["max_length"] = script_args.input_max_length + script_args.output_max_length
 
 tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_name)
 # GPT-2 tokenizer has a pad token, but it is not eos_token by default. We need to set it to eos_token.
@@ -225,8 +237,6 @@ def collator(data):
 # model name and the sentiment analysis pipeline arguments. Let's also make sure to
 # set the device to the same device as the PPOTrainer.
 device = ppo_trainer.accelerator.device
-if ppo_trainer.accelerator.num_processes == 1 and torch.cuda.is_available():
-    device = 0
 
 reward_model = AutoModelForSequenceClassification.from_pretrained(
     reward_model_name,
diff --git a/examples/trl/reward_modeling.py b/examples/trl/reward_modeling.py
index a57cfa575a..e15d73309a 100644
--- a/examples/trl/reward_modeling.py
+++ b/examples/trl/reward_modeling.py
@@ -1,24 +1,22 @@
 # copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/reward_modeling.py, enable it for Gaudi2
 
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Union
+from typing import List, Optional
 
 import evaluate
 import numpy as np
 import torch
-import torch.nn as nn
 from datasets import load_dataset
 from peft import LoraConfig, TaskType, get_peft_model
 from transformers import (
     AutoModelForSequenceClassification,
     AutoTokenizer,
     HfArgumentParser,
-    PreTrainedTokenizerBase,
     TrainerCallback,
 )
-from transformers.utils import PaddingStrategy
 
-from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments
+from optimum.habana import GaudiConfig, GaudiTrainingArguments
+from optimum.habana.trl import GaudiRewardTrainer, RewardDataCollatorWithPadding
 
 
 # Define and parse arguments.
@@ -115,7 +113,6 @@ class ScriptArguments:
 if script_args.eval_subset > 0:
     eval_dataset = eval_dataset.select(range(script_args.eval_subset))
 # Define the training args. Needs to be done before the model is loaded if you are using deepspeed.
-model_name_split = script_args.model_name.split("/")[-1]
 
 training_args = GaudiTrainingArguments(
     output_dir=script_args.output_dir,
@@ -215,56 +212,6 @@ def preprocess_function(examples):
     lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length
 )
 
-
-# We need to define a special data collator that batches the data in our j vs k format.
-@dataclass
-class RewardDataCollatorWithPadding:
-    tokenizer: PreTrainedTokenizerBase
-    padding: Union[bool, str, PaddingStrategy] = True
-    max_length: Optional[int] = None
-    pad_to_multiple_of: Optional[int] = None
-    return_tensors: str = "pt"
-
-    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
-        features_j = []
-        features_k = []
-        for feature in features:
-            features_j.append(
-                {
-                    "input_ids": feature["input_ids_j"],
-                    "attention_mask": feature["attention_mask_j"],
-                }
-            )
-            features_k.append(
-                {
-                    "input_ids": feature["input_ids_k"],
-                    "attention_mask": feature["attention_mask_k"],
-                }
-            )
-        batch_j = self.tokenizer.pad(
-            features_j,
-            padding=self.padding,
-            max_length=self.max_length,
-            pad_to_multiple_of=self.pad_to_multiple_of,
-            return_tensors=self.return_tensors,
-        )
-        batch_k = self.tokenizer.pad(
-            features_k,
-            padding=self.padding,
-            max_length=self.max_length,
-            pad_to_multiple_of=self.pad_to_multiple_of,
-            return_tensors=self.return_tensors,
-        )
-        batch = {
-            "input_ids_j": batch_j["input_ids"],
-            "attention_mask_j": batch_j["attention_mask"],
-            "input_ids_k": batch_k["input_ids"],
-            "attention_mask_k": batch_k["attention_mask"],
-            "return_loss": True,
-        }
-        return batch
-
-
 # Define the metric that we'll use for validation.
 accuracy = evaluate.load("accuracy")
 
@@ -278,23 +225,12 @@ def compute_metrics(eval_pred):
     return accuracy.compute(predictions=predictions, references=labels)
 
 
-class RewardTrainer(GaudiTrainer):
-    # Define how to compute the reward loss. We use the InstructGPT pairwise logloss: https://arxiv.org/abs/2203.02155
-    def compute_loss(self, model, inputs, return_outputs=False):
-        rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0]
-        rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0]
-        loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()
-        if return_outputs:
-            return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
-        return loss
-
-
 gaudi_config = GaudiConfig()
 gaudi_config.use_fused_adam = True
 gaudi_config.use_fused_clip_norm = True
 
 # Train the model, woohoo.
-trainer = RewardTrainer(
+trainer = GaudiRewardTrainer(
     model=model,
     gaudi_config=gaudi_config,
     args=training_args,
diff --git a/optimum/habana/trl/__init__.py b/optimum/habana/trl/__init__.py
index 90c9624dc1..838d54560a 100644
--- a/optimum/habana/trl/__init__.py
+++ b/optimum/habana/trl/__init__.py
@@ -2,4 +2,5 @@
 from .trainer.dpo_trainer import GaudiDPOTrainer
 from .trainer.ppo_config import GaudiPPOConfig
 from .trainer.ppo_trainer import GaudiPPOTrainer
+from .trainer.reward_trainer import GaudiRewardTrainer, RewardDataCollatorWithPadding
 from .trainer.sft_trainer import GaudiSFTTrainer
diff --git a/optimum/habana/trl/models/__init__.py b/optimum/habana/trl/models/__init__.py
index 22bf871003..36736572c3 100644
--- a/optimum/habana/trl/models/__init__.py
+++ b/optimum/habana/trl/models/__init__.py
@@ -1,5 +1,3 @@
-# flake8: noqa
-
 # Copyright 2022 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/optimum/habana/trl/models/modeling_base.py b/optimum/habana/trl/models/modeling_base.py
index ad02ffd1a7..fcdc7ddc3f 100644
--- a/optimum/habana/trl/models/modeling_base.py
+++ b/optimum/habana/trl/models/modeling_base.py
@@ -15,9 +15,7 @@
 
 import torch
 from trl import PreTrainedModelWrapper
-from trl.import_utils import is_npu_available, is_xpu_available
 
-from optimum.habana.accelerate import GaudiPartialState as PartialState
 from optimum.habana.utils import to_device_dtype
 
 
@@ -31,15 +29,10 @@ def gaudi_get_current_device():
     Copied from PreTrainedModelWrapper._get_current_device: https://github.com/huggingface/trl/blob/v0.7.6/trl/models/modeling_base.py#L392
     - add hpu device
     """
-    state = PartialState()
-    if is_xpu_available():
-        return f"xpu:{state.local_process_index}"
-    elif is_npu_available():
-        return f"npu:{state.local_process_index}"
-    elif hasattr(torch, "hpu") and torch.hpu.is_available():
+    if hasattr(torch, "hpu") and torch.hpu.is_available():
         return "hpu"
     else:
-        return state.local_process_index if torch.cuda.is_available() else "cpu"
+        return "cpu"
 
 
 def gaudi_save_pretrained(self, *args, **kwargs):
diff --git a/optimum/habana/trl/trainer/__init__.py b/optimum/habana/trl/trainer/__init__.py
index b17262fc82..700ea8355d 100644
--- a/optimum/habana/trl/trainer/__init__.py
+++ b/optimum/habana/trl/trainer/__init__.py
@@ -21,3 +21,4 @@
 from .dpo_trainer import GaudiDPOTrainer
 from .ppo_config import GaudiPPOConfig
 from .ppo_trainer import GaudiPPOTrainer
+from .reward_trainer import GaudiRewardTrainer, RewardDataCollatorWithPadding
diff --git a/optimum/habana/trl/trainer/ppo_config.py b/optimum/habana/trl/trainer/ppo_config.py
index 03bf06dbca..12a3a9185e 100644
--- a/optimum/habana/trl/trainer/ppo_config.py
+++ b/optimum/habana/trl/trainer/ppo_config.py
@@ -25,12 +25,13 @@ class GaudiPPOConfig(PPOConfig):
     """
 
     use_habana: bool = False
-    """Use habana. Only applicable if use_habana is True"""
+    """Indicate if habana is used"""
     pad_for_acceleration: bool = False
-    """Use pad_for_acceleration. Only applicable if pad_for_acceleration is True"""
+    """Indicate if padding is used for acceleration."""
     pad_max_len: int = 0
-    """Use pad_for_acceleration. Only applicable if pad_for_acceleration is True"""
+    """max total length including padding. Only applicable if pad_for_acceleration is True"""
     pad_max_input_len: int = 0
+    """max input length including padding. Only applicable if pad_for_acceleration is True"""
 
     def __post_init__(self):
         self.backward_batch_size = self.mini_batch_size * self.gradient_accumulation_steps
@@ -61,7 +62,7 @@ def __post_init__(self):
                 )
 
         if self.use_habana:
-            from optimum.habana.transformers.modeling_utils import (  # pylint: disable=E0611, E0401
+            from optimum.habana.transformers.modeling_utils import (
                 adapt_transformers_to_gaudi,
             )
 
diff --git a/optimum/habana/trl/trainer/ppo_trainer.py b/optimum/habana/trl/trainer/ppo_trainer.py
index 20d18d2996..ec7fae4727 100644
--- a/optimum/habana/trl/trainer/ppo_trainer.py
+++ b/optimum/habana/trl/trainer/ppo_trainer.py
@@ -38,7 +38,7 @@
     stack_dicts,
     stats_to_np,
 )
-from trl.import_utils import is_npu_available, is_torch_greater_2_0, is_xpu_available
+from trl.import_utils import is_torch_greater_2_0
 from trl.models import SUPPORTED_ARCHITECTURES, PreTrainedModelWrapper, create_reference_model
 from trl.trainer import AdaptiveKLController, BaseTrainer, FixedKLController, RunningMoments
 
@@ -47,7 +47,7 @@
 from . import GaudiPPOConfig
 
 
-class GaudiPPOTrainer(PPOTrainer, BaseTrainer):
+class GaudiPPOTrainer(PPOTrainer):
     def __init__(
         self,
         config: GaudiPPOConfig = None,
@@ -63,7 +63,7 @@ def __init__(
         """
         Copied from PPOTrainer.__init__: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L145
         The only differences are:
-        - add new args for guadi in config
+        - add new args for Gaudi in config
         - use GaudiAccelerator instead of Accelerator
         """
         BaseTrainer.__init__(self, config)
@@ -84,7 +84,7 @@ def __init__(
             )
         # Step 1: Initialize Accelerator
         if config.use_habana:
-            from optimum.habana.accelerate import GaudiAccelerator as Accelerator  # pylint: disable=E0611, E0401
+            from optimum.habana.accelerate import GaudiAccelerator as Accelerator
         else:
             from accelerate import Accelerator
         self.accelerator = Accelerator(
@@ -250,14 +250,10 @@ def __init__(
         if not getattr(self.model, "is_sequential_parallel", False):
             self.current_device = self.accelerator.device
         else:
-            if is_xpu_available():
-                self.current_device = torch.device("xpu:0")
-            elif is_npu_available():
-                self.current_device = torch.device("npu:0")
-            elif self.accelerator.device.type == "hpu":
+            if self.accelerator.device.type == "hpu":
                 self.current_device = torch.device("hpu")
             else:
-                self.current_device = torch.device("cuda:0")
+                self.current_device = torch.device("cpu")
 
         PPODecorators.optimize_device_cache = self.config.optimize_device_cache
 
@@ -558,7 +554,7 @@ def step(
         early_stop = False
         if self.config.use_habana:
             self.unwrap_fw_for_hpu_graph_mode(self.model)
-            import habana_frameworks.torch as ht  # pylint: disable=E0611, E0401
+            import habana_frameworks.torch as ht
 
             model = self.accelerator.unwrap_model(self.model)
             if not hasattr(model, "wrap_train_in_graph"):
diff --git a/optimum/habana/trl/trainer/reward_trainer.py b/optimum/habana/trl/trainer/reward_trainer.py
new file mode 100644
index 0000000000..04e1575a3a
--- /dev/null
+++ b/optimum/habana/trl/trainer/reward_trainer.py
@@ -0,0 +1,82 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
+
+import torch.nn as nn
+from transformers import (
+    PreTrainedTokenizerBase,
+)
+from transformers.utils import PaddingStrategy
+
+from optimum.habana import GaudiTrainer
+
+
+class GaudiRewardTrainer(GaudiTrainer):
+    # Define how to compute the reward loss. We use the InstructGPT pairwise logloss: https://arxiv.org/abs/2203.02155
+    def compute_loss(self, model, inputs, return_outputs=False):
+        rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0]
+        rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0]
+        loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()
+        if return_outputs:
+            return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
+        return loss
+
+
+@dataclass
+class RewardDataCollatorWithPadding:
+    tokenizer: PreTrainedTokenizerBase
+    padding: Union[bool, str, PaddingStrategy] = True
+    max_length: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
+    return_tensors: str = "pt"
+
+    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+        features_j = []
+        features_k = []
+        for feature in features:
+            features_j.append(
+                {
+                    "input_ids": feature["input_ids_j"],
+                    "attention_mask": feature["attention_mask_j"],
+                }
+            )
+            features_k.append(
+                {
+                    "input_ids": feature["input_ids_k"],
+                    "attention_mask": feature["attention_mask_k"],
+                }
+            )
+        batch_j = self.tokenizer.pad(
+            features_j,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors=self.return_tensors,
+        )
+        batch_k = self.tokenizer.pad(
+            features_k,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors=self.return_tensors,
+        )
+        batch = {
+            "input_ids_j": batch_j["input_ids"],
+            "attention_mask_j": batch_j["attention_mask"],
+            "input_ids_k": batch_k["input_ids"],
+            "attention_mask_k": batch_k["attention_mask"],
+            "return_loss": True,
+        }
+        return batch

From c058ad2f262ee362bd3c0540867a15a81d2c40a6 Mon Sep 17 00:00:00 2001
From: "Wang, Yi A" <yi.a.wang@intel.com>
Date: Mon, 5 Feb 2024 04:53:25 -0800
Subject: [PATCH 4/6] update PPO

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---
 examples/trl/ppo.py                          | 1 -
 optimum/habana/trl/trainer/ppo_config.py     | 3 +--
 optimum/habana/trl/trainer/ppo_trainer.py    | 4 ++--
 optimum/habana/trl/trainer/reward_trainer.py | 9 ++++++++-
 4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/examples/trl/ppo.py b/examples/trl/ppo.py
index c46ef1f450..22ea73ab03 100644
--- a/examples/trl/ppo.py
+++ b/examples/trl/ppo.py
@@ -97,7 +97,6 @@ class ScriptArguments:
     init_kl_coef=script_args.init_kl_coef,
     adap_kl_ctrl=script_args.adap_kl_ctrl,
     use_habana=script_args.use_habana,
-    pad_for_acceleration=script_args.use_habana,
     pad_max_len=script_args.input_max_length + script_args.output_max_length,
     pad_max_input_len=script_args.input_max_length,
 )
diff --git a/optimum/habana/trl/trainer/ppo_config.py b/optimum/habana/trl/trainer/ppo_config.py
index 12a3a9185e..49e798be6b 100644
--- a/optimum/habana/trl/trainer/ppo_config.py
+++ b/optimum/habana/trl/trainer/ppo_config.py
@@ -51,10 +51,9 @@ def __post_init__(self):
                 raise ImportError(
                     "Please install wandb to use wandb logging. You can do this by running `pip install wandb`."
                 )
+        self.pad_for_acceleration = (self.pad_max_len > 0) and (self.pad_max_input_len > 0)
 
         if self.pad_for_acceleration:
-            if self.pad_max_input_len == 0:
-                raise AssertionError("pad_max_input_len ({self.pad_max_input_len}) must be set for pad input ")
             if self.pad_max_input_len >= self.pad_max_len:
                 raise AssertionError(
                     "pad_max_input_len ({self.pad_max_input_len}) must be smaller "
diff --git a/optimum/habana/trl/trainer/ppo_trainer.py b/optimum/habana/trl/trainer/ppo_trainer.py
index ec7fae4727..16c3534332 100644
--- a/optimum/habana/trl/trainer/ppo_trainer.py
+++ b/optimum/habana/trl/trainer/ppo_trainer.py
@@ -63,7 +63,7 @@ def __init__(
         """
         Copied from PPOTrainer.__init__: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L145
         The only differences are:
-        - add new args for Guadi in config
+        - add new args for Gaudi in config
         - use GaudiAccelerator instead of Accelerator
         """
         BaseTrainer.__init__(self, config)
@@ -825,7 +825,7 @@ def train_minibatch(
             if self.accelerator.sync_gradients:
                 self.accelerator.clip_grad_norm_(self.model_params, self.config.max_grad_norm)
         self.optimizer.step()
-        if self.config.use_habana:  # pragma: no cover
+        if self.config.use_habana:
             self.htcore.mark_step()
         # we call optimizer.zero_grad() every time and let `accelerator` handle accumulation
         # see https://huggingface.co/docs/accelerate/usage_guides/gradient_accumulation#the-finished-code
diff --git a/optimum/habana/trl/trainer/reward_trainer.py b/optimum/habana/trl/trainer/reward_trainer.py
index 04e1575a3a..bbb0c761fe 100644
--- a/optimum/habana/trl/trainer/reward_trainer.py
+++ b/optimum/habana/trl/trainer/reward_trainer.py
@@ -24,7 +24,10 @@
 
 
 class GaudiRewardTrainer(GaudiTrainer):
-    # Define how to compute the reward loss. We use the InstructGPT pairwise logloss: https://arxiv.org/abs/2203.02155
+    """
+    Copied from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/reward_modeling.py#L266
+    """
+
     def compute_loss(self, model, inputs, return_outputs=False):
         rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0]
         rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0]
@@ -36,6 +39,10 @@ def compute_loss(self, model, inputs, return_outputs=False):
 
 @dataclass
 class RewardDataCollatorWithPadding:
+    """
+    Copied from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/reward_modeling.py#L206
+    """
+
     tokenizer: PreTrainedTokenizerBase
     padding: Union[bool, str, PaddingStrategy] = True
     max_length: Optional[int] = None

From f563ad84b0fea3d3a4a82c4c34144e27a4267e94 Mon Sep 17 00:00:00 2001
From: "Wang, Yi A" <yi.a.wang@intel.com>
Date: Thu, 8 Feb 2024 04:33:14 -0800
Subject: [PATCH 5/6] add evaluate and scikit-learn to requirements.txt

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---
 examples/trl/requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/trl/requirements.txt b/examples/trl/requirements.txt
index c980a4b30c..2c944e2168 100644
--- a/examples/trl/requirements.txt
+++ b/examples/trl/requirements.txt
@@ -3,3 +3,5 @@ peft == 0.6.2
 datasets
 wandb
 tyro
+evaluate
+scikit-learn

From 4a03a4b879c6a342cf7d493ef6698abfad4c96a9 Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Sun, 11 Feb 2024 05:43:11 +0100
Subject: [PATCH 6/6] Remove W&B logs in example

---
 examples/trl/README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/trl/README.md b/examples/trl/README.md
index 8049349da6..537e1dbb4e 100644
--- a/examples/trl/README.md
+++ b/examples/trl/README.md
@@ -120,7 +120,6 @@ There are three main steps to the PPO training process:
 3. RL fine-tuning of llama-v2-7b-se with the llama-v2-7b-se-rm reward model:
     ```
     python ../gaudi_spawn.py --world_size 8 --use_mpi ppo.py \
-        --log_with=wandb \
         --model_name=./sft/final_merged_checkpoint \
         --reward_model_name=./rm_merged_checkpoint \
         --tokenizer_name=meta-llama/Llama-2-7b-hf \