Minibatch impl #364

Merged · 10 commits · Apr 6, 2023
Changes from 7 commits
5 changes: 5 additions & 0 deletions trlx/data/configs.py
@@ -196,6 +196,9 @@ class TrainConfig:

:param seed: Random seed
:type seed: int

+ :param minibatch_size: Size of model input during one forward pass. Must divide batch size
Collaborator:

(very pedantic) nit, feel free to ignore: usually I've heard this called micro batch, with minibatch referring to what we usually call "a batch" (to distinguish from a single batch of the whole dataset)

+ :type minibatch_size: int
"""

total_steps: int
@@ -223,6 +226,8 @@ class TrainConfig:

seed: int = 1000

+ minibatch_size: Optional[int] = None

@classmethod
def from_dict(cls, config: Dict[str, Any]):
return cls(**config)
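
For reference, a minimal sketch (not part of the diff, with hypothetical values) of the arithmetic the new field implies: minibatch_size must evenly divide batch_size, and their ratio is the number of forward/backward passes accumulated per optimizer step.

    # Hypothetical numbers; only the keys relevant to minibatching are shown
    train = {
        "batch_size": 32,      # samples consumed per optimizer step
        "minibatch_size": 8,   # samples per forward/backward pass
    }
    assert train["batch_size"] % train["minibatch_size"] == 0, "minibatch_size must divide batch_size"
    num_mb = train["batch_size"] // train["minibatch_size"]  # 4 accumulation steps per optimizer step
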
41 changes: 35 additions & 6 deletions trlx/trainer/accelerate_base_trainer.py
@@ -16,6 +16,7 @@

import trlx.utils.logging as logging
from trlx.data.configs import TRLConfig
+ from trlx.data.ppo_types import PPORLBatch
from trlx.trainer import BaseRLTrainer, register_trainer
from trlx.utils import (
filter_non_scalars,
@@ -45,6 +46,12 @@ class AccelerateRLTrainer(BaseRLTrainer):
def __init__(self, config, **kwargs): # noqa: C901
super().__init__(config, **kwargs)
self.max_length = config.train.seq_length
+ if config.train.minibatch_size:
+     assert config.train.batch_size % config.train.minibatch_size == 0, "Minibatch size must divide batch size"
+     self.mb_size = config.train.minibatch_size
+ else:
+     self.mb_size = config.train.batch_size
+ self.num_mb = config.train.batch_size // self.mb_size
self.accelerator = Accelerator(log_with=config.train.tracker, logging_dir=config.train.logging_dir)

if self.accelerator.state.deepspeed_plugin is not None:
@@ -468,18 +475,40 @@ def learn(self): # noqa: C901
for _ in range(self.config.train.epochs):
# For each batch
for batch in self.train_dataloader:
+ mbs = [
+     PPORLBatch(
Contributor:

@Dahoas Just to confirm, this is still a draft, correct? Since (although I haven't run it) I think this would break for ILQLBatch and other datatypes.

Collaborator:

Ah you're right, I just ran this with the benchmarking scripts and it crashed here with this error

Contributor:

Fixed this here: #403

+         query_tensors=batch.query_tensors[mbi * self.mb_size : (mbi + 1) * self.mb_size],
+         response_tensors=batch.response_tensors[mbi * self.mb_size : (mbi + 1) * self.mb_size],
+         logprobs=batch.logprobs[mbi * self.mb_size : (mbi + 1) * self.mb_size],
+         values=batch.values[mbi * self.mb_size : (mbi + 1) * self.mb_size],
+         rewards=batch.rewards[mbi * self.mb_size : (mbi + 1) * self.mb_size],
+     )
+     for mbi in range(self.num_mb)
+ ]
# For each update per batch
for _ in range(self.n_updates_per_batch):
# Note that whereas standard policy gradient methods perform one
# gradient update per batch, PPO for example commonly performs
# multiple gradient updates on the same batch of data.
# https://arxiv.org/pdf/1707.06347.pdf
- forward_time = time()
- loss, stats = self.loss(batch)
- forward_time = time() - forward_time
- backward_time = time()
- self.accelerator.backward(loss)
- backward_time = time() - backward_time
+ forward_time = 0
+ backward_time = 0
+ stats_accum = []
+ for mb in mbs:
Contributor:

To avoid unnecessary gradient synchronization when using gradient accumulation, you can simply add self.accelerator.accumulate(self.model) here.

Collaborator:

Does that require setting gradient_accumulation_steps for the accelerator? cc @Dahoas

Collaborator:

FWIW, I tried using that atop this PR and got weird results: https://wandb.ai/uwu1/trlx/reports/Untitled-Report--VmlldzozOTAyNDg4

Contributor:

Yes, but we're already specifying that in the config files like zero2-bf16.yaml.

Did you add it right below this line or literally at the top of the script? It should be added below this line, since entering the context is how accumulate tracks the number of steps being executed, so it knows when to sync (it has an internal step counter).

Also, the division of the loss by self.num_mb should go away, since that would be handled by the accelerator.

Here's an example: https://github.com/muellerzr/timing_experiments/blob/main/good.py#L152

Collaborator (@cat-state, Mar 27, 2023):

@eluzhnica Want to make a PR adding that atop this PR? Then we can merge it in after this one. It would be good to still be able to specify it using the TRLConfig vs. having to use a separate one.

Contributor:

@cat-state Okay just made a PR. I'll try to run the same tests that @Dahoas ran to confirm it works as intended.

As for the TRLConfig, happy to do so, but it seems to me that the accelerate configs are set up separately in the repo (configs/accelerate/...), and that's the general pattern I've seen before too. Accelerate feeds those params behind the scenes for us automatically, so if we were to also specify them in the TRLConfig it would be a bit redundant (and could lead to conflicting values). Let me know what you think.

+     forward_time -= time()
+     loss, stats = self.loss(mb)
+     forward_time += time()
+     loss /= self.num_mb
+     backward_time -= time()
+     self.accelerator.backward(loss)
+     backward_time += time()
+     stats_accum.append(stats)

+ forward_time /= self.num_mb
+ backward_time /= self.num_mb
+ # TODO(Dahoas): Best way to combine stats between mbs?
+ # How does accelerate do it?
+ stats = {key: sum([stats[key] for stats in stats_accum]) / self.num_mb for key in stats_accum[0]}

self.opt.step()
self.opt.zero_grad()
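
To make the accumulate suggestion from the review thread concrete, here is a rough sketch (an illustration of the reviewers' proposal, not the code merged in this PR) of the minibatch loop following the gradient-accumulation pattern from the accelerate documentation. Names such as self.model, self.opt, self.loss, mbs and stats_accum are taken from the diff above, and it assumes gradient_accumulation_steps is set to self.num_mb either on the Accelerator or in the accelerate config:

    stats_accum = []
    for mb in mbs:
        with self.accelerator.accumulate(self.model):
            loss, stats = self.loss(mb)
            # No manual `loss /= self.num_mb` here: accelerator.backward scales the loss,
            # and the prepared optimizer only steps once gradients are synced on the
            # final accumulation step.
            self.accelerator.backward(loss)
            self.opt.step()
            self.opt.zero_grad()
            stats_accum.append(stats)
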
18 changes: 10 additions & 8 deletions trlx/trainer/accelerate_ppo_trainer.py
@@ -434,11 +434,6 @@ def make_experience(self, num_rollouts: int = 1024, iter_count: int = 0): # noq
ref_logprobs = logprobs_of_labels(ref_logits[:, :-1, :], all_tokens[:, 1:])

n_samples: int = samples.shape[0]
- logprobs = logprobs.cpu()
- ref_logprobs = ref_logprobs.cpu()
- prompt_tensors = prompt_tensors.cpu()
- sample_outputs = sample_outputs.cpu()
- values = values.cpu()[:, :-1]

# Estimate the KL divergence between the model and reference model
if self.config.model.model_arch_type == "seq2seq":
@@ -447,16 +442,23 @@ def make_experience(self, num_rollouts: int = 1024, iter_count: int = 0): # noq
else:
start = prompt_tensors.shape[1] - 1

+ log_ratio = (logprobs - ref_logprobs) * attention_mask[:, :-1]
+ self.mean_kl = (log_ratio.exp() - 1 - log_ratio).mean().to(device)

+ logprobs = logprobs.cpu()
+ ref_logprobs = ref_logprobs.cpu()
+ prompt_tensors = prompt_tensors.cpu()
+ sample_outputs = sample_outputs.cpu()
+ values = values.cpu()[:, :-1]

ends = start + attention_mask[:, start:].sum(1)

# Get the logprobs and values, for tokens that are not padding
# or beginning of sequences tokens. These are from the model (not the reference model)
all_values = [values[ix, start : ends[ix]] for ix in range(n_samples)]
all_logprobs = [logprobs[ix, start : ends[ix]] for ix in range(n_samples)]

- log_ratio = (logprobs - ref_logprobs) * attention_mask[:, :-1].cpu()
- self.mean_kl = (log_ratio.exp() - 1 - log_ratio).mean().to(device)
- kl_penalty = self.kl_ctl.value * -log_ratio
+ kl_penalty = self.kl_ctl.value * -log_ratio.cpu()
kl_penalty = [xs[start : ends[ix]] for ix, xs in enumerate(kl_penalty)]

rollout_count = 0
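
A closing note on the hunk above: it mainly reorders the computation so that the log-ratio and the always-non-negative estimator exp(r) - 1 - r (often called the k3 KL estimator) are evaluated while the tensors are still on the accelerator device, and the per-token penalty then reuses a .cpu() copy of log_ratio. A standalone sketch with made-up shapes, independent of the trlx internals:

    import torch

    logprobs = torch.randn(4, 15)       # per-token log-probs under the trained model, (batch, seq_len - 1)
    ref_logprobs = torch.randn(4, 15)   # per-token log-probs under the reference model
    attention_mask = torch.ones(4, 16)  # mask over the full token sequence

    log_ratio = (logprobs - ref_logprobs) * attention_mask[:, :-1]  # zero out padded positions
    mean_kl = (log_ratio.exp() - 1 - log_ratio).mean()              # scalar KL estimate, always >= 0
    kl_penalty = 0.1 * -log_ratio                                   # 0.1 stands in for self.kl_ctl.value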