From 8528f5a8d955e0a3a15aa1df51160109ff778381 Mon Sep 17 00:00:00 2001 From: Charlie Ruan Date: Thu, 19 Mar 2026 17:25:47 +0000 Subject: [PATCH 1/2] [StepWise] Trivial fix to avg_response_length metric --- skyrl/train/trainer.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/skyrl/train/trainer.py b/skyrl/train/trainer.py index dd18e73ee5..5f56cc3229 100644 --- a/skyrl/train/trainer.py +++ b/skyrl/train/trainer.py @@ -676,15 +676,9 @@ def convert_to_training_input(self, generator_output: GeneratorOutput, uids: Lis training_input.metadata["trajectory_ids"] = [ trajectory_id.to_string() for trajectory_id in generator_output["trajectory_ids"] ] - training_input.metadata["avg_response_length"] = sum( - len(sample_response_ids) - for sample_response_ids, is_last_step in zip(response_ids, generator_output["is_last_step"]) - if is_last_step - ) / len(response_ids) - else: - training_input.metadata["avg_response_length"] = sum( - len(sample_response_ids) for sample_response_ids in response_ids - ) / len(response_ids) + training_input.metadata["avg_response_length"] = sum( + len(sample_response_ids) for sample_response_ids in response_ids + ) / len(response_ids) logger.info(f"Number of sequences before padding: {len(training_input['sequences'])}") training_input = self.pad_batch(training_input) From fe5fcf99c38fb53bc9af5b86d964fe615afa2427 Mon Sep 17 00:00:00 2001 From: Charlie Ruan Date: Thu, 19 Mar 2026 17:27:44 +0000 Subject: [PATCH 2/2] doc --- docs/content/docs/tutorials/step-wise-training.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/content/docs/tutorials/step-wise-training.mdx b/docs/content/docs/tutorials/step-wise-training.mdx index af3810e202..ac1eccf22b 100644 --- a/docs/content/docs/tutorials/step-wise-training.mdx +++ b/docs/content/docs/tutorials/step-wise-training.mdx @@ -29,7 +29,7 @@ When step-wise is enabled, a batch of T trajectories with an average of M turns - **Each mini-batch contains the same number of sequences** (`policy_mini_batch_size * n_samples`), but those sequences are now step-samples rather than full trajectories. The effective number of trajectories per mini-batch is reduced. The number of mini-batches (and hence optimizer steps) per training batch increases by the average number of turns — so if you have `train_batch_size=mini_batch_size=32` with an average of 3 turns, you get 3 optimizer steps instead of 1 for each training step. It is also possible that a mini-batch boundary falls mid-trajectory. - **Advantages are computed on last steps only**, then broadcast to all steps of the same trajectory. This is mathematically equivalent to non-step-wise advantage computation for GRPO. - **Training time grows as O(T²) vs O(T)**, since each trajectory of T turns becomes T sequences to forward (each with a growing prompt prefix), as opposed to 1 sequence. SkyRL will support prefix-aware merging of per-step sequences when the prefix matches (WIP), which brings the cost back to O(T) in the common case. -- **Metrics** like `generate/avg_sequence_length` are per-turn rather than per-trajectory. +- **Metrics** like `generate/avg_num_tokens` and `generate/avg_response_length` are per-turn rather than per-trajectory, since each training sample is a single turn. Some algorithms have their behavior altered by step-wise decomposition, since each turn is now treated as its own sequence: