159 commits
097de0d
Fixed ~100 type errors
SahilJain314 May 9, 2025
21afaf7
fixed 50 more type issues
SahilJain314 May 9, 2025
f38ab81
lint
SahilJain314 May 9, 2025
e5296bc
Added mypy config, fixed 50 more type errors, and updated old typing.…
SahilJain314 May 11, 2025
a9cee8f
Updated pyproject
SahilJain314 May 11, 2025
89cf9cf
Down to 100 errors
SahilJain314 May 11, 2025
e8160c3
Update pyproject
SahilJain314 May 11, 2025
6a494b9
Down to 50 errors
SahilJain314 May 12, 2025
8a95964
Added testing doc
SahilJain314 May 12, 2025
3b0bfbc
Fixed missing import
SahilJain314 May 12, 2025
588cfda
Fixed tokenizer type
SahilJain314 May 12, 2025
bc07e76
Down to 50 errors
SahilJain314 May 12, 2025
72e4819
fixed 150 strict mypy errors
SahilJain314 May 12, 2025
9bb8d83
lint
SahilJain314 May 12, 2025
7376384
Fixed another 100 strict typing mypy errors
SahilJain314 May 12, 2025
b3f5098
Fixed another 100 strict typing mypy errors (down to 130)
SahilJain314 May 12, 2025
e8b3552
Brought non-strict errors down to 18
SahilJain314 May 13, 2025
7aa1f44
Brought non-strict errors down further
SahilJain314 May 13, 2025
958f538
Fixed pynvml test type
SahilJain314 May 13, 2025
6a14aec
feat: support mcore extra (megatron + tron)
terrykong May 2, 2025
17f5136
typo
terrykong May 17, 2025
b17e163
all good
terrykong May 17, 2025
5b3b96d
pin pre-commit
terrykong May 17, 2025
48f8ffc
undo
terrykong May 17, 2025
8aa6ff8
move submodule stuff into comment until
terrykong May 18, 2025
c5f4305
ok
terrykong May 18, 2025
fdc6024
remove this round
terrykong May 18, 2025
5c22f20
fix
terrykong May 18, 2025
23c0943
dockerignore too
terrykong May 18, 2025
7d2d8dc
Moved all actors to using NamedSharding-based distribution instead of…
SahilJain314 May 20, 2025
9cd192b
oops forgot named_sharding files
SahilJain314 May 20, 2025
9952110
lint
SahilJain314 May 20, 2025
e599e76
Merge remote-tracking branch 'origin/main' into sahilj/type_fix
SahilJain314 May 20, 2025
c917363
Updated with tot merge
SahilJain314 May 20, 2025
a843293
Updated configure_generation_config
SahilJain314 May 21, 2025
11b2f42
updated uv lock
SahilJain314 May 21, 2025
d72dfcb
original uv lock
SahilJain314 May 21, 2025
18aee25
updated uv lock
SahilJain314 May 21, 2025
a9c285b
original uv
SahilJain314 May 21, 2025
b678022
Merge remote-tracking branch 'origin/main' into sahilj/type_fix
SahilJain314 May 21, 2025
3b241df
updated uv lock
SahilJain314 May 21, 2025
40c6275
Pushed coordinate finding into the NamedSharding
SahilJain314 May 21, 2025
aacb783
Unit test failure
SahilJain314 May 21, 2025
c477048
Added tests and fixed types
SahilJain314 May 21, 2025
c0043c1
Added mypy to ci (just a warning rn)
SahilJain314 May 21, 2025
e3f98ea
Merge remote-tracking branch 'origin/sahilj/type_fix' into sahilj/nam…
SahilJain314 May 21, 2025
76d510a
try setting max_jobs really low
terrykong May 21, 2025
3058683
Merge remote-tracking branch 'origin' into sahilj/named_sharding
SahilJain314 May 21, 2025
c38fcb4
Merge branch 'tk/megatron-extra' into sahilj/megatron_tot
SahilJain314 May 22, 2025
3f9667f
Added Megatron
SahilJain314 May 22, 2025
4b82ffd
Megatron fixes
SahilJain314 May 23, 2025
e36935d
tot bugfixes
SahilJain314 May 27, 2025
36d41ba
Updated git module
SahilJain314 May 27, 2025
9678444
Added kwargs to save checkpoint
SahilJain314 May 27, 2025
e5bcd1e
Fixed checkpointing Megatron
SahilJain314 May 28, 2025
f9be255
Don't bother with rng restore
SahilJain314 May 28, 2025
42800e6
Fixed metric logging
SahilJain314 May 28, 2025
dc7920b
lint
SahilJain314 May 28, 2025
f88bbc3
Updated nemo patch
SahilJain314 May 29, 2025
8910bd4
Updated patch
SahilJain314 May 29, 2025
84bd407
Fixed memory offloading for parameter and grad buffers
SahilJain314 Jun 3, 2025
2271229
fix: Don't call state_dict in loop + dtype fix (#445)
yfw Jun 3, 2025
b08ab1c
Enable dynamic batching
SahilJain314 Jun 3, 2025
bbf8de7
Fixed pp bug
SahilJain314 Jun 3, 2025
41a542c
lint
SahilJain314 Jun 3, 2025
6b6355e
Merge remote-tracking branch 'origin' into sahilj/megatron_tot
SahilJain314 Jun 3, 2025
8ad9565
Fixed merge artifact
SahilJain314 Jun 4, 2025
f8f55a8
Fixes for tests
SahilJain314 Jun 4, 2025
1688ff2
Fixed dynamic batching and improved memory usage
SahilJain314 Jun 4, 2025
af9f767
default expandable segments on
SahilJain314 Jun 4, 2025
678caf7
Added basic sequence packing
SahilJain314 Jun 10, 2025
ec9b174
Added basic sequence packing
SahilJain314 Jun 10, 2025
72ff438
Fixed PP with sequence packing
SahilJain314 Jun 10, 2025
7142efa
Updated Megatron patch
SahilJain314 Jun 10, 2025
ce21205
Remove custom_fsdp mentions
SahilJain314 Jun 11, 2025
7b3cc97
Bump ray
SahilJain314 Jun 11, 2025
6608e62
Added a 70b config with megatron
SahilJain314 Jun 11, 2025
e0002c4
Merge branch 'sahilj/megatron_packed' of github.com:NVIDIA/NeMo-RL in…
SahilJain314 Jun 11, 2025
6a24f56
Sequence packing
ahmadki Jun 3, 2025
04859b7
checkpoint
ahmadki Jun 4, 2025
7592a35
revert some packing changes
ahmadki Jun 4, 2025
6c57e22
initial fix for different micro batch lengths
ahmadki Jun 6, 2025
051b329
benchmark configs
ahmadki Jun 6, 2025
fbbd77a
minor fixes
ahmadki Jun 8, 2025
88919cf
grpo configs
ahmadki Jun 8, 2025
4fb888d
grpo fixes so code would run
ahmadki Jun 8, 2025
1da878e
loss function fixes, added packing strategy to policy
ahmadki Jun 9, 2025
db26d63
SFT config/API cleanup
ahmadki Jun 9, 2025
5598f50
debug mode on
ahmadki Jun 9, 2025
c1b072f
new packing
ahmadki Jun 9, 2025
06de588
cleanup
ahmadki Jun 9, 2025
29a0cae
Merge branch 'main' into ahmadki/dev/sequence_packing_2
ahmadki Jun 11, 2025
34bca2e
Merge branch 'sahilj/megatron_packed' into ahmadki/dev/sequence_packi…
ahmadki Jun 11, 2025
41972d7
implemented MFFD as a "SequencePacker", moved it to bin packing algor…
ahmadki Jun 15, 2025
ab47ed1
logging cleanup
ahmadki Jun 15, 2025
35e0421
made packing algorithms naming more clear
ahmadki Jun 15, 2025
e2a3375
more code cleanup
ahmadki Jun 18, 2025
099ccce
Merge branch 'main' into ahmadki/dev/sequence_packing_2
ahmadki Jun 18, 2025
1c8cc46
reduce amount of diff with main
ahmadki Jun 18, 2025
76094e0
reduce amount of diff with main 2
ahmadki Jun 18, 2025
fd75847
added back flash-attn dependency
ahmadki Jun 22, 2025
773c6db
cleanup and config alignments
ahmadki Jun 22, 2025
8f64913
Merge branch 'main' into ahmadki/dev/sequence_packing_2
ahmadki Jun 22, 2025
61804e4
config alignments, configs for new implementation
ahmadki Jun 24, 2025
67c2958
generic get_packer
ahmadki Jun 24, 2025
ad31fce
config syntax cleanup
ahmadki Jun 24, 2025
e5811c2
moved dtensor sequence packing functions into hf common
ahmadki Jun 24, 2025
9f35db8
typed flash attention kwargs
ahmadki Jun 24, 2025
8674b96
dropped database based seq packing
ahmadki Jun 24, 2025
948096d
typo
ahmadki Jun 24, 2025
c3c8b66
unified loss_fn for seq packing
ahmadki Jun 24, 2025
ff27c79
config organization
ahmadki Jun 24, 2025
106ef8c
removed debug configs
ahmadki Jun 24, 2025
3b1b444
more config cleanup
ahmadki Jun 24, 2025
dc3e2d6
removed PackedDataset
ahmadki Jun 29, 2025
84be3b8
Merge branch 'main' into ahmadki/sequence_packing
ahmadki Jun 29, 2025
fe5e0e1
aligned NeMo git submodule with main
ahmadki Jun 29, 2025
6f9e203
Merge branch 'main' into ahmadki/sequence_packing
SahilJain314 Jun 30, 2025
d81a24d
Merge branch 'main' into ahmadki/sequence_packing
SahilJain314 Jul 1, 2025
b4c56b6
Lint fix
SahilJain314 Jul 1, 2025
222dc8a
Load AutoModelForCausalLM weight in FP32
ahmadki Jul 1, 2025
a9ec7cd
Merge branch 'main' into ahmadki/sequence_packing
ahmadki Jul 7, 2025
049e193
Merge branch 'main' into ahmadki/sequence_packing
ahmadki Jul 9, 2025
03208bc
Updating seq packing algo to modified ffd
SahilJain314 Jul 1, 2025
3764f77
Enabling sequence packing by default for megatron
SahilJain314 Jul 1, 2025
bd2a393
Critical sequence packing fixes for Megatron
SahilJain314 Jul 3, 2025
f2db981
Init CP (no PP)
SahilJain314 Jul 10, 2025
c0d2898
Fixed CP + PP
SahilJain314 Jul 11, 2025
0543108
Cleanup
SahilJain314 Jul 11, 2025
2b1b4b5
Fixed unit tests
SahilJain314 Jul 11, 2025
24cf74f
lint
SahilJain314 Jul 11, 2025
da84d6f
copyright
SahilJain314 Jul 11, 2025
e14e41b
copyright
SahilJain314 Jul 11, 2025
790d803
Merge branch 'main' into sahilj/cp-rebase
SahilJain314 Jul 11, 2025
3e22ec2
bugfix
SahilJain314 Jul 11, 2025
75e2465
PR fixes
SahilJain314 Jul 11, 2025
3dadbf2
Merge remote-tracking branch 'origin' into sahilj/cp-rebase
SahilJain314 Jul 11, 2025
508fdf4
PR Fixes
SahilJain314 Jul 11, 2025
9c2b013
Update nemo_rl/models/policy/__init__.py
SahilJain314 Jul 11, 2025
8b9ed7e
Update tests/unit/data/packing/test_algorithms.py
SahilJain314 Jul 11, 2025
b386ab3
Lint, also adding Ahmad as Coauthor
SahilJain314 Jul 11, 2025
11e3b7d
Fixed dtensor sequence packing
SahilJain314 Jul 17, 2025
a4416f2
Merge remote-tracking branch 'origin/main' into sahilj/cp-rebase
SahilJain314 Jul 17, 2025
9ee7bf5
Fixed NeMo commit merge
SahilJain314 Jul 17, 2025
1e95143
feat: Enable CP during get_logprobs for dtensor worker. (#678)
joyang-nv Jul 18, 2025
5e497ce
Try unit fix
SahilJain314 Jul 18, 2025
62a6f01
fix: remove unnecessary ray initialization since it's handled at the …
terrykong Jul 19, 2025
4500cde
Unit fix
SahilJain314 Jul 21, 2025
36f67ac
docs: update converter path in README. (#672)
xxman-google Jul 17, 2025
75e2f69
fix: make mcore lr scheduler configuration consistent with dtensor (#…
ashors1 Jul 17, 2025
d431685
fix: fix mcore LR increment (#685)
ashors1 Jul 17, 2025
f9ef28f
fix: upgrade datasets to fix squad download (#692)
ashors1 Jul 18, 2025
cc31642
fix: Megatron config updates to avoid OOM (#687)
ashors1 Jul 18, 2025
df53dbc
fix: fix lr scheduler for config that was missed in #681 (#693)
ashors1 Jul 18, 2025
e93284d
fix: Fix gemma models broken by HF update (#676)
yfw Jul 19, 2025
172dd0a
chore: add CP+SP (sequence parallel) assertion in DTensor worker (#689)
yuki-97 Jul 19, 2025
9083a2e
Lint
SahilJain314 Jul 21, 2025
db08e14
Fixed generation test
SahilJain314 Jul 21, 2025
75bb1b2
revert conftest
SahilJain314 Jul 21, 2025
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -351,7 +351,7 @@ If you have trained a model and saved the checkpoint in the Pytorch DCP format,

```sh
# Example for a GRPO checkpoint at step 170
uv run python examples/convert_dcp_to_hf.py \
uv run python examples/converters/convert_dcp_to_hf.py \
--config results/grpo/step_170/config.yaml \
--dcp-ckpt-path results/grpo/step_170/policy/weights/ \
--hf-ckpt-path results/grpo/hf
7 changes: 6 additions & 1 deletion docs/model-quirks.md
@@ -31,8 +31,13 @@ NeMo-RL uses the vLLM V1 runtime for both synchronous and asynchronous inference

### Context Parallel with FSDP2

NeMo-RL implemented this feature based on torch CP [implementation](https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/experimental/_attention.py). And we inherit its limitations.
- NeMo-RL implements this feature on top of the torch CP [implementation](https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/experimental/_attention.py) and inherits its limitations.
- Whether a model supports CP depends solely on the arguments passed to `torch.nn.functional.scaled_dot_product_attention`. NeMo-RL currently passes an all-ones attention mask to `model.forward`. Gemma-3 does not ignore this mask, so `attn_bias` ends up non-None, which torch CP does not support. See this [assertion](https://github.com/pytorch/pytorch/blob/134179474539648ba7dee1317959529fbd0e7f89/torch/distributed/tensor/experimental/_attention.py#L262).
- Context parallel cannot be used together with sequence packing. Sequence packing requires `attn_implementation="flash_attention_2"`, which conflicts with context parallel's requirement for the SDPA implementation. Refer to [here](https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/modeling_utils.py#L2317) for more details.

- It is a known issue that context parallel cannot be used together with sequence parallel. Refer to [here](https://github.com/NVIDIA-NeMo/RL/issues/659) for more details.

## vLLM Async Rollout Timeout

3 changes: 3 additions & 0 deletions examples/configs/grpo-deepscaler-1.5b-8K.yaml
@@ -55,6 +55,9 @@ policy:
dynamic_batching:
enabled: False

sequence_packing:
enabled: False

# makes the training sequence length divisible by the tensor parallel size
# this is useful for sequence parallel training
make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
3 changes: 3 additions & 0 deletions examples/configs/grpo_deepscaler-1.5b-24K.yaml
@@ -21,6 +21,9 @@ policy:
dynamic_batching:
enabled: False

sequence_packing:
enabled: False

optimizer:
name: "torch.optim.AdamW"
kwargs:
10 changes: 10 additions & 0 deletions examples/configs/grpo_math_1B.yaml
@@ -51,16 +51,26 @@ policy:
tensor_parallel_size: 1
context_parallel_size: 1
custom_parallel_plan: null

megatron_cfg:
enabled: false

# dynamic_batching improves performance by ensuring logprob and training microbatches
# have a sufficient number of tokens to maximize GPU utilization. Specifically, variable-length
# responses are sorted by sequence length and bucketed into microbatches whose total token
# count is approximately 'train_mb_tokens' and 'logprob_mb_tokens' for the
# training and logprob stages respectively.
dynamic_batching:
enabled: False
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
sequence_length_round: 64

sequence_packing:
enabled: True
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
algorithm: "modified_first_fit_decreasing"
sequence_length_round: 64

# makes the training sequence length divisible by the tensor parallel size
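The `"modified_first_fit_decreasing"` algorithm selected above is a variant of classic first-fit-decreasing bin packing. A minimal sketch of the plain FFD idea (function and variable names are illustrative, not NeMo-RL's actual API): sort sequences longest-first, then drop each one into the first microbatch "bin" that still has room under the token budget.

```python
# Illustrative first-fit-decreasing packer; not NeMo-RL's actual implementation.
def first_fit_decreasing(seq_lengths, max_tokens):
    bins = []          # each bin is a list of sequence indices
    bin_totals = []    # running token count per bin
    # visit sequences longest-first
    order = sorted(range(len(seq_lengths)), key=lambda i: -seq_lengths[i])
    for i in order:
        length = seq_lengths[i]
        for b, total in enumerate(bin_totals):
            if total + length <= max_tokens:
                bins[b].append(i)
                bin_totals[b] += length
                break
        else:
            # no existing bin fits: open a new one
            bins.append([i])
            bin_totals.append(length)
    return bins

# e.g. packing responses of varying length into 512-token microbatches
packs = first_fit_decreasing([300, 200, 180, 100, 60], max_tokens=512)
print(packs)  # [[0, 1], [2, 3, 4]]
```

The "modified" variants referenced in the configs tune this baseline for training-specific constraints (e.g. padding/rounding via `sequence_length_round`), but the core packing loop is the same.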
11 changes: 8 additions & 3 deletions examples/configs/grpo_math_1B_megatron.yaml
@@ -49,14 +49,19 @@ policy:
# responses are sorted by sequence length and bucketed into microbatches whose total token
# count is approximately 'train_mb_tokens' and 'logprob_mb_tokens' for the
# training and logprob stages respectively.
#
# We disable it for Megatron as it is incompatible with pipeline parallelism. Instead, we use sequence packing.
dynamic_batching:
enabled: False
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
sequence_length_round: 64

sequence_packing:
enabled: False # coming soon
enabled: True
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
algorithm: "modified_ffd"
algorithm: "modified_first_fit_decreasing"
sequence_length_round: 64

max_grad_norm: 1.0
@@ -116,7 +121,7 @@ policy:
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: null
lr_warmup_iters: 50
lr_warmup_iters: 13
lr_warmup_init: 5.0e-7

distributed_data_parallel_config:
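The dynamic-batching comment in the config above can be sketched in a few lines (a simplified illustration under assumed names, not NeMo-RL's actual code): sort variable-length responses by length, then greedily cut microbatches so each holds roughly `mb_tokens` tokens.

```python
# Illustrative dynamic-batching bucketer; a simplification of the behavior
# described in the config comments, not the actual NeMo-RL implementation.
def bucket_by_tokens(seq_lengths, mb_tokens):
    # sort indices by sequence length so similar lengths share a microbatch
    order = sorted(range(len(seq_lengths)), key=lambda i: seq_lengths[i])
    batches, current, current_toks = [], [], 0
    for i in order:
        if current and current_toks + seq_lengths[i] > mb_tokens:
            batches.append(current)
            current, current_toks = [], 0
        current.append(i)
        current_toks += seq_lengths[i]
    if current:
        batches.append(current)
    return batches

print(bucket_by_tokens([60, 100, 180, 200, 300], mb_tokens=512))
# [[0, 1, 2], [3, 4]]
```

Unlike sequence packing, each sequence here still occupies its own row; the bucketing only controls how many rows go into one microbatch, which is why it breaks down under pipeline parallelism (variable microbatch counts per stage) and packing is used instead.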
4 changes: 2 additions & 2 deletions examples/configs/grpo_math_70B_megatron.yaml
@@ -49,7 +49,7 @@ policy:
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: null
lr_warmup_iters: 50
lr_warmup_iters: 13
lr_warmup_init: 3.0e-8

generation:
@@ -62,7 +62,7 @@ policy:
stop_strings: null
vllm_cfg:
tensor_parallel_size: 4
gpu_memory_utilization: 0.8
gpu_memory_utilization: 0.6
max_model_len: ${policy.max_total_sequence_length}

cluster:
6 changes: 3 additions & 3 deletions examples/configs/grpo_math_8B_megatron.yaml
@@ -54,7 +54,7 @@ policy:
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: null
lr_warmup_iters: 50
lr_warmup_iters: 13
lr_warmup_init: 3.0e-8

generation:
@@ -67,9 +67,9 @@ policy:
stop_strings: null
vllm_cfg:
tensor_parallel_size: 1
gpu_memory_utilization: 0.8
gpu_memory_utilization: 0.6
max_model_len: ${policy.max_total_sequence_length}

cluster:
gpus_per_node: 8
num_nodes: 1
num_nodes: 1
11 changes: 6 additions & 5 deletions examples/configs/grpo_math_qwen30ba3b_megatron.yaml
@@ -29,11 +29,11 @@ policy:
enabled: true
empty_unused_memory_level: 1
converter_type: "LlamaForCausalLM"
tensor_model_parallel_size: 4
pipeline_model_parallel_size: 4
tensor_model_parallel_size: 2
pipeline_model_parallel_size: 1
context_parallel_size: 1
expert_tensor_parallel_size: 1
expert_model_parallel_size: 4
expert_model_parallel_size: 8
sequence_parallel: True
pipeline_dtype: ${policy.precision}

@@ -52,7 +52,7 @@ policy:
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: null
lr_warmup_iters: 50
lr_warmup_iters: 13
lr_warmup_init: 3.0e-8

env_vars:
@@ -68,7 +68,8 @@ policy:
stop_strings: null
vllm_cfg:
tensor_parallel_size: 4
gpu_memory_utilization: 0.8
gpu_memory_utilization: 0.7
enforce_eager: false
max_model_len: ${policy.max_total_sequence_length}

cluster:
8 changes: 7 additions & 1 deletion examples/configs/sft.yaml
@@ -44,6 +44,12 @@ policy:
dynamic_batching:
enabled: false

sequence_packing:
enabled: False
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
algorithm: "modified_first_fit_decreasing"
sequence_length_round: 64

# makes the training sequence length divisible by the tensor parallel size
# this is useful for sequence parallel training
make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
@@ -121,7 +127,7 @@ policy:
average_in_collective: true
data_parallel_sharding_strategy: "optim_grads_params"


data:
max_input_seq_length: ${policy.max_total_sequence_length}
dataset_name: "squad"
3 changes: 3 additions & 0 deletions examples/configs/sft_openmathinstruct2.yaml
@@ -37,6 +37,9 @@ policy:
context_parallel_size: 1
custom_parallel_plan: null

sequence_packing:
enabled: False

dynamic_batching:
enabled: false

2 changes: 2 additions & 0 deletions examples/run_sft.py
@@ -31,6 +31,8 @@
from nemo_rl.utils.config import load_config, parse_hydra_overrides
from nemo_rl.utils.logger import get_next_experiment_dir

OmegaConf.register_new_resolver("mul", lambda a, b: a * b)


def parse_args():
"""Parse command line arguments."""
95 changes: 95 additions & 0 deletions nemo_rl/algorithms/loss_functions.py
@@ -114,6 +114,7 @@ def __call__(
global_valid_toks: torch.Tensor,
vocab_parallel_rank: Optional[int] = None,
vocab_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
context_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
) -> tuple[torch.Tensor, dict]:
"""Clipped Policy Gradient RL loss function."""
token_mask = data["token_mask"][:, 1:]
@@ -149,7 +150,10 @@
vocab_end_index=(vocab_parallel_rank + 1) * next_token_logits.shape[-1],
tp_group=vocab_parallel_group,
inference_only=False,
cp_group=context_parallel_group,
)
# slice off to the correct length to remove potential CP padding
curr_logprobs = curr_logprobs[:, : data["input_ids"].shape[1] - 1]
elif isinstance(next_token_logits, torch.distributed.tensor.DTensor):
curr_logprobs = get_logprobs_from_vocab_parallel_logits(
next_token_logits, data["input_ids"], seq_index=seq_index
@@ -312,6 +316,7 @@ def __call__(
global_valid_toks: Tensor,
vocab_parallel_rank: Optional[int] = None,
vocab_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
context_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
dpo_loss: bool = False,
dpo_average_log_probs: bool = False,
) -> tuple[torch.Tensor, dict[str, Any]]:
@@ -335,7 +340,10 @@
vocab_end_index=(vocab_parallel_rank + 1) * next_token_logits.shape[-1],
tp_group=vocab_parallel_group,
inference_only=False,
cp_group=context_parallel_group,
)
# slice off to the correct length to remove potential CP padding
token_logprobs = token_logprobs[:, : data["input_ids"].shape[1] - 1]
elif isinstance(next_token_logits, torch.distributed.tensor.DTensor):
token_logprobs = get_logprobs_from_vocab_parallel_logits(
next_token_logits, data["input_ids"]
@@ -466,6 +474,7 @@ def _preference_loss(
global_valid_seqs: Tensor,
vocab_parallel_rank: Optional[int] = None,
vocab_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
context_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
) -> tuple[Tensor, Tensor, Tensor, Tensor]:
## TODO(@ashors): there's some duplicate code here with the NLLLoss function. We should refactor
token_mask = data["token_mask"][:, 1:]
@@ -483,7 +492,10 @@
vocab_end_index=(vocab_parallel_rank + 1) * next_token_logits.shape[-1],
tp_group=vocab_parallel_group,
inference_only=False,
cp_group=context_parallel_group,
)
# slice off to the correct length to remove potential CP padding
token_logprobs = token_logprobs[:, : data["input_ids"].shape[1] - 1]
elif isinstance(next_token_logits, torch.distributed.tensor.DTensor):
token_logprobs = get_logprobs_from_vocab_parallel_logits(
next_token_logits, data["input_ids"]
@@ -548,6 +560,7 @@ def __call__(
global_valid_toks: Tensor | None,
vocab_parallel_rank: Optional[int] = None,
vocab_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
context_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
) -> tuple[torch.Tensor, dict[str, Any]]:
sft_loss_chosen = torch.tensor(0.0)
if self.sft_loss_weight > 0:
@@ -561,6 +574,7 @@
global_valid_toks=global_valid_toks, ## unused because sft loss returned is at the sample level
vocab_parallel_rank=vocab_parallel_rank,
vocab_parallel_group=vocab_parallel_group,
context_parallel_group=context_parallel_group,
dpo_loss=True,
dpo_average_log_probs=self.sft_average_log_probs,
)
@@ -582,6 +596,7 @@
global_valid_seqs,
vocab_parallel_rank=vocab_parallel_rank,
vocab_parallel_group=vocab_parallel_group,
context_parallel_group=context_parallel_group,
)

dpo_loss = (
@@ -601,3 +616,83 @@
"rewards_rejected_mean": rewards_rejected_mean.item(),
"num_valid_samples": num_valid_samples.item(),
}


class SequencePackingLossWrapper:
def __init__(
self,
loss_fn: LossFunction,
cu_seqlens_q: Tensor,
cu_seqlens_q_padded: Optional[Tensor] = None,
):
self.loss_fn = loss_fn
self.cu_seqlens_q = cu_seqlens_q
self.cu_seqlens_q_padded = cu_seqlens_q_padded

def __call__(
self,
next_token_logits: Tensor,
data: BatchedDataDict[Any],
global_valid_seqs: Tensor | None,
global_valid_toks: Tensor | None,
vocab_parallel_rank: Optional[int] = None,
vocab_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
context_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
) -> tuple[Tensor, dict[str, Any]]:
"""Wraps a loss function to handle sequence packing by doing one sequence at a time to avoid excessive padding."""
unpadded_cu_seqlens = self.cu_seqlens_q
unpadded_seq_lengths = self.cu_seqlens_q[1:] - self.cu_seqlens_q[:-1]
if self.cu_seqlens_q_padded is not None:
padded_cu_seqlens = self.cu_seqlens_q_padded
padded_seq_lengths = (
self.cu_seqlens_q_padded[1:] - self.cu_seqlens_q_padded[:-1]
)
else:
padded_cu_seqlens = unpadded_cu_seqlens
padded_seq_lengths = unpadded_seq_lengths
seq_starts = padded_cu_seqlens[:-1]
seq_ends = padded_cu_seqlens[1:]

loss_accum = 0
metrics_accum = {}
for seq_idx in range(len(seq_starts)):
seq_start = seq_starts[seq_idx].item()
seq_end = seq_ends[seq_idx].item()

# get sequence and unpad all 'data' tensors. The data dict is a BatchedDataDict of unpacked tensors
seq_data = data.slice(seq_idx, seq_idx + 1)
unpadded_seq_data = {}
for k, v in seq_data.items():
if isinstance(v, torch.Tensor) and v.ndim > 1 and v.shape[1] > 1:
unpadded_seq_data[k] = v[:, : unpadded_seq_lengths[seq_idx]]
else:
unpadded_seq_data[k] = v

# get next_token_logits
cp_size = (
1
if context_parallel_group is None
else torch.distributed.get_world_size(context_parallel_group)
)
logit_slice_idxs = slice(
seq_start // cp_size,
(seq_start + padded_seq_lengths[seq_idx]) // cp_size,
)
next_token_logits_slice = next_token_logits[:, logit_slice_idxs, :]

loss, metrics = self.loss_fn(
next_token_logits_slice,
unpadded_seq_data,
global_valid_seqs,
global_valid_toks,
vocab_parallel_rank=vocab_parallel_rank,
vocab_parallel_group=vocab_parallel_group,
context_parallel_group=context_parallel_group,
)
loss_accum += loss
for k, v in metrics.items():
if k not in metrics_accum:
metrics_accum[k] = 0
metrics_accum[k] += v

return loss_accum, metrics_accum
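The `cu_seqlens_q` bookkeeping that `SequencePackingLossWrapper` iterates over follows the flash-attention varlen convention: cumulative sequence boundaries into a packed buffer. A minimal sketch of that convention in plain Python (names illustrative):

```python
# Sketch of cu_seqlens-style unpacking: cumulative boundaries let a packed
# buffer be sliced back into its component sequences, which is essentially
# what the wrapper above does per sequence before calling the inner loss_fn.
def unpack(packed, cu_seqlens):
    return [packed[s:e] for s, e in zip(cu_seqlens[:-1], cu_seqlens[1:])]

# three sequences of lengths 3, 2, and 4 packed back-to-back
packed = [1, 2, 3, 4, 5, 6, 7, 8, 9]
cu_seqlens = [0, 3, 5, 9]
print(unpack(packed, cu_seqlens))  # [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
```

The wrapper additionally handles a padded variant (`cu_seqlens_q_padded`) and divides the logit slice boundaries by the context-parallel world size, since each CP rank holds only its shard of the packed sequence dimension.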