1 change: 1 addition & 0 deletions examples/configs/grpo_math_1B.yaml
@@ -43,6 +43,7 @@ loss_fn:
   # Async GRPO requires importance sampling correction enabled
   # Set to true when async_grpo.enabled is true
   use_importance_sampling_correction: false
+  truncated_importance_sampling_ratio: null
   sequence_level_importance_ratios: false
   token_level_loss: true

1 change: 1 addition & 0 deletions examples/configs/vlm_grpo_3B.yaml
@@ -39,6 +39,7 @@ loss_fn:
   # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
   use_on_policy_kl_approximation: false
   use_importance_sampling_correction: false
+  truncated_importance_sampling_ratio: null
   token_level_loss: true
 
 checkpointing:
1 change: 1 addition & 0 deletions examples/configs/vlm_grpo_3B_megatron.yaml
@@ -35,6 +35,7 @@ loss_fn:
   ratio_clip_c: null
   use_on_policy_kl_approximation: false
   use_importance_sampling_correction: false
+  truncated_importance_sampling_ratio: null
   token_level_loss: true
 checkpointing:
   enabled: true
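All three example configs ship with the feature off (`null`). A minimal sketch of what turning TIS on might look like, written as the Python loss-config dict used in the tests at the bottom of this diff; the cap value `2.0` is purely illustrative, not a recommended setting:

```python
# Hypothetical overrides enabling TIS (the shipped defaults leave it off).
# Both keys must be set together: the loss function asserts that the
# importance-sampling correction is on whenever a cap is given.
loss_fn_overrides = {
    "use_importance_sampling_correction": True,  # required by the new assert
    "truncated_importance_sampling_ratio": 2.0,  # clamp importance weights at 2.0
}
```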
17 changes: 17 additions & 0 deletions nemo_rl/algorithms/loss_functions.py
@@ -42,6 +42,7 @@ class ClippedPGLossConfig(TypedDict):
     ratio_clip_c: float
     use_on_policy_kl_approximation: bool
     use_importance_sampling_correction: bool
+    truncated_importance_sampling_ratio: float | None
     token_level_loss: bool
     # If True, apply the off-policy importance-sampling correction at the
     # sequence level (one weight per generated sample), as in GSPO.
@@ -113,6 +114,9 @@ def __init__(self, cfg: ClippedPGLossConfig):
         self.use_importance_sampling_correction = cfg[
             "use_importance_sampling_correction"
         ]
+        self.truncated_importance_sampling_ratio = cfg[
+            "truncated_importance_sampling_ratio"
+        ]
         # Whether to compute importance weights per-sequence instead of per-token.
         self.sequence_level_importance_ratios = cfg.get(
             "sequence_level_importance_ratios",
@@ -125,6 +129,13 @@ def __init__(self, cfg: ClippedPGLossConfig):
             assert self.loss_type == LossType.SEQUENCE_LEVEL, (
                 "sequence-level importance sampling (e.g. GSPO) is mutually exclusive with token-level loss"
             )
+        if self.truncated_importance_sampling_ratio is not None:
+            assert self.use_importance_sampling_correction, (
+                "truncated_importance_sampling_ratio is only supported when use_importance_sampling_correction is True"
+            )
+            assert self.truncated_importance_sampling_ratio > 0, (
+                "truncated_importance_sampling_ratio should be positive"
+            )
 
     def __call__(
         self,
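Worth noting (an inference from the hunks here, not a statement in the PR description): the cap is applied to the actor importance weights, which only multiply the loss when `use_importance_sampling_correction` is true, so a cap configured without the correction would be a silent no-op; the first assert turns that misconfiguration into a hard failure instead.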
@@ -280,6 +291,12 @@ def __call__(
             actor_importance_weights_expanded = torch.nan_to_num(
                 actor_importance_weights_expanded, nan=0.0, posinf=0.0, neginf=0.0
             )
+            # TIS see https://fengyao.notion.site/off-policy-rl
+            if self.truncated_importance_sampling_ratio is not None:
+                actor_importance_weights_expanded = torch.clamp(
+                    actor_importance_weights_expanded,
+                    max=self.truncated_importance_sampling_ratio,
+                )
             actor_importance_weights = actor_importance_weights_expanded
             del actor_importance_weights_expanded
             if self.use_importance_sampling_correction:
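For readers unfamiliar with truncated importance sampling, here is a minimal, self-contained sketch of what the clamp above implements, following the linked write-up (https://fengyao.notion.site/off-policy-rl); the function and variable names are illustrative, not the NeMo RL API:

```python
import torch

def tis_weights(
    train_logprobs: torch.Tensor,    # per-token log-probs under the training policy
    sampler_logprobs: torch.Tensor,  # per-token log-probs under the inference/sampling policy
    cap: float | None,               # truncated_importance_sampling_ratio
) -> torch.Tensor:
    """Per-token importance weights w = pi_train / pi_sampler, clamped at `cap`."""
    weights = torch.exp(train_logprobs - sampler_logprobs)
    # Guard against inf/nan from extreme log-prob gaps, as the diff above does.
    weights = torch.nan_to_num(weights, nan=0.0, posinf=0.0, neginf=0.0)
    if cap is not None:
        weights = torch.clamp(weights, max=cap)
    return weights

# Example: tokens the sampler under-represented get up-weighted, but only
# up to the cap, bounding the variance of the corrected gradient.
train_lp = torch.tensor([-1.0, -2.0, -0.5])
sampler_lp = torch.tensor([-1.2, -1.0, -3.0])
print(tis_weights(train_lp, sampler_lp, cap=2.0))  # tensor([1.2214, 0.3679, 2.0000])
```

Clamping only the maximum (there is no lower bound) caps the variance contributed by tokens the sampling policy under-represented, while leaving the down-weighting of over-represented tokens untouched.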
2 changes: 2 additions & 0 deletions tests/unit/algorithms/test_grpo.py
@@ -889,6 +889,8 @@ def val_iter(self):
                 "ratio_clip_c": 1.0,
                 "use_on_policy_kl_approximation": False,
                 "use_importance_sampling_correction": False,
+                "truncated_importance_sampling_ratio": None,
+                "sequence_level_importance_ratios": False,
                 "token_level_loss": True,
             }
         )