1 change: 1 addition & 0 deletions examples/configs/grpo_math_1B.yaml
@@ -43,6 +43,7 @@ loss_fn:
   # Async GRPO requires importance sampling correction enabled
   # Set to true when async_grpo.enabled is true
   use_importance_sampling_correction: false
+  truncated_importance_sampling_ratio: null
   sequence_level_importance_ratios: false
   token_level_loss: true

1 change: 1 addition & 0 deletions examples/configs/vlm_grpo_3B.yaml
@@ -39,6 +39,7 @@ loss_fn:
   # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
   use_on_policy_kl_approximation: false
   use_importance_sampling_correction: false
+  truncated_importance_sampling_ratio: null
   token_level_loss: true
 
 checkpointing:
1 change: 1 addition & 0 deletions examples/configs/vlm_grpo_3B_megatron.yaml
@@ -35,6 +35,7 @@ loss_fn:
   ratio_clip_c: null
   use_on_policy_kl_approximation: false
   use_importance_sampling_correction: false
+  truncated_importance_sampling_ratio: null
   token_level_loss: true
 checkpointing:
   enabled: true
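All three example configs ship with the feature off (`null`). A minimal sketch of what turning TIS on might look like, written as the Python loss-config dict used in the tests at the bottom of this diff; the cap value `2.0` is purely illustrative, not a recommended setting:

```python
# Hypothetical overrides enabling TIS (the shipped defaults leave it off).
# Both keys must be set together: the loss function asserts that the
# importance-sampling correction is on whenever a cap is given.
loss_fn_overrides = {
    "use_importance_sampling_correction": True,  # required by the new assert
    "truncated_importance_sampling_ratio": 2.0,  # clamp importance weights at 2.0
}
```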
17 changes: 17 additions & 0 deletions nemo_rl/algorithms/loss_functions.py
@@ -42,6 +42,7 @@ class ClippedPGLossConfig(TypedDict):
     ratio_clip_c: float
     use_on_policy_kl_approximation: bool
     use_importance_sampling_correction: bool
+    truncated_importance_sampling_ratio: float | None
     token_level_loss: bool
     # If True, apply the off-policy importance-sampling correction at the
     # sequence level (one weight per generated sample), as in GSPO.
@@ -113,6 +114,9 @@ def __init__(self, cfg: ClippedPGLossConfig):
         self.use_importance_sampling_correction = cfg[
             "use_importance_sampling_correction"
         ]
+        self.truncated_importance_sampling_ratio = cfg[
+            "truncated_importance_sampling_ratio"
+        ]
         # Whether to compute importance weights per-sequence instead of per-token.
         self.sequence_level_importance_ratios = cfg.get(
             "sequence_level_importance_ratios",
@@ -125,6 +129,13 @@ def __init__(self, cfg: ClippedPGLossConfig):
             assert self.loss_type == LossType.SEQUENCE_LEVEL, (
                 "sequence-level importance sampling (e.g. GSPO) is mutually exclusive with token-level loss"
             )
+        if self.truncated_importance_sampling_ratio is not None:
+            assert self.use_importance_sampling_correction, (
+                "truncated_importance_sampling_ratio is only supported when use_importance_sampling_correction is True"
+            )
+            assert self.truncated_importance_sampling_ratio > 0, (
+                "truncated_importance_sampling_ratio should be positive"
+            )
 
     def __call__(
         self,
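Worth noting (an inference from the hunks here, not a statement in the PR description): the cap is applied to the actor importance weights, which only multiply the loss when `use_importance_sampling_correction` is true, so a cap configured without the correction would be a silent no-op; the first assert turns that misconfiguration into a hard failure instead.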
@@ -280,6 +291,12 @@ def __call__(
             actor_importance_weights_expanded = torch.nan_to_num(
                 actor_importance_weights_expanded, nan=0.0, posinf=0.0, neginf=0.0
             )
+            # TIS see https://fengyao.notion.site/off-policy-rl
+            if self.truncated_importance_sampling_ratio is not None:
+                actor_importance_weights_expanded = torch.clamp(
+                    actor_importance_weights_expanded,
+                    max=self.truncated_importance_sampling_ratio,
+                )
             actor_importance_weights = actor_importance_weights_expanded
             del actor_importance_weights_expanded
             if self.use_importance_sampling_correction:
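For readers unfamiliar with truncated importance sampling, here is a minimal, self-contained sketch of what the clamp above implements, following the linked write-up (https://fengyao.notion.site/off-policy-rl); the function and variable names are illustrative, not the NeMo RL API:

```python
import torch

def tis_weights(
    train_logprobs: torch.Tensor,    # per-token log-probs under the training policy
    sampler_logprobs: torch.Tensor,  # per-token log-probs under the inference/sampling policy
    cap: float | None,               # truncated_importance_sampling_ratio
) -> torch.Tensor:
    """Per-token importance weights w = pi_train / pi_sampler, clamped at `cap`."""
    weights = torch.exp(train_logprobs - sampler_logprobs)
    # Guard against inf/nan from extreme log-prob gaps, as the diff above does.
    weights = torch.nan_to_num(weights, nan=0.0, posinf=0.0, neginf=0.0)
    if cap is not None:
        weights = torch.clamp(weights, max=cap)
    return weights

# Example: tokens the sampler under-represented get up-weighted, but only
# up to the cap, bounding the variance of the corrected gradient.
train_lp = torch.tensor([-1.0, -2.0, -0.5])
sampler_lp = torch.tensor([-1.2, -1.0, -3.0])
print(tis_weights(train_lp, sampler_lp, cap=2.0))  # tensor([1.2214, 0.3679, 2.0000])
```

Clamping only the maximum (there is no lower bound) caps the variance contributed by tokens the sampling policy under-represented, while leaving the down-weighting of over-represented tokens untouched.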
2 changes: 2 additions & 0 deletions tests/unit/algorithms/test_grpo.py
@@ -889,6 +889,8 @@ def val_iter(self):
                 "ratio_clip_c": 1.0,
                 "use_on_policy_kl_approximation": False,
                 "use_importance_sampling_correction": False,
+                "truncated_importance_sampling_ratio": None,
+                "sequence_level_importance_ratios": False,
                 "token_level_loss": True,
             }
         )