Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dist/
# Cache
uv_cache/
hf_home/
hf_datasets_cache/
*logs/
datasets/
docker/
Expand Down
Empty file.
28 changes: 28 additions & 0 deletions docs/design_docs/logger.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,31 @@ When enabled, the pretty logging will generate formatted text similar to:

![Validation Pretty Logging Example](../assets/val-log.png)

## GPU Metric Logging

Reinforcer monitors GPU memory and utilization through [system metrics](https://docs.ray.io/en/latest/ray-observability/reference/system-metrics.html#system-metrics) exposed by Ray nodes. While Ray makes these metrics available for tools like Prometheus, Reinforcer directly polls GPU memory and utilization data and logs them to TensorBoard and/or Weights & Biases.

This approach allows us to offer the same GPU metric tracking on all loggers (not just wandb) and simplifies the implementation greatly.

This feature is enabled with the `monitor_gpus` configuration parameter, and the frequency of collection and flushing to the loggers is controlled by `gpu_monitoring.collection_interval` and `gpu_monitoring.flush_interval` (both in seconds), respectively:

```python
logger:
wandb_enabled: false
tensorboard_enabled: false
monitor_gpus: true
gpu_monitoring:
collection_interval: 10
flush_interval: 10
```

:::{note}
While monitoring through the remote workers is possible, it requires delicate implementation to ensure that:
* sending logs back to driver does not incur a large overhead
* metrics remain easily interpretable, since colocated workers could otherwise lead to double counting
* workers gracefully flush their logs in the event of failure
* the logging is the same for tensorboard and wandb
* some workers which spawn other workers correctly report the total usage of the grandchild worker

These reasons lead us to the simple implementation of collecting on the driver.
:::
4 changes: 4 additions & 0 deletions examples/configs/grpo_math_1B.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,14 @@ logger:
num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
wandb_enabled: false
tensorboard_enabled: false
monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard
wandb:
project: "grpo-dev"
name: "grpo-dev-logger"
tensorboard: {}
gpu_monitoring:
collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)

cluster:
gpus_per_node: 1
Expand Down
6 changes: 5 additions & 1 deletion examples/configs/sft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,17 @@ data:

logger:
log_dir: "logs" # Base directory for all logs
wandb_enabled: true
wandb_enabled: false
tensorboard_enabled: false
monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard
wandb:
project: "sft-dev"
name: "sft-dev-logger"
tensorboard:
log_dir: "tb_logs"
gpu_monitoring:
collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)

cluster:
gpus_per_node: 8
Expand Down
5 changes: 4 additions & 1 deletion examples/run_grpo_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,10 @@ def setup_data(data_config: DataConfig, policy_config: PolicyConfig, env_configs
task_data_processors["math"] = (math_task_spec, openinstructmath2_data_processor)

math_env = MathEnvironment.options(
runtime_env={"py_executable": MathEnvironment.DEFAULT_PY_EXECUTABLE}
runtime_env={
"py_executable": MathEnvironment.DEFAULT_PY_EXECUTABLE,
"env_vars": dict(os.environ), # Pass thru all user environment variables
}
).remote(env_configs["math"])
dataset = AllTaskProcessedDataset(
data.formatted_ds["train"],
Expand Down
8 changes: 6 additions & 2 deletions nemo_reinforcer/algorithms/grpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,12 @@ def setup(
logger_config = master_config["logger"]
cluster_config = master_config["cluster"]

# ==========================
# Logger
# ==========================
logger = Logger(logger_config)
logger.log_hyperparams(master_config)

# ==========================
# Checkpointing
# ==========================
Expand Down Expand Up @@ -238,8 +244,6 @@ def setup(
)

loss_fn = ClippedPGLossFn(loss_config)
logger = Logger(logger_config)
logger.log_hyperparams(master_config)

print("\n" + "=" * 60)
print(" " * 18 + "SETUP COMPLETE")
Expand Down
6 changes: 5 additions & 1 deletion nemo_reinforcer/algorithms/loss_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,4 +166,8 @@ def __call__(
num_unmasked_tokens = torch.tensor(1)
loss = -torch.sum(token_logprobs * mask) / num_unmasked_tokens

return loss, {"loss": loss.item(), "num_unmasked_tokens": num_unmasked_tokens.item(), "total_tokens": mask.numel()}
return loss, {
"loss": loss.item(),
"num_unmasked_tokens": num_unmasked_tokens.item(),
"total_tokens": mask.numel(),
}
10 changes: 7 additions & 3 deletions nemo_reinforcer/algorithms/sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class SFTConfig(TypedDict):
val_at_start: bool
seed: int


class MasterConfig(TypedDict):
policy: PolicyConfig
data: DataConfig
Expand Down Expand Up @@ -102,6 +103,12 @@ def setup(
cluster_config = master_config["cluster"]
sft_config = master_config["sft"]

# ==========================
# Logger
# ==========================
logger = Logger(logger_config)
logger.log_hyperparams(master_config)

# ==========================
# Checkpointing
# ==========================
Expand Down Expand Up @@ -179,9 +186,6 @@ def setup(
loss_fn = NLLLoss()
print(f" ✓ Model initialized")

logger = Logger(logger_config)
logger.log_hyperparams(master_config)

print("\n" + "=" * 60)
print(" " * 18 + "SETUP COMPLETE")
print("=" * 60 + "\n")
Expand Down
1 change: 1 addition & 0 deletions nemo_reinforcer/algorithms/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def masked_mean(values, mask, dim=None):
return values[mask.bool()].mean()
return as_masked_tensor(values, mask.bool()).mean(dim=dim).to_tensor(torch.nan)


def set_seed(seed: int):
"""Sets the seed for python, numpy, and pytorch."""
random.seed(seed)
Expand Down
13 changes: 10 additions & 3 deletions nemo_reinforcer/models/policy/hf_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ def __repr__(self):

This makes it easier to identify which worker is producing specific log messages.
"""
return f"{self.__class__.__name__}[rank={torch.distributed.get_rank()}]"
if torch.distributed.is_initialized():
return f"{self.__class__.__name__}[rank={torch.distributed.get_rank()}]"
else:
return f"{self.__class__.__name__}"

def __init__(
self,
Expand Down Expand Up @@ -123,8 +126,7 @@ def do_fsdp(model):
if init_optimizer:
optimizer_cls = import_class_from_path(self.cfg["optimizer"]["name"])
self.optimizer = optimizer_cls(
self.model.parameters(),
**self.cfg["optimizer"]["kwargs"]
self.model.parameters(), **self.cfg["optimizer"]["kwargs"]
)
else:
self.optimizer = None
Expand Down Expand Up @@ -830,6 +832,11 @@ def load_checkpoint(self, weights_path: str, optimizer_path: Optional[str] = Non
else:
print("WARNING: No scheduler checkpoint provided")

def shutdown(self):
"""Shutdown the policy."""
#
pass


class HfPolicy(PolicyInterface, GenerationInterface):
def __init__(
Expand Down
Loading
Loading