NVIDIA-NeMo · zpqiu · Apr 15, 2025
@@ -277,9 +277,10 @@ def refit_policy_generation(
     """Refit the policy generation interface with the latest policy weights."""
     policy.offload_before_refit()
     ipc_handles = policy.get_weights_ipc_handles()
-    policy_generation.prepare_for_generation()
+    policy_generation.prepare_for_generation(tags=["weights"])
     policy_generation.update_weights(ipc_handles)
     policy.offload_after_refit()
+    policy_generation.prepare_for_generation(tags=["kv_cache"])
 
 
 def generate_responses(

@@ -410,8 +410,12 @@ def sleep(self):
         gc.collect()
         torch.cuda.empty_cache()
 
-    def wake_up(self):
-        self.llm.wake_up()
+    def wake_up(self, **kwargs):
+        # Forward the optional "tags" kwarg (e.g. ["weights", "kv_cache"]) to self.llm.wake_up.
+        if "tags" in kwargs:
+            self.llm.wake_up(tags=kwargs["tags"])
+        else:
+            self.llm.wake_up()
 
 
 class VllmGeneration(GenerationInterface):
@@ -580,7 +584,7 @@ def prepare_for_generation(self, *args, **kwargs):
         try:
             # Use run_all_workers_single_data for methods that don't need data
             futures = self.worker_group.run_all_workers_single_data(
-                "wake_up", respect_tied_workers=True
+                "wake_up", respect_tied_workers=True, **kwargs
             )
             # Wait for all futures to complete
             results = ray.get(futures)