verl-project · vermouth1992 · Aug 25, 2025 · Aug 23, 2025 · Aug 23, 2025
@@ -48,6 +48,13 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.rollout.enable_chunked_prefill=False \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+    actor_rollout_ref.ref.profiler.enable=True \
+    actor_rollout_ref.ref.profiler.ranks=$PROFILE_RANKS \
+    actor_rollout_ref.ref.profiler.all_ranks=$PROFILE_RANKS_ALL \
+    actor_rollout_ref.ref.profiler.tool_config.npu.discrete=$DISCRETE \
+    actor_rollout_ref.ref.profiler.tool_config.npu.contents=$CONTENTS \
+    actor_rollout_ref.ref.profiler.tool_config.npu.level=$LEVEL \
+    actor_rollout_ref.ref.profiler.tool_config.npu.analysis=$ANALYSIS \
     algorithm.use_kl_in_reward=False \
     trainer.critic_warmup=0 \
     trainer.logger=console \

@@ -46,6 +46,12 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.rollout.enable_chunked_prefill=False \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+    actor_rollout_ref.ref.profiler.enable=True \
+    actor_rollout_ref.ref.profiler.all_ranks=$PROFILE_RANKS_ALL \
+    actor_rollout_ref.ref.profiler.tool_config.npu.discrete=$DISCRETE \
+    actor_rollout_ref.ref.profiler.tool_config.npu.contents=$CONTENTS \
+    actor_rollout_ref.ref.profiler.tool_config.npu.level=$LEVEL \
+    actor_rollout_ref.ref.profiler.tool_config.npu.analysis=$ANALYSIS \
     algorithm.use_kl_in_reward=False \
     trainer.critic_warmup=0 \
     trainer.logger=console \

diff --git a/tests/utils/test_nvtx_profile.py b/tests/utils/test_nvtx_profile.py
@@ -120,8 +120,9 @@ def test_annotate_decorator(self):
         mock_self = MagicMock()
         mock_self.profiler = self.profiler
         mock_self.profiler.this_step = True
+        decorator = mock_self.profiler.annotate(message="test")
 
-        @NsightSystemsProfiler.annotate(message="test")
+        @decorator
         def test_func(self, *args, **kwargs):
             return "result"
 

diff --git a/tests/utils/test_special_mstx_profile.py b/tests/utils/test_special_mstx_profile.py
@@ -149,8 +149,9 @@ def test_annotate_decorator_applied_correctly(self):
             mock_start_patch.return_value = mock_mark_range
 
             with patch("verl.utils.profiler.mstx_profile.get_npu_profiler") as mock_get_profiler:
+                decorator = mock_worker.profiler.annotate(message="test")
 
-                @NPUProfiler.annotate(message="test")
+                @decorator
                 def test_func(self, *args, **kwargs):
                     return "result"
 
@@ -171,8 +172,9 @@ def test_annotate_when_profiler_disabled(self):
             patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch,
             patch("verl.utils.profiler.mstx_profile.get_npu_profiler") as mock_get_profiler,
         ):
+            decorator = mock_worker.profiler.annotate(message="test")
 
-            @NPUProfiler.annotate(message="test")
+            @decorator
             def test_func(self, *args, **kwargs):
                 return "result"
 
@@ -193,8 +195,9 @@ def test_annotate_when_this_step_disabled(self):
             patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch,
             patch("verl.utils.profiler.mstx_profile.get_npu_profiler") as mock_get_profiler,
         ):
+            decorator = mock_worker.profiler.annotate(message="test")
 
-            @NPUProfiler.annotate(message="test")
+            @decorator
             def test_func(self, *args, **kwargs):
                 return "result"
 
@@ -221,8 +224,9 @@ def test_annotate_discrete_mode_enabled(self):
         ):
             mock_start_patch.return_value = mock_mark_range
             mock_get_profiler.return_value = mock_profile_npu
+            decorator = mock_worker.profiler.annotate(message="test", role="test_role")
 
-            @NPUProfiler.annotate(message="test", role="test_role")
+            @decorator
             def test_func(self, *args, **kwargs):
                 return "result"
 
@@ -253,8 +257,9 @@ def test_annotate_with_default_message(self):
             patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch,
         ):
             mock_start_patch.return_value = mock_mark_range
+            decorator = mock_worker.profiler.annotate()
 
-            @NPUProfiler.annotate()
+            @decorator
             def test_func(self, *args, **kwargs):
                 return "result"
 

diff --git a/verl/utils/profiler/mstx_profile.py b/verl/utils/profiler/mstx_profile.py
@@ -214,8 +214,7 @@ def stop(self):
                 self.profile_npu.stop()
                 NPUProfiler._define_count -= 1
 
-    @staticmethod
-    def annotate(message: Optional[str] = None, role: Optional[str] = None, **kwargs) -> Callable:
+    def annotate(self, message: Optional[str] = None, role: Optional[str] = None, **kwargs_outer) -> Callable:
         """Decorate a Worker member function to profile the current rank in the current training step.
 
         Requires the target function to be a member function of a Worker,
@@ -230,32 +229,32 @@ def annotate(message: Optional[str] = None, role: Optional[str] = None, **kwargs
 
         def decorator(func):
             @functools.wraps(func)
-            def wrapper(self, *args, **kwargs):
-                if not self.profiler.enable:
-                    return func(self, *args, **kwargs)
+            def wrapper(*args, **kwargs_inner):
+                if not self.enable:
+                    return func(*args, **kwargs_inner)
 
                 profile_name = message or func.__name__
-                discrete_mode = self.profiler.discrete
-                profile_enable = self.profiler.this_step and self.profiler.enable
+                discrete_mode = self.discrete
+                profile_enable = self.this_step and self.enable
 
                 if not profile_enable:
-                    return func(self, *args, **kwargs)
+                    return func(*args, **kwargs_inner)
 
                 if profile_enable:
                     if not discrete_mode:
                         mark_range = mark_start_range(message=profile_name)
                     else:
                         profile_npu = get_npu_profiler(
-                            contents=self.profiler.profile_contents,
-                            profile_level=self.profiler.profile_level,
-                            profile_save_path=self.profiler.profile_save_path,
-                            analysis=self.profiler.analysis,
+                            contents=self.profile_contents,
+                            profile_level=self.profile_level,
+                            profile_save_path=self.profile_save_path,
+                            analysis=self.analysis,
                             role=role,
                         )
                         profile_npu.start()
                         mark_range = mark_start_range(message=profile_name)
 
-                result = func(self, *args, **kwargs)
+                result = func(*args, **kwargs_inner)
 
                 if profile_enable:
                     if not discrete_mode:

diff --git a/verl/utils/profiler/nvtx_profile.py b/verl/utils/profiler/nvtx_profile.py
@@ -149,13 +149,13 @@ def stop(self):
             if not self.discrete:
                 torch.cuda.profiler.stop()
 
-    @staticmethod
     def annotate(
+        self,
         message: Optional[str] = None,
         color: Optional[str] = None,
         domain: Optional[str] = None,
         category: Optional[str] = None,
-        **kwargs,
+        **kwargs_outer,
     ) -> Callable:
         """Decorate a Worker member function to profile the current rank in the current training step.
 
@@ -175,22 +175,22 @@ def annotate(
 
         def decorator(func):
             @functools.wraps(func)
-            def wrapper(self, *args, **kwargs):
-                if not self.profiler.enable:
-                    return func(self, *args, **kwargs)
+            def wrapper(*args, **kwargs_inner):
+                if not self.enable:
+                    return func(*args, **kwargs_inner)
 
                 profile_name = message or func.__name__
 
-                if self.profiler.this_step:
-                    if self.profiler.discrete:
+                if self.this_step:
+                    if self.discrete:
                         torch.cuda.profiler.start()
                     mark_range = mark_start_range(message=profile_name, color=color, domain=domain, category=category)
 
-                result = func(self, *args, **kwargs)
+                result = func(*args, **kwargs_inner)
 
-                if self.profiler.this_step:
+                if self.this_step:
                     mark_end_range(mark_range)
-                    if self.profiler.discrete:
+                    if self.discrete:
                         torch.cuda.profiler.stop()
 
                 return result

diff --git a/verl/utils/profiler/profile.py b/verl/utils/profiler/profile.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import functools
 import os
 from typing import Callable, Optional
 
@@ -226,16 +227,52 @@ def start(self, **kwargs):
     def stop(self):
         return getattr(self._impl, "stop", lambda: None)()
 
-    @staticmethod
+    @classmethod
     def annotate(
+        cls,
         message: Optional[str] = None,
         color: Optional[str] = None,
         domain: Optional[str] = None,
         category: Optional[str] = None,
-        **kwargs,
+        **kwargs_outer,
     ) -> Callable:
+        """Decorate a Worker method so the active profiler backend annotates each call.
+
+        The backend is resolved lazily, at call time, from ``self_instance.profiler._impl``;
+        when no backend with an ``annotate`` method is found, the method runs unprofiled.
+
+        Args:
+            message: Range label forwarded to the backend's ``annotate``.
+            color: Forwarded to the backend's ``annotate``.
+            domain: Forwarded to the backend's ``annotate``.
+            category: Forwarded to the backend's ``annotate``.
+            **kwargs_outer: Extra keyword arguments forwarded to the backend's ``annotate``.
+
+        Returns:
+            A decorator that wraps the target method.
+        """
         def decorator(func):
-            return func
+            @functools.wraps(func)
+            def wrapper(self_instance, *args, **kwargs_inner):
+                profiler = getattr(self_instance, "profiler", None)
+                impl_annotate = getattr(getattr(profiler, "_impl", None), "annotate", None)
+                if impl_annotate is None:
+                    return func(self_instance, *args, **kwargs_inner)
+
+                # Guard only the construction of the profiled callable. The call to
+                # func stays outside the try so an exception raised by func is never
+                # swallowed and func is never executed a second time.
+                try:
+                    profiled = impl_annotate(
+                        message=message, color=color, domain=domain, category=category, **kwargs_outer
+                    )(func)
+                except Exception:
+                    # Best effort: a backend that cannot build the decorator must not break the call.
+                    profiled = func
+
+                return profiled(self_instance, *args, **kwargs_inner)
+
+            return wrapper
 
         return decorator