diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index e2c3f09f3e5..dbb632bd24b 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -834,6 +834,7 @@ class ProfileReq: activities: Optional[List[str]] = None with_stack: Optional[bool] = None record_shapes: Optional[bool] = None + profile_id: Optional[str] = None @dataclass diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 5fb0a749a63..218b6743c39 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -391,6 +391,7 @@ def __init__( self.torch_profiler = None self.torch_profiler_output_dir: Optional[str] = None self.profiler_activities: Optional[List[str]] = None + self.profiler_id: Optional[str] = None self.profiler_target_forward_ct: Optional[int] = None # Init metrics stats @@ -1805,6 +1806,7 @@ def profile(self, recv_req: ProfileReq): recv_req.activities, recv_req.with_stack, recv_req.record_shapes, + recv_req.profile_id, ) else: return self.stop_profile() @@ -1816,6 +1818,7 @@ def start_profile( activities: Optional[List[str]], with_stack: Optional[bool], record_shapes: Optional[bool], + profile_id: Optional[str], ) -> None: if self.profiler_activities: return ProfileReqOutput( @@ -1830,9 +1833,11 @@ def start_profile( self.torch_profiler_output_dir = output_dir self.profiler_activities = activities + self.profiler_id = profile_id logger.info( - "Profiling starts. Traces will be saved to: %s", + "Profiling starts. Traces will be saved to: %s (with id %s)", self.torch_profiler_output_dir, + self.profiler_id, ) activity_map = { @@ -1874,14 +1879,14 @@ def stop_profile(self) -> None: self.torch_profiler.export_chrome_trace( os.path.join( self.torch_profiler_output_dir, - str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz", + self.profiler_id + f"-TP-{self.tp_rank}" + ".trace.json.gz", ) ) if "MEM" in self.profiler_activities: memory_profile_path = os.path.join( self.torch_profiler_output_dir, - str(time.time()) + f"-TP-{self.tp_rank}-memory" + ".pickle", + self.profiler_id + f"-TP-{self.tp_rank}-memory" + ".pickle", ) torch.cuda.memory._dump_snapshot(memory_profile_path) torch.cuda.memory._record_memory_history(enabled=None) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 1acd97f5b1c..a391dd719e1 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -650,6 +650,7 @@ async def start_profile( output_dir=output_dir, num_steps=num_steps, activities=activities, + profile_id=str(time.time()), ) result = (await self.start_profile_communicator(req))[0] if not result.success: