@@ -91,7 +91,7 @@ def __init__(self, max_num_seqs, config, tensor_parallel_size, splitwise_role, l
9191 self .finish_execution_pool = ThreadPoolExecutor (max_workers = 1 )
9292 self .lock = threading .Lock ()
9393 self .to_be_rescheduled_request_id_set = set ()
94- main_process_metrics .max_batch_size . set ( max_num_seqs )
94+ main_process_metrics .set_value ( "max_batch_size" , max_num_seqs )
9595
9696 self .using_extend_tables_req_id = set ()
9797
@@ -144,13 +144,12 @@ def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_re
144144 if preempted_req .request_id in self .req_dict :
145145 del self .req_dict [preempted_req .request_id ]
146146 self ._free_blocks (preempted_req )
147- main_process_metrics . num_requests_running . dec ( 1 )
147+ llm_logger . info ( f"Preemption is triggered! Preempted request id: { preempted_req . request_id } " )
148148 else :
149149 self ._free_blocks (preempted_req )
150150 preempted_req .cached_block_num = 0
151151 self .to_be_rescheduled_request_id_set .add (preempted_req .request_id )
152- main_process_metrics .num_requests_waiting .inc (1 )
153- main_process_metrics .num_requests_running .dec (1 )
152+ llm_logger .info (f"Preemption is triggered! Preempted request id: { preempted_req .request_id } " )
154153 preempted_reqs .append (preempted_req )
155154 scheduled_reqs .append (self ._prepare_preempt_task (preempted_req ))
156155 if preempted_req == request :
@@ -414,8 +413,6 @@ def schedule(self):
414413 request , self .config .cache_config .block_size , request .num_computed_tokens
415414 )
416415 request .status = RequestStatus .RUNNING
417- main_process_metrics .num_requests_waiting .dec (1 )
418- main_process_metrics .num_requests_running .inc (1 )
419416 if self .config .scheduler_config .splitwise_role == "mixed" :
420417 allocated_position = self .get_available_position ()
421418 request .idx = allocated_position
@@ -460,8 +457,6 @@ def schedule(self):
460457 request , self .config .cache_config .block_size , request .num_computed_tokens
461458 )
462459 request .status = RequestStatus .RUNNING
463- main_process_metrics .num_requests_waiting .dec (1 )
464- main_process_metrics .num_requests_running .inc (1 )
465460 else :
466461 if self .config .cache_config .enable_prefix_caching :
467462 self ._free_blocks (request )
@@ -520,11 +515,17 @@ def schedule(self):
520515 continue
521516
522517 if scheduled_reqs :
523- task_used_block_num = sum ([len (task .block_tables ) if task else 0 for task in self .tasks_list ])
524- main_process_metrics .available_gpu_block_num .set (self .total_block_number () - task_used_block_num )
525- main_process_metrics .batch_size .set (self .max_num_seqs - self .available_batch ())
526- main_process_metrics .gpu_cache_usage_perc .set (self .get_gpu_cache_usage_perc ())
527518 llm_logger .debug (f"schedued_reqs: { scheduled_reqs } " )
519+
520+ # Update metrics
521+ num_tasks = sum ([1 if task else 0 for task in self .tasks_list ])
522+ num_blocks_used_by_tasks = sum ([len (task .block_tables ) if task else 0 for task in self .tasks_list ])
523+ main_process_metrics .set_value ("available_gpu_block_num" , self .total_block_number () - num_blocks_used_by_tasks )
524+ main_process_metrics .set_value ("batch_size" , self .max_num_seqs - self .available_batch ())
525+ main_process_metrics .set_value ("gpu_cache_usage_perc" , self .get_gpu_cache_usage_perc ())
526+ main_process_metrics .set_value ("num_requests_running" , len (self .running ))
527+ main_process_metrics .set_value ("num_requests_waiting" , num_tasks - len (self .running ))
528+
528529 return scheduled_reqs
529530
530531 def get_available_position (self ) -> int :
@@ -566,9 +567,9 @@ def get_prefix_cached_blocks(self, request: Request):
566567 request .skip_allocate = False
567568
568569 # Report the number of cached tokens to Prometheus metrics
569- main_process_metrics .prefix_cache_token_num . inc ( matched_token_num )
570- main_process_metrics .prefix_gpu_cache_token_num . inc ( request .gpu_cache_token_num )
571- main_process_metrics .prefix_cpu_cache_token_num . inc ( request .cpu_cache_token_num )
570+ main_process_metrics .inc_value ( "prefix_cache_token_num" , matched_token_num )
571+ main_process_metrics .inc_value ( "prefix_gpu_cache_token_num" , request .gpu_cache_token_num )
572+ main_process_metrics .inc_value ( "prefix_cpu_cache_token_num" , request .cpu_cache_token_num )
572573
573574 if matched_token_num == request .need_prefill_tokens :
574575 request .num_computed_tokens = matched_token_num - self .config .cache_config .block_size
0 commit comments