@@ -57,14 +57,12 @@ def __init__(self,
5757 self .perf_metrics_max_requests = config .perf_metrics_max_requests
5858 if self .perf_metrics_max_requests > 0 :
5959 # record corresponding keys of context and generation servers for perf metrics
60- # (ctx_server, gen_server, ctx_request_id, server_start_ts , server_first_token_ts)
60+ # (ctx_server, gen_server, ctx_request_id, server_arrival_time , server_first_token_ts)
6161 self .perf_metrics_keys = deque (maxlen = self .perf_metrics_max_requests )
6262 self .perf_metrics_keys_lock = asyncio .Lock ()
6363 # server_url -> {ctx_request_id: perf_metrics}
6464 self .server_perf_metrics : dict [str , dict [int , dict ]] = {}
6565
66- # server_url -> the perf metric timestamp offset between the disagg server and worker server
67- self .server_perf_ts_offsets : dict [str , float ] = {}
6866 else :
6967 self .perf_metrics_keys = None
7068 self .perf_metrics_keys_lock = None
@@ -110,7 +108,7 @@ async def lifespan(app: FastAPI):
110108 await self .wait_for_servers_ready (server_start_timeout_secs )
111109
112110 if self .perf_metrics_max_requests > 0 :
113- await self .query_perf_ts_offsets (self .session )
111+ await self .set_steady_clock_offsets (self .session )
114112
115113 if self .metadata_server :
116114 logger .info ("Starting server monitoring via metadata service" )
@@ -143,7 +141,7 @@ async def lifespan(app: FastAPI):
143141 @self .app .middleware ("http" )
144142 async def add_process_time_header (raw_request : Request , call_next ):
145143 start_time = time .monotonic ()
146- raw_request .state .server_start_ts = start_time
144+ raw_request .state .server_arrival_time = start_time
147145 response = await call_next (raw_request )
148146 return response
149147
@@ -202,7 +200,7 @@ async def version(self) -> JSONResponse:
202200
203201 async def _add_perf_metrics_keys (self , ctx_server : str , gen_server : str , ctx_request_id : int , raw_request : Request ):
204202 async with self .perf_metrics_keys_lock :
205- self .perf_metrics_keys .append ((ctx_server , gen_server , ctx_request_id , raw_request .state .server_start_ts , raw_request .state .server_first_token_ts ))
203+ self .perf_metrics_keys .append ((ctx_server , gen_server , ctx_request_id , raw_request .state .server_arrival_time , raw_request .state .server_first_token_ts ))
206204
207205 async def perf_metrics (self ) -> JSONResponse :
208206 if self .perf_metrics_keys is None :
@@ -239,27 +237,23 @@ async def perf_metrics(self) -> JSONResponse:
239237 raise exc
240238
241239 remain_keys = []
242- for ctx_server , gen_server , ctx_request_id , server_start_ts , server_first_token_ts in self .perf_metrics_keys :
240+ for ctx_server , gen_server , ctx_request_id , server_arrival_time , server_first_token_ts in self .perf_metrics_keys :
243241 gen_perf_metrics = self .server_perf_metrics [gen_server ].pop (ctx_request_id , None )
244242 if gen_perf_metrics is None :
245243 # generation not finished
246- remain_keys .append ((ctx_server , gen_server , ctx_request_id , server_start_ts , server_first_token_ts ))
244+ remain_keys .append ((ctx_server , gen_server , ctx_request_id , server_arrival_time , server_first_token_ts ))
247245 continue
248246 ctx_perf_metrics = self .server_perf_metrics [ctx_server ].pop (ctx_request_id , None )
249247 return_metrics .append ({
250248 "ctx_server" : ctx_server ,
251249 "gen_server" : gen_server ,
252- "disagg_server_start_ts " : server_start_ts ,
250+ "disagg_server_arrival_time " : server_arrival_time ,
253251 "disagg_server_first_token_ts" : server_first_token_ts ,
254252 "ctx_perf_metrics" : ctx_perf_metrics ,
255253 "gen_perf_metrics" : gen_perf_metrics })
256254 self .perf_metrics_keys = deque (remain_keys , maxlen = self .perf_metrics_max_requests )
257255
258- response = {
259- "server_perf_timestamp_offsets" : self .server_perf_ts_offsets ,
260- "perf_metrics" : return_metrics
261- }
262- return JSONResponse (content = response )
256+ return JSONResponse (content = return_metrics )
263257
264258
265259 async def openai_completion (self , req : CompletionRequest , raw_request : Request ) -> Response :
@@ -514,28 +508,35 @@ async def send_completion_request(self, url: str, request: CompletionRequest) ->
async def send_chat_request(self, url: str, request: "ChatCompletionRequest") -> "ChatCompletionResponse":
    """Forward a chat-completion request to the worker server at *url*.

    Thin wrapper around ``send_request`` that supplies the OpenAI-compatible
    chat endpoint, the chat response type, and the chat streaming-generator
    factory.
    """
    chat_endpoint = "/v1/chat/completions"
    return await self.send_request(url, request, chat_endpoint,
                                   ChatCompletionResponse,
                                   self.create_chat_generator)
516510
async def set_steady_clock_offsets(self, session: aiohttp.ClientSession):
    """Measure each worker's steady-clock offset and push the correction to it.

    For every context and generation server, performs one GET round-trip to
    ``/steady_clock_offset`` and estimates the clock offset with the NTP
    clock-synchronization formula, then POSTs the negated offset back so the
    worker can align its perf-metric timestamps with this server's clock.

    Args:
        session: aiohttp client session used for both the GET probe and the
            POST that sets the offset on the worker.
    """
    STEADY_CLOCK_OFFSET_ENDPOINT = "/steady_clock_offset"

    async def query_steady_clock_offset(server_url: str) -> tuple[Optional[float], Optional[float]]:
        # Returns (delay, offset) in seconds, or (None, None) on any failure.
        try:
            originate_ts = time.monotonic()
            async with session.get(server_url + STEADY_CLOCK_OFFSET_ENDPOINT) as response:
                destination_ts = time.monotonic()
                if response.status != 200:
                    return None, None
                body = await response.json()
                # NTP clock synchronization algorithm:
                # https://en.wikipedia.org/wiki/Network_Time_Protocol#Clock_synchronization_algorithm
                receive_ts = body['receive_ts']
                transmit_ts = body['transmit_ts']
                delay = (destination_ts - originate_ts) - (transmit_ts - receive_ts)
                offset = ((receive_ts - originate_ts) + (transmit_ts - destination_ts)) / 2
                return delay, offset
        except Exception:
            # BUG FIX: previously returned a bare `None` here, which broke the
            # `delay, offset = ...` tuple unpacking at the call site whenever a
            # worker was unreachable. Always return a 2-tuple.
            return None, None

    async def set_steady_clock_offset(server_url: str, offset: float) -> None:
        payload = {"offset": offset}
        async with session.post(server_url + STEADY_CLOCK_OFFSET_ENDPOINT, json=payload) as response:
            if response.status != 200:
                logger.warning(f"Cannot set disagg server steady clock offset for server {server_url}, the perf metrics timestamps could be mis-aligned")

    for server_url in self.ctx_servers + self.gen_servers:
        delay, offset = await query_steady_clock_offset(server_url)
        if offset is None:
            # BUG FIX: previously fell through and computed `-offset` on None,
            # raising TypeError when the probe failed or returned non-200.
            logger.warning(f"Cannot get steady clock offset from server {server_url}, the perf metrics timestamps could be mis-aligned")
            continue
        logger.info(f'Server: {server_url}, delay: {delay} second, offset: {offset} second')
        # Negate the offset so that worker servers can adjust their steady clock
        # by adding the value we send them.
        await set_steady_clock_offset(server_url, -offset)
539540
540541 @classmethod
541542 async def check_server_ready (cls , session : aiohttp .ClientSession , server_url : str ) -> bool :
0 commit comments