@@ -80,7 +80,6 @@ type StreamingServer struct {
8080 director Director
8181}
8282
83-
8483// RequestContext stores context information during the lifetime of an HTTP request.
8584// TODO: The RequestContext is gathering a ton of fields. A future refactor needs to tease these fields apart.
8685// Specifically, there are fields related to the ext-proc protocol, and then fields related to the lifecycle of the request.
@@ -92,33 +91,33 @@ type RequestContext struct {
9291 ResolvedTargetModel string
9392 RequestReceivedTimestamp time.Time
9493 ResponseCompleteTimestamp time.Time
95- FirstTokenTimestamp time.Time
96- LastTokenTimestamp time.Time
94+ FirstTokenTimestamp time.Time
95+ LastTokenTimestamp time.Time
9796 RequestSize int
9897 Usage Usage
9998 ResponseSize int
10099 ResponseComplete bool
101100 ResponseStatusCode string
102101 RequestRunning bool
103102 Request * Request
104- Prompt string
105- GeneratedTokenCount int
103+ Prompt string
104+ GeneratedTokenCount int
106105
107- LastSeenMetrics * backendmetrics.MetricsState
108- SchedulingResult * schedulingtypes.SchedulingResult
106+ LastSeenMetrics * backendmetrics.MetricsState
107+ SchedulingResult * schedulingtypes.SchedulingResult
109108
110109 SchedulingRequest * schedulingtypes.LLMRequest
111110
112111 RequestState StreamRequestState
113112 ModelServerStreaming bool
114113
115- TTFT float64
114+ TTFT float64
116115 PredictedTTFT float64
117116
118117 PredictedTPOTObservations []float64
119- TPOTObservations []float64
120- AvgTPOT float64
121- AvgPredictedTPOT float64
118+ TPOTObservations []float64
119+ AvgTPOT float64
120+ AvgPredictedTPOT float64
122121
123122 TokenSampler * requtil.TokenSampler
124123
@@ -133,17 +132,14 @@ type RequestContext struct {
133132 respTrailerResp * extProcPb.ProcessingResponse
134133}
135134
136-
137-
138135type Request struct {
139136 Headers map [string ]string
140- Body map [string ]interface {}
137+ Body map [string ]any
141138 Metadata map [string ]any
142139}
143140type Response struct {
144- Headers map [string ]string
141+ Headers map [string ]string
145142 Trailers map [string ]string
146-
147143}
148144type StreamRequestState int
149145
@@ -170,17 +166,17 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
170166 RequestState : RequestReceived ,
171167 Request : & Request {
172168 Headers : make (map [string ]string ),
173- Body : make (map [string ]interface {} ),
169+ Body : make (map [string ]any ),
174170 Metadata : make (map [string ]any ),
175171 },
176172 Response : & Response {
177- Headers : make (map [string ]string ),
173+ Headers : make (map [string ]string ),
178174 Trailers : make (map [string ]string ),
179175 },
180176 }
181177
182178 var body []byte
183- var responseBody map [string ]interface {}
179+ var responseBody map [string ]any
184180
185181 // Create error handling var as each request should only report once for
186182 // error metrics. This doesn't cover the error "Cannot receive stream request" because
@@ -302,49 +298,44 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
302298 metrics .RecordResponseSizes (reqCtx .Model , reqCtx .ResolvedTargetModel , reqCtx .ResponseSize )
303299
304300 if s .director .IsPredictorAvailable () {
305- // var sumActual, sumPred float64
306- // for _, actual := range reqCtx.TPOTObservations {
307- // sumActual += actual
308-
309- // }
310- // for _, prediction := range reqCtx.PredictedTPOTObservations {
311- // sumPred += prediction
312-
313- // }
314-
315- // avgActual := sumActual / float64(len(reqCtx.TPOTObservations))
316- // avgPred := sumPred / float64(len(reqCtx.PredictedTPOTObservations))
317-
318- // reqCtx.AvgTPOT = avgActual
319- // reqCtx.AvgPredictedTPOT = avgPred
320-
321-
322- // Compute MAPE for TTFT
323- mapeTTFT := 0.0
324- if reqCtx .TTFT > 0 {
325- mapeTTFT = math .Abs ((reqCtx .TTFT - reqCtx .PredictedTTFT )/ reqCtx .TTFT ) * 100
326- logger .V (logutil .DEBUG ).Info ("Averages calculated" , "avgActualTTFT" , reqCtx .TTFT , "avgPredictedTTFT" , reqCtx .PredictedTTFT )
327- logger .V (logutil .DEBUG ).Info ("MAPE TTFT computed" , "mapeTTFT%" , mapeTTFT )
328- metrics .RecordRequestTTFT (ctx , reqCtx .Model , reqCtx .ResolvedTargetModel , reqCtx .TTFT / 1000 )
329- metrics .RecordRequestPredictedTTFT (ctx , reqCtx .Model , reqCtx .ResolvedTargetModel , reqCtx .PredictedTTFT / 1000 )
330- metrics .RecordRequestTTFTPredictionMape (ctx , reqCtx .Model , reqCtx .ResolvedTargetModel , mapeTTFT )
331-
332- }
333-
334-
335- mapeTPOT := 0.0
336- if reqCtx .AvgTPOT > 0 {
337- mapeTPOT = math .Abs ((reqCtx .AvgTPOT - reqCtx .AvgPredictedTPOT )/ reqCtx .AvgTPOT ) * 100
338- logger .V (logutil .DEBUG ).Info ("Averages calculated" , "avgActualTPOT" , reqCtx .AvgTPOT , "avgPredictedTPOT" , reqCtx .AvgPredictedTPOT )
339- logger .V (logutil .DEBUG ).Info ("MAPE TPOT computed" , "mapeTPOT%" , mapeTPOT )
340- metrics .RecordRequestTPOT (ctx , reqCtx .Model , reqCtx .ResolvedTargetModel , reqCtx .AvgTPOT / 1000 )
341- metrics .RecordRequestPredictedTPOT (ctx , reqCtx .Model , reqCtx .ResolvedTargetModel , reqCtx .AvgPredictedTPOT / 1000 )
342- metrics .RecordRequestTPOTPredictionMape (ctx , reqCtx .Model , reqCtx .ResolvedTargetModel , mapeTPOT )
301+ // var sumActual, sumPred float64
302+ // for _, actual := range reqCtx.TPOTObservations {
303+ // sumActual += actual
304+
305+ // }
306+ // for _, prediction := range reqCtx.PredictedTPOTObservations {
307+ // sumPred += prediction
308+
309+ // }
310+
311+ // avgActual := sumActual / float64(len(reqCtx.TPOTObservations))
312+ // avgPred := sumPred / float64(len(reqCtx.PredictedTPOTObservations))
313+
314+ // reqCtx.AvgTPOT = avgActual
315+ // reqCtx.AvgPredictedTPOT = avgPred
316+
317+ // Compute MAPE for TTFT
318+ mapeTTFT := 0.0
319+ if reqCtx .TTFT > 0 {
320+ mapeTTFT = math .Abs ((reqCtx .TTFT - reqCtx .PredictedTTFT )/ reqCtx .TTFT ) * 100
321+ logger .V (logutil .DEBUG ).Info ("Averages calculated" , "avgActualTTFT" , reqCtx .TTFT , "avgPredictedTTFT" , reqCtx .PredictedTTFT )
322+ logger .V (logutil .DEBUG ).Info ("MAPE TTFT computed" , "mapeTTFT%" , mapeTTFT )
323+ metrics .RecordRequestTTFT (ctx , reqCtx .Model , reqCtx .ResolvedTargetModel , reqCtx .TTFT / 1000 )
324+ metrics .RecordRequestPredictedTTFT (ctx , reqCtx .Model , reqCtx .ResolvedTargetModel , reqCtx .PredictedTTFT / 1000 )
325+ metrics .RecordRequestTTFTPredictionMape (ctx , reqCtx .Model , reqCtx .ResolvedTargetModel , mapeTTFT )
326+
327+ }
328+
329+ mapeTPOT := 0.0
330+ if reqCtx .AvgTPOT > 0 {
331+ mapeTPOT = math .Abs ((reqCtx .AvgTPOT - reqCtx .AvgPredictedTPOT )/ reqCtx .AvgTPOT ) * 100
332+ logger .V (logutil .DEBUG ).Info ("Averages calculated" , "avgActualTPOT" , reqCtx .AvgTPOT , "avgPredictedTPOT" , reqCtx .AvgPredictedTPOT )
333+ logger .V (logutil .DEBUG ).Info ("MAPE TPOT computed" , "mapeTPOT%" , mapeTPOT )
334+ metrics .RecordRequestTPOT (ctx , reqCtx .Model , reqCtx .ResolvedTargetModel , reqCtx .AvgTPOT / 1000 )
335+ metrics .RecordRequestPredictedTPOT (ctx , reqCtx .Model , reqCtx .ResolvedTargetModel , reqCtx .AvgPredictedTPOT / 1000 )
336+ metrics .RecordRequestTPOTPredictionMape (ctx , reqCtx .Model , reqCtx .ResolvedTargetModel , mapeTPOT )
337+ }
343338 }
344- }
345-
346-
347-
348339
349340 }
350341
@@ -380,21 +371,21 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
380371 }
381372 case * extProcPb.ProcessingRequest_ResponseTrailers :
382373 logger .V (logutil .DEFAULT ).Info ("Processing response trailers" , "trailers" , v .ResponseTrailers .Trailers )
383- if reqCtx .ModelServerStreaming {
384-
374+ if reqCtx .ModelServerStreaming {
375+
385376 var trailerErr error
386377 reqCtx , trailerErr = s .HandleResponseTrailers (ctx , reqCtx )
387378 if trailerErr != nil {
388- logger .V (logutil .DEFAULT ).Error (trailerErr , "Failed to process response trailers" )
389- }
379+ logger .V (logutil .DEFAULT ).Error (trailerErr , "Failed to process response trailers" )
380+ }
390381 reqCtx .respTrailerResp = s .generateResponseTrailerResponse (reqCtx )
391- }
382+ }
392383 }
393384
394385 // Handle the err and fire an immediate response.
395386 if err != nil {
396387 logger .V (logutil .DEFAULT ).Error (err , "Failed to process request" , "request" , req )
397- resp , err := BuildErrResponse (err )
388+ resp , err := buildErrResponse (err )
398389 if err != nil {
399390 return err
400391 }
@@ -475,9 +466,7 @@ func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProces
475466 return nil
476467}
477468
478-
479-
480- func BuildErrResponse (err error ) (* extProcPb.ProcessingResponse , error ) {
469+ func buildErrResponse (err error ) (* extProcPb.ProcessingResponse , error ) {
481470 var resp * extProcPb.ProcessingResponse
482471
483472 switch errutil .CanonicalCode (err ) {
@@ -504,6 +493,17 @@ func BuildErrResponse(err error) (*extProcPb.ProcessingResponse, error) {
504493 },
505494 },
506495 }
496+ // This code can be returned by the director when there are no candidate pods for scheduling the request.
497+ case errutil .ServiceUnavailable :
498+ resp = & extProcPb.ProcessingResponse {
499+ Response : & extProcPb.ProcessingResponse_ImmediateResponse {
500+ ImmediateResponse : & extProcPb.ImmediateResponse {
501+ Status : & envoyTypePb.HttpStatus {
502+ Code : envoyTypePb .StatusCode_ServiceUnavailable ,
503+ },
504+ },
505+ },
506+ }
507507 // This code can be returned when users provide an invalid JSON request.
508508 case errutil .BadRequest :
509509 resp = & extProcPb.ProcessingResponse {
0 commit comments