@@ -290,7 +290,11 @@ async fn completions(
290290
291291 Ok ( sse_stream. into_response ( ) )
292292 } else {
293- // TODO: report ISL/OSL for non-streaming requests
293+ // Tap the stream to collect metrics for non-streaming requests without altering items
294+ let stream = stream. inspect ( move |response| {
295+ process_metrics_only ( response, & mut response_collector) ;
296+ } ) ;
297+
294298 let response = NvCreateCompletionResponse :: from_annotated_stream ( stream)
295299 . await
296300 . map_err ( |e| {
@@ -515,7 +519,10 @@ async fn chat_completions(
515519
516520 Ok ( sse_stream. into_response ( ) )
517521 } else {
518- // TODO: report ISL/OSL for non-streaming requests
522+ let stream = stream. inspect ( move |response| {
523+ process_metrics_only ( response, & mut response_collector) ;
524+ } ) ;
525+
519526 let response = NvCreateChatCompletionResponse :: from_annotated_stream ( stream)
520527 . await
521528 . map_err ( |e| {
@@ -911,6 +918,17 @@ impl<T> From<Annotated<T>> for EventConverter<T> {
911918 }
912919}
913920
921+ fn process_metrics_only < T > (
922+ annotated : & Annotated < T > ,
923+ response_collector : & mut ResponseMetricCollector ,
924+ ) {
925+ // update metrics
926+ if let Ok ( Some ( metrics) ) = LLMMetricAnnotation :: from_annotation ( annotated) {
927+ response_collector. observe_current_osl ( metrics. output_tokens ) ;
928+ response_collector. observe_response ( metrics. input_tokens , metrics. chunk_tokens ) ;
929+ }
930+ }
931+
914932fn process_event_converter < T : Serialize > (
915933 annotated : EventConverter < T > ,
916934 response_collector : & mut ResponseMetricCollector ,
0 commit comments