diff --git a/python/sglang/srt/managers/scheduler_metrics_mixin.py b/python/sglang/srt/managers/scheduler_metrics_mixin.py index 27dd2f3b5ca..aa558f2a494 100644 --- a/python/sglang/srt/managers/scheduler_metrics_mixin.py +++ b/python/sglang/srt/managers/scheduler_metrics_mixin.py @@ -331,7 +331,7 @@ def log_decode_stats( # Others self.calculate_utilization() - self.metrics_collector.log_stats(self.stats) + self.metrics_collector.log_stats(self.stats, is_decode_stats=True) self._emit_kv_metrics() self._publish_kv_events() diff --git a/python/sglang/srt/metrics/collector.py b/python/sglang/srt/metrics/collector.py index b0e7750c3c1..b5f941d8729 100644 --- a/python/sglang/srt/metrics/collector.py +++ b/python/sglang/srt/metrics/collector.py @@ -529,7 +529,7 @@ def observe_per_stage_req_latency(self, stage: str, latency: float) -> None: def observe_queue_time(self, latency: float) -> None: self._log_histogram(self.queue_time, latency) - def log_stats(self, stats: SchedulerStats) -> None: + def log_stats(self, stats: SchedulerStats, is_decode_stats: bool = False) -> None: self._log_gauge(self.num_running_reqs, stats.num_running_reqs) self._log_gauge(self.num_used_tokens, stats.num_used_tokens) self._log_gauge(self.token_usage, stats.token_usage) @@ -543,11 +543,14 @@ def log_stats(self, stats: SchedulerStats) -> None: self._log_gauge( self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch ) - self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate) + + if not is_decode_stats: + self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate) # Speculative decoding - self._log_gauge(self.spec_accept_length, stats.spec_accept_length) - self._log_gauge(self.spec_accept_rate, stats.spec_accept_rate) + if is_decode_stats: + self._log_gauge(self.spec_accept_length, stats.spec_accept_length) + self._log_gauge(self.spec_accept_rate, stats.spec_accept_rate) # PD disaggregation self._log_gauge(