diff --git a/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py b/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py index 14edf1e673a0..34e7634fce6c 100644 --- a/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py +++ b/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py @@ -134,7 +134,7 @@ def update_scheduler_stats(self, stats: SchedulerStats) -> None: class SchedulerRuntimeCheckerMixin: - def _alive_streaming_session_count(self: Scheduler) -> int: + def _streaming_session_count(self: Scheduler) -> int: return sum( 1 for session in self.session_controller.sessions.values() @@ -506,7 +506,7 @@ def _maybe_log_idle_metrics(self: Scheduler): return self.get_pool_stats().update_scheduler_stats(self.stats) - self.stats.num_streaming_sessions = self._alive_streaming_session_count() + self.stats.num_streaming_sessions = self._streaming_session_count() self.stats.streaming_session_held_tokens = self._session_held_tokens() priority_enabled = self.enable_priority_scheduling diff --git a/python/sglang/srt/managers/session_controller.py b/python/sglang/srt/managers/session_controller.py index 063a6192de98..cd7e6f141785 100644 --- a/python/sglang/srt/managers/session_controller.py +++ b/python/sglang/srt/managers/session_controller.py @@ -312,7 +312,7 @@ def _close(self, session_id: str): ) return - # No active request -- safe to release immediately. + # No owning request -- safe to release immediately. if session.streaming and session.req_nodes: req = next(iter(session.req_nodes.values())).req req.session = None diff --git a/python/sglang/srt/mem_cache/common.py b/python/sglang/srt/mem_cache/common.py index 391eea077780..1a43f29cb7c1 100644 --- a/python/sglang/srt/mem_cache/common.py +++ b/python/sglang/srt/mem_cache/common.py @@ -9,7 +9,7 @@ from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, EvictParams from sglang.srt.mem_cache.memory_pool import HybridReqToTokenPool, ReqToTokenPool -from sglang.srt.mem_cache.session_aware_cache import SessionAwareCache +from sglang.srt.mem_cache.session_aware_cache import SessionAwareCache, _is_streaming from sglang.srt.mem_cache.swa_memory_pool import SWATokenToKVPoolAllocator from sglang.srt.server_args import get_global_server_args from sglang.srt.utils import support_triton @@ -487,10 +487,8 @@ def release_kv_cache(req: Req, tree_cache: BasePrefixCache, is_insert: bool = Tr # cache_finished_req below (which also sets req_pool_idx = None). from sglang.srt.managers.schedule_batch import FINISH_ABORT - is_streaming_session = ( - isinstance(tree_cache, SessionAwareCache) - and getattr(req, "session", None) is not None - and req.session.streaming + is_streaming_session = isinstance(tree_cache, SessionAwareCache) and _is_streaming( + req ) is_aborted_streaming = is_streaming_session and isinstance( getattr(req, "finished_reason", None), FINISH_ABORT diff --git a/python/sglang/srt/observability/metrics_collector.py b/python/sglang/srt/observability/metrics_collector.py index f02ac70eee74..18aabf1f9c96 100644 --- a/python/sglang/srt/observability/metrics_collector.py +++ b/python/sglang/srt/observability/metrics_collector.py @@ -686,7 +686,7 @@ def __init__( if self.enable_streaming_session: self.num_streaming_sessions = Gauge( name="sglang:num_streaming_sessions", - documentation="The number of active streaming sessions.", + documentation="The number of streaming sessions.", labelnames=labels.keys(), multiprocess_mode="mostrecent", ) diff --git a/python/sglang/srt/observability/scheduler_metrics_mixin.py b/python/sglang/srt/observability/scheduler_metrics_mixin.py index 6a313958f0b0..ffc058cdf1c3 100644 --- a/python/sglang/srt/observability/scheduler_metrics_mixin.py +++ b/python/sglang/srt/observability/scheduler_metrics_mixin.py @@ -604,7 +604,7 @@ def report_decode_stats( self.stats.cache_hit_rate = cache_hit_rate self.stats.max_total_num_tokens = self.max_total_num_tokens - self.stats.num_streaming_sessions = self._alive_streaming_session_count() + self.stats.num_streaming_sessions = self._streaming_session_count() self.stats.streaming_session_held_tokens = self._session_held_tokens() # Speculative decoding