Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def update_scheduler_stats(self, stats: SchedulerStats) -> None:


class SchedulerRuntimeCheckerMixin:
- def _alive_streaming_session_count(self: Scheduler) -> int:
+ def _streaming_session_count(self: Scheduler) -> int:
return sum(
1
for session in self.session_controller.sessions.values()
Expand Down Expand Up @@ -506,7 +506,7 @@ def _maybe_log_idle_metrics(self: Scheduler):
return

self.get_pool_stats().update_scheduler_stats(self.stats)
- self.stats.num_streaming_sessions = self._alive_streaming_session_count()
+ self.stats.num_streaming_sessions = self._streaming_session_count()
self.stats.streaming_session_held_tokens = self._session_held_tokens()

priority_enabled = self.enable_priority_scheduling
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/srt/managers/session_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def _close(self, session_id: str):
)
return

- # No active request -- safe to release immediately.
+ # No owning request -- safe to release immediately.
if session.streaming and session.req_nodes:
req = next(iter(session.req_nodes.values())).req
req.session = None
Expand Down
8 changes: 3 additions & 5 deletions python/sglang/srt/mem_cache/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, EvictParams
from sglang.srt.mem_cache.memory_pool import HybridReqToTokenPool, ReqToTokenPool
- from sglang.srt.mem_cache.session_aware_cache import SessionAwareCache
+ from sglang.srt.mem_cache.session_aware_cache import SessionAwareCache, _is_streaming
from sglang.srt.mem_cache.swa_memory_pool import SWATokenToKVPoolAllocator
from sglang.srt.server_args import get_global_server_args
from sglang.srt.utils import support_triton
Expand Down Expand Up @@ -487,10 +487,8 @@ def release_kv_cache(req: Req, tree_cache: BasePrefixCache, is_insert: bool = Tr
# cache_finished_req below (which also sets req_pool_idx = None).
from sglang.srt.managers.schedule_batch import FINISH_ABORT

- is_streaming_session = (
- isinstance(tree_cache, SessionAwareCache)
- and getattr(req, "session", None) is not None
- and req.session.streaming
+ is_streaming_session = isinstance(tree_cache, SessionAwareCache) and _is_streaming(
+ req
)
is_aborted_streaming = is_streaming_session and isinstance(
getattr(req, "finished_reason", None), FINISH_ABORT
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/srt/observability/metrics_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -686,7 +686,7 @@ def __init__(
if self.enable_streaming_session:
self.num_streaming_sessions = Gauge(
name="sglang:num_streaming_sessions",
- documentation="The number of active streaming sessions.",
+ documentation="The number of streaming sessions.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,7 @@ def report_decode_stats(
self.stats.cache_hit_rate = cache_hit_rate

self.stats.max_total_num_tokens = self.max_total_num_tokens
- self.stats.num_streaming_sessions = self._alive_streaming_session_count()
+ self.stats.num_streaming_sessions = self._streaming_session_count()
self.stats.streaming_session_held_tokens = self._session_held_tokens()

# Speculative decoding
Expand Down
Loading