From fd75ed8c7248a4fc0f2571cefbd24e678a001bf2 Mon Sep 17 00:00:00 2001
From: Liangsheng Yin
Date: Sun, 23 Nov 2025 23:33:24 +0800
Subject: [PATCH] move

---
Notes (review):
    Pure code move. The three helpers `_get_token_info`,
    `_get_mamba_token_info` and `_get_swa_token_info` are deleted from
    scheduler.py and re-added at the top of `SchedulerRuntimeCheckerMixin`
    in scheduler_runtime_checker_mixin.py.

    The only textual change to the moved methods is the signature: `self`
    is annotated as `Scheduler` on the mixin side. The statements inside
    each method are the same on both sides of the patch.

    The counts differ by one (60 deletions vs. 59 insertions) because the
    blank line that followed `_get_swa_token_info` in scheduler.py is
    deleted, while no blank line is added after the moved copy.

    NOTE(review): `_get_mamba_token_info` references `MambaRadixCache`.
    This patch touches the mixin module in a single hunk and adds no import
    line, so the name must already be in scope there -- TODO confirm.

    NOTE(review): each helper divides by a pool size
    (`max_total_num_tokens`, `token_to_kv_pool_allocator.size`,
    `mamba_pool.size`, `full_tokens_per_layer`, `swa_tokens_per_layer`).
    That behavior is carried over as-is by the move; a zero-sized pool is
    not guarded here.

 python/sglang/srt/managers/scheduler.py       | 60 -------------------
 .../scheduler_runtime_checker_mixin.py        | 59 ++++++++++++++++++
 2 files changed, 59 insertions(+), 60 deletions(-)

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index cc5d2fed645b..e1bd79331479 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -1627,66 +1627,6 @@ def handle_batch_embedding_request(
         for tokenized_req in recv_req:
             self.handle_embedding_request(tokenized_req)
 
-    def _get_token_info(self):
-        available_size = self.token_to_kv_pool_allocator.available_size()
-        evictable_size = self.tree_cache.evictable_size()
-        num_used = self.max_total_num_tokens - (available_size + evictable_size)
-        token_usage = num_used / self.max_total_num_tokens
-        return num_used, token_usage, available_size, evictable_size
-
-    def _get_mamba_token_info(self):
-        is_radix_tree = isinstance(self.tree_cache, MambaRadixCache)
-        full_available_size = self.token_to_kv_pool_allocator.available_size()
-        full_evictable_size = (
-            self.tree_cache.full_evictable_size() if is_radix_tree else 0
-        )
-        mamba_available_size = self.req_to_token_pool.mamba_pool.available_size()
-        mamba_evictable_size = (
-            self.tree_cache.mamba_evictable_size() if is_radix_tree else 0
-        )
-        full_num_used = self.token_to_kv_pool_allocator.size - (
-            full_available_size + full_evictable_size
-        )
-        mamba_num_used = self.req_to_token_pool.mamba_pool.size - (
-            mamba_available_size + mamba_evictable_size
-        )
-        full_token_usage = full_num_used / self.token_to_kv_pool_allocator.size
-        mamba_usage = mamba_num_used / self.req_to_token_pool.mamba_pool.size
-        return (
-            full_num_used,
-            mamba_num_used,
-            full_token_usage,
-            mamba_usage,
-            full_available_size,
-            full_evictable_size,
-            mamba_available_size,
-            mamba_evictable_size,
-        )
-
-    def _get_swa_token_info(self):
-        full_available_size = self.token_to_kv_pool_allocator.full_available_size()
-        full_evictable_size = self.tree_cache.full_evictable_size()
-        swa_available_size = self.token_to_kv_pool_allocator.swa_available_size()
-        swa_evictable_size = self.tree_cache.swa_evictable_size()
-        full_num_used = self.full_tokens_per_layer - (
-            full_available_size + full_evictable_size
-        )
-        swa_num_used = self.swa_tokens_per_layer - (
-            swa_available_size + swa_evictable_size
-        )
-        full_token_usage = full_num_used / self.full_tokens_per_layer
-        swa_token_usage = swa_num_used / self.swa_tokens_per_layer
-        return (
-            full_num_used,
-            swa_num_used,
-            full_token_usage,
-            swa_token_usage,
-            full_available_size,
-            full_evictable_size,
-            swa_available_size,
-            swa_evictable_size,
-        )
-
     def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
         # Merge the prefill batch into the running batch
         chunked_req_to_exclude = set()
diff --git a/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py b/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py
index d566bf1ede74..59e5c5a4619b 100644
--- a/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py
+++ b/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py
@@ -25,6 +25,65 @@
 
 
 class SchedulerRuntimeCheckerMixin:
+    def _get_token_info(self: Scheduler):
+        available_size = self.token_to_kv_pool_allocator.available_size()
+        evictable_size = self.tree_cache.evictable_size()
+        num_used = self.max_total_num_tokens - (available_size + evictable_size)
+        token_usage = num_used / self.max_total_num_tokens
+        return num_used, token_usage, available_size, evictable_size
+
+    def _get_mamba_token_info(self: Scheduler):
+        is_radix_tree = isinstance(self.tree_cache, MambaRadixCache)
+        full_available_size = self.token_to_kv_pool_allocator.available_size()
+        full_evictable_size = (
+            self.tree_cache.full_evictable_size() if is_radix_tree else 0
+        )
+        mamba_available_size = self.req_to_token_pool.mamba_pool.available_size()
+        mamba_evictable_size = (
+            self.tree_cache.mamba_evictable_size() if is_radix_tree else 0
+        )
+        full_num_used = self.token_to_kv_pool_allocator.size - (
+            full_available_size + full_evictable_size
+        )
+        mamba_num_used = self.req_to_token_pool.mamba_pool.size - (
+            mamba_available_size + mamba_evictable_size
+        )
+        full_token_usage = full_num_used / self.token_to_kv_pool_allocator.size
+        mamba_usage = mamba_num_used / self.req_to_token_pool.mamba_pool.size
+        return (
+            full_num_used,
+            mamba_num_used,
+            full_token_usage,
+            mamba_usage,
+            full_available_size,
+            full_evictable_size,
+            mamba_available_size,
+            mamba_evictable_size,
+        )
+
+    def _get_swa_token_info(self: Scheduler):
+        full_available_size = self.token_to_kv_pool_allocator.full_available_size()
+        full_evictable_size = self.tree_cache.full_evictable_size()
+        swa_available_size = self.token_to_kv_pool_allocator.swa_available_size()
+        swa_evictable_size = self.tree_cache.swa_evictable_size()
+        full_num_used = self.full_tokens_per_layer - (
+            full_available_size + full_evictable_size
+        )
+        swa_num_used = self.swa_tokens_per_layer - (
+            swa_available_size + swa_evictable_size
+        )
+        full_token_usage = full_num_used / self.full_tokens_per_layer
+        swa_token_usage = swa_num_used / self.swa_tokens_per_layer
+        return (
+            full_num_used,
+            swa_num_used,
+            full_token_usage,
+            swa_token_usage,
+            full_available_size,
+            full_evictable_size,
+            swa_available_size,
+            swa_evictable_size,
+        )
 
     def _check_hybrid_memory(self: Scheduler):
         (