From e78792b9bcfd3421d7df255e1f191b91c7194ae3 Mon Sep 17 00:00:00 2001 From: Kangyan Zhou Date: Sat, 28 Feb 2026 21:11:41 -0800 Subject: [PATCH] [Bugfix] Add missing auto_create_handle_loop to communicator methods The handle_loop asyncio task in TokenizerManager is responsible for receiving responses from schedulers via ZMQ and dispatching them to the appropriate _Communicator. However, handle_loop is lazily started by auto_create_handle_loop(), and several communicator methods were missing this call. This caused /server_info (and other endpoints like /flush_cache, /get_loads) to hang indefinitely when called on freshly-started servers that had not yet processed any inference request. Because no inference request had triggered auto_create_handle_loop() yet, the scheduler responses were never received. This is particularly critical for PD disaggregation setups where the sglang router's service discovery calls /server_info as the very first interaction with worker pods during the discover_metadata step, before any generate request is sent. 
Co-Authored-By: Claude Opus 4.6 --- .../sglang/srt/managers/tokenizer_communicator_mixin.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/sglang/srt/managers/tokenizer_communicator_mixin.py b/python/sglang/srt/managers/tokenizer_communicator_mixin.py index d6892431ed88..e08e0955bc2f 100644 --- a/python/sglang/srt/managers/tokenizer_communicator_mixin.py +++ b/python/sglang/srt/managers/tokenizer_communicator_mixin.py @@ -344,10 +344,12 @@ def _get_communicator_dispatcher(self: TokenizerManager): ) async def flush_cache(self: TokenizerManager) -> FlushCacheReqOutput: + self.auto_create_handle_loop() return (await self.flush_cache_communicator(FlushCacheReqInput()))[0] async def clear_hicache_storage(self: TokenizerManager) -> ClearHiCacheReqOutput: """Clear the hierarchical cache storage.""" + self.auto_create_handle_loop() # Delegate to the scheduler to handle HiCacheStorage clearing return (await self.clear_hicache_storage_communicator(ClearHiCacheReqInput()))[ 0 @@ -361,6 +363,7 @@ async def attach_hicache_storage( hicache_write_policy: Optional[str] = None, ) -> AttachHiCacheStorageReqOutput: """Attach (enable) HiCache storage backend at runtime.""" + self.auto_create_handle_loop() results = await self.attach_hicache_storage_communicator( AttachHiCacheStorageReqInput( hicache_storage_backend=hicache_storage_backend, @@ -392,6 +395,7 @@ async def detach_hicache_storage( self: TokenizerManager, ) -> DetachHiCacheStorageReqOutput: """Detach (disable) HiCache storage backend at runtime.""" + self.auto_create_handle_loop() results = await self.detach_hicache_storage_communicator( DetachHiCacheStorageReqInput() ) @@ -855,6 +859,7 @@ async def slow_down( await self.slow_down_communicator(obj) async def get_internal_state(self: TokenizerManager) -> List[Dict[Any, Any]]: + self.auto_create_handle_loop() req = GetInternalStateReq() responses: List[GetInternalStateReqOutput] = ( await self.get_internal_state_communicator(req) @@ -865,6 +870,7 @@ async 
def get_internal_state(self: TokenizerManager) -> List[Dict[Any, Any]]: async def set_internal_state( self: TokenizerManager, obj: SetInternalStateReq ) -> List[bool]: + self.auto_create_handle_loop() responses: List[SetInternalStateReqOutput] = ( await self.set_internal_state_communicator(obj) ) @@ -873,9 +879,11 @@ async def set_internal_state( async def dumper_control( self: TokenizerManager, obj: DumperControlReqInput ) -> List[DumperControlReqOutput]: + self.auto_create_handle_loop() return await self.dumper_control_communicator(obj) async def get_load(self: TokenizerManager) -> List[GetLoadReqOutput]: + self.auto_create_handle_loop() req = GetLoadReqInput() return await self.get_load_communicator(req) @@ -894,6 +902,7 @@ async def get_loads( Returns: List of GetLoadsReqOutput, one per scheduler (filtered by dp_rank if specified) """ + self.auto_create_handle_loop() req = GetLoadsReqInput( include=include if include else ["all"], dp_rank=dp_rank,