3535 ResumeMemoryOccupationReqInput ,
3636 UpdateWeightsFromTensorReqInput ,
3737)
38- from sglang .srt .openai_api .protocol import Tool
38+
39+ # from sglang.srt.openai_api.protocol import Tool
3940from sglang .srt .sampling .sampling_params import SamplingParams
4041from sglang .srt .server_args import ServerArgs
4142from sglang .srt .utils import (
@@ -135,9 +136,6 @@ def __init__(self, **kwargs):
135136
136137 async def release_memory_occupation (self , tags : Optional [list [str ]] = None ):
137138 """Release GPU occupation temporarily."""
138- if self ._need_reload :
139- await self .release_memory_occupation ()
140- self ._need_reload = False
141139 if tags is None :
142140 obj = ReleaseMemoryOccupationReqInput ()
143141 else :
@@ -149,7 +147,9 @@ async def resume_memory_occupation(self, tags: Optional[list[str]] = None):
149147 # because __init__ is a sync method, it can not call the async release_memory_occupation
150148 # have to move release_memory_occupation from __init__ to here
151149 # For multi-stage awake, we run release weight and kv_cache when we resume weights for the first time.
152- await self .release_memory_occupation ()
150+ if self ._need_reload :
151+ await self .release_memory_occupation ()
152+ self ._need_reload = False
153153
154154 if tags is None :
155155 obj = ResumeMemoryOccupationReqInput ()
0 commit comments