diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 7d14c98da840..3e788e6fb47c 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -1609,6 +1609,7 @@ def retract_all(self, server_args: ServerArgs): def retract_decode( self, server_args: ServerArgs, + buf_multiplier: int = 1, ) -> Tuple[List[Req], float, List[Req]]: """Retract the decoding requests when there is not enough memory.""" sorted_indices = list(range(len(self.reqs))) @@ -1630,7 +1631,9 @@ def retract_decode( retracted_reqs = [] first_iter = True while first_iter or ( - not self.check_decode_mem(selected_indices=sorted_indices) + not self.check_decode_mem( + selected_indices=sorted_indices, buf_multiplier=buf_multiplier + ) ): if len(sorted_indices) == 1: # Corner case: only one request left diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index f4b93518f40d..075f26df3226 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1965,7 +1965,7 @@ def update_running_batch(self, batch: ScheduleBatch) -> Optional[ScheduleBatch]: ): old_ratio = self.new_token_ratio retracted_reqs, new_token_ratio, reqs_to_abort = batch.retract_decode( - self.server_args + self.server_args, self.decode_mem_cache_buf_multiplier ) self.num_retracted_reqs = len(retracted_reqs) self.new_token_ratio = new_token_ratio diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 4e82119e9b51..dbbf6e66dd64 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -1666,7 +1666,9 @@ def init_memory_pool( self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory) if (small_kv_size := envs.SGLANG_CI_SMALL_KV_SIZE.get()) > 0: - # Use a small KV cache pool size for local tests + logger.info( + f"Use a small KV cache pool size ({small_kv_size}) for local tests" + ) self.max_total_num_tokens = small_kv_size if max_num_reqs is None: diff --git a/test/registered/spec/eagle/test_eagle_infer_b.py b/test/registered/spec/eagle/test_eagle_infer_b.py index 4bff4953c1fe..fb2fbae984df 100644 --- a/test/registered/spec/eagle/test_eagle_infer_b.py +++ b/test/registered/spec/eagle/test_eagle_infer_b.py @@ -292,7 +292,8 @@ def setUpClass(cls): # These config helps find a leak. # FIXME(lsyin): use override context manager envs.SGLANG_CI_SMALL_KV_SIZE.set(4500) - super().setUpClass() + with envs.SGLANG_TEST_RETRACT.override(True): + super().setUpClass() class TestEAGLEServerTriton(TestEAGLEServerBasic):