diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py
index e75582736f49..5c1e5359a206 100644
--- a/python/sglang/srt/disaggregation/decode.py
+++ b/python/sglang/srt/disaggregation/decode.py
@@ -918,6 +918,9 @@ def _commit_transfer_to_req(self, decode_req: DecodeRequest) -> bool:
         # Case 3: Success - commit the transfer
         decode_req.req.output_ids.append(output_id[0].item())
         decode_req.req.cached_tokens = cached_tokens[0].item()
+        decode_req.req.cached_tokens_device = cached_tokens[1].item()
+        decode_req.req.cached_tokens_host = cached_tokens[2].item()
+        decode_req.req.cached_tokens_storage = cached_tokens[3].item()
         if not self.spec_algorithm.is_none():
             decode_req.req.output_topk_p = output_topk_p
             decode_req.req.output_topk_index = output_topk_index
diff --git a/python/sglang/srt/disaggregation/utils.py b/python/sglang/srt/disaggregation/utils.py
index b7b3b0238861..ef27f15a0762 100644
--- a/python/sglang/srt/disaggregation/utils.py
+++ b/python/sglang/srt/disaggregation/utils.py
@@ -227,6 +227,9 @@ def set_buf(self, req: Req):
 
         self.output_ids[req.metadata_buffer_index][0] = req.output_ids[0]
         self.cached_tokens[req.metadata_buffer_index][0] = req.cached_tokens
+        self.cached_tokens[req.metadata_buffer_index][1] = req.cached_tokens_device
+        self.cached_tokens[req.metadata_buffer_index][2] = req.cached_tokens_host
+        self.cached_tokens[req.metadata_buffer_index][3] = req.cached_tokens_storage
         if req.return_logprob:
             if req.output_token_logprobs_val:  # not none or empty list
                 self.output_token_logprobs_val[req.metadata_buffer_index][0] = (
diff --git a/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
index 570cc81c2db0..ef31d34d6c37 100644
--- a/python/sglang/srt/managers/scheduler_output_processor_mixin.py
+++ b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
@@ -55,15 +55,10 @@ def _get_cached_tokens_details(self, req: Req) -> Optional[dict]:
         """Get detailed cache breakdown for a request, if available.
 
         Returns:
-        - None if HiCache is not enabled
-        - {"device": X, "host": Y} if HiCache enabled but L3 storage is not
-        - {"device": X, "host": Y, "storage": Z, "storage_backend": "..."} if L3 enabled
+        - None if no cached tokens at all
+        - {"device": X, "host": Y} without storage breakdown
+        - {"device": X, "host": Y, "storage": Z, "storage_backend": "..."} with storage breakdown
         """
-        # Only show details if HiCache is enabled
-        if not getattr(self, "enable_hierarchical_cache", False):
-            return None
-
-        # Only show if there are any cached tokens
         if (
             req.cached_tokens_device > 0
             or req.cached_tokens_host > 0
@@ -78,6 +73,13 @@ def _get_cached_tokens_details(self, req: Req) -> Optional[dict]:
             details["storage"] = req.cached_tokens_storage
             details["storage_backend"] = self._get_storage_backend_type()
         return details
+
+        if req.cached_tokens > 0:
+            return {
+                "device": req.cached_tokens,
+                "host": 0,
+            }
+
         return None
 
     def process_batch_result_prebuilt(self: Scheduler, batch: ScheduleBatch):