-
Notifications
You must be signed in to change notification settings - Fork 662
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support chunked prefill when radix cache is disabled (#811)
- Loading branch information
Showing 9 changed files with 164 additions and 27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from abc import ABC, abstractmethod | ||
|
||
|
||
class BasePrefixCache(ABC):
    """Abstract interface for prefix caches; lookups may use a rid or a key.

    The first eight methods are abstract and must be overridden before a
    subclass can be instantiated.  ``total_size`` and ``pretty_print`` are
    optional hooks: they are concrete here but raise ``NotImplementedError``
    unless a subclass overrides them.
    """

    @abstractmethod
    def reset(self):
        """Subclass hook; the base class defines no behavior."""

    @abstractmethod
    def match_prefix(self, **kwargs):
        """Subclass hook; keyword arguments are interpreted by the subclass."""

    @abstractmethod
    def insert(self, **kwargs):
        """Subclass hook; keyword arguments are interpreted by the subclass."""

    @abstractmethod
    def cache_req(self, **kwargs):
        """Subclass hook; keyword arguments are interpreted by the subclass."""

    @abstractmethod
    def evict(self, num_tokens, evict_callback):
        """Subclass hook; both arguments are interpreted by the subclass."""

    @abstractmethod
    def inc_lock_ref(self, node):
        """Subclass hook; ``node`` is interpreted by the subclass."""

    @abstractmethod
    def dec_lock_ref(self, node):
        """Subclass hook; ``node`` is interpreted by the subclass."""

    @abstractmethod
    def evictable_size(self):
        """Subclass hook; the base class defines no behavior."""

    def total_size(self):
        """Optional hook; raises ``NotImplementedError`` unless overridden."""
        raise NotImplementedError()

    def pretty_print(self):
        """Optional hook; raises ``NotImplementedError`` unless overridden."""
        raise NotImplementedError()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
"""Cache for chunked prefill, used when RadixCache is disabled.""" | ||
|
||
from sglang.srt.mem_cache.base_cache import BasePrefixCache | ||
|
||
|
||
class ChunkCacheEntry:
    """Plain record pairing a request id with the value cached for it."""

    def __init__(self, rid, value):
        # Both attributes are stored as given and stay freely reassignable.
        self.rid, self.value = rid, value
|
||
|
||
class ChunkCache(BasePrefixCache):
    """Per-request cache for chunked prefill, used when RadixCache is disabled.

    Entries are keyed by request id (``rid``).  Each entry stores the slice of
    ``req_to_token_pool.req_to_token`` that was recorded for that request by
    the most recent ``cache_req(..., del_in_memory_pool=False)`` call.
    """

    def __init__(self, req_to_token_pool, token_to_kv_pool):
        # NOTE(review): ``disable`` is not read in this class; presumably
        # callers use it to detect that prefix sharing is off -- confirm.
        self.disable = True
        self.req_to_token_pool = req_to_token_pool
        self.token_to_kv_pool = token_to_kv_pool

        self.reset()

    def reset(self):
        """Drop every stored entry."""
        self.entries = {}

    def match_prefix(self, rid, **kwargs):
        """Look up the entry stored for ``rid``.

        Returns:
            ``(entry.value, entry)`` when ``rid`` is known, otherwise
            ``([], None)``.  Extra keyword arguments are accepted and ignored.
        """
        entry = self.entries.get(rid)
        if entry is None:
            return [], None
        return entry.value, entry

    def cache_req(
        self, rid, token_ids, req_pool_idx, del_in_memory_pool=True, **kwargs
    ):
        """Record, or release, the pool indices belonging to a request.

        Args:
            rid: Request id used as the cache key.
            token_ids: Only its length is used, to bound the index slice.
            req_pool_idx: Row of ``req_to_token_pool.req_to_token`` to read.
            del_in_memory_pool: When true, free the request's slot in both
                pools and forget its entry; when false, store/refresh the
                entry instead.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            ``None`` when ``del_in_memory_pool`` is true, otherwise
            ``(indices, entry)``.
        """
        indices = self.req_to_token_pool.req_to_token[req_pool_idx, : len(token_ids)]
        if del_in_memory_pool:
            assert rid in self.entries
            self.req_to_token_pool.free(req_pool_idx)
            self.token_to_kv_pool.free(indices)
            # Forget the entry as well: otherwise ``entries`` grows by one item
            # per finished request and a later lookup would hand back indices
            # that have just been freed.
            self.entries.pop(rid, None)
            return

        entry = self.entries.get(rid)
        if entry is None:
            entry = self.entries[rid] = ChunkCacheEntry(rid, indices)
        else:
            entry.value = indices
        return indices, entry

    def insert(self, **kwargs):
        """Unsupported by this cache; always raises ``NotImplementedError``.

        Keyword arguments are accepted so the signature matches the abstract
        base and callers get ``NotImplementedError`` rather than ``TypeError``.
        """
        raise NotImplementedError

    def evict(self, num_tokens, evict_callback):
        """No-op: this cache never evicts; both arguments are ignored."""
        pass

    def inc_lock_ref(self, node):
        """No lock tracking is performed; always returns 0."""
        return 0

    def dec_lock_ref(self, node):
        """No lock tracking is performed; always returns 0."""
        return 0

    def evictable_size(self):
        """Always 0, since ``evict`` is a no-op."""
        return 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters