Merged
32 commits
4bc170c
[Feature] support prefix cache in DP
Oct 10, 2025
1851e80
fix
Oct 10, 2025
983a8a8
Merge branch 'develop' into update_ep
ltd0924 Oct 10, 2025
2a9e046
Update common_engine.py
ltd0924 Oct 10, 2025
58fced4
Merge branch 'develop' into update_ep
ltd0924 Oct 10, 2025
f5733dd
Update common_engine.py
ltd0924 Oct 10, 2025
427cf47
Update common_engine.py
ltd0924 Oct 10, 2025
00bdcc2
Update common_engine.py
ltd0924 Oct 10, 2025
0822d63
[BugFix] fix workers more than 1
Oct 11, 2025
85b6990
Merge branch 'develop' into update_ep
ltd0924 Oct 11, 2025
0acf059
fix
Oct 11, 2025
667d146
Update api_server.py
ltd0924 Oct 12, 2025
a03dfe6
fix
Oct 12, 2025
141abd7
Update api_server.py
ltd0924 Oct 13, 2025
1f07ecd
fix
Oct 13, 2025
a531165
Merge branch 'develop' into update_ep
ltd0924 Oct 13, 2025
3670530
Merge branch 'develop' into update_ep
ltd0924 Oct 14, 2025
ab6f741
Merge branch 'develop' into update_ep
ltd0924 Oct 15, 2025
90cd313
[Feature] Support mm model close prefix cache
Oct 16, 2025
e60d098
Merge branch 'develop' into update_ep
ltd0924 Oct 16, 2025
0053f17
Update api_server.py
ltd0924 Oct 16, 2025
03d9f22
Update engine_client.py
ltd0924 Oct 20, 2025
9ade15e
Update engine_client.py
ltd0924 Oct 20, 2025
7a93f0c
add test
Oct 20, 2025
a38a272
Merge branch 'develop' into update_ep
ltd0924 Oct 20, 2025
842cde7
Update test_chat.py
ltd0924 Oct 20, 2025
bd4ec3c
fix
Oct 20, 2025
b3112ba
Merge branch 'develop' into update_ep
ltd0924 Oct 20, 2025
33d9093
fix
Oct 20, 2025
334d29a
Update test_chat.py
ltd0924 Oct 20, 2025
6957bdc
Update test_chat.py
ltd0924 Oct 20, 2025
2cdcc02
Merge branch 'develop' into update_ep
Jiang-Jia-Jun Oct 21, 2025
12 changes: 12 additions & 0 deletions fastdeploy/cache_manager/cache_data.py
@@ -21,6 +21,18 @@
 logger = get_logger("prefix_cache_manager", "prefix_cache_manager.log")
 
 
+DISABLE_PREFIX_CACHE_MM_MODEL: set[str] = {
+    "Ernie5ForCausalLM",
+}
+
+
+def is_mm_model_disable_prefix_cache(model_config):
+    """
+    check if the model architecture is in DISABLE_PREFIX_CACHE_MM_MODEL
+    """
+    return model_config._architecture in DISABLE_PREFIX_CACHE_MM_MODEL
+
+
 class CacheStatus(Enum):
     """
     cache status enum class
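For readers skimming the diff, a minimal self-contained sketch of how this deny-list gate behaves. FakeModelConfig is invented here for illustration and is not part of FastDeploy; the real model config exposes the detected architecture through the private _architecture attribute used above.

    DISABLE_PREFIX_CACHE_MM_MODEL: set[str] = {
        "Ernie5ForCausalLM",
    }

    def is_mm_model_disable_prefix_cache(model_config):
        # Membership test against the deny-list of multimodal architectures.
        return model_config._architecture in DISABLE_PREFIX_CACHE_MM_MODEL

    class FakeModelConfig:  # hypothetical stand-in, for illustration only
        def __init__(self, architecture: str):
            self._architecture = architecture

    assert is_mm_model_disable_prefix_cache(FakeModelConfig("Ernie5ForCausalLM"))
    assert not is_mm_model_disable_prefix_cache(FakeModelConfig("Qwen2ForCausalLM"))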
27 changes: 27 additions & 0 deletions fastdeploy/entrypoints/engine_client.py
@@ -86,6 +86,13 @@ def __init__(
         self.enable_splitwise = splitwise_role != "mixed"
         max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
 
+        if self.enable_mm and self.enable_prefix_caching:
+            from fastdeploy.cache_manager.cache_data import (
+                is_mm_model_disable_prefix_cache,
+            )
+
+            self.disable_prefix_mm = is_mm_model_disable_prefix_cache(model_config)
+
         if tensor_parallel_size <= max_chips_per_node:
             self.is_master = True
         else:
@@ -152,6 +159,16 @@ async def format_and_add_data(self, prompts: dict):
         await self.add_requests(prompts)
         return prompts["prompt_token_ids"]
 
+    def _check_mm_disable_prefix_cache(self, task):
+        is_multimodal_data = False
+        if self.disable_prefix_mm:
+            multimodal_inputs = task.get("multimodal_inputs", [])
+            if multimodal_inputs:
+                token_type_ids = multimodal_inputs.get("token_type_ids", [])
+                if token_type_ids:
+                    is_multimodal_data = np.sum(token_type_ids) > 0
+        return is_multimodal_data
+
     async def add_requests(self, task):
         """
         Add a new request to the queue.
@@ -174,6 +191,16 @@ async def add_requests(self, task):
         else:
             self.data_processor.process_request_dict(task, self.max_model_len)
 
+        if self.enable_mm and self.enable_prefix_caching:
+            if self._check_mm_disable_prefix_cache(task):
+                api_server_logger.error(
+                    "The current service does not support requests containing multimodal data while prefix caching is enabled. Please send text-only requests or disable prefix caching."
+                )
+                raise EngineError(
+                    "The current service does not support requests containing multimodal data while prefix caching is enabled. Please send text-only requests or disable prefix caching.",
+                    error_code=400,
+                )
+
         task["prompt_token_ids_len"] = len(task["prompt_token_ids"])
         input_ids_len = task["prompt_token_ids_len"]
         task["max_tokens"] = min(self.max_model_len - input_ids_len, task.get("max_tokens"))
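The detection in _check_mm_disable_prefix_cache hinges on token_type_ids. Assuming the usual convention (not stated in the diff) that text tokens carry type id 0 and image or video tokens a nonzero id, a nonzero sum means the request carries real multimodal data. A standalone sketch of the same logic:

    import numpy as np

    def contains_multimodal_data(task: dict) -> bool:
        # Mirrors the check above: a request counts as multimodal only if
        # token_type_ids contains at least one nonzero entry.
        multimodal_inputs = task.get("multimodal_inputs", {})
        if not multimodal_inputs:
            return False
        token_type_ids = multimodal_inputs.get("token_type_ids", [])
        return bool(token_type_ids and np.sum(token_type_ids) > 0)

    assert not contains_multimodal_data({"multimodal_inputs": {"token_type_ids": [0, 0, 0]}})
    assert contains_multimodal_data({"multimodal_inputs": {"token_type_ids": [0, 1, 1]}})

Text-only requests pass through untouched; only requests with actual non-text tokens hit the 400 error raised in add_requests above.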
2 changes: 1 addition & 1 deletion fastdeploy/output/token_processor.py
@@ -258,7 +258,7 @@ def _process_batch_output_use_zmq(self, receive_datas):
             if self.tokens_counter[task_id] == 0:
                 if task.messages is not None:
                     result.prompt = task.messages
-            result.num_cached_tokens = task.num_cached_tokens
+                result.num_cached_tokens = task.num_cached_tokens
 
             is_prefill = task.disaggregate_info is not None and task.disaggregate_info["role"] == "prefill"
             result = self._process_per_token(task, i, token_ids, result, is_prefill)
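Rendered without markers, this one-line change is easy to misread: it appears to re-indent the assignment so that num_cached_tokens is copied onto the result only while tokens_counter[task_id] == 0, i.e. on the first streamed chunk of each request. A toy sketch of the effect, under that reading of the diff:

    def attach_cached_token_count(chunks_emitted: int, result: dict, num_cached_tokens: int) -> dict:
        # Metadata is attached once, on the first chunk, instead of on every chunk.
        if chunks_emitted == 0:
            result["num_cached_tokens"] = num_cached_tokens
        return result

    assert attach_cached_token_count(0, {}, 64) == {"num_cached_tokens": 64}
    assert attach_cached_token_count(3, {}, 64) == {}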
10 changes: 7 additions & 3 deletions tests/entrypoints/test_chat.py
@@ -27,11 +27,13 @@
 class TestChat(unittest.TestCase):
     """Test case for chat functionality"""
 
+    COMMON_PREFIX = "I am a highly capable, compassionate, and trustworthy AI assistant dedicated to providing you with exceptional support. Whatever questions or challenges you may have, I will utilize my full capabilities to offer thoughtful and comprehensive assistance. As your intelligent companion, I consistently maintain honesty, transparency, and patience to ensure our interactions are both productive and enjoyable."
+
     PROMPTS = [
-        [{"content": "The color of tomato is ", "role": "user"}],
-        [{"content": "The equation 2+3= ", "role": "user"}],
-        [{"content": "The equation 4-1= ", "role": "user"}],
         [{"content": "PaddlePaddle is ", "role": "user"}],
+        [{"content": COMMON_PREFIX + "The color of tomato is ", "role": "user"}],
+        [{"content": COMMON_PREFIX + "The equation 2+3= ", "role": "user"}],
+        [{"content": COMMON_PREFIX + "The equation 4-1= ", "role": "user"}],
     ]

     @classmethod

@@ -58,6 +60,8 @@ def tearDownClass(cls):
     def test_chat(self):
         outputs = self.llm.chat(messages=self.PROMPTS, sampling_params=None)
         self.assertEqual(len(self.PROMPTS), len(outputs))
+        self.assertEqual(outputs[-1].num_cached_tokens, outputs[-2].num_cached_tokens)
+        self.assertEqual(outputs[-1].num_cached_tokens, 64)
 
     def test_chat_with_tools(self):
         """Test chat with tools:
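The new assertions pin down the prefix-cache behavior: the last two prompts share COMMON_PREFIX, so they must report the same num_cached_tokens, and the expected value of 64 is consistent with block-granular reuse, where the reusable part of a shared prefix is rounded down to a whole KV-cache block. A block size of 64 tokens is an assumption inferred from the test rather than stated in it. A sketch of that arithmetic:

    def expected_cached_tokens(shared_prefix_len: int, block_size: int = 64) -> int:
        # Only whole blocks of the shared prefix can be reused from the cache.
        return (shared_prefix_len // block_size) * block_size

    assert expected_cached_tokens(70) == 64
    assert expected_cached_tokens(128) == 128
    assert expected_cached_tokens(63) == 0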