Skip to content

Commit e8030d0

Browse files
author
ltd0924
committed
fix
1 parent 535989f commit e8030d0

File tree

4 files changed: +9 additions, −5 deletions

fastdeploy/cache_manager/cache_messager.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,7 @@ def main():
775775
key_cache_shape_list[2],
776776
key_cache_shape_list[3],
777777
]
778+
value_cache_shape = []
778779
if value_cache_shape_list:
779780
value_cache_shape = [
780781
num_gpu_blocks,

fastdeploy/cache_manager/cache_transfer_manager.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ def _init_gpu_cache(self, args):
215215
self.key_cache_shape[2],
216216
self.key_cache_shape[3],
217217
]
218+
value_cache_shape = []
218219
if self.value_cache_shape:
219220
value_cache_shape = [
220221
num_gpu_blocks,
@@ -257,9 +258,9 @@ def _init_gpu_cache(self, args):
257258
logger.info(f"[rank {self.rank}/{self.n_ranks}] done init cache (full) gmem alloc : {memory_allocated()}")
258259

259260
def _init_cpu_cache(self, args):
260-
key_cache_size = args.key_cache_shape[1] * args.key_cache_shape[2] * args.key_cache_shape[3]
261+
key_cache_size = self.key_cache_shape[1] * self.key_cache_shape[2] * self.key_cache_shape[3]
261262
if args.value_cache_shape:
262-
value_cache_size = args.value_cache_shape[1] * args.value_cache_shape[2] * args.value_cache_shape[3]
263+
value_cache_size = self.value_cache_shape[1] * self.value_cache_shape[2] * self.value_cache_shape[3]
263264
else:
264265
value_cache_size = 0
265266
if args.cache_dtype == "bfloat16":
@@ -270,7 +271,9 @@ def _init_cpu_cache(self, args):
270271
raise ValueError(f"Unsupported cache dtype: {args.cache_dtype}")
271272
key_need_to_allocate_bytes = args.num_cpu_blocks * cache_bytes * key_cache_size
272273
value_need_to_allocate_bytes = args.num_cpu_blocks * cache_bytes * value_cache_size
273-
# logger.info(f"[rank {self.rank}/{self.n_ranks}] ..swap space size : { / 1024 ** 3:.2f}GB")
274+
logger.info(
275+
f"[rank {self.rank}/{self.n_ranks}] ..swap space size : {(key_need_to_allocate_bytes + value_need_to_allocate_bytes) / 1024 ** 3:.2f}GB"
276+
)
274277
if args.num_cpu_blocks == 0:
275278
logger.info(f"[rank {self.rank}/{self.n_ranks}] 💡 no swap space (cpu cache) is specified.")
276279
self.swap_space_ready_signal.value[self.rank] = 1

fastdeploy/demo/offline_disaggregated_demo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
from fastdeploy.entrypoints.llm import LLM
2222

23-
model_name_or_path = "/root/PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
23+
model_name_or_path = "baidu/ERNIE-4.5-0.3B-Paddle"
2424

2525

2626
def start_decode(model_name_or_path):

fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ def get_kv_cache_shape(
194194
"""
195195
Calculate kv cache shape
196196
"""
197-
key_cache_shape = value_cache_shape = [max_num_blocks, self.kv_num_heads, self.block_size, self.head_dim]
197+
key_cache_shape = value_cache_shape = [max_num_blocks, self.num_kv_heads, self.block_size, self.head_dim]
198198
return key_cache_shape, value_cache_shape
199199

200200
def transpose(self, hidden_states):

0 commit comments

Comments (0)