
Commit 18064c6

niushengxiao committed
fix: multi tp support
1 parent ae220d7 commit 18064c6

File tree

3 files changed: +45 −31 lines changed


lightllm/server/multi_level_kv_cache/cpu_cache_client.py

Lines changed: 5 additions & 4 deletions
@@ -213,17 +213,18 @@ def _create_shm_cpu_kv_cache(self):
 
     def _attach_shm_cpu_kv_cache(self):
         shm_ptr = attach_shm_kv_cache_ptr()
-        device_ptr = register_shm_ptr_to_pin(shm_ptr=shm_ptr, size=self.kv_cache_tensor_meta.calcu_size())
+        register_shm_ptr_to_pin(shm_ptr=shm_ptr, size=self.kv_cache_tensor_meta.calcu_size())
+        numpy_array = np.frombuffer(
+            memoryview((ctypes.c_uint8 * self.kv_cache_tensor_meta.calcu_size()).from_address(shm_ptr)), dtype=np.uint8
+        )
         shape = (
             self.kv_cache_tensor_meta.page_num,
             self.kv_cache_tensor_meta.layer_num,
             self.kv_cache_tensor_meta.token_page_size,
             self.kv_cache_tensor_meta.num_heads,
             self.kv_cache_tensor_meta.head_dim,
         )
-        self.cpu_kv_cache_tensor = torch.empty(size=shape, dtype=torch.bfloat16, device="meta")
-        # Bind the pointer to the tensor so that triton can obtain the real address.
-        self.cpu_kv_cache_tensor.data_ptr = lambda: device_ptr
+        self.cpu_kv_cache_tensor = torch.from_numpy(numpy_array).view(dtype=torch.bfloat16).view(shape)
         return
 
 
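For readers following the change: instead of binding a pinned device pointer to a meta tensor, the new code maps the shared-memory region directly into a regular CPU tensor. Below is a minimal, self-contained sketch of that mapping pattern; the ctypes buffer, address, and toy dimensions are stand-ins for the region returned by attach_shm_kv_cache_ptr() and the real cache geometry.

# Sketch of the zero-copy shm -> numpy -> torch mapping used above.
import ctypes

import numpy as np
import torch

page_num, layer_num, token_page_size, num_heads, head_dim = 2, 2, 4, 2, 8
size_bytes = page_num * layer_num * token_page_size * num_heads * head_dim * 2  # bfloat16 = 2 bytes

buf = (ctypes.c_uint8 * size_bytes)()  # stand-in for the shared-memory region
addr = ctypes.addressof(buf)           # stand-in for shm_ptr

# Wrap the raw memory without copying: ctypes view -> numpy uint8 -> torch tensor.
numpy_array = np.frombuffer(
    memoryview((ctypes.c_uint8 * size_bytes).from_address(addr)), dtype=np.uint8
)
cpu_kv_cache_tensor = (
    torch.from_numpy(numpy_array)
    .view(dtype=torch.bfloat16)
    .view(page_num, layer_num, token_page_size, num_heads, head_dim)
)

# Writes through the tensor land in the original buffer (the shm region in the commit).
cpu_kv_cache_tensor.zero_()
print(cpu_kv_cache_tensor.shape, cpu_kv_cache_tensor.dtype)

Because the tensor now owns a real data pointer into the mapped region, downstream kernels can use data_ptr() directly, without the lambda override the old code needed.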

lightllm/server/router/model_infer/mode_backend/multi_level_kv_cache.py

Lines changed: 39 additions & 26 deletions
@@ -11,6 +11,7 @@
 from lightllm.common.basemodel.triton_kernel.kv_cache_offload import offload_gpu_kv_to_cpu, load_cpu_kv_to_gpu
 from lightllm.server.router.model_infer.infer_batch import g_infer_context
 from lightllm.utils.log_utils import init_logger
+from lightllm.utils.infer_utils import mark_start, mark_end
 
 logger = init_logger(__name__)
 
@@ -84,9 +85,15 @@ def handle_finished_reqs(self, finished_reqs: List[InferReq]) -> List[InferReq]:
             else:
                 assert req.cpu_cache_task_status.is_not_started()
                 # Kick off the task that offloads this request's kv cache to the cpu cache
+                # if self.backend.is_master_in_dp:
+                #     mark_start("blueswhen offload_kv_to_cpu")
+                torch.cuda.synchronize()
                 trans_task = self._start_kv_cache_offload_task(
                     req=req, cpu_kv_cache_stream=g_infer_context.get_cpu_kv_cache_stream()
                 )
+                torch.cuda.synchronize()
+                # if self.backend.is_master_in_dp:
+                #     mark_end("blueswhen offload_kv_to_cpu")
 
                 if trans_task is not None:
                     self.cpu_cache_handle_queue.append(trans_task)
@@ -101,44 +108,51 @@ def _start_kv_cache_offload_task(
         self, req: InferReq, cpu_kv_cache_stream: torch.cuda.Stream
     ) -> Optional["TransTask"]:
         with torch.cuda.stream(cpu_kv_cache_stream):
-            # Recompute the hash over the full sequence instead of only the input
-            all_token_hash_list = self._compute_full_sequence_hash(req)
-            block_size = req.cur_kv_len // self.args.cpu_cache_token_page_size
-            move_block_size = min(block_size, len(all_token_hash_list))
-            if move_block_size == 0:
-                req.cpu_cache_task_status = InferReq._CpuCacheTaskStatus.FINISHED
-                return None
             if self.backend.is_master_in_dp:
-                self.cpu_cache_client.lock.acquire_sleep1ms()
-                page_list, ready_list = self.cpu_cache_client.allocate_pages(
-                    all_token_hash_list[:move_block_size],
-                    disk_offload_enable=self.args.enable_disk_cache,
-                )
-                self.cpu_cache_client.lock.release()
+                all_token_hash_list = self._compute_full_sequence_hash(req)
+                block_size = req.cur_kv_len // self.args.cpu_cache_token_page_size
+                move_block_size = min(block_size, len(all_token_hash_list))
+
+                if move_block_size == 0:
+                    dist.broadcast_object_list([0], group=self.gloo_group, group_src=0)
+                    req.cpu_cache_task_status = InferReq._CpuCacheTaskStatus.FINISHED
+                    return None
+
+                try:
+                    self.cpu_cache_client.lock.acquire_sleep1ms()
+                    page_list, ready_list = self.cpu_cache_client.allocate_pages(
+                        all_token_hash_list[:move_block_size],
+                        disk_offload_enable=self.args.enable_disk_cache,
+                    )
+                finally:
+                    self.cpu_cache_client.lock.release()
+
                 item_size = len(page_list)
-                dist.broadcast_object_list([item_size], group=self.gloo_group, group_src=0)
                 if item_size == 0:
+                    dist.broadcast_object_list([0], group=self.gloo_group, group_src=0)
                     req.cpu_cache_task_status = InferReq._CpuCacheTaskStatus.FINISHED
                     return None
-                dist.broadcast_object_list(page_list, group=self.gloo_group, group_src=0)
-                dist.broadcast_object_list(ready_list, group=self.gloo_group, group_src=0)
+
+                broadcast_data = {
+                    'item_size': item_size,
+                    'page_list': page_list,
+                    'ready_list': ready_list
+                }
+                dist.broadcast_object_list([broadcast_data], group=self.gloo_group, group_src=0)
             else:
                 recv_list = [None]
                 dist.broadcast_object_list(recv_list, group=self.gloo_group, group_src=0)
-                item_size = recv_list[0]
-                if item_size == 0:
+                if isinstance(recv_list[0], int) and recv_list[0] == 0:
                     req.cpu_cache_task_status = InferReq._CpuCacheTaskStatus.FINISHED
                     return None
-                page_list = [None] * item_size
-                ready_list = [None] * item_size
-                dist.broadcast_object_list(page_list, group=self.gloo_group, group_src=0)
-                dist.broadcast_object_list(ready_list, group=self.gloo_group, group_src=0)
+                broadcast_data = recv_list[0]
+                item_size = broadcast_data['item_size']
+                page_list = broadcast_data['page_list']
+                ready_list = broadcast_data['ready_list']
 
             page_indexes = torch.tensor(page_list, dtype=torch.int32, device="cpu", pin_memory=True)
             page_readies = torch.tensor(ready_list, dtype=torch.bool, device="cpu", pin_memory=True)
-
             token_indexes = self.backend.model.req_manager.req_to_token_indexs[req.req_idx, 0 : req.cur_kv_len]
-
             offload_gpu_kv_to_cpu(
                 token_indexes=token_indexes,
                 gpu_kv_cache=self.backend.model.mem_manager.kv_buffer,
@@ -147,8 +161,7 @@ def _start_kv_cache_offload_task(
                 page_readies=page_readies,
             )
 
-            # Use an allreduce and a sync_event to make sure every gpu worker has finished writing the cpu kv cache.
-            dist.all_reduce(tensor=self.sync_tensor, group=self.sync_group, async_op=False)
+            # dist.all_reduce(tensor=self.sync_tensor, group=self.sync_group, async_op=False)
             sync_event = torch.cuda.Event()
             sync_event.record()
             req.cpu_cache_task_status = InferReq._CpuCacheTaskStatus.RUNNING
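This file consolidates the master/worker hand-off: rank 0 now computes the hashes, allocates cpu cache pages, packs item_size, page_list, and ready_list into one dict, and issues a single broadcast_object_list call, while the no-work case is signalled by broadcasting [0]. Below is a runnable two-process sketch of that pattern; the process-group setup, port, and payload contents are illustrative, not the commit's actual objects or group handles.

# Sketch: one object broadcast replaces the earlier item_size/page_list/ready_list sequence.
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank: int, world_size: int):
    dist.init_process_group(
        backend="gloo",
        init_method="tcp://127.0.0.1:29511",
        rank=rank,
        world_size=world_size,
    )

    if rank == 0:
        page_list, ready_list = [3, 7, 9], [True, False, True]  # pretend allocation result
        payload = {"item_size": len(page_list), "page_list": page_list, "ready_list": ready_list}
        recv = [payload]
    else:
        recv = [None]

    # Every rank makes exactly one broadcast call; rank 0 sends, the others receive.
    dist.broadcast_object_list(recv, src=0)

    data = recv[0]
    print(f"rank {rank}: got {data['item_size']} pages -> {data['page_list']}")
    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = 2
    mp.spawn(worker, args=(world_size,), nprocs=world_size)

Collapsing three broadcasts into one cuts the blocking gloo round-trips per offload task, and the isinstance check on the receiving ranks keeps the zero-work signal ([0]) unambiguous from the dict payload.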

test/benchmark/service/benchmark_qps.py

Lines changed: 1 addition & 1 deletion
@@ -342,7 +342,7 @@ def main():
 
     assert args.tokenizer_path is not None
     model_name.append(args.tokenizer_path)
-    # seed_all(args.seed)
+    seed_all(args.seed)
     url = args.url
     tokenizer = get_tokenizer(args.tokenizer_path)
     if args.data_path is not None:
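The benchmark change simply re-enables seeding so request sampling is reproducible across runs. For context, a seed_all-style helper typically seeds every RNG the client touches; the snippet below is only a sketch of that idea, not lightllm's actual seed_all implementation.

# Illustrative seed_all-style helper (assumed shape, not the repo's real code).
import random

import numpy as np
import torch


def seed_all(seed: int) -> None:
    # Seed the Python, NumPy, and torch RNGs so sampled prompts,
    # shuffles, and any tensor randomness repeat across benchmark runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


seed_all(42)
print(random.random(), np.random.rand(), torch.rand(1).item())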

0 commit comments
