support new backend cambricon (#3002)
* [dlinfer] add camb support
* [camb] fix multiple of 8; exp raises a core dump
* [camb] fix multiple of 8; exp raises a core dump
* [camb] format
* [camb] pow of 2 better
* [camb] rm local_adapterids
* [camb] modify graph runner
* [camb] mock graph runner
* [camb] add requirements.txt
* [camb] post init set block_size to 16
* lint
Showing 11 changed files with 182 additions and 4 deletions.
Modified file:

```diff
@@ -1,3 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .ascend import AscendOpsBackend  # noqa: F401
+from .camb import CambOpsBackend  # noqa: F401
 from .maca import MacaOpsBackend  # noqa: F401
```
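This hunk registers the Cambricon backend next to the Ascend and Maca ones. The file path did not survive this page capture, but the relative imports suggest it is the dlinfer backends package's `__init__.py`. Under that assumption, a quick import check would look like:

```python
# Assumes the hunk above edits lmdeploy/pytorch/backends/dlinfer/__init__.py;
# that path is inferred, not shown in this capture.
from lmdeploy.pytorch.backends.dlinfer import CambOpsBackend

assert CambOpsBackend.get_name() == 'camb'
```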
New file (2 added lines):

```python
# Copyright (c) OpenMMLab. All rights reserved.
from .op_backend import CambOpsBackend  # noqa: F401
```
New file (132 added lines):

```python
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple

import torch

from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig
from lmdeploy.utils import get_logger

from ..op_backend import DlinferOpsBackend

logger = get_logger('lmdeploy')


class CambOpsBackend(DlinferOpsBackend):
    """camb layer backend."""
    total_slots = None

    @staticmethod
    def get_name() -> str:
        """backend name."""
        return 'camb'

    @staticmethod
    def get_k_block_shape(
        block_size: int,
        num_heads: int,
        head_size: int,
        dtype: torch.dtype,
    ) -> Tuple[int, ...]:
        return (
            num_heads,
            block_size,
            head_size,
        )

    @staticmethod
    def get_v_block_shape(
        block_size: int,
        num_heads: int,
        head_size: int,
        dtype: torch.dtype,
    ) -> Tuple[int, ...]:
        return (
            num_heads,
            block_size,
            head_size,
        )

    @classmethod
    def update_step_context(cls, step_context):
        """update step context."""

        def get_total_slots():
            if cls.total_slots is None:
                cls.total_slots = torch.arange(
                    block_num * block_size,
                    dtype=torch.int32,
                    device=step_context.block_offsets.device)
                cls.total_slots = cls.total_slots.view(block_num, block_size)
            return cls.total_slots

        kv_start_indices = []
        block_num, _, block_size, _ = step_context.kv_caches[0][0].shape

        is_unpaged_prefill = False
        q_start_loc = step_context.q_start_loc
        q_seqlens = step_context.q_seqlens
        kv_seqlens = step_context.kv_seqlens.to(torch.int32)
        block_offsets = step_context.block_offsets.to(torch.int32)
        max_q_seq_len = torch.max(q_seqlens).cpu().item()
        max_kv_seq_len = torch.max(kv_seqlens).cpu().item()

        cu_seqlens = torch.cat(
            (q_start_loc, q_seqlens.sum().unsqueeze(0))).int()
        cu_seq_lens_kv = None

        q_seqlens_list = step_context.q_seqlens.tolist()
        kv_seqlens_list = step_context.kv_seqlens.tolist()
        if not step_context.is_decoding:
            is_unpaged_prefill = q_seqlens_list == kv_seqlens_list
            # get kv_indices
            for i in range(q_start_loc.size(0)):
                q_seq_len = q_seqlens_list[i]
                kv_seq_len = kv_seqlens_list[i]
                # collect kv start indices.
                history_length = kv_seq_len - q_seq_len
                total_slots = get_total_slots()
                slot_tables = total_slots[block_offsets[i]].view(-1)
                slots = slot_tables[history_length:kv_seq_len]
                kv_start_indices.append(slots)
            kv_start_indices = torch.cat(kv_start_indices)
            if not is_unpaged_prefill:
                cu_seq_lens_kv = torch.cat(
                    (torch.tensor([0], device=kv_seqlens.device),
                     kv_seqlens.cumsum(0))).int()
        else:
            # collect kv_start_indices without using a for-loop,
            # (fill kv-cache for just ONE token during the decoding phase)
            idx = (step_context.kv_seqlens - 1) % block_size
            block_num = (step_context.kv_seqlens - 1) // block_size
            last_block = block_offsets.gather(  # dtype of gather must be int64
                1, block_num.view(-1, 1)).view(-1)
            kv_start_indices = (last_block * block_size + idx).to(torch.int32)

        attn_meta_cls = cls.get_attention_metadata_cls()
        attn_metadata = attn_meta_cls(
            step_context.is_decoding,
            block_offsets,
            q_start_loc=cu_seqlens,
            cu_seq_lens_kv=cu_seq_lens_kv,
            q_seqlens=q_seqlens,
            kv_seqlens=kv_seqlens,
            kv_start_indices=kv_start_indices,
            block_size=block_size,
            attention_mask=None,
            is_unpaged_prefill=is_unpaged_prefill,
            max_q_seq_len=max_q_seq_len,
            max_kv_seq_len=max_kv_seq_len,
        )

        step_context.attn_metadata = attn_metadata
        return step_context

    @staticmethod
    def build_graph_runner(model: torch.nn.Module, model_config: ModelConfig,
                           cache_config: CacheConfig,
                           backend_config: BackendConfig,
                           device: torch.device):
        """build graph runner."""
        from lmdeploy.pytorch.backends.cuda.graph_runner import CUDAGraphRunner
        return CUDAGraphRunner(model, model_config, cache_config,
                               backend_config, device)
```
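In the decoding branch above, each running sequence writes exactly one new token into the paged KV cache, so the destination slot is computed with pure tensor arithmetic instead of the per-sequence loop used for prefill. The following standalone sketch (toy values, not part of the commit) reproduces that indexing:

```python
import torch

# Toy setup: 3 sequences, block_size 16 (matching the commit's
# post-init default), each sequence owning two physical cache blocks.
block_size = 16
kv_seqlens = torch.tensor([1, 17, 32])     # lengths including the new token
block_offsets = torch.tensor([[0, 1],      # per-sequence block tables
                              [2, 3],
                              [4, 5]], dtype=torch.int32)

# Same arithmetic as the decoding branch of update_step_context:
idx = (kv_seqlens - 1) % block_size        # offset inside the last block
block_num = (kv_seqlens - 1) // block_size # logical index of the last block
last_block = block_offsets.gather(         # gather index must be int64
    1, block_num.view(-1, 1)).view(-1)
kv_start_indices = (last_block * block_size + idx).to(torch.int32)

print(kv_start_indices)  # tensor([ 0, 48, 95], dtype=torch.int32)
```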
Modified file:

```diff
@@ -5,6 +5,7 @@
     'ascend',
     'npu',
     'maca',
+    'camb',
 ]
```
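With `'camb'` added to the recognized device list, the new backend can be selected the same way as the other dlinfer devices. A minimal usage sketch, assuming the standard lmdeploy pipeline API (the model path is only an example):

```python
from lmdeploy import pipeline, PytorchEngineConfig

# 'camb' now joins 'ascend', 'npu' and 'maca' as a valid device_type.
pipe = pipeline('internlm/internlm2_5-7b-chat',
                backend_config=PytorchEngineConfig(device_type='camb'))
print(pipe(['Hi, please introduce yourself.']))
```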
New file (21 added lines):

```
accelerate==1.2.0
einops
fastapi
fire
mmengine-lite
numpy<2.0.0
openai
outlines<0.1.0
peft<=0.11.1
pillow
protobuf
pydantic>2.0.0
pynvml
safetensors
sentencepiece
shortuuid
tiktoken
torch==2.4.0
torchvision<=0.19.0,>=0.15.0
transformers
uvicorn
```
New file (4 added lines):

```
-r requirements/build.txt
-r requirements/runtime_camb.txt
-r requirements/lite.txt
-r requirements/serve.txt
```