10 changes: 9 additions & 1 deletion vllm_ascend/patch/__init__.py
@@ -87,7 +87,15 @@
# Future Plan:
# It's a workaround in vllm-ascend to enable multi-node dp inference; it may be removed once vLLM has a better
# plan for multi-node dp inference
#
# 4. `ParallelConfig.stateless_init_dp_group`
# Why:
# vLLM uses the gloo backend by default to initialize the stateless dp process group, but we want to use hccl
# here to get better performance
# How:
# Adopt the hccl backend to initialize the process group
# Related PR (if no, explain why): no related PR; we want to add this ability to vLLM
# Future Plan:
# Remove this patch once vLLM merges it
# * Worker Patch:
# ===============
# ** File: worker/patch_0_8_4/patch_metrics.py **
15 changes: 15 additions & 0 deletions vllm_ascend/patch/platform/patch_common/patch_distributed.py
@@ -152,6 +152,21 @@ def parallel_config_get_dp_port(self) -> int:
return port


def ascend_stateless_init_dp_group(self) -> "ProcessGroup":
from vllm.distributed.utils import \
stateless_init_torch_distributed_process_group

dp_group = stateless_init_torch_distributed_process_group(
self.data_parallel_master_ip,
self.get_next_dp_init_port(),
self.data_parallel_rank,
self.data_parallel_size,
backend="hccl")

return dp_group


vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel
vllm.distributed.stateless_init_torch_distributed_process_group = ascend_stateless_init_torch_distributed_process_group
ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port
ParallelConfig.stateless_init_dp_group = ascend_stateless_init_dp_group
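
For reference, below is a minimal usage sketch (not part of the PR) of how the patched `ParallelConfig.stateless_init_dp_group` would be exercised once this patch module is imported. The `ParallelConfig` construction shown is an assumption; the field names follow the attributes referenced in `ascend_stateless_init_dp_group` (`data_parallel_master_ip`, `data_parallel_rank`, `data_parallel_size`), and actually running it requires an Ascend environment with HCCL available.

# Usage sketch (assumption, not from the PR): importing the patch module applies
# the monkeypatches, after which stateless_init_dp_group builds the DP group over hccl.
import vllm_ascend.patch.platform.patch_common.patch_distributed  # noqa: F401

from vllm.config import ParallelConfig

# Hypothetical single-rank setup; the exact ParallelConfig constructor arguments
# may differ across vLLM versions.
parallel_config = ParallelConfig(data_parallel_size=1)
parallel_config.data_parallel_master_ip = "127.0.0.1"
parallel_config.data_parallel_rank = 0

# With the patch applied, this initializes the stateless DP process group with
# backend="hccl" instead of vLLM's default gloo backend.
dp_group = parallel_config.stateless_init_dp_group()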