2 changes: 1 addition & 1 deletion vllm_ascend/ascend_forward_context.py
@@ -114,7 +114,7 @@ def set_ascend_forward_context(
     # the performance may degrade due to the switching of communication methods.
     if is_moe_model(vllm_config):
         sp_enabled = enable_sp(vllm_config) and \
-            tp_world_size > 1
+            tp_world_size > 1 and num_tokens is not None
     else:
         sp_enabled = enable_sp(vllm_config) and \
             tp_world_size > 1 and \
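An illustrative check of the new MoE-branch condition (a minimal sketch with made-up values; moe_sp_enabled is a hypothetical helper, not part of the PR):

def moe_sp_enabled(sp_cfg_on, tp_world_size, num_tokens):
    # Mirrors the new condition: SP stays off until a concrete token count is known.
    return sp_cfg_on and tp_world_size > 1 and num_tokens is not None

assert moe_sp_enabled(True, 8, 1024) is True
assert moe_sp_enabled(True, 8, None) is False
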
24 changes: 15 additions & 9 deletions vllm_ascend/utils.py
@@ -56,6 +56,7 @@
 _DEFAULT_BUFFER_SIZE = 200
 _MIN_DP_BUFFER_SIZE = 50
 _IS_MOE_MODEL = None
+_ENABLE_SP = None


 def is_310p():

@@ -605,15 +606,20 @@ def dense_optim_enable() -> bool:


 def enable_sp(vllm_config=None) -> bool:
-    if vllm_config is None:
-        from vllm.config import get_current_vllm_config
-        vllm_config = get_current_vllm_config()
-    return (
-        vllm_config.compilation_config.pass_config.enable_sequence_parallelism
-        or envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM1
-        # Flash comm 1 should be enabled by env VLLM_ASCEND_ENABLE_FLASHCOMM1
-        # We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility.
-        or bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM", '0'))))
+    global _ENABLE_SP
Collaborator:
Caching _ENABLE_SP is a good idea, but it will also print a lot of logs when both _ENABLE_SP and vllm_config are None. Maybe we could assert that vllm_config is not None when _ENABLE_SP is None, to remind developers to pass in this parameter.
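A minimal sketch of the suggested guard (hypothetical; not part of this PR):

def enable_sp(vllm_config=None) -> bool:
    global _ENABLE_SP
    if _ENABLE_SP is None:
        # Fail fast instead of silently falling back to
        # get_current_vllm_config() and emitting its warning logs.
        assert vllm_config is not None, \
            "enable_sp() needs vllm_config on its first call"
        _ENABLE_SP = ...  # same flag computation as in the diff below
    return _ENABLE_SP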

@realliujiaxu (Collaborator, Author), Oct 16, 2025:
The first call of enable_sp() happens in linear_op when initializing the model, which runs inside the set_current_vllm_config context, so vllm_config can safely be obtained from get_current_vllm_config while _ENABLE_SP is None.
[screenshot of the call site omitted]
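A rough sketch of the initialization flow described above (initialize_model is an illustrative stand-in for vLLM's model construction path; vllm_config is the engine's config object):

from vllm.config import set_current_vllm_config

with set_current_vllm_config(vllm_config):
    # Model layers (including linear_op) are built inside this context, so the
    # first enable_sp() call can fall back to get_current_vllm_config() and
    # still receive a valid config without warning logs.
    model = initialize_model()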

+    if _ENABLE_SP is None:
+        if vllm_config is None:
+            from vllm.config import get_current_vllm_config
+            vllm_config = get_current_vllm_config()
+        _ENABLE_SP = (
+            vllm_config.compilation_config.pass_config.
+            enable_sequence_parallelism
+            or envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM1
+            # Flash comm 1 should be enabled by env VLLM_ASCEND_ENABLE_FLASHCOMM1
+            # We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility.
+            or bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM", '0'))))
+
+    return _ENABLE_SP
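With the cache in place, only the first call needs to resolve a config; a minimal usage sketch:

enable_sp(vllm_config)  # first call: computes the flag and caches it in _ENABLE_SP
enable_sp()             # later calls: return the cached value with no config lookup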
realliujiaxu marked this conversation as resolved.


 # TODO remove it after vllm has this func
2 changes: 1 addition & 1 deletion vllm_ascend/worker/model_runner_v1.py
@@ -810,7 +810,7 @@ def _sync_metadata_across_dp(
         # Create a tensor for num_tokens_after_padding
         num_tokens_after_padding = torch.tensor([max_tokens_across_dp] *
                                                 self.dp_size,
-                                                device="npu",
+                                                device="cpu",
                                                 dtype=torch.int32)

         return max_tokens_across_dp, num_tokens_after_padding, global_with_prefill, global_enable_dbo
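
For reference, a standalone reproduction of the changed allocation (dp_size and max_tokens_across_dp are made-up values):

import torch

dp_size = 2
max_tokens_across_dp = 128
# The per-rank padded token counts now live on the host instead of the NPU.
num_tokens_after_padding = torch.tensor([max_tokens_across_dp] * dp_size,
                                        device="cpu",
                                        dtype=torch.int32)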