Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 10 additions & 12 deletions vllm_ascend/ops/fused_moe/token_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@
from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.ops.fused_moe.comm_utils import (
async_all_to_all, gather_from_sequence_parallel_region)
from vllm_ascend.utils import (AscendDeviceType, get_ascend_device_type,
is_hierarchical_communication_enabled)
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type


@dataclass
Expand Down Expand Up @@ -117,10 +116,6 @@ def __init__(self, **kwargs):
self.need_extra_args = (
get_ascend_device_type() == AscendDeviceType.A3)

# NOTE: When in A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1 and
# HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic and significantly
# improve communication performance.
self.need_expert_scale = is_hierarchical_communication_enabled()
self.with_quant = False

# Here we need to calculate the global_bs = max_bs_per_rank * ep_world_size to execute
Expand Down Expand Up @@ -158,6 +153,7 @@ def get_dispatch_mc2_kwargs(
else:
quant_mode = 0
moe_expert_num = len(expert_map)

kwargs_mc2 = {
"x": hidden_states,
"expert_ids": topk_ids,
Expand All @@ -166,8 +162,12 @@ def get_dispatch_mc2_kwargs(
"moe_expert_num": moe_expert_num,
"global_bs": self.global_bs,
"expert_token_nums_type": 0,
"expert_scales": topk_weights.to(torch.float32),
}
Comment on lines +165 to 166
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The expert_scales parameter should not be added unconditionally, as it is specific to hierarchical communication, which is being enabled for A2 devices. It should be moved into the conditional block for A2 devices. Please remove it from this general dictionary initialization.

        }


if get_ascend_device_type() == AscendDeviceType.A2:
kwargs_mc2["comm_alg"] = "hierarchy"
Comment on lines +168 to +169
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

To ensure hierarchical communication is correctly configured only for A2 devices, the expert_scales parameter should be added here, inside the conditional block, along with comm_alg.

        if get_ascend_device_type() == AscendDeviceType.A2:
            kwargs_mc2["comm_alg"] = "hierarchy"
            kwargs_mc2["expert_scales"] = topk_weights.to(torch.float32)


stage1_kwargs = {
"scales": None,
"quant_mode": quant_mode,
Expand All @@ -181,11 +181,6 @@ def get_dispatch_mc2_kwargs(
"tp_world_size": 1,
"tp_rank_id": 0,
})
if self.need_expert_scale:
stage1_kwargs.update({
"expert_scales":
topk_weights.to(torch.float32),
})

kwargs_mc2.update(stage1_kwargs)
return kwargs_mc2
Expand Down Expand Up @@ -263,8 +258,12 @@ def get_combine_mc_kwargs(self, hidden_states: torch.Tensor,
"shared_expert_rank_num": 0,
"moe_expert_num": moe_expert_num,
"global_bs": self.global_bs,
"expand_scales": expand_scales,
}

if get_ascend_device_type() == AscendDeviceType.A2:
kwargs_mc2["comm_alg"] = "hierarchy"

if self.with_quant:
tp_recv_counts = torch.empty(1,
dtype=torch.int32,
Expand All @@ -275,7 +274,6 @@ def get_combine_mc_kwargs(self, hidden_states: torch.Tensor,
"group_ep": self.moe_all_to_all_group_name,
"ep_world_size": self.ep_world_size,
"ep_rank_id": self.ep_rank_id,
"expand_scales": expand_scales,
}

if self.enable_dispatch_v2:
Expand Down
8 changes: 0 additions & 8 deletions vllm_ascend/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,14 +958,6 @@ def calculate_dp_buffer_size() -> int:
return max(dp_buffer_size, _MIN_DP_BUFFER_SIZE)


# Currently, on A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1
# and HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic
# and significantly improve the communication performance of the MC2
# dispatch/combine ops.
def is_hierarchical_communication_enabled() -> bool:
    """Return whether the HCCL environment requests hierarchical communication.

    Returns:
        True only when ``HCCL_INTRA_ROCE_ENABLE`` is exactly ``"0"`` and
        ``HCCL_INTRA_PCIE_ENABLE`` is exactly ``"1"``. An unset variable is
        read as the empty string, so it never matches and the result is
        False.
    """
    return (os.getenv("HCCL_INTRA_ROCE_ENABLE", "") == "0"
            and os.getenv("HCCL_INTRA_PCIE_ENABLE", "") == "1")


def has_layer_idx(model_instance: torch.nn.Module) -> bool:
if model_instance is None:
return False
Expand Down
Loading