4 changes: 2 additions & 2 deletions tests/e2e/singlecard/test_aclgraph_accuracy.py
@@ -36,10 +36,10 @@
         quantization="ascend",
         prompts=PROMPTS_SHORT,
         golden_answers=[
-            '\nI am a 20 year old student from the UK. I am currently studying for a degree in English Literature and Creative Writing. I have a passion',
+            '\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
             ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
             ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
-            ' here.\nThe future of AI is here.\nThe future of AI is here.\nThe future of AI is here.\nThe future of AI is'
+            ' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of'
         ],
     )

26 changes: 13 additions & 13 deletions tests/ut/ops/test_token_dispatcher.py
@@ -164,11 +164,11 @@ def setUp(self):
         self.dispatcher = TokenDispatcherWithAllGather(**kwargs)

         # Mock NPU functions
-        self.patcher_npu_moe_init_routing_v2 = patch(
-            'torch_npu.npu_moe_init_routing_v2')
-        self.mock_npu_moe_init_routing_v2 = self.patcher_npu_moe_init_routing_v2.start(
+        self.patcher_npu_moe_init_routing_custom = patch(
+            'torch.ops._C_ascend.npu_moe_init_routing_custom')
+        self.mock_npu_moe_init_routing_custom = self.patcher_npu_moe_init_routing_custom.start(
         )
-        self.mock_npu_moe_init_routing_v2.return_value = (
+        self.mock_npu_moe_init_routing_custom.return_value = (
             torch.randn(6, 128),  # sorted_hidden_states
             torch.tensor([0, 1, 2, 3, 4, 5]),  # expanded_row_idx
             torch.tensor([0, 1, 0, 1, 0, 1]),  # expanded_expert_idx
@@ -180,7 +180,7 @@ def setUp(self):
         self.mock_npu_moe_token_unpermute.return_value = torch.randn(6, 128)

     def tearDown(self):
-        self.patcher_npu_moe_init_routing_v2.stop()
+        self.patcher_npu_moe_init_routing_custom.stop()
         self.patcher_npu_moe_token_unpermute.stop()

     def test_token_dispatch_without_expert_map(self):
@@ -192,8 +192,8 @@ def test_token_dispatch_without_expert_map(self):
                                                  topk_ids, None)

         # Verify npu_moe_init_routing is called
-        self.mock_npu_moe_init_routing_v2.assert_called_once()
-        args, kwargs = self.mock_npu_moe_init_routing_v2.call_args
+        self.mock_npu_moe_init_routing_custom.assert_called_once()
+        args, kwargs = self.mock_npu_moe_init_routing_custom.call_args

         self.assertEqual(results.group_list_type, 1)
@@ -207,8 +207,8 @@ def test_token_dispatch_with_expert_map(self):
                                                  topk_ids, None)

         # Verify npu_moe_init_routing is called
-        self.mock_npu_moe_init_routing_v2.assert_called_once()
-        args, kwargs = self.mock_npu_moe_init_routing_v2.call_args
+        self.mock_npu_moe_init_routing_custom.assert_called_once()
+        args, kwargs = self.mock_npu_moe_init_routing_custom.call_args

         self.assertEqual(results.group_list_type, 1)

@@ -366,11 +366,11 @@ def setUp(self):
         self.mock_npu_dynamic_quant.return_value = (torch.randn(16, 16),
                                                     torch.randn(16))

-        # Mock torch_npu.npu_moe_init_routing_v2
-        patcher11 = patch('torch_npu.npu_moe_init_routing_v2')
-        self.mock_npu_moe_init_routing_v2 = patcher11.start()
+        # Mock torch.ops._C_ascend.npu_moe_init_routing_custom
+        patcher11 = patch('torch.ops._C_ascend.npu_moe_init_routing_custom')
+        self.mock_npu_moe_init_routing_custom = patcher11.start()
         self.addCleanup(patcher11.stop)
-        self.mock_npu_moe_init_routing_v2.return_value = (torch.randn(
+        self.mock_npu_moe_init_routing_custom.return_value = (torch.randn(
             16, 16), torch.arange(16), None, torch.randn(16))

         # Mock torch.repeat_interleave
2 changes: 1 addition & 1 deletion vllm_ascend/ops/fused_moe/token_dispatcher.py
@@ -354,7 +354,7 @@ def token_dispatch(self,
         global_num_experts = self.num_experts_local

         sorted_hidden_states, expanded_row_idx, expert_tokens, pertoken_scale = (
-            torch_npu.npu_moe_init_routing_v2(
+            torch.ops._C_ascend.npu_moe_init_routing_custom(
Contributor (review comment, severity: critical):

This change enables the npu_moe_init_routing_custom custom operator. However, there appears to be a critical bug in its C++ implementation that will cause incorrect kernel dispatching.

In csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_v2_tiling.h, the GetTilingKey() function returns keys for different scenarios. For example:

  • TILING_KEY_DROPLESS_SORT_ONE_CORE = 10001
  • TILING_KEY_DROPLESS_SORT_MULTI_CORE = 10002
  • TILING_KEY_DROP_PAD_MODE_SORT_ONE_CORE = 10011
  • TILING_KEY_DROP_PAD_MODE_SORT_MULTI_CORE = 10012

However, in csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_quant_v2.cpp, the kernel dispatch logic checks for different, mismatched values:

  • if (tilingKey == 10000 || tilingKey == 10100 || ...) to select MoeV2SortOneCore.
  • else if (tilingKey == 10010 || tilingKey == 10110 || ...) to select MoeV2SortMultiCore.

This mismatch means that in many cases the wrong sorting kernel, or no sorting kernel at all, will be executed. For instance, when GetTilingKey() returns 10001, neither of the if/else if conditions for sorting is met, so the sort step is skipped entirely.

This bug in the underlying C++ implementation must be fixed before the custom operator is enabled, as it is likely to produce incorrect routing results.
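
To make the fall-through concrete, here is a minimal Python model of the dispatch logic described above. The key values are the ones quoted from moe_init_routing_v2_tiling.h, and the conditions mirror the abbreviated if/else if checks quoted from moe_init_routing_quant_v2.cpp; the function name and structure are illustrative only, not the actual C++ code:

# Key values quoted from moe_init_routing_v2_tiling.h (per the comment above).
TILING_KEY_DROPLESS_SORT_ONE_CORE = 10001
TILING_KEY_DROPLESS_SORT_MULTI_CORE = 10002
TILING_KEY_DROP_PAD_MODE_SORT_ONE_CORE = 10011
TILING_KEY_DROP_PAD_MODE_SORT_MULTI_CORE = 10012

def select_sort_kernel(tiling_key: int) -> str:
    # Abbreviated conditions quoted from moe_init_routing_quant_v2.cpp;
    # the real code checks additional keys elided as "..." above.
    if tiling_key in (10000, 10100):
        return "MoeV2SortOneCore"
    if tiling_key in (10010, 10110):
        return "MoeV2SortMultiCore"
    return "no sort kernel selected"

# With only the quoted conditions, every key that GetTilingKey() is documented
# to return falls through, so the sort step would be skipped.
for key in (TILING_KEY_DROPLESS_SORT_ONE_CORE,
            TILING_KEY_DROPLESS_SORT_MULTI_CORE,
            TILING_KEY_DROP_PAD_MODE_SORT_ONE_CORE,
            TILING_KEY_DROP_PAD_MODE_SORT_MULTI_CORE):
    print(key, "->", select_sort_kernel(key))  # all print "no sort kernel selected"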

zzzzwwjj marked this conversation as resolved.
                 hidden_states,
                 topk_ids,
                 scale=pertoken_scale,