Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
689e9f4
[Bugfix] raise runtime error when npumodelrunner init failed
zhenwenqi2024 Jan 4, 2026
b779782
[Bugfix] raise runtime error when npumodelrunner init failed
zhenwenqi2024 Jan 4, 2026
db9a80d
[Bugfix] raise runtime error when npumodelrunner init failed
zhenwenqi2024 Jan 4, 2026
46cc074
[Bugfix] raise runtime error when npumodelrunner init failed
zhenwenqi2024 Jan 4, 2026
742c069
[Bugfix] fix dcp_only bug and add e2e accuracy test for dcp only and …
zhenwenqi2024 Jan 4, 2026
a1b5a43
[Bugfix] fix dcp_only bug and add e2e accuracy test for dcp only and …
zhenwenqi2024 Jan 4, 2026
034afa0
[Bugfix] fix dcp_only bug and add e2e accuracy test for dcp only and …
zhenwenqi2024 Jan 5, 2026
15fb03e
[Bugfix] fix dcp_only bug and add e2e accuracy test for dcp only and …
zhenwenqi2024 Jan 5, 2026
da5070e
[Bugfix] fix dcp_only bug and add e2e accuracy test for dcp only and …
zhenwenqi2024 Jan 5, 2026
e527494
[Bugfix] fix dcp_only bug and add e2e accuracy test for dcp only and …
zhenwenqi2024 Jan 6, 2026
7a70329
Merge branch 'vllm-project:main' into main
zhenwenqi2024 Jan 6, 2026
c74c6f4
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 6, 2026
371a0d1
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 7, 2026
06074d7
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 7, 2026
3c24ce4
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 7, 2026
46473d5
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 7, 2026
a8d01f4
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 7, 2026
10d72d9
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 7, 2026
b01520e
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 7, 2026
0c26c87
[Bugfix] raise runtime error when npumodelrunner init failed
zhenwenqi2024 Jan 7, 2026
cb2b8d3
[Bugfix] raise runtime error when npumodelrunner init failed
zhenwenqi2024 Jan 7, 2026
a8e32dc
Merge remote-tracking branch 'upstream/main'
zhenwenqi2024 Jan 7, 2026
b4fe416
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 7, 2026
076e2c1
Merge branch 'main' of https://github.com/zhenwenqi2024/vllm-ascend
zhenwenqi2024 Jan 7, 2026
a255771
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 7, 2026
ccf8479
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 7, 2026
a9ff5ba
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 7, 2026
b77c1db
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 7, 2026
ddef018
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 8, 2026
b188842
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 8, 2026
5cd44eb
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 8, 2026
837bfbb
[Bugfix]dcp&pcp support mlapo
zhenwenqi2024 Jan 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/source/user_guide/feature_guide/context_parallel.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ To learn more about the theory and implementation details of context parallel, p
Currently context parallel can be used together with most other features, supported features are as follows:
| | Eager | Graph | Prefix <br> Cache | Chunked <br> Prefill | SpecDecode <br> (MTP) | PD <br> disaggregation | MLAPO |
| ------- | ----- | ----- | ------ | ------ | ----- | ----- | ----- |
| **PCP** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
| **DCP** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
| **PCP** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
| **DCP** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |

## How to use Context Parallel
You can enable `PCP` and `DCP` by `prefill_context_parallel_size` and `decode_context_parallel_size`, refer to the following example:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def test_qwen3_next_mtp_acceptance_tp4(model_name):
for num_accepted_tokens in num_accepted_tokens_per_pos
]

match = all(abs(a - b) < 0.05 for a, b in zip(acceptance_per_pos, golden))
match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because magicmtp has been merged (#5542), acceptance_per_pos has increased; this CI test will be fixed by #5332.
This modification is too crude; please wait until PR #5332 is ready.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#5332 doesn't seem to work yet... Let's use this method as a temporary fix to prevent the failure from affecting other PRs.

if not match:
print(f"acceptance_per_pos: {acceptance_per_pos}")
print(f"golden: {golden}")
Expand Down
3 changes: 3 additions & 0 deletions tests/ut/spec_decode/test_mtp_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ def test_prepare_inputs_padded(self, mock_cpu_gpu_buffer):
[0, 8, 16, 24], dtype=torch.int32)
mock_common_attn_metadata.seq_lens = torch.tensor([8, 8, 8],
dtype=torch.int32)
mock_common_attn_metadata.num_actual_tokens = 24
mock_common_attn_metadata.num_reqs = 3
mock_common_attn_metadata.num_computed_tokens_cpu = torch.tensor(
[5, 6, 7], dtype=torch.int32)
Expand All @@ -293,10 +294,12 @@ def test_prepare_inputs_padded(self, mock_cpu_gpu_buffer):
mock_runner.actual_seq_lengths_q = MagicMock()
mock_runner.attn_state = MagicMock()
mock_runner.graph_pad_size = 0
mock_runner.pcp_size = 1
mock_runner.decode_token_per_req = MagicMock()

proposer = MagicMock(spec=MtpProposer)
proposer.runner = mock_runner
proposer.pcp_size = 1
proposer.arange = torch.arange(100, dtype=torch.int32)
proposer.prepare_inputs_padded = MtpProposer.prepare_inputs_padded.__get__(
proposer)
Expand Down
23 changes: 21 additions & 2 deletions vllm_ascend/attention/context_parallel/mla_cp.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,26 @@ def __init__(
dtype=torch.uint8,
device=device)

def build(
    self,
    common_prefix_len: int,
    common_attn_metadata: AscendCommonAttentionMetadata,
    fast_build: bool = False,
) -> AscendMLAMetadata:
    """Build the MLA attention metadata via the parent builder, then
    compact the slot mapping for pure-decode batches under prefill
    context parallelism.

    When there are no prefill requests and pcp_size > 1, the slot
    mapping holds num_decode_tokens * pcp_size entries; only every
    pcp_size-th entry is kept (presumably the entries owned by this
    rank — verify against the pcp manager), packed to the front of
    the buffer, and the leftover padded region is marked invalid (-1).
    """
    # NOTE(review): fast_build is accepted but not forwarded to
    # super().build() — confirm this is intentional.
    metadata = super().build(common_prefix_len, common_attn_metadata)
    if self.num_prefills == 0 and self.pcp_size > 1:
        num_decode = self.num_decode_tokens
        padded_len = num_decode * self.pcp_size
        # Gather the strided entries into the front of the buffer.
        self.slot_mapping[:num_decode] = \
            self.slot_mapping[:padded_len:self.pcp_size]
        # Invalidate the now-unused padded tail.
        self.slot_mapping[num_decode:padded_len].fill_(-1)
        metadata.slot_mapping = self.slot_mapping
    return metadata

@classmethod
def get_cudagraph_support(
cls: type["AscendMlaCPMetadataBuilder"],
Expand Down Expand Up @@ -363,8 +383,7 @@ def mla_preprocess_decode(self, q_c, kv_no_split, kv_cache, attn_metadata):
decode_ql_nope, decode_q_pe = self.reorg_decode_q(
decode_ql_nope, decode_q_pe)
decode_q_pe = self.rope_single(decode_q_pe, cos, sin)
decode_slots = attn_metadata.slot_mapping[:num_decode_tokens *
self.pcp_size:self.pcp_size]
decode_slots = attn_metadata.slot_mapping[:num_decode_tokens]
decode_kv_no_split = kv_no_split[:num_decode_tokens]
decode_k_pe, decode_k_nope = self.exec_kv_decode(
decode_kv_no_split, cos, sin, kv_cache, decode_slots)
Expand Down
3 changes: 1 addition & 2 deletions vllm_ascend/attention/mla_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,6 @@ def build(
if self.num_decodes > 0:
decode_metadata = self.build_decode_metadata(
common_prefix_len, common_attn_metadata)

return self.metadata_cls( # type: ignore
num_actual_tokens_pcp_padded=self.num_actual_tokens,
num_input_tokens=common_attn_metadata.num_input_tokens,
Expand Down Expand Up @@ -1334,7 +1333,7 @@ def _mla_preprocess_only_decode(self, hidden_states, kv_cache,
self.W_UK_T,
decode_k_nope,
decode_k_pe,
attn_metadata.slot_mapping[:bsz].flatten(),
attn_metadata.slot_mapping[:bsz],
quant_scale0=self.quant_scale0,
quant_offset0=self.quant_offset0,
bias0=self.quant_bias_qkv,
Expand Down
3 changes: 2 additions & 1 deletion vllm_ascend/spec_decode/eagle_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -750,7 +750,8 @@ def prepare_inputs_padded(
query_start_loc_cpu=query_start_loc_cpu,
seq_lens_cpu=common_attn_metadata.seq_lens_cpu,
num_reqs=common_attn_metadata.num_reqs,
num_actual_tokens=total_num_tokens,
num_actual_tokens=common_attn_metadata.num_actual_tokens
if self.pcp_size > 1 else total_num_tokens,
num_input_tokens=common_attn_metadata.num_input_tokens,
max_query_len=new_query_len_per_req.max().item(),
actual_seq_lengths_q=self.runner.actual_seq_lengths_q,
Expand Down
7 changes: 5 additions & 2 deletions vllm_ascend/worker/model_runner_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -912,12 +912,15 @@ def _prepare_inputs(
self.input_batch)
blk_table.slot_mapping.gpu[maybe_pcp_full_tokens:].fill_(-1)
if self.pcp_size > 1:
slot_mapping = self.pcp_manager.get_padded_slot_mapping(
slot_mapping_pcp = self.pcp_manager.get_padded_slot_mapping(
total_num_scheduled_tokens,
slot_mapping,
)
blk_table.slot_mapping.gpu[:self.pcp_manager.
num_actual_tokens_pcp_padded] = slot_mapping
num_actual_tokens_pcp_padded] = slot_mapping_pcp
slot_mapping = blk_table.slot_mapping.gpu[:self.
pcp_manager.
num_actual_tokens_pcp_padded]
Comment thread
zhenwenqi2024 marked this conversation as resolved.

# NOTE: This is a temporary hack, now in GPUModelRunner, this prepare_inputs
# has been split to multiple parts, and there are 3 parts that is related to this
Expand Down
Loading