3 changes: 2 additions & 1 deletion vllm_ascend/spec_decode/eagle_proposer.py
@@ -143,7 +143,8 @@ def dummy_run(self,
num_tokens_across_dp: Optional[torch.Tensor] = None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
batch_descriptor=None,
dummy_compute_logits=lambda hidden_states: None):
dummy_compute_logits=lambda hidden_states: None,
is_profile=False):
# update global cos, sin
update_cos_sin(self.positions[:num_tokens])

6 changes: 4 additions & 2 deletions vllm_ascend/spec_decode/mtp_proposer.py
@@ -229,7 +229,8 @@ def dummy_run(self,
num_tokens_across_dp=None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
batch_descriptor=None,
dummy_compute_logits=lambda hidden_states: None) -> None:
dummy_compute_logits=lambda hidden_states: None,
is_profile=False) -> None:

(
num_tokens,
@@ -299,7 +300,8 @@ def dummy_run(self,
num_actual_tokens=0,
aclgraph_runtime_mode=aclgraph_runtime_mode,
batch_descriptor=batch_descriptor,
is_mtp_model=True):
is_mtp_model=True,
in_profile_run=is_profile):
if self.enable_shared_expert_dp:
    positions = positions.unsqueeze(-1)
    positions = torch.ops.vllm.maybe_pad_and_reduce(positions)
3 changes: 2 additions & 1 deletion vllm_ascend/spec_decode/ngram_proposer.py
@@ -27,7 +27,8 @@ def dummy_run(self,
num_tokens_across_dp=None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
batch_descriptor=None,
dummy_compute_logits=lambda hidden_states: None):
dummy_compute_logits=lambda hidden_states: None,
is_profile=False):
pass

def generate_token_ids(self,
3 changes: 2 additions & 1 deletion vllm_ascend/spec_decode/suffix_proposer.py
@@ -27,7 +27,8 @@ def dummy_run(self,
num_tokens_across_dp=None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
batch_descriptor=None,
dummy_compute_logits=lambda hidden_states: None):
dummy_compute_logits=lambda hidden_states: None,
is_profile=False):
pass

def generate_token_ids(self,
3 changes: 2 additions & 1 deletion vllm_ascend/worker/model_runner_v1.py
@@ -2162,7 +2162,8 @@ def dummy_drafter_compute_logits(hidden_states):
aclgraph_runtime_mode=aclgraph_runtime_mode,
batch_descriptor=batch_descriptor,
dummy_compute_logits=dummy_drafter_compute_logits,
in_graph_capturing=not force_attention)
in_graph_capturing=not force_attention,
is_profile=is_profile)
Contributor

critical

This change introduces the is_profile keyword argument to the dummy_run call. However, the Proposer interface and its other implementations (EagleProposer, NgramProposer, SuffixDecodingProposer) have not been updated to accept this argument. This will cause a TypeError at runtime if a proposer other than MtpProposer is used. To fix this, you should update the base Proposer interface in vllm_ascend/spec_decode/interface.py and all its subclasses to include is_profile=False in their dummy_run method signatures.
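
A minimal sketch of the fix this comment suggests, for orientation only: add is_profile=False to the dummy_run signature of the base interface so every subclass accepts the keyword. The class name Proposer, the num_tokens parameter, and the CUDAGraphMode import path are assumptions not visible in this diff; the remaining parameters mirror the dummy_run signatures shown in the hunks above.

```python
# Hypothetical sketch for vllm_ascend/spec_decode/interface.py -- not the actual file.
from typing import Optional

import torch

from vllm.config import CUDAGraphMode  # assumed import path for CUDAGraphMode


class Proposer:
    """Assumed base interface shared by the spec-decode proposers."""

    def dummy_run(self,
                  num_tokens: int,
                  num_tokens_across_dp: Optional[torch.Tensor] = None,
                  aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
                  batch_descriptor=None,
                  dummy_compute_logits=lambda hidden_states: None,
                  is_profile: bool = False) -> None:
        # Defaulting is_profile to False keeps existing call sites working;
        # only MtpProposer needs to forward it (as in_profile_run), while
        # EagleProposer, NgramProposer and SuffixDecodingProposer can simply
        # accept and ignore it, as the ngram and suffix hunks above do.
        raise NotImplementedError
```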

if is_profile and self.dynamic_eplb:
    self.model.clear_all_moe_loads()
if not is_profile and self.dynamic_eplb: