Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions vllm/v1/core/sched/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,9 @@ def schedule(self) -> SchedulerOutput:
self.waiting.pop_request()
skipped_waiting_requests.prepend_request(request)
continue

# Keep track of number of tokens to load from remote
# for the request so that we can compute actual throughput
request.num_external_computed_tokens = ext_tokens
num_external_computed_tokens = ext_tokens

# Total computed tokens (local + external).
Expand Down Expand Up @@ -1081,6 +1083,7 @@ def update_from_output(
trace_headers=request.trace_headers,
num_cached_tokens=request.num_cached_tokens,
num_nans_in_logits=request.num_nans_in_logits,
num_external_computed_tokens=request.num_external_computed_tokens,
)
)
else:
Expand Down Expand Up @@ -1533,9 +1536,12 @@ def _update_requests_with_invalid_blocks(
marked_invalid_block = True
# Truncate the computed tokens at the first failed block
request.num_computed_tokens = idx * self.block_size
total_affected_tokens += (
num_affected_tokens = (
req_num_computed_tokens - request.num_computed_tokens
)
total_affected_tokens += num_affected_tokens
# Prefill is to be recomputed locally, track its performance.
request.num_external_computed_tokens -= num_affected_tokens

if is_affected:
if not marked_invalid_block:
Expand Down
2 changes: 2 additions & 0 deletions vllm/v1/engine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ class EngineCoreOutput(
trace_headers: Mapping[str, str] | None = None
# The number of tokens with prefix cache hits.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this comment looks incorrect ... assuming "prefix cache" refers to the local cache?

                    # Total computed tokens (local + external).                                                                                                                 
                    num_computed_tokens = (
                        num_new_local_computed_tokens + num_external_computed_tokens
                    )
                ...
                # Count the number of prefix cached tokens.                                                                                                                     
                if request.num_cached_tokens < 0:
                    request.num_cached_tokens = num_computed_tokens

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not familiar with it cc @chaunceyjiang

num_cached_tokens: int = 0
# The number of tokens that have been computed remotely.
num_external_computed_tokens: int = 0
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd be tempted to refactor these two into a PrefillStats object ... and only include that in the ECO when the prefill completes ... especially if we ever wanted to also send like num_locally_cached_tokens too

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have a strong opinion on this tbh, we can probably wait to have a few more things to bundle before executing the suggestion


# The number of NaNs in logits.
# A value greater than 0 indicates that the output is corrupted.
Expand Down
2 changes: 1 addition & 1 deletion vllm/v1/metrics/loggers.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def _reset(self, now):

def _track_iteration_stats(self, iteration_stats: IterationStats):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Presumably you want to update the Prometheus metric too?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@markmc which one? I intentionally left self.counter_prompt_tokens unchanged to avoid replacing the actual prompt count.
Should I just make a new one for local tokens?

# Save tracked stats for token counters.
self.num_prompt_tokens += iteration_stats.num_prompt_tokens
self.num_prompt_tokens += iteration_stats.num_local_prompt_tokens
Comment thread
NickLucche marked this conversation as resolved.
self.num_generation_tokens += iteration_stats.num_generation_tokens
self.num_corrupted_reqs += iteration_stats.num_corrupted_reqs

Expand Down
5 changes: 5 additions & 0 deletions vllm/v1/metrics/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,8 @@ def __init__(self):
self.num_generation_tokens = 0
self.num_prompt_tokens = 0
self.num_preempted_reqs = 0
# Num of prompt tokens that have been computed locally.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the naming here a bit confusing? By "computed locally" here we mean both computed and locally cached?

If you just tracked num_external_computed_tokens and then subtracted it in _track_iteration_stats() would that be more clear?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By "computed locally" here we mean both computed and locally cached?

Yes the behavior is unchanged, cached ones would still result in higher throughput even in regular aggregated setup.

If you just tracked num_external_computed_tokens and then subtracted it in _track_iteration_stats() would that be more clear?

I think looking at the diff

self.num_prompt_tokens += iteration_stats.num_prompt_tokens
-->
self.num_prompt_tokens += iteration_stats.num_local_prompt_tokens

this is pretty clear that I just want to rule out the remote tokens, i.e. I assume this semantic was the intended one from the beginning — it's just that the "local" qualifier used to be redundant

self.num_local_prompt_tokens = 0
self.finished_requests: list[FinishedRequestStats] = []
self.max_num_generation_tokens_iter: list[int] = []
self.n_params_iter: list[int] = []
Expand Down Expand Up @@ -250,6 +252,9 @@ def update_from_output(
self.num_generation_tokens += num_new_generation_tokens
if is_prefilling:
self.num_prompt_tokens += prompt_len
self.num_local_prompt_tokens += (
prompt_len - output.num_external_computed_tokens
)
Comment thread
NickLucche marked this conversation as resolved.

first_token_latency = self._time_since(req_stats.arrival_time)
self.time_to_first_tokens_iter.append(first_token_latency)
Expand Down
5 changes: 4 additions & 1 deletion vllm/v1/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,12 @@ def __init__(
# indicates that the output is corrupted
self.num_nans_in_logits = 0

# The number of requests being preempted by the scheduler
# The number of times the request was preempted by the scheduler.
self.num_preemptions = 0

# The number of tokens that have been computed remotely.
self.num_external_computed_tokens = 0

self.block_hashes: list[BlockHash] = []
self.get_hash_new_full_blocks: Callable[[], list[BlockHash]] | None = None
if block_hasher is not None:
Expand Down