Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions tests/v1/entrypoints/llm/test_struct_output_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,6 @@
("Qwen/Qwen2.5-1.5B-Instruct", "auto"),
]

platform_args = {}
if current_platform.is_rocm():
platform_args["async_scheduling"] = False


class CarType(str, Enum):
sedan = "sedan"
Expand Down Expand Up @@ -138,7 +134,6 @@ def test_structured_output(
load_format="auto" if not model_name.startswith("mistralai/") else "hf",
config_format="auto" if not model_name.startswith("mistralai/") else "hf",
speculative_config=speculative_config,
**platform_args,
)

#
Expand Down
4 changes: 3 additions & 1 deletion vllm/v1/worker/gpu_input_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,9 @@ def update_req_spec_token_ids(
# _prepare_input_ids.
start_index = self.num_tokens_no_spec[req_index]
end_token_index = start_index + num_spec_tokens
self.token_ids_cpu[req_index, start_index:end_token_index] = spec_token_ids
# Replace -1 values with 0 to avoid embedding lookup errors
safe_spec_token_ids = [tok if tok != -1 else 0 for tok in spec_token_ids]
self.token_ids_cpu[req_index, start_index:end_token_index] = safe_spec_token_ids
Comment on lines +467 to +468
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

This is a great fix for the embedding lookup error. However, there's a related issue on the next line.

cur_spec_token_ids is being extended with the original spec_token_ids, which can contain -1 placeholders. This can cause a RuntimeError from torch.bincount when penalties (like frequency or presence penalty) are applied, as bincount does not support negative indices. This would happen when async_scheduling is False and penalties are enabled.

To prevent this potential crash, you should also use safe_spec_token_ids to extend cur_spec_token_ids.

Suggested change for line 469 (original line shown first, replacement second):

- cur_spec_token_ids.extend(spec_token_ids)
+ cur_spec_token_ids.extend(safe_spec_token_ids)

def remove_request(self, req_id: str) -> int | None:
Expand Down