[BugFix] Fix the accuracy issue of multimodal input. #1020
Changes from all commits
```diff
@@ -1,12 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
+import copy
 import time
 import weakref
 from collections.abc import AsyncGenerator, Iterable, Sequence
 from dataclasses import asdict
 from pprint import pformat
-from typing import Any, cast
+from typing import Any
 
 from vllm.config import VllmConfig
 from vllm.inputs.preprocess import InputPreprocessor
@@ -32,7 +33,7 @@
 from vllm_omni.entrypoints.utils import (
     get_final_stage_id_for_e2e,
 )
-from vllm_omni.inputs.data import OmniPromptType, OmniSamplingParams, OmniTokensPrompt
+from vllm_omni.inputs.data import OmniPromptType, OmniSamplingParams
 
 # Internal imports (our code)
 from vllm_omni.lora.request import LoRARequest
@@ -304,38 +305,25 @@ async def generate(
         req_state = ClientRequestState(request_id)
         req_state.metrics = metrics
         self.request_states[request_id] = req_state
 
+        sp0: SamplingParams = sampling_params_list[0]  # type: ignore[index]
+        task = {
+            "request_id": request_id,
+            "engine_inputs": prompt,
+            "sampling_params": sp0,
+        }
+        self.stage_list[0].submit(task)
+        metrics.stage_first_ts[0] = metrics.stage_first_ts[0] or time.time()
+        _req_start_ts[request_id] = time.time()
+        logger.info(
+            f"[{self._name}] Entering scheduling loop: stages={num_stages}, final_stage={final_stage_id_for_e2e}"
+        )
         if self.async_chunk:
             stage_queues = {stage_id: asyncio.Queue() for stage_id in range(num_stages)}
             req_state.stage_queues = stage_queues
-            for i in range(num_stages):
-                sp: SamplingParams = cast(SamplingParams, sampling_params_list[i])
-                engine_inputs = cast(OmniTokensPrompt, prompt)
-                if i != 0:
-                    prompt_token_ids = engine_inputs["prompt_token_ids"]
-                    prompt_1 = engine_inputs.copy()
-                    prompt_1["prompt_token_ids"] = [0] * compute_talker_prompt_ids_length(prompt_token_ids)
-                    prompt_1["multi_modal_data"] = prompt_1["mm_processor_kwargs"] = None
-                    engine_inputs = prompt_1
-
-                task = {
-                    "request_id": request_id,
-                    "engine_inputs": engine_inputs,
-                    "sampling_params": sp,
-                }
-                self.stage_list[i].submit(task)
-                metrics.stage_first_ts[i] = metrics.stage_first_ts[0] or time.time()
-
-                logger.info(f"[{self._name}] Enqueued request {request_id} to stage-{str(i)}")
-
-            _req_start_ts[request_id] = time.time()
-
-            logger.info(
-                f"[{self._name}] Entering scheduling loop: "
-                f"stages={num_stages}, final_stage={final_stage_id_for_e2e}"
-            )
             async for output in self._process_async_results(
                 request_id,
+                prompt,
+                sampling_params_list,
                 req_state,
                 metrics,
                 final_stage_id_for_e2e,
@@ -344,22 +332,6 @@ async def generate(
             ):
                 yield output
         else:
-            sp0: SamplingParams = sampling_params_list[0]  # type: ignore[index]
-            task = {
-                "request_id": request_id,
-                "engine_inputs": prompt,
-                "sampling_params": sp0,
-            }
-            self.stage_list[0].submit(task)
-
-            _req_start_ts[request_id] = time.time()
-            # Mark first input time for stage-0
-            metrics.stage_first_ts[0] = metrics.stage_first_ts[0] or time.time()
-            logger.info(
-                f"[{self._name}] Entering scheduling loop: "
-                f"stages={num_stages}, final_stage={final_stage_id_for_e2e}"
-            )
-
             async for output in self._process_sequential_results(
                 request_id,
                 req_state,
@@ -390,23 +362,43 @@ async def generate(
     async def _process_async_results(
         self,
         request_id: str,
+        prompt: Any,
+        sampling_params_list: list[SamplingParams],
         req_state: ClientRequestState,
         metrics: OrchestratorMetrics,
         final_stage_id_for_e2e: int,
         req_start_ts: dict[int, float],
         wall_start_ts: float,
     ) -> AsyncGenerator[OmniRequestOutput, None]:
         all_stages_finished = {stage_id: False for stage_id in range(final_stage_id_for_e2e + 1)}
+        submit_flag = True
         while not all(all_stages_finished.values()):
             for stage_id, stage in enumerate(self.stage_list[: final_stage_id_for_e2e + 1]):
                 if all_stages_finished[stage_id]:
                     continue
-                result = await req_state.stage_queues[stage_id].get()
-                logger.info(f"[{self._name}] Received result from stage-{stage_id}: {result}")
+                try:
+                    result = req_state.stage_queues[stage_id].get_nowait()
+                except asyncio.QueueEmpty:
+                    await asyncio.sleep(0.001)
+                    continue
 
                 engine_outputs, finished, output_to_yield = self._process_single_result(
                     result, stage, stage_id, metrics, req_start_ts, wall_start_ts, final_stage_id_for_e2e
                 )
 
+                if submit_flag and stage_id == 0:
+                    submit_flag = False
+                    prompt_token_ids = engine_outputs.prompt_token_ids
+                    engine_input = copy.deepcopy(prompt)
```
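The key control-flow change in `_process_async_results` is replacing the blocking `await req_state.stage_queues[stage_id].get()` with a non-blocking `get_nowait()` plus a short sleep, so the loop round-robins across all stage queues instead of stalling on whichever stage is slow. A minimal self-contained sketch of that polling pattern (the queue contents and the `None` end-of-stream sentinel are illustrative, not the PR's actual types):

```python
import asyncio


async def drain_round_robin(stage_queues: dict[int, asyncio.Queue]) -> list[str]:
    """Poll every stage queue in turn instead of blocking on one.

    A blocking `await stage_queues[0].get()` would delay consuming
    stage-1 results until stage 0 produced something; non-blocking
    reads keep all stages draining concurrently.
    """
    finished = {stage_id: False for stage_id in stage_queues}
    received: list[str] = []
    while not all(finished.values()):
        for stage_id, queue in stage_queues.items():
            if finished[stage_id]:
                continue
            try:
                result = queue.get_nowait()  # non-blocking read
            except asyncio.QueueEmpty:
                await asyncio.sleep(0.001)  # yield to the event loop
                continue
            if result is None:  # illustrative end-of-stream sentinel
                finished[stage_id] = True
            else:
                received.append(f"stage-{stage_id}:{result}")
    return received


async def main() -> None:
    queues = {0: asyncio.Queue(), 1: asyncio.Queue()}
    await queues[1].put("chunk-a")  # stage 1 is ready before stage 0
    await queues[0].put("chunk-b")
    for queue in queues.values():
        await queue.put(None)
    print(await drain_round_robin(queues))


asyncio.run(main())
```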
Copilot AI (Jan 28, 2026) reviewed the changes, commenting on lines +308 to 323 and +379 to +383, with a suggested change:

```diff
-                    engine_input = copy.deepcopy(prompt)
+                    engine_input = dict(prompt)
```
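The suggestion swaps a deep copy for a shallow one: `dict(prompt)` creates a new mapping whose values (including any large multimodal payloads) are shared with the original, while `copy.deepcopy(prompt)` duplicates everything. Since the diff only replaces top-level keys (`prompt_token_ids`, and in the removed loop `multi_modal_data` / `mm_processor_kwargs`), a shallow copy would suffice and avoid copying tensor data. The stand-in prompt dict below illustrates the difference:

```python
import copy

# Stand-in for an OmniTokensPrompt-style dict (illustrative values).
prompt = {
    "prompt_token_ids": [1, 2, 3],
    "multi_modal_data": {"audio": bytearray(b"raw samples")},
}

shallow = dict(prompt)        # new dict, values shared with `prompt`
deep = copy.deepcopy(prompt)  # new dict, values duplicated

# Replacing a top-level key in the shallow copy leaves the original intact,
# which is all the surrounding code needs:
shallow["prompt_token_ids"] = [0, 0, 0]
assert prompt["prompt_token_ids"] == [1, 2, 3]

# Mutating a shared nested value, however, is visible through both:
shallow["multi_modal_data"]["audio"] += b"!"
assert prompt["multi_modal_data"]["audio"] == bytearray(b"raw samples!")

# The deep copy is fully independent (at the cost of duplicating the data):
deep["multi_modal_data"]["audio"] += b"?"
assert prompt["multi_modal_data"]["audio"] == bytearray(b"raw samples!")
```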
Async chunk submission currently enqueues tasks for stages `1..len(self.stage_list)-1`, even when `final_stage_id_for_e2e` is smaller. Stages beyond `final_stage_id_for_e2e` won’t be drained by `_process_async_results`, so their outputs can accumulate in `stage_queues` (memory growth) and do unnecessary work. Limit submission to stages up to `final_stage_id_for_e2e` (and/or only the stages required for the selected `output_modalities`).
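A sketch of the bound this comment asks for: submit downstream work only up to `final_stage_id_for_e2e` rather than `len(self.stage_list) - 1`. The `_Stage` class and task shape are stand-ins, not the PR's actual types:

```python
from typing import Any, Callable


class _Stage:
    """Stand-in for a pipeline stage (illustrative only)."""

    def __init__(self) -> None:
        self.submitted: list[dict[str, Any]] = []

    def submit(self, task: dict[str, Any]) -> None:
        self.submitted.append(task)


def submit_downstream(
    stage_list: list[_Stage],
    final_stage_id_for_e2e: int,
    make_task: Callable[[int], dict[str, Any]],
) -> None:
    # Bound the loop by the final e2e stage: stages past it are never
    # drained by _process_async_results, so submitting to them would
    # queue up outputs nobody reads and waste compute.
    for i in range(1, final_stage_id_for_e2e + 1):
        stage_list[i].submit(make_task(i))


stages = [_Stage() for _ in range(4)]
submit_downstream(
    stages,
    final_stage_id_for_e2e=2,
    make_task=lambda i: {"request_id": "req-1", "stage": i},
)
assert not stages[3].submitted  # the stage past the e2e target stays idle
```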
When `async_chunk` is enabled, the stage-0 task is submitted before `req_state.stage_queues` is created. The output handler only routes results to `stage_queues` if the attribute exists; otherwise it falls back to `req_state.queue`, which `_process_async_results` never drains. If stage-0 responds quickly (small prompts, cached model), its first result can be enqueued to the fallback queue and the async loop will wait forever on `stage_queues`, effectively hanging the request. Creating `stage_queues` before `submit()` (or draining the fallback queue in async mode) avoids this race.
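A sketch of the ordering fix this comment proposes, assuming (as the comment describes) an output handler that falls back to `req_state.queue` whenever the per-stage queues do not exist yet; the names are modeled on the diff but simplified:

```python
import asyncio


class _RequestState:
    """Stand-in for ClientRequestState (illustrative only)."""

    def __init__(self) -> None:
        self.queue: asyncio.Queue = asyncio.Queue()  # fallback, never drained in async mode
        self.stage_queues: dict[int, asyncio.Queue] | None = None


def route_result(state: _RequestState, stage_id: int, result: object) -> None:
    # Mirrors the handler behaviour described in the comment.
    if state.stage_queues is not None:
        state.stage_queues[stage_id].put_nowait(result)
    else:
        state.queue.put_nowait(result)  # the async loop would wait forever


def start_request(state: _RequestState, num_stages: int, submit_stage0) -> None:
    # Fix: create the per-stage queues *before* the first submit(), so an
    # early stage-0 result can never land in the undrained fallback queue.
    state.stage_queues = {i: asyncio.Queue() for i in range(num_stages)}
    submit_stage0()


state = _RequestState()
start_request(state, num_stages=2,
              submit_stage0=lambda: route_result(state, 0, "first-chunk"))
assert state.stage_queues[0].get_nowait() == "first-chunk"
assert state.queue.empty()  # nothing fell through to the fallback
```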