1 change: 1 addition & 0 deletions docs/design/plugin_system.md
@@ -155,3 +155,4 @@ The interface for the model/module may change during vLLM's development. If you
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
- `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead.
- `prompt` in `Platform.validate_request` is deprecated and will be removed in v0.18.0.
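The `vllm/platforms/interface.py` hunk below shows the replacement signature, which takes the processed inputs first and drops the raw prompt. A minimal sketch of how an out-of-tree platform plugin could adopt the new signature — `MyPlatform` and its `n > 1` check are hypothetical and not part of this PR:

```python
from vllm.inputs import ProcessorInputs
from vllm.platforms.interface import Platform
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams


class MyPlatform(Platform):  # hypothetical out-of-tree platform plugin
    @classmethod
    def validate_request(
        cls,
        processed_inputs: ProcessorInputs,
        params: SamplingParams | PoolingParams,
    ) -> None:
        """Raise if this request is unsupported on this platform."""
        if isinstance(params, SamplingParams) and params.n > 1:
            raise ValueError("MyPlatform does not support n > 1")
```

Plugins that still implement the old `(prompt, params, processed_inputs)` signature keep working until v0.18 via the compatibility shim added in `vllm/v1/engine/input_processor.py` below.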
24 changes: 0 additions & 24 deletions vllm/entrypoints/llm.py
@@ -519,7 +519,6 @@ def enqueue(
),
params=seq_params,
lora_requests=seq_lora_requests,
tokenization_kwargs=tokenization_kwargs,
priorities=seq_priority,
)

@@ -1813,7 +1812,6 @@ def _run_completion(
params=seq_params,
use_tqdm=use_tqdm,
lora_requests=seq_lora_requests,
tokenization_kwargs=tokenization_kwargs,
priorities=seq_priority,
)

@@ -1872,7 +1870,6 @@ def _run_chat(
params=seq_params,
lora_requests=seq_lora_requests,
use_tqdm=use_tqdm,
tokenization_kwargs=tokenization_kwargs,
)

def _render_and_run_requests(
@@ -1881,7 +1878,6 @@ def _render_and_run_requests(
params: Sequence[SamplingParams | PoolingParams],
*,
lora_requests: Sequence[LoRARequest | None] | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
priorities: Sequence[int] | None = None,
use_tqdm: bool | Callable[..., tqdm] = True,
):
@@ -1899,7 +1895,6 @@
prompts=prompts,
params=params,
lora_requests=lora_requests,
tokenization_kwargs=tokenization_kwargs,
priorities=priorities,
)

@@ -1911,7 +1906,6 @@ def _render_and_add_requests(
params: Sequence[SamplingParams | PoolingParams],
*,
lora_requests: Sequence[LoRARequest | None] | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
priorities: Sequence[int] | None = None,
) -> list[str]:
added_request_ids: list[str] = []
@@ -1922,7 +1916,6 @@
prompt,
params[i],
lora_request=None if lora_requests is None else lora_requests[i],
tokenization_kwargs=tokenization_kwargs,
priority=0 if priorities is None else priorities[i],
)
added_request_ids.append(request_id)
@@ -1938,7 +1931,6 @@ def _add_request(
prompt: ProcessorInputs,
params: SamplingParams | PoolingParams,
lora_request: LoRARequest | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
priority: int = 0,
) -> str:
if isinstance(params, SamplingParams):
@@ -1947,27 +1939,11 @@

request_id = str(next(self.request_counter))

if params.truncate_prompt_tokens is not None:
params_type = type(params).__name__
warnings.warn(
f"The `truncate_prompt_tokens` parameter in `{params_type}` "
"is deprecated and will be removed in v0.16. "
"Please pass it via `tokenization_kwargs` instead.",
DeprecationWarning,
stacklevel=2,
)

tokenization_kwargs = merge_kwargs(
tokenization_kwargs,
dict(truncate_prompt_tokens=params.truncate_prompt_tokens),
)

return self.llm_engine.add_request(
request_id,
prompt,
params,
lora_request=lora_request,
tokenization_kwargs=tokenization_kwargs,
priority=priority,
)

5 changes: 2 additions & 3 deletions vllm/platforms/interface.py
@@ -17,7 +17,7 @@
from torch.distributed import PrefixStore, ProcessGroup

from vllm.config import VllmConfig
from vllm.inputs import ProcessorInputs, PromptType
from vllm.inputs import ProcessorInputs
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -568,9 +568,8 @@ def opaque_attention_op(cls) -> bool:
@classmethod
def validate_request(
cls,
prompt: "PromptType | ProcessorInputs",
params: "SamplingParams | PoolingParams",
processed_inputs: "ProcessorInputs",
params: "SamplingParams | PoolingParams",
) -> None:
"""Raises if this request is unsupported on this platform"""

23 changes: 7 additions & 16 deletions vllm/v1/engine/async_llm.py
@@ -27,7 +27,7 @@
from vllm.outputs import STREAM_FINISHED, PoolingRequestOutput, RequestOutput
from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.renderers import merge_kwargs, renderer_from_config
from vllm.renderers import renderer_from_config
from vllm.renderers.inputs.preprocess import extract_prompt_components
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.tasks import SupportedTask
@@ -319,21 +319,6 @@ async def add_request(
"prompt logprobs"
)

if params.truncate_prompt_tokens is not None:
params_type = type(params).__name__
warnings.warn(
f"The `truncate_prompt_tokens` parameter in `{params_type}` "
"is deprecated and will be removed in v0.16. "
"Please pass it via `tokenization_kwargs` instead.",
DeprecationWarning,
stacklevel=2,
)

tokenization_kwargs = merge_kwargs(
tokenization_kwargs,
dict(truncate_prompt_tokens=params.truncate_prompt_tokens),
)

if isinstance(prompt, AsyncGenerator):
if reasoning_ended is not None:
raise NotImplementedError
@@ -353,6 +338,12 @@

# Convert Input --> Request.
if isinstance(prompt, EngineCoreRequest):
logger.warning_once(
"Passing EngineCoreRequest to AsyncLLM.generate() and .add_requests() "
"is deprecated and will be removed in v0.18. You should instead pass "
"the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
)

request = prompt
if request_id != request.request_id:
logger.warning_once(
60 changes: 53 additions & 7 deletions vllm/v1/engine/input_processor.py
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import time
import warnings
from collections.abc import Mapping
from typing import Any, Literal

@@ -28,6 +29,7 @@
from vllm.tasks import POOLING_TASKS, SupportedTask
from vllm.tokenizers import TokenizerLike
from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid
from vllm.utils.func_utils import supports_kw
from vllm.utils.jsontree import json_iter_leaves
from vllm.v1.engine import EngineCoreRequest

@@ -72,6 +74,33 @@ def __init__(
mm_registry=mm_registry,
)

from vllm.platforms import current_platform

platform_validate_request = current_platform.validate_request
if supports_kw(platform_validate_request, "prompt"):
logger.warning_once(
"The signature of Platform.validate_request has changed from "
"`(cls, prompt, params, processed_inputs) -> None` to "
"`(cls, processed_inputs, params) -> None`. The old signature "
"will no longer be supported starting from v0.18."
)

orig_validate_request = platform_validate_request

def compat_validate_request(
processed_inputs: ProcessorInputs,
params: SamplingParams | PoolingParams,
):
return orig_validate_request(
processed_inputs,
params,
processed_inputs, # type: ignore
) # type: ignore

platform_validate_request = compat_validate_request

self._platform_validate_request = platform_validate_request

@property
def tokenizer(self) -> TokenizerLike | None:
return self.renderer.tokenizer
@@ -87,6 +116,16 @@ def _validate_params(
supported_tasks: tuple[SupportedTask, ...] | None,
):
"""Raise `ValueError` if SamplingParams or PoolingParams is not valid."""
if params.truncate_prompt_tokens is not None:
params_type = type(params).__name__
warnings.warn(
f"The `truncate_prompt_tokens` parameter in `{params_type}` "
"is deprecated and will be removed in v0.17. "
"Please pass it via `tokenization_kwargs` instead.",
DeprecationWarning,
stacklevel=2,
)

if isinstance(params, SamplingParams):
params.verify(
self.model_config,
@@ -211,11 +250,24 @@ def process_inputs(
)

if isinstance(prompt, dict) and "type" in prompt:
if tokenization_kwargs:
logger.warning_once(
"Passing tokenization_kwargs to InputProcessor is deprecated "
"and will be removed in v0.18. You should instead pass "
"them to Renderer.render_cmpl() or Renderer.render_chat()."
)

if arrival_time is None:
arrival_time = prompt.get("arrival_time", time.time()) # type: ignore[assignment]

processed_inputs: ProcessorInputs = prompt # type: ignore[assignment]
else:
logger.warning_once(
"Passing raw prompts to InputProcessor is deprecated "
"and will be removed in v0.18. You should instead pass "
"the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
)

if arrival_time is None:
arrival_time = time.time()

@@ -224,13 +276,7 @@
tokenization_kwargs=tokenization_kwargs,
)

from vllm.platforms import current_platform

current_platform.validate_request(
prompt=prompt,
params=params,
processed_inputs=processed_inputs,
)
self._platform_validate_request(processed_inputs, params)

encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
self._validate_model_inputs(encoder_inputs, decoder_inputs)
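Downstream of these warnings, the intended flow is to render prompts before they reach the engine and to pass tokenization options to the renderer. A rough sketch of that flow — `renderer_from_config`, `Renderer.render_cmpl()`, and `tokenization_kwargs` are named in this PR, but the exact `render_cmpl()` argument layout and the direct hand-off to `add_request()` are assumptions:

```python
from vllm.config import VllmConfig
from vllm.renderers import renderer_from_config
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.llm_engine import LLMEngine


def add_rendered_request(engine: LLMEngine, vllm_config: VllmConfig) -> None:
    # Render the prompt up front; truncation now rides on the renderer call
    # instead of params.truncate_prompt_tokens or InputProcessor kwargs.
    renderer = renderer_from_config(vllm_config)
    processed_inputs = renderer.render_cmpl(
        "Explain KV caching in one sentence.",
        tokenization_kwargs={"truncate_prompt_tokens": 512},  # assumed kwarg placement
    )
    engine.add_request("req-0", processed_inputs, SamplingParams(max_tokens=64))
```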
8 changes: 7 additions & 1 deletion vllm/v1/engine/llm_engine.py
@@ -234,10 +234,16 @@ def add_request(

# Process raw inputs into the request.
if isinstance(prompt, EngineCoreRequest):
logger.warning_once(
"Passing EngineCoreRequest to LLMEngine.generate() and .add_requests() "
"is deprecated and will be removed in v0.18. You should instead pass "
"the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
)

request = prompt
if request_id != request.request_id:
logger.warning_once(
"AsyncLLM.add_request() was passed a request_id parameter that "
"LLMEngine.add_request() was passed a request_id parameter that "
"does not match the EngineCoreRequest.request_id attribute. The "
"latter will be used, and the former will be ignored."
)