22 changes: 12 additions & 10 deletions vllm/benchmarks/datasets.py
@@ -129,16 +129,17 @@ def get_random_lora_request(

Args:
tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
LoRA is selected. max_loras (Optional[int]): The maximum number of
LoRAs available. If None, LoRA is not used. lora_path
(Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
is not used.
LoRA is selected.
max_loras (Optional[int]): The maximum number of LoRAs available.
If `None`, LoRA is not used.
lora_path (Optional[str]): Path to the LoRA parameters on disk.
If `None`, LoRA is not used.

Returns:
tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
element is a LoRARequest (or None if not applicable) and the second
element is the tokenizer associated with the LoRA request (or the
base tokenizer).
A tuple with the following elements:
- A new [LoRARequest] (or `None` if not applicable).
- The tokenizer associated with the LoRA request
(or the base tokenizer).
"""
if max_loras is None or lora_path is None:
return None, tokenizer
@@ -167,7 +168,7 @@ def sample(self, tokenizer: PreTrainedTokenizerBase,

Args:
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
for processing the dataset's text.
for processing the dataset's text.
num_requests (int): The number of sample requests to generate.

Returns:
@@ -184,7 +185,8 @@ def maybe_oversample_requests(self, requests: list[SampleRequest],

Args:
requests (List[SampleRequest]): The current list of sampled
requests. num_requests (int): The target number of requests.
requests.
num_requests (int): The target number of requests.
"""
if len(requests) < num_requests:
random.seed(self.random_seed)
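To make the reworded return contract concrete, here is a minimal sketch of a helper with the same shape. The `LoRARequest` stand-in and the uniform random selection of a LoRA id are assumptions for illustration, not vLLM's exact implementation:

```python
import random
from typing import Optional


class LoRARequest:
    """Hypothetical stand-in for vLLM's LoRARequest, used only to
    illustrate the (request, tokenizer) return contract documented above."""

    def __init__(self, lora_name: str, lora_int_id: int, lora_path: str):
        self.lora_name = lora_name
        self.lora_int_id = lora_int_id
        self.lora_path = lora_path


def get_random_lora_request(tokenizer,
                            max_loras: Optional[int],
                            lora_path: Optional[str]):
    # No LoRA configured: return no request and the base tokenizer.
    if max_loras is None or lora_path is None:
        return None, tokenizer

    # Assumption: LoRA ids are drawn uniformly from 1..max_loras and share
    # one on-disk path; a real helper may also return a LoRA-specific
    # tokenizer instead of reusing the base one.
    lora_id = random.randint(1, max_loras)
    request = LoRARequest(lora_name=f"lora-{lora_id}",
                          lora_int_id=lora_id,
                          lora_path=lora_path)
    return request, tokenizer
```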
2 changes: 1 addition & 1 deletion vllm/config.py
@@ -4552,7 +4552,7 @@ def contains_object_print(text):
text (str): The text to check

Returns:
bool: True if a match is found, False otherwise
result (bool): `True` if a match is found, `False` otherwise.
"""
pattern = r'at 0x[a-fA-F0-9]{2,16}>'
match = re.search(pattern, text)
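For context, the function whose docstring is touched here is a one-line regex check. A self-contained version is sketched below; the body follows the two lines visible in the hunk, and the final `return` is assumed from the documented `bool` result:

```python
import re


def contains_object_print(text: str) -> bool:
    # Default object reprs look like "<Foo object at 0x7f3a2c9d4e50>"; their
    # presence in a config dump usually means __repr__ was not overridden.
    pattern = r'at 0x[a-fA-F0-9]{2,16}>'
    return bool(re.search(pattern, text))


print(contains_object_print("<ModelConfig object at 0x7f3a2c9d4e50>"))  # True
print(contains_object_print("model='facebook/opt-125m'"))               # False
```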
9 changes: 5 additions & 4 deletions vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -210,10 +210,11 @@ def get_num_new_matched_tokens(
computed tokens for this request

Returns:
* the number of tokens that can be loaded from the
external KV cache beyond what is already computed.
* true if external KV cache tokens will be loaded
asynchronously (between scheduler steps).
A tuple with the following elements:
- The number of tokens that can be loaded from the
external KV cache beyond what is already computed.
- `True` if external KV cache tokens will be loaded
asynchronously (between scheduler steps).
"""
pass

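The rewritten return section spells out a two-element tuple. A hedged sketch of a connector subclass honoring that contract (the class name and the decision to always return `(0, False)` are illustrative; a real connector would consult its external KV cache):

```python
from typing import Any


class NoExternalKVConnector:
    """Illustrative-only connector showing the documented return shape."""

    def get_num_new_matched_tokens(
        self,
        request: Any,
        num_computed_tokens: int,
    ) -> tuple[int, bool]:
        # Nothing can be loaded from an external KV cache beyond what the
        # scheduler already computed, and nothing is loaded asynchronously.
        return 0, False
```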
8 changes: 4 additions & 4 deletions vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
@@ -136,12 +136,12 @@ def _prepare_recv_buffer(self, metadata: Metadata) -> torch.Tensor:
Create a buffer to receive the tensor based on the provided metadata.

Parameters:
- metadata: A dictionary with keys "dtype" and "shape", describing
the tensor's data type and shape.
metadata (Metadata): A dictionary with keys "dtype" and "shape",
describing the tensor's data type and shape.

Returns:
- buffer: A tensor of the specified type and shape, allocated on
self.device.
buffer (torch.Tensor): A tensor of the specified type and shape,
allocated on `self.device`.
"""
return torch.empty(metadata["shape"],
dtype=metadata["dtype"],
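As a standalone paraphrase of the method body shown in the hunk (with `self.device` replaced by an explicit argument so the snippet runs on its own):

```python
import torch


def prepare_recv_buffer(metadata: dict, device: str = "cpu") -> torch.Tensor:
    # Allocate an uninitialized tensor matching the sender's metadata; the
    # subsequent receive overwrites its contents, so torch.empty is enough
    # (and cheaper than torch.zeros).
    return torch.empty(metadata["shape"],
                       dtype=metadata["dtype"],
                       device=device)


# Example: a buffer for a (2, 4) float16 tensor described by the sender.
buf = prepare_recv_buffer({"shape": (2, 4), "dtype": torch.float16})
print(buf.shape, buf.dtype)  # torch.Size([2, 4]) torch.float16
```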
40 changes: 15 additions & 25 deletions vllm/engine/llm_engine.py
@@ -130,26 +130,16 @@ class LLMEngine:
iteration-level scheduling and efficient memory management to maximize the
serving throughput.

The {class}`~vllm.LLM` class wraps this class for offline batched inference
and the {class}`AsyncLLMEngine` class wraps this class for online serving.
The [LLM][vllm.LLM] class wraps this class for offline batched inference
and the [AsyncLLMEngine][] class wraps this class for online serving.

The config arguments are derived from {class}`~vllm.EngineArgs`. (See
{ref}`engine-args`)
The config arguments are derived from [EngineArgs][vllm.EngineArgs]. (See
[engine-args][])

Args:
model_config: The configuration related to the LLM model.
cache_config: The configuration related to the KV cache memory
management.
parallel_config: The configuration related to distributed execution.
scheduler_config: The configuration related to the request scheduler.
device_config: The configuration related to the device.
lora_config (Optional): The configuration related to serving multi-LoRA.
speculative_config (Optional): The configuration related to speculative
decoding.
vllm_config: The configuration for initializing and running vLLM.
executor_class: The model executor class for managing distributed
execution.
prompt_adapter_config (Optional): The configuration related to serving
prompt adapters.
log_stats: Whether to log statistics.
usage_context: Specified entry point, used for usage info collection.
"""
@@ -695,11 +685,12 @@ def add_request(

Args:
request_id: The unique ID of the request.
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
prompt: The prompt to the LLM. See
[PromptType][vllm.inputs.PromptType]
for more details about the format of each input.
params: Parameters for sampling or pooling.
{class}`~vllm.SamplingParams` for text generation.
{class}`~vllm.PoolingParams` for pooling.
[SamplingParams][vllm.SamplingParams] for text generation.
[PoolingParams][vllm.PoolingParams] for pooling.
arrival_time: The arrival time of the request. If None, we use
the current monotonic time.
lora_request: The LoRA request to add.
@@ -711,10 +702,11 @@
Details:
- Set arrival_time to the current time if it is None.
- Set prompt_token_ids to the encoded prompt if it is None.
- Create `n` number of {class}`~vllm.Sequence` objects.
- Create a {class}`~vllm.SequenceGroup` object
from the list of {class}`~vllm.Sequence`.
- Add the {class}`~vllm.SequenceGroup` object to the scheduler.
- Create `n` number of [Sequence][vllm.Sequence] objects.
- Create a [SequenceGroup][vllm.SequenceGroup] object
from the list of [Sequence][vllm.Sequence].
- Add the [SequenceGroup][vllm.SequenceGroup] object to the
scheduler.

Example:
>>> # initialize engine
@@ -861,9 +853,7 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
request_id: The ID(s) of the request to abort.

Details:
- Refer to the
{meth}`~vllm.core.scheduler.Scheduler.abort_seq_group`
from class {class}`~vllm.core.scheduler.Scheduler`.
- Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][].

Example:
>>> # initialize engine and add a request with request_id
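Taken together, the add_request/abort_request docstrings describe the synchronous engine loop. A hedged usage sketch that mirrors the doctest-style example already present in the docstring (the model name and sampling values are placeholders):

```python
from vllm import EngineArgs, LLMEngine, SamplingParams

# Build the synchronous engine from EngineArgs, as the class docstring notes.
engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))

engine.add_request(
    request_id="0",
    prompt="What is the capital of France?",
    params=SamplingParams(temperature=0.0, max_tokens=32),
)

# step() runs one scheduler/executor iteration and returns outputs for the
# requests that made progress; a request could also be cancelled earlier
# via engine.abort_request("0").
while engine.has_unfinished_requests():
    for output in engine.step():
        if output.finished:
            print(output.outputs[0].text)
```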
63 changes: 32 additions & 31 deletions vllm/entrypoints/llm.py
@@ -116,7 +116,8 @@ class LLM:
to eager mode. Additionally for encoder-decoder models, if the
sequence length of the encoder input is larger than this, we fall
back to the eager mode.
disable_custom_all_reduce: See {class}`~vllm.config.ParallelConfig`
disable_custom_all_reduce: See
[ParallelConfig][vllm.config.ParallelConfig].
disable_async_output_proc: Disable async output processing.
This may result in lower performance.
hf_token: The token to use as HTTP bearer authorization for remote files
@@ -128,12 +129,12 @@
compilation_config: Either an integer or a dictionary. If it is an
integer, it is used as the level of compilation optimization. If it
is a dictionary, it can specify the full compilation configuration.
**kwargs: Arguments for {class}`~vllm.EngineArgs`. (See
{ref}`engine-args`)
**kwargs: Arguments for [EngineArgs][vllm.EngineArgs]. (See
[engine-args][])

Note:
This class is intended to be used for offline inference. For online
serving, use the {class}`~vllm.AsyncLLMEngine` class instead.
serving, use the [AsyncLLMEngine][vllm.AsyncLLMEngine] class instead.
"""

DEPRECATE_LEGACY: ClassVar[bool] = True
@@ -142,7 +143,7 @@ class LLM:
DEPRECATE_INIT_POSARGS: ClassVar[bool] = True
"""
A flag to toggle whether to deprecate positional arguments in
{meth}`LLM.__init__`.
[LLM.__init__][].
"""

@classmethod
@@ -403,7 +404,7 @@ def generate(

Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See {class}`~vllm.inputs.PromptType`
for batch inference. See [PromptType][vllm.inputs.PromptType]
for more details about the format of each prompts.
sampling_params: The sampling parameters for text generation. If
None, we use the default sampling parameters.
@@ -669,7 +670,7 @@ def chat(
Generate responses for a chat conversation.

The chat conversation is converted into a text prompt using the
tokenizer and calls the {meth}`generate` method to generate the
tokenizer and calls the [generate][] method to generate the
responses.

Multi-modal inputs can be passed in the same way you would pass them
@@ -678,8 +679,8 @@
Args:
messages: A list of conversations or a single conversation.

- Each conversation is represented as a list of messages.
- Each message is a dictionary with 'role' and 'content' keys.
- Each conversation is represented as a list of messages.
- Each message is a dictionary with 'role' and 'content' keys.

sampling_params: The sampling parameters for text generation.
If None, we use the default sampling parameters. When it
@@ -689,27 +690,27 @@
use_tqdm: Whether to use tqdm to display the progress bar.
lora_request: LoRA request to use for generation, if any.
chat_template: The template to use for structuring the chat.
If not provided, the model's default chat template will be used.
If not provided, the model's default chat template will be used.
chat_template_content_format: The format to render message content.

- "string" will render the content as a string.
Example: ``"Who are you?"``
- "openai" will render the content as a list of dictionaries,
similar to OpenAI schema.
Example: ``[{"type": "text", "text": "Who are you?"}]``
- "string" will render the content as a string.
Example: `"Who are you?"`
- "openai" will render the content as a list of dictionaries,
similar to OpenAI schema.
Example: `[{"type": "text", "text": "Who are you?"}]`

add_generation_prompt: If True, adds a generation template
to each message.
continue_final_message: If True, continues the final message in
the conversation instead of starting a new one. Cannot be
``True`` if ``add_generation_prompt`` is also ``True``.
`True` if `add_generation_prompt` is also `True`.
chat_template_kwargs: Additional kwargs to pass to the chat
template.
mm_processor_kwargs: Multimodal processor kwarg overrides for this
chat request. Only used for offline requests.

Returns:
A list of ``RequestOutput`` objects containing the generated
A list of `RequestOutput` objects containing the generated
responses in the same order as the input messages.
"""
list_of_messages: list[list[ChatCompletionMessageParam]]
@@ -908,7 +909,7 @@ def encode(

Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See {class}`~vllm.inputs.PromptType`
for batch inference. See [PromptType][vllm.inputs.PromptType]
for more details about the format of each prompts.
pooling_params: The pooling parameters for pooling. If None, we
use the default pooling parameters.
@@ -997,7 +998,7 @@ def embed(

Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See {class}`~vllm.inputs.PromptType`
for batch inference. See [PromptType][vllm.inputs.PromptType]
for more details about the format of each prompts.
pooling_params: The pooling parameters for pooling. If None, we
use the default pooling parameters.
@@ -1007,7 +1008,7 @@
generation, if any.

Returns:
A list of ``EmbeddingRequestOutput`` objects containing the
A list of `EmbeddingRequestOutput` objects containing the
embedding vectors in the same order as the input prompts.
"""
if self.llm_engine.model_config.task != "embed":
@@ -1041,15 +1042,15 @@ def classify(

Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See {class}`~vllm.inputs.PromptType`
for batch inference. See [PromptType][vllm.inputs.PromptType]
for more details about the format of each prompts.
use_tqdm: Whether to use tqdm to display the progress bar.
lora_request: LoRA request to use for generation, if any.
prompt_adapter_request: Prompt Adapter request to use for
generation, if any.

Returns:
A list of ``ClassificationRequestOutput`` objects containing the
A list of `ClassificationRequestOutput` objects containing the
embedding vectors in the same order as the input prompts.
"""
if self.llm_engine.model_config.task != "classify":
@@ -1159,29 +1160,29 @@ def score(
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[ScoringRequestOutput]:
"""Generate similarity scores for all pairs ``<text,text_pair>``.
"""Generate similarity scores for all pairs `<text,text_pair>`.

The inputs can be ``1 -> 1``, ``1 -> N`` or ``N -> N``.
In the ``1 - N`` case the ``text_1`` sentence will be replicated ``N``
times to pair with the ``text_2`` sentences.
The inputs can be `1 -> 1`, `1 -> N` or `N -> N`.
In the `1 - N` case the `text_1` sentence will be replicated `N`
times to pair with the `text_2` sentences.
The input pairs are used to build a list of prompts for the
cross encoder model. This class automatically batches the prompts,
considering the memory constraint. For the best performance, put all
of your texts into a single list and pass it to this method.

Args:
text_1: can be a single prompt or a list of prompts, in which
case it has to have the same length as the ``text_2`` list
case it has to have the same length as the `text_2` list
text_2: The texts to pair with the query to form the input
to the LLM. See {class}`~vllm.inputs.PromptType` for
to the LLM. See [PromptType][vllm.inputs.PromptType] for
more details about the format of each prompts.
use_tqdm: Whether to use tqdm to display the progress bar.
lora_request: LoRA request to use for generation, if any.
prompt_adapter_request: Prompt Adapter request to use for
generation, if any.

Returns:
A list of ``ScoringRequestOutput`` objects containing the
A list of `ScoringRequestOutput` objects containing the
generated scores in the same order as the input prompts.
"""
runner_type = self.llm_engine.model_config.runner_type
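The score() docstring above covers the `1 -> 1`, `1 -> N` and `N -> N` pairings. A hedged sketch of the `1 -> N` case (the cross-encoder model name is a placeholder, and the `output.outputs.score` attribute access is an assumption based on typical scoring output objects):

```python
from vllm import LLM

# One query paired with several candidate passages; text_1 is replicated
# to pair with each element of text_2, as documented above.
llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")

query = "What is the capital of France?"
passages = [
    "Paris is the capital and most populous city of France.",
    "The Eiffel Tower is made of iron.",
]

outputs = llm.score(query, passages)
for passage, out in zip(passages, outputs):
    # Each ScoringRequestOutput carries the similarity score for one pair.
    print(passage, "->", out.outputs.score)
```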
@@ -1282,13 +1283,13 @@ def sleep(self, level: int = 1):

def wake_up(self, tags: Optional[list[str]] = None):
"""
Wake up the engine from sleep mode. See the {meth}`sleep` method
Wake up the engine from sleep mode. See the [sleep][] method
for more details.

Args:
tags: An optional list of tags to reallocate the engine memory
for specific memory allocations. Values must be in
("weights", "kv_cache",). If None, all memory is reallocated.
`("weights", "kv_cache")`. If None, all memory is reallocated.
wake_up should be called with all tags (or None) before the
engine is used again.
"""
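For the offline workflow these LLM docstrings describe, a minimal sketch of generate() and chat() (the model name is a placeholder; chat() assumes a model that ships a chat template):

```python
from vllm import LLM, SamplingParams

# Offline batched inference with the high-level wrapper documented above.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
params = SamplingParams(temperature=0.8, max_tokens=16)

outputs = llm.generate(["Hello, my name is", "The capital of France is"], params)
for out in outputs:
    print(out.prompt, "->", out.outputs[0].text)

# chat() renders the conversation with the model's chat template (the
# "string" content format shown above) and then calls generate().
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who are you?"},
]
chat_outputs = llm.chat(conversation, params)
print(chat_outputs[0].outputs[0].text)
```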
2 changes: 1 addition & 1 deletion vllm/multimodal/__init__.py
@@ -12,7 +12,7 @@
dispatch data processing according to the target model.

Info:
{ref}`mm-processing`
[mm-processing][]
"""

__all__ = [
4 changes: 2 additions & 2 deletions vllm/multimodal/registry.py
@@ -215,7 +215,7 @@ def register_processor(
invoked to transform the data into a dictionary of model inputs.

Info:
{ref}`mm-processing`
[mm-processing][]
"""

def wrapper(model_cls: N) -> N:
@@ -260,7 +260,7 @@ def create_processor(
Create a multi-modal processor for a specific model and tokenizer.

Info:
{ref}`mm-processing`
[mm-processing][]
"""
if not model_config.is_multimodal_model:
raise ValueError(f"{model_config.model} is not a multimodal model")
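The register_processor/create_processor pair follows a decorator-based registry pattern: register_processor returns a class decorator that records a factory and hands the class back unchanged, and create_processor later invokes that factory. The toy registry below illustrates the pattern only; it is not vLLM's MultiModalRegistry API:

```python
from typing import Callable, TypeVar

_N = TypeVar("_N", bound=type)


class ToyProcessorRegistry:
    """Minimal decorator-based registry, for illustration only."""

    def __init__(self) -> None:
        self._factories: dict[type, Callable[[], object]] = {}

    def register_processor(self, factory: Callable[[], object]):
        # Class decorator: record the factory for the decorated model class,
        # then return the class unchanged.
        def wrapper(model_cls: _N) -> _N:
            self._factories[model_cls] = factory
            return model_cls

        return wrapper

    def create_processor(self, model_cls: type) -> object:
        if model_cls not in self._factories:
            raise ValueError(f"{model_cls.__name__} has no registered processor")
        return self._factories[model_cls]()


REGISTRY = ToyProcessorRegistry()


@REGISTRY.register_processor(lambda: {"kind": "image-processor"})
class MyMultiModalModel:
    pass


print(REGISTRY.create_processor(MyMultiModalModel))  # {'kind': 'image-processor'}
```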