diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 00000000000..07e613097b6
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,30 @@
+{
+  "name": "vllm",
+  "build": {
+    "dockerfile": "Dockerfile"
+  },
+  "remoteUser": "devuser",
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        // Python development
+        "ms-python.python",
+        "charliermarsh.ruff",
+        // Rust development
+        "rust-lang.rust-analyzer",
+        "tamasfe.even-better-toml"
+      ]
+    }
+  },
+  "forwardPorts": [],
+  "runArgs": [
+    "--gpus",
+    "all"
+  ],
+  // The two lines below ensure that your local changes in the vllm
+  // repo are automatically synced to the vllm pip package installed
+  // in the dev docker container. You can remove / comment out these
+  // two lines if you prefer to sync code changes manually.
+  "workspaceMount": "source=${localWorkspaceFolder},target=/vllm-workspace/vllm,type=bind",
+  "workspaceFolder": "/vllm-workspace/vllm"
+}
diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev
new file mode 100644
index 00000000000..6fc91a0beb1
--- /dev/null
+++ b/docker/Dockerfile.dev
@@ -0,0 +1,243 @@
+# Extend from the base vllm image
+FROM vllm/vllm-openai:latest
+
+# Override the base image's entrypoint with a shell
+ENTRYPOINT ["/bin/bash"]
+
+# Install development tools and utilities
+RUN apt-get update && apt-get install -y \
+    gdb \
+    ninja-build \
+    vim \
+    tmux \
+    htop \
+    wget \
+    curl \
+    locales \
+    lsof \
+    git \
+    git-lfs \
+    zsh \
+    tree \
+    silversearcher-ag \
+    cloc \
+    unzip \
+    pkg-config \
+    libssl-dev \
+    bear \
+    ccache \
+    && apt install -y rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+RUN apt update -y \
+    && apt install -y --no-install-recommends gnupg \
+    && echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64 /" | tee /etc/apt/sources.list.d/nvidia-devtools.list \
+    && apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub \
+    && apt update -y \
+    && apt install nsight-systems-cli -y
+
+# Set up locale
+RUN locale-gen en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US:en
+ENV LC_ALL en_US.UTF-8
+
+# Install minimal Python packages
+RUN python3 -m pip install --no-cache-dir \
+    pytest \
+    black \
+    isort \
+    icdiff \
+    scikit_build_core \
+    uv \
+    pre-commit
+
+# Install diff-so-fancy
+RUN curl -LSso /usr/local/bin/diff-so-fancy https://github.com/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
+    && chmod +x /usr/local/bin/diff-so-fancy
+
+# Install clang-format
+RUN curl -LSso /usr/local/bin/clang-format https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/master-32d3ac78/clang-format-16_linux-amd64 \
+    && chmod +x /usr/local/bin/clang-format
+
+# Install clangd
+RUN curl -L https://github.com/clangd/clangd/releases/download/18.1.3/clangd-linux-18.1.3.zip -o clangd.zip \
+    && unzip clangd.zip \
+    && cp -r clangd_18.1.3/bin/* /usr/local/bin/ \
+    && cp -r clangd_18.1.3/lib/* /usr/local/lib/ \
+    && rm -rf clangd_18.1.3 clangd.zip
+
+# Install CMake
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.31.1/cmake-3.31.1-linux-x86_64.tar.gz \
+    && tar -xzf cmake-3.31.1-linux-x86_64.tar.gz \
+    && cp -r cmake-3.31.1-linux-x86_64/bin/* /usr/local/bin/ \
+    && cp -r cmake-3.31.1-linux-x86_64/share/* /usr/local/share/ \
+    && rm -rf cmake-3.31.1-linux-x86_64 cmake-3.31.1-linux-x86_64.tar.gz
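+
+# The yank script below copies stdin to the terminal clipboard via an OSC 52
+# escape sequence (wrapped in a tmux passthrough when needed), so text yanked
+# inside the container reaches the local clipboard even over SSH. The payload
+# is base64-encoded and capped at 74994 bytes, which most terminals accept.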
+
+# Add yank script
+RUN echo '#!/bin/bash' > /usr/local/bin/yank && \
+    echo 'put() {' >> /usr/local/bin/yank && \
+    echo '  esc=$1' >> /usr/local/bin/yank && \
+    echo '  test -n "$TMUX" -o -z "${TERM##screen*}" && esc="\033Ptmux;\033$esc\033\\"' >> /usr/local/bin/yank && \
+    echo '  printf "$esc"' >> /usr/local/bin/yank && \
+    echo '}' >> /usr/local/bin/yank && \
+    echo 'put "\033]52;c;!\a"' >> /usr/local/bin/yank && \
+    echo 'buf=$( cat "$@" )' >> /usr/local/bin/yank && \
+    echo 'len=$( printf %s "$buf" | wc -c ) max=74994' >> /usr/local/bin/yank && \
+    echo 'test $len -gt $max && echo "$0: input is $(( len - max )) bytes too long" >&2' >> /usr/local/bin/yank && \
+    echo 'put "\033]52;c;$( printf %s "$buf" | head -c $max | base64 | tr -d '\''\r\n'\'' )\a"' >> /usr/local/bin/yank && \
+    echo 'test -n "$TMUX" && tmux set-buffer "$buf" ||:' >> /usr/local/bin/yank && \
+    chmod +x /usr/local/bin/yank
+
+# Install oh-my-zsh and plugins
+RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended \
+    && git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions \
+    && git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-syntax-highlighting
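+
+# The Vim setup below pipes every yank through the yank script (via a
+# TextYankPost autocmd), so text copied in Vim also lands on the local
+# clipboard.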
+
+# Configure Vim
+RUN echo 'function! Yank(text) abort' > /root/.vimrc && \
+    echo '  let escape = system("yank", a:text)' >> /root/.vimrc && \
+    echo '  if v:shell_error' >> /root/.vimrc && \
+    echo '    echoerr escape' >> /root/.vimrc && \
+    echo '  else' >> /root/.vimrc && \
+    echo '    call writefile([escape], "/dev/tty", "b")' >> /root/.vimrc && \
+    echo '  endif' >> /root/.vimrc && \
+    echo 'endfunction' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo 'noremap <Leader>y y:<C-U>call Yank(@0)<CR>' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" automatically run yank(1) whenever yanking in Vim' >> /root/.vimrc && \
+    echo 'function! CopyYank() abort' >> /root/.vimrc && \
+    echo '  call Yank(join(v:event.regcontents, "\n"))' >> /root/.vimrc && \
+    echo 'endfunction' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo 'autocmd TextYankPost * call CopyYank()' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" Basic settings' >> /root/.vimrc && \
+    echo 'set number' >> /root/.vimrc && \
+    echo 'syntax on' >> /root/.vimrc && \
+    echo 'set mouse=a' >> /root/.vimrc && \
+    echo 'filetype indent on' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" Indentation' >> /root/.vimrc && \
+    echo 'set autoindent nosmartindent' >> /root/.vimrc && \
+    echo 'set smarttab' >> /root/.vimrc && \
+    echo 'set expandtab' >> /root/.vimrc && \
+    echo 'set shiftwidth=4' >> /root/.vimrc && \
+    echo 'set softtabstop=4' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" Visual guides' >> /root/.vimrc && \
+    echo 'set colorcolumn=120' >> /root/.vimrc && \
+    echo 'highlight ColorColumn ctermbg=5' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" Status line' >> /root/.vimrc && \
+    echo 'set laststatus=2' >> /root/.vimrc && \
+    echo 'set statusline=%<%f\ %h%m%r%=%{\"[\".(&fenc==\"\"?&enc:&fenc).((exists(\"+bomb\")\ &&\ &bomb)?\",B\":\"\").\"]\ \"}%k\ %-14.(%l,%c%V%)\ %P' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" Backspace behavior' >> /root/.vimrc && \
+    echo 'set backspace=2' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" Encoding' >> /root/.vimrc && \
+    echo 'set encoding=utf-8' >> /root/.vimrc && \
+    echo 'set fileencoding=utf-8' >> /root/.vimrc
+
+# Configure tmux
+RUN echo '# Pane border styling' > /root/.tmux.conf && \
+    echo 'set -g pane-border-style fg=#742727,bg=black' >> /root/.tmux.conf && \
+    echo 'set -g pane-active-border-style fg=red,bg=black' >> /root/.tmux.conf && \
+    echo '' >> /root/.tmux.conf && \
+    echo '# Status bar styling' >> /root/.tmux.conf && \
+    echo 'set -g status-style bg=#0C8A92,fg=black' >> /root/.tmux.conf && \
+    echo '' >> /root/.tmux.conf && \
+    echo '# Change prefix key to backtick' >> /root/.tmux.conf && \
+    echo 'set-option -g prefix `' >> /root/.tmux.conf && \
+    echo 'unbind C-b' >> /root/.tmux.conf && \
+    echo 'bind-key ` send-prefix' >> /root/.tmux.conf && \
+    echo '' >> /root/.tmux.conf && \
+    echo '# Split panes using - and = with current path' >> /root/.tmux.conf && \
+    echo 'unbind \"' >> /root/.tmux.conf && \
+    echo 'bind - splitw -v -c \"#{pane_current_path}\"' >> /root/.tmux.conf && \
+    echo 'unbind %' >> /root/.tmux.conf && \
+    echo 'bind = splitw -h -c \"#{pane_current_path}\"' >> /root/.tmux.conf && \
+    echo '' >> /root/.tmux.conf && \
+    echo '# Vi mode settings' >> /root/.tmux.conf && \
+    echo 'bind-key -T copy-mode-vi Y send-keys -X copy-pipe \"yank > #{pane_tty}\"' >> /root/.tmux.conf && \
+    echo 'set-window-option -g mode-keys vi' >> /root/.tmux.conf && \
+    echo '' >> /root/.tmux.conf && \
+    echo '# Other settings' >> /root/.tmux.conf && \
+    echo 'set-option -g escape-time 0' >> /root/.tmux.conf && \
+    echo 'set-option -g base-index 1' >> /root/.tmux.conf && \
+    echo 'set-window-option -g mouse on' >> /root/.tmux.conf
color."diff-highlight".oldHighlight "red bold 52" \ + && git config --global color."diff-highlight".newNormal "green bold" \ + && git config --global color."diff-highlight".newHighlight "green bold 22" \ + && git config --global color.diff.meta "11" \ + && git config --global color.diff.frag "magenta bold" \ + && git config --global color.diff.commit "yellow bold" \ + && git config --global color.diff.old "red bold" \ + && git config --global color.diff.new "green bold" \ + && git config --global color.diff.whitespace "red reverse" \ + && git config --global alias.lg "log --color --graph --pretty=format:'%Cred%h%Creset - %s %Cgreen(%cr) %C(bold blue)<%an>%Creset%C(auto)%d%Creset' --abbrev-commit --" \ + && git config --global http.sslVerify false \ + && git config --global pull.rebase true + +# Configure zsh +COPY --chown=root:root <<-"EOF" /root/.zshrc +export ZSH="/root/.oh-my-zsh" + +# Theme +ZSH_THEME="robbyrussell" + +# Plugins +plugins=( + git + z + zsh-autosuggestions + zsh-syntax-highlighting +) + +source $ZSH/oh-my-zsh.sh + +# Aliases +alias ll='ls -alF' +alias la='ls -A' +alias l='ls -CF' +alias vi='vim' + +# Enhanced history +HISTSIZE=10000 +SAVEHIST=10000 +setopt HIST_IGNORE_ALL_DUPS +setopt HIST_FIND_NO_DUPS +setopt INC_APPEND_HISTORY +EOF + +# Set workspace directory +WORKDIR /vllm-workspace + +RUN git clone --depth=1 https://github.com/vllm-project/vllm.git + +# Create .devcontainer and .vscode directories +RUN mkdir -p /vllm-workspace/vllm/.devcontainer /vllm-workspace/vllm/.vscode + +# Copy .devcontainer and .vscode from host to container +COPY .devcontainer /vllm-workspace/vllm/.devcontainer/ + +ENV PYTHONPATH=/vllm-workspace/vllm diff --git a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py new file mode 100644 index 00000000000..5a619f8b765 --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py @@ -0,0 +1,131 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from transformers import AutoTokenizer + +from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation +from vllm.entrypoints.openai.tool_parsers.llama_tool_parser import ( + Llama3JsonToolParser) + + +@pytest.fixture +def parser(): + # Use a small tokenizer for testing + tokenizer = AutoTokenizer.from_pretrained("gpt2") + return Llama3JsonToolParser(tokenizer) + + +def test_extract_tool_calls_simple(parser): + # Test with a simple tool call + model_output = ('Here is the result: {"name": "getOpenIncidentsTool", ' + '"parameters": {}} Would you like to know more?') + result = parser.extract_tool_calls(model_output, None) + + assert isinstance(result, ExtractedToolCallInformation) + assert result.tools_called is True + assert len(result.tool_calls) == 1 + assert result.tool_calls[0].type == "function" + assert result.tool_calls[0].function.name == "getOpenIncidentsTool" + assert result.tool_calls[0].function.arguments == "{}" + assert result.content is None + + +def test_extract_tool_calls_with_arguments(parser): + # Test with a tool call that has arguments + model_output = ( + '{"name": "searchTool", "parameters": {"query": "test query", ' + '"limit": 10}}') + result = parser.extract_tool_calls(model_output, None) + + assert result.tools_called is True + assert len(result.tool_calls) == 1 + assert result.tool_calls[0].function.name == "searchTool" + assert '"query": "test query"' in result.tool_calls[0].function.arguments + assert '"limit": 10' in 
+
+
+def test_extract_tool_calls_no_json(parser):
+    # Test with text that doesn't contain a JSON object
+    model_output = "This is just some text without any tool calls"
+    result = parser.extract_tool_calls(model_output, None)
+
+    assert result.tools_called is False
+    assert len(result.tool_calls) == 0
+    assert result.content == model_output
+
+
+def test_extract_tool_calls_invalid_json(parser):
+    # Test with invalid JSON
+    model_output = '{"name": "invalidTool", "parameters": {invalid json}'
+    result = parser.extract_tool_calls(model_output, None)
+
+    assert result.tools_called is False
+    assert len(result.tool_calls) == 0
+    assert result.content == model_output
+
+
+def test_extract_tool_calls_with_arguments_key(parser):
+    # Test with a tool call that uses "arguments" instead of "parameters"
+    model_output = '{"name": "searchTool", "arguments": {"query": "test"}}'
+    result = parser.extract_tool_calls(model_output, None)
+
+    assert result.tools_called is True
+    assert len(result.tool_calls) == 1
+    assert result.tool_calls[0].function.name == "searchTool"
+    assert '"query": "test"' in result.tool_calls[0].function.arguments
+
+
+def test_extract_tool_calls_multiple_json(parser):
+    # Test with multiple JSONs separated by semicolons
+    model_output = (
+        '{"name": "searchTool", "parameters": {"query": "test1"}}; '
+        '{"name": "getOpenIncidentsTool", "parameters": {}}; '
+        '{"name": "searchTool", "parameters": {"query": "test2"}}')
+    result = parser.extract_tool_calls(model_output, None)
+
+    assert result.tools_called is True
+    assert len(result.tool_calls) == 3
+
+    # Check first tool call
+    assert result.tool_calls[0].function.name == "searchTool"
+    assert '"query": "test1"' in result.tool_calls[0].function.arguments
+
+    # Check second tool call
+    assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
+    assert result.tool_calls[1].function.arguments == "{}"
+
+    # Check third tool call
+    assert result.tool_calls[2].function.name == "searchTool"
+    assert '"query": "test2"' in result.tool_calls[2].function.arguments
+
+
+def test_extract_tool_calls_multiple_json_with_whitespace(parser):
+    # Test with multiple JSONs separated by semicolons and extra whitespace
+    model_output = (
+        '{"name": "searchTool", "parameters": {"query": "test1"}} ; '
+        '{"name": "getOpenIncidentsTool", "parameters": {}} ; '
+        '{"name": "searchTool", "parameters": {"query": "test2"}}')
+    result = parser.extract_tool_calls(model_output, None)
+
+    assert result.tools_called is True
+    assert len(result.tool_calls) == 3
+    assert result.tool_calls[0].function.name == "searchTool"
+    assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
+    assert result.tool_calls[2].function.name == "searchTool"
+
+
+def test_extract_tool_calls_multiple_json_with_surrounding_text(parser):
+    # Test with multiple JSONs and surrounding text
+    model_output = (
+        'Here are the results: '
+        '{"name": "searchTool", "parameters": {"query": "test1"}}; '
+        '{"name": "getOpenIncidentsTool", "parameters": {}}; '
+        '{"name": "searchTool", "parameters": {"query": "test2"}} '
+        'Would you like to know more?')
+    result = parser.extract_tool_calls(model_output, None)
+
+    assert result.tools_called is True
+    assert len(result.tool_calls) == 3
+    assert result.tool_calls[0].function.name == "searchTool"
+    assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
+    assert result.tool_calls[2].function.name == "searchTool"
diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py
new file mode 100644
index 00000000000..f3871b60c3f
--- /dev/null
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for mllama4's multimodal preprocessing and profiling."""
+import pytest
+from torch import prod
+from transformers import Llama4Config
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.profiling import MultiModalProfiler
+
+from ...utils import build_model_context
+
+
+@pytest.mark.parametrize("model_id", ["meta-llama/Llama-Guard-4-12B"])
+@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
+def test_profiling(model_id: str, max_model_len: int):
+    model_config_kwargs = {
+        "max_model_len": max_model_len,
+    }
+    ctx = build_model_context(
+        model_id,
+        model_config_kwargs=model_config_kwargs,
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    mm_config = ctx.get_mm_config()
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    profiler = MultiModalProfiler(processor)
+
+    decoder_dummy_data = profiler.get_decoder_dummy_data(
+        max_model_len,
+        mm_counts=mm_config.limit_per_prompt,
+    )
+    dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
+        max_model_len,
+        mm_counts=mm_config.limit_per_prompt,
+    )
+
+    hf_config = ctx.get_hf_config(Llama4Config)
+
+    mm_kwargs = processor.apply(
+        prompt=dummy_mm_data.prompt,
+        mm_data=dummy_mm_data.mm_data,
+        hf_processor_mm_kwargs=dict(),
+    )["mm_kwargs"]
+
+    image_size = hf_config.vision_config.image_size
+    patch_size = hf_config.vision_config.patch_size
+    downsample_ratio = int(
+        round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2)))
+    tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio
+    chunks_per_image = prod(mm_kwargs["patches_per_image"])
+    total_num_patches = chunks_per_image * tokens_per_patch
+    # One x-y separator token per tile
+    num_tiles = (mm_kwargs["aspect_ratios"][0][0] *
+                 mm_kwargs["aspect_ratios"][0][1])
+    # Plus the image start, image, and image end tokens
+    total_tokens = total_num_patches.item() + num_tiles.item() + 3
+
+    profiled_tokens = profiler.get_mm_max_contiguous_tokens(
+        max_model_len,
+        mm_counts=mm_config.limit_per_prompt,
+    )
+
+    assert total_tokens == profiled_tokens["image"]
+    assert total_tokens == sum(
+        placeholder.length for placeholder in
+        decoder_dummy_data.multi_modal_placeholders["image"])
diff --git a/tests/models/registry.py b/tests/models/registry.py
index e6543c19734..acfe91f46cb 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -337,7 +337,9 @@ def check_available_online(
                                     trust_remote_code=True,
                                     v0_only=True),
     "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",  # noqa: E501
-                                                      max_model_len=10240),
+                                                      max_model_len=10240,
+                                                      extras={"llama-guard-4": "meta-llama/Llama-Guard-4-12B"},  # noqa: E501
+                                                      ),
     "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
                                                      extras={"mistral": "mistral-community/pixtral-12b",  # noqa: E501
                                                              "mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}),  # noqa: E501
diff --git a/vllm/entrypoints/openai/middleware.py b/vllm/entrypoints/openai/middleware.py
new file mode 100644
index 00000000000..dab6a4270b1
--- /dev/null
+++ b/vllm/entrypoints/openai/middleware.py
@@ -0,0 +1,30 @@
+from fastapi import Request
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
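+
+
+# Starlette-style HTTP middleware: the handler receives the request plus a
+# call_next callable and logs the opc-request-id header at the start and end
+# of each POST request, so server logs can be correlated with client logs.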
+async def log_opc_header(request: Request, call_next):
+    # Log at the start and end of a POST request
+    if request.method == "POST":
+        opc_request_id = request.headers.get("opc-request-id", "unknown")
+        logger.info("POST Request Start - opc-request-id: %s", opc_request_id)
+
+        try:
+            response = await call_next(request)
+            logger.info(
+                "POST Request End - opc-request-id: %s, status_code: %s",
+                opc_request_id, response.status_code)
+            return response
+        except Exception as e:
+            logger.error(
+                "Exception during POST request with "
+                "opc-request-id: %s, error: %s", opc_request_id, e)
+            raise
+
+    # For non-POST requests, just pass through
+    return await call_next(request)
diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
index 5698bc70af2..07b992dad31 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -3,7 +3,6 @@
 import json
 from collections.abc import Sequence
-from json import JSONDecoder
 from typing import Union
 
 import partial_json_parser
@@ -31,11 +30,11 @@
 @ToolParserManager.register_module("llama4_json")
 class Llama3JsonToolParser(ToolParser):
     """
-    Tool call parser for Llama 3.1 models intended for use with the
+    Tool call parser for Llama 3.x and 4 models intended for use with the
     examples/tool_chat_template_llama.jinja template.
 
-    Used when --enable-auto-tool-choice --tool-call-parser llama3_json
-    are all set
+    Used when --enable-auto-tool-choice --tool-call-parser llama3_json or
+    llama4_json are set.
     """
 
     def __init__(self, tokenizer: PreTrainedTokenizerBase):
@@ -51,54 +50,61 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase):
         self.bot_token = "<|python_tag|>"
         self.bot_token_id = tokenizer.encode(self.bot_token,
                                              add_special_tokens=False)[0]
-        self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL)
+        # Match one or more JSON objects separated by semicolons. The
+        # pattern handles objects nested one level deep (e.g. a flat
+        # "parameters" dict).
+        self.tool_call_regex = re.compile(
+            r'{[^{}]*(?:{[^{}]*}[^{}]*)*}(?:\s*;\s*{[^{}]*(?:{[^{}]*}[^{}]*)*})*',
+            re.DOTALL)
 
     def extract_tool_calls(
             self, model_output: str,
             request: ChatCompletionRequest) -> ExtractedToolCallInformation:
         """
         Extract the tool calls from a complete model response.
+        Only extracts JSON content and ignores any surrounding plain text.
+        Supports both single JSON and multiple JSONs separated by semicolons.
         """
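+        # e.g. 'text {"name": "f1", "parameters": {"x": 1}}; {"name": "f2",
+        # "parameters": {}} text' yields two tool calls and content=None.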
""" - # case -- if a tool call token is not present, return a text response - if not (model_output.startswith(self.bot_token) - or model_output.startswith('{')): + # Quick check before running regex + if not (self.bot_token in model_output or '{' in model_output): + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + # Find JSON object(s) in the text using regex + match = self.tool_call_regex.search(model_output) + if not match: return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) try: - # load the JSON, and then use it to build the Function and - # Tool Call - dec = JSONDecoder() - function_call_arr = [] - - # depending on the prompt format the Llama model may or may not - # prefix the output with the <|python_tag|> token - start_idx = len(self.bot_token) if model_output.startswith( - self.bot_token) else 0 - while start_idx < len(model_output): - (obj, end_idx) = dec.raw_decode(model_output[start_idx:]) - start_idx += end_idx + len('; ') - function_call_arr.append(obj) - - tool_calls: list[ToolCall] = [ - ToolCall( - type="function", - function=FunctionCall( - name=raw_function_call["name"], - # function call args are JSON but as a string - arguments=json.dumps(raw_function_call["arguments"] \ - if "arguments" in raw_function_call \ - else raw_function_call["parameters"], - ensure_ascii=False))) - for raw_function_call in function_call_arr - ] - - # get any content before the tool call - ret = ExtractedToolCallInformation(tools_called=True, - tool_calls=tool_calls, - content=None) - return ret + json_str = match.group(0) + # Split by semicolon and strip whitespace + json_objects = [obj.strip() for obj in json_str.split(';')] + + tool_calls: list[ToolCall] = [] + for json_obj in json_objects: + if not json_obj: # Skip empty strings + continue + obj = json.loads(json_obj) + tool_calls.append( + ToolCall( + type="function", + function=FunctionCall( + name=obj["name"], + # function call args are JSON but as a string + arguments=json.dumps(obj["arguments"] \ + if "arguments" in obj \ + else obj["parameters"]))) + ) + + return ExtractedToolCallInformation(tools_called=True, + tool_calls=tool_calls, + content=None) except Exception: logger.exception("Error in extracting tool call from response.") diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 1faecb7bd24..1e77e50334d 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -175,11 +175,14 @@ def _get_dummy_mm_inputs( def _get_mm_num_tokens( self, mm_inputs: MultiModalInputs, + mm_embeddings_only: bool = True, ) -> Mapping[str, int]: placeholders_by_modality = mm_inputs["mm_placeholders"] return { - modality: sum(item.get_num_embeds() for item in placeholders) + modality: + sum(item.get_num_embeds() if mm_embeddings_only else item.length + for item in placeholders) for modality, placeholders in placeholders_by_modality.items() } @@ -248,11 +251,34 @@ def get_decoder_dummy_data( multi_modal_placeholders=mm_inputs["mm_placeholders"], ) - def get_mm_max_tokens( + def _get_mm_max_tokens( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + mm_embeddings_only: bool = True, ) -> Mapping[str, int]: mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) - return self._get_mm_num_tokens(mm_inputs) + return self._get_mm_num_tokens(mm_inputs, + mm_embeddings_only=mm_embeddings_only) + + def get_mm_max_contiguous_tokens( + self, + seq_len: int, + mm_counts: Optional[Mapping[str, int]] = None, + ): 
+ """ + Returns the maximum length of the multimodal (image placeholders+text) + tokens, including any break/text tokens in-between image embeddings. + + [IMG] [IMG] [IMG] [IMG] [IMG] [IMG] + Returns 9, even when the number of image embeddings is 6. + + This is important to take into account when profiling and + initializing the encoder cache size. + """ + + return self._get_mm_max_tokens(seq_len, + mm_counts, + mm_embeddings_only=False) +>>>>>>> 0e36abf99 ([Bugfix] Correct max tokens for non-contiguous embeds (#21798)) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 27aaa661c35..c9a2a60afea 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -129,7 +129,7 @@ def get_max_tokens_per_item_by_modality( seq_len = model_config.max_model_len mm_limits = self.get_mm_limits_per_prompt(model_config) - return profiler.get_mm_max_tokens( + return profiler.get_mm_max_contiguous_tokens( seq_len, { modality: 1