diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 00000000000..07e613097b6
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,30 @@
+{
+  "name": "vllm",
+  "build": {
+    "dockerfile": "Dockerfile"
+  },
+  "remoteUser": "devuser",
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        // Python development
+        "ms-python.python",
+        "charliermarsh.ruff",
+        // Rust development
+        "rust-lang.rust-analyzer",
+        "tamasfe.even-better-toml"
+      ]
+    }
+  },
+  "forwardPorts": [],
+  "runArgs": [
+    "--gpus",
+    "all"
+  ],
+  // The two lines below ensure that your local changes in the vllm
+  // repo are automatically synced to the vllm pip package installed
+  // in the dev docker container. You can remove / comment out these
+  // two lines if you prefer to sync code changes manually.
+  "workspaceMount": "source=${localWorkspaceFolder},target=/vllm-workspace/vllm,type=bind",
+  "workspaceFolder": "/vllm-workspace/vllm"
+}
diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev
new file mode 100644
index 00000000000..6fc91a0beb1
--- /dev/null
+++ b/docker/Dockerfile.dev
@@ -0,0 +1,243 @@
+# Extend from the base vllm image
+FROM vllm/vllm-openai:latest
+
+# Override the base image's entrypoint with a shell
+ENTRYPOINT ["/bin/bash"]
+
+# Install development tools and utilities
+RUN apt-get update && apt-get install -y \
+    gdb \
+    ninja-build \
+    vim \
+    tmux \
+    htop \
+    wget \
+    curl \
+    locales \
+    lsof \
+    git \
+    git-lfs \
+    zsh \
+    tree \
+    silversearcher-ag \
+    cloc \
+    unzip \
+    pkg-config \
+    libssl-dev \
+    bear \
+    ccache \
+    && apt install -y rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+RUN apt update -y \
+    && apt install -y --no-install-recommends gnupg \
+    && echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64 /" | tee /etc/apt/sources.list.d/nvidia-devtools.list \
+    && apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub \
+    && apt update -y \
+    && apt install nsight-systems-cli -y
+
+# Set up locale
+RUN locale-gen en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US:en
+ENV LC_ALL en_US.UTF-8
+
+# Install minimal Python packages
+RUN python3 -m pip install --no-cache-dir \
+    pytest \
+    black \
+    isort \
+    icdiff \
+    scikit_build_core \
+    uv \
+    pre-commit
+
+# Install diff-so-fancy
+RUN curl -LSso /usr/local/bin/diff-so-fancy https://github.com/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
+    && chmod +x /usr/local/bin/diff-so-fancy
+
+# Install clang-format
+RUN curl -LSso /usr/local/bin/clang-format https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/master-32d3ac78/clang-format-16_linux-amd64 \
+    && chmod +x /usr/local/bin/clang-format
+
+# Install clangd
+RUN curl -L https://github.com/clangd/clangd/releases/download/18.1.3/clangd-linux-18.1.3.zip -o clangd.zip \
+    && unzip clangd.zip \
+    && cp -r clangd_18.1.3/bin/* /usr/local/bin/ \
+    && cp -r clangd_18.1.3/lib/* /usr/local/lib/ \
+    && rm -rf clangd_18.1.3 clangd.zip
+
+# Install CMake
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.31.1/cmake-3.31.1-linux-x86_64.tar.gz \
+    && tar -xzf cmake-3.31.1-linux-x86_64.tar.gz \
+    && cp -r cmake-3.31.1-linux-x86_64/bin/* /usr/local/bin/ \
+    && cp -r cmake-3.31.1-linux-x86_64/share/* /usr/local/share/ \
+    && rm -rf cmake-3.31.1-linux-x86_64 cmake-3.31.1-linux-x86_64.tar.gz
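+
+# The yank script below copies stdin to the terminal clipboard via an OSC 52
+# escape sequence (wrapped in a tmux passthrough when needed), so text yanked
+# inside the container reaches the local clipboard even over SSH. The payload
+# is base64-encoded and capped at 74994 bytes, which most terminals accept.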
+
+# Add yank script
+RUN echo '#!/bin/bash' > /usr/local/bin/yank && \
+    echo 'put() {' >> /usr/local/bin/yank && \
+    echo '  esc=$1' >> /usr/local/bin/yank && \
+    echo '  test -n "$TMUX" -o -z "${TERM##screen*}" && esc="\033Ptmux;\033$esc\033\\"' >> /usr/local/bin/yank && \
+    echo '  printf "$esc"' >> /usr/local/bin/yank && \
+    echo '}' >> /usr/local/bin/yank && \
+    echo 'put "\033]52;c;!\a"' >> /usr/local/bin/yank && \
+    echo 'buf=$( cat "$@" )' >> /usr/local/bin/yank && \
+    echo 'len=$( printf %s "$buf" | wc -c ) max=74994' >> /usr/local/bin/yank && \
+    echo 'test $len -gt $max && echo "$0: input is $(( len - max )) bytes too long" >&2' >> /usr/local/bin/yank && \
+    echo 'put "\033]52;c;$( printf %s "$buf" | head -c $max | base64 | tr -d '\''\r\n'\'' )\a"' >> /usr/local/bin/yank && \
+    echo 'test -n "$TMUX" && tmux set-buffer "$buf" ||:' >> /usr/local/bin/yank && \
+    chmod +x /usr/local/bin/yank
+
+# Install oh-my-zsh and plugins
+RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended \
+    && git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions \
+    && git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-syntax-highlighting
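+
+# The Vim setup below pipes every yank through the yank script (via a
+# TextYankPost autocmd), so text copied in Vim also lands on the local
+# clipboard.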
+
+# Configure Vim
+RUN echo 'function! Yank(text) abort' > /root/.vimrc && \
+    echo '  let escape = system("yank", a:text)' >> /root/.vimrc && \
+    echo '  if v:shell_error' >> /root/.vimrc && \
+    echo '    echoerr escape' >> /root/.vimrc && \
+    echo '  else' >> /root/.vimrc && \
+    echo '    call writefile([escape], "/dev/tty", "b")' >> /root/.vimrc && \
+    echo '  endif' >> /root/.vimrc && \
+    echo 'endfunction' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo 'noremap <Leader>y y:<C-U>call Yank(@0)<CR>' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" automatically run yank(1) whenever yanking in Vim' >> /root/.vimrc && \
+    echo 'function! CopyYank() abort' >> /root/.vimrc && \
+    echo '  call Yank(join(v:event.regcontents, "\n"))' >> /root/.vimrc && \
+    echo 'endfunction' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo 'autocmd TextYankPost * call CopyYank()' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" Basic settings' >> /root/.vimrc && \
+    echo 'set number' >> /root/.vimrc && \
+    echo 'syntax on' >> /root/.vimrc && \
+    echo 'set mouse=a' >> /root/.vimrc && \
+    echo 'filetype indent on' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" Indentation' >> /root/.vimrc && \
+    echo 'set autoindent nosmartindent' >> /root/.vimrc && \
+    echo 'set smarttab' >> /root/.vimrc && \
+    echo 'set expandtab' >> /root/.vimrc && \
+    echo 'set shiftwidth=4' >> /root/.vimrc && \
+    echo 'set softtabstop=4' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" Visual guides' >> /root/.vimrc && \
+    echo 'set colorcolumn=120' >> /root/.vimrc && \
+    echo 'highlight ColorColumn ctermbg=5' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" Status line' >> /root/.vimrc && \
+    echo 'set laststatus=2' >> /root/.vimrc && \
+    echo 'set statusline=%<%f\ %h%m%r%=%{\"[\".(&fenc==\"\"?&enc:&fenc).((exists(\"+bomb\")\ &&\ &bomb)?\",B\":\"\").\"]\ \"}%k\ %-14.(%l,%c%V%)\ %P' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" Backspace behavior' >> /root/.vimrc && \
+    echo 'set backspace=2' >> /root/.vimrc && \
+    echo '' >> /root/.vimrc && \
+    echo '" Encoding' >> /root/.vimrc && \
+    echo 'set encoding=utf-8' >> /root/.vimrc && \
+    echo 'set fileencoding=utf-8' >> /root/.vimrc
+
+# Configure tmux
+RUN echo '# Pane border styling' > /root/.tmux.conf && \
+    echo 'set -g pane-border-style fg=#742727,bg=black' >> /root/.tmux.conf && \
+    echo 'set -g pane-active-border-style fg=red,bg=black' >> /root/.tmux.conf && \
+    echo '' >> /root/.tmux.conf && \
+    echo '# Status bar styling' >> /root/.tmux.conf && \
+    echo 'set -g status-style bg=#0C8A92,fg=black' >> /root/.tmux.conf && \
+    echo '' >> /root/.tmux.conf && \
+    echo '# Change prefix key to backtick' >> /root/.tmux.conf && \
+    echo 'set-option -g prefix `' >> /root/.tmux.conf && \
+    echo 'unbind C-b' >> /root/.tmux.conf && \
+    echo 'bind-key ` send-prefix' >> /root/.tmux.conf && \
+    echo '' >> /root/.tmux.conf && \
+    echo '# Split panes using - and = with current path' >> /root/.tmux.conf && \
+    echo 'unbind \"' >> /root/.tmux.conf && \
+    echo 'bind - splitw -v -c \"#{pane_current_path}\"' >> /root/.tmux.conf && \
+    echo 'unbind %' >> /root/.tmux.conf && \
+    echo 'bind = splitw -h -c \"#{pane_current_path}\"' >> /root/.tmux.conf && \
+    echo '' >> /root/.tmux.conf && \
+    echo '# Vi mode settings' >> /root/.tmux.conf && \
+    echo 'bind-key -T copy-mode-vi Y send-keys -X copy-pipe \"yank > #{pane_tty}\"' >> /root/.tmux.conf && \
+    echo 'set-window-option -g mode-keys vi' >> /root/.tmux.conf && \
+    echo '' >> /root/.tmux.conf && \
+    echo '# Other settings' >> /root/.tmux.conf && \
+    echo 'set-option -g escape-time 0' >> /root/.tmux.conf && \
+    echo 'set-option -g base-index 1' >> /root/.tmux.conf && \
+    echo 'set-window-option -g mouse on' >> /root/.tmux.conf
color."diff-highlight".oldHighlight "red bold 52" \ + && git config --global color."diff-highlight".newNormal "green bold" \ + && git config --global color."diff-highlight".newHighlight "green bold 22" \ + && git config --global color.diff.meta "11" \ + && git config --global color.diff.frag "magenta bold" \ + && git config --global color.diff.commit "yellow bold" \ + && git config --global color.diff.old "red bold" \ + && git config --global color.diff.new "green bold" \ + && git config --global color.diff.whitespace "red reverse" \ + && git config --global alias.lg "log --color --graph --pretty=format:'%Cred%h%Creset - %s %Cgreen(%cr) %C(bold blue)<%an>%Creset%C(auto)%d%Creset' --abbrev-commit --" \ + && git config --global http.sslVerify false \ + && git config --global pull.rebase true + +# Configure zsh +COPY --chown=root:root <<-"EOF" /root/.zshrc +export ZSH="/root/.oh-my-zsh" + +# Theme +ZSH_THEME="robbyrussell" + +# Plugins +plugins=( + git + z + zsh-autosuggestions + zsh-syntax-highlighting +) + +source $ZSH/oh-my-zsh.sh + +# Aliases +alias ll='ls -alF' +alias la='ls -A' +alias l='ls -CF' +alias vi='vim' + +# Enhanced history +HISTSIZE=10000 +SAVEHIST=10000 +setopt HIST_IGNORE_ALL_DUPS +setopt HIST_FIND_NO_DUPS +setopt INC_APPEND_HISTORY +EOF + +# Set workspace directory +WORKDIR /vllm-workspace + +RUN git clone --depth=1 https://github.com/vllm-project/vllm.git + +# Create .devcontainer and .vscode directories +RUN mkdir -p /vllm-workspace/vllm/.devcontainer /vllm-workspace/vllm/.vscode + +# Copy .devcontainer and .vscode from host to container +COPY .devcontainer /vllm-workspace/vllm/.devcontainer/ + +ENV PYTHONPATH=/vllm-workspace/vllm diff --git a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py new file mode 100644 index 00000000000..5a619f8b765 --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py @@ -0,0 +1,131 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from transformers import AutoTokenizer + +from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation +from vllm.entrypoints.openai.tool_parsers.llama_tool_parser import ( + Llama3JsonToolParser) + + +@pytest.fixture +def parser(): + # Use a small tokenizer for testing + tokenizer = AutoTokenizer.from_pretrained("gpt2") + return Llama3JsonToolParser(tokenizer) + + +def test_extract_tool_calls_simple(parser): + # Test with a simple tool call + model_output = ('Here is the result: {"name": "getOpenIncidentsTool", ' + '"parameters": {}} Would you like to know more?') + result = parser.extract_tool_calls(model_output, None) + + assert isinstance(result, ExtractedToolCallInformation) + assert result.tools_called is True + assert len(result.tool_calls) == 1 + assert result.tool_calls[0].type == "function" + assert result.tool_calls[0].function.name == "getOpenIncidentsTool" + assert result.tool_calls[0].function.arguments == "{}" + assert result.content is None + + +def test_extract_tool_calls_with_arguments(parser): + # Test with a tool call that has arguments + model_output = ( + '{"name": "searchTool", "parameters": {"query": "test query", ' + '"limit": 10}}') + result = parser.extract_tool_calls(model_output, None) + + assert result.tools_called is True + assert len(result.tool_calls) == 1 + assert result.tool_calls[0].function.name == "searchTool" + assert '"query": "test query"' in result.tool_calls[0].function.arguments + assert '"limit": 10' in 
+
+
+def test_extract_tool_calls_no_json(parser):
+    # Test with text that doesn't contain a JSON object
+    model_output = "This is just some text without any tool calls"
+    result = parser.extract_tool_calls(model_output, None)
+
+    assert result.tools_called is False
+    assert len(result.tool_calls) == 0
+    assert result.content == model_output
+
+
+def test_extract_tool_calls_invalid_json(parser):
+    # Test with invalid JSON
+    model_output = '{"name": "invalidTool", "parameters": {invalid json}'
+    result = parser.extract_tool_calls(model_output, None)
+
+    assert result.tools_called is False
+    assert len(result.tool_calls) == 0
+    assert result.content == model_output
+
+
+def test_extract_tool_calls_with_arguments_key(parser):
+    # Test with a tool call that uses "arguments" instead of "parameters"
+    model_output = '{"name": "searchTool", "arguments": {"query": "test"}}'
+    result = parser.extract_tool_calls(model_output, None)
+
+    assert result.tools_called is True
+    assert len(result.tool_calls) == 1
+    assert result.tool_calls[0].function.name == "searchTool"
+    assert '"query": "test"' in result.tool_calls[0].function.arguments
+
+
+def test_extract_tool_calls_multiple_json(parser):
+    # Test with multiple JSONs separated by semicolons
+    model_output = (
+        '{"name": "searchTool", "parameters": {"query": "test1"}}; '
+        '{"name": "getOpenIncidentsTool", "parameters": {}}; '
+        '{"name": "searchTool", "parameters": {"query": "test2"}}')
+    result = parser.extract_tool_calls(model_output, None)
+
+    assert result.tools_called is True
+    assert len(result.tool_calls) == 3
+
+    # Check first tool call
+    assert result.tool_calls[0].function.name == "searchTool"
+    assert '"query": "test1"' in result.tool_calls[0].function.arguments
+
+    # Check second tool call
+    assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
+    assert result.tool_calls[1].function.arguments == "{}"
+
+    # Check third tool call
+    assert result.tool_calls[2].function.name == "searchTool"
+    assert '"query": "test2"' in result.tool_calls[2].function.arguments
+
+
+def test_extract_tool_calls_multiple_json_with_whitespace(parser):
+    # Test with multiple JSONs separated by semicolons and extra whitespace
+    model_output = (
+        '{"name": "searchTool", "parameters": {"query": "test1"}} ; '
+        '{"name": "getOpenIncidentsTool", "parameters": {}} ; '
+        '{"name": "searchTool", "parameters": {"query": "test2"}}')
+    result = parser.extract_tool_calls(model_output, None)
+
+    assert result.tools_called is True
+    assert len(result.tool_calls) == 3
+    assert result.tool_calls[0].function.name == "searchTool"
+    assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
+    assert result.tool_calls[2].function.name == "searchTool"
+
+
+def test_extract_tool_calls_multiple_json_with_surrounding_text(parser):
+    # Test with multiple JSONs and surrounding text
+    model_output = (
+        'Here are the results: '
+        '{"name": "searchTool", "parameters": {"query": "test1"}}; '
+        '{"name": "getOpenIncidentsTool", "parameters": {}}; '
+        '{"name": "searchTool", "parameters": {"query": "test2"}} '
+        'Would you like to know more?')
+    result = parser.extract_tool_calls(model_output, None)
+
+    assert result.tools_called is True
+    assert len(result.tool_calls) == 3
+    assert result.tool_calls[0].function.name == "searchTool"
+    assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
+    assert result.tool_calls[2].function.name == "searchTool"
diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py
new file mode 100644
index 00000000000..f3871b60c3f
--- /dev/null
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for mllama4's multimodal preprocessing and profiling."""
+import pytest
+from torch import prod
+from transformers import Llama4Config
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.profiling import MultiModalProfiler
+
+from ...utils import build_model_context
+
+
+@pytest.mark.parametrize("model_id", ["meta-llama/Llama-Guard-4-12B"])
+@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
+def test_profiling(model_id: str, max_model_len: int):
+    model_config_kwargs = {
+        "max_model_len": max_model_len,
+    }
+    ctx = build_model_context(
+        model_id,
+        model_config_kwargs=model_config_kwargs,
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    mm_config = ctx.get_mm_config()
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    profiler = MultiModalProfiler(processor)
+
+    decoder_dummy_data = profiler.get_decoder_dummy_data(
+        max_model_len,
+        mm_counts=mm_config.limit_per_prompt,
+    )
+    dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
+        max_model_len,
+        mm_counts=mm_config.limit_per_prompt,
+    )
+
+    hf_config = ctx.get_hf_config(Llama4Config)
+
+    mm_kwargs = processor.apply(
+        prompt=dummy_mm_data.prompt,
+        mm_data=dummy_mm_data.mm_data,
+        hf_processor_mm_kwargs=dict(),
+    )["mm_kwargs"]
+
+    image_size = hf_config.vision_config.image_size
+    patch_size = hf_config.vision_config.patch_size
+    downsample_ratio = int(
+        round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2)))
+    tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio
+    chunks_per_image = prod(mm_kwargs["patches_per_image"])
+    total_num_patches = chunks_per_image * tokens_per_patch
+    # One x-y separator token per tile
+    num_tiles = (mm_kwargs["aspect_ratios"][0][0] *
+                 mm_kwargs["aspect_ratios"][0][1])
+    # Plus the image start, image, and image end tokens
+    total_tokens = total_num_patches.item() + num_tiles.item() + 3
+
+    profiled_tokens = profiler.get_mm_max_contiguous_tokens(
+        max_model_len,
+        mm_counts=mm_config.limit_per_prompt,
+    )
+
+    assert total_tokens == profiled_tokens["image"]
+    assert total_tokens == sum(
+        placeholder.length for placeholder in
+        decoder_dummy_data.multi_modal_placeholders["image"])
diff --git a/tests/models/registry.py b/tests/models/registry.py
index e6543c19734..acfe91f46cb 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -337,7 +337,9 @@ def check_available_online(
                                     trust_remote_code=True,
                                     v0_only=True),
     "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",  # noqa: E501
-                                                      max_model_len=10240),
+                                                      max_model_len=10240,
+                                                      extras={"llama-guard-4": "meta-llama/Llama-Guard-4-12B"},  # noqa: E501
+                                                      ),
     "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
                                                      extras={"mistral": "mistral-community/pixtral-12b",  # noqa: E501
                                                              "mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}),  # noqa: E501
diff --git a/vllm/entrypoints/openai/middleware.py b/vllm/entrypoints/openai/middleware.py
new file mode 100644
index 00000000000..dab6a4270b1
--- /dev/null
+++ b/vllm/entrypoints/openai/middleware.py
@@ -0,0 +1,30 @@
+from fastapi import Request
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
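+
+
+# Starlette-style HTTP middleware: the handler receives the request plus a
+# call_next callable and logs the opc-request-id header at the start and end
+# of each POST request, so server logs can be correlated with client logs.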
+async def log_opc_header(request: Request, call_next):
+    # Log at the start and end of a POST request
+    if request.method == "POST":
+        opc_request_id = request.headers.get("opc-request-id", "unknown")
+        logger.info("POST Request Start - opc-request-id: %s", opc_request_id)
+
+        try:
+            response = await call_next(request)
+            logger.info(
+                "POST Request End - opc-request-id: %s, status_code: %s",
+                opc_request_id, response.status_code)
+            return response
+        except Exception as e:
+            logger.error(
+                "Exception during POST request with "
+                "opc-request-id: %s, error: %s", opc_request_id, e)
+            raise
+
+    # For non-POST requests, just pass through
+    return await call_next(request)
diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
index 5698bc70af2..07b992dad31 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -3,7 +3,6 @@
 import json
 from collections.abc import Sequence
-from json import JSONDecoder
 from typing import Union
 
 import partial_json_parser
@@ -31,11 +30,11 @@
 @ToolParserManager.register_module("llama4_json")
 class Llama3JsonToolParser(ToolParser):
     """
-    Tool call parser for Llama 3.1 models intended for use with the
+    Tool call parser for Llama 3.x and 4 models intended for use with the
     examples/tool_chat_template_llama.jinja template.
 
-    Used when --enable-auto-tool-choice --tool-call-parser llama3_json
-    are all set
+    Used when --enable-auto-tool-choice --tool-call-parser llama3_json or
+    llama4_json are set.
     """
 
     def __init__(self, tokenizer: PreTrainedTokenizerBase):
@@ -51,54 +50,61 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase):
         self.bot_token = "<|python_tag|>"
         self.bot_token_id = tokenizer.encode(self.bot_token,
                                              add_special_tokens=False)[0]
-        self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL)
+        # Match one or more JSON objects separated by semicolons. The
+        # pattern handles objects nested one level deep (e.g. a flat
+        # "parameters" dict).
+        self.tool_call_regex = re.compile(
+            r'{[^{}]*(?:{[^{}]*}[^{}]*)*}(?:\s*;\s*{[^{}]*(?:{[^{}]*}[^{}]*)*})*',
+            re.DOTALL)
 
     def extract_tool_calls(
             self, model_output: str,
             request: ChatCompletionRequest) -> ExtractedToolCallInformation:
         """
         Extract the tool calls from a complete model response.
+        Only extracts JSON content and ignores any surrounding plain text.
+        Supports both single JSON and multiple JSONs separated by semicolons.
         """
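+        # e.g. 'text {"name": "f1", "parameters": {"x": 1}}; {"name": "f2",
+        # "parameters": {}} text' yields two tool calls and content=None.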
""" - # case -- if a tool call token is not present, return a text response - if not (model_output.startswith(self.bot_token) - or model_output.startswith('{')): + # Quick check before running regex + if not (self.bot_token in model_output or '{' in model_output): + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + # Find JSON object(s) in the text using regex + match = self.tool_call_regex.search(model_output) + if not match: return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) try: - # load the JSON, and then use it to build the Function and - # Tool Call - dec = JSONDecoder() - function_call_arr = [] - - # depending on the prompt format the Llama model may or may not - # prefix the output with the <|python_tag|> token - start_idx = len(self.bot_token) if model_output.startswith( - self.bot_token) else 0 - while start_idx < len(model_output): - (obj, end_idx) = dec.raw_decode(model_output[start_idx:]) - start_idx += end_idx + len('; ') - function_call_arr.append(obj) - - tool_calls: list[ToolCall] = [ - ToolCall( - type="function", - function=FunctionCall( - name=raw_function_call["name"], - # function call args are JSON but as a string - arguments=json.dumps(raw_function_call["arguments"] \ - if "arguments" in raw_function_call \ - else raw_function_call["parameters"], - ensure_ascii=False))) - for raw_function_call in function_call_arr - ] - - # get any content before the tool call - ret = ExtractedToolCallInformation(tools_called=True, - tool_calls=tool_calls, - content=None) - return ret + json_str = match.group(0) + # Split by semicolon and strip whitespace + json_objects = [obj.strip() for obj in json_str.split(';')] + + tool_calls: list[ToolCall] = [] + for json_obj in json_objects: + if not json_obj: # Skip empty strings + continue + obj = json.loads(json_obj) + tool_calls.append( + ToolCall( + type="function", + function=FunctionCall( + name=obj["name"], + # function call args are JSON but as a string + arguments=json.dumps(obj["arguments"] \ + if "arguments" in obj \ + else obj["parameters"]))) + ) + + return ExtractedToolCallInformation(tools_called=True, + tool_calls=tool_calls, + content=None) except Exception: logger.exception("Error in extracting tool call from response.") diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 1faecb7bd24..1e77e50334d 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -175,11 +175,14 @@ def _get_dummy_mm_inputs( def _get_mm_num_tokens( self, mm_inputs: MultiModalInputs, + mm_embeddings_only: bool = True, ) -> Mapping[str, int]: placeholders_by_modality = mm_inputs["mm_placeholders"] return { - modality: sum(item.get_num_embeds() for item in placeholders) + modality: + sum(item.get_num_embeds() if mm_embeddings_only else item.length + for item in placeholders) for modality, placeholders in placeholders_by_modality.items() } @@ -248,11 +251,34 @@ def get_decoder_dummy_data( multi_modal_placeholders=mm_inputs["mm_placeholders"], ) - def get_mm_max_tokens( + def _get_mm_max_tokens( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + mm_embeddings_only: bool = True, ) -> Mapping[str, int]: mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) - return self._get_mm_num_tokens(mm_inputs) + return self._get_mm_num_tokens(mm_inputs, + mm_embeddings_only=mm_embeddings_only) + + def get_mm_max_contiguous_tokens( + self, + seq_len: int, + mm_counts: Optional[Mapping[str, int]] = None, + ): 
+ """ + Returns the maximum length of the multimodal (image placeholders+text) + tokens, including any break/text tokens in-between image embeddings. + + [IMG] [IMG] [IMG] [IMG] [IMG] [IMG] + Returns 9, even when the number of image embeddings is 6. + + This is important to take into account when profiling and + initializing the encoder cache size. + """ + + return self._get_mm_max_tokens(seq_len, + mm_counts, + mm_embeddings_only=False) +>>>>>>> 0e36abf99 ([Bugfix] Correct max tokens for non-contiguous embeds (#21798)) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 27aaa661c35..c9a2a60afea 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -129,7 +129,7 @@ def get_max_tokens_per_item_by_modality( seq_len = model_config.max_model_len mm_limits = self.get_mm_limits_per_prompt(model_config) - return profiler.get_mm_max_tokens( + return profiler.get_mm_max_contiguous_tokens( seq_len, { modality: 1