6 changes: 5 additions & 1 deletion components/backends/vllm/src/dynamo/vllm/args.py
@@ -207,7 +207,11 @@ def overwrite_args(config):

     defaults = {
         "task": "generate",
-        "skip_tokenizer_init": True,
+        # As of vLLM >=0.10.0 the engine unconditionally calls
+        # `sampling_params.update_from_tokenizer(...)`, so we can no longer
+        # skip tokenizer initialisation. Setting this to **False** avoids
+        # a NoneType error when the processor accesses the tokenizer.
+        "skip_tokenizer_init": False,
         "disable_log_requests": True,
         # KV routing relies on logging KV metrics
         "disable_log_stats": False,
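The flipped default is easiest to see as the defaults-override pattern that `overwrite_args` implements. Below is a runnable, condensed sketch of that pattern; `EngineConfig` is a hypothetical stand-in for vLLM's engine-args object, not a real vLLM class:

from dataclasses import dataclass

# Hedged sketch of the overwrite_args pattern shown above.
# EngineConfig is a made-up stand-in for vLLM's engine-args object.
@dataclass
class EngineConfig:
    task: str = "auto"
    skip_tokenizer_init: bool = True
    disable_log_requests: bool = False
    disable_log_stats: bool = True

defaults = {
    "task": "generate",
    "skip_tokenizer_init": False,  # vLLM >= 0.10.0 always touches the tokenizer
    "disable_log_requests": True,
    "disable_log_stats": False,  # KV routing needs the KV metrics published
}

def overwrite_args(config: EngineConfig) -> None:
    for key, value in defaults.items():
        setattr(config, key, value)

config = EngineConfig()
overwrite_args(config)
assert config.skip_tokenizer_init is False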
2 changes: 2 additions & 0 deletions components/backends/vllm/src/dynamo/vllm/handlers.py
@@ -110,6 +110,8 @@ async def generate(self, request):
         prompt = TokensPrompt(prompt_token_ids=request["token_ids"])

         sampling_params = SamplingParams(**self.default_sampling_params)
+
+        sampling_params.detokenize = False
         for key, value in request["sampling_options"].items():
             if value is not None and hasattr(sampling_params, key):
                 setattr(sampling_params, key, value)
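With the tokenizer now initialised, the engine would also detokenise every output unless told not to; Dynamo streams token ids and converts them to text downstream, so the handler opts out explicitly. A self-contained sketch of that request path follows; the payload values are purely illustrative, while the request shape and the attribute names come from the diff above:

from vllm import SamplingParams
from vllm.inputs import TokensPrompt

# Illustrative payload; the {"token_ids", "sampling_options"} shape
# matches the handler above, the values are made up.
request = {
    "token_ids": [101, 2009, 2003],
    "sampling_options": {"temperature": 0.7, "top_p": None},
}

prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
sampling_params = SamplingParams()
# Dynamo detokenises downstream, so keep the engine returning raw ids.
sampling_params.detokenize = False
for key, value in request["sampling_options"].items():
    if value is not None and hasattr(sampling_params, key):
        setattr(sampling_params, key, value)  # unset options (top_p) are skipped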
6 changes: 5 additions & 1 deletion components/backends/vllm/src/dynamo/vllm/publisher.py
@@ -25,6 +25,7 @@ def record(
         self,
         scheduler_stats: Optional[SchedulerStats],
         iteration_stats: Optional[IterationStats],
+        engine_idx: int = 0,
     ):
         pass

@@ -51,7 +52,10 @@ def set_num_request_total_slots(self, request_total_slots):
         self.request_total_slots = request_total_slots

     def record(
-        self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]
+        self,
+        scheduler_stats: SchedulerStats,
+        iteration_stats: Optional[IterationStats],
+        engine_idx: int = 0,
     ):
         # request_total_slots and kv_total_blocks are properties of model + gpu
         # we should only publish them once, not every metric update
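The extra parameter mirrors the call signature vLLM 0.10.0 uses when it invokes stat loggers, and the `= 0` default keeps older single-engine call sites working. A minimal sketch of a logger with the widened signature; the stats types here are stand-ins for the SchedulerStats/IterationStats classes in the diff:

from typing import Any, Optional

class StatPublisherSketch:
    """Sketch: a stat logger accepting the engine_idx vLLM now passes."""

    def record(
        self,
        scheduler_stats: Any,  # stand-in for SchedulerStats
        iteration_stats: Optional[Any],  # stand-in for IterationStats
        engine_idx: int = 0,  # defaulted so older callers need no change
    ) -> None:
        print(f"engine {engine_idx}: scheduler={scheduler_stats!r}")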
28 changes: 10 additions & 18 deletions container/Dockerfile.vllm
@@ -10,16 +10,15 @@ ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
 ARG RELEASE_BUILD
 ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
 ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
-ARG VLLM_REF="059d4cd"
-ARG TORCH_BACKEND="cu128"
-
-# After this commit deepgemm API changed
-# 1.0.0 -> 2.0.0
-ARG DEEPGEMM_REF="03d0be3"
-ARG FLASHINF_REF="1d72ed4"
-
-# Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_VERSION="0.9.2"
+ARG VLLM_REF="v0.10.0"
+ARG TORCH_BACKEND="cu128"
+
+# Match 0.10.0 vLLM release
+# https://github.com/vllm-project/vllm/releases/tag/v0.10.0
+ARG DEEPGEMM_REF="1876566"
+ARG FLASHINF_REF="v0.2.8rc1"

 # Define general architecture ARGs for supporting both x86 and aarch64 builds.
 # ARCH: Used for package suffixes (e.g., amd64, arm64)
@@ -42,11 +41,10 @@ ARG ARCH_ALT=x86_64

 FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base

-# Redeclare ARCH, ARCH_ALT, TORCH_BACKEND, VLLM_VERSION so they're available in this stage
+# Redeclare ARCH, ARCH_ALT, TORCH_BACKEND so they're available in this stage
 ARG ARCH
 ARG ARCH_ALT
 ARG TORCH_BACKEND
-ARG VLLM_VERSION

 USER root
 ARG PYTHON_VERSION=3.12
@@ -195,15 +193,11 @@ ENV CUDA_HOME=/usr/local/cuda

 RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
     --mount=type=cache,target=/root/.cache/uv \
-    if [ "$ARCH" = "arm64" ]; then \
     # TODO - split vllm, DeepEP, DeepGeMM, PPLX installs
     # Should be able to select how you want your build to go
     cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
     chmod +x /tmp/install_vllm.sh && \
-    /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND; \
-    else \
-    uv pip install "vllm==${VLLM_VERSION}"; \
-    fi
+    /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND;

 ENV LD_LIBRARY_PATH=\
 /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
@@ -464,9 +458,7 @@ COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
 COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/

 # Copies vllm, DeepEP, DeepGEMM, PPLX repos (all editable installs) and nvshmem binaries
-RUN if [ "$ARCH" = "arm64" ]; then \
-    COPY --from=base /opt/vllm /opt/vllm; \
-    fi
+COPY --from=base /opt/vllm /opt/vllm

 ENV LD_LIBRARY_PATH=\
 /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
6 changes: 3 additions & 3 deletions container/deps/vllm/install_vllm.sh
@@ -20,12 +20,12 @@ set -euo pipefail

 # Parse arguments
 EDITABLE=true
-VLLM_REF="059d4cd"
+VLLM_REF="v0.10.0"
 MAX_JOBS=16
 INSTALLATION_DIR=/tmp
 ARCH=$(uname -m)
-DEEPGEMM_REF="6c9558e"
-FLASHINF_REF="1d72ed4"
+DEEPGEMM_REF="1876566"
+FLASHINF_REF="v0.2.8rc1"
 TORCH_BACKEND="cu128"

 # Convert x86_64 to amd64 for consistency with Docker ARG
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -67,7 +67,7 @@ trtllm =[
 vllm = [
     "uvloop",
     "nixl",
-    "vllm==0.9.2",
+    "vllm==0.10.0",
 ]

 sglang = [
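This pin has to move in lockstep with VLLM_REF in Dockerfile.vllm and install_vllm.sh. A cheap fail-fast guard can catch drift at import time; the sketch below assumes only that the installed vllm package exposes `__version__`:

import vllm

# Fail fast if the environment drifted from the 0.10.x pin.
if not vllm.__version__.startswith("0.10."):
    raise RuntimeError(
        f"expected vLLM 0.10.x to match the pyproject pin, found {vllm.__version__}"
    )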
2 changes: 1 addition & 1 deletion tests/serve/test_vllm.py
@@ -59,7 +59,7 @@ class VLLMConfig:
     endpoints: List[str]
     response_handlers: List[Callable[[Any], str]]
     model: str
-    timeout: int = 60
+    timeout: int = 120
     delayed_start: int = 0
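For reference, how the doubled default lands at a call site. Everything below except the `timeout` default is invented for illustration; real tests can still pass an explicit timeout:

# Illustrative instantiation; endpoint, handler, and model are made up.
config = VLLMConfig(
    endpoints=["v1/chat/completions"],
    response_handlers=[lambda resp: resp["choices"][0]["message"]["content"]],
    model="example-org/example-model",
)
assert config.timeout == 120  # new default; override per test when needed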