6 changes: 5 additions & 1 deletion components/backends/vllm/src/dynamo/vllm/args.py
@@ -207,7 +207,11 @@ def overwrite_args(config):

     defaults = {
         "task": "generate",
-        "skip_tokenizer_init": True,
+        # As of vLLM >=0.10.0 the engine unconditionally calls
+        # `sampling_params.update_from_tokenizer(...)`, so we can no longer
+        # skip tokenizer initialisation. Setting this to **False** avoids
+        # a NoneType error when the processor accesses the tokenizer.
+        "skip_tokenizer_init": False,
         "disable_log_requests": True,
         # KV routing relies on logging KV metrics
         "disable_log_stats": False,
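The flipped default is easiest to see as the defaults-override pattern that `overwrite_args` implements. Below is a runnable, condensed sketch of that pattern; `EngineConfig` is a hypothetical stand-in for vLLM's engine-args object, not a real vLLM class:

from dataclasses import dataclass

# Hedged sketch of the overwrite_args pattern shown above.
# EngineConfig is a made-up stand-in for vLLM's engine-args object.
@dataclass
class EngineConfig:
    task: str = "auto"
    skip_tokenizer_init: bool = True
    disable_log_requests: bool = False
    disable_log_stats: bool = True

defaults = {
    "task": "generate",
    "skip_tokenizer_init": False,  # vLLM >= 0.10.0 always touches the tokenizer
    "disable_log_requests": True,
    "disable_log_stats": False,  # KV routing needs the KV metrics published
}

def overwrite_args(config: EngineConfig) -> None:
    for key, value in defaults.items():
        setattr(config, key, value)

config = EngineConfig()
overwrite_args(config)
assert config.skip_tokenizer_init is False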
2 changes: 2 additions & 0 deletions components/backends/vllm/src/dynamo/vllm/handlers.py
@@ -110,6 +110,8 @@ async def generate(self, request):
         prompt = TokensPrompt(prompt_token_ids=request["token_ids"])

         sampling_params = SamplingParams(**self.default_sampling_params)
+
+        sampling_params.detokenize = False
         for key, value in request["sampling_options"].items():
             if value is not None and hasattr(sampling_params, key):
                 setattr(sampling_params, key, value)
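With the tokenizer now initialised, the engine would also detokenise every output unless told not to; Dynamo streams token ids and converts them to text downstream, so the handler opts out explicitly. A self-contained sketch of that request path follows; the payload values are purely illustrative, while the request shape and the attribute names come from the diff above:

from vllm import SamplingParams
from vllm.inputs import TokensPrompt

# Illustrative payload; the {"token_ids", "sampling_options"} shape
# matches the handler above, the values are made up.
request = {
    "token_ids": [101, 2009, 2003],
    "sampling_options": {"temperature": 0.7, "top_p": None},
}

prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
sampling_params = SamplingParams()
# Dynamo detokenises downstream, so keep the engine returning raw ids.
sampling_params.detokenize = False
for key, value in request["sampling_options"].items():
    if value is not None and hasattr(sampling_params, key):
        setattr(sampling_params, key, value)  # unset options (top_p) are skipped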
6 changes: 5 additions & 1 deletion components/backends/vllm/src/dynamo/vllm/publisher.py
@@ -25,6 +25,7 @@ def record(
         self,
         scheduler_stats: Optional[SchedulerStats],
         iteration_stats: Optional[IterationStats],
+        engine_idx: int = 0,
     ):
         pass

@@ -51,7 +52,10 @@ def set_num_request_total_slots(self, request_total_slots):
         self.request_total_slots = request_total_slots

     def record(
-        self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]
+        self,
+        scheduler_stats: SchedulerStats,
+        iteration_stats: Optional[IterationStats],
+        engine_idx: int = 0,
     ):
         # request_total_slots and kv_total_blocks are properties of model + gpu
         # we should only publish them once, not every metric update
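The extra parameter mirrors the call signature vLLM 0.10.0 uses when it invokes stat loggers, and the `= 0` default keeps older single-engine call sites working. A minimal sketch of a logger with the widened signature; the stats types here are stand-ins for the SchedulerStats/IterationStats classes in the diff:

from typing import Any, Optional

class StatPublisherSketch:
    """Sketch: a stat logger accepting the engine_idx vLLM now passes."""

    def record(
        self,
        scheduler_stats: Any,  # stand-in for SchedulerStats
        iteration_stats: Optional[Any],  # stand-in for IterationStats
        engine_idx: int = 0,  # defaulted so older callers need no change
    ) -> None:
        print(f"engine {engine_idx}: scheduler={scheduler_stats!r}")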
28 changes: 10 additions & 18 deletions container/Dockerfile.vllm
@@ -10,16 +10,15 @@ ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
 ARG RELEASE_BUILD
 ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
 ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
-ARG VLLM_REF="059d4cd"
-ARG TORCH_BACKEND="cu128"
-
-# After this commit deepgemm API changed
-# 1.0.0 -> 2.0.0
-ARG DEEPGEMM_REF="03d0be3"
-ARG FLASHINF_REF="1d72ed4"
-
-# Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_VERSION="0.9.2"
+ARG VLLM_REF="v0.10.0"
+ARG TORCH_BACKEND="cu128"
+
+# Match 0.10.0 vLLM release
+# https://github.com/vllm-project/vllm/releases/tag/v0.10.0
+ARG DEEPGEMM_REF="1876566"
+ARG FLASHINF_REF="v0.2.8rc1"

 # Define general architecture ARGs for supporting both x86 and aarch64 builds.
 # ARCH: Used for package suffixes (e.g., amd64, arm64)
@@ -42,11 +41,10 @@ ARG ARCH_ALT=x86_64

 FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base

-# Redeclare ARCH, ARCH_ALT, TORCH_BACKEND, VLLM_VERSION so they're available in this stage
+# Redeclare ARCH, ARCH_ALT, TORCH_BACKEND so they're available in this stage
 ARG ARCH
 ARG ARCH_ALT
 ARG TORCH_BACKEND
-ARG VLLM_VERSION

 USER root
 ARG PYTHON_VERSION=3.12
@@ -195,15 +193,11 @@ ENV CUDA_HOME=/usr/local/cuda

 RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
     --mount=type=cache,target=/root/.cache/uv \
-    if [ "$ARCH" = "arm64" ]; then \
     # TODO - split vllm, DeepEP, DeepGeMM, PPLX installs
     # Should be able to select how you want your build to go
     cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
     chmod +x /tmp/install_vllm.sh && \
-    /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND; \
-    else \
-    uv pip install "vllm==${VLLM_VERSION}"; \
-    fi
+    /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND;

 ENV LD_LIBRARY_PATH=\
 /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
@@ -464,9 +458,7 @@ COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
 COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/

 # Copies vllm, DeepEP, DeepGEMM, PPLX repos (all editable installs) and nvshmem binaries
-RUN if [ "$ARCH" = "arm64" ]; then \
-    COPY --from=base /opt/vllm /opt/vllm; \
-    fi
+COPY --from=base /opt/vllm /opt/vllm

 ENV LD_LIBRARY_PATH=\
 /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
6 changes: 3 additions & 3 deletions container/deps/vllm/install_vllm.sh
@@ -20,12 +20,12 @@ set -euo pipefail

 # Parse arguments
 EDITABLE=true
-VLLM_REF="059d4cd"
+VLLM_REF="v0.10.0"
 MAX_JOBS=16
 INSTALLATION_DIR=/tmp
 ARCH=$(uname -m)
-DEEPGEMM_REF="6c9558e"
-FLASHINF_REF="1d72ed4"
+DEEPGEMM_REF="1876566"
+FLASHINF_REF="v0.2.8rc1"
 TORCH_BACKEND="cu128"

 # Convert x86_64 to amd64 for consistency with Docker ARG
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -67,7 +67,7 @@ trtllm =[
 vllm = [
     "uvloop",
     "nixl",
-    "vllm==0.9.2",
+    "vllm==0.10.0",
 ]

 sglang = [
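This pin has to move in lockstep with VLLM_REF in Dockerfile.vllm and install_vllm.sh. A cheap fail-fast guard can catch drift at import time; the sketch below assumes only that the installed vllm package exposes `__version__`:

import vllm

# Fail fast if the environment drifted from the 0.10.x pin.
if not vllm.__version__.startswith("0.10."):
    raise RuntimeError(
        f"expected vLLM 0.10.x to match the pyproject pin, found {vllm.__version__}"
    )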
2 changes: 1 addition & 1 deletion tests/serve/test_vllm.py
@@ -59,7 +59,7 @@ class VLLMConfig:
     endpoints: List[str]
     response_handlers: List[Callable[[Any], str]]
     model: str
-    timeout: int = 60
+    timeout: int = 120
     delayed_start: int = 0
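For reference, how the doubled default lands at a call site. Everything below except the `timeout` default is invented for illustration; real tests can still pass an explicit timeout:

# Illustrative instantiation; endpoint, handler, and model are made up.
config = VLLMConfig(
    endpoints=["v1/chat/completions"],
    response_handlers=[lambda resp: resp["choices"][0]["message"]["content"]],
    model="example-org/example-model",
)
assert config.timeout == 120  # new default; override per test when needed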