NixOS · happysalada · Nov 23, 2025 · Nov 18, 2025 · daniel-fahey · Nov 22, 2025
diff --git a/pkgs/development/python-modules/vllm/0002-setup.py-nix-support-respect-cmakeFlags.patch b/pkgs/development/python-modules/vllm/0002-setup.py-nix-support-respect-cmakeFlags.patch
@@ -1,19 +1,10 @@
-From 10b7e8330bdba319a4162cceb8e5dd4280215b04 Mon Sep 17 00:00:00 2001
-From: SomeoneSerge <else@someonex.net>
-Date: Wed, 31 Jul 2024 12:06:15 +0000
-Subject: [PATCH 2/2] setup.py: nix-support (respect cmakeFlags)
-
----
- setup.py | 10 ++++++++++
- 1 file changed, 10 insertions(+)
-
 diff --git a/setup.py b/setup.py
-index 01e006f9..14762146 100644
+index e9b36e2a2..bc9e2f1e3 100644
 --- a/setup.py
 +++ b/setup.py
-@@ -15,6 +15,15 @@ from setuptools import Extension, find_packages, setup
- from setuptools.command.build_ext import build_ext
- from torch.utils.cpp_extension import CUDA_HOME
+@@ -20,6 +20,15 @@ from setuptools.command.build_ext import build_ext
+ from setuptools_scm import get_version
+ from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
 
 +import os
 +import json
@@ -27,14 +18,11 @@ index 01e006f9..14762146 100644
 
  def load_module_from_path(module_name, path):
      spec = importlib.util.spec_from_file_location(module_name, path)
-@@ -159,6 +168,7 @@ class cmake_build_ext(build_ext):
-             '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'.format(outdir),
-             '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'.format(self.build_temp),
-             '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
+@@ -151,6 +160,7 @@ class cmake_build_ext(build_ext):
+         cmake_args = [
+             "-DCMAKE_BUILD_TYPE={}".format(cfg),
+             "-DVLLM_TARGET_DEVICE={}".format(VLLM_TARGET_DEVICE),
 +            *NIX_ATTRS["cmakeFlags"],
          ]
 
          verbose = envs.VERBOSE
--- 
-2.45.1
-
diff --git a/pkgs/development/python-modules/vllm/0003-propagate-pythonpath.patch b/pkgs/development/python-modules/vllm/0003-propagate-pythonpath.patch
@@ -1,12 +1,13 @@
 diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
-index 81623def..2a6e2c92 100644
+index a2de597c8..4c2410209 100644
 --- a/vllm/model_executor/models/registry.py
 +++ b/vllm/model_executor/models/registry.py
-@@ -521,6 +521,7 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T:
+@@ -1121,7 +1121,7 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T:
+         # cannot use `sys.executable __file__` here because the script
          # contains relative imports
-         returned = subprocess.run(_SUBPROCESS_COMMAND,
-                                   input=input_bytes,
-+                                  env={'PYTHONPATH': ':'.join(sys.path)},
-                                   capture_output=True)
+         returned = subprocess.run(
+-            _SUBPROCESS_COMMAND, input=input_bytes, capture_output=True
++            _SUBPROCESS_COMMAND, input=input_bytes, capture_output=True, env={'PYTHONPATH': ':'.join(sys.path)},
+         )
 
          # check if the subprocess is successful
diff --git a/pkgs/development/python-modules/vllm/0005-drop-intel-reqs.patch b/pkgs/development/python-modules/vllm/0005-drop-intel-reqs.patch
@@ -1,8 +1,8 @@
 diff --git a/requirements/cpu.txt b/requirements/cpu.txt
-index 2db6d87ee..37f816170 100644
+index d11787df4..71575d707 100644
 --- a/requirements/cpu.txt
 +++ b/requirements/cpu.txt
-@@ -21,9 +21,6 @@ torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
+@@ -20,9 +20,6 @@ torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
  torchvision==0.23.0; platform_machine == "ppc64le"
  datasets # for benchmark scripts
 

diff --git a/pkgs/development/python-modules/vllm/default.nix b/pkgs/development/python-modules/vllm/default.nix
@@ -34,6 +34,7 @@
   uvicorn,
   pydantic,
   aioprometheus,
+  anthropic,
   nvidia-ml-py,
   openai,
   pyzmq,
@@ -53,6 +54,7 @@
   compressed-tensors,
   mistral-common,
   msgspec,
+  model-hosting-container-standards,
   numactl,
   tokenizers,
   oneDNN,
@@ -98,10 +100,11 @@ let
   # see CMakeLists.txt, grepping for CUTLASS_REVISION
   # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
   cutlass = fetchFromGitHub {
+    name = "cutlass-source";
     owner = "NVIDIA";
     repo = "cutlass";
-    tag = "v4.0.0";
-    hash = "sha256-HJY+Go1viPkSVZPEs/NyMtYJzas4mMLiIZF3kNX+WgA=";
+    tag = "v4.2.1";
+    hash = "sha256-iP560D5Vwuj6wX1otJhwbvqe/X4mYVeKTpK533Wr5gY=";
   };
 
   # FlashMLA's Blackwell (SM100) kernels were developed against CUTLASS v3.9.0
@@ -126,10 +129,11 @@ let
     # grep for GIT_TAG in the following file
     # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
     src = fetchFromGitHub {
+      name = "FlashMLA-source";
       owner = "vllm-project";
       repo = "FlashMLA";
-      rev = "5f65b85703c7ed75fda01e06495077caad207c3f";
-      hash = "sha256-DO9EFNSoAgyfRRc095v1UjT+Zdzk4cFY0+n28FVEwI0=";
+      rev = "46d64a8ebef03fa50b4ae74937276a5c940e3f95";
+      hash = "sha256-jtMzWB5hKz8mJGsdK6q4YpQbGp9IrQxbwmB3a64DIl0=";
     };
 
     dontConfigure = true;
@@ -145,6 +149,16 @@ let
     '';
   };
 
+  # grep for GIT_TAG in the following file
+  # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/qutlass.cmake
+  qutlass = fetchFromGitHub {
+    name = "qutlass-source";
+    owner = "IST-DASLab";
+    repo = "qutlass";
+    rev = "830d2c4537c7396e14a02a46fbddd18b5d107c65";
+    hash = "sha256-aG4qd0vlwP+8gudfvHwhtXCFmBOJKQQTvcwahpEqC84=";
+  };
+
   vllm-flash-attn' = lib.defaultTo (stdenv.mkDerivation {
     pname = "vllm-flash-attn";
     # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
@@ -153,10 +167,11 @@ let
     # grep for GIT_TAG in the following file
     # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
     src = fetchFromGitHub {
+      name = "flash-attention-source";
       owner = "vllm-project";
       repo = "flash-attention";
-      rev = "ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a";
-      hash = "sha256-2r0Habd/kBpvM4/aQFIYyj+uQAa3M9gjk3DcBZHFNfA=";
+      rev = "58e0626a692f09241182582659e3bf8f16472659";
+      hash = "sha256-ewdZd7LuBKBV0y3AaGRWISJzjg6cu59D2OtgqoDjrbM=";
     };
 
     patches = [
@@ -284,7 +299,7 @@ in
 
 buildPythonPackage rec {
   pname = "vllm";
-  version = "0.11.0";
+  version = "0.11.2";
   pyproject = true;
 
   stdenv = torch.stdenv;
@@ -293,38 +308,31 @@ buildPythonPackage rec {
     owner = "vllm-project";
     repo = "vllm";
     tag = "v${version}";
-    hash = "sha256-47TPvvPQvVbh6Gm2yvi+xhWZ8tSma91rp9hp/SBrEY8=";
+    hash = "sha256-DoSlkFmR3KKEtfSfdRB++0CZeeXgxmM3zZjONlxbe8U=";
   };
 
   patches = [
     ./0002-setup.py-nix-support-respect-cmakeFlags.patch
     ./0003-propagate-pythonpath.patch
     ./0005-drop-intel-reqs.patch
-    # TODO: Remove the below patches when included in vLLM release
-    (fetchpatch {
-      url = "https://github.com/vllm-project/vllm/commit/9705fba7b727a3b9c275b012258608531e2223d1.patch";
-      hash = "sha256-DxRGLiwkegMlMjqFmFc0igpaVv06/Y2WjL+ISoIOET4=";
-    })
-    # patch above is previous commit needed to apply patch below
-    # oneDNN / CPU fix from https://github.com/vllm-project/vllm/pull/26401
-    (fetchpatch {
-      url = "https://github.com/vllm-project/vllm/commit/d7be1f2a480bdc62a6a1ec0126a401e3d42985fe.patch";
-      hash = "sha256-Zi1k5wiOPjsbWHFKpcLq9Ns43wIP37Mbvesi5K80zaQ=";
-    })
   ];
 
   postPatch = ''
     # Remove vendored pynvml entirely
     rm vllm/third_party/pynvml.py
     substituteInPlace tests/utils.py \
-      --replace-fail "from vllm.third_party.pynvml import" "from pynvml import"
-    substituteInPlace vllm/utils/__init__.py \
-      --replace-fail "import vllm.third_party.pynvml" "import pynvml"
+      --replace-fail \
+        "from vllm.third_party.pynvml import" \
+        "from pynvml import"
+    substituteInPlace vllm/utils/import_utils.py \
+      --replace-fail \
+        "import vllm.third_party.pynvml as pynvml" \
+        "import pynvml"
 
     # pythonRelaxDeps does not cover build-system
     substituteInPlace pyproject.toml \
       --replace-fail "torch ==" "torch >=" \
-      --replace-fail "setuptools>=77.0.3,<80.0.0" "setuptools"
+      --replace-fail "setuptools>=77.0.3,<81.0.0" "setuptools"
 
     # Ignore the python version check because it hard-codes minor versions and
     # lags behind `ray`'s python interpreter support
@@ -393,6 +401,7 @@ buildPythonPackage rec {
 
   dependencies = [
     aioprometheus
+    anthropic
     blake3
     cachetools
     cbor2
@@ -424,6 +433,7 @@ buildPythonPackage rec {
     partial-json-parser
     compressed-tensors
     mistral-common
+    model-hosting-container-standards
     torch
     torchaudio
     torchvision
@@ -460,6 +470,7 @@ buildPythonPackage rec {
     (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
     (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
     (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn'}")
+    (lib.cmakeFeature "QUTLASS_SRC_DIR" "${lib.getDev qutlass}")
     (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
     (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
     (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {