vllm-project · mgoin · May 2, 2026 · May 2, 2026 · May 2, 2026 · May 2, 2026
@@ -105,7 +105,11 @@ steps:
   device: h100
   num_devices: 1
   source_file_dependencies:
+  - cmake/external_projects/deepgemm.cmake  # per-interpreter _C build rules
   - tools/install_deepgemm.sh
+  - tools/build_deepgemm_C.py  # compile driver invoked from deepgemm.cmake
+  - tools/setup_deepgemm_pythons.sh  # provisions the target interpreters
+  - tools/check_wheel_deepgemm.py  # wheel-content check run in commands below
   - vllm/utils/deep_gemm.py
   - vllm/model_executor/layers/fused_moe
   - vllm/model_executor/layers/quantization
@@ -115,6 +119,7 @@ steps:
   - tests/kernels/attention/test_deepgemm_attention.py
   - tests/quantization/test_cutlass_w4a16.py
   commands:
+    - python3 ../tools/check_wheel_deepgemm.py  # exits 1 if a required per-Python _C .so is missing
     - pytest -v -s kernels/quantization/test_block_fp8.py
     - pytest -v -s kernels/moe/test_deepgemm.py
     - pytest -v -s kernels/moe/test_batched_deepgemm.py

@@ -53,49 +53,79 @@ cuda_archs_loose_intersection(DEEPGEMM_ARCHS
 if(DEEPGEMM_ARCHS)
   message(STATUS "DeepGEMM CUDA architectures: ${DEEPGEMM_ARCHS}")
 
-  find_package(CUDAToolkit REQUIRED)
-
-  #
-  # Build the _C pybind11 extension from DeepGEMM's C++ source.
-  # This is a CXX-only module — CUDA kernels are JIT-compiled at runtime.
-  #
-  Python_add_library(_deep_gemm_C MODULE WITH_SOABI
-    "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp")
-
-  # The pybind11 module name must be _C to match DeepGEMM's Python imports.
-  set_target_properties(_deep_gemm_C PROPERTIES OUTPUT_NAME "_C")
-
-  target_compile_definitions(_deep_gemm_C PRIVATE
-    "-DTORCH_EXTENSION_NAME=_C")
-
-  target_include_directories(_deep_gemm_C PRIVATE
-    "${deepgemm_SOURCE_DIR}/csrc"
-    "${deepgemm_SOURCE_DIR}/deep_gemm/include"
-    "${deepgemm_SOURCE_DIR}/third-party/cutlass/include"
-    "${deepgemm_SOURCE_DIR}/third-party/cutlass/tools/util/include"
-    "${deepgemm_SOURCE_DIR}/third-party/fmt/include")
-
-  target_compile_options(_deep_gemm_C PRIVATE
-    $<$<COMPILE_LANGUAGE:CXX>:-std=c++17>
-    $<$<COMPILE_LANGUAGE:CXX>:-O3>
-    $<$<COMPILE_LANGUAGE:CXX>:-Wno-psabi>
-    $<$<COMPILE_LANGUAGE:CXX>:-Wno-deprecated-declarations>)
-
-  # torch_python is required because DeepGEMM uses pybind11 type casters
-  # for at::Tensor (via PYBIND11_MODULE), unlike vLLM's own extensions which
-  # use torch::Library custom ops.
-  find_library(TORCH_PYTHON_LIBRARY torch_python
-    PATHS "${TORCH_INSTALL_PREFIX}/lib"
-    REQUIRED)
-
-  target_link_libraries(_deep_gemm_C PRIVATE
-    torch ${TORCH_LIBRARIES} "${TORCH_PYTHON_LIBRARY}"
-    CUDA::cudart CUDA::nvrtc)
-
-  # Install the shared library into the vendored package directory
-  install(TARGETS _deep_gemm_C
-    LIBRARY DESTINATION vllm/third_party/deep_gemm
-    COMPONENT _deep_gemm_C)
+  # Build _C once per interpreter in DEEPGEMM_PYTHON_INTERPRETERS (":"-
+  # separated paths) so the wheel imports cleanly on every supported Python.
+  # Unset → fall back to the build interpreter (editable / source builds).
+  # The compile is delegated to tools/build_deepgemm_C.py and always runs
+  # against the build interpreter's torch — target Pythons don't need torch.
+  # Note: empty-but-set env vars are still DEFINED in cmake; treat empty as
+  # unset so an empty interpreter list falls back to the build interpreter
+  # rather than silently skipping the per-Python build. Empty entries from
+  # stray separators ("a::b", a trailing ":", or a bare ":") are dropped
+  # too, so they never reach execute_process as an empty command.
+  set(_dg_pythons "")
+  if(NOT "$ENV{DEEPGEMM_PYTHON_INTERPRETERS}" STREQUAL "")
+    string(REPLACE ":" ";" _dg_pythons "$ENV{DEEPGEMM_PYTHON_INTERPRETERS}")
+    list(REMOVE_ITEM _dg_pythons "")
+  endif()
+  if("${_dg_pythons}" STREQUAL "")
+    set(_dg_pythons "${Python_EXECUTABLE}")
+  endif()
+  message(STATUS "DeepGEMM _C will be built for: ${_dg_pythons}")
+
+  # Header set fed to add_custom_command's DEPENDS so a header-only edit
+  # under DeepGEMM's csrc/ or deep_gemm/include/ re-triggers the rebuild.
+  # add_custom_command does no implicit header scanning, unlike add_library.
+  # The vendored third-party/ trees (cutlass, fmt) are not globbed here, so
+  # edits there do not re-trigger it.
+  file(GLOB_RECURSE _dg_headers
+    "${deepgemm_SOURCE_DIR}/csrc/*.h"
+    "${deepgemm_SOURCE_DIR}/csrc/*.hpp"
+    "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.h"
+    "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.hpp"
+    "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.cuh")
+
+  set(_dg_markers)
+  set(_dg_seen_soabis)
+  foreach(_pybin IN LISTS _dg_pythons)
+    execute_process(
+      COMMAND "${_pybin}" -c
+        "import sysconfig; print(sysconfig.get_config_var('SOABI'))"
+      OUTPUT_VARIABLE _dg_soabi
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+      COMMAND_ERROR_IS_FATAL ANY)
+    # The probe prints the literal "None" when SOABI is not defined; that
+    # would otherwise become a bogus dedup key and build directory name.
+    if("${_dg_soabi}" STREQUAL "" OR "${_dg_soabi}" STREQUAL "None")
+      message(FATAL_ERROR
+        "DeepGEMM: could not determine SOABI for interpreter '${_pybin}'")
+    endif()
+    # Dedup so duplicate paths (or two paths resolving to the same CPython)
+    # don't register conflicting build rules.
+    if(_dg_soabi IN_LIST _dg_seen_soabis)
+      continue()
+    endif()
+    list(APPEND _dg_seen_soabis "${_dg_soabi}")
+    set(_dg_dir "${CMAKE_CURRENT_BINARY_DIR}/deepgemm_C_${_dg_soabi}")
+    set(_dg_marker "${_dg_dir}/.built")
+    add_custom_command(
+      OUTPUT "${_dg_marker}"
+      COMMAND "${Python_EXECUTABLE}"
+              "${CMAKE_SOURCE_DIR}/tools/build_deepgemm_C.py"
+              "${deepgemm_SOURCE_DIR}" "${_dg_dir}" "${_pybin}"
+      COMMAND "${CMAKE_COMMAND}" -E touch "${_dg_marker}"
+      DEPENDS "${CMAKE_SOURCE_DIR}/tools/build_deepgemm_C.py"
+              "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp"
+              ${_dg_headers}
+      COMMENT "Building DeepGEMM _C for ${_pybin}"
+      VERBATIM)
+    list(APPEND _dg_markers "${_dg_marker}")
+    install(DIRECTORY "${_dg_dir}/"
+      DESTINATION vllm/third_party/deep_gemm
+      COMPONENT _deep_gemm_C
+      FILES_MATCHING PATTERN "_C.cpython-*.so")
+  endforeach()
+  add_custom_target(_deep_gemm_C ALL DEPENDS ${_dg_markers})
 
   #
   # Vendor DeepGEMM Python package files

@@ -301,6 +301,15 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         python3 use_existing_torch.py --prefix; \
     fi
 
+# Provision a bare interpreter for each CPython covered by `requires-python`
+# so DeepGEMM `_C` is built once per Python and bundled side-by-side in the
+# wheel; cmake reads DEEPGEMM_PYTHON_INTERPRETERS in deepgemm.cmake's
+# foreach loop. The matrix is derived from pyproject.toml.
+COPY tools/setup_deepgemm_pythons.sh tools/build_deepgemm_C.py tools/
+ENV DEEPGEMM_VENV_PREFIX=/opt/dgenv
+RUN --mount=type=cache,target=/root/.cache/uv \
+    tools/setup_deepgemm_pythons.sh > /tmp/dg_pythons.txt
+
 # Build the vLLM wheel
 # if USE_SCCACHE is set, use sccache to speed up compilation
 # AWS credentials mounted at ~/.aws/credentials for sccache S3 auth (optional)
@@ -328,6 +337,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         && export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" \
         && export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
         && export VLLM_DOCKER_BUILD_CONTEXT=1 \
+        && export DEEPGEMM_PYTHON_INTERPRETERS="$(cat /tmp/dg_pythons.txt)" \
         && sccache --show-stats \
         && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
         && sccache --show-stats; \
@@ -345,6 +355,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
         export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" && \
         export VLLM_DOCKER_BUILD_CONTEXT=1 && \
+        export DEEPGEMM_PYTHON_INTERPRETERS="$(cat /tmp/dg_pythons.txt)" && \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 

diff --git a/tools/build_deepgemm_C.py b/tools/build_deepgemm_C.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Build DeepGEMM's `_C` pybind11 extension for a target Python.
+
+Driven from `cmake/external_projects/deepgemm.cmake`. The driver is the
+build interpreter (which has torch); the *target* Python is only used for
+its header path and SOABI. This avoids needing torch installed in N venvs
+to produce N matching `.so` files.
+
+Usage: python build_deepgemm_C.py <DEEPGEMM_SRC_DIR> <OUTPUT_DIR> <TARGET_PY>
+"""
+
+import json
+import os
+import shlex
+import subprocess
+import sys
+from pathlib import Path
+
+import torch
+from torch.utils import cpp_extension
+
+if len(sys.argv) != 4:
+    sys.exit(f"usage: {sys.argv[0]} <SRC> <OUT> <TARGET_PY>")
+
+src = Path(sys.argv[1]).resolve()
+out = Path(sys.argv[2]).resolve()
+target_py = sys.argv[3]
+out.mkdir(parents=True, exist_ok=True)
+
+# Ask the target interpreter (not this one) for its extension suffix and
+# header directory; both are specific to that interpreter.
+info = json.loads(
+    subprocess.check_output(
+        [
+            target_py,
+            "-c",
+            "import sysconfig, json; "
+            "print(json.dumps({k: sysconfig.get_config_var(k) "
+            "for k in ('EXT_SUFFIX', 'INCLUDEPY')}))",
+        ]
+    ).decode()
+)
+
+# sysconfig reports None for unset variables. Check both values here so a
+# target without usable headers fails with a clear message instead of a
+# compiler error on "-INone" or a missing Python.h.
+ext_suffix = info.get("EXT_SUFFIX")
+include_py = info.get("INCLUDEPY")
+if not ext_suffix:
+    sys.exit(f"{target_py}: sysconfig reports no EXT_SUFFIX")
+if not include_py or not (Path(include_py) / "Python.h").is_file():
+    sys.exit(
+        f"{target_py}: Python.h not found (INCLUDEPY={include_py!r}); "
+        "the target interpreter needs its development headers"
+    )
+
+cuda_home = cpp_extension.CUDA_HOME
+if cuda_home is None:
+    sys.exit("CUDA_HOME not found; cannot build DeepGEMM _C")
+# CCCL lives outside the standard CUDAToolkit search, mirroring DeepGEMM's
+# own setup.py.
+includes = [
+    include_py,
+    f"{cuda_home}/include",
+    f"{cuda_home}/include/cccl",
+    str(src / "csrc"),
+    str(src / "deep_gemm/include"),
+    str(src / "third-party/cutlass/include"),
+    str(src / "third-party/cutlass/tools/util/include"),
+    str(src / "third-party/fmt/include"),
+    *cpp_extension.include_paths(device_type="cuda"),
+]
+
+cmd = [
+    os.environ.get("CXX", "g++"),
+    "-shared",
+    "-fPIC",
+    "-std=c++17",
+    "-O3",
+    "-g0",
+    "-Wno-psabi",
+    "-Wno-deprecated-declarations",
+    "-DTORCH_API_INCLUDE_EXTENSION_H",
+    "-DTORCH_EXTENSION_NAME=_C",
+    f"-D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}",
+    *(f"-I{p}" for p in includes),
+    str(src / "csrc/python_api.cpp"),
+    *(f"-L{p}" for p in cpp_extension.library_paths(device_type="cuda")),
+    f"-L{cuda_home}/lib64",
+    "-ltorch",
+    "-ltorch_python",
+    "-ltorch_cpu",
+    "-ltorch_cuda",
+    "-lc10",
+    "-lc10_cuda",
+    "-lcudart",
+    "-lnvrtc",
+    "-o",
+    str(out / f"_C{ext_suffix}"),
+]
+# shlex.join keeps the logged command copy-pasteable when paths need quoting.
+print("[build_deepgemm_C] " + shlex.join(cmd), flush=True)
+subprocess.check_call(cmd)
diff --git a/tools/check_wheel_deepgemm.py b/tools/check_wheel_deepgemm.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Assert the installed vLLM has a `_C.cpython-X.Y-*.so` for every CPython
+covered by `requires-python`. Fails closed if a Python's `.so` is missing
+from the wheel — i.e. the regression that surfaced in #41476/#41512.
+
+Run from a CI test job after vLLM is installed, e.g. the H100 deepgemm
+kernel tests in .buildkite/test_areas/kernels.yaml.
+"""
+
+import importlib.util
+import os
+import sys
+from pathlib import Path
+
+import regex as re
+
+try:
+    import tomllib
+except ModuleNotFoundError:  # tomllib is stdlib only from Python 3.11
+    tomllib = None  # type: ignore[assignment]
+
+SO_RE = re.compile(r"^_C\.cpython-(\d)(\d+)-")
+
+
+def required_pythons() -> list[str]:
+    """Return every "3.X" covered by `requires-python` in pyproject.toml.
+
+    Only the `>=3.X,<3.Y` form is understood; any other spec exits the script.
+    """
+    pyproject = Path(__file__).resolve().parent.parent / "pyproject.toml"
+    text = pyproject.read_text()
+    if tomllib is not None:
+        spec = tomllib.loads(text)["project"]["requires-python"]
+    else:
+        # No tomllib on this interpreter: pull the single line we need with a
+        # regex instead of failing at import time.
+        hit = re.search(r'^requires-python\s*=\s*"([^"]*)"', text, re.MULTILINE)
+        if not hit:
+            sys.exit(f"requires-python not found in {pyproject}")
+        spec = hit[1]
+    m = re.match(r">=3\.(\d+),<3\.(\d+)", spec)
+    if not m:
+        sys.exit(f"unexpected requires-python format: {spec!r}")
+    return [f"3.{v}" for v in range(int(m[1]), int(m[2]))]
+
+
+spec = importlib.util.find_spec("vllm.third_party.deep_gemm")
+if spec is None or spec.origin is None:
+    sys.exit("vllm.third_party.deep_gemm not importable; is vllm installed?")
+pkg_dir = Path(spec.origin).parent
+
+found = {f"{m[1]}.{m[2]}" for f in os.listdir(pkg_dir) if (m := SO_RE.match(f))}
+required = required_pythons()
+missing = [v for v in required if v not in found]
+print(f"deepgemm _C: found {sorted(found)}, required {required}, missing {missing}")
+sys.exit(1 if missing else 0)
diff --git a/tools/setup_deepgemm_pythons.sh b/tools/setup_deepgemm_pythons.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# Provision bare Python interpreters for the DeepGEMM `_C` per-Python build
+# and print a colon-separated list of their paths to stdout.
+#
+# Each target Python only needs a working interpreter — torch is not
+# installed since `tools/build_deepgemm_C.py` runs from the build interpreter.
+# Interpreters always come from uv-managed downloads (`--python-preference
+# only-managed` below); system Pythons are not reused.
+#
+# Usage:
+#   export DEEPGEMM_PYTHON_INTERPRETERS=$(tools/setup_deepgemm_pythons.sh)
+#   python setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
+#
+# With no args, expands to every CPython covered by `requires-python` in
+# pyproject.toml. Pass explicit versions (e.g. `3.10 3.11`) to override.
+#
+# Skip this script if you don't have uv: set DEEPGEMM_PYTHON_INTERPRETERS
+# directly to existing interpreter paths. Editable / single-Python builds
+# don't need the env var at all (cmake falls back to the build interpreter).
+#
+# Optional: DEEPGEMM_VENV_PREFIX (default: /tmp/dgenv).
+set -euo pipefail
+
+if [ "$#" -eq 0 ]; then
+  # Derive the matrix from `requires-python = ">=3.X,<3.Y"` in pyproject.toml.
+  pyproject="$(dirname "$0")/../pyproject.toml"
+  # `|| true`: under errexit + pipefail a non-matching grep would end the
+  # script with no message; report the problem explicitly instead.
+  spec=$(grep -E '^requires-python' "$pyproject" \
+         | grep -oE '>=3\.[0-9]+,<3\.[0-9]+' || true)
+  if [ -z "$spec" ]; then
+    echo "error: no 'requires-python = \">=3.X,<3.Y\"' found in $pyproject" >&2
+    exit 1
+  fi
+  lo=${spec#>=3.}; lo=${lo%%,*}
+  hi=${spec##*<3.}
+  # An empty range would print an empty list, which cmake treats as unset.
+  if [ "$lo" -ge "$hi" ]; then
+    echo "error: requires-python covers no Python version: $spec" >&2
+    exit 1
+  fi
+  set -- $(seq "$lo" $((hi - 1)) | sed 's/^/3./')
+fi
+
+prefix="${DEEPGEMM_VENV_PREFIX:-/tmp/dgenv}"
+mkdir -p "$prefix"
+
+paths=""
+for V in "$@"; do
+  venv="$prefix/$V"
+  # Force a managed (uv-downloaded) Python so dev headers are bundled.
+  # System Pythons on the build base may lack headers (manylinux's
+  # /opt/python/cpXY-cpXY are off PATH; an apt-installed python3.X often
+  # has no -dev), and the per-Python build needs Python.h.
+  [ -x "$venv/bin/python" ] || \
+    uv venv --python "$V" "$venv" --python-preference only-managed --seed \
+      >/dev/null
+  paths="$paths:$venv/bin/python"
+done
+echo "${paths#:}"