Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .buildkite/test_areas/kernels.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,11 @@ steps:
device: h100
num_devices: 1
source_file_dependencies:
- cmake/external_projects/deepgemm.cmake
- tools/install_deepgemm.sh
- tools/build_deepgemm_C.py
- tools/setup_deepgemm_pythons.sh
- tools/check_wheel_deepgemm.py
- vllm/utils/deep_gemm.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization
Expand All @@ -115,6 +119,7 @@ steps:
- tests/kernels/attention/test_deepgemm_attention.py
- tests/quantization/test_cutlass_w4a16.py
commands:
- python3 ../tools/check_wheel_deepgemm.py
- pytest -v -s kernels/quantization/test_block_fp8.py
- pytest -v -s kernels/moe/test_deepgemm.py
- pytest -v -s kernels/moe/test_batched_deepgemm.py
Expand Down
104 changes: 61 additions & 43 deletions cmake/external_projects/deepgemm.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -53,49 +53,67 @@ cuda_archs_loose_intersection(DEEPGEMM_ARCHS
if(DEEPGEMM_ARCHS)
message(STATUS "DeepGEMM CUDA architectures: ${DEEPGEMM_ARCHS}")

find_package(CUDAToolkit REQUIRED)

# DeepGEMM binds at::Tensor through pybind11 type casters (PYBIND11_MODULE),
# so it has to link torch_python; vLLM's own extensions go through
# torch::Library custom ops and do not need it.
find_library(TORCH_PYTHON_LIBRARY torch_python
  PATHS "${TORCH_INSTALL_PREFIX}/lib"
  REQUIRED)

#
# _C: the pybind11 extension compiled from DeepGEMM's C++ sources.
# Only CXX is compiled here; the CUDA kernels are JIT-compiled at runtime.
#
Python_add_library(_deep_gemm_C MODULE WITH_SOABI
  "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp")

# DeepGEMM's Python code imports the module as `_C`, so both the file name
# and the extension name compiled into it must be `_C`.
set_target_properties(_deep_gemm_C PROPERTIES OUTPUT_NAME "_C")
target_compile_definitions(_deep_gemm_C PRIVATE
  "-DTORCH_EXTENSION_NAME=_C")

target_compile_options(_deep_gemm_C PRIVATE
  $<$<COMPILE_LANGUAGE:CXX>:-std=c++17>
  $<$<COMPILE_LANGUAGE:CXX>:-O3>
  $<$<COMPILE_LANGUAGE:CXX>:-Wno-psabi>
  $<$<COMPILE_LANGUAGE:CXX>:-Wno-deprecated-declarations>)

# Headers: DeepGEMM's own trees plus its vendored cutlass and fmt.
target_include_directories(_deep_gemm_C PRIVATE
  "${deepgemm_SOURCE_DIR}/csrc"
  "${deepgemm_SOURCE_DIR}/deep_gemm/include"
  "${deepgemm_SOURCE_DIR}/third-party/cutlass/include"
  "${deepgemm_SOURCE_DIR}/third-party/cutlass/tools/util/include"
  "${deepgemm_SOURCE_DIR}/third-party/fmt/include")

target_link_libraries(_deep_gemm_C PRIVATE
  torch ${TORCH_LIBRARIES} "${TORCH_PYTHON_LIBRARY}"
  CUDA::cudart CUDA::nvrtc)

# Ship the built module inside the vendored deep_gemm package directory.
install(TARGETS _deep_gemm_C
  LIBRARY DESTINATION vllm/third_party/deep_gemm
  COMPONENT _deep_gemm_C)
# Build _C once per interpreter in DEEPGEMM_PYTHON_INTERPRETERS (":"-
# separated paths) so the wheel imports cleanly on every supported Python.
# Unset → fall back to the build interpreter (editable / source builds).
# The compile is delegated to tools/build_deepgemm_C.py and always runs
# against the build interpreter's torch — target Pythons don't need torch.
# Note: empty-but-set env vars are still DEFINED in cmake; treat empty as
# unset so an empty interpreter list falls back to the build interpreter
# rather than silently skipping the per-Python build.
set(_dg_pythons "")
if(NOT "$ENV{DEEPGEMM_PYTHON_INTERPRETERS}" STREQUAL "")
  string(REPLACE ":" ";" _dg_pythons "$ENV{DEEPGEMM_PYTHON_INTERPRETERS}")
  # A leading, trailing or doubled ":" leaves an empty list element behind;
  # drop those so no empty string is later treated as an interpreter path.
  list(REMOVE_ITEM _dg_pythons "")
endif()
# Nothing usable (unset, empty, or only separators): use the build interpreter.
if("${_dg_pythons}" STREQUAL "")
  set(_dg_pythons "${Python_EXECUTABLE}")
endif()
message(STATUS "DeepGEMM _C will be built for: ${_dg_pythons}")

# Header set fed to add_custom_command's DEPENDS so a header-only edit in
# DeepGEMM's own csrc/ or deep_gemm/include/ trees re-triggers the rebuild.
# add_custom_command does no implicit header scanning, unlike add_library.
# The vendored third-party/cutlass and third-party/fmt headers are not
# globbed here, so edits to those trees do NOT re-trigger the rebuild.
# CONFIGURE_DEPENDS makes the build re-check the glob, so headers added or
# removed after the initial configure are picked up without a manual
# re-configure.
file(GLOB_RECURSE _dg_headers CONFIGURE_DEPENDS
  "${deepgemm_SOURCE_DIR}/csrc/*.h"
  "${deepgemm_SOURCE_DIR}/csrc/*.hpp"
  "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.h"
  "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.hpp"
  "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.cuh")

# Accumulators for the loop below: the stamp file of every registered build
# rule, and the SOABI tags that already have a rule.
set(_dg_markers)
set(_dg_seen_soabis)
foreach(_pybin IN LISTS _dg_pythons)
# Ask the target interpreter for its SOABI tag at configure time. A failing
# interpreter aborts configuration (COMMAND_ERROR_IS_FATAL ANY).
execute_process(
COMMAND "${_pybin}" -c
"import sysconfig; print(sysconfig.get_config_var('SOABI'))"
OUTPUT_VARIABLE _dg_soabi
OUTPUT_STRIP_TRAILING_WHITESPACE
COMMAND_ERROR_IS_FATAL ANY)
# Dedup so duplicate paths (or two paths resolving to the same CPython)
# don't register conflicting build rules.
if(_dg_soabi IN_LIST _dg_seen_soabis)
continue()
endif()
list(APPEND _dg_seen_soabis "${_dg_soabi}")
# One output directory per SOABI under the current binary dir; `.built` is a
# stamp file touched only after the build command has succeeded.
set(_dg_dir "${CMAKE_CURRENT_BINARY_DIR}/deepgemm_C_${_dg_soabi}")
set(_dg_marker "${_dg_dir}/.built")
# The driver script runs under the build interpreter (Python_EXECUTABLE); the
# target interpreter is only passed to it as the third argument.
add_custom_command(
OUTPUT "${_dg_marker}"
COMMAND "${Python_EXECUTABLE}"
"${CMAKE_SOURCE_DIR}/tools/build_deepgemm_C.py"
"${deepgemm_SOURCE_DIR}" "${_dg_dir}" "${_pybin}"
COMMAND "${CMAKE_COMMAND}" -E touch "${_dg_marker}"
DEPENDS "${CMAKE_SOURCE_DIR}/tools/build_deepgemm_C.py"
"${deepgemm_SOURCE_DIR}/csrc/python_api.cpp"
${_dg_headers}
COMMENT "Building DeepGEMM _C for ${_pybin}"
VERBATIM)
Comment on lines +99 to +109
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 The new per-Python add_custom_command lists only tools/build_deepgemm_C.py and csrc/python_api.cpp in DEPENDS, but the compile pulls in headers from csrc/, deep_gemm/include/, third-party/cutlass/, and third-party/fmt/. Unlike the prior Python_add_library (which got compiler-driven depfiles), add_custom_command does no implicit header scanning, so a header-only change leaves the cached _C.so untouched on incremental rebuilds. Nit — narrow trigger window (CI does clean Docker builds, DeepGEMM is FetchContent-pinned), but it is a real regression vs. the prior behavior; consider extending DEPENDS to include header globs (note IMPLICIT_DEPENDS CXX is Make-generator-only, so it's a no-op under Ninja).

Extended reasoning...

What the bug is. The per-Python build rule in cmake/external_projects/deepgemm.cmake (lines 88–97) registers:

add_custom_command(
  OUTPUT "${_dg_marker}"
  COMMAND "${Python_EXECUTABLE}" "${CMAKE_SOURCE_DIR}/tools/build_deepgemm_C.py" ...
  COMMAND "${CMAKE_COMMAND}" -E touch "${_dg_marker}"
  DEPENDS "${CMAKE_SOURCE_DIR}/tools/build_deepgemm_C.py"
          "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp"
  ...)

DEPENDS only names the build driver and the single .cpp translation unit. But tools/build_deepgemm_C.py invokes g++ with -I paths into ${deepgemm_SOURCE_DIR}/csrc, deep_gemm/include, third-party/cutlass/include, third-party/cutlass/tools/util/include, and third-party/fmt/include. None of those headers are tracked.

Why this is a regression. The pre-PR code used Python_add_library(_deep_gemm_C MODULE WITH_SOABI ...). That is a real cmake target, and cmake feeds the compile rule through the generator (-MMD -MF depfiles consumed by Ninja/Make), so any #included header transitively invalidated the .so. add_custom_command does no implicit header scanning — only the literal DEPENDS list is consulted. So the new code has strictly weaker dependency tracking than the old code.

Step-by-step proof of the silent-stale-binary case.

  1. A developer sets DEEPGEMM_SRC_DIR=/path/to/local/DeepGEMM and runs pip install -e . once. cmake builds the marker ${_dg_dir}/.built and the corresponding _C.cpython-*.so.
  2. Developer edits /path/to/local/DeepGEMM/deep_gemm/include/deep_gemm/jit_kernels/foo.hpp (or any of the cutlass/fmt headers) to change a function signature consumed by csrc/python_api.cpp.
  3. They rerun the build. cmake re-evaluates the custom command and checks the mtime of every entry in DEPENDS. Only tools/build_deepgemm_C.py and csrc/python_api.cpp are listed — both unchanged — so .built is considered up to date and the rule does not fire.
  4. The previously-built _C.so is reused as-is. It links against the old header signatures while the rest of the (newly-recompiled) project now depends on the new ones, producing silent symbol/ABI drift at runtime instead of an honest rebuild.

A second triggering scenario: a future GIT_TAG bump in deepgemm.cmake lands a header-only change in upstream DeepGEMM (no churn to csrc/python_api.cpp). After FetchContent checks out the new tree, the same logic skips the rebuild on incremental cmake invocations.

Why existing code does not prevent this. The depfile-driven implicit tracking only exists for add_library/add_executable targets. Nothing else fills the gap here: the _dg_marker touchstamp is purely a function of DEPENDS mtimes, and the install rule is keyed off the marker.

Addressing the refutation. The refutation is right that the practical trigger window is narrow: CI does clean Docker builds (so it never hits this path), DeepGEMM is FetchContent-pinned, and most users install prebuilt wheels. That is exactly why this is filed as a nit rather than blocking. But the refutation overstates the case by saying the issue "almost never occurs" — the DEEPGEMM_SRC_DIR workflow is documented in setup_deepgemm_pythons.sh and is the obvious iteration loop for anyone debugging DeepGEMM integration locally. The "rm -rf build && rebuild" workaround is correct but only helps people who already know their .so is stale, which is the root foot-gun.

Suggested fix. Two reasonable options, both small:

  • Extend DEPENDS to include header globs:
    file(GLOB_RECURSE _dg_headers
      "${deepgemm_SOURCE_DIR}/csrc/*.h"
      "${deepgemm_SOURCE_DIR}/csrc/*.hpp"
      "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.hpp"
      "${deepgemm_SOURCE_DIR}/deep_gemm/include/*.cuh")
    add_custom_command(... DEPENDS ${_dg_headers} ...)
    (Optionally add cutlass/fmt globs, but those are vendored upstream and rarely modified.)
  • Or have tools/build_deepgemm_C.py emit a .d depfile via -MD -MF and pass it back via DEPFILE (Ninja-only, but vLLM uses Ninja).

Note: the original bug suggested IMPLICIT_DEPENDS CXX, which works only with the Makefile generator. vLLM commonly uses Ninja, so that fix would be a no-op there — explicit globs (or a DEPFILE) are the robust path.

# Record this interpreter's stamp so the aggregate target below depends on it.
list(APPEND _dg_markers "${_dg_marker}")
# Install only the files matching `_C.cpython-*.so` from this interpreter's
# output directory into the vendored deep_gemm package.
install(DIRECTORY "${_dg_dir}/"
DESTINATION vllm/third_party/deep_gemm
COMPONENT _deep_gemm_C
FILES_MATCHING PATTERN "_C.cpython-*.so")
endforeach()
# Part of the default build (ALL); depends on every per-interpreter stamp.
add_custom_target(_deep_gemm_C ALL DEPENDS ${_dg_markers})

#
# Vendor DeepGEMM Python package files
Expand Down
11 changes: 11 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,15 @@ RUN --mount=type=cache,target=/root/.cache/uv \
python3 use_existing_torch.py --prefix; \
fi

# Provision a bare interpreter for each CPython covered by `requires-python`
# so DeepGEMM `_C` is built once per Python and bundled side-by-side in the
# wheel; cmake reads DEEPGEMM_PYTHON_INTERPRETERS in deepgemm.cmake's
# foreach loop. The matrix is derived from pyproject.toml.
COPY tools/setup_deepgemm_pythons.sh tools/build_deepgemm_C.py tools/
# Directory under which the provisioning script creates one venv per version
# (the script reads DEEPGEMM_VENV_PREFIX).
ENV DEEPGEMM_VENV_PREFIX=/opt/dgenv
# Save the interpreter list the script prints on stdout; the wheel-build
# steps read it back from /tmp/dg_pythons.txt.
RUN --mount=type=cache,target=/root/.cache/uv \
tools/setup_deepgemm_pythons.sh > /tmp/dg_pythons.txt

# Build the vLLM wheel
# if USE_SCCACHE is set, use sccache to speed up compilation
# AWS credentials mounted at ~/.aws/credentials for sccache S3 auth (optional)
Expand Down Expand Up @@ -328,6 +337,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&& export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" \
&& export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
&& export DEEPGEMM_PYTHON_INTERPRETERS=$(cat /tmp/dg_pythons.txt) \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
Expand All @@ -345,6 +355,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" && \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
export DEEPGEMM_PYTHON_INTERPRETERS=$(cat /tmp/dg_pythons.txt) && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi

Expand Down
87 changes: 87 additions & 0 deletions tools/build_deepgemm_C.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Build DeepGEMM's `_C` pybind11 extension for a target Python.

Driven from `cmake/external_projects/deepgemm.cmake`. The driver is the
build interpreter (which has torch); the *target* Python is only used for
its header path and SOABI. This avoids needing torch installed in N venvs
to produce N matching `.so` files.

Usage: python build_deepgemm_C.py <DEEPGEMM_SRC_DIR> <OUTPUT_DIR> <TARGET_PY>
"""

import json
import os
import subprocess
import sys
Comment thread
mgoin marked this conversation as resolved.
from pathlib import Path

import torch
from torch.utils import cpp_extension

# Exactly three positional arguments are required: the DeepGEMM source dir,
# the output dir, and the target interpreter.
if len(sys.argv) != 4:
    sys.exit(f"usage: {sys.argv[0]} <SRC> <OUT> <TARGET_PY>")

src = Path(sys.argv[1]).resolve()
out = Path(sys.argv[2]).resolve()
target_py = sys.argv[3]
out.mkdir(parents=True, exist_ok=True)

# Ask the *target* interpreter (not the one running this script) for the
# extension-module file suffix and the directory holding its Python headers.
info = json.loads(
    subprocess.check_output(
        [
            target_py,
            "-c",
            "import sysconfig, json; "
            "print(json.dumps({k: sysconfig.get_config_var(k) "
            "for k in ('EXT_SUFFIX', 'INCLUDEPY')}))",
        ]
    ).decode()
)
# sysconfig.get_config_var() yields None for a variable the interpreter does
# not define. Stop here with a clear message instead of letting "None" leak
# into an include flag or the output file name.
_missing_vars = [k for k in ("EXT_SUFFIX", "INCLUDEPY") if not info.get(k)]
if _missing_vars:
    sys.exit(
        f"{target_py}: sysconfig did not report {', '.join(_missing_vars)}; "
        "cannot build DeepGEMM _C"
    )

cuda_home = cpp_extension.CUDA_HOME
Comment thread
mgoin marked this conversation as resolved.
if cuda_home is None:
    sys.exit("CUDA_HOME not found; cannot build DeepGEMM _C")

# Header search path, in order: target Python, CUDA toolkit (plus CCCL, which
# lives outside the standard CUDAToolkit search, mirroring DeepGEMM's own
# setup.py), DeepGEMM and its vendored third-party trees, then torch.
deepgemm_include_dirs = [
    src / rel
    for rel in (
        "csrc",
        "deep_gemm/include",
        "third-party/cutlass/include",
        "third-party/cutlass/tools/util/include",
        "third-party/fmt/include",
    )
]
include_dirs = [
    info["INCLUDEPY"],
    f"{cuda_home}/include",
    f"{cuda_home}/include/cccl",
    *map(str, deepgemm_include_dirs),
    *cpp_extension.include_paths(device_type="cuda"),
]

compiler = os.environ.get("CXX", "g++")
base_flags = [
    "-shared",
    "-fPIC",
    "-std=c++17",
    "-O3",
    "-g0",
    "-Wno-psabi",
    "-Wno-deprecated-declarations",
]
# The C++11 ABI setting is taken from the torch build that drives this script.
defines = [
    "-DTORCH_API_INCLUDE_EXTENSION_H",
    "-DTORCH_EXTENSION_NAME=_C",
    f"-D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}",
]
include_flags = [f"-I{p}" for p in include_dirs]
lib_dir_flags = [
    f"-L{p}" for p in cpp_extension.library_paths(device_type="cuda")
]
lib_dir_flags.append(f"-L{cuda_home}/lib64")
link_flags = [
    f"-l{name}"
    for name in (
        "torch",
        "torch_python",
        "torch_cpu",
        "torch_cuda",
        "c10",
        "c10_cuda",
        "cudart",
        "nvrtc",
    )
]
# The file name carries the target interpreter's extension suffix.
output_path = out / f"_C{info['EXT_SUFFIX']}"

compile_cmd = [
    compiler,
    *base_flags,
    *defines,
    *include_flags,
    str(src / "csrc/python_api.cpp"),
    *lib_dir_flags,
    *link_flags,
    "-o",
    str(output_path),
]
# Echo the full command before running it; check_call raises on failure.
print("[build_deepgemm_C] " + " ".join(compile_cmd), flush=True)
subprocess.check_call(compile_cmd)
41 changes: 41 additions & 0 deletions tools/check_wheel_deepgemm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""Assert the installed vLLM has a `_C.cpython-X.Y-*.so` for every CPython
covered by `requires-python`. Fails closed if a Python's `.so` is missing
from the wheel — i.e. the regression that surfaced in #41476/#41512.

Run from a CI test job after vLLM is installed, e.g. the H100 deepgemm
kernel tests in .buildkite/test_areas/kernels.yaml.
"""

import importlib.util
import os
import sys
from pathlib import Path

import regex as re
Comment thread
mgoin marked this conversation as resolved.
import tomllib

# Matches file names that start with `_C.cpython-<digit><digits>-`. Group 1
# is the single leading digit and group 2 the remaining digits, so a name
# starting `_C.cpython-312-` yields ("3", "12").
SO_RE = re.compile(r"^_C\.cpython-(\d)(\d+)-")


def required_pythons() -> list[str]:
pyproject = Path(__file__).resolve().parent.parent / "pyproject.toml"
spec = tomllib.loads(pyproject.read_text())["project"]["requires-python"]
m = re.match(r">=3\.(\d+),<3\.(\d+)", spec)
if not m:
sys.exit(f"unexpected requires-python format: {spec!r}")
return [f"3.{v}" for v in range(int(m[1]), int(m[2]))]


# Locate the installed vendored package. For a dotted name, find_spec()
# imports the parent packages first and raises ModuleNotFoundError when one
# is missing, so treat that the same as "not found" and fail with the same
# message.
try:
    spec = importlib.util.find_spec("vllm.third_party.deep_gemm")
except ModuleNotFoundError:
    spec = None
if spec is None or spec.origin is None:
    sys.exit("vllm.third_party.deep_gemm not importable; is vllm installed?")
pkg_dir = Path(spec.origin).parent

# Collect the "X.Y" versions that have a matching `_C` shared object in the
# package directory, then compare with what pyproject.toml requires.
found = {f"{m[1]}.{m[2]}" for f in os.listdir(pkg_dir) if (m := SO_RE.match(f))}
required = required_pythons()
missing = [v for v in required if v not in found]
print(f"deepgemm _C: found {sorted(found)}, required {required}, missing {missing}")
# Non-zero exit status when any required version has no shared object.
sys.exit(1 if missing else 0)
49 changes: 49 additions & 0 deletions tools/setup_deepgemm_pythons.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Provision bare Python interpreters for the DeepGEMM `_C` per-Python build
# and print a colon-separated list of their paths to stdout.
#
# Each target Python only needs a working interpreter — torch is not
# installed since `tools/build_deepgemm_C.py` runs from the build interpreter.
# uv is told to use only managed (uv-downloaded) Pythons, because system
# Pythons on the build base may lack the headers the per-Python build needs.
#
# Usage:
# export DEEPGEMM_PYTHON_INTERPRETERS=$(tools/setup_deepgemm_pythons.sh)
# python setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
#
# With no args, expands to every CPython covered by `requires-python` in
# pyproject.toml. Pass explicit versions (e.g. `3.10 3.11`) to override.
#
# Skip this script if you don't have uv: set DEEPGEMM_PYTHON_INTERPRETERS
# directly to existing interpreter paths. Editable / single-Python builds
# don't need the env var at all (cmake falls back to the build interpreter).
#
# Optional: DEEPGEMM_VENV_PREFIX (default: /tmp/dgenv).
set -euo pipefail

if [ "$#" -eq 0 ]; then
  # Derive the matrix from `requires-python = ">=3.X,<3.Y"` in pyproject.toml.
  pyproject="$(dirname "$0")/../pyproject.toml"
  # `|| true` stops `set -e`/pipefail from ending the script silently when
  # grep matches nothing, so the explicit diagnostic below can be printed.
  spec=$(grep -E '^requires-python' "$pyproject" \
    | grep -oE '>=3\.[0-9]+,<3\.[0-9]+' || true)
  if [ -z "$spec" ]; then
    echo "error: no requires-python of the form '>=3.X,<3.Y' in $pyproject" >&2
    exit 1
  fi
  lo=${spec#>=3.}; lo=${lo%%,*}
  hi=${spec##*<3.}
  # An empty range would print an empty interpreter list. Fail loudly
  # instead; diagnostics go to stderr because stdout carries the result.
  if [ "$lo" -ge "$hi" ]; then
    echo "error: requires-python '$spec' covers no Python versions" >&2
    exit 1
  fi
  # Word splitting is intended here: one positional parameter per version.
  set -- $(seq "$lo" $((hi - 1)) | sed 's/^/3./')
fi

prefix="${DEEPGEMM_VENV_PREFIX:-/tmp/dgenv}"
mkdir -p "$prefix"

paths=""
for V in "$@"; do
  venv="$prefix/$V"
  # Force a managed (uv-downloaded) Python so dev headers are bundled.
  # System Pythons on the build base may lack headers (manylinux's
  # /opt/python/cpXY-cpXY are off PATH; an apt-installed python3.X often
  # has no -dev), and the per-Python build needs Python.h.
  # An existing venv with an executable interpreter is reused as-is.
  [ -x "$venv/bin/python" ] || \
    uv venv --python "$V" "$venv" --python-preference only-managed --seed \
      >/dev/null
  paths="$paths:$venv/bin/python"
done
# Strip the leading ":" and print the colon-separated list.
echo "${paths#:}"
Loading