Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
8fcb247
NIXL EP: suffixed builds for CUDA versions and PyTorch versions
ovidiusm May 15, 2026
8fc3d56
Fix torch version pin
ovidiusm May 15, 2026
da2f6df
Use extra index correctly
ovidiusm May 15, 2026
771346a
Add missing repair wheel step
ovidiusm May 15, 2026
a79b000
Refactor to create venv in a controlled way
ovidiusm May 15, 2026
37509dd
Clean up diff
ovidiusm May 15, 2026
4d380cd
Rename flags variable
ovidiusm May 15, 2026
b299774
Remove torch 2.13 build, it fails
ovidiusm May 15, 2026
3fec58f
Fix mypy CI check
ovidiusm May 15, 2026
3896d93
Revert C++ 20 flags, out of scope and unsafe to build only parts of t…
ovidiusm May 15, 2026
3511722
Fix import of binary bindings
ovidiusm May 15, 2026
96b3b7e
Refactor meson build scripts to track deps correctly
ovidiusm May 15, 2026
90e282c
Rename variable for clarity
ovidiusm May 15, 2026
b30995a
Refactor wheel merge from bash into Python
ovidiusm May 15, 2026
758c025
Refactor bash script to simplify
ovidiusm May 15, 2026
deb5840
Format python code
ovidiusm May 15, 2026
99c1962
Apply suggestion from @rakhmets
ovidiusm May 18, 2026
6c5a83c
Apply suggestions from code review
ovidiusm May 18, 2026
3a4bee9
Clean up comments in meson.build
ovidiusm May 18, 2026
67112d7
Clean up comments in meson.build
ovidiusm May 18, 2026
edf4b94
Rename function _load_ep_backend to _load_ep_module
ovidiusm May 18, 2026
edda9d1
Merge branch 'main' into nixl-ep-wheel-dispatch
ovidiusm May 18, 2026
8d59ad7
Update build script to address torch installation issue and simplify …
ovidiusm May 18, 2026
83264e8
Fix help and comment
ovidiusm May 18, 2026
3d10ba5
Address comment
ovidiusm May 18, 2026
a8f6bdb
Refactoring
ovidiusm May 18, 2026
ca86eb7
Address comments
ovidiusm May 21, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions contrib/Dockerfile.manylinux
Original file line number Diff line number Diff line change
Expand Up @@ -354,11 +354,14 @@ RUN echo "/usr/local/nixl/lib/$ARCH-linux-gnu" > /etc/ld.so.conf.d/nixl.conf &&
# Create the wheel
# No need to specifically add path to libcuda.so here, meson finds the stubs and links them
ARG WHL_PYTHON_VERSIONS="3.10,3.11,3.12,3.13,3.14"
ARG WHL_TORCH_VERSIONS="2.11,2.12"
ARG WHL_PLATFORM="manylinux_2_28_$ARCH"
RUN IFS=',' read -ra PYTHON_VERSIONS <<< "$WHL_PYTHON_VERSIONS" && \
export UV_INDEX="https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d .)" && \
export UV_INDEX_STRATEGY=unsafe-best-match && \
if [ "$BUILD_NIXL_EP" = "true" ]; then EP_BUILD_FLAG="--build-nixl-ep"; else EP_BUILD_FLAG=""; fi && \
if [ "$BUILD_NIXL_EP" = "true" ]; then \
EP_BUILD_FLAGS="--build-nixl-ep --torch-versions $WHL_TORCH_VERSIONS"; \
else \
EP_BUILD_FLAGS=""; \
fi && \
rm -rf dist && mkdir -p dist && \
for PYTHON_VERSION in "${PYTHON_VERSIONS[@]}"; do \
export PATH=$VIRTUAL_ENV/bin:$PATH && \
Expand All @@ -368,7 +371,7 @@ RUN IFS=',' read -ra PYTHON_VERSIONS <<< "$WHL_PYTHON_VERSIONS" && \
--ucx-plugins-dir /usr/lib64/ucx \
--nixl-plugins-dir $NIXL_PLUGIN_DIR \
--output-dir dist \
$EP_BUILD_FLAG ; \
$EP_BUILD_FLAGS ; \
done

# Copy the meta package wheel to the dist directory, which will be used to push to PyPI.
Expand Down
234 changes: 220 additions & 14 deletions contrib/build-wheel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Parse arguments
PYTHON_VERSION="3.12"
ARCH=$(uname -m)
WHL_PLATFORM="manylinux_2_39_$ARCH"
UCX_PLUGINS_DIR="/usr/lib64/ucx"
NIXL_PLUGINS_DIR="/usr/local/nixl/lib/$ARCH-linux-gnu/plugins"
OUTPUT_DIR="dist"
BUILD_NIXL_EP="false"
TORCH_VERSIONS=""

# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--python-version)
Expand Down Expand Up @@ -60,6 +59,7 @@ while [[ $# -gt 0 ]]; do
echo " --ucx-plugins-dir: Directory to find UCX plugins in (default: $UCX_PLUGINS_DIR)"
echo " --nixl-plugins-dir: Directory to find NIXL plugins in (default: $NIXL_PLUGINS_DIR)"
echo " --build-nixl-ep: Build wheel with nixl_ep package included (requires CUDA sm90-compatible environment)"
echo " --torch-versions: Comma-separated list of torch versions to build the wheel for (default: $TORCH_VERSIONS)"
echo " --help: Show this help message"
echo ""
echo "Must be executed from the root of the NIXL repository."
Expand All @@ -69,17 +69,26 @@ while [[ $# -gt 0 ]]; do
BUILD_NIXL_EP="true"
shift
;;
--torch-versions)
TORCH_VERSIONS=$2
shift
shift
;;
Comment thread
coderabbitai[bot] marked this conversation as resolved.
*)
echo "Unknown argument: $1"
exit 1
;;
esac
done

if [ "$BUILD_NIXL_EP" = "true" ] && [ -z "$TORCH_VERSIONS" ]; then
echo "ERROR: --build-nixl-ep requires --torch-versions (e.g. --torch-versions 2.11,2.12)" >&2
exit 1
fi

set -e
set -x

# Build the wheel
TMP_DIR=$(mktemp -d)

CUDA_MAJOR=$(nvcc --version | grep -Eo 'release [0-9]+\.[0-9]+' | cut -d' ' -f2 | cut -d'.' -f1)
Expand All @@ -88,21 +97,218 @@ if [ "$CUDA_MAJOR" -ne 12 ] && [ "$CUDA_MAJOR" -ne 13 ]; then
echo "Invalid CUDA_MAJOR: '$CUDA_MAJOR'"
exit 1
fi
AUDITWHEEL_EXCLUDES="--exclude libcuda* --exclude libcufile* --exclude libssl* --exclude libcrypto* --exclude libefa* --exclude libhwloc* --exclude libfabric* --exclude libtorch* --exclude libc10* --exclude libdoca*"

PKG_NAME="nixl-cu${CUDA_MAJOR}"
CU_TAG="cu$(nvcc --version | grep -Eo 'release [0-9]+\.[0-9]+' | cut -d' ' -f2 | tr -d .)"
./contrib/tomlutil.py --wheel-name $PKG_NAME pyproject.toml
if [ "$BUILD_NIXL_EP" = "true" ]; then
uv build --wheel --out-dir $TMP_DIR --python $PYTHON_VERSION \
-Csetup-args=-Dbuild_nixl_ep=true \
-Csetup-args=-Dbuild_examples=true

# PyTorch wheel indexes for the CUDA tag derived from nvcc (CU_TAG, e.g. "cu130").
# torch_classify probes these and install_torch installs from them.
TORCH_STABLE_INDEX="https://download.pytorch.org/whl/${CU_TAG}"
TORCH_NIGHTLY_INDEX="https://download.pytorch.org/whl/nightly/${CU_TAG}"

# Build deps for the per-iteration venv; torch is installed separately.
# build_wheel runs `uv build --no-build-isolation`, so everything the build
# needs has to be installed into the venv up front via this list.
BUILD_DEPS=(
"meson"
"meson-python"
"pybind11"
"patchelf"
"pyyaml"
"types-PyYAML"
"setuptools>=80.9.0"
)

# Strip every "." from a dotted version string so the result can be embedded
# in a path component without ambiguity.
# Arguments: $1 - dotted version, e.g. "2.13" or "3.10"
# Outputs:   the version without dots on stdout, e.g. "213" or "310"
slug() {
  local dotted=$1
  echo "${dotted//./}"
}

# Print the directory of the build venv for the current PYTHON_VERSION and an
# optional torch version. Each (python, torch) pair gets its own directory
# under /workspace so packages pulled in by one torch version are never shared
# with another.
# Globals:   PYTHON_VERSION (read)
# Arguments: $1 - torch version such as "2.13" (optional; empty means no torch)
# Outputs:   the venv path on stdout
venv_path() {
  local torch_ver=${1:-}
  local py_slug
  py_slug=$(slug "$PYTHON_VERSION")

  if [ -z "$torch_ver" ]; then
    echo "/workspace/venv-py${py_slug}"
    return
  fi
  echo "/workspace/venv-torch$(slug "$torch_ver")-py${py_slug}"
}

# Report where torch==<ver>.* can be resolved for the current PYTHON_VERSION.
# A throw-away venv is created only so `uv pip install --dry-run` has an
# interpreter to resolve against; nothing is installed.
# Globals:   PYTHON_VERSION, TORCH_STABLE_INDEX, TORCH_NIGHTLY_INDEX (read)
# Arguments: $1 - torch version prefix, e.g. "2.13"
# Outputs:   one of "stable", "nightly" or "unavailable" on stdout
#   stable      - resolves with the stable cu index as the only index
#   nightly     - resolves with --pre and the stable + nightly cu indexes added
#   unavailable - neither probe resolved, or the probe venv could not be made
# NOTE(review): the nightly probe adds the cu indexes as *extra* indexes while
# install_torch uses the nightly index alone - confirm the two always agree.
torch_classify() {
  local torch_ver=$1
  local verdict="unavailable"
  local probe_dir
  probe_dir="/workspace/venv-probe-py$(slug "$PYTHON_VERSION")"
  local requirement="torch==${torch_ver}.*"

  local -a stable_probe=(
    uv pip install --dry-run
    --python "$probe_dir/bin/python"
    --index-url "$TORCH_STABLE_INDEX"
    "$requirement"
  )
  local -a nightly_probe=(
    uv pip install --dry-run --pre
    --python "$probe_dir/bin/python"
    --extra-index-url "$TORCH_STABLE_INDEX"
    --extra-index-url "$TORCH_NIGHTLY_INDEX"
    --index-strategy unsafe-best-match
    "$requirement"
  )

  rm -rf "$probe_dir"
  if ! uv venv "$probe_dir" --python "$PYTHON_VERSION" >/dev/null 2>&1; then
    : # no interpreter to probe with; verdict stays "unavailable"
  elif "${stable_probe[@]}" >/dev/null 2>&1; then
    verdict="stable"
  elif "${nightly_probe[@]}" >/dev/null 2>&1; then
    verdict="nightly"
  fi
  rm -rf "$probe_dir"

  echo "$verdict"
}

# Install torch into an existing venv from a single cu index, isolated from
# PyPI: with PyPI as a fallback its plain `torch==X.Y.0` beats cu nightly's
# `X.Y.0.dev*+cuXX` (PEP 440: final > pre-release).
# Globals:   TORCH_STABLE_INDEX, TORCH_NIGHTLY_INDEX (read)
# Arguments: $1 - venv directory (its bin/python is the install target)
#            $2 - torch version, "X.Y" (a trailing ".Z" is tolerated)
#            $3 - channel: "nightly" uses the nightly index with --pre,
#                 anything else uses the stable index
# Returns:   the exit status of `uv pip install`, or 1 when a nightly install
#            is requested with a version that is not of the form X.Y[.Z]
install_torch() {
  local VENV_PATH=$1
  local VER=$2
  local CHANNEL=$3

  if [ "$CHANNEL" = "nightly" ]; then
    # Take major/minor from the first two components. Using the last dotted
    # component as the minor would turn "2.13.1" into the range 2.1.x.
    if [[ ! "$VER" =~ ^([0-9]+)\.([0-9]+)(\..*)?$ ]]; then
      echo "ERROR: cannot derive a nightly version range from torch version '$VER' (expected X.Y)" >&2
      return 1
    fi
    local MAJOR="${BASH_REMATCH[1]}"
    local MINOR="${BASH_REMATCH[2]}"

    # Nightlies are X.Y.0.devN, so match the whole X.Y series explicitly.
    uv pip install \
      --python "$VENV_PATH/bin/python" \
      --index-url "$TORCH_NIGHTLY_INDEX" \
      --pre \
      "torch>=${MAJOR}.${MINOR}.0.dev0,<${MAJOR}.$((MINOR + 1))"
  else
    uv pip install \
      --python "$VENV_PATH/bin/python" \
      --index-url "$TORCH_STABLE_INDEX" \
      "torch==${VER}.*"
  fi
}

# Build one wheel for the current PYTHON_VERSION, optionally against a given
# torch version, inside a freshly created venv that is deleted afterwards.
# Globals:   PYTHON_VERSION, BUILD_DEPS, BUILD_NIXL_EP (read)
# Arguments: $1 - directory that receives the built wheel
#            $2 - torch version, e.g. "2.13" (optional; empty = no torch)
# Outputs:   a "=== Provisioning ... ===" banner on stdout plus tool output
build_wheel() {
  local out_dir=$1
  local torch_ver=${2:-}

  local venv_dir
  venv_dir=$(venv_path "$torch_ver")
  local venv_python="$venv_dir/bin/python"

  # The channel is only looked up (and only reported) when torch is requested.
  local channel="stable"
  local torch_note=""
  if [ -n "$torch_ver" ]; then
    channel=$(torch_classify "$torch_ver")
    torch_note=", torch ${torch_ver} [${channel}]"
  fi

  echo "=== Provisioning ${venv_dir} (python ${PYTHON_VERSION}${torch_note}) ==="
  rm -rf "$venv_dir"
  uv venv "$venv_dir" --python "$PYTHON_VERSION"
  uv pip install --python "$venv_python" "${BUILD_DEPS[@]}"
  if [ -n "$torch_ver" ]; then
    install_torch "$venv_dir" "$torch_ver" "$channel"
  fi

  # Activate the venv so that meson's `find_installation('python3')` picks up
  # this venv's interpreter (the one with the matching torch).
  # shellcheck disable=SC1091
  source "$venv_dir/bin/activate"

  # --no-build-isolation: build against the packages installed above.
  local -a uv_build_cmd=(
    uv build
    --wheel
    --no-build-isolation
    --out-dir "$out_dir"
    --python "$venv_python"
  )
  if [ "$BUILD_NIXL_EP" = "true" ]; then
    uv_build_cmd+=(-Csetup-args=-Dbuild_nixl_ep=true)
    uv_build_cmd+=(-Csetup-args=-Dbuild_examples=true)
  fi
  "${uv_build_cmd[@]}"

  deactivate
  # A venv with torch and its nvidia-* dependencies is several GB; remove it
  # so the docker layer does not grow across the (python, torch) matrix.
  rm -rf "$venv_dir"
}

# Run auditwheel on the nixl wheel(s) in one directory, then add the UCX and
# NIXL plugins to every wheel in the output directory.
# Globals:   AUDITWHEEL_EXCLUDES (whitespace-separated auditwheel options),
#            WHL_PLATFORM, UCX_PLUGINS_DIR, NIXL_PLUGINS_DIR (read)
# Arguments: $1 - directory containing the freshly built nixl*.whl
#            $2 - directory that receives the repaired wheel (created if missing)
repair_wheel() {
  local IN_DIR=$1
  local OUT_DIR=$2
  local -a EXCLUDES=()

  # Split the option string into words WITHOUT pathname expansion. Expanding
  # $AUDITWHEEL_EXCLUDES unquoted would let patterns such as `libcuda*` be
  # replaced by matching file names from the current directory.
  IFS=$' \t\n' read -ra EXCLUDES <<< "$AUDITWHEEL_EXCLUDES"

  mkdir -p "$OUT_DIR"
  auditwheel repair "${EXCLUDES[@]}" "$IN_DIR"/nixl*.whl --plat "$WHL_PLATFORM" --wheel-dir "$OUT_DIR"
  ./contrib/wheel_add_ucx_plugins.py --ucx-plugins-dir "$UCX_PLUGINS_DIR" --nixl-plugins-dir "$NIXL_PLUGINS_DIR" "$OUT_DIR"/*.whl
}

# Print the path of the single .whl file located directly in directory $1.
# Arguments: $1 - directory to look in (not searched recursively)
# Outputs:   the wheel path on stdout; a diagnostic on stderr on failure
# Exits:     with status 1 when the directory does not contain exactly one
#            wheel (inside $(...) this terminates only the substitution)
get_wheel_path() {
  local dir=$1
  local -a wheels=()
  local had_nullglob=0

  # Enable nullglob only for the glob below and put the caller's setting back
  # afterwards; unconditionally disabling it would clobber a caller that had
  # it switched on.
  if shopt -q nullglob; then
    had_nullglob=1
  fi
  shopt -s nullglob
  wheels=("$dir"/*.whl)
  if [ "$had_nullglob" -eq 0 ]; then
    shopt -u nullglob
  fi

  if [ ${#wheels[@]} -ne 1 ]; then
    echo "expected 1 wheel in $dir, got ${#wheels[@]}: ${wheels[*]}" >&2
    exit 1
  fi
  printf '%s\n' "${wheels[0]}"
}

# Top-level build flow. With nixl_ep enabled and a torch list given, build one
# complete wheel against the first usable torch version and merge the
# torch-specific extension from every further version into it; otherwise build
# a single wheel without torch. Either way exactly one repaired wheel is
# copied to $OUTPUT_DIR.
if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then
# Multi-torch: build the full wheel with the first torch, then merge
# the per-torch .so from the others into it.
IFS=',' read -ra TORCH_REQUESTED <<< "$TORCH_VERSIONS"

# Filter to torch versions actually resolvable for this (Python, CUDA) combo.
# Versions that torch_classify reports as "unavailable" are skipped, not fatal.
TORCH_ARRAY=()
SKIPPED=()
for TORCH in "${TORCH_REQUESTED[@]}"; do
if [ "$(torch_classify "$TORCH")" = "unavailable" ]; then
SKIPPED+=("$TORCH")
else
TORCH_ARRAY+=("$TORCH")
fi
done

if [ ${#SKIPPED[@]} -gt 0 ]; then
echo "=== Skipping torch versions (no wheel on index for Python ${PYTHON_VERSION} + ${CU_TAG}): ${SKIPPED[*]} ==="
fi
# Nothing left to build against: fail instead of producing a wheel without nixl_ep.
if [ ${#TORCH_ARRAY[@]} -eq 0 ]; then
echo "ERROR: none of the requested torch versions (${TORCH_REQUESTED[*]}) are available for Python ${PYTHON_VERSION} + ${CU_TAG}"
exit 1
fi
echo "=== Building for torch versions: ${TORCH_ARRAY[*]} ==="

# The first usable torch version produces the base wheel that is shipped.
FIRST_TORCH="${TORCH_ARRAY[0]}"
echo "=== Building wheel with torch ${FIRST_TORCH} ==="
build_wheel "$TMP_DIR" "$FIRST_TORCH"
repair_wheel "$TMP_DIR" "$TMP_DIR/dist"
BASE_WHL=$(get_wheel_path "$TMP_DIR/dist")

# Remaining versions (index 1 onward): build and repair a throw-away wheel in
# its own temp dir, then hand it to wheel_merge.py together with the base wheel.
for ((i=1; i<${#TORCH_ARRAY[@]}; i++)); do
TORCH="${TORCH_ARRAY[$i]}"
echo "=== Building nixl_ep .so for torch ${TORCH} ==="

EP_TMP=$(mktemp -d)
build_wheel "$EP_TMP" "$TORCH"
repair_wheel "$EP_TMP" "$EP_TMP/dist"

# Merge only the torch-versioned .so. Both wheels were built
# against the same outer C++ build, so its DT_NEEDED entries
# (libucp-<hash>.so etc.) match what auditwheel already bundled
# into $BASE_WHL.
# TORCH_MM is the torch version without dots (e.g. "2.12" -> "212"); it is
# used in the --pattern that selects which files are merged.
TORCH_MM=$(echo "$TORCH" | tr -d '.')
EP_WHL=$(get_wheel_path "$EP_TMP/dist")
./contrib/wheel_merge.py \
--base-wheel "$BASE_WHL" \
--source-wheel "$EP_WHL" \
--pattern "nixl_ep_cpp_torch${TORCH_MM}.*" \
--target-dir "nixl_ep_cu${CUDA_MAJOR}"

rm -rf "$EP_TMP"
done

cp "$BASE_WHL" "$OUTPUT_DIR"
else
# No torch requested: a single build without a torch-specific venv.
# NOTE(review): the `uv build` line below writes a wheel into $TMP_DIR and
# build_wheel then builds into the same directory again; it looks like a
# leftover of the pre-refactor flow - confirm whether it should be removed.
uv build --wheel --out-dir $TMP_DIR --python $PYTHON_VERSION
build_wheel "$TMP_DIR"
repair_wheel "$TMP_DIR" "$TMP_DIR/dist"
cp "$(get_wheel_path "$TMP_DIR/dist")" "$OUTPUT_DIR"
fi

# Bundle libraries
mkdir $TMP_DIR/dist
auditwheel repair --exclude 'libcuda*' --exclude 'libcufile*' --exclude 'libssl*' --exclude 'libcrypto*' --exclude 'libefa*' --exclude 'libhwloc*' --exclude 'libfabric*' --exclude 'libtorch*' --exclude 'libc10*' --exclude 'libdoca*' $TMP_DIR/nixl*.whl --plat $WHL_PLATFORM --wheel-dir $TMP_DIR/dist
./contrib/wheel_add_ucx_plugins.py --ucx-plugins-dir $UCX_PLUGINS_DIR --nixl-plugins-dir $NIXL_PLUGINS_DIR $TMP_DIR/dist/*.whl
cp $TMP_DIR/dist/*.whl $OUTPUT_DIR

# Clean up
rm -rf "$TMP_DIR"
Loading
Loading