From 8fcb247990569fddc28028f8dc48c1f739559f5f Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 02:00:46 +0200 Subject: [PATCH 01/26] NIXL EP: suffixed builds for CUDA versions and PyTorch versions Signed-off-by: Ovidiu Mara --- contrib/Dockerfile.manylinux | 16 +++- contrib/build-wheel.sh | 90 ++++++++++++++++--- examples/device/ep/meson.build | 20 +++-- examples/device/ep/nixl_ep/__init__.py | 6 +- .../python/nixl-meta/nixl/meson.build | 15 ++++ .../python/nixl-meta/nixl_ep/__init__.py | 77 ++++++++++++++++ .../python/nixl-meta/pyproject.toml.in | 2 +- 7 files changed, 204 insertions(+), 22 deletions(-) create mode 100644 src/bindings/python/nixl-meta/nixl_ep/__init__.py diff --git a/contrib/Dockerfile.manylinux b/contrib/Dockerfile.manylinux index 14f1234c66..3dad548679 100644 --- a/contrib/Dockerfile.manylinux +++ b/contrib/Dockerfile.manylinux @@ -294,9 +294,11 @@ RUN rm -rf $VIRTUAL_ENV && uv venv $VIRTUAL_ENV --python $DEFAULT_PYTHON_VERSION ENV PATH="$VIRTUAL_ENV/bin:$PATH" # Install python dependencies RUN uv pip install --upgrade meson meson-python pybind11 patchelf pyYAML click setuptools tabulate auditwheel tomlkit -# Install PyTorch +# Install PyTorch (default version for meson setup; wheel builds may reinstall other versions) +ARG TORCH_VERSIONS="2.11,2.12,2.13" RUN export UV_INDEX="https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d .)" && \ - uv pip install 'torch==2.11.*' + FIRST_TORCH=$(echo "$TORCH_VERSIONS" | cut -d, -f1) && \ + uv pip install "torch==${FIRST_TORCH}.*" # Upgrade setuptools to latest version for compatibility with PEP 639 (license format) RUN uv pip install --upgrade 'setuptools>=80.9.0' @@ -356,9 +358,15 @@ RUN echo "/usr/local/nixl/lib/$ARCH-linux-gnu" > /etc/ld.so.conf.d/nixl.conf && ARG WHL_PYTHON_VERSIONS="3.10,3.11,3.12,3.13,3.14" ARG WHL_PLATFORM="manylinux_2_28_$ARCH" RUN IFS=',' read -ra PYTHON_VERSIONS <<< "$WHL_PYTHON_VERSIONS" && \ - export UV_INDEX="https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d .)" && \ + CU_TAG="cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d .)" && \ + export UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/nightly/${CU_TAG}" && \ + export UV_INDEX="https://download.pytorch.org/whl/${CU_TAG}" && \ export UV_INDEX_STRATEGY=unsafe-best-match && \ - if [ "$BUILD_NIXL_EP" = "true" ]; then EP_BUILD_FLAG="--build-nixl-ep"; else EP_BUILD_FLAG=""; fi && \ + if [ "$BUILD_NIXL_EP" = "true" ]; then \ + EP_BUILD_FLAG="--build-nixl-ep --torch-versions $TORCH_VERSIONS"; \ + else \ + EP_BUILD_FLAG=""; \ + fi && \ rm -rf dist && mkdir -p dist && \ for PYTHON_VERSION in "${PYTHON_VERSIONS[@]}"; do \ export PATH=$VIRTUAL_ENV/bin:$PATH && \ diff --git a/contrib/build-wheel.sh b/contrib/build-wheel.sh index 3a4b4a58d3..3ecc15ade9 100755 --- a/contrib/build-wheel.sh +++ b/contrib/build-wheel.sh @@ -23,6 +23,7 @@ UCX_PLUGINS_DIR="/usr/lib64/ucx" NIXL_PLUGINS_DIR="/usr/local/nixl/lib/$ARCH-linux-gnu/plugins" OUTPUT_DIR="dist" BUILD_NIXL_EP="false" +TORCH_VERSIONS="" # Parse arguments while [[ $# -gt 0 ]]; do @@ -69,6 +70,11 @@ while [[ $# -gt 0 ]]; do BUILD_NIXL_EP="true" shift ;; + --torch-versions) + TORCH_VERSIONS=$2 + shift + shift + ;; *) echo "Unknown argument: $1" exit 1 @@ -88,21 +94,83 @@ if [ "$CUDA_MAJOR" -ne 12 ] && [ "$CUDA_MAJOR" -ne 13 ]; then echo "Invalid CUDA_MAJOR: '$CUDA_MAJOR'" exit 1 fi +AUDITWHEEL_EXCLUDES="--exclude libcuda* --exclude libcufile* --exclude libssl* --exclude libcrypto* --exclude libefa* --exclude libhwloc* --exclude libfabric* --exclude libtorch* --exclude libc10* --exclude libdoca*" + PKG_NAME="nixl-cu${CUDA_MAJOR}" ./contrib/tomlutil.py --wheel-name $PKG_NAME pyproject.toml -if [ "$BUILD_NIXL_EP" = "true" ]; then - uv build --wheel --out-dir $TMP_DIR --python $PYTHON_VERSION \ - -Csetup-args=-Dbuild_nixl_ep=true \ - -Csetup-args=-Dbuild_examples=true + +install_torch() { + local VER=$1 + uv pip install --pre "torch==${VER}.*" +} + +build_wheel() { + local OUT_DIR=$1 + if [ "$BUILD_NIXL_EP" = "true" ]; then + uv build --wheel --out-dir "$OUT_DIR" --python $PYTHON_VERSION \ + -Csetup-args=-Dbuild_nixl_ep=true \ + -Csetup-args=-Dbuild_examples=true + else + uv build --wheel --out-dir "$OUT_DIR" --python $PYTHON_VERSION + fi +} + +repair_wheel() { + local IN_DIR=$1 + local OUT_DIR=$2 + mkdir -p "$OUT_DIR" + auditwheel repair $AUDITWHEEL_EXCLUDES "$IN_DIR"/nixl*.whl --plat $WHL_PLATFORM --wheel-dir "$OUT_DIR" + ./contrib/wheel_add_ucx_plugins.py --ucx-plugins-dir $UCX_PLUGINS_DIR --nixl-plugins-dir $NIXL_PLUGINS_DIR "$OUT_DIR"/*.whl +} + +if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then + # Multi-torch: build full wheel with first torch, then merge extra .so from others. + IFS=',' read -ra TV_ARRAY <<< "$TORCH_VERSIONS" + + FIRST_TORCH="${TV_ARRAY[0]}" + echo "=== Building wheel with torch ${FIRST_TORCH} ===" + install_torch "$FIRST_TORCH" + build_wheel "$TMP_DIR" + repair_wheel "$TMP_DIR" "$TMP_DIR/dist" + BASE_WHL=$(ls "$TMP_DIR"/dist/*.whl) + + for ((i=1; i<${#TV_ARRAY[@]}; i++)); do + TV="${TV_ARRAY[$i]}" + echo "=== Building nixl_ep .so for torch ${TV} ===" + install_torch "$TV" + + EP_TMP=$(mktemp -d) + build_wheel "$EP_TMP" + + # Extract torch-versioned .so from new wheel, inject into base wheel + EP_EXTRACT=$(mktemp -d) + unzip -o "$EP_TMP"/nixl*.whl -d "$EP_EXTRACT" + BASE_EXTRACT=$(mktemp -d) + unzip -o "$BASE_WHL" -d "$BASE_EXTRACT" + + TORCH_MM=$(echo "$TV" | tr -d '.') + find "$EP_EXTRACT" -name "nixl_ep_cpp_torch${TORCH_MM}*" -exec cp {} "$BASE_EXTRACT"/nixl_ep_cu${CUDA_MAJOR}/ \; + + # Regenerate RECORD + DIST_INFO=$(ls -d "$BASE_EXTRACT"/*.dist-info) + (cd "$BASE_EXTRACT" && find . -type f ! -name RECORD -printf '%P\n' | while read f; do + hash=$(python3 -c "import hashlib,base64; d=open('$f','rb').read(); print('sha256=' + base64.urlsafe_b64encode(hashlib.sha256(d).digest()).rstrip(b'=').decode())") + size=$(stat -c%s "$f") + echo "$f,$hash,$size" + done > "$DIST_INFO/RECORD" + echo "$(basename $DIST_INFO)/RECORD,," >> "$DIST_INFO/RECORD") + + rm -f "$BASE_WHL" + (cd "$BASE_EXTRACT" && zip -r "$BASE_WHL" .) + rm -rf "$EP_TMP" "$EP_EXTRACT" "$BASE_EXTRACT" + done + + cp "$BASE_WHL" "$OUTPUT_DIR" else - uv build --wheel --out-dir $TMP_DIR --python $PYTHON_VERSION + build_wheel "$TMP_DIR" + repair_wheel "$TMP_DIR" "$TMP_DIR/dist" + cp "$TMP_DIR"/dist/*.whl "$OUTPUT_DIR" fi -# Bundle libraries -mkdir $TMP_DIR/dist -auditwheel repair --exclude 'libcuda*' --exclude 'libcufile*' --exclude 'libssl*' --exclude 'libcrypto*' --exclude 'libefa*' --exclude 'libhwloc*' --exclude 'libfabric*' --exclude 'libtorch*' --exclude 'libc10*' --exclude 'libdoca*' $TMP_DIR/nixl*.whl --plat $WHL_PLATFORM --wheel-dir $TMP_DIR/dist -./contrib/wheel_add_ucx_plugins.py --ucx-plugins-dir $UCX_PLUGINS_DIR --nixl-plugins-dir $NIXL_PLUGINS_DIR $TMP_DIR/dist/*.whl -cp $TMP_DIR/dist/*.whl $OUTPUT_DIR - # Clean up rm -rf "$TMP_DIR" diff --git a/examples/device/ep/meson.build b/examples/device/ep/meson.build index a9ba19fb7a..8de8d56939 100644 --- a/examples/device/ep/meson.build +++ b/examples/device/ep/meson.build @@ -135,9 +135,18 @@ nixl_ep_install_rpath = join_paths(get_option('prefix'), get_option('libdir')) nixl_ep_install_rpath += ':' + join_paths(get_option('prefix'), get_option('libdir'), 'plugins') nixl_ep_install_rpath += ':' + torch_lib_dir -nixl_ep_ext = py.extension_module('nixl_ep_cpp', +# CUDA-versioned install dir so cu12 and cu13 wheels don't collide. +nixl_ep_install_dir = 'nixl_ep_' + cuda_wheel_dir.split('_')[-1] # nixl_ep_cu12 or nixl_ep_cu13 + +# Torch-versioned .so name so multiple torch ABIs coexist in a single wheel. +torch_ver = run_command(py, '-c', + 'import torch; print("".join(torch.__version__.split(".")[:2]))', + check: true).stdout().strip() +nixl_ep_ext_name = 'nixl_ep_cpp_torch' + torch_ver + +nixl_ep_ext = py.extension_module(nixl_ep_ext_name, nixl_ep_sources, - subdir: 'nixl_ep', + subdir: nixl_ep_install_dir, dependencies: [ nixl_dep, pybind_dep, @@ -165,11 +174,12 @@ custom_target('nixl_ep_py_copy', input: nixl_ep_py_files, command: [ 'bash', '-c', - 'cp -r @0@/nixl_ep @1@/ && cp @2@ @1@/nixl_ep/ && touch @3@'.format( + 'mkdir -p @1@/@4@ && cp @0@/nixl_ep/*.py @1@/@4@/ && cp @2@ @1@/@4@/ && touch @3@'.format( meson.current_source_dir(), meson.current_build_dir(), nixl_ep_ext.full_path(), - join_paths(meson.current_build_dir(), 'nixl_ep_py.stamp') + join_paths(meson.current_build_dir(), 'nixl_ep_py.stamp'), + nixl_ep_install_dir, ) ], depends: nixl_ep_ext, @@ -180,6 +190,6 @@ py.install_sources( 'nixl_ep/__init__.py', 'nixl_ep/buffer.py', 'nixl_ep/utils.py', - subdir: 'nixl_ep', + subdir: nixl_ep_install_dir, pure: false, ) diff --git a/examples/device/ep/nixl_ep/__init__.py b/examples/device/ep/nixl_ep/__init__.py index 488ad2fc29..0fa3ad030e 100644 --- a/examples/device/ep/nixl_ep/__init__.py +++ b/examples/device/ep/nixl_ep/__init__.py @@ -18,9 +18,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import importlib + import torch -from . import nixl_ep_cpp as _nixl_ep_cpp +_torch_mm = "".join(torch.__version__.split(".")[:2]) +_nixl_ep_cpp = importlib.import_module(f".nixl_ep_cpp_torch{_torch_mm}", __package__) + from .buffer import Buffer from .utils import EventOverlap diff --git a/src/bindings/python/nixl-meta/nixl/meson.build b/src/bindings/python/nixl-meta/nixl/meson.build index d71f55189b..9867ced11a 100644 --- a/src/bindings/python/nixl-meta/nixl/meson.build +++ b/src/bindings/python/nixl-meta/nixl/meson.build @@ -17,3 +17,18 @@ fs = import('fs') fs.copyfile('__init__.py') fs.copyfile('_api.py') fs.copyfile('logging.py') + +# nixl_ep meta-dispatcher: copy into build dir so the meta wheel picks it up +nixl_ep_meta_dir = join_paths(meson.current_build_dir(), '..', 'nixl_ep') +nixl_ep_meta_src = join_paths(meson.current_source_dir(), '..', 'nixl_ep', '__init__.py') +custom_target('nixl_ep_meta_copy', + output: 'nixl_ep_meta.stamp', + command: ['bash', '-c', + 'mkdir -p @0@ && cp @1@ @0@/__init__.py && touch @2@'.format( + nixl_ep_meta_dir, + nixl_ep_meta_src, + join_paths(meson.current_build_dir(), 'nixl_ep_meta.stamp'), + ) + ], + build_by_default: true, +) diff --git a/src/bindings/python/nixl-meta/nixl_ep/__init__.py b/src/bindings/python/nixl-meta/nixl_ep/__init__.py new file mode 100644 index 0000000000..b86783bc62 --- /dev/null +++ b/src/bindings/python/nixl-meta/nixl_ep/__init__.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""nixl_ep meta-dispatcher: selects the correct CUDA and torch ABI backend.""" + +import importlib +import sys +from typing import TYPE_CHECKING + + +def _get_torch_cuda_major() -> int | None: + """Return the CUDA major version that torch was built for, or None.""" + from torch.version import cuda as _torch_cuda_ver + + return int(_torch_cuda_ver.split(".")[0]) if _torch_cuda_ver else None + + +def _load_ep_backend() -> str: + cuda_major = _get_torch_cuda_major() + if cuda_major is not None: + pip_name = f"nixl-cu{cuda_major}" + mod_name = f"nixl_ep_cu{cuda_major}" + try: + return importlib.import_module(mod_name).__name__ + except ModuleNotFoundError as e: + if e.name != mod_name: + raise + raise ImportError( + f"torch reports CUDA {cuda_major} but {pip_name} is not installed" + ) from e + # CPU-only torch — use whatever backend is installed + for mod_name in ("nixl_ep_cu13", "nixl_ep_cu12"): + try: + return importlib.import_module(mod_name).__name__ + except ModuleNotFoundError as e: + if e.name != mod_name: + raise + continue + raise ImportError("No nixl_ep CUDA backend found") + + +_pkg = sys.modules[_load_ep_backend()] + +submodules = ["buffer", "utils"] +for sub_name in submodules: + # Import submodule from actual wheel + module = importlib.import_module(f"{_pkg.__name__}.{sub_name}") + # Make it accessible as nixl_ep.buffer, nixl_ep.utils + sys.modules[f"nixl_ep.{sub_name}"] = module + # Also add the submodule itself to the nixl_ep namespace + setattr(sys.modules[__name__], sub_name, module) + + # Expose all public symbols from the submodule under the nixl_ep namespace + for attr in dir(module): + if not attr.startswith("_"): + setattr(sys.modules[__name__], attr, getattr(module, attr)) + +# Expose public symbols from the backend __init__ (Config, topk_idx_t, etc.) +for attr in dir(_pkg): + if not attr.startswith("_"): + setattr(sys.modules[__name__], attr, getattr(_pkg, attr)) + +if TYPE_CHECKING: + from nixl_ep.buffer import Buffer # noqa: F401 + from nixl_ep.utils import EventOverlap # noqa: F401 diff --git a/src/bindings/python/nixl-meta/pyproject.toml.in b/src/bindings/python/nixl-meta/pyproject.toml.in index b9a957510f..23a95a4994 100644 --- a/src/bindings/python/nixl-meta/pyproject.toml.in +++ b/src/bindings/python/nixl-meta/pyproject.toml.in @@ -36,4 +36,4 @@ cu12 = ["nixl-cu12==@VERSION@"] cu13 = ["nixl-cu13==@VERSION@"] [tool.setuptools] -packages = ["nixl"] +packages = ["nixl", "nixl_ep"] From 8fc3d56435004fb075a749f12eff9b399f277913 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 03:17:28 +0200 Subject: [PATCH 02/26] Fix torch version pin Signed-off-by: Ovidiu Mara --- contrib/build-wheel.sh | 22 +++++++++++++++++----- contrib/tomlutil.py | 18 ++++++++++++------ 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/contrib/build-wheel.sh b/contrib/build-wheel.sh index 3ecc15ade9..68a730bc18 100755 --- a/contrib/build-wheel.sh +++ b/contrib/build-wheel.sh @@ -97,21 +97,33 @@ fi AUDITWHEEL_EXCLUDES="--exclude libcuda* --exclude libcufile* --exclude libssl* --exclude libcrypto* --exclude libefa* --exclude libhwloc* --exclude libfabric* --exclude libtorch* --exclude libc10* --exclude libdoca*" PKG_NAME="nixl-cu${CUDA_MAJOR}" +CU_TAG="cu$(nvcc --version | grep -Eo 'release [0-9]+\.[0-9]+' | cut -d' ' -f2 | tr -d .)" ./contrib/tomlutil.py --wheel-name $PKG_NAME pyproject.toml -install_torch() { +# Pin the torch build dep in pyproject.toml so uv build's isolated env resolves it. +pin_torch() { local VER=$1 - uv pip install --pre "torch==${VER}.*" + ./contrib/tomlutil.py --torch-version "$VER" pyproject.toml } +# uv build's isolated build env needs access to the nightly index too. +UV_BUILD_INDEX_FLAGS=( + --index-url "https://download.pytorch.org/whl/${CU_TAG}" + --extra-index-url "https://download.pytorch.org/whl/nightly/${CU_TAG}" + --index-strategy unsafe-best-match + --prerelease allow +) + build_wheel() { local OUT_DIR=$1 if [ "$BUILD_NIXL_EP" = "true" ]; then uv build --wheel --out-dir "$OUT_DIR" --python $PYTHON_VERSION \ + "${UV_BUILD_INDEX_FLAGS[@]}" \ -Csetup-args=-Dbuild_nixl_ep=true \ -Csetup-args=-Dbuild_examples=true else - uv build --wheel --out-dir "$OUT_DIR" --python $PYTHON_VERSION + uv build --wheel --out-dir "$OUT_DIR" --python $PYTHON_VERSION \ + "${UV_BUILD_INDEX_FLAGS[@]}" fi } @@ -129,7 +141,7 @@ if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then FIRST_TORCH="${TV_ARRAY[0]}" echo "=== Building wheel with torch ${FIRST_TORCH} ===" - install_torch "$FIRST_TORCH" + pin_torch "$FIRST_TORCH" build_wheel "$TMP_DIR" repair_wheel "$TMP_DIR" "$TMP_DIR/dist" BASE_WHL=$(ls "$TMP_DIR"/dist/*.whl) @@ -137,7 +149,7 @@ if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then for ((i=1; i<${#TV_ARRAY[@]}; i++)); do TV="${TV_ARRAY[$i]}" echo "=== Building nixl_ep .so for torch ${TV} ===" - install_torch "$TV" + pin_torch "$TV" EP_TMP=$(mktemp -d) build_wheel "$EP_TMP" diff --git a/contrib/tomlutil.py b/contrib/tomlutil.py index ed807aa84b..08f8e29550 100755 --- a/contrib/tomlutil.py +++ b/contrib/tomlutil.py @@ -21,6 +21,11 @@ parser = argparse.ArgumentParser() parser.add_argument("--wheel-name", type=str, help="Set the project name") +parser.add_argument( + "--torch-version", + type=str, + help="Pin the torch build dep to this major.minor (e.g. '2.13')", +) parser.add_argument("file", type=str, help="The toml file to modify") args = parser.parse_args() @@ -28,13 +33,14 @@ doc = tomlkit.parse(f.read()) if args.wheel_name: - # Set the wheel name - # Example: - # ```toml - # [project] - # name = "" - # ``` doc["project"]["name"] = args.wheel_name +if args.torch_version: + # Replace any existing "torch" or "torch==X.*" entry in build requires + requires = doc["build-system"]["requires"] + new_requires = [r for r in requires if not r.split("==")[0].strip() == "torch"] + new_requires.append(f"torch=={args.torch_version}.*") + doc["build-system"]["requires"] = new_requires + with open(args.file, "w") as f: f.write(tomlkit.dumps(doc)) From da2f6dfbb6dcf88556f9e00f9bc7e22fbd783be4 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 03:24:36 +0200 Subject: [PATCH 03/26] Use extra index correctly Signed-off-by: Ovidiu Mara --- contrib/build-wheel.sh | 46 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/contrib/build-wheel.sh b/contrib/build-wheel.sh index 68a730bc18..e98a7bba50 100755 --- a/contrib/build-wheel.sh +++ b/contrib/build-wheel.sh @@ -106,14 +106,34 @@ pin_torch() { ./contrib/tomlutil.py --torch-version "$VER" pyproject.toml } -# uv build's isolated build env needs access to the nightly index too. +# PyPI stays primary (for meson-python etc); PyTorch indexes are extras for torch. UV_BUILD_INDEX_FLAGS=( - --index-url "https://download.pytorch.org/whl/${CU_TAG}" + --extra-index-url "https://download.pytorch.org/whl/${CU_TAG}" --extra-index-url "https://download.pytorch.org/whl/nightly/${CU_TAG}" --index-strategy unsafe-best-match --prerelease allow ) +# Check whether torch==${VER}.* is resolvable from the configured indexes +# for the target Python version. Echoes "yes" on success, nothing otherwise. +torch_available() { + local VER=$1 + local CHECK_VENV + CHECK_VENV=$(mktemp -d)/venv + uv venv "$CHECK_VENV" --python "$PYTHON_VERSION" >/dev/null 2>&1 || return + # shellcheck disable=SC1090 + source "$CHECK_VENV/bin/activate" + if uv pip install --dry-run --pre \ + --extra-index-url "https://download.pytorch.org/whl/${CU_TAG}" \ + --extra-index-url "https://download.pytorch.org/whl/nightly/${CU_TAG}" \ + --index-strategy unsafe-best-match \ + "torch==${VER}.*" >/dev/null 2>&1; then + echo "yes" + fi + deactivate + rm -rf "$(dirname "$CHECK_VENV")" +} + build_wheel() { local OUT_DIR=$1 if [ "$BUILD_NIXL_EP" = "true" ]; then @@ -137,7 +157,27 @@ repair_wheel() { if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then # Multi-torch: build full wheel with first torch, then merge extra .so from others. - IFS=',' read -ra TV_ARRAY <<< "$TORCH_VERSIONS" + IFS=',' read -ra TV_REQUESTED <<< "$TORCH_VERSIONS" + + # Filter to torch versions actually resolvable for this (Python, CUDA) combo. + TV_ARRAY=() + SKIPPED=() + for TV in "${TV_REQUESTED[@]}"; do + if [ -n "$(torch_available "$TV")" ]; then + TV_ARRAY+=("$TV") + else + SKIPPED+=("$TV") + fi + done + + if [ ${#SKIPPED[@]} -gt 0 ]; then + echo "=== Skipping torch versions (no wheel on index for Python ${PYTHON_VERSION} + ${CU_TAG}): ${SKIPPED[*]} ===" + fi + if [ ${#TV_ARRAY[@]} -eq 0 ]; then + echo "ERROR: none of the requested torch versions (${TV_REQUESTED[*]}) are available for Python ${PYTHON_VERSION} + ${CU_TAG}" + exit 1 + fi + echo "=== Building for torch versions: ${TV_ARRAY[*]} ===" FIRST_TORCH="${TV_ARRAY[0]}" echo "=== Building wheel with torch ${FIRST_TORCH} ===" From 771346a719ddbabe49967fc619a66ab12a714dd5 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 03:35:05 +0200 Subject: [PATCH 04/26] Add missing repair wheel step Signed-off-by: Ovidiu Mara --- contrib/build-wheel.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/contrib/build-wheel.sh b/contrib/build-wheel.sh index e98a7bba50..3ad2e7f6ad 100755 --- a/contrib/build-wheel.sh +++ b/contrib/build-wheel.sh @@ -193,10 +193,12 @@ if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then EP_TMP=$(mktemp -d) build_wheel "$EP_TMP" + # Repair so the .so passes auditwheel for the manylinux tag before we extract it + repair_wheel "$EP_TMP" "$EP_TMP/dist" - # Extract torch-versioned .so from new wheel, inject into base wheel + # Extract torch-versioned .so from repaired wheel, inject into base wheel EP_EXTRACT=$(mktemp -d) - unzip -o "$EP_TMP"/nixl*.whl -d "$EP_EXTRACT" + unzip -o "$EP_TMP"/dist/*.whl -d "$EP_EXTRACT" BASE_EXTRACT=$(mktemp -d) unzip -o "$BASE_WHL" -d "$BASE_EXTRACT" From a79b000dc3fedf9e53456297dfad3791cc2d61a7 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 12:10:44 +0200 Subject: [PATCH 05/26] Refactor to create venv in a controlled way Signed-off-by: Ovidiu Mara --- contrib/build-wheel.sh | 193 ++++++++++++++++++++++++++++++++++------- 1 file changed, 163 insertions(+), 30 deletions(-) diff --git a/contrib/build-wheel.sh b/contrib/build-wheel.sh index 3ad2e7f6ad..88a9fa9a26 100755 --- a/contrib/build-wheel.sh +++ b/contrib/build-wheel.sh @@ -100,51 +100,186 @@ PKG_NAME="nixl-cu${CUDA_MAJOR}" CU_TAG="cu$(nvcc --version | grep -Eo 'release [0-9]+\.[0-9]+' | cut -d' ' -f2 | tr -d .)" ./contrib/tomlutil.py --wheel-name $PKG_NAME pyproject.toml -# Pin the torch build dep in pyproject.toml so uv build's isolated env resolves it. -pin_torch() { +# Index URLs for the resolved CUDA tag. Stable cu index hosts released +# wheels; nightly index hosts dev/pre-release wheels. +TORCH_STABLE_INDEX="https://download.pytorch.org/whl/${CU_TAG}" +TORCH_NIGHTLY_INDEX="https://download.pytorch.org/whl/nightly/${CU_TAG}" + +# Build deps every wheel build needs in its venv. Mirrors pyproject.toml's +# `build-system.requires` minus torch (which is added per-iteration with +# channel-appropriate constraints). pytest/build are not strictly needed by +# meson-python at build time but are kept in sync with pyproject.toml so +# nothing breaks if a backend hook decides to import them. +BUILD_DEPS=( + "meson" + "meson-python" + "pybind11" + "patchelf" + "pyyaml" + "types-PyYAML" + "setuptools>=80.9.0" +) + +# Channel cache for repeated lookups within a single script run. +declare -A TORCH_CHANNEL_CACHE + +# Slugify a dotted version (e.g. "2.13" -> "213", "3.10" -> "310") so it can +# be used unambiguously as a path component. +slug() { echo "${1//./}"; } + +# Path for a per-iteration build venv. One venv per (python, torch) tuple +# so torch's transitive footprint (nvidia-*, triton, sympy, …) never bleeds +# across torch versions. Lives in /workspace, not /tmp, so it inherits the +# image's UV_CACHE_DIR layout and is visible to debugging. +venv_path() { + local VER=${1:-} + if [ -n "$VER" ]; then + echo "/workspace/venv-torch$(slug "$VER")-py$(slug "$PYTHON_VERSION")" + else + echo "/workspace/venv-py$(slug "$PYTHON_VERSION")" + fi +} + +# Determine whether torch==${VER}.* has a stable release on the stable cu +# index for the target Python version. Echoes "stable" or "nightly". +# A torch version is treated as stable iff a non-pre-release wheel resolves +# from the stable cu index alone (no nightly index, no --pre). +torch_channel() { local VER=$1 - ./contrib/tomlutil.py --torch-version "$VER" pyproject.toml + if [ -n "${TORCH_CHANNEL_CACHE[$VER]:-}" ]; then + echo "${TORCH_CHANNEL_CACHE[$VER]}" + return + fi + local CHANNEL="nightly" + local CHECK_VENV="/workspace/venv-probe-py$(slug "$PYTHON_VERSION")" + rm -rf "$CHECK_VENV" + if uv venv "$CHECK_VENV" --python "$PYTHON_VERSION" >/dev/null 2>&1; then + # Unset UV_*INDEX* env vars (the wheel-build loop in + # Dockerfile.manylinux exports them with the nightly index, which + # would otherwise let `torch==X.Y.*` resolve from nightly and make + # us mis-classify a nightly-only release as stable). + if env -u UV_INDEX -u UV_EXTRA_INDEX_URL -u UV_INDEX_STRATEGY -u UV_DEFAULT_INDEX \ + uv pip install --dry-run \ + --python "$CHECK_VENV/bin/python" \ + --index-url "$TORCH_STABLE_INDEX" \ + "torch==${VER}.*" >/dev/null 2>&1; then + CHANNEL="stable" + fi + fi + rm -rf "$CHECK_VENV" + TORCH_CHANNEL_CACHE[$VER]="$CHANNEL" + echo "$CHANNEL" } -# PyPI stays primary (for meson-python etc); PyTorch indexes are extras for torch. -UV_BUILD_INDEX_FLAGS=( - --extra-index-url "https://download.pytorch.org/whl/${CU_TAG}" - --extra-index-url "https://download.pytorch.org/whl/nightly/${CU_TAG}" - --index-strategy unsafe-best-match - --prerelease allow -) +# Echo the torch requirement spec for the given (version, channel). Stable +# stays as `torch==X.Y.*`; nightly uses an explicit `.dev0` lower bound so +# pre-release/dev wheels are admissible without --prerelease=allow. +torch_spec() { + local VER=$1 + local CHANNEL=$2 + if [ "$CHANNEL" = "nightly" ]; then + local MAJOR="${VER%%.*}" + local MINOR="${VER##*.}" + echo "torch>=${MAJOR}.${MINOR}.0.dev0,<${MAJOR}.$((MINOR + 1))" + else + echo "torch==${VER}.*" + fi +} + +# Echo the uv index/resolution flags appropriate for the given channel, +# one flag per line. Stable builds use only the stable cu index and no +# pre-release allowance; nightly builds add the nightly index and +# --prerelease=allow. +torch_uv_flags() { + local CHANNEL=$1 + if [ "$CHANNEL" = "nightly" ]; then + printf '%s\n' \ + --extra-index-url "$TORCH_STABLE_INDEX" \ + --extra-index-url "$TORCH_NIGHTLY_INDEX" \ + --index-strategy unsafe-best-match \ + --prerelease allow + else + printf '%s\n' \ + --extra-index-url "$TORCH_STABLE_INDEX" \ + --index-strategy unsafe-best-match + fi +} -# Check whether torch==${VER}.* is resolvable from the configured indexes -# for the target Python version. Echoes "yes" on success, nothing otherwise. +# Check whether torch==${VER}.* is resolvable from any configured index +# (stable or nightly) for the target Python version. Echoes "yes" on +# success, nothing otherwise. torch_available() { local VER=$1 - local CHECK_VENV - CHECK_VENV=$(mktemp -d)/venv + local CHECK_VENV="/workspace/venv-probe-py$(slug "$PYTHON_VERSION")" + rm -rf "$CHECK_VENV" uv venv "$CHECK_VENV" --python "$PYTHON_VERSION" >/dev/null 2>&1 || return - # shellcheck disable=SC1090 - source "$CHECK_VENV/bin/activate" if uv pip install --dry-run --pre \ - --extra-index-url "https://download.pytorch.org/whl/${CU_TAG}" \ - --extra-index-url "https://download.pytorch.org/whl/nightly/${CU_TAG}" \ + --python "$CHECK_VENV/bin/python" \ + --extra-index-url "$TORCH_STABLE_INDEX" \ + --extra-index-url "$TORCH_NIGHTLY_INDEX" \ --index-strategy unsafe-best-match \ "torch==${VER}.*" >/dev/null 2>&1; then echo "yes" fi - deactivate - rm -rf "$(dirname "$CHECK_VENV")" + rm -rf "$CHECK_VENV" } +# Build the wheel for the current PYTHON_VERSION (and optional torch VER). +# Creates a fresh venv at venv_path, installs build deps + torch with the +# channel-appropriate flags, runs `uv build --no-build-isolation`, and +# tears the venv down so the next iteration starts from a clean slate. +# Doing it this way instead of `pip install --reinstall torch` avoids +# orphan packages from the previous torch's transitive footprint +# (nvidia-* wheels, triton, sympy, …) bleeding across iterations. build_wheel() { local OUT_DIR=$1 + local VER=${2:-} + + local VENV_PATH + VENV_PATH=$(venv_path "$VER") + local CHANNEL="stable" + [ -n "$VER" ] && CHANNEL=$(torch_channel "$VER") + + local UV_FLAGS=() + while IFS= read -r f; do UV_FLAGS+=("$f"); done < <(torch_uv_flags "$CHANNEL") + + local TORCH_PKG=() + [ -n "$VER" ] && TORCH_PKG+=("$(torch_spec "$VER" "$CHANNEL")") + + echo "=== Provisioning ${VENV_PATH} (python ${PYTHON_VERSION}${VER:+, torch ${VER} [${CHANNEL}]}) ===" + rm -rf "$VENV_PATH" + uv venv "$VENV_PATH" --python "$PYTHON_VERSION" + uv pip install \ + --python "$VENV_PATH/bin/python" \ + "${UV_FLAGS[@]}" \ + "${BUILD_DEPS[@]}" \ + "${TORCH_PKG[@]}" + + # Activate so meson's `find_installation('python3')` resolves to this + # venv's interpreter (which has the right torch). Deactivate before + # returning so the caller's auditwheel keeps using the orchestration + # venv on PATH. + # shellcheck disable=SC1091 + source "$VENV_PATH/bin/activate" + + local BUILD_ARGS=( + --wheel + --no-build-isolation + --out-dir "$OUT_DIR" + --python "$VENV_PATH/bin/python" + ) if [ "$BUILD_NIXL_EP" = "true" ]; then - uv build --wheel --out-dir "$OUT_DIR" --python $PYTHON_VERSION \ - "${UV_BUILD_INDEX_FLAGS[@]}" \ - -Csetup-args=-Dbuild_nixl_ep=true \ + BUILD_ARGS+=( + -Csetup-args=-Dbuild_nixl_ep=true -Csetup-args=-Dbuild_examples=true - else - uv build --wheel --out-dir "$OUT_DIR" --python $PYTHON_VERSION \ - "${UV_BUILD_INDEX_FLAGS[@]}" + ) fi + uv build "${BUILD_ARGS[@]}" + + deactivate + # Free disk: torch + nvidia-* wheels in a venv add up to several GB; + # 3 torches × 5 pythons would otherwise blow the docker layer budget. + rm -rf "$VENV_PATH" } repair_wheel() { @@ -181,18 +316,16 @@ if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then FIRST_TORCH="${TV_ARRAY[0]}" echo "=== Building wheel with torch ${FIRST_TORCH} ===" - pin_torch "$FIRST_TORCH" - build_wheel "$TMP_DIR" + build_wheel "$TMP_DIR" "$FIRST_TORCH" repair_wheel "$TMP_DIR" "$TMP_DIR/dist" BASE_WHL=$(ls "$TMP_DIR"/dist/*.whl) for ((i=1; i<${#TV_ARRAY[@]}; i++)); do TV="${TV_ARRAY[$i]}" echo "=== Building nixl_ep .so for torch ${TV} ===" - pin_torch "$TV" EP_TMP=$(mktemp -d) - build_wheel "$EP_TMP" + build_wheel "$EP_TMP" "$TV" # Repair so the .so passes auditwheel for the manylinux tag before we extract it repair_wheel "$EP_TMP" "$EP_TMP/dist" From 37509dd3b4a0f1db2fc3ca5736cebc55c7704386 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 12:34:33 +0200 Subject: [PATCH 06/26] Clean up diff Signed-off-by: Ovidiu Mara --- contrib/Dockerfile.manylinux | 13 ++++------ contrib/build-wheel.sh | 44 ++++++++++++++-------------------- contrib/tomlutil.py | 18 +++++--------- examples/device/ep/meson.build | 9 +++++-- 4 files changed, 35 insertions(+), 49 deletions(-) diff --git a/contrib/Dockerfile.manylinux b/contrib/Dockerfile.manylinux index 3dad548679..2d2a5f04c8 100644 --- a/contrib/Dockerfile.manylinux +++ b/contrib/Dockerfile.manylinux @@ -294,11 +294,9 @@ RUN rm -rf $VIRTUAL_ENV && uv venv $VIRTUAL_ENV --python $DEFAULT_PYTHON_VERSION ENV PATH="$VIRTUAL_ENV/bin:$PATH" # Install python dependencies RUN uv pip install --upgrade meson meson-python pybind11 patchelf pyYAML click setuptools tabulate auditwheel tomlkit -# Install PyTorch (default version for meson setup; wheel builds may reinstall other versions) -ARG TORCH_VERSIONS="2.11,2.12,2.13" +# Install PyTorch RUN export UV_INDEX="https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d .)" && \ - FIRST_TORCH=$(echo "$TORCH_VERSIONS" | cut -d, -f1) && \ - uv pip install "torch==${FIRST_TORCH}.*" + uv pip install 'torch==2.11.*' # Upgrade setuptools to latest version for compatibility with PEP 639 (license format) RUN uv pip install --upgrade 'setuptools>=80.9.0' @@ -355,15 +353,12 @@ RUN echo "/usr/local/nixl/lib/$ARCH-linux-gnu" > /etc/ld.so.conf.d/nixl.conf && # Create the wheel # No need to specifically add path to libcuda.so here, meson finds the stubs and links them +ARG WHL_TORCH_VERSIONS="2.11,2.12,2.13" ARG WHL_PYTHON_VERSIONS="3.10,3.11,3.12,3.13,3.14" ARG WHL_PLATFORM="manylinux_2_28_$ARCH" RUN IFS=',' read -ra PYTHON_VERSIONS <<< "$WHL_PYTHON_VERSIONS" && \ - CU_TAG="cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d .)" && \ - export UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/nightly/${CU_TAG}" && \ - export UV_INDEX="https://download.pytorch.org/whl/${CU_TAG}" && \ - export UV_INDEX_STRATEGY=unsafe-best-match && \ if [ "$BUILD_NIXL_EP" = "true" ]; then \ - EP_BUILD_FLAG="--build-nixl-ep --torch-versions $TORCH_VERSIONS"; \ + EP_BUILD_FLAG="--build-nixl-ep --torch-versions $WHL_TORCH_VERSIONS"; \ else \ EP_BUILD_FLAG=""; \ fi && \ diff --git a/contrib/build-wheel.sh b/contrib/build-wheel.sh index 88a9fa9a26..dfdfbe6998 100755 --- a/contrib/build-wheel.sh +++ b/contrib/build-wheel.sh @@ -105,11 +105,8 @@ CU_TAG="cu$(nvcc --version | grep -Eo 'release [0-9]+\.[0-9]+' | cut -d' ' -f2 | TORCH_STABLE_INDEX="https://download.pytorch.org/whl/${CU_TAG}" TORCH_NIGHTLY_INDEX="https://download.pytorch.org/whl/nightly/${CU_TAG}" -# Build deps every wheel build needs in its venv. Mirrors pyproject.toml's -# `build-system.requires` minus torch (which is added per-iteration with -# channel-appropriate constraints). pytest/build are not strictly needed by -# meson-python at build time but are kept in sync with pyproject.toml so -# nothing breaks if a backend hook decides to import them. +# Build deps installed into the per-iteration venv (torch is added +# separately with channel-appropriate constraints). BUILD_DEPS=( "meson" "meson-python" @@ -154,12 +151,7 @@ torch_channel() { local CHECK_VENV="/workspace/venv-probe-py$(slug "$PYTHON_VERSION")" rm -rf "$CHECK_VENV" if uv venv "$CHECK_VENV" --python "$PYTHON_VERSION" >/dev/null 2>&1; then - # Unset UV_*INDEX* env vars (the wheel-build loop in - # Dockerfile.manylinux exports them with the nightly index, which - # would otherwise let `torch==X.Y.*` resolve from nightly and make - # us mis-classify a nightly-only release as stable). - if env -u UV_INDEX -u UV_EXTRA_INDEX_URL -u UV_INDEX_STRATEGY -u UV_DEFAULT_INDEX \ - uv pip install --dry-run \ + if uv pip install --dry-run \ --python "$CHECK_VENV/bin/python" \ --index-url "$TORCH_STABLE_INDEX" \ "torch==${VER}.*" >/dev/null 2>&1; then @@ -292,40 +284,40 @@ repair_wheel() { if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then # Multi-torch: build full wheel with first torch, then merge extra .so from others. - IFS=',' read -ra TV_REQUESTED <<< "$TORCH_VERSIONS" + IFS=',' read -ra TORCH_REQUESTED <<< "$TORCH_VERSIONS" # Filter to torch versions actually resolvable for this (Python, CUDA) combo. - TV_ARRAY=() + TORCH_ARRAY=() SKIPPED=() - for TV in "${TV_REQUESTED[@]}"; do - if [ -n "$(torch_available "$TV")" ]; then - TV_ARRAY+=("$TV") + for TORCH in "${TORCH_REQUESTED[@]}"; do + if [ -n "$(torch_available "$TORCH")" ]; then + TORCH_ARRAY+=("$TORCH") else - SKIPPED+=("$TV") + SKIPPED+=("$TORCH") fi done if [ ${#SKIPPED[@]} -gt 0 ]; then echo "=== Skipping torch versions (no wheel on index for Python ${PYTHON_VERSION} + ${CU_TAG}): ${SKIPPED[*]} ===" fi - if [ ${#TV_ARRAY[@]} -eq 0 ]; then - echo "ERROR: none of the requested torch versions (${TV_REQUESTED[*]}) are available for Python ${PYTHON_VERSION} + ${CU_TAG}" + if [ ${#TORCH_ARRAY[@]} -eq 0 ]; then + echo "ERROR: none of the requested torch versions (${TORCH_REQUESTED[*]}) are available for Python ${PYTHON_VERSION} + ${CU_TAG}" exit 1 fi - echo "=== Building for torch versions: ${TV_ARRAY[*]} ===" + echo "=== Building for torch versions: ${TORCH_ARRAY[*]} ===" - FIRST_TORCH="${TV_ARRAY[0]}" + FIRST_TORCH="${TORCH_ARRAY[0]}" echo "=== Building wheel with torch ${FIRST_TORCH} ===" build_wheel "$TMP_DIR" "$FIRST_TORCH" repair_wheel "$TMP_DIR" "$TMP_DIR/dist" BASE_WHL=$(ls "$TMP_DIR"/dist/*.whl) - for ((i=1; i<${#TV_ARRAY[@]}; i++)); do - TV="${TV_ARRAY[$i]}" - echo "=== Building nixl_ep .so for torch ${TV} ===" + for ((i=1; i<${#TORCH_ARRAY[@]}; i++)); do + TORCH="${TORCH_ARRAY[$i]}" + echo "=== Building nixl_ep .so for torch ${TORCH} ===" EP_TMP=$(mktemp -d) - build_wheel "$EP_TMP" "$TV" + build_wheel "$EP_TMP" "$TORCH" # Repair so the .so passes auditwheel for the manylinux tag before we extract it repair_wheel "$EP_TMP" "$EP_TMP/dist" @@ -335,7 +327,7 @@ if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then BASE_EXTRACT=$(mktemp -d) unzip -o "$BASE_WHL" -d "$BASE_EXTRACT" - TORCH_MM=$(echo "$TV" | tr -d '.') + TORCH_MM=$(echo "$TORCH" | tr -d '.') find "$EP_EXTRACT" -name "nixl_ep_cpp_torch${TORCH_MM}*" -exec cp {} "$BASE_EXTRACT"/nixl_ep_cu${CUDA_MAJOR}/ \; # Regenerate RECORD diff --git a/contrib/tomlutil.py b/contrib/tomlutil.py index 08f8e29550..ed807aa84b 100755 --- a/contrib/tomlutil.py +++ b/contrib/tomlutil.py @@ -21,11 +21,6 @@ parser = argparse.ArgumentParser() parser.add_argument("--wheel-name", type=str, help="Set the project name") -parser.add_argument( - "--torch-version", - type=str, - help="Pin the torch build dep to this major.minor (e.g. '2.13')", -) parser.add_argument("file", type=str, help="The toml file to modify") args = parser.parse_args() @@ -33,14 +28,13 @@ doc = tomlkit.parse(f.read()) if args.wheel_name: + # Set the wheel name + # Example: + # ```toml + # [project] + # name = "" + # ``` doc["project"]["name"] = args.wheel_name -if args.torch_version: - # Replace any existing "torch" or "torch==X.*" entry in build requires - requires = doc["build-system"]["requires"] - new_requires = [r for r in requires if not r.split("==")[0].strip() == "torch"] - new_requires.append(f"torch=={args.torch_version}.*") - doc["build-system"]["requires"] = new_requires - with open(args.file, "w") as f: f.write(tomlkit.dumps(doc)) diff --git a/examples/device/ep/meson.build b/examples/device/ep/meson.build index 8de8d56939..8938159719 100644 --- a/examples/device/ep/meson.build +++ b/examples/device/ep/meson.build @@ -118,10 +118,15 @@ nixl_ep_rpath += ':' + nixl_lib_dir nixl_ep_rpath += ':' + join_paths(nixl_lib_dir, 'core') nixl_ep_rpath += ':' + join_paths(nixl_lib_dir, 'plugins') +# Torch >=2.13 ships public C10 headers (e.g. c10/core/AutogradState.h) +# that use C++20 features (default member initializers for bit-fields), so +# the nixl_ep extension must be compiled as C++20. C++20 is backwards- +# compatible with the project's default cpp_std=c++17, so older torch +# versions keep building. +nixl_ep_override_options = ['cpp_std=c++20', 'cuda_std=c++20'] # For now, nixl ep cannot be built with -G due to register usage limits -nixl_ep_override_options = [] if get_option('buildtype') == 'debug' - nixl_ep_override_options = ['optimization=3'] + nixl_ep_override_options += ['optimization=3'] nixl_ep_cpp_args += ['-g'] endif From 4d380cd1ebc729143aecfb54b033c1480ecd1690 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 12:35:55 +0200 Subject: [PATCH 07/26] Rename flags variable Signed-off-by: Ovidiu Mara --- contrib/Dockerfile.manylinux | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/Dockerfile.manylinux b/contrib/Dockerfile.manylinux index 2d2a5f04c8..a7f4e58cba 100644 --- a/contrib/Dockerfile.manylinux +++ b/contrib/Dockerfile.manylinux @@ -353,14 +353,14 @@ RUN echo "/usr/local/nixl/lib/$ARCH-linux-gnu" > /etc/ld.so.conf.d/nixl.conf && # Create the wheel # No need to specifically add path to libcuda.so here, meson finds the stubs and links them -ARG WHL_TORCH_VERSIONS="2.11,2.12,2.13" ARG WHL_PYTHON_VERSIONS="3.10,3.11,3.12,3.13,3.14" +ARG WHL_TORCH_VERSIONS="2.11,2.12,2.13" ARG WHL_PLATFORM="manylinux_2_28_$ARCH" RUN IFS=',' read -ra PYTHON_VERSIONS <<< "$WHL_PYTHON_VERSIONS" && \ if [ "$BUILD_NIXL_EP" = "true" ]; then \ - EP_BUILD_FLAG="--build-nixl-ep --torch-versions $WHL_TORCH_VERSIONS"; \ + FLAGS="--build-nixl-ep --torch-versions $WHL_TORCH_VERSIONS"; \ else \ - EP_BUILD_FLAG=""; \ + FLAGS=""; \ fi && \ rm -rf dist && mkdir -p dist && \ for PYTHON_VERSION in "${PYTHON_VERSIONS[@]}"; do \ @@ -371,7 +371,7 @@ RUN IFS=',' read -ra PYTHON_VERSIONS <<< "$WHL_PYTHON_VERSIONS" && \ --ucx-plugins-dir /usr/lib64/ucx \ --nixl-plugins-dir $NIXL_PLUGIN_DIR \ --output-dir dist \ - $EP_BUILD_FLAG ; \ + $FLAGS ; \ done # Copy the meta package wheel to the dist directory, which will be used to push to PyPI. From b299774a906dbc5c73e9b7064726a3d370c90f0f Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 12:37:29 +0200 Subject: [PATCH 08/26] Remove torch 2.13 build, it fails Signed-off-by: Ovidiu Mara --- contrib/Dockerfile.manylinux | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/Dockerfile.manylinux b/contrib/Dockerfile.manylinux index a7f4e58cba..9a19d464e1 100644 --- a/contrib/Dockerfile.manylinux +++ b/contrib/Dockerfile.manylinux @@ -354,7 +354,7 @@ RUN echo "/usr/local/nixl/lib/$ARCH-linux-gnu" > /etc/ld.so.conf.d/nixl.conf && # Create the wheel # No need to specifically add path to libcuda.so here, meson finds the stubs and links them ARG WHL_PYTHON_VERSIONS="3.10,3.11,3.12,3.13,3.14" -ARG WHL_TORCH_VERSIONS="2.11,2.12,2.13" +ARG WHL_TORCH_VERSIONS="2.11,2.12" ARG WHL_PLATFORM="manylinux_2_28_$ARCH" RUN IFS=',' read -ra PYTHON_VERSIONS <<< "$WHL_PYTHON_VERSIONS" && \ if [ "$BUILD_NIXL_EP" = "true" ]; then \ From 3fec58fcd9a539f593861ee148084aab66f0aaed Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 12:44:46 +0200 Subject: [PATCH 09/26] Fix mypy CI check Signed-off-by: Ovidiu Mara --- examples/device/ep/nixl_ep/__init__.py | 6 ++++-- pyproject.toml | 5 +++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/device/ep/nixl_ep/__init__.py b/examples/device/ep/nixl_ep/__init__.py index 0fa3ad030e..c8ca63b11a 100644 --- a/examples/device/ep/nixl_ep/__init__.py +++ b/examples/device/ep/nixl_ep/__init__.py @@ -25,8 +25,10 @@ _torch_mm = "".join(torch.__version__.split(".")[:2]) _nixl_ep_cpp = importlib.import_module(f".nixl_ep_cpp_torch{_torch_mm}", __package__) -from .buffer import Buffer -from .utils import EventOverlap +# The submodules below import names from `_nixl_ep_cpp`, so the dynamic +# import above must run first; that's why these aren't at the top. +from .buffer import Buffer # noqa: E402 +from .utils import EventOverlap # noqa: E402 topk_idx_t = getattr(_nixl_ep_cpp, "topk_idx_t", torch.int64) Config = _nixl_ep_cpp.Config diff --git a/pyproject.toml b/pyproject.toml index cd08c35c80..a77d768313 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,11 @@ dependencies = ["torch", "numpy"] [tool.mypy] mypy_path = ["src/bindings/python/nixl-meta"] ignore_missing_imports = true +# Two `nixl_ep/__init__.py` files coexist legitimately: the meta-dispatcher +# under `src/bindings/python/nixl-meta/nixl_ep/` and the actual EP source +# under `examples/device/ep/nixl_ep/`. Without explicit_package_bases mypy +# refuses to resolve them and errors with "Duplicate module named nixl_ep". +explicit_package_bases = true [tool.isort] profile = "black" From 3896d931ab1f0ca426e8932fb64df8af4811fff0 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 12:49:09 +0200 Subject: [PATCH 10/26] Revert C++ 20 flags, out of scope and unsafe to build only parts of the code Signed-off-by: Ovidiu Mara --- examples/device/ep/meson.build | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/examples/device/ep/meson.build b/examples/device/ep/meson.build index 8938159719..8de8d56939 100644 --- a/examples/device/ep/meson.build +++ b/examples/device/ep/meson.build @@ -118,15 +118,10 @@ nixl_ep_rpath += ':' + nixl_lib_dir nixl_ep_rpath += ':' + join_paths(nixl_lib_dir, 'core') nixl_ep_rpath += ':' + join_paths(nixl_lib_dir, 'plugins') -# Torch >=2.13 ships public C10 headers (e.g. c10/core/AutogradState.h) -# that use C++20 features (default member initializers for bit-fields), so -# the nixl_ep extension must be compiled as C++20. C++20 is backwards- -# compatible with the project's default cpp_std=c++17, so older torch -# versions keep building. -nixl_ep_override_options = ['cpp_std=c++20', 'cuda_std=c++20'] # For now, nixl ep cannot be built with -G due to register usage limits +nixl_ep_override_options = [] if get_option('buildtype') == 'debug' - nixl_ep_override_options += ['optimization=3'] + nixl_ep_override_options = ['optimization=3'] nixl_ep_cpp_args += ['-g'] endif From 35117228d373bd1d401f5cc88d86d5cf026157b6 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 14:09:23 +0200 Subject: [PATCH 11/26] Fix import of binary bindings Signed-off-by: Ovidiu Mara --- examples/device/ep/meson.build | 20 ++++++++++++-------- examples/device/ep/nixl_ep/__init__.py | 6 +++++- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/examples/device/ep/meson.build b/examples/device/ep/meson.build index 8de8d56939..848886e40b 100644 --- a/examples/device/ep/meson.build +++ b/examples/device/ep/meson.build @@ -78,9 +78,19 @@ nixl_ep_inc_dirs = [ torch_inc_dirs, ] +# Torch-versioned .so name so multiple torch ABIs coexist in a single +# wheel. Defined up here because TORCH_EXTENSION_NAME (consumed by +# pybind11's PYBIND11_MODULE macro to generate the PyInit_ symbol) +# must match the .so filename Python loads, otherwise import fails with +# "dynamic module does not define module export function". +torch_ver = run_command(py, '-c', + 'import torch; print("".join(torch.__version__.split(".")[:2]))', + check: true).stdout().strip() +nixl_ep_ext_name = 'nixl_ep_cpp_torch' + torch_ver + nixl_ep_cpp_args = [ '-DHAVE_CUDA', - '-DTORCH_EXTENSION_NAME=nixl_ep_cpp', + '-DTORCH_EXTENSION_NAME=' + nixl_ep_ext_name, '-Wno-deprecated-declarations', '-Wno-unused-variable', '-Wno-sign-compare', @@ -90,7 +100,7 @@ nixl_ep_cpp_args = [ nixl_ep_cuda_args = [ '-DHAVE_CUDA', - '-DTORCH_EXTENSION_NAME=nixl_ep_cpp', + '-DTORCH_EXTENSION_NAME=' + nixl_ep_ext_name, '--expt-relaxed-constexpr', # Allow calling constexpr __host__ functions from __device__ functions '-arch=sm_90', # Only compile for sm90 (overrides global -gencode flags) '--ptxas-options=--register-usage-level=10', # Allow more register usage (matches setup.py) @@ -138,12 +148,6 @@ nixl_ep_install_rpath += ':' + torch_lib_dir # CUDA-versioned install dir so cu12 and cu13 wheels don't collide. nixl_ep_install_dir = 'nixl_ep_' + cuda_wheel_dir.split('_')[-1] # nixl_ep_cu12 or nixl_ep_cu13 -# Torch-versioned .so name so multiple torch ABIs coexist in a single wheel. -torch_ver = run_command(py, '-c', - 'import torch; print("".join(torch.__version__.split(".")[:2]))', - check: true).stdout().strip() -nixl_ep_ext_name = 'nixl_ep_cpp_torch' + torch_ver - nixl_ep_ext = py.extension_module(nixl_ep_ext_name, nixl_ep_sources, subdir: nixl_ep_install_dir, diff --git a/examples/device/ep/nixl_ep/__init__.py b/examples/device/ep/nixl_ep/__init__.py index c8ca63b11a..00e941598c 100644 --- a/examples/device/ep/nixl_ep/__init__.py +++ b/examples/device/ep/nixl_ep/__init__.py @@ -19,13 +19,17 @@ # limitations under the License. import importlib +import sys import torch _torch_mm = "".join(torch.__version__.split(".")[:2]) _nixl_ep_cpp = importlib.import_module(f".nixl_ep_cpp_torch{_torch_mm}", __package__) +# Alias the torch-versioned extension as `nixl_ep_cpp` so the static +# `from .nixl_ep_cpp import ...` imports in buffer.py / utils.py resolve. +sys.modules[f"{__package__}.nixl_ep_cpp"] = _nixl_ep_cpp -# The submodules below import names from `_nixl_ep_cpp`, so the dynamic +# The submodules below import names from `nixl_ep_cpp`, so the dynamic # import above must run first; that's why these aren't at the top. from .buffer import Buffer # noqa: E402 from .utils import EventOverlap # noqa: E402 From 96b3b7e866112235c6b11eb188aa1e379023219d Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 15:55:11 +0200 Subject: [PATCH 12/26] Refactor meson build scripts to track deps correctly Signed-off-by: Ovidiu Mara --- src/bindings/python/nixl-meta/meson.build | 17 +++++++++++--- .../python/nixl-meta/nixl/meson.build | 23 ++++--------------- .../python/nixl-meta/nixl_ep/meson.build | 19 +++++++++++++++ 3 files changed, 38 insertions(+), 21 deletions(-) create mode 100644 src/bindings/python/nixl-meta/nixl_ep/meson.build diff --git a/src/bindings/python/nixl-meta/meson.build b/src/bindings/python/nixl-meta/meson.build index 6cb893f7f7..579bbdf1a9 100644 --- a/src/bindings/python/nixl-meta/meson.build +++ b/src/bindings/python/nixl-meta/meson.build @@ -34,16 +34,27 @@ source_root = meson.project_source_root() root_license_path = join_paths(source_root, 'LICENSE') license_path = fs.copyfile(root_license_path) -nixl_sources = files('nixl/__init__.py') - subdir('nixl') +subdir('nixl_ep') uv = find_program('uv', required: false) if uv.found() wheel_name = 'nixl-@0@-py3-none-any.whl'.format(meson.project_version()) + # Inputs intentionally point at the build-dir COPIES (return values of + # fs.copyfile() captured in the subdirs), not the source files. This way + # ninja runs the copies before `uv build` and incremental rebuilds pick + # up edits to the source __init__.py files. meta_wheel = custom_target( 'build_nixl_meta', - input: [pyproject_toml, readme_md, license_path] + nixl_sources, + input: [ + pyproject_toml, + readme_md, + license_path, + nixl_init_copy, + nixl_api_copy, + nixl_logging_copy, + nixl_ep_init_copy, + ], output: [wheel_name], command: [uv, 'build', '--wheel', '--out-dir', build_dir, build_dir], install: false, diff --git a/src/bindings/python/nixl-meta/nixl/meson.build b/src/bindings/python/nixl-meta/nixl/meson.build index 9867ced11a..e87cdc2411 100644 --- a/src/bindings/python/nixl-meta/nixl/meson.build +++ b/src/bindings/python/nixl-meta/nixl/meson.build @@ -14,21 +14,8 @@ # limitations under the License. fs = import('fs') -fs.copyfile('__init__.py') -fs.copyfile('_api.py') -fs.copyfile('logging.py') - -# nixl_ep meta-dispatcher: copy into build dir so the meta wheel picks it up -nixl_ep_meta_dir = join_paths(meson.current_build_dir(), '..', 'nixl_ep') -nixl_ep_meta_src = join_paths(meson.current_source_dir(), '..', 'nixl_ep', '__init__.py') -custom_target('nixl_ep_meta_copy', - output: 'nixl_ep_meta.stamp', - command: ['bash', '-c', - 'mkdir -p @0@ && cp @1@ @0@/__init__.py && touch @2@'.format( - nixl_ep_meta_dir, - nixl_ep_meta_src, - join_paths(meson.current_build_dir(), 'nixl_ep_meta.stamp'), - ) - ], - build_by_default: true, -) +# Return values are captured so the meta-wheel custom_target in the parent +# meson.build can depend on them and ninja runs the copies before `uv build`. +nixl_init_copy = fs.copyfile('__init__.py') +nixl_api_copy = fs.copyfile('_api.py') +nixl_logging_copy = fs.copyfile('logging.py') diff --git a/src/bindings/python/nixl-meta/nixl_ep/meson.build b/src/bindings/python/nixl-meta/nixl_ep/meson.build new file mode 100644 index 0000000000..6654c61a44 --- /dev/null +++ b/src/bindings/python/nixl-meta/nixl_ep/meson.build @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +fs = import('fs') +# Return value is captured so the meta-wheel custom_target in the parent +# meson.build can depend on it and ninja runs the copy before `uv build`. +nixl_ep_init_copy = fs.copyfile('__init__.py') From 90e282c89879c697903f0f615b055b644ea7fffa Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 15:56:49 +0200 Subject: [PATCH 13/26] Rename variable for clarity Signed-off-by: Ovidiu Mara --- contrib/Dockerfile.manylinux | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/Dockerfile.manylinux b/contrib/Dockerfile.manylinux index 9a19d464e1..2469bba126 100644 --- a/contrib/Dockerfile.manylinux +++ b/contrib/Dockerfile.manylinux @@ -358,9 +358,9 @@ ARG WHL_TORCH_VERSIONS="2.11,2.12" ARG WHL_PLATFORM="manylinux_2_28_$ARCH" RUN IFS=',' read -ra PYTHON_VERSIONS <<< "$WHL_PYTHON_VERSIONS" && \ if [ "$BUILD_NIXL_EP" = "true" ]; then \ - FLAGS="--build-nixl-ep --torch-versions $WHL_TORCH_VERSIONS"; \ + EP_BUILD_FLAGS="--build-nixl-ep --torch-versions $WHL_TORCH_VERSIONS"; \ else \ - FLAGS=""; \ + EP_BUILD_FLAGS=""; \ fi && \ rm -rf dist && mkdir -p dist && \ for PYTHON_VERSION in "${PYTHON_VERSIONS[@]}"; do \ @@ -371,7 +371,7 @@ RUN IFS=',' read -ra PYTHON_VERSIONS <<< "$WHL_PYTHON_VERSIONS" && \ --ucx-plugins-dir /usr/lib64/ucx \ --nixl-plugins-dir $NIXL_PLUGIN_DIR \ --output-dir dist \ - $FLAGS ; \ + $EP_BUILD_FLAGS ; \ done # Copy the meta package wheel to the dist directory, which will be used to push to PyPI. From b30995a2c3468dda18c1c7e2c49a9ebbd0e24ea4 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 16:06:45 +0200 Subject: [PATCH 14/26] Refactor wheel merge from bash into Python Signed-off-by: Ovidiu Mara --- contrib/build-wheel.sh | 33 ++++----- contrib/wheel_merge.py | 159 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+), 20 deletions(-) create mode 100755 contrib/wheel_merge.py diff --git a/contrib/build-wheel.sh b/contrib/build-wheel.sh index dfdfbe6998..abc88d21fa 100755 --- a/contrib/build-wheel.sh +++ b/contrib/build-wheel.sh @@ -321,27 +321,20 @@ if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then # Repair so the .so passes auditwheel for the manylinux tag before we extract it repair_wheel "$EP_TMP" "$EP_TMP/dist" - # Extract torch-versioned .so from repaired wheel, inject into base wheel - EP_EXTRACT=$(mktemp -d) - unzip -o "$EP_TMP"/dist/*.whl -d "$EP_EXTRACT" - BASE_EXTRACT=$(mktemp -d) - unzip -o "$BASE_WHL" -d "$BASE_EXTRACT" - + # Merge the torch-versioned .so from the freshly-built wheel into + # the base wheel and regenerate RECORD. Both wheels were built + # against the same outer C++ build, so the .so's DT_NEEDED entries + # (libucp-.so etc.) match the libs already bundled in + # $BASE_WHL by auditwheel. TORCH_MM=$(echo "$TORCH" | tr -d '.') - find "$EP_EXTRACT" -name "nixl_ep_cpp_torch${TORCH_MM}*" -exec cp {} "$BASE_EXTRACT"/nixl_ep_cu${CUDA_MAJOR}/ \; - - # Regenerate RECORD - DIST_INFO=$(ls -d "$BASE_EXTRACT"/*.dist-info) - (cd "$BASE_EXTRACT" && find . -type f ! -name RECORD -printf '%P\n' | while read f; do - hash=$(python3 -c "import hashlib,base64; d=open('$f','rb').read(); print('sha256=' + base64.urlsafe_b64encode(hashlib.sha256(d).digest()).rstrip(b'=').decode())") - size=$(stat -c%s "$f") - echo "$f,$hash,$size" - done > "$DIST_INFO/RECORD" - echo "$(basename $DIST_INFO)/RECORD,," >> "$DIST_INFO/RECORD") - - rm -f "$BASE_WHL" - (cd "$BASE_EXTRACT" && zip -r "$BASE_WHL" .) - rm -rf "$EP_TMP" "$EP_EXTRACT" "$BASE_EXTRACT" + EP_WHL=$(ls "$EP_TMP"/dist/*.whl) + ./contrib/wheel_merge.py \ + --base-wheel "$BASE_WHL" \ + --source-wheel "$EP_WHL" \ + --pattern "nixl_ep_cpp_torch${TORCH_MM}.*" \ + --target-dir "nixl_ep_cu${CUDA_MAJOR}" + + rm -rf "$EP_TMP" done cp "$BASE_WHL" "$OUTPUT_DIR" diff --git a/contrib/wheel_merge.py b/contrib/wheel_merge.py new file mode 100755 index 0000000000..df97f7259b --- /dev/null +++ b/contrib/wheel_merge.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 + +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Merge file(s) from one wheel into another, regenerating RECORD.""" + +from __future__ import annotations + +import argparse +import base64 +import csv +import fnmatch +import hashlib +import io +import os +import sys +import zipfile + + +def _sha256_b64(data: bytes) -> str: + digest = hashlib.sha256(data).digest() + return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii") + + +def _record_bytes(entries: list[tuple[str, bytes]], record_path: str) -> bytes: + rows = [ + [name, f"sha256={_sha256_b64(data)}", str(len(data))] + for name, data in entries + if name != record_path + ] + rows.append([record_path, "", ""]) + buf = io.StringIO() + csv.writer(buf).writerows(rows) + return buf.getvalue().encode("utf-8") + + +def merge( + base_wheel: str, + source_wheel: str, + pattern: str, + target_dir: str, +) -> list[str]: + """Merge files matching `pattern` from `source_wheel` into `base_wheel`. + + `pattern` is a fnmatch glob applied to the basename of each entry in + the source wheel; matches are placed under `target_dir/` inside the + base wheel. `base_wheel` is rewritten atomically with a regenerated + RECORD. Returns the list of merged entry names (relative to the + wheel root). + """ + target_dir = target_dir.rstrip("/") + + # Pull matching files out of the source wheel, rewriting their path so + # they land under target_dir/. + merged: dict[str, tuple[zipfile.ZipInfo, bytes]] = {} + with zipfile.ZipFile(source_wheel, "r") as zsrc: + for info in zsrc.infolist(): + name = os.path.basename(info.filename) + if not fnmatch.fnmatch(name, pattern): + continue + new_name = f"{target_dir}/{name}" + new_info = zipfile.ZipInfo(filename=new_name) + new_info.compress_type = info.compress_type + new_info.external_attr = info.external_attr + new_info.date_time = info.date_time + merged[new_name] = (new_info, zsrc.read(info)) + + if not merged: + raise SystemExit( + f"no files matched {pattern!r} in {source_wheel}" + ) + + # Read base wheel; pull out RECORD path so we can regenerate it. + by_name: dict[str, tuple[zipfile.ZipInfo, bytes]] = {} + record_path: str | None = None + with zipfile.ZipFile(base_wheel, "r") as zin: + for info in zin.infolist(): + if info.filename.endswith(".dist-info/RECORD"): + record_path = info.filename + continue + by_name[info.filename] = (info, zin.read(info)) + if record_path is None: + raise SystemExit(f"no .dist-info/RECORD found in {base_wheel}") + + # Apply merge (source overrides base if a name collides). + by_name.update(merged) + + # Sort for stable output; nice for diffing two wheels. + ordered = sorted(by_name) + record = _record_bytes([(n, by_name[n][1]) for n in ordered], record_path) + + record_info = zipfile.ZipInfo(filename=record_path) + record_info.compress_type = zipfile.ZIP_DEFLATED + + tmp_path = f"{base_wheel}.tmp" + with zipfile.ZipFile( + tmp_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=9 + ) as zout: + for name in ordered: + info, data = by_name[name] + zout.writestr(info, data) + zout.writestr(record_info, record) + os.replace(tmp_path, base_wheel) + + return sorted(merged) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--base-wheel", + required=True, + help="wheel to merge into (rewritten in place)", + ) + parser.add_argument( + "--source-wheel", + required=True, + help="wheel to extract files from", + ) + parser.add_argument( + "--pattern", + required=True, + help="basename glob of files to merge (e.g. 'nixl_ep_cpp_torch212.*')", + ) + parser.add_argument( + "--target-dir", + required=True, + help="directory inside the base wheel for the merged files " + "(e.g. 'nixl_ep_cu13')", + ) + args = parser.parse_args() + + merged = merge( + base_wheel=args.base_wheel, + source_wheel=args.source_wheel, + pattern=args.pattern, + target_dir=args.target_dir, + ) + print(f"merged {len(merged)} file(s) into {args.base_wheel}:") + for name in merged: + print(f" {name}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 758c02561ca3b1d3253e42d0f916562e58859a92 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 16:12:47 +0200 Subject: [PATCH 15/26] Refactor bash script to simplify Signed-off-by: Ovidiu Mara --- contrib/build-wheel.sh | 75 +++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 41 deletions(-) diff --git a/contrib/build-wheel.sh b/contrib/build-wheel.sh index abc88d21fa..56638dc559 100755 --- a/contrib/build-wheel.sh +++ b/contrib/build-wheel.sh @@ -117,8 +117,9 @@ BUILD_DEPS=( "setuptools>=80.9.0" ) -# Channel cache for repeated lookups within a single script run. -declare -A TORCH_CHANNEL_CACHE +# Classification cache for repeated lookups within a single script run. +# Values: "stable" | "nightly" | "unavailable". +declare -A TORCH_CLASS_CACHE # Slugify a dotted version (e.g. "2.13" -> "213", "3.10" -> "310") so it can # be used unambiguously as a path component. @@ -137,30 +138,41 @@ venv_path() { fi } -# Determine whether torch==${VER}.* has a stable release on the stable cu -# index for the target Python version. Echoes "stable" or "nightly". -# A torch version is treated as stable iff a non-pre-release wheel resolves -# from the stable cu index alone (no nightly index, no --pre). -torch_channel() { +# Classify a requested torch version for the target Python: does +# `torch==${VER}.*` resolve from the stable cu index alone, only from the +# nightly index (with --pre), or from neither? Echoes one of: +# stable — resolvable from $TORCH_STABLE_INDEX without --pre +# nightly — resolvable from $TORCH_NIGHTLY_INDEX with --pre +# unavailable — not in either index for this Python +# Result is cached so each (PYTHON_VERSION, VER) probe runs at most once +# across the script's filter and build phases. +torch_classify() { local VER=$1 - if [ -n "${TORCH_CHANNEL_CACHE[$VER]:-}" ]; then - echo "${TORCH_CHANNEL_CACHE[$VER]}" + if [ -n "${TORCH_CLASS_CACHE[$VER]:-}" ]; then + echo "${TORCH_CLASS_CACHE[$VER]}" return fi - local CHANNEL="nightly" - local CHECK_VENV="/workspace/venv-probe-py$(slug "$PYTHON_VERSION")" - rm -rf "$CHECK_VENV" - if uv venv "$CHECK_VENV" --python "$PYTHON_VERSION" >/dev/null 2>&1; then + local CLASS="unavailable" + local PROBE="/workspace/venv-probe-py$(slug "$PYTHON_VERSION")" + rm -rf "$PROBE" + if uv venv "$PROBE" --python "$PYTHON_VERSION" >/dev/null 2>&1; then if uv pip install --dry-run \ - --python "$CHECK_VENV/bin/python" \ + --python "$PROBE/bin/python" \ --index-url "$TORCH_STABLE_INDEX" \ "torch==${VER}.*" >/dev/null 2>&1; then - CHANNEL="stable" + CLASS="stable" + elif uv pip install --dry-run --pre \ + --python "$PROBE/bin/python" \ + --extra-index-url "$TORCH_STABLE_INDEX" \ + --extra-index-url "$TORCH_NIGHTLY_INDEX" \ + --index-strategy unsafe-best-match \ + "torch==${VER}.*" >/dev/null 2>&1; then + CLASS="nightly" fi fi - rm -rf "$CHECK_VENV" - TORCH_CHANNEL_CACHE[$VER]="$CHANNEL" - echo "$CHANNEL" + rm -rf "$PROBE" + TORCH_CLASS_CACHE[$VER]="$CLASS" + echo "$CLASS" } # Echo the torch requirement spec for the given (version, channel). Stable @@ -197,25 +209,6 @@ torch_uv_flags() { fi } -# Check whether torch==${VER}.* is resolvable from any configured index -# (stable or nightly) for the target Python version. Echoes "yes" on -# success, nothing otherwise. -torch_available() { - local VER=$1 - local CHECK_VENV="/workspace/venv-probe-py$(slug "$PYTHON_VERSION")" - rm -rf "$CHECK_VENV" - uv venv "$CHECK_VENV" --python "$PYTHON_VERSION" >/dev/null 2>&1 || return - if uv pip install --dry-run --pre \ - --python "$CHECK_VENV/bin/python" \ - --extra-index-url "$TORCH_STABLE_INDEX" \ - --extra-index-url "$TORCH_NIGHTLY_INDEX" \ - --index-strategy unsafe-best-match \ - "torch==${VER}.*" >/dev/null 2>&1; then - echo "yes" - fi - rm -rf "$CHECK_VENV" -} - # Build the wheel for the current PYTHON_VERSION (and optional torch VER). # Creates a fresh venv at venv_path, installs build deps + torch with the # channel-appropriate flags, runs `uv build --no-build-isolation`, and @@ -230,7 +223,7 @@ build_wheel() { local VENV_PATH VENV_PATH=$(venv_path "$VER") local CHANNEL="stable" - [ -n "$VER" ] && CHANNEL=$(torch_channel "$VER") + [ -n "$VER" ] && CHANNEL=$(torch_classify "$VER") local UV_FLAGS=() while IFS= read -r f; do UV_FLAGS+=("$f"); done < <(torch_uv_flags "$CHANNEL") @@ -290,10 +283,10 @@ if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then TORCH_ARRAY=() SKIPPED=() for TORCH in "${TORCH_REQUESTED[@]}"; do - if [ -n "$(torch_available "$TORCH")" ]; then - TORCH_ARRAY+=("$TORCH") - else + if [ "$(torch_classify "$TORCH")" = "unavailable" ]; then SKIPPED+=("$TORCH") + else + TORCH_ARRAY+=("$TORCH") fi done From deb58405e96b5adc274ed46a61c799b4541fc8b2 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Fri, 15 May 2026 16:18:19 +0200 Subject: [PATCH 16/26] Format python code Signed-off-by: Ovidiu Mara --- contrib/wheel_merge.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/contrib/wheel_merge.py b/contrib/wheel_merge.py index df97f7259b..b3efc4efdd 100755 --- a/contrib/wheel_merge.py +++ b/contrib/wheel_merge.py @@ -79,9 +79,7 @@ def merge( merged[new_name] = (new_info, zsrc.read(info)) if not merged: - raise SystemExit( - f"no files matched {pattern!r} in {source_wheel}" - ) + raise SystemExit(f"no files matched {pattern!r} in {source_wheel}") # Read base wheel; pull out RECORD path so we can regenerate it. by_name: dict[str, tuple[zipfile.ZipInfo, bytes]] = {} From 99c1962c8e969198d5b91bb4669b1b056dd3a70c Mon Sep 17 00:00:00 2001 From: ovidiusm Date: Mon, 18 May 2026 09:38:08 +0200 Subject: [PATCH 17/26] Apply suggestion from @rakhmets Co-authored-by: Raul Akhmetshin <74596089+rakhmets@users.noreply.github.com> Signed-off-by: ovidiusm --- contrib/wheel_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/wheel_merge.py b/contrib/wheel_merge.py index b3efc4efdd..f858f8899e 100755 --- a/contrib/wheel_merge.py +++ b/contrib/wheel_merge.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From 6c5a83c359eaa04017df494bbebad5ce83f7cf0c Mon Sep 17 00:00:00 2001 From: ovidiusm Date: Mon, 18 May 2026 09:38:40 +0200 Subject: [PATCH 18/26] Apply suggestions from code review Co-authored-by: Raul Akhmetshin <74596089+rakhmets@users.noreply.github.com> Signed-off-by: ovidiusm --- src/bindings/python/nixl-meta/nixl_ep/__init__.py | 2 +- src/bindings/python/nixl-meta/nixl_ep/meson.build | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bindings/python/nixl-meta/nixl_ep/__init__.py b/src/bindings/python/nixl-meta/nixl_ep/__init__.py index b86783bc62..fdc751dc86 100644 --- a/src/bindings/python/nixl-meta/nixl_ep/__init__.py +++ b/src/bindings/python/nixl-meta/nixl_ep/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/bindings/python/nixl-meta/nixl_ep/meson.build b/src/bindings/python/nixl-meta/nixl_ep/meson.build index 6654c61a44..a35ff26f71 100644 --- a/src/bindings/python/nixl-meta/nixl_ep/meson.build +++ b/src/bindings/python/nixl-meta/nixl_ep/meson.build @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From 3a4bee9ad8a056490cbc4cbd70a102730860250f Mon Sep 17 00:00:00 2001 From: ovidiusm Date: Mon, 18 May 2026 09:41:53 +0200 Subject: [PATCH 19/26] Clean up comments in meson.build Remove comments about return values for copyfile. Signed-off-by: ovidiusm --- src/bindings/python/nixl-meta/nixl/meson.build | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/bindings/python/nixl-meta/nixl/meson.build b/src/bindings/python/nixl-meta/nixl/meson.build index e87cdc2411..509fcecc10 100644 --- a/src/bindings/python/nixl-meta/nixl/meson.build +++ b/src/bindings/python/nixl-meta/nixl/meson.build @@ -14,8 +14,6 @@ # limitations under the License. fs = import('fs') -# Return values are captured so the meta-wheel custom_target in the parent -# meson.build can depend on them and ninja runs the copies before `uv build`. nixl_init_copy = fs.copyfile('__init__.py') nixl_api_copy = fs.copyfile('_api.py') nixl_logging_copy = fs.copyfile('logging.py') From 67112d749e036e424451cbef24392ddc6742ec3f Mon Sep 17 00:00:00 2001 From: ovidiusm Date: Mon, 18 May 2026 09:42:57 +0200 Subject: [PATCH 20/26] Clean up comments in meson.build Removed comments regarding the return value of the copyfile function. Signed-off-by: ovidiusm --- src/bindings/python/nixl-meta/nixl_ep/meson.build | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/bindings/python/nixl-meta/nixl_ep/meson.build b/src/bindings/python/nixl-meta/nixl_ep/meson.build index a35ff26f71..320b68a24f 100644 --- a/src/bindings/python/nixl-meta/nixl_ep/meson.build +++ b/src/bindings/python/nixl-meta/nixl_ep/meson.build @@ -14,6 +14,4 @@ # limitations under the License. fs = import('fs') -# Return value is captured so the meta-wheel custom_target in the parent -# meson.build can depend on it and ninja runs the copy before `uv build`. nixl_ep_init_copy = fs.copyfile('__init__.py') From edf4b94fb94015b488b7f8236c1524ef31a1cadc Mon Sep 17 00:00:00 2001 From: ovidiusm Date: Mon, 18 May 2026 09:44:43 +0200 Subject: [PATCH 21/26] Rename function _load_ep_backend to _load_ep_module Signed-off-by: ovidiusm --- src/bindings/python/nixl-meta/nixl_ep/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bindings/python/nixl-meta/nixl_ep/__init__.py b/src/bindings/python/nixl-meta/nixl_ep/__init__.py index fdc751dc86..9d5017e95b 100644 --- a/src/bindings/python/nixl-meta/nixl_ep/__init__.py +++ b/src/bindings/python/nixl-meta/nixl_ep/__init__.py @@ -27,7 +27,7 @@ def _get_torch_cuda_major() -> int | None: return int(_torch_cuda_ver.split(".")[0]) if _torch_cuda_ver else None -def _load_ep_backend() -> str: +def _load_ep_module() -> str: cuda_major = _get_torch_cuda_major() if cuda_major is not None: pip_name = f"nixl-cu{cuda_major}" @@ -51,7 +51,7 @@ def _load_ep_backend() -> str: raise ImportError("No nixl_ep CUDA backend found") -_pkg = sys.modules[_load_ep_backend()] +_pkg = sys.modules[_load_ep_module()] submodules = ["buffer", "utils"] for sub_name in submodules: From 8d59ad7937feaa673c59dc82d9b5b3dc01c06c8d Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Mon, 18 May 2026 10:40:04 +0200 Subject: [PATCH 22/26] Update build script to address torch installation issue and simplify comments Signed-off-by: Ovidiu Mara --- contrib/build-wheel.sh | 107 ++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 72 deletions(-) diff --git a/contrib/build-wheel.sh b/contrib/build-wheel.sh index 56638dc559..56742901f7 100755 --- a/contrib/build-wheel.sh +++ b/contrib/build-wheel.sh @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Parse arguments PYTHON_VERSION="3.12" ARCH=$(uname -m) WHL_PLATFORM="manylinux_2_39_$ARCH" @@ -25,7 +24,6 @@ OUTPUT_DIR="dist" BUILD_NIXL_EP="false" TORCH_VERSIONS="" -# Parse arguments while [[ $# -gt 0 ]]; do case $1 in --python-version) @@ -85,7 +83,6 @@ done set -e set -x -# Build the wheel TMP_DIR=$(mktemp -d) CUDA_MAJOR=$(nvcc --version | grep -Eo 'release [0-9]+\.[0-9]+' | cut -d' ' -f2 | cut -d'.' -f1) @@ -100,13 +97,10 @@ PKG_NAME="nixl-cu${CUDA_MAJOR}" CU_TAG="cu$(nvcc --version | grep -Eo 'release [0-9]+\.[0-9]+' | cut -d' ' -f2 | tr -d .)" ./contrib/tomlutil.py --wheel-name $PKG_NAME pyproject.toml -# Index URLs for the resolved CUDA tag. Stable cu index hosts released -# wheels; nightly index hosts dev/pre-release wheels. TORCH_STABLE_INDEX="https://download.pytorch.org/whl/${CU_TAG}" TORCH_NIGHTLY_INDEX="https://download.pytorch.org/whl/nightly/${CU_TAG}" -# Build deps installed into the per-iteration venv (torch is added -# separately with channel-appropriate constraints). +# Build deps for the per-iteration venv; torch is installed separately. BUILD_DEPS=( "meson" "meson-python" @@ -138,14 +132,9 @@ venv_path() { fi } -# Classify a requested torch version for the target Python: does -# `torch==${VER}.*` resolve from the stable cu index alone, only from the -# nightly index (with --pre), or from neither? Echoes one of: -# stable — resolvable from $TORCH_STABLE_INDEX without --pre -# nightly — resolvable from $TORCH_NIGHTLY_INDEX with --pre -# unavailable — not in either index for this Python -# Result is cached so each (PYTHON_VERSION, VER) probe runs at most once -# across the script's filter and build phases. +# Echo "stable", "nightly", or "unavailable" depending on whether +# torch==${VER}.* resolves from the stable cu index, the nightly cu +# index (with --pre), or neither. Cached. torch_classify() { local VER=$1 if [ -n "${TORCH_CLASS_CACHE[$VER]:-}" ]; then @@ -175,47 +164,33 @@ torch_classify() { echo "$CLASS" } -# Echo the torch requirement spec for the given (version, channel). Stable -# stays as `torch==X.Y.*`; nightly uses an explicit `.dev0` lower bound so -# pre-release/dev wheels are admissible without --prerelease=allow. -torch_spec() { - local VER=$1 - local CHANNEL=$2 - if [ "$CHANNEL" = "nightly" ]; then - local MAJOR="${VER%%.*}" - local MINOR="${VER##*.}" - echo "torch>=${MAJOR}.${MINOR}.0.dev0,<${MAJOR}.$((MINOR + 1))" - else - echo "torch==${VER}.*" - fi -} +# Install torch from the cu index, isolated from PyPI: with PyPI as a +# fallback its plain `torch==X.Y.0` beats cu nightly's `X.Y.0.dev*+cuXX` +# (PEP 440: final > pre-release). +install_torch() { + local VENV_PATH=$1 + local VER=$2 + local CHANNEL=$3 + local MAJOR="${VER%%.*}" + local MINOR="${VER##*.}" -# Echo the uv index/resolution flags appropriate for the given channel, -# one flag per line. Stable builds use only the stable cu index and no -# pre-release allowance; nightly builds add the nightly index and -# --prerelease=allow. -torch_uv_flags() { - local CHANNEL=$1 if [ "$CHANNEL" = "nightly" ]; then - printf '%s\n' \ - --extra-index-url "$TORCH_STABLE_INDEX" \ - --extra-index-url "$TORCH_NIGHTLY_INDEX" \ - --index-strategy unsafe-best-match \ - --prerelease allow + uv pip install \ + --python "$VENV_PATH/bin/python" \ + --index-url "$TORCH_NIGHTLY_INDEX" \ + --pre \ + "torch>=${MAJOR}.${MINOR}.0.dev0,<${MAJOR}.$((MINOR + 1))" else - printf '%s\n' \ - --extra-index-url "$TORCH_STABLE_INDEX" \ - --index-strategy unsafe-best-match + uv pip install \ + --python "$VENV_PATH/bin/python" \ + --index-url "$TORCH_STABLE_INDEX" \ + "torch==${VER}.*" fi } # Build the wheel for the current PYTHON_VERSION (and optional torch VER). -# Creates a fresh venv at venv_path, installs build deps + torch with the -# channel-appropriate flags, runs `uv build --no-build-isolation`, and -# tears the venv down so the next iteration starts from a clean slate. -# Doing it this way instead of `pip install --reinstall torch` avoids -# orphan packages from the previous torch's transitive footprint -# (nvidia-* wheels, triton, sympy, …) bleeding across iterations. +# Each iteration uses a fresh venv so torch's dependencies +# (nvidia-* wheels, triton, sympy, …) do not leak across iterations. build_wheel() { local OUT_DIR=$1 local VER=${2:-} @@ -225,25 +200,14 @@ build_wheel() { local CHANNEL="stable" [ -n "$VER" ] && CHANNEL=$(torch_classify "$VER") - local UV_FLAGS=() - while IFS= read -r f; do UV_FLAGS+=("$f"); done < <(torch_uv_flags "$CHANNEL") - - local TORCH_PKG=() - [ -n "$VER" ] && TORCH_PKG+=("$(torch_spec "$VER" "$CHANNEL")") - echo "=== Provisioning ${VENV_PATH} (python ${PYTHON_VERSION}${VER:+, torch ${VER} [${CHANNEL}]}) ===" rm -rf "$VENV_PATH" uv venv "$VENV_PATH" --python "$PYTHON_VERSION" - uv pip install \ - --python "$VENV_PATH/bin/python" \ - "${UV_FLAGS[@]}" \ - "${BUILD_DEPS[@]}" \ - "${TORCH_PKG[@]}" + uv pip install --python "$VENV_PATH/bin/python" "${BUILD_DEPS[@]}" + [ -n "$VER" ] && install_torch "$VENV_PATH" "$VER" "$CHANNEL" # Activate so meson's `find_installation('python3')` resolves to this - # venv's interpreter (which has the right torch). Deactivate before - # returning so the caller's auditwheel keeps using the orchestration - # venv on PATH. + # venv's interpreter (which has the right torch). # shellcheck disable=SC1091 source "$VENV_PATH/bin/activate" @@ -262,8 +226,8 @@ build_wheel() { uv build "${BUILD_ARGS[@]}" deactivate - # Free disk: torch + nvidia-* wheels in a venv add up to several GB; - # 3 torches × 5 pythons would otherwise blow the docker layer budget. + # torch + nvidia-* in each venv is several GB; tear down so the docker + # layer doesnt blow up across the (python, torch) matrix. rm -rf "$VENV_PATH" } @@ -276,7 +240,8 @@ repair_wheel() { } if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then - # Multi-torch: build full wheel with first torch, then merge extra .so from others. + # Multi-torch: build the full wheel with the first torch, then merge + # the per-torch .so from the others into it. IFS=',' read -ra TORCH_REQUESTED <<< "$TORCH_VERSIONS" # Filter to torch versions actually resolvable for this (Python, CUDA) combo. @@ -311,14 +276,12 @@ if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then EP_TMP=$(mktemp -d) build_wheel "$EP_TMP" "$TORCH" - # Repair so the .so passes auditwheel for the manylinux tag before we extract it repair_wheel "$EP_TMP" "$EP_TMP/dist" - # Merge the torch-versioned .so from the freshly-built wheel into - # the base wheel and regenerate RECORD. Both wheels were built - # against the same outer C++ build, so the .so's DT_NEEDED entries - # (libucp-.so etc.) match the libs already bundled in - # $BASE_WHL by auditwheel. + # Merge only the torch-versioned .so. Both wheels were built + # against the same outer C++ build, so its DT_NEEDED entries + # (libucp-.so etc.) match what auditwheel already bundled + # into $BASE_WHL. TORCH_MM=$(echo "$TORCH" | tr -d '.') EP_WHL=$(ls "$EP_TMP"/dist/*.whl) ./contrib/wheel_merge.py \ From 83264e8b988fb6960c54ca6d80591c57a525f963 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Mon, 18 May 2026 10:46:46 +0200 Subject: [PATCH 23/26] Fix help and comment Signed-off-by: Ovidiu Mara --- contrib/build-wheel.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/build-wheel.sh b/contrib/build-wheel.sh index 56742901f7..882fa36b45 100755 --- a/contrib/build-wheel.sh +++ b/contrib/build-wheel.sh @@ -59,6 +59,7 @@ while [[ $# -gt 0 ]]; do echo " --ucx-plugins-dir: Directory to find UCX plugins in (default: $UCX_PLUGINS_DIR)" echo " --nixl-plugins-dir: Directory to find NIXL plugins in (default: $NIXL_PLUGINS_DIR)" echo " --build-nixl-ep: Build wheel with nixl_ep package included (requires CUDA sm90-compatible environment)" + echo " --torch-versions: Comma-separated list of torch versions to build the wheel for (default: $TORCH_VERSIONS)" echo " --help: Show this help message" echo "" echo "Must be executed from the root of the NIXL repository." @@ -227,7 +228,7 @@ build_wheel() { deactivate # torch + nvidia-* in each venv is several GB; tear down so the docker - # layer doesnt blow up across the (python, torch) matrix. + # layer does not get too large across the (python, torch) matrix. rm -rf "$VENV_PATH" } From 3d10ba509bdf586595f8dedc44f167f1d55964f2 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Mon, 18 May 2026 10:58:05 +0200 Subject: [PATCH 24/26] Address comment Signed-off-by: Ovidiu Mara --- contrib/build-wheel.sh | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/contrib/build-wheel.sh b/contrib/build-wheel.sh index 882fa36b45..4136a429b1 100755 --- a/contrib/build-wheel.sh +++ b/contrib/build-wheel.sh @@ -240,6 +240,19 @@ repair_wheel() { ./contrib/wheel_add_ucx_plugins.py --ucx-plugins-dir $UCX_PLUGINS_DIR --nixl-plugins-dir $NIXL_PLUGINS_DIR "$OUT_DIR"/*.whl } +# Echo the path of the single .whl in $1, or exit if the count is not 1. +single_wheel() { + local dir=$1 wheels + shopt -s nullglob + wheels=("$dir"/*.whl) + shopt -u nullglob + if [ ${#wheels[@]} -ne 1 ]; then + echo "expected 1 wheel in $dir, got ${#wheels[@]}: ${wheels[*]}" >&2 + exit 1 + fi + echo "${wheels[0]}" +} + if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then # Multi-torch: build the full wheel with the first torch, then merge # the per-torch .so from the others into it. @@ -269,7 +282,7 @@ if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then echo "=== Building wheel with torch ${FIRST_TORCH} ===" build_wheel "$TMP_DIR" "$FIRST_TORCH" repair_wheel "$TMP_DIR" "$TMP_DIR/dist" - BASE_WHL=$(ls "$TMP_DIR"/dist/*.whl) + BASE_WHL=$(single_wheel "$TMP_DIR/dist") for ((i=1; i<${#TORCH_ARRAY[@]}; i++)); do TORCH="${TORCH_ARRAY[$i]}" @@ -284,7 +297,7 @@ if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then # (libucp-.so etc.) match what auditwheel already bundled # into $BASE_WHL. TORCH_MM=$(echo "$TORCH" | tr -d '.') - EP_WHL=$(ls "$EP_TMP"/dist/*.whl) + EP_WHL=$(single_wheel "$EP_TMP/dist") ./contrib/wheel_merge.py \ --base-wheel "$BASE_WHL" \ --source-wheel "$EP_WHL" \ From a8f6bdbbfa37ca22ae2488bbfdeece4596b08455 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Mon, 18 May 2026 11:18:02 +0200 Subject: [PATCH 25/26] Refactoring Signed-off-by: Ovidiu Mara --- contrib/build-wheel.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/contrib/build-wheel.sh b/contrib/build-wheel.sh index 4136a429b1..56c71f2acf 100755 --- a/contrib/build-wheel.sh +++ b/contrib/build-wheel.sh @@ -236,12 +236,12 @@ repair_wheel() { local IN_DIR=$1 local OUT_DIR=$2 mkdir -p "$OUT_DIR" - auditwheel repair $AUDITWHEEL_EXCLUDES "$IN_DIR"/nixl*.whl --plat $WHL_PLATFORM --wheel-dir "$OUT_DIR" - ./contrib/wheel_add_ucx_plugins.py --ucx-plugins-dir $UCX_PLUGINS_DIR --nixl-plugins-dir $NIXL_PLUGINS_DIR "$OUT_DIR"/*.whl + auditwheel repair $AUDITWHEEL_EXCLUDES "$IN_DIR"/nixl*.whl --plat "$WHL_PLATFORM" --wheel-dir "$OUT_DIR" + ./contrib/wheel_add_ucx_plugins.py --ucx-plugins-dir "$UCX_PLUGINS_DIR" --nixl-plugins-dir "$NIXL_PLUGINS_DIR" "$OUT_DIR"/*.whl } # Echo the path of the single .whl in $1, or exit if the count is not 1. -single_wheel() { +get_wheel_path() { local dir=$1 wheels shopt -s nullglob wheels=("$dir"/*.whl) @@ -282,7 +282,7 @@ if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then echo "=== Building wheel with torch ${FIRST_TORCH} ===" build_wheel "$TMP_DIR" "$FIRST_TORCH" repair_wheel "$TMP_DIR" "$TMP_DIR/dist" - BASE_WHL=$(single_wheel "$TMP_DIR/dist") + BASE_WHL=$(get_wheel_path "$TMP_DIR/dist") for ((i=1; i<${#TORCH_ARRAY[@]}; i++)); do TORCH="${TORCH_ARRAY[$i]}" @@ -297,7 +297,7 @@ if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then # (libucp-.so etc.) match what auditwheel already bundled # into $BASE_WHL. TORCH_MM=$(echo "$TORCH" | tr -d '.') - EP_WHL=$(single_wheel "$EP_TMP/dist") + EP_WHL=$(get_wheel_path "$EP_TMP/dist") ./contrib/wheel_merge.py \ --base-wheel "$BASE_WHL" \ --source-wheel "$EP_WHL" \ @@ -311,7 +311,7 @@ if [ "$BUILD_NIXL_EP" = "true" ] && [ -n "$TORCH_VERSIONS" ]; then else build_wheel "$TMP_DIR" repair_wheel "$TMP_DIR" "$TMP_DIR/dist" - cp "$TMP_DIR"/dist/*.whl "$OUTPUT_DIR" + cp "$(get_wheel_path "$TMP_DIR/dist")" "$OUTPUT_DIR" fi # Clean up From ca86eb7e4a0e02505a3ee63799583dd9900ba581 Mon Sep 17 00:00:00 2001 From: Ovidiu Mara Date: Thu, 21 May 2026 16:11:02 +0200 Subject: [PATCH 26/26] Address comments Signed-off-by: Ovidiu Mara --- contrib/build-wheel.sh | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/contrib/build-wheel.sh b/contrib/build-wheel.sh index 56c71f2acf..6b3491119b 100755 --- a/contrib/build-wheel.sh +++ b/contrib/build-wheel.sh @@ -81,6 +81,11 @@ while [[ $# -gt 0 ]]; do esac done +if [ "$BUILD_NIXL_EP" = "true" ] && [ -z "$TORCH_VERSIONS" ]; then + echo "ERROR: --build-nixl-ep requires --torch-versions (e.g. --torch-versions 2.11,2.12)" >&2 + exit 1 +fi + set -e set -x @@ -112,10 +117,6 @@ BUILD_DEPS=( "setuptools>=80.9.0" ) -# Classification cache for repeated lookups within a single script run. -# Values: "stable" | "nightly" | "unavailable". -declare -A TORCH_CLASS_CACHE - # Slugify a dotted version (e.g. "2.13" -> "213", "3.10" -> "310") so it can # be used unambiguously as a path component. slug() { echo "${1//./}"; } @@ -135,13 +136,9 @@ venv_path() { # Echo "stable", "nightly", or "unavailable" depending on whether # torch==${VER}.* resolves from the stable cu index, the nightly cu -# index (with --pre), or neither. Cached. +# index (with --pre), or neither. torch_classify() { local VER=$1 - if [ -n "${TORCH_CLASS_CACHE[$VER]:-}" ]; then - echo "${TORCH_CLASS_CACHE[$VER]}" - return - fi local CLASS="unavailable" local PROBE="/workspace/venv-probe-py$(slug "$PYTHON_VERSION")" rm -rf "$PROBE" @@ -161,7 +158,6 @@ torch_classify() { fi fi rm -rf "$PROBE" - TORCH_CLASS_CACHE[$VER]="$CLASS" echo "$CLASS" }