From efb2bf0928597078190bfea67a54a217ccbe52d8 Mon Sep 17 00:00:00 2001 From: Itay Alroy Date: Tue, 4 Nov 2025 17:10:37 +0200 Subject: [PATCH] EXAMPLES: Introduce NIXL-EP example Add an example implementation of expert-parallel dispatch and combine operations using the NIXL device API. Co-authored-by: Roey Azran Co-authored-by: Micha Dery Co-authored-by: Michal Shalev Signed-off-by: Itay Alroy --- .github/workflows/clang-format.yml | 2 +- .github/workflows/copyright-check.sh | 6 +- .pre-commit-config.yaml | 1 + CONTRIBUTING.md | 4 + LICENSE | 9 + examples/device/ep/LICENSE-DeepEP | 21 + examples/device/ep/README.md | 102 ++ examples/device/ep/csrc/config.hpp | 130 ++ examples/device/ep/csrc/event.hpp | 65 + examples/device/ep/csrc/kernels/api.cuh | 140 ++ examples/device/ep/csrc/kernels/configs.cuh | 74 ++ examples/device/ep/csrc/kernels/exception.cuh | 73 ++ examples/device/ep/csrc/kernels/launch.cuh | 86 ++ examples/device/ep/csrc/kernels/nixl_ep.cu | 1154 +++++++++++++++++ examples/device/ep/csrc/kernels/utils.cuh | 467 +++++++ examples/device/ep/csrc/nixl_ep.cpp | 1055 +++++++++++++++ examples/device/ep/csrc/nixl_ep.hpp | 221 ++++ examples/device/ep/meson.build | 176 +++ examples/device/ep/nixl_ep/__init__.py | 24 + examples/device/ep/nixl_ep/buffer.py | 378 ++++++ examples/device/ep/nixl_ep/utils.py | 81 ++ examples/device/ep/scripts/reset_etcd.sh | 23 + examples/device/ep/tests/elastic/README.md | 34 + .../ep/tests/elastic/double_expansion.json | 5 + examples/device/ep/tests/elastic/elastic.py | 399 ++++++ .../tests/elastic/expansion_contraction.json | 6 + .../device/ep/tests/elastic/no_expansion.json | 3 + examples/device/ep/tests/elastic/plan.py | 95 ++ .../device/ep/tests/elastic/rank_server.py | 141 ++ .../ep/tests/elastic/single_expansion.json | 4 + .../elastic/single_expansion_16_ranks.json | 4 + .../device/ep/tests/elastic/single_rank.json | 3 + examples/device/ep/tests/utils.py | 247 ++++ examples/device/meson.build | 18 + examples/meson.build | 1 + meson.build | 10 +- meson_options.txt | 1 + src/core/meson.build | 3 +- src/plugins/ucx/meson.build | 3 +- src/utils/ucx/meson.build | 4 +- 40 files changed, 5264 insertions(+), 9 deletions(-) create mode 100644 examples/device/ep/LICENSE-DeepEP create mode 100644 examples/device/ep/README.md create mode 100644 examples/device/ep/csrc/config.hpp create mode 100644 examples/device/ep/csrc/event.hpp create mode 100644 examples/device/ep/csrc/kernels/api.cuh create mode 100644 examples/device/ep/csrc/kernels/configs.cuh create mode 100644 examples/device/ep/csrc/kernels/exception.cuh create mode 100644 examples/device/ep/csrc/kernels/launch.cuh create mode 100644 examples/device/ep/csrc/kernels/nixl_ep.cu create mode 100644 examples/device/ep/csrc/kernels/utils.cuh create mode 100644 examples/device/ep/csrc/nixl_ep.cpp create mode 100644 examples/device/ep/csrc/nixl_ep.hpp create mode 100644 examples/device/ep/meson.build create mode 100644 examples/device/ep/nixl_ep/__init__.py create mode 100644 examples/device/ep/nixl_ep/buffer.py create mode 100644 examples/device/ep/nixl_ep/utils.py create mode 100755 examples/device/ep/scripts/reset_etcd.sh create mode 100644 examples/device/ep/tests/elastic/README.md create mode 100644 examples/device/ep/tests/elastic/double_expansion.json create mode 100644 examples/device/ep/tests/elastic/elastic.py create mode 100644 examples/device/ep/tests/elastic/expansion_contraction.json create mode 100644 examples/device/ep/tests/elastic/no_expansion.json create mode 100644 examples/device/ep/tests/elastic/plan.py create mode 100644 examples/device/ep/tests/elastic/rank_server.py create mode 100644 examples/device/ep/tests/elastic/single_expansion.json create mode 100644 examples/device/ep/tests/elastic/single_expansion_16_ranks.json create mode 100644 examples/device/ep/tests/elastic/single_rank.json create mode 100644 examples/device/ep/tests/utils.py create mode 100644 examples/device/meson.build diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index e1a0da0688..30c98a1ab5 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -18,7 +18,7 @@ jobs: FILE_PATTERN='\.(cpp|h|cc|c|cxx|hpp|cu|cuh)$' echo "### Modified C/C++ files:" - FILES=$(git diff --name-only HEAD^1 HEAD | grep -E "$FILE_PATTERN") || true + FILES=$(git diff --name-only HEAD^1 HEAD -- . ':(exclude)examples/device/ep' | grep -E "$FILE_PATTERN") || true [ -z "$FILES" ] && echo "(none)" || echo "$FILES" echo "### clang format errors:" diff --git a/.github/workflows/copyright-check.sh b/.github/workflows/copyright-check.sh index 1fe931983d..9669c8e819 100755 --- a/.github/workflows/copyright-check.sh +++ b/.github/workflows/copyright-check.sh @@ -21,7 +21,7 @@ for f in $(git ls-files); do *.png|*.jpg|*.jpeg|*.gif|*.ico|*.zip|*.rst|*.pyc|*.lock|*.md|*.svg|*.wrap|*.in|*.json|*.template|*.gitignore|*.python-version|*py.typed) continue ;; - CODEOWNERS|LICENSE|Doxyfile|.clang-format|.clang-tidy|.codespellrc) + CODEOWNERS|*LICENSE*|Doxyfile|.clang-format|.clang-tidy|.codespellrc) continue ;; esac @@ -39,7 +39,7 @@ for f in $(git ls-files); do # Extract copyright years (handles YYYY or YYYY-YYYY) copyright_years=$(echo "$header" | \ - grep -Eo 'Copyright \(c\) [0-9]{4}(-[0-9]{4})?' | \ + grep NVIDIA | grep -Eo 'Copyright \(c\) [0-9]{4}(-[0-9]{4})?' | \ sed -E 's/.* ([0-9]{4})(-[0-9]{4})?/\1\2/' || true) if [[ -z "$copyright_years" ]]; then @@ -57,7 +57,7 @@ for f in $(git ls-files); do fi # License line must exist - if ! echo "$header" | grep -Eq '^[[:space:]]*(#|//|\*|/\*|