From f68820436c88710eb0f04818ae36d5e1aa2f4a65 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Mon, 23 Oct 2023 12:29:30 +0800
Subject: [PATCH] Explicit build dependency on `nvidia_peermem` (#201)

---
 .github/workflows/{codeql.yml => codeql-analysis.yml} | 11 +++++++----
 CMakeLists.txt                                        | 14 ++++++++++++++
 docs/quickstart.md                                    |  7 ++++++-
 3 files changed, 27 insertions(+), 5 deletions(-)
 rename .github/workflows/{codeql.yml => codeql-analysis.yml} (85%)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql-analysis.yml
similarity index 85%
rename from .github/workflows/codeql.yml
rename to .github/workflows/codeql-analysis.yml
index b478dc5ae..2db0a91fb 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -2,8 +2,9 @@ name: "CodeQL"
 
 on:
   push:
-    branches:
-      - '**'
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
   schedule:
     - cron: "30 1 * * 1"
 
@@ -42,8 +43,10 @@ jobs:
         run: |
           git config --global --add safe.directory /__w/mscclpp/mscclpp
 
-      - name: Autobuild
-        uses: github/codeql-action/autobuild@v2
+      - name: Build
+        run: |
+          MPI_HOME=/usr/local/mpi cmake -DBYPASS_PEERMEM_CHECK=ON .
+          make -j
 
       - name: Perform CodeQL Analysis
         uses: github/codeql-action/analyze@v2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3b33a6e96..2aef18aa3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,7 @@ option(USE_NPKIT "Use NPKIT" ON)
 option(BUILD_TESTS "Build tests" ON)
 option(BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
 option(ALLOW_GDRCOPY "Use GDRCopy, if available" OFF)
+option(BYPASS_PEERMEM_CHECK "Bypass checking nvidia_peermem" OFF)
 
 # Find CUDAToolkit. Set CUDA flags based on the detected CUDA version
 find_package(CUDAToolkit REQUIRED)
@@ -44,6 +45,19 @@ if(CUDAToolkit_FOUND)
 endif()
 set(CUDA_LIBRARIES CUDA::cudart CUDA::cuda_driver)
 
+# Fail the configure step unless `lsmod` lists nvidia_peermem on this host.
+# Hosts that only build (e.g., CI) can skip this with -DBYPASS_PEERMEM_CHECK=ON.
+if(NOT BYPASS_PEERMEM_CHECK)
+    execute_process(COMMAND sh -c "lsmod | grep nvidia_peermem"
+                    RESULT_VARIABLE lsmod_result
+                    OUTPUT_QUIET)
+    if(NOT lsmod_result EQUAL 0)
+        message(FATAL_ERROR "nvidia_peermem is not installed or not loaded. "
+                            "Load the module, or configure with "
+                            "-DBYPASS_PEERMEM_CHECK=ON to skip this check.")
+    endif()
+endif()
+
 # Find ibverbs and libnuma
 find_package(IBVerbs REQUIRED)
 find_package(NUMA REQUIRED)
diff --git a/docs/quickstart.md b/docs/quickstart.md
index 585800236..9ccf1b6f9 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -11,7 +11,12 @@
     * NVIDIA A100 GPUs + CUDA >= 11.1.1
     * NVIDIA H100 GPUs + CUDA >= 12.0.0
 * OS: tested over Ubuntu 18.04 and 20.04
-* Libraries: [libnuma](https://github.com/numactl/numactl), [GDRCopy](https://github.com/NVIDIA/gdrcopy) (optional), MPI (optional)
+* Libraries: [libnuma](https://github.com/numactl/numactl), MPI (optional)
+* Others
+    * `nvidia_peermem` driver should be loaded on all nodes. Check it via:
+        ```
+        lsmod | grep nvidia_peermem
+        ```
 
 ## Build from Source
 