gpu: HPC SDK 21.5, Singularity, HPCX MPI update #1709

Merged · 7 commits · Jun 24, 2021
54 changes: 41 additions & 13 deletions docker/Dockerfile.nvidia
@@ -6,14 +6,19 @@
# BUILD:
# docker build --network=host --file docker/Dockerfile.nvidia --tag devito:nvidia .
#
# EXPERIMENTAL:
# LEGACY:
# (1) Option MPI 3.0:
# docker build --network=host --build-arg MPI_VER=3 --file docker/Dockerfile.nvidia --tag devito:nvidia .
#
# (2) Option MPI 4.0:
# Enabling and using MPI 4.0.5 works on R450 drivers but shows compatibility
# issues during testing on older R418 drivers.
#
# docker build --network=host --build-arg MPI_VER=4 --file docker/Dockerfile.nvidia --tag devito:nvidia .
#
# RUN:
# docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 devito:nvidia
# docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --device=/dev/infiniband/uverbs0 --device=/dev/infiniband/rdma_cm devito:nvidia
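#
# MPI RUN (a sketch, not part of the build; assumes two visible GPUs and
# that the image's working directory contains the Devito examples):
#   docker run --gpus all --rm devito:nvidia \
#     mpirun -np 2 python examples/seismic/acoustic/acoustic_example.py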
##############################################################
FROM python:3.6

@@ -31,23 +36,29 @@ RUN apt-get update -y && \
nodejs \
liblapack-dev \
libblas-dev \
libibverbs-dev \
texlive-latex-extra texlive-fonts-recommended dvipng cm-super && \
libibverbs-dev libmlx4-1 libmlx5-1 ibutils \
texlive-latex-extra texlive-fonts-recommended dvipng cm-super \
python-dev python3-dev python3-venv && \
wget -q -P /app/ \
https://developer.download.nvidia.com/hpc-sdk/21.3/nvhpc-21-3_21.3_amd64.deb \
https://developer.download.nvidia.com/hpc-sdk/21.3/nvhpc-21-3-cuda-multi_21.3_amd64.deb \
https://developer.download.nvidia.com/hpc-sdk/21.3/nvhpc-2021_21.3_amd64.deb && \
https://developer.download.nvidia.com/hpc-sdk/21.5/nvhpc-21-5_21.5_amd64.deb \
https://developer.download.nvidia.com/hpc-sdk/21.5/nvhpc-21-5-cuda-multi_21.5_amd64.deb \
https://developer.download.nvidia.com/hpc-sdk/21.5/nvhpc-2021_21.5_amd64.deb && \
wget -q -P /app/nvcomp_exts/ \
https://developer.download.nvidia.com/compute/nvcomp/2.0/local_installers/nvcomp_exts_x86_64_ubuntu18.04-2.0.tar.gz &&\
apt-get install -y -q \
/app/nvhpc-21-3_21.3_amd64.deb \
/app/nvhpc-21-3-cuda-multi_21.3_amd64.deb \
/app/nvhpc-2021_21.3_amd64.deb && \
/app/nvhpc-21-5_21.5_amd64.deb \
/app/nvhpc-21-5-cuda-multi_21.5_amd64.deb \
/app/nvhpc-2021_21.5_amd64.deb && \
tar -xvf /app/nvcomp_exts/nvcomp_exts_x86_64_ubuntu18.04-2.0.tar.gz -C /app/nvcomp_exts && \
apt-get update -y && \
rm -rf /app/nvhpc* && \
rm -rf /app/nvcomp_exts/nvcomp* && \
rm -rf /var/lib/apt/lists/*

ARG HPCSDK_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/2021
ARG HPCSDK_CUPTI=/opt/nvidia/hpc_sdk/Linux_x86_64/2021/cuda/11.3/extras/CUPTI
ARG MPI_VER=3
ARG HPCSDK_CUPTI=/opt/nvidia/hpc_sdk/Linux_x86_64/2021/cuda/11.2/extras/CUPTI
# MPI_VER options: 3, 4, HPCX
ARG MPI_VER=HPCX

# nvidia-container-runtime
ENV NVIDIA_VISIBLE_DEVICES all
@@ -60,12 +71,16 @@ RUN echo "$HPCSDK_HOME/cuda/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
echo "$HPCSDK_HOME/comm_libs/mpi/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
echo "$HPCSDK_CUPTI/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \
echo "$HPCSDK_HOME/math_libs/lib64" >> /etc/ld.so.conf.d/nvidia.conf

# Compression
ENV NVCOMP_EXTS_ROOT /app/nvcomp_exts/ubuntu18.04/11.2
ENV bitcomp_DIR $NVCOMP_EXTS_ROOT/lib/

# Compiler, CUDA, and Library paths
ENV CUDA_HOME $HPCSDK_HOME/cuda
ENV CUDA_ROOT $HPCSDK_HOME/cuda/bin
ENV PATH $HPCSDK_HOME/compilers/bin:$HPCSDK_HOME/cuda/bin:$HPCSDK_HOME/comm_libs/mpi/bin:${PATH}
ENV LD_LIBRARY_PATH $HPCSDK_HOME/cuda/lib:$HPCSDK_HOME/cuda/lib64:$HPCSDK_HOME/compilers/lib:$HPCSDK_HOME/math_libs/lib64:$HPCSDK_HOME/comm_libs/mpi/lib:$HPCSDK_CUPTI/lib64:${LD_LIBRARY_PATH}
ENV LD_LIBRARY_PATH $HPCSDK_HOME/cuda/lib:$HPCSDK_HOME/cuda/lib64:$HPCSDK_HOME/compilers/lib:$HPCSDK_HOME/math_libs/lib64:$HPCSDK_HOME/comm_libs/mpi/lib:$HPCSDK_CUPTI/lib64:$bitcomp_DIR:${LD_LIBRARY_PATH}

ADD ./requirements.txt /app/requirements.txt
ADD ./requirements-optional.txt /app/requirements-optional.txt
@@ -83,8 +98,16 @@ RUN python3 -m venv /venv && \
# MPI ROOT USER DEFAULTS
ENV OMPI_ALLOW_RUN_AS_ROOT=1
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
ENV OMPI_MCA_rmaps_base_oversubscribe=1
ENV UCX_MEMTYPE_CACHE=no
ENV UCX_NET_DEVICES=all
ENV UCX_SHM_DEVICES=all
ENV UCX_ACC_DEVICES=all
ENV NCCL_UCX_RNDV_THRESH=0
ENV NCCL_UCX_RNDV_SCHEME=get_zcopy
ENV NCCL_PLUGIN_P2P=ucx
ENV UCX_TLS=rc_x,sm,shm,cuda_copy,gdr_copy,cuda_ipc
ENV MELLANOX_MOUNT_DRIVER=1
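# The UCX_TLS list above assumes Mellanox InfiniBand (rc_x, gdr_copy, cuda_ipc).
# On hosts without IB hardware, a softer transport set can be passed at run
# time instead, e.g. (a sketch):
#   docker run --gpus all --rm -it -e UCX_TLS=self,shm,cuda_copy devito:nvidia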

ENV CPATH $HPCSDK_HOME/comm_libs/mpi/include:${CPATH}
ENV CFLAGS=-noswitcherror
@@ -94,6 +117,11 @@ RUN if [ "x$MPI_VER" = "x4" ]; then \
ln -sf $HPCSDK_HOME/comm_libs/openmpi4/openmpi-4.0.5 \
$HPCSDK_HOME/comm_libs/mpi ; \
fi; \
if [ "x$MPI_VER" = "HPCX" ]; then \
rm -f $HPCSDK_HOME/comm_libs/mpi && \
ln -sf $HPCSDK_HOME/comm_libs/hpcx/hpcx-2.7.4/ompi \
$HPCSDK_HOME/comm_libs/mpi ; \
fi; \
/venv/bin/pip install --no-cache-dir -r /app/requirements-mpi.txt && \
rm -rf ~/.cache/pip
ENV CFLAGS=
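
A quick way to confirm which MPI the symlink logic above selected in a built image (a sketch; the HPC SDK path matches the ARGs in this Dockerfile, and the entrypoint passes arguments straight through):

docker run --rm devito:nvidia mpirun --version
docker run --rm devito:nvidia readlink /opt/nvidia/hpc_sdk/Linux_x86_64/2021/comm_libs/mpi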
168 changes: 168 additions & 0 deletions docker/Singularity.nvidia.def
@@ -0,0 +1,168 @@
Bootstrap: docker
From: python:3.6

%help
##############################################################
# This Singularity definition file contains the additional NVIDIA compilers,
# libraries, and plugins to enable OpenACC and NVIDIA GPU
# acceleration of Devito codes.
#
# BUILD:
# singularity build --fakeroot devito.nvidia.sif docker/Singularity.nvidia.def
#
# RUN:
# singularity run --nv --writable-tmpfs devito.nvidia.sif
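#
# MPI RUN (a sketch; assumes a host-side launcher ABI-compatible with the
# container's Open MPI, e.g. HPC-X installed on the host):
#   mpirun -np 2 singularity exec --nv devito.nvidia.sif \
#     python /app/examples/seismic/acoustic/acoustic_example.py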
##############################################################

%files
./requirements.txt /app/requirements.txt
./requirements-optional.txt /app/requirements-optional.txt
./requirements-nvidia.txt /app/requirements-nvidia.txt
./requirements-mpi.txt /app/requirements-mpi.txt
./devito /app/devito
./tests /app/tests
./scripts /app/scripts
./examples /app/examples
./benchmarks /app/benchmarks
setup.cfg /app/
docker/run-jupyterlab.sh /jupyter
docker/run-tests.sh /tests
docker/run-print-defaults.sh /print-defaults
docker/entrypoint.sh /entrypoint.sh
docker/nvdashboard.json /app/nvdashboard.json

%environment
export NVIDIA_VISIBLE_DEVICES=all
export NVIDIA_DRIVER_CAPABILITIES=compute,utility

export HPCSDK_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/2021
export HPCSDK_CUPTI=/opt/nvidia/hpc_sdk/Linux_x86_64/2021/cuda/11.2/extras/CUPTI
export NVCOMP_EXTS_ROOT=/app/nvcomp_exts/ubuntu18.04/11.2
export bitcomp_DIR=$NVCOMP_EXTS_ROOT/lib/

export CUDA_HOME=$HPCSDK_HOME/cuda
export CUDA_ROOT=$HPCSDK_HOME/cuda/bin
export PATH=$HPCSDK_HOME/compilers/bin:$HPCSDK_HOME/cuda/bin:$HPCSDK_HOME/comm_libs/mpi/bin:${PATH}
export LD_LIBRARY_PATH=$HPCSDK_HOME/cuda/lib:$HPCSDK_HOME/cuda/lib64:$HPCSDK_HOME/compilers/lib:$HPCSDK_HOME/math_libs/lib64:$HPCSDK_HOME/comm_libs/mpi/lib:$HPCSDK_CUPTI/lib64:$bitcomp_DIR:${LD_LIBRARY_PATH}

# Set by entrypoint.sh under Docker; must be baked in at build time for Singularity
export PATH=/venv/bin:${PATH}
export PYTHONPATH=/app:${PYTHONPATH}

export OMPI_ALLOW_RUN_AS_ROOT=1
export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
export OMPI_MCA_rmaps_base_oversubscribe=1
export UCX_MEMTYPE_CACHE=no
export UCX_NET_DEVICES=all
export UCX_SHM_DEVICES=all
export UCX_ACC_DEVICES=all
export NCCL_UCX_RNDV_THRESH=0
export NCCL_UCX_RNDV_SCHEME=get_zcopy
export NCCL_PLUGIN_P2P=ucx
export UCX_TLS=rc_x,sm,shm,cuda_copy,gdr_copy,cuda_ipc
export MELLANOX_MOUNT_DRIVER=1

## Environment Variables for OpenACC Builds
# Reference: https://github.com/devitocodes/devito/wiki/FAQ#can-i-manually-modify-the-c-code-generated-by-devito-and
export DEVITO_ARCH="nvc"
export DEVITO_LANGUAGE="openacc"
export DEVITO_PLATFORM=nvidiaX

# Options: [unset, 1]. For PGI/NVC OpenACC; should only be set after a first execution of the benchmark.
# export DEVITO_JIT_BACKDOOR=1

# Enable logging. Options: [unset, PERF, DEBUG]
export DEVITO_LOGGING=DEBUG
# export PGI_ACC_TIME=1
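
# Quick sanity check that the GPU backend is picked up (a sketch):
#   python -c "from devito import configuration; print(configuration['platform'], configuration['language'])"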

# Set the home directory to our app user's home.
export HOME=/app
export APP_HOME=/app

%post -c /bin/bash

export DEBIAN_FRONTEND=noninteractive

# nodesource: nvdashboard requires nodejs>=10
apt-get update -y && \
apt-get install -y -q \
apt-utils \
vim \
curl \
mpich libmpich-dev && \
curl -sL https://deb.nodesource.com/setup_12.x | bash - && \
apt-get install -y -q \
nodejs \
liblapack-dev \
libblas-dev \
libibverbs-dev libmlx4-1 libmlx5-1 ibutils \
texlive-latex-extra texlive-fonts-recommended dvipng cm-super \
python-dev python3-dev python3-venv && \
wget -q -P /app/ \
https://developer.download.nvidia.com/hpc-sdk/21.5/nvhpc-21-5_21.5_amd64.deb \
https://developer.download.nvidia.com/hpc-sdk/21.5/nvhpc-21-5-cuda-multi_21.5_amd64.deb \
https://developer.download.nvidia.com/hpc-sdk/21.5/nvhpc-2021_21.5_amd64.deb && \
wget -q -P /app/nvcomp_exts/ \
https://developer.download.nvidia.com/compute/nvcomp/2.0/local_installers/nvcomp_exts_x86_64_ubuntu18.04-2.0.tar.gz &&\
apt-get install -y -q \
/app/nvhpc-21-5_21.5_amd64.deb \
/app/nvhpc-21-5-cuda-multi_21.5_amd64.deb \
/app/nvhpc-2021_21.5_amd64.deb && \
tar -xvf /app/nvcomp_exts/nvcomp_exts_x86_64_ubuntu18.04-2.0.tar.gz -C /app/nvcomp_exts && \
apt-get update -y && \
rm -rf /app/nvhpc* && \
rm -rf /app/nvcomp_exts/nvcomp* && \
rm -rf /var/lib/apt/lists/*

export HPCSDK_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/2021
export HPCSDK_CUPTI=/opt/nvidia/hpc_sdk/Linux_x86_64/2021/cuda/11.2/extras/CUPTI

# Compiler, CUDA, and Library paths
export CUDA_HOME=$HPCSDK_HOME/cuda
export CUDA_ROOT=$HPCSDK_HOME/cuda/bin
export PATH=$HPCSDK_HOME/compilers/bin:$HPCSDK_HOME/cuda/bin:$HPCSDK_HOME/comm_libs/mpi/bin:${PATH}
export LD_LIBRARY_PATH=$HPCSDK_HOME/cuda/lib:$HPCSDK_HOME/cuda/lib64:$HPCSDK_HOME/compilers/lib:$HPCSDK_HOME/math_libs/lib64:$HPCSDK_HOME/comm_libs/mpi/lib:$HPCSDK_CUPTI/lib64:${LD_LIBRARY_PATH}

python3 -m venv /venv && \
/venv/bin/pip install --no-cache-dir --upgrade pip && \
/venv/bin/pip install --no-cache-dir wheel && \
/venv/bin/pip install --no-cache-dir -r /app/requirements.txt && \
/venv/bin/pip install --no-cache-dir -r /app/requirements-optional.txt && \
/venv/bin/pip install --no-cache-dir -r /app/requirements-nvidia.txt && \
rm -rf ~/.cache/pip

# MPI ROOT USER DEFAULTS
export CPATH=$HPCSDK_HOME/comm_libs/mpi/include:${CPATH}
export CFLAGS=-noswitcherror

# MPI 3: do nothing (default layout)
# MPI 4:
# rm -f $HPCSDK_HOME/comm_libs/mpi && \
# ln -sf $HPCSDK_HOME/comm_libs/openmpi4/openmpi-4.0.5 $HPCSDK_HOME/comm_libs/mpi
# HPCX (used here):
rm -f $HPCSDK_HOME/comm_libs/mpi && \
ln -sf $HPCSDK_HOME/comm_libs/hpcx/hpcx-2.7.4/ompi $HPCSDK_HOME/comm_libs/mpi

/venv/bin/pip install --no-cache-dir -r /app/requirements-mpi.txt && \
rm -rf ~/.cache/pip
export CFLAGS=

chmod -R 755 /app
chmod 777 /app
chmod 777 /print-defaults /jupyter /tests /entrypoint.sh && \
/venv/bin/jupyter labextension install jupyterlab-nvdashboard && \
/venv/bin/jupyter labextension install dask-labextension && \
/venv/bin/jupyter serverextension enable dask_labextension && \
/venv/bin/jupyter lab workspaces import /app/nvdashboard.json

%runscript
echo /jupyter "$@"
cd /app
exec /jupyter "$@"

%startscript
cd /app

%test
2 changes: 1 addition & 1 deletion docker/entrypoint.sh
@@ -3,6 +3,6 @@
find /app -type f -name '*.pyc' -delete

export PATH=/venv/bin:$PATH
export PYTHONPATH=$PYTHONPATH:/app
export PYTHONPATH=/app:$PYTHONPATH

exec "$@"