Skip to content

Commit

Permalink
Nsight (#343)
Browse files Browse the repository at this point in the history
* Added NCCL Nsight files

Signed-off-by: Ankur Srivastava <[email protected]>

* Added NCCL Nsight files

Signed-off-by: Ankur Srivastava <[email protected]>

* Updated nccl and readme and fsdp

Signed-off-by: Ankur Srivastava <[email protected]>

* Updated NCCL

Signed-off-by: Ankur Srivastava <[email protected]>

* Added almost everything

Signed-off-by: Ankur Srivastava <[email protected]>

* Removed token

Signed-off-by: Ankur Srivastava <[email protected]>

---------

Signed-off-by: Ankur Srivastava <[email protected]>
  • Loading branch information
awsankur authored Jun 3, 2024
1 parent 765e627 commit 44e448e
Show file tree
Hide file tree
Showing 22 changed files with 1,269 additions and 19 deletions.
20 changes: 20 additions & 0 deletions 4.validation_and_observability/5.nsight/2.generate_recipes.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@

# Run a fixed set of Nsight Systems recipes against a single report.
#
# Required environment:
#   NSIGHT_REPORT_NAME  report path without the ".nsys-rep" extension
# Optional environment:
#   NSYS                nsys binary to use (defaults to the /fsx/nsight-efa install)

# Stop with a clear message instead of handing every recipe the bare name ".nsys-rep".
: "${NSIGHT_REPORT_NAME:?NSIGHT_REPORT_NAME must be set (report name without .nsys-rep)}"

NSYS="${NSYS:-/fsx/nsight-efa/target-linux-x64/nsys}"
# Quoted everywhere below so report paths containing spaces stay one argument.
REPORT="${NSIGHT_REPORT_NAME}.nsys-rep"

"${NSYS}" recipe cuda_api_sum --input "${REPORT}"

"${NSYS}" recipe cuda_api_sync --input "${REPORT}"

# cuda_gpu_kern_pace is run once per kernel name of interest.
"${NSYS}" recipe cuda_gpu_kern_pace --input "${REPORT}" --name ncclDevKernel_ReduceScatter_Sum_f32_RING_LL

"${NSYS}" recipe cuda_gpu_kern_pace --input "${REPORT}" --name ncclDevKernel_AllGather_RING_LL

"${NSYS}" recipe cuda_gpu_kern_sum --input "${REPORT}"

"${NSYS}" recipe cuda_gpu_mem_size_sum --input "${REPORT}"

"${NSYS}" recipe cuda_gpu_mem_time_sum --input "${REPORT}"

"${NSYS}" recipe cuda_gpu_time_util_map --input "${REPORT}"

"${NSYS}" recipe nccl_sum --input "${REPORT}"

"${NSYS}" recipe nccl_gpu_time_util_map --input "${REPORT}"
198 changes: 198 additions & 0 deletions 4.validation_and_observability/5.nsight/EKS/Dockerfile.llama2-efa
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
# Dockerfile.llama2-efa
#
# Image for the FSDP Llama 2 example: CUDA base image plus the EFA installer,
# NCCL built from source, the aws-ofi-nccl plugin and llama-recipes.
#
# The PyTorchJob manifest for this example lives in fsdp.yaml (same directory);
# it does not belong in this Dockerfile, where any non-instruction text before
# FROM makes the build fail. Supply HF_TOKEN at deploy time and never commit a
# real Hugging Face token to the repository.
# CUDA 12.2.2 "devel" image on Ubuntu 22.04; the NCCL and aws-ofi-nccl builds
# below compile against /usr/local/cuda from this image.
FROM nvidia/cuda:12.2.2-devel-ubuntu22.04

# Component versions (override with --build-arg).
# Git ref checked out in the aws-ofi-nccl clone.
ARG AWS_OFI_NCCL_VERSION=v1.7.3-aws
# Selects the aws-efa-installer tarball that is downloaded.
ARG EFA_INSTALLER_VERSION=1.29.1
# NCCL is cloned at tag v${NCCL_VERSION}-1.
ARG NCCL_VERSION=2.18.5
# Not referenced by any instruction visible in this file.
ARG NCCL_TESTS_VERSION=master

# Remove the InfiniBand/NCCL packages and any HPC-X / MPI trees found in the
# base image (presumably so that only the EFA installer's libraries and the
# source-built NCCL below are present -- confirm).
RUN apt-get update -y \
    && apt-get remove -y --allow-change-held-packages \
        libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev \
    && rm -rf /opt/hpcx /usr/local/mpi \
    && rm -f /etc/ld.so.conf.d/hpcx.conf \
    && ldconfig
ENV OPAL_PREFIX=

# Build tools and utilities. `apt-get update` runs in the same layer as the
# install so a cached package-index layer is never paired with a changed
# package list (the classic stale-cache failure).
# NOTE(review): --allow-unauthenticated disables package signature checks;
# kept from the original, drop it if the configured repositories are signed.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
        apt-utils \
        autoconf \
        automake \
        build-essential \
        check \
        cmake \
        curl \
        debhelper \
        devscripts \
        gcc \
        gdb \
        git \
        kmod \
        libsubunit-dev \
        libtool \
        openssh-client \
        openssh-server \
        pkg-config \
        python3-distutils \
        vim

# Create the sshd runtime directory.
RUN mkdir -p /var/run/sshd
# Relax SSH checks inside the container: set StrictHostKeyChecking to "no",
# send known hosts to /dev/null, and set StrictModes to "no" for sshd.
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
# Library and binary search paths for CUPTI, the EFA-provided Open MPI and
# libfabric, the source-built NCCL and the aws-ofi-nccl plugin.
# Written in key=value form; the space-separated `ENV key value` form is deprecated.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH
ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH

# Bootstrap pip with get-pip.py, then install the AWS CLI and pynvml.
# -f makes curl fail on an HTTP error instead of saving the error page and
# handing it to python3; the bootstrap script and pip's download cache are
# removed in the same layer so they do not stay in the image.
RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
    && python3 /tmp/get-pip.py \
    && pip3 install --no-cache-dir awscli pynvml \
    && rm -f /tmp/get-pip.py

#################################################
## Install NVIDIA GDRCopy
#RUN git clone https://github.com/NVIDIA/gdrcopy.git /opt/gdrcopy \
# && cd /opt/gdrcopy \
# && make lib_install install \
# && cd /opt/gdrcopy/tests \
# && make \
# && mv copylat copybw sanity apiperf /usr/bin/

#################################################
## Install EFA installer
# Downloads aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz, runs
# efa_installer.sh, then deletes both the unpacked directory and the tarball
# in the same layer so neither is left in the image.
# -f makes curl fail on an HTTP error rather than saving an error page as the tarball.
RUN cd $HOME \
    && curl -fsSLO https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && rm -rf $HOME/aws-efa-installer $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz

###################################################
## Build NCCL from source
# Clones tag v${NCCL_VERSION}-1 into /opt/nccl and runs the src.build target
# there for the compute_80, compute_86 and compute_90 architectures.
RUN git clone https://github.com/NVIDIA/nccl -b v${NCCL_VERSION}-1 /opt/nccl \
    && make -C /opt/nccl -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
        NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"

###################################################
## Install AWS-OFI-NCCL plugin
# Build prerequisites. `apt-get update` is run in this layer so the install
# does not depend on a package index left behind by an earlier, possibly
# cached, layer.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        autoconf \
        build-essential \
        check \
        cmake \
        curl \
        debhelper \
        devscripts \
        dkms \
        fakeroot \
        htop \
        hwloc \
        iftop \
        ipcalc \
        libgfortran5 \
        libhwloc-dev \
        libjemalloc-dev \
        libjemalloc2 \
        liblapack-dev \
        libnuma-dev \
        libsubunit-dev \
        libtool \
        nasm \
        nfs-common \
        numactl \
        parallel \
        pigz \
        pkg-config \
        preload \
        unzip \
        wget
# Clone the plugin at ${AWS_OFI_NCCL_VERSION} and build it against the EFA
# libfabric, CUDA, the NCCL build in /opt/nccl/build and the EFA Open MPI;
# the result is installed under /opt/aws-ofi-nccl/install.
RUN export OPAL_PREFIX="" \
    && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
    && cd /opt/aws-ofi-nccl \
    && git checkout ${AWS_OFI_NCCL_VERSION} \
    && ./autogen.sh \
    && ./configure --prefix=/opt/aws-ofi-nccl/install \
        --with-libfabric=/opt/amazon/efa/ \
        --with-cuda=/usr/local/cuda \
        --with-nccl=/opt/nccl/build \
        --with-mpi=/opt/amazon/openmpi/ \
    && make -j $(nproc) && make install
###################################################
## Install fsdp (llama-recipes)

# WORKDIR creates the directory when it is missing, so no separate mkdir is needed.
WORKDIR /workspace

# NOTE(review): the clone is not pinned to a tag or commit, so each build
# takes whatever the default branch contains at that moment.
#RUN git clone -b flop_counter https://github.com/facebookresearch/llama-recipes.git
#RUN git clone -b flop_counter_gc https://github.com/facebookresearch/llama-recipes.git
RUN git clone https://github.com/facebookresearch/llama-recipes.git

WORKDIR /workspace/llama-recipes

# --no-cache-dir on every pip call keeps pip's download cache out of the image layers.
RUN pip3 install --no-cache-dir -U pip setuptools

# Pinned versions are installed before the project's own requirements.
RUN pip3 install --no-cache-dir fsspec==2023.1.0
RUN pip3 install --no-cache-dir huggingface_hub==0.17.0
RUN pip3 install --no-cache-dir -r requirements.txt

# Editable install of llama-recipes itself.
RUN pip3 install --no-cache-dir -e .

RUN pip3 install --no-cache-dir tabulate

RUN pip3 install --no-cache-dir protobuf

RUN pip3 install --no-cache-dir python-etcd

#RUN pip3 uninstall -y torch
#RUN pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121

# Appends the llama-recipes sources to any PYTHONPATH already set.
ENV PYTHONPATH="${PYTHONPATH}:/workspace/llama-recipes/src"
35 changes: 35 additions & 0 deletions 4.validation_and_observability/5.nsight/EKS/custom_values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Values passed to the devtools-sidecar-injector Helm chart with
# `helm install -f custom_values.yaml`.
#
# If no Nsight image is specified here, version 2024.2 is used by default.
# This file selects the 2024.4.1 image instead (2024.4 was planned for release
# by 5/24/2024 when this was written).
# NOTE(review): Helm does not expand ${REGISTRY} / ${REGION} in a values file;
# presumably they must be substituted before installing -- confirm.
devtoolBinariesImage:
image: ${REGISTRY}.dkr.ecr.${REGION}.amazonaws.com/nsight-systems-cli:2024.4.1-ubuntu22.04
imagePullPolicy: Always

# Assumes the EKS cluster has an FSx for Lustre filesystem available through
# the "fsx-pvc" claim. It is mounted at /fsx_shared, where the Nsight reports
# are saved.
profile:
volumes:
[
{
"name": "nsys-output-volume",
"persistentVolumeClaim": { "claimName": "fsx-pvc" }
}
]
volumeMounts:
[
{
"name": "nsys-output-volume",
"mountPath": "/fsx_shared"
}
]

# CLI options: https://docs.nvidia.com/nsight-systems/UserGuide/index.html#cli-command-switches
# The --delay and --duration values are in seconds.

# Use %{} to include environment variables in the Nsight report filename.

# Arguments passed to Nsight Systems. The placeholders are replaced with the
# actual values; the report is written below /fsx_shared/fsdp.
devtoolArgs: "profile --force-overwrite true --trace nvtx,cuda --delay 150 --duration 60 \
-o /fsx_shared/fsdp/auto_{PROCESS_NAME}_%{POD_FULLNAME}_%{CONTAINER_NAME}_{TIMESTAMP}_{UID}.nsys-rep"

# Regular expression selecting what gets profiled; presumably matched against
# the process command line -- confirm. The commented alternative below matches
# any command line containing "torchrun".
injectionMatch: "^/usr/bin/python3 /usr/local/bin/torchrun.*$"
#injectionMatch: "^.*torchrun.*$"

77 changes: 77 additions & 0 deletions 4.validation_and_observability/5.nsight/EKS/fsdp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Kubeflow PyTorchJob named "fsdp": runs the llama-recipes fine-tuning script
# with FSDP for meta-llama/Llama-2-7b-hf on 2 worker replicas.
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
name: fsdp
spec:
# Elastic rendezvous through etcd (host "etcd", port 2379).
elasticPolicy:
rdzvBackend: etcd
rdzvHost: etcd
rdzvPort: 2379
minReplicas: 1
maxReplicas: 96
maxRestarts: 100
#metrics:
# - type: Resource
# resource:
# name: cpu
# target:
# type: Utilization
# averageUtilization: 80
pytorchReplicaSpecs:
Worker:
replicas: 2
restartPolicy: OnFailure
template:
metadata:
labels:
app: fsdp
# Same label that the label-namespace script applies to the namespace;
# presumably what the Nsight sidecar injector selects on -- confirm.
nvidia-devtools-sidecar-injector: enabled
spec:
volumes:
# The host's /dev/shm is shared with the container (emptyDir alternative kept below).
- name: shmem
#emptyDir:
# medium: Memory
hostPath:
path: /dev/shm
#nodeSelector:
# node.kubernetes.io/instance-type: "p5.48xlarge"
containers:
- name: pytorch
image: 159553542841.dkr.ecr.us-west-2.amazonaws.com/fsdp:llama2-efa-main-02-13
imagePullPolicy: Always
resources:
# NOTE(review): no value is shown for nvidia.com/gpu in requests or limits.
# The torchrun command below starts 8 processes per node, so 8 is
# presumably intended -- confirm before applying.
requests:
nvidia.com/gpu:
vpc.amazonaws.com/efa: 4
limits:
nvidia.com/gpu:
vpc.amazonaws.com/efa: 4
env:
# for P5 FI_* should be commented out
#- name: LOGLEVEL
# value: "DEBUG"
- name: FI_PROVIDER
value: efa
- name: FI_EFA_USE_DEVICE_RDMA
value: "1"
- name: FI_EFA_FORK_SAFE
value: "1"
- name: FI_LOG_LEVEL
value: "1"
- name: FI_EFA_ENABLE_SHM_TRANSFER
value: "1"
#- name: NCCL_DEBUG
# value: "INFO"
- name: NCCL_ASYNC_ERROR_HANDLING
value: "1"
#- name: NCCL_IGNORE_DISABLED_P2P
# value: "1"
# Placeholder: replace <HF_token> with a Hugging Face token at deploy time
# (preferably from a Kubernetes Secret); never commit a real token.
- name: HF_TOKEN
value: <HF_token>
# --nnodes=2 matches "replicas: 2" above; --nproc_per_node=8 starts 8 processes per pod.
command:
- bash
- -c
- "torchrun --nproc_per_node=8 --nnodes=2 examples/finetuning.py --num_epochs=1 --batch_size_training=3 --enable_fsdp --pure_bf16 --model_name meta-llama/Llama-2-7b-hf --output_dir ."
volumeMounts:
- name: shmem
mountPath: /dev/shm
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 4 additions & 0 deletions 4.validation_and_observability/5.nsight/EKS/install-injector
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash -x

# Install the devtools-sidecar-injector chart (1.0.0 archive from
# helm.ngc.nvidia.com) as release "devtools-sidecar-injector", applying the
# overrides in custom_values.yaml from the current directory.
helm install devtools-sidecar-injector \
    https://helm.ngc.nvidia.com/nvidia/devtools/charts/devtools-sidecar-injector-1.0.0.tgz \
    --values custom_values.yaml
2 changes: 2 additions & 0 deletions 4.validation_and_observability/5.nsight/EKS/label-namespace
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash -x
# Add the nvidia-devtools-sidecar-injector=enabled label to a namespace.
#
# Usage: label-namespace [namespace]    (default: example-ns)

# The namespace is a plain default here. Writing it as "${example-ns}" would be
# read by bash as "the value of $example, or the string 'ns' when it is unset",
# so the wrong namespace would be labelled.
NAMESPACE="${1:-example-ns}"

kubectl label namespaces "${NAMESPACE}" nvidia-devtools-sidecar-injector=enabled --overwrite=true
10 changes: 10 additions & 0 deletions 4.validation_and_observability/5.nsight/EKS/move_report
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash -x

# Copy one Nsight report out of the "fsx-share-test" pod to a local directory,
# then upload that local copy to S3.
#
# Usage:       move_report [report-file-name]
# Environment: S3_BUCKET  (required) destination bucket name
#              DEST_DIR   (optional) local directory that receives the report
#
# kubectl cp -n <namespace> <pod-name>:<path> <destination-on-local-system>

# Fail early rather than attempting an upload to "s3://".
: "${S3_BUCKET:?S3_BUCKET must be set to the destination bucket name}"

FILE="${1:-auto_python3_default_fsdp-worker-1_pytorch_1715996702335_5a061871.nsys-rep}"
DEST_DIR="${DEST_DIR:-/eks/deployment/distributed-training/pytorch/pytorchjob/fsdp}"

# Do not attempt the upload when the copy itself failed.
kubectl cp "fsx-share-test:fsx_shared/fsdp/${FILE}" "${DEST_DIR}/${FILE}" || exit 1

# Upload the file from where kubectl cp wrote it, so the script works from any
# working directory.
aws s3 cp "${DEST_DIR}/${FILE}" "s3://${S3_BUCKET}"


17 changes: 17 additions & 0 deletions 4.validation_and_observability/5.nsight/EKS/uniinstall-injector
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash -x

# Remove the devtools sidecar injector: the Helm release, its namespace, both
# mutating webhook configurations, and the injector ConfigMaps in "example-ns"
# and in the current namespace.

helm uninstall devtools-sidecar-injector

kubectl delete namespace nvidia-devtools-sidecar-injector

kubectl delete mutatingwebhookconfigurations sidecar-injector-webhook
kubectl delete mutatingwebhookconfiguration nvidia-devtools-sidecar-injector-webhook

# ConfigMaps in the example namespace first ...
for cm_name in nvidia-devtools-sidecar-injector nvidia-devtools-sidecar-injector-custom; do
    kubectl delete cm -n example-ns "${cm_name}"
done

# ... then the same two names in the current namespace.
for cm_name in nvidia-devtools-sidecar-injector nvidia-devtools-sidecar-injector-custom; do
    kubectl delete cm "${cm_name}"
done

#kubectl get all --all-namespaces -l nvidia-devtools-sidecar-injector=enabled -o custom-columns=:.metadata.name,NS:.metadata.namespace,KIND:.kind --no-headers | while read name namespace >
Loading

0 comments on commit 44e448e

Please sign in to comment.