-
Notifications
You must be signed in to change notification settings - Fork 92
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Added NCCL Nsight files Signed-off-by: Ankur Srivastava <[email protected]> * Added NCCL Nsight files Signed-off-by: Ankur Srivastava <[email protected]> * Updateed nccl and readme and fsdp Signed-off-by: Ankur Srivastava <[email protected]> * Updated NCCL Signed-off-by: Ankur Srivastava <[email protected]> * Added almost everything Signed-off-by: Ankur Srivastava <[email protected]> * Removed token Signed-off-by: Ankur Srivastava <[email protected]> --------- Signed-off-by: Ankur Srivastava <[email protected]>
- Loading branch information
Showing
22 changed files
with
1,269 additions
and
19 deletions.
There are no files selected for viewing
20 changes: 20 additions & 0 deletions
20
4.validation_and_observability/5.nsight/2.generate_recipes.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
|
||
/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_api_sum --input ${NSIGHT_REPORT_NAME}.nsys-rep | ||
|
||
/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_api_sync --input ${NSIGHT_REPORT_NAME}.nsys-rep | ||
|
||
/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_gpu_kern_pace --input ${NSIGHT_REPORT_NAME}.nsys-rep --name ncclDevKernel_ReduceScatter_Sum_f32_RING_LL | ||
|
||
/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_gpu_kern_pace --input ${NSIGHT_REPORT_NAME}.nsys-rep --name ncclDevKernel_AllGather_RING_LL | ||
|
||
/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_gpu_kern_sum --input ${NSIGHT_REPORT_NAME}.nsys-rep | ||
|
||
/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_gpu_mem_size_sum --input ${NSIGHT_REPORT_NAME}.nsys-rep | ||
|
||
/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_gpu_mem_time_sum --input ${NSIGHT_REPORT_NAME}.nsys-rep | ||
|
||
/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_gpu_time_util_map --input ${NSIGHT_REPORT_NAME}.nsys-rep | ||
|
||
/fsx/nsight-efa/target-linux-x64/nsys recipe nccl_sum --input ${NSIGHT_REPORT_NAME}.nsys-rep | ||
|
||
/fsx/nsight-efa/target-linux-x64/nsys recipe nccl_gpu_time_util_map --input ${NSIGHT_REPORT_NAME}.nsys-rep |
198 changes: 198 additions & 0 deletions
198
4.validation_and_observability/5.nsight/EKS/Dockerfile.llama2-efa
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
apiVersion: "kubeflow.org/v1" | ||
kind: PyTorchJob | ||
metadata: | ||
name: fsdp | ||
spec: | ||
elasticPolicy: | ||
rdzvBackend: etcd | ||
rdzvHost: etcd | ||
rdzvPort: 2379 | ||
minReplicas: 1 | ||
maxReplicas: 96 | ||
maxRestarts: 100 | ||
#metrics: | ||
# - type: Resource | ||
# resource: | ||
# name: cpu | ||
# target: | ||
# type: Utilization | ||
# averageUtilization: 80 | ||
pytorchReplicaSpecs: | ||
Worker: | ||
replicas: 2 | ||
restartPolicy: OnFailure | ||
template: | ||
metadata: | ||
labels: | ||
app: fsdp | ||
nvidia-devtools-sidecar-injector: enabled | ||
spec: | ||
volumes: | ||
- name: shmem | ||
#emptyDir: | ||
# medium: Memory | ||
hostPath: | ||
path: /dev/shm | ||
#nodeSelector: | ||
# node.kubernetes.io/instance-type: "p5.48xlarge" | ||
containers: | ||
- name: pytorch | ||
image: 159553542841.dkr.ecr.us-west-2.amazonaws.com/fsdp:llama2-efa-main-02-13 | ||
imagePullPolicy: Always | ||
resources: | ||
requests: | ||
nvidia.com/gpu: | ||
vpc.amazonaws.com/efa: 4 | ||
limits: | ||
nvidia.com/gpu: | ||
vpc.amazonaws.com/efa: 4 | ||
env: | ||
# for P5 FI_* should be commented out | ||
#- name: LOGLEVEL | ||
# value: "DEBUG" | ||
- name: FI_PROVIDER | ||
value: efa | ||
- name: FI_EFA_USE_DEVICE_RDMA | ||
value: "1" | ||
- name: FI_EFA_FORK_SAFE | ||
value: "1" | ||
- name: FI_LOG_LEVEL | ||
value: "1" | ||
- name: FI_EFA_ENABLE_SHM_TRANSFER | ||
value: "1" | ||
#- name: NCCL_DEBUG | ||
# value: "INFO" | ||
- name: NCCL_ASYNC_ERROR_HANDLING | ||
value: "1" | ||
#- name: NCCL_IGNORE_DISABLED_P2P | ||
# value: "1" | ||
- name: HF_TOKEN | ||
value: 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 | ||
command: | ||
- bash | ||
- -c | ||
- "torchrun --nproc_per_node=8 --nnodes=2 examples/finetuning.py --num_epochs=1 --batch_size_training=3 --enable_fsdp --pure_bf16 --model_name meta-llama/Llama-2-7b-hf --output_dir ." | ||
volumeMounts: | ||
- name: shmem | ||
mountPath: /dev/shm | ||
root@cb9511473ccc:/eks/deployment/distributed-training/pytorch/pytorchjob/fsdp# cat Dockerfile.llama2-efa | ||
FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 | ||
|
||
ARG EFA_INSTALLER_VERSION=1.29.1 | ||
ARG AWS_OFI_NCCL_VERSION=v1.7.3-aws | ||
ARG NCCL_TESTS_VERSION=master | ||
ARG NCCL_VERSION=2.18.5 | ||
|
||
RUN apt-get update -y | ||
RUN apt-get remove -y --allow-change-held-packages \ | ||
libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev | ||
|
||
RUN rm -rf /opt/hpcx \ | ||
&& rm -rf /usr/local/mpi \ | ||
&& rm -f /etc/ld.so.conf.d/hpcx.conf \ | ||
&& ldconfig | ||
ENV OPAL_PREFIX= | ||
|
||
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ | ||
git \ | ||
gcc \ | ||
vim \ | ||
kmod \ | ||
openssh-client \ | ||
openssh-server \ | ||
build-essential \ | ||
curl \ | ||
autoconf \ | ||
libtool \ | ||
gdb \ | ||
automake \ | ||
python3-distutils \ | ||
cmake \ | ||
apt-utils \ | ||
devscripts \ | ||
debhelper \ | ||
libsubunit-dev \ | ||
check \ | ||
pkg-config | ||
|
||
RUN mkdir -p /var/run/sshd | ||
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ | ||
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ | ||
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config | ||
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH | ||
ENV PATH /opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH | ||
|
||
RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \ | ||
&& python3 /tmp/get-pip.py \ | ||
&& pip3 install awscli pynvml | ||
|
||
################################################# | ||
## Install NVIDIA GDRCopy | ||
#RUN git clone https://github.com/NVIDIA/gdrcopy.git /opt/gdrcopy \ | ||
# && cd /opt/gdrcopy \ | ||
# && make lib_install install \ | ||
# && cd /opt/gdrcopy/tests \ | ||
# && make \ | ||
# && mv copylat copybw sanity apiperf /usr/bin/ | ||
|
||
################################################# | ||
## Install EFA installer | ||
RUN cd $HOME \ | ||
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ | ||
&& tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ | ||
&& cd aws-efa-installer \ | ||
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ | ||
&& rm -rf $HOME/aws-efa-installer | ||
|
||
################################################### | ||
## Install NCCL | ||
RUN git clone https://github.com/NVIDIA/nccl -b v${NCCL_VERSION}-1 /opt/nccl \ | ||
&& cd /opt/nccl \ | ||
&& make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \ | ||
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90" | ||
|
||
################################################### | ||
## Install AWS-OFI-NCCL plugin | ||
RUN apt-get install libtool autoconf cmake nasm unzip pigz parallel nfs-common build-essential hwloc libhwloc-dev libjemalloc2 libnuma-dev numactl libjemalloc-dev preload htop iftop liblapack-dev libgfortran5 ipcalc wget curl devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -y | ||
RUN export OPAL_PREFIX="" \ | ||
&& git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \ | ||
&& cd /opt/aws-ofi-nccl \ | ||
&& git checkout ${AWS_OFI_NCCL_VERSION} \ | ||
&& ./autogen.sh \ | ||
&& ./configure --prefix=/opt/aws-ofi-nccl/install \ | ||
--with-libfabric=/opt/amazon/efa/ \ | ||
--with-cuda=/usr/local/cuda \ | ||
--with-nccl=/opt/nccl/build \ | ||
--with-mpi=/opt/amazon/openmpi/ \ | ||
&& make -j $(nproc) && make install | ||
################################################### | ||
## Install fsdp | ||
|
||
RUN mkdir -p /workspace/ | ||
|
||
WORKDIR /workspace | ||
|
||
#RUN git clone -b flop_counter https://github.com/facebookresearch/llama-recipes.git | ||
#RUN git clone -b flop_counter_gc https://github.com/facebookresearch/llama-recipes.git | ||
RUN git clone https://github.com/facebookresearch/llama-recipes.git | ||
|
||
WORKDIR /workspace/llama-recipes | ||
|
||
RUN pip3 install -U pip setuptools | ||
|
||
RUN pip3 install fsspec==2023.1.0 | ||
RUN pip3 install huggingface_hub==0.17.0 | ||
RUN pip3 install -r requirements.txt | ||
|
||
RUN pip3 install -e . | ||
|
||
RUN pip3 install tabulate | ||
|
||
RUN pip3 install protobuf | ||
|
||
RUN pip3 install python-etcd | ||
|
||
#RUN pip3 uninstall -y torch | ||
#RUN pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121 | ||
|
||
ENV PYTHONPATH="${PYTHONPATH}:/workspace/llama-recipes/src" |
35 changes: 35 additions & 0 deletions
35
4.validation_and_observability/5.nsight/EKS/custom_values.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# If we dont specify the Nsight image, 2024.2 version is used by default. | ||
# Will use 2024.4 version which is planned to be released by 5/24/2024 | ||
devtoolBinariesImage: | ||
image: ${REGISTRY}.dkr.ecr.${REGION}.amazonaws.com/nsight-systems-cli:2024.4.1-ubuntu22.04 | ||
imagePullPolicy: Always | ||
|
||
# Assuming EKS cluster has a FSx for Lustre filesystem mounted on it. Nsight reports will be saved in /fsx_shared | ||
profile: | ||
volumes: | ||
[ | ||
{ | ||
"name": "nsys-output-volume", | ||
"persistentVolumeClaim": { "claimName": "fsx-pvc" } | ||
} | ||
] | ||
volumeMounts: | ||
[ | ||
{ | ||
"name": "nsys-output-volume", | ||
"mountPath": "/fsx_shared" | ||
} | ||
] | ||
|
||
# CLI options: https://docs.nvidia.com/nsight-systems/UserGuide/index.html#cli-command-switches | ||
# delay and duration values in secs | ||
|
||
# Use %{} to include environment variables in the Nsight report filename | ||
|
||
# The arguments for the Nsight Systems. The placeholders will be replaced with the actual values. | ||
devtoolArgs: "profile --force-overwrite true --trace nvtx,cuda --delay 150 --duration 60 \ | ||
-o /fsx_shared/fsdp/auto_{PROCESS_NAME}_%{POD_FULLNAME}_%{CONTAINER_NAME}_{TIMESTAMP}_{UID}.nsys-rep" | ||
|
||
injectionMatch: "^/usr/bin/python3 /usr/local/bin/torchrun.*$" | ||
#injectionMatch: "^.*torchrun.*$" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
apiVersion: "kubeflow.org/v1" | ||
kind: PyTorchJob | ||
metadata: | ||
name: fsdp | ||
spec: | ||
elasticPolicy: | ||
rdzvBackend: etcd | ||
rdzvHost: etcd | ||
rdzvPort: 2379 | ||
minReplicas: 1 | ||
maxReplicas: 96 | ||
maxRestarts: 100 | ||
#metrics: | ||
# - type: Resource | ||
# resource: | ||
# name: cpu | ||
# target: | ||
# type: Utilization | ||
# averageUtilization: 80 | ||
pytorchReplicaSpecs: | ||
Worker: | ||
replicas: 2 | ||
restartPolicy: OnFailure | ||
template: | ||
metadata: | ||
labels: | ||
app: fsdp | ||
nvidia-devtools-sidecar-injector: enabled | ||
spec: | ||
volumes: | ||
- name: shmem | ||
#emptyDir: | ||
# medium: Memory | ||
hostPath: | ||
path: /dev/shm | ||
#nodeSelector: | ||
# node.kubernetes.io/instance-type: "p5.48xlarge" | ||
containers: | ||
- name: pytorch | ||
image: 159553542841.dkr.ecr.us-west-2.amazonaws.com/fsdp:llama2-efa-main-02-13 | ||
imagePullPolicy: Always | ||
resources: | ||
requests: | ||
nvidia.com/gpu: | ||
vpc.amazonaws.com/efa: 4 | ||
limits: | ||
nvidia.com/gpu: | ||
vpc.amazonaws.com/efa: 4 | ||
env: | ||
# for P5 FI_* should be commented out | ||
#- name: LOGLEVEL | ||
# value: "DEBUG" | ||
- name: FI_PROVIDER | ||
value: efa | ||
- name: FI_EFA_USE_DEVICE_RDMA | ||
value: "1" | ||
- name: FI_EFA_FORK_SAFE | ||
value: "1" | ||
- name: FI_LOG_LEVEL | ||
value: "1" | ||
- name: FI_EFA_ENABLE_SHM_TRANSFER | ||
value: "1" | ||
#- name: NCCL_DEBUG | ||
# value: "INFO" | ||
- name: NCCL_ASYNC_ERROR_HANDLING | ||
value: "1" | ||
#- name: NCCL_IGNORE_DISABLED_P2P | ||
# value: "1" | ||
- name: HF_TOKEN | ||
value: <HF_token> | ||
command: | ||
- bash | ||
- -c | ||
- "torchrun --nproc_per_node=8 --nnodes=2 examples/finetuning.py --num_epochs=1 --batch_size_training=3 --enable_fsdp --pure_bf16 --model_name meta-llama/Llama-2-7b-hf --output_dir ." | ||
volumeMounts: | ||
- name: shmem | ||
mountPath: /dev/shm |
Binary file added
BIN
+238 KB
4.validation_and_observability/5.nsight/EKS/fsdp_eks_report_screenshot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#!/bin/bash -x | ||
|
||
helm install -f custom_values.yaml \ | ||
devtools-sidecar-injector https://helm.ngc.nvidia.com/nvidia/devtools/charts/devtools-sidecar-injector-1.0.0.tgz |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
#!/bin/bash -x | ||
kubectl label namespaces ${example-ns} nvidia-devtools-sidecar-injector=enabled --overwrite=true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#!/bin/bash -x | ||
|
||
# kubectl cp -n <namespace> <pod-name>:<path> <destination-on-local-system> | ||
|
||
FILE=auto_python3_default_fsdp-worker-1_pytorch_1715996702335_5a061871.nsys-rep | ||
|
||
kubectl cp fsx-share-test:fsx_shared/fsdp/$FILE /eks/deployment/distributed-training/pytorch/pytorchjob/fsdp/$FILE | ||
|
||
aws s3 cp $FILE s3://${S3_BUCKET} | ||
|
17 changes: 17 additions & 0 deletions
17
4.validation_and_observability/5.nsight/EKS/uniinstall-injector
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/bin/bash -x | ||
|
||
|
||
helm uninstall devtools-sidecar-injector | ||
|
||
kubectl delete namespace nvidia-devtools-sidecar-injector | ||
|
||
kubectl delete mutatingwebhookconfigurations sidecar-injector-webhook | ||
kubectl delete mutatingwebhookconfiguration nvidia-devtools-sidecar-injector-webhook | ||
|
||
kubectl delete cm -n example-ns nvidia-devtools-sidecar-injector | ||
kubectl delete cm -n example-ns nvidia-devtools-sidecar-injector-custom | ||
|
||
kubectl delete cm nvidia-devtools-sidecar-injector | ||
kubectl delete cm nvidia-devtools-sidecar-injector-custom | ||
|
||
#kubectl get all --all-namespaces -l nvidia-devtools-sidecar-injector=enabled -o custom-columns=:.metadata.name,NS:.metadata.namespace,KIND:.kind --no-headers | while read name namespace > |
Oops, something went wrong.