From 74db02ccb26a0813b43fa4bd5217cb1609b5ddd0 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 26 Apr 2024 11:11:26 +0000 Subject: [PATCH 01/15] WIP --- .azure-pipelines/multi-nodes-test.yml | 2 +- test/deploy/hostfile | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 7c9d35094..59f3a449d 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -14,7 +14,7 @@ jobs: cuda12: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 pool: - name: mscclpp-it + vmImage: ubuntu-latest container: image: $[ variables['containerImage'] ] diff --git a/test/deploy/hostfile b/test/deploy/hostfile index b1bfc1df3..a65c39b81 100644 --- a/test/deploy/hostfile +++ b/test/deploy/hostfile @@ -1,2 +1,2 @@ -azureuser@mscclit-000000 -azureuser@mscclit-000001 +azureuser@20.94.161.116:50000 +azureuser@20.94.161.116:50000 From 96444af222853beba781215c476b720d2125ccf1 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 26 Apr 2024 11:20:35 +0000 Subject: [PATCH 02/15] WIP --- .azure-pipelines/multi-nodes-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 59f3a449d..ed4beb790 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -34,7 +34,7 @@ jobs: name: SshKeyFile displayName: Download key file inputs: - secureFile: ssh.key + secureFile: mscclpp-ssh.key - task: Bash@3 name: InstallPackages From 13afe85d03961c97e670284bf43907b1f915f34f Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 26 Apr 2024 14:45:31 +0000 Subject: [PATCH 03/15] WIP --- .azure-pipelines/multi-nodes-test.yml | 41 ++++++++++++++++++--------- test/deploy/hostfile | 4 +-- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index ed4beb790..39906fbbb 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -10,13 +10,17 @@ jobs: strategy: matrix: cuda11: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 + containerImage: mscclpp.azurecr.io/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 + containerImage: mscclpp.azurecr.io/mscclpp:base-dev-cuda12.2 pool: vmImage: ubuntu-latest container: image: $[ variables['containerImage'] ] + endpoint: msccl-it + + variables: + hostName: "mscclit-vmss:50000" steps: - task: Bash@3 @@ -56,6 +60,15 @@ jobs: inlineScript: | az vmss start --name mscclit-vmss --resource-group msccl-IT + - task: Bash@3 + name: UpdateHostFile + displayName: Update host file + inputs: + targetType: 'inline' + script: | + set -e + echo "$(MSCCLPP-VMSS-IP) mscclit-vmss" | sudo tee -a /etc/hosts + - task: Bash@3 name: DeployTestEnv displayName: Deploy Test Env @@ -76,10 +89,10 @@ jobs: KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} rm -rf output/* mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & + touch output/mscclit-vmss + tail -f output/mscclit-vmss & CHILD_PID=$! - parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ + parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' kill $CHILD_PID @@ -95,10 +108,10 @@ jobs: KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} rm -rf output/* mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & + touch output/mscclit-vmss + tail -f output/mscclit-vmss & CHILD_PID=$! - parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ + parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' kill $CHILD_PID @@ -114,10 +127,10 @@ jobs: KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} rm -rf output/* mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & + touch output/mscclit-vmss + tail -f output/mscclit-vmss & CHILD_PID=$! - parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ + parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' kill $CHILD_PID @@ -133,10 +146,10 @@ jobs: KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} rm -rf output/* mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & + touch output/mscclit-vmss + tail -f output/mscclit-vmss & CHILD_PID=$! - parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ + parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' kill $CHILD_PID diff --git a/test/deploy/hostfile b/test/deploy/hostfile index a65c39b81..6260cc6ee 100644 --- a/test/deploy/hostfile +++ b/test/deploy/hostfile @@ -1,2 +1,2 @@ -azureuser@20.94.161.116:50000 -azureuser@20.94.161.116:50000 +azureuser@mscclit-vmss:50000 +azureuser@mscclit-vmss:50001 From 7b3b78e9aeb3913b39c179384085a438b9a8e3a9 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 26 Apr 2024 14:48:38 +0000 Subject: [PATCH 04/15] WIP --- .azure-pipelines/multi-nodes-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 39906fbbb..99b4a0800 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -17,7 +17,7 @@ jobs: vmImage: ubuntu-latest container: image: $[ variables['containerImage'] ] - endpoint: msccl-it + endpoint: mscclpp-acr variables: hostName: "mscclit-vmss:50000" From 4d774bc71b5da4f53769aa677649d01923047f69 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 26 Apr 2024 15:28:09 +0000 Subject: [PATCH 05/15] WIP --- .azure-pipelines/multi-nodes-test.yml | 5 ++--- docker/base-dev-x.dockerfile | 2 +- docker/base-x.dockerfile | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 99b4a0800..c60cfacf1 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -10,14 +10,13 @@ jobs: strategy: matrix: cuda11: - containerImage: mscclpp.azurecr.io/mscclpp:base-dev-cuda11.8 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: mscclpp.azurecr.io/mscclpp:base-dev-cuda12.2 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 pool: vmImage: ubuntu-latest container: image: $[ variables['containerImage'] ] - endpoint: mscclpp-acr variables: hostName: "mscclit-vmss:50000" diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile index d7f2166f1..d6236cd02 100644 --- a/docker/base-dev-x.dockerfile +++ b/docker/base-dev-x.dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 +ARG BASE_IMAGE FROM ${BASE_IMAGE} LABEL maintainer="MSCCL++" diff --git a/docker/base-x.dockerfile b/docker/base-x.dockerfile index bf29f718a..4be89c9d0 100644 --- a/docker/base-x.dockerfile +++ b/docker/base-x.dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=nvidia/cuda:12.1.1-devel-ubuntu20.04 +ARG BASE_IMAGE FROM ${BASE_IMAGE} LABEL maintainer="MSCCL++" From 47ab51d46353f48a231ca7412e95c36036b6752e Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 27 Apr 2024 02:42:24 +0000 Subject: [PATCH 06/15] WIP --- .../multi-nodes-steps-template.yml | 141 +++++++++++++++ .azure-pipelines/multi-nodes-test.yml | 170 +++--------------- 2 files changed, 161 insertions(+), 150 deletions(-) create mode 100644 .azure-pipelines/multi-nodes-steps-template.yml diff --git a/.azure-pipelines/multi-nodes-steps-template.yml b/.azure-pipelines/multi-nodes-steps-template.yml new file mode 100644 index 000000000..673e0a457 --- /dev/null +++ b/.azure-pipelines/multi-nodes-steps-template.yml @@ -0,0 +1,141 @@ +steps: +- task: Bash@3 + name: Build + displayName: Build + inputs: + targetType: 'inline' + script: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. + make -j + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: DownloadSecureFile@1 + name: SshKeyFile + displayName: Download key file + inputs: + secureFile: mscclpp-ssh.key + +- task: Bash@3 + name: InstallPackages + displayName: Install Packages + inputs: + targetType: 'inline' + script: | + sudo apt-get update -y + sudo apt-get install pssh -y + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + +- task: AzureCLI@2 + name: StartVMSS + displayName: Start VMSS + inputs: + azureSubscription: msccl-it + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss start --name mscclit-vmss --resource-group msccl-IT + +- task: Bash@3 + name: UpdateHostFile + displayName: Update host file + inputs: + targetType: 'inline' + script: | + set -e + echo "$(MSCCLPP-VMSS-IP) mscclit-vmss" | sudo tee -a /etc/hosts + +- task: Bash@3 + name: DeployTestEnv + displayName: Deploy Test Env + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: RunMscclppTest + displayName: Run multi-nodes mscclpp-test + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-vmss + tail -f output/mscclit-vmss & + CHILD_PID=$! + parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' + kill $CHILD_PID + +- task: Bash@3 + name: RunMultiNodeUnitTest + displayName: Run multi-nodes unit tests + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-vmss + tail -f output/mscclit-vmss & + CHILD_PID=$! + parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' + kill $CHILD_PID + +- task: Bash@3 + name: RunMultiNodePythonTests + displayName: Run multi-nodes python tests + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-vmss + tail -f output/mscclit-vmss & + CHILD_PID=$! + parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' + kill $CHILD_PID + +- task: Bash@3 + name: RunMultiNodePythonBenchmark + displayName: Run multi-nodes python benchmark + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-vmss + tail -f output/mscclit-vmss & + CHILD_PID=$! + parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' + kill $CHILD_PID + +- task: AzureCLI@2 + name: StopVMSS + displayName: Deallocate VMSS + condition: always() + inputs: + azureSubscription: msccl-it + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss deallocate --name mscclit-vmss --resource-group msccl-IT diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index c60cfacf1..6c0c45c34 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -4,161 +4,31 @@ trigger: # Do not run multi-nodes-test for PR, we can trigger it manually pr: none +variables: + hostName: "mscclit-vmss:50000" + jobs: -- job: MultiNodesTest - displayName: Multi nodes test - strategy: - matrix: - cuda11: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 - cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 +- job: MultiNodesTest-cuda11 + displayName: Multi nodes test cuda11 pool: vmImage: ubuntu-latest container: - image: $[ variables['containerImage'] ] - - variables: - hostName: "mscclit-vmss:50000" + image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 steps: - - task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: mscclpp-ssh.key - - - task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - - - task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name mscclit-vmss --resource-group msccl-IT - - - task: Bash@3 - name: UpdateHostFile - displayName: Update host file - inputs: - targetType: 'inline' - script: | - set -e - echo "$(MSCCLPP-VMSS-IP) mscclit-vmss" | sudo tee -a /etc/hosts - - - task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - workingDirectory: '$(System.DefaultWorkingDirectory)' + - template: multi-nodes-steps-template.yml + parameters: + hostName: ${{ variables.hostName }} - - task: Bash@3 - name: RunMscclppTest - displayName: Run multi-nodes mscclpp-test - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-vmss - tail -f output/mscclit-vmss & - CHILD_PID=$! - parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' - kill $CHILD_PID - - - task: Bash@3 - name: RunMultiNodeUnitTest - displayName: Run multi-nodes unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-vmss - tail -f output/mscclit-vmss & - CHILD_PID=$! - parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' - kill $CHILD_PID - - - task: Bash@3 - name: RunMultiNodePythonTests - displayName: Run multi-nodes python tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-vmss - tail -f output/mscclit-vmss & - CHILD_PID=$! - parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' - kill $CHILD_PID - - - task: Bash@3 - name: RunMultiNodePythonBenchmark - displayName: Run multi-nodes python benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-vmss - tail -f output/mscclit-vmss & - CHILD_PID=$! - parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' - kill $CHILD_PID +- job: MultiNodesTest-cuda12 + displayName: Multi nodes test cuda12 + dependsOn: MultiNodesTest-cuda11 # Make sure the vmss not be used by multiple jobs + pool: + vmImage: ubuntu-latest + container: + image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 - - task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name mscclit-vmss --resource-group msccl-IT + steps: + - template: multi-nodes-steps-template.yml + parameters: + hostName: ${{ variables.hostName }} From 28f9407b2fa08038f0a1ace76f6fc4fb1d311028 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 27 Apr 2024 02:43:35 +0000 Subject: [PATCH 07/15] WIP --- .azure-pipelines/multi-nodes-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 6c0c45c34..07df90eda 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -8,7 +8,7 @@ variables: hostName: "mscclit-vmss:50000" jobs: -- job: MultiNodesTest-cuda11 +- job: MultiNodesTestCuda11 displayName: Multi nodes test cuda11 pool: vmImage: ubuntu-latest @@ -20,7 +20,7 @@ jobs: parameters: hostName: ${{ variables.hostName }} -- job: MultiNodesTest-cuda12 +- job: MultiNodesTestCuda12 displayName: Multi nodes test cuda12 dependsOn: MultiNodesTest-cuda11 # Make sure the vmss not be used by multiple jobs pool: From 8211d35467e52172caaecb686e7d8f8ff017baa1 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 27 Apr 2024 02:44:37 +0000 Subject: [PATCH 08/15] WIP --- .azure-pipelines/multi-nodes-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 07df90eda..07df34d9f 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -22,7 +22,7 @@ jobs: - job: MultiNodesTestCuda12 displayName: Multi nodes test cuda12 - dependsOn: MultiNodesTest-cuda11 # Make sure the vmss not be used by multiple jobs + dependsOn: MultiNodesTestCuda11 # Make sure the vmss not be used by multiple jobs pool: vmImage: ubuntu-latest container: From eb4485e3815a5f66183f5bd972c9922b43a1b4b5 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 27 Apr 2024 03:01:19 +0000 Subject: [PATCH 09/15] debug --- .../multi-nodes-steps-template.yml | 245 +++++++++--------- 1 file changed, 123 insertions(+), 122 deletions(-) diff --git a/.azure-pipelines/multi-nodes-steps-template.yml b/.azure-pipelines/multi-nodes-steps-template.yml index 673e0a457..2dbc01315 100644 --- a/.azure-pipelines/multi-nodes-steps-template.yml +++ b/.azure-pipelines/multi-nodes-steps-template.yml @@ -1,40 +1,40 @@ steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' +# - task: Bash@3 +# name: Build +# displayName: Build +# inputs: +# targetType: 'inline' +# script: | +# mkdir build && cd build +# cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. +# make -j +# workingDirectory: '$(System.DefaultWorkingDirectory)' -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: mscclpp-ssh.key +# - task: DownloadSecureFile@1 +# name: SshKeyFile +# displayName: Download key file +# inputs: +# secureFile: mscclpp-ssh.key -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash +# - task: Bash@3 +# name: InstallPackages +# displayName: Install Packages +# inputs: +# targetType: 'inline' +# script: | +# sudo apt-get update -y +# sudo apt-get install pssh -y +# curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name mscclit-vmss --resource-group msccl-IT +# - task: AzureCLI@2 +# name: StartVMSS +# displayName: Start VMSS +# inputs: +# azureSubscription: msccl-it +# scriptType: bash +# scriptLocation: inlineScript +# inlineScript: | +# az vmss start --name mscclit-vmss --resource-group msccl-IT - task: Bash@3 name: UpdateHostFile @@ -44,98 +44,99 @@ steps: script: | set -e echo "$(MSCCLPP-VMSS-IP) mscclit-vmss" | sudo tee -a /etc/hosts + cat /etc/hosts -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - workingDirectory: '$(System.DefaultWorkingDirectory)' +# - task: Bash@3 +# name: DeployTestEnv +# displayName: Deploy Test Env +# inputs: +# targetType: filePath +# filePath: test/deploy/deploy.sh +# workingDirectory: '$(System.DefaultWorkingDirectory)' -- task: Bash@3 - name: RunMscclppTest - displayName: Run multi-nodes mscclpp-test - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-vmss - tail -f output/mscclit-vmss & - CHILD_PID=$! - parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' - kill $CHILD_PID +# - task: Bash@3 +# name: RunMscclppTest +# displayName: Run multi-nodes mscclpp-test +# inputs: +# targetType: 'inline' +# script: | +# set -e +# HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile +# SSH_OPTION="StrictHostKeyChecking=no" +# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} +# rm -rf output/* +# mkdir -p output +# touch output/mscclit-vmss +# tail -f output/mscclit-vmss & +# CHILD_PID=$! +# parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ +# -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' +# kill $CHILD_PID -- task: Bash@3 - name: RunMultiNodeUnitTest - displayName: Run multi-nodes unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-vmss - tail -f output/mscclit-vmss & - CHILD_PID=$! - parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' - kill $CHILD_PID +# - task: Bash@3 +# name: RunMultiNodeUnitTest +# displayName: Run multi-nodes unit tests +# inputs: +# targetType: 'inline' +# script: | +# set -e +# HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile +# SSH_OPTION="StrictHostKeyChecking=no" +# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} +# rm -rf output/* +# mkdir -p output +# touch output/mscclit-vmss +# tail -f output/mscclit-vmss & +# CHILD_PID=$! +# parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ +# -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' +# kill $CHILD_PID -- task: Bash@3 - name: RunMultiNodePythonTests - displayName: Run multi-nodes python tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-vmss - tail -f output/mscclit-vmss & - CHILD_PID=$! - parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' - kill $CHILD_PID +# - task: Bash@3 +# name: RunMultiNodePythonTests +# displayName: Run multi-nodes python tests +# inputs: +# targetType: 'inline' +# script: | +# set -e +# HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile +# SSH_OPTION="StrictHostKeyChecking=no" +# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} +# rm -rf output/* +# mkdir -p output +# touch output/mscclit-vmss +# tail -f output/mscclit-vmss & +# CHILD_PID=$! +# parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ +# -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' +# kill $CHILD_PID -- task: Bash@3 - name: RunMultiNodePythonBenchmark - displayName: Run multi-nodes python benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-vmss - tail -f output/mscclit-vmss & - CHILD_PID=$! - parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' - kill $CHILD_PID +# - task: Bash@3 +# name: RunMultiNodePythonBenchmark +# displayName: Run multi-nodes python benchmark +# inputs: +# targetType: 'inline' +# script: | +# set -e +# HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile +# SSH_OPTION="StrictHostKeyChecking=no" +# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} +# rm -rf output/* +# mkdir -p output +# touch output/mscclit-vmss +# tail -f output/mscclit-vmss & +# CHILD_PID=$! +# parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ +# -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' +# kill $CHILD_PID -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name mscclit-vmss --resource-group msccl-IT +# - task: AzureCLI@2 +# name: StopVMSS +# displayName: Deallocate VMSS +# condition: always() +# inputs: +# azureSubscription: msccl-it +# scriptType: bash +# scriptLocation: inlineScript +# inlineScript: | +# az vmss deallocate --name mscclit-vmss --resource-group msccl-IT From cedca5f85d5bdc6ed8ba19599a5e6d218bcca89b Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 27 Apr 2024 03:08:25 +0000 Subject: [PATCH 10/15] WIP --- .../multi-nodes-steps-template.yml | 243 +++++++++--------- 1 file changed, 121 insertions(+), 122 deletions(-) diff --git a/.azure-pipelines/multi-nodes-steps-template.yml b/.azure-pipelines/multi-nodes-steps-template.yml index 2dbc01315..7d2b3b455 100644 --- a/.azure-pipelines/multi-nodes-steps-template.yml +++ b/.azure-pipelines/multi-nodes-steps-template.yml @@ -1,40 +1,40 @@ steps: -# - task: Bash@3 -# name: Build -# displayName: Build -# inputs: -# targetType: 'inline' -# script: | -# mkdir build && cd build -# cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. -# make -j -# workingDirectory: '$(System.DefaultWorkingDirectory)' +- task: Bash@3 + name: Build + displayName: Build + inputs: + targetType: 'inline' + script: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. + make -j + workingDirectory: '$(System.DefaultWorkingDirectory)' -# - task: DownloadSecureFile@1 -# name: SshKeyFile -# displayName: Download key file -# inputs: -# secureFile: mscclpp-ssh.key +- task: DownloadSecureFile@1 + name: SshKeyFile + displayName: Download key file + inputs: + secureFile: mscclpp-ssh.key -# - task: Bash@3 -# name: InstallPackages -# displayName: Install Packages -# inputs: -# targetType: 'inline' -# script: | -# sudo apt-get update -y -# sudo apt-get install pssh -y -# curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash +- task: Bash@3 + name: InstallPackages + displayName: Install Packages + inputs: + targetType: 'inline' + script: | + sudo apt-get update -y + sudo apt-get install pssh -y + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash -# - task: AzureCLI@2 -# name: StartVMSS -# displayName: Start VMSS -# inputs: -# azureSubscription: msccl-it -# scriptType: bash -# scriptLocation: inlineScript -# inlineScript: | -# az vmss start --name mscclit-vmss --resource-group msccl-IT +- task: AzureCLI@2 + name: StartVMSS + displayName: Start VMSS + inputs: + azureSubscription: msccl-it + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss start --name mscclit-vmss --resource-group msccl-IT - task: Bash@3 name: UpdateHostFile @@ -44,99 +44,98 @@ steps: script: | set -e echo "$(MSCCLPP-VMSS-IP) mscclit-vmss" | sudo tee -a /etc/hosts - cat /etc/hosts # - task: Bash@3 -# name: DeployTestEnv -# displayName: Deploy Test Env -# inputs: -# targetType: filePath -# filePath: test/deploy/deploy.sh -# workingDirectory: '$(System.DefaultWorkingDirectory)' + name: DeployTestEnv + displayName: Deploy Test Env + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + workingDirectory: '$(System.DefaultWorkingDirectory)' -# - task: Bash@3 -# name: RunMscclppTest -# displayName: Run multi-nodes mscclpp-test -# inputs: -# targetType: 'inline' -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# rm -rf output/* -# mkdir -p output -# touch output/mscclit-vmss -# tail -f output/mscclit-vmss & -# CHILD_PID=$! -# parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' -# kill $CHILD_PID +- task: Bash@3 + name: RunMscclppTest + displayName: Run multi-nodes mscclpp-test + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-vmss + tail -f output/mscclit-vmss & + CHILD_PID=$! + parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' + kill $CHILD_PID -# - task: Bash@3 -# name: RunMultiNodeUnitTest -# displayName: Run multi-nodes unit tests -# inputs: -# targetType: 'inline' -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# rm -rf output/* -# mkdir -p output -# touch output/mscclit-vmss -# tail -f output/mscclit-vmss & -# CHILD_PID=$! -# parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' -# kill $CHILD_PID +- task: Bash@3 + name: RunMultiNodeUnitTest + displayName: Run multi-nodes unit tests + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-vmss + tail -f output/mscclit-vmss & + CHILD_PID=$! + parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' + kill $CHILD_PID -# - task: Bash@3 -# name: RunMultiNodePythonTests -# displayName: Run multi-nodes python tests -# inputs: -# targetType: 'inline' -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# rm -rf output/* -# mkdir -p output -# touch output/mscclit-vmss -# tail -f output/mscclit-vmss & -# CHILD_PID=$! -# parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' -# kill $CHILD_PID +- task: Bash@3 + name: RunMultiNodePythonTests + displayName: Run multi-nodes python tests + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-vmss + tail -f output/mscclit-vmss & + CHILD_PID=$! + parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' + kill $CHILD_PID -# - task: Bash@3 -# name: RunMultiNodePythonBenchmark -# displayName: Run multi-nodes python benchmark -# inputs: -# targetType: 'inline' -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# rm -rf output/* -# mkdir -p output -# touch output/mscclit-vmss -# tail -f output/mscclit-vmss & -# CHILD_PID=$! -# parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' -# kill $CHILD_PID +- task: Bash@3 + name: RunMultiNodePythonBenchmark + displayName: Run multi-nodes python benchmark + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-vmss + tail -f output/mscclit-vmss & + CHILD_PID=$! + parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' + kill $CHILD_PID -# - task: AzureCLI@2 -# name: StopVMSS -# displayName: Deallocate VMSS -# condition: always() -# inputs: -# azureSubscription: msccl-it -# scriptType: bash -# scriptLocation: inlineScript -# inlineScript: | -# az vmss deallocate --name mscclit-vmss --resource-group msccl-IT +- task: AzureCLI@2 + name: StopVMSS + displayName: Deallocate VMSS + condition: always() + inputs: + azureSubscription: msccl-it + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss deallocate --name mscclit-vmss --resource-group msccl-IT From f40bdb2739549f88487098533cc717abeb87e63e Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 27 Apr 2024 03:09:45 +0000 Subject: [PATCH 11/15] WIP --- .azure-pipelines/multi-nodes-steps-template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/multi-nodes-steps-template.yml b/.azure-pipelines/multi-nodes-steps-template.yml index 7d2b3b455..673e0a457 100644 --- a/.azure-pipelines/multi-nodes-steps-template.yml +++ b/.azure-pipelines/multi-nodes-steps-template.yml @@ -45,7 +45,7 @@ steps: set -e echo "$(MSCCLPP-VMSS-IP) mscclit-vmss" | sudo tee -a /etc/hosts -# - task: Bash@3 +- task: Bash@3 name: DeployTestEnv displayName: Deploy Test Env inputs: From 8eae5858bbd564bab10dc4269670b94a0cac16c8 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 27 Apr 2024 07:12:11 +0000 Subject: [PATCH 12/15] revert --- .../multi-nodes-steps-template.yml | 141 ---------------- .azure-pipelines/multi-nodes-test.yml | 156 +++++++++++++++--- 2 files changed, 135 insertions(+), 162 deletions(-) delete mode 100644 .azure-pipelines/multi-nodes-steps-template.yml diff --git a/.azure-pipelines/multi-nodes-steps-template.yml b/.azure-pipelines/multi-nodes-steps-template.yml deleted file mode 100644 index 673e0a457..000000000 --- a/.azure-pipelines/multi-nodes-steps-template.yml +++ /dev/null @@ -1,141 +0,0 @@ -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: mscclpp-ssh.key - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name mscclit-vmss --resource-group msccl-IT - -- task: Bash@3 - name: UpdateHostFile - displayName: Update host file - inputs: - targetType: 'inline' - script: | - set -e - echo "$(MSCCLPP-VMSS-IP) mscclit-vmss" | sudo tee -a /etc/hosts - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunMscclppTest - displayName: Run multi-nodes mscclpp-test - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-vmss - tail -f output/mscclit-vmss & - CHILD_PID=$! - parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' - kill $CHILD_PID - -- task: Bash@3 - name: RunMultiNodeUnitTest - displayName: Run multi-nodes unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-vmss - tail -f output/mscclit-vmss & - CHILD_PID=$! - parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' - kill $CHILD_PID - -- task: Bash@3 - name: RunMultiNodePythonTests - displayName: Run multi-nodes python tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-vmss - tail -f output/mscclit-vmss & - CHILD_PID=$! - parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' - kill $CHILD_PID - -- task: Bash@3 - name: RunMultiNodePythonBenchmark - displayName: Run multi-nodes python benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-vmss - tail -f output/mscclit-vmss & - CHILD_PID=$! - parallel-ssh -t 0 -H $(hostName) -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' - kill $CHILD_PID - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name mscclit-vmss --resource-group msccl-IT diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 07df34d9f..e37a6302f 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -4,31 +4,145 @@ trigger: # Do not run multi-nodes-test for PR, we can trigger it manually pr: none -variables: - hostName: "mscclit-vmss:50000" - jobs: -- job: MultiNodesTestCuda11 - displayName: Multi nodes test cuda11 +- job: MultiNodesTest + displayName: Multi nodes test + strategy: + matrix: + cuda11: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 pool: - vmImage: ubuntu-latest + name: mscclpp-it container: - image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 + image: $[ variables['containerImage'] ] steps: - - template: multi-nodes-steps-template.yml - parameters: - hostName: ${{ variables.hostName }} + - task: Bash@3 + name: Build + displayName: Build + inputs: + targetType: 'inline' + script: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. + make -j + workingDirectory: '$(System.DefaultWorkingDirectory)' -- job: MultiNodesTestCuda12 - displayName: Multi nodes test cuda12 - dependsOn: MultiNodesTestCuda11 # Make sure the vmss not be used by multiple jobs - pool: - vmImage: ubuntu-latest - container: - image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 + - task: DownloadSecureFile@1 + name: SshKeyFile + displayName: Download key file + inputs: + secureFile: ssh.key - steps: - - template: multi-nodes-steps-template.yml - parameters: - hostName: ${{ variables.hostName }} + - task: Bash@3 + name: InstallPackages + displayName: Install Packages + inputs: + targetType: 'inline' + script: | + sudo apt-get update -y + sudo apt-get install pssh -y + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + - task: AzureCLI@2 + name: StartVMSS + displayName: Start VMSS + inputs: + azureSubscription: msccl-it + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss start --name mscclit-vmss --resource-group msccl-IT + - task: Bash@3 + name: DeployTestEnv + displayName: Deploy Test Env + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + workingDirectory: '$(System.DefaultWorkingDirectory)' + + - task: Bash@3 + name: RunMscclppTest + displayName: Run multi-nodes mscclpp-test + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-000000 + tail -f output/mscclit-000000 & + CHILD_PID=$! + parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' + kill $CHILD_PID + + - task: Bash@3 + name: RunMultiNodeUnitTest + displayName: Run multi-nodes unit tests + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-000000 + tail -f output/mscclit-000000 & + CHILD_PID=$! + parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' + kill $CHILD_PID + - task: Bash@3 + name: RunMultiNodePythonTests + displayName: Run multi-nodes python tests + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-000000 + tail -f output/mscclit-000000 & + CHILD_PID=$! + parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' + kill $CHILD_PID + - task: Bash@3 + name: RunMultiNodePythonBenchmark + displayName: Run multi-nodes python benchmark + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-000000 + tail -f output/mscclit-000000 & + CHILD_PID=$! + parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' + kill $CHILD_PID + + - task: AzureCLI@2 + name: StopVMSS + displayName: Deallocate VMSS + condition: always() + inputs: + azureSubscription: msccl-it + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss deallocate --name mscclit-vmss --resource-group msccl-IT From a0b0f885e360f2ef23d4ceb361c787ac1556cec0 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 27 Apr 2024 07:13:23 +0000 Subject: [PATCH 13/15] WIP --- .azure-pipelines/multi-nodes-test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index e37a6302f..7c9d35094 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -45,6 +45,7 @@ jobs: sudo apt-get update -y sudo apt-get install pssh -y curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + - task: AzureCLI@2 name: StartVMSS displayName: Start VMSS @@ -54,6 +55,7 @@ jobs: scriptLocation: inlineScript inlineScript: | az vmss start --name mscclit-vmss --resource-group msccl-IT + - task: Bash@3 name: DeployTestEnv displayName: Deploy Test Env @@ -99,6 +101,7 @@ jobs: parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' kill $CHILD_PID + - task: Bash@3 name: RunMultiNodePythonTests displayName: Run multi-nodes python tests @@ -117,6 +120,7 @@ jobs: parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' kill $CHILD_PID + - task: Bash@3 name: RunMultiNodePythonBenchmark displayName: Run multi-nodes python benchmark From b410a09a45c61950d998355ff63341980161fab4 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 27 Apr 2024 07:15:11 +0000 Subject: [PATCH 14/15] WIP --- .azure-pipelines/multi-nodes-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 7c9d35094..bcf517569 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -34,7 +34,7 @@ jobs: name: SshKeyFile displayName: Download key file inputs: - secureFile: ssh.key + secureFile: mscclpp-ssh.key - task: Bash@3 name: InstallPackages From cdcac0081bb40708ebd9ef4e1e26c34756fe3d5d Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 27 Apr 2024 07:29:50 +0000 Subject: [PATCH 15/15] WIP --- test/deploy/hostfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/deploy/hostfile b/test/deploy/hostfile index 6260cc6ee..b1bfc1df3 100644 --- a/test/deploy/hostfile +++ b/test/deploy/hostfile @@ -1,2 +1,2 @@ -azureuser@mscclit-vmss:50000 -azureuser@mscclit-vmss:50001 +azureuser@mscclit-000000 +azureuser@mscclit-000001