Refactor Python Packaging Pipeline (Training CUDA 11.8) (#19910)
### Description
1. Use stages to organize the pipeline and split building from testing.
2. Move compilation to a CPU machine.
3. The test stage can leverage existing artifacts.
4. Check the wheel size; a warning is given if the size exceeds 300 MB.
5. The Docker image name didn't change even when a build argument changed, which caused the image to always be rebuilt. Updating the image name according to the arguments saves Docker build time (see the sketch after this list).
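For item 5, here is the pattern from the new stage template, condensed into a minimal sketch. `get-docker-image-steps.yml` is an existing template in this repo; the `DockerBuildArgs` are trimmed here to the one argument that matters for the example.

```yaml
variables:
# The image name now encodes the Python version, so a cached image
# built for the same version is found and reused; a rebuild happens
# only when the argument actually changes.
- name: Repository
  value: onnxruntimetraininggpubuild_${{ parameters.python_version }}

steps:
- template: get-docker-image-steps.yml
  parameters:
    Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }}
    DockerBuildArgs: --build-arg PYTHON_VERSION=${{ parameters.python_version }}
    Repository: $(Repository)
```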

Pipeline duration reduced by 60% (2 hours -> 50 minutes).
Compilation time reduced by 75% (1.5 hours -> 20 minutes).
GPU time reduced by 87% (8 hours -> 1 hour).
For debugging, GPU time can be reduced by more than 95%, because we can choose to run only one test stage and skip building.
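That test-only rerun works because the test stage pulls the wheel through the `flex-downloadPipelineArtifact.yml` template rather than assuming a build in the same run. A minimal sketch of how such a switch can be written with `DownloadPipelineArtifact@2` follows; this is an assumption about the template's internals, not its actual contents.

```yaml
# Illustrative sketch only; the real flex-downloadPipelineArtifact.yml
# in this repo may be implemented differently.
parameters:
  StepName: ''
  ArtifactName: ''
  TargetPath: ''
  SpecificArtifact: false
  BuildId: '0'

steps:
- task: DownloadPipelineArtifact@2
  displayName: ${{ parameters.StepName }}
  inputs:
    artifact: ${{ parameters.ArtifactName }}
    path: ${{ parameters.TargetPath }}
    ${{ if eq(parameters.SpecificArtifact, true) }}:
      # Reuse the wheel from an earlier pipeline run instead of rebuilding.
      source: 'specific'
      project: '$(System.TeamProject)'
      pipeline: '$(System.DefinitionId)'
      runVersion: 'specific'
      runId: ${{ parameters.BuildId }}
    ${{ else }}:
      source: 'current'
```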

### Motivation and Context
Make the pipeline efficient.

Optimized:
https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=424177&view=results

Current:
https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=422393&view=results

---------
mszhanyi committed Mar 14, 2024
1 parent 8b766bd commit 87a9f77
Showing 3 changed files with 279 additions and 178 deletions.
@@ -8,6 +8,17 @@ resources:
```yaml
    name: pypa/manylinux
    ref: 5eda9aded5462201e6310105728d33016e637ea7

parameters:
- name: SpecificArtifact
  displayName: Use Specific Artifact
  type: boolean
  default: false

- name: BuildId
  displayName: Specific Artifact's BuildId
  type: string
  default: '0'

stages:
- template: templates/py-packaging-training-cuda-stage.yml
  parameters:
```
@@ -20,3 +31,5 @@ stages:
```yaml
    agent_pool: Onnxruntime-Linux-GPU
    upload_wheel: 'yes'
    debug_build: false
    SpecificArtifact: ${{ parameters.SpecificArtifact }}
    BuildId: ${{ parameters.BuildId }}
```
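Because `SpecificArtifact` and `BuildId` are declared as typed pipeline parameters, they surface in the Run pipeline dialog (a "Use Specific Artifact" checkbox plus a BuildId field), so a test-only rerun against an earlier build needs no YAML edits.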
@@ -0,0 +1,229 @@
```yaml
parameters:
  build_py_parameters: ''
  torch_version: ''
  opset_version: ''
  cuda_version: ''
  cmake_cuda_architectures: ''
  docker_file: ''
  upload_wheel: ''
  debug_build: ''
  python_version: ''
  stage_name: ''
  SpecificArtifact: false
  BuildId: '0'

stages:
- stage: Build_${{ parameters.stage_name }}
  variables:
  - name: isMain
    value: ${{ or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-')) }}
  - name: finalStorage
    ${{ if eq(variables['isMain'], 'true') }}:
      value: '--final_storage'
    ${{ else }}:
      value: ''
  - name: buildConfig
    ${{ if eq(parameters['debug_build'], 'true') }}:
      value: 'Debug'
    ${{ else }}:
      value: 'Release'
  - name: PythonVersion
    value: ${{ parameters.python_version }}
  - name: Repository
    value: onnxruntimetraininggpubuild_${{ parameters.python_version }}
  dependsOn: []

  jobs:
  - job: Build
    pool: onnxruntime-Ubuntu2204-AMD-CPU
    steps:
    - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
      displayName: 'Clean Agent Directories'
      condition: always()

    - task: CmdLine@2
      displayName: 'check variables'
      inputs:
        script: |
          echo "Branch is "${{ variables['Build.SourceBranch'] }} && \
          echo "isMain is "${{ variables['isMain'] }} && \
          echo "final_storage is "${{ variables['finalStorage'] }}

    - checkout: self
      clean: true
      submodules: recursive

    - template: set-python-manylinux-variables-step.yml

    - template: get-docker-image-steps.yml
      parameters:
        Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }}
        Context: tools/ci_build/github/linux/docker
        DockerBuildArgs: >-
          --build-arg TORCH_VERSION=${{ parameters.torch_version }}
          --build-arg OPSET_VERSION=${{ parameters.opset_version }}
          --build-arg PYTHON_VERSION=${{ parameters.python_version }}
          --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu
          --build-arg BUILD_UID=$(id -u)
          --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64
          --build-arg DEVTOOLSET_ROOTPATH=/usr
          --build-arg PREPEND_PATH=/usr/local/cuda/bin:
          --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64
        Repository: $(Repository)

    - task: CmdLine@2
      displayName: 'build onnxruntime'
      inputs:
        script: |
          set -e -x
          mkdir -p $HOME/.onnx
          docker run --rm -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \
            --volume /data/onnx:/data/onnx:ro \
            --volume $(Build.SourcesDirectory):/onnxruntime_src \
            --volume $(Build.BinariesDirectory):/build \
            --volume /data/models:/build/models:ro \
            --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
            -e NVIDIA_VISIBLE_DEVICES=all \
            -e NIGHTLY_BUILD \
            -e DEFAULT_TRAINING_PACKAGE_DEVICE \
            -e BUILD_BUILDNUMBER \
            -e ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION \
            $(Repository) \
            $(PythonManylinuxDir)/bin/python3 /onnxruntime_src/tools/ci_build/build.py \
              --build_dir /build \
              --config ${{ variables['buildConfig'] }} \
              --skip_submodule_sync \
              --parallel --use_binskim_compliant_compile_flags \
              --build_wheel \
              --enable_onnx_tests \
              ${{ parameters.build_py_parameters }} \
              --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' onnxruntime_BUILD_UNIT_TESTS=OFF \
              --use_cuda --cuda_version=${{ parameters.cuda_version }} --cuda_home=/usr/local/cuda-${{ parameters.cuda_version }} --cudnn_home=/usr/local/cuda-${{ parameters.cuda_version }};
        workingDirectory: $(Build.SourcesDirectory)

    - task: CopyFiles@2
      displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)'
      inputs:
        SourceFolder: '$(Build.BinariesDirectory)'
        Contents: "${{ variables['buildConfig'] }}/dist/*.whl"
        TargetFolder: '$(Build.ArtifactStagingDirectory)'

    - task: PublishBuildArtifacts@1
      displayName: 'Publish Artifact: ONNXRuntime python wheel and documentation'
      inputs:
        ArtifactName: "onnxruntime_gpu_${{ variables['buildConfig'] }}_${{ parameters.python_version }}"

    - template: component-governance-component-detection-steps.yml
      parameters:
        condition: 'succeeded'

    - template: clean-agent-build-directory-step.yml

- stage: Test_${{ parameters.stage_name }}
  variables:
  - name: isMain
    value: ${{ or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-')) }}
  - name: finalStorage
    ${{ if eq(variables['isMain'], 'true') }}:
      value: '--final_storage'
    ${{ else }}:
      value: ''
  - name: buildConfig
    ${{ if eq(parameters['debug_build'], 'true') }}:
      value: 'Debug'
    ${{ else }}:
      value: 'Release'
  - name: PythonVersion
    value: ${{ parameters.python_version }}
  - name: Repository
    value: onnxruntimetraininggpubuild_${{ parameters.python_version }}
  dependsOn: Build_${{ parameters.stage_name }}
  jobs:
  - job: Test_GPU
    pool: Onnxruntime-Linux-GPU
    steps:
    - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
      displayName: 'Clean Agent Directories'
      condition: always()

    - checkout: self
      clean: true
      submodules: none

    - template: set-python-manylinux-variables-step.yml

    - template: flex-downloadPipelineArtifact.yml
      parameters:
        ArtifactName: "onnxruntime_gpu_${{ variables['buildConfig'] }}_${{ parameters.python_version }}"
        StepName: 'Download Pipeline Artifact - Linux Training Build'
        TargetPath: '$(Build.ArtifactStagingDirectory)'
        SpecificArtifact: ${{ parameters.SpecificArtifact }}
        BuildId: ${{ parameters.BuildId }}

    - script: |
        set -e -x
        whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \
        echo $whlfilename ; du -sh $whlfilename ; \
        (( $(wc -c < "$whlfilename") - 300*1024*1024 < 0 )) || ( echo 'Wheel size bigger than 300M'; exit 1)
      displayName: 'Check wheel size'
      continueOnError: true

    - template: get-docker-image-steps.yml
      parameters:
        Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }}
        Context: tools/ci_build/github/linux/docker
        DockerBuildArgs: >-
          --build-arg TORCH_VERSION=${{ parameters.torch_version }}
          --build-arg OPSET_VERSION=${{ parameters.opset_version }}
          --build-arg PYTHON_VERSION=${{ parameters.python_version }}
          --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu
          --build-arg BUILD_UID=$(id -u)
          --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64
          --build-arg DEVTOOLSET_ROOTPATH=/usr
          --build-arg PREPEND_PATH=/usr/local/cuda/bin:
          --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64
        Repository: $(Repository)

    - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist"
      displayName: 'Mount MNIST'
      condition: succeededOrFailed()
      workingDirectory: $(Build.SourcesDirectory)

    - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data"
      displayName: 'Mount bert-data'
      condition: succeededOrFailed()
      workingDirectory: $(Build.SourcesDirectory)

    - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/hf-models-cache" -d "/hf_models_cache"
      displayName: 'Mount hf-models-cache'
      condition: succeededOrFailed()
      workingDirectory: $(Build.SourcesDirectory)

    - task: CmdLine@2
      displayName: 'test ortmodule'
      inputs:
        script: |
          set -ex ; \
          whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \
          echo $whlfilename ; \
          basefilename=$(basename $whlfilename) ; \
          docker run --rm \
            --gpus all \
            -e NVIDIA_VISIBLE_DEVICES=all \
            --volume $(Build.ArtifactStagingDirectory):/build \
            --volume /mnist:/mnist \
            --volume /bert_data:/bert_data \
            --volume /hf_models_cache:/hf_models_cache \
            $(Repository) \
            bash -c " $(PythonManylinuxDir)/bin/python3 -m pip install /build/Release/dist/$basefilename && $(PythonManylinuxDir)/bin/python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install " ;
        workingDirectory: $(Build.SourcesDirectory)

    - task: CmdLine@2
      displayName: 'Upload wheel'
      condition: and(succeeded(), and(eq(variables['UploadWheel'], 'yes'), ne(variables['ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION'], 'true')))
      inputs:
        script: |
          set -e -x
          whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \
          python3 tools/ci_build/upload_python_package_to_azure_storage.py \
            --python_wheel_path $whlfilename ${{ variables['finalStorage'] }}
```
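A note on the size gate in the test stage: `(( $(wc -c < "$whlfilename") - 300*1024*1024 < 0 ))` compares the wheel's byte count against 300 * 1024 * 1024 = 314,572,800 bytes. For an oversize wheel the arithmetic test fails, the `echo 'Wheel size bigger than 300M'; exit 1` branch runs, and because the step sets `continueOnError: true` the result is the warning mentioned in the description rather than a failed stage.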