Refactor Python Packaging Pipeline (Training CUDA 11.8) (#19910)
### Description
1. Use stages to organize the pipeline and split building from testing.
2. Move compilation to a CPU machine.
3. The test stage can leverage existing artifacts.
4. Check the wheel size; a warning is given if the size exceeds 300 MB.
5. The Docker image name didn't change even when a build argument changed, which caused the image to always be rebuilt. Updating the image name according to the arguments saves Docker build time (see the sketch after this list).
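For item 5, here is the pattern from the new stage template, condensed into a minimal sketch. `get-docker-image-steps.yml` is an existing template in this repo; the `DockerBuildArgs` are trimmed here to the one argument that matters for the example.

```yaml
variables:
# The image name now encodes the Python version, so a cached image
# built for the same version is found and reused; a rebuild happens
# only when the argument actually changes.
- name: Repository
  value: onnxruntimetraininggpubuild_${{ parameters.python_version }}

steps:
- template: get-docker-image-steps.yml
  parameters:
    Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }}
    DockerBuildArgs: --build-arg PYTHON_VERSION=${{ parameters.python_version }}
    Repository: $(Repository)
```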

Pipeline duration reduced by 60% (2 hours -> 50 minutes).
Compilation time reduced by 75% (1.5 hours -> 20 minutes).
GPU time reduced by 87% (8 hours -> 1 hour).
For debugging, GPU time can be reduced by more than 95%, because we can choose to run only one test stage and skip building.
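That test-only rerun works because the test stage pulls the wheel through the `flex-downloadPipelineArtifact.yml` template rather than assuming a build in the same run. A minimal sketch of how such a switch can be written with `DownloadPipelineArtifact@2` follows; this is an assumption about the template's internals, not its actual contents.

```yaml
# Illustrative sketch only; the real flex-downloadPipelineArtifact.yml
# in this repo may be implemented differently.
parameters:
  StepName: ''
  ArtifactName: ''
  TargetPath: ''
  SpecificArtifact: false
  BuildId: '0'

steps:
- task: DownloadPipelineArtifact@2
  displayName: ${{ parameters.StepName }}
  inputs:
    artifact: ${{ parameters.ArtifactName }}
    path: ${{ parameters.TargetPath }}
    ${{ if eq(parameters.SpecificArtifact, true) }}:
      # Reuse the wheel from an earlier pipeline run instead of rebuilding.
      source: 'specific'
      project: '$(System.TeamProject)'
      pipeline: '$(System.DefinitionId)'
      runVersion: 'specific'
      runId: ${{ parameters.BuildId }}
    ${{ else }}:
      source: 'current'
```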

### Motivation and Context
Make the pipeline efficient.

Optimized:
https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=424177&view=results

Current:
https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=422393&view=results

---------
mszhanyi committed Mar 14, 2024
1 parent 8b766bd commit 87a9f77
Showing 3 changed files with 279 additions and 178 deletions.
@@ -8,6 +8,17 @@ resources:
```yaml
    name: pypa/manylinux
    ref: 5eda9aded5462201e6310105728d33016e637ea7

parameters:
- name: SpecificArtifact
  displayName: Use Specific Artifact
  type: boolean
  default: false

- name: BuildId
  displayName: Specific Artifact's BuildId
  type: string
  default: '0'

stages:
- template: templates/py-packaging-training-cuda-stage.yml
  parameters:
```
@@ -20,3 +31,5 @@ stages:
```yaml
    agent_pool: Onnxruntime-Linux-GPU
    upload_wheel: 'yes'
    debug_build: false
    SpecificArtifact: ${{ parameters.SpecificArtifact }}
    BuildId: ${{ parameters.BuildId }}
```
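Because `SpecificArtifact` and `BuildId` are declared as typed pipeline parameters, they surface in the Run pipeline dialog (a "Use Specific Artifact" checkbox plus a BuildId field), so a test-only rerun against an earlier build needs no YAML edits.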
@@ -0,0 +1,229 @@
```yaml
parameters:
  build_py_parameters: ''
  torch_version: ''
  opset_version: ''
  cuda_version: ''
  cmake_cuda_architectures: ''
  docker_file: ''
  upload_wheel: ''
  debug_build: ''
  python_version: ''
  stage_name: ''
  SpecificArtifact: false
  BuildId: '0'

stages:
- stage: Build_${{ parameters.stage_name }}
  variables:
  - name: isMain
    value: ${{ or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-')) }}
  - name: finalStorage
    ${{ if eq(variables['isMain'], 'true') }}:
      value: '--final_storage'
    ${{ else }}:
      value: ''
  - name: buildConfig
    ${{ if eq(parameters['debug_build'], 'true') }}:
      value: 'Debug'
    ${{ else }}:
      value: 'Release'
  - name: PythonVersion
    value: ${{ parameters.python_version }}
  - name: Repository
    value: onnxruntimetraininggpubuild_${{ parameters.python_version }}
  dependsOn: []

  jobs:
  - job: Build
    pool: onnxruntime-Ubuntu2204-AMD-CPU
    steps:
    - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
      displayName: 'Clean Agent Directories'
      condition: always()

    - task: CmdLine@2
      displayName: 'check variables'
      inputs:
        script: |
          echo "Branch is "${{ variables['Build.SourceBranch'] }} && \
          echo "isMain is "${{ variables['isMain'] }} && \
          echo "final_storage is "${{ variables['finalStorage'] }}

    - checkout: self
      clean: true
      submodules: recursive

    - template: set-python-manylinux-variables-step.yml

    - template: get-docker-image-steps.yml
      parameters:
        Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }}
        Context: tools/ci_build/github/linux/docker
        DockerBuildArgs: >-
          --build-arg TORCH_VERSION=${{ parameters.torch_version }}
          --build-arg OPSET_VERSION=${{ parameters.opset_version }}
          --build-arg PYTHON_VERSION=${{ parameters.python_version }}
          --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu
          --build-arg BUILD_UID=$(id -u)
          --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64
          --build-arg DEVTOOLSET_ROOTPATH=/usr
          --build-arg PREPEND_PATH=/usr/local/cuda/bin:
          --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64
        Repository: $(Repository)

    - task: CmdLine@2
      displayName: 'build onnxruntime'
      inputs:
        script: |
          set -e -x
          mkdir -p $HOME/.onnx
          docker run --rm -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \
            --volume /data/onnx:/data/onnx:ro \
            --volume $(Build.SourcesDirectory):/onnxruntime_src \
            --volume $(Build.BinariesDirectory):/build \
            --volume /data/models:/build/models:ro \
            --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
            -e NVIDIA_VISIBLE_DEVICES=all \
            -e NIGHTLY_BUILD \
            -e DEFAULT_TRAINING_PACKAGE_DEVICE \
            -e BUILD_BUILDNUMBER \
            -e ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION \
            $(Repository) \
            $(PythonManylinuxDir)/bin/python3 /onnxruntime_src/tools/ci_build/build.py \
              --build_dir /build \
              --config ${{ variables['buildConfig'] }} \
              --skip_submodule_sync \
              --parallel --use_binskim_compliant_compile_flags \
              --build_wheel \
              --enable_onnx_tests \
              ${{ parameters.build_py_parameters }} \
              --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' onnxruntime_BUILD_UNIT_TESTS=OFF \
              --use_cuda --cuda_version=${{ parameters.cuda_version }} --cuda_home=/usr/local/cuda-${{ parameters.cuda_version }} --cudnn_home=/usr/local/cuda-${{ parameters.cuda_version }};
        workingDirectory: $(Build.SourcesDirectory)

    - task: CopyFiles@2
      displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)'
      inputs:
        SourceFolder: '$(Build.BinariesDirectory)'
        Contents: "${{ variables['buildConfig'] }}/dist/*.whl"
        TargetFolder: '$(Build.ArtifactStagingDirectory)'

    - task: PublishBuildArtifacts@1
      displayName: 'Publish Artifact: ONNXRuntime python wheel and documentation'
      inputs:
        ArtifactName: "onnxruntime_gpu_${{ variables['buildConfig'] }}_${{ parameters.python_version }}"

    - template: component-governance-component-detection-steps.yml
      parameters:
        condition: 'succeeded'

    - template: clean-agent-build-directory-step.yml

- stage: Test_${{ parameters.stage_name }}
  variables:
  - name: isMain
    value: ${{ or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-')) }}
  - name: finalStorage
    ${{ if eq(variables['isMain'], 'true') }}:
      value: '--final_storage'
    ${{ else }}:
      value: ''
  - name: buildConfig
    ${{ if eq(parameters['debug_build'], 'true') }}:
      value: 'Debug'
    ${{ else }}:
      value: 'Release'
  - name: PythonVersion
    value: ${{ parameters.python_version }}
  - name: Repository
    value: onnxruntimetraininggpubuild_${{ parameters.python_version }}
  dependsOn: Build_${{ parameters.stage_name }}
  jobs:
  - job: Test_GPU
    pool: Onnxruntime-Linux-GPU
    steps:
    - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
      displayName: 'Clean Agent Directories'
      condition: always()

    - checkout: self
      clean: true
      submodules: none

    - template: set-python-manylinux-variables-step.yml

    - template: flex-downloadPipelineArtifact.yml
      parameters:
        ArtifactName: "onnxruntime_gpu_${{ variables['buildConfig'] }}_${{ parameters.python_version }}"
        StepName: 'Download Pipeline Artifact - Linux Training Build'
        TargetPath: '$(Build.ArtifactStagingDirectory)'
        SpecificArtifact: ${{ parameters.SpecificArtifact }}
        BuildId: ${{ parameters.BuildId }}

    - script: |
        set -e -x
        whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \
        echo $whlfilename ; du -sh $whlfilename ; \
        (( $(wc -c < "$whlfilename") - 300*1024*1024 < 0 )) || ( echo 'Wheel size bigger than 300M'; exit 1)
      displayName: 'Check wheel size'
      continueOnError: true

    - template: get-docker-image-steps.yml
      parameters:
        Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }}
        Context: tools/ci_build/github/linux/docker
        DockerBuildArgs: >-
          --build-arg TORCH_VERSION=${{ parameters.torch_version }}
          --build-arg OPSET_VERSION=${{ parameters.opset_version }}
          --build-arg PYTHON_VERSION=${{ parameters.python_version }}
          --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu
          --build-arg BUILD_UID=$(id -u)
          --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64
          --build-arg DEVTOOLSET_ROOTPATH=/usr
          --build-arg PREPEND_PATH=/usr/local/cuda/bin:
          --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64
        Repository: $(Repository)

    - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist"
      displayName: 'Mount MNIST'
      condition: succeededOrFailed()
      workingDirectory: $(Build.SourcesDirectory)

    - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data"
      displayName: 'Mount bert-data'
      condition: succeededOrFailed()
      workingDirectory: $(Build.SourcesDirectory)

    - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/hf-models-cache" -d "/hf_models_cache"
      displayName: 'Mount hf-models-cache'
      condition: succeededOrFailed()
      workingDirectory: $(Build.SourcesDirectory)

    - task: CmdLine@2
      displayName: 'test ortmodule'
      inputs:
        script: |
          set -ex ; \
          whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \
          echo $whlfilename ; \
          basefilename=$(basename $whlfilename) ; \
          docker run --rm \
            --gpus all \
            -e NVIDIA_VISIBLE_DEVICES=all \
            --volume $(Build.ArtifactStagingDirectory):/build \
            --volume /mnist:/mnist \
            --volume /bert_data:/bert_data \
            --volume /hf_models_cache:/hf_models_cache \
            $(Repository) \
            bash -c " $(PythonManylinuxDir)/bin/python3 -m pip install /build/Release/dist/$basefilename && $(PythonManylinuxDir)/bin/python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install " ;
        workingDirectory: $(Build.SourcesDirectory)

    - task: CmdLine@2
      displayName: 'Upload wheel'
      condition: and(succeeded(), and(eq(variables['UploadWheel'], 'yes'), ne(variables['ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION'], 'true')))
      inputs:
        script: |
          set -e -x
          whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \
          python3 tools/ci_build/upload_python_package_to_azure_storage.py \
            --python_wheel_path $whlfilename ${{ variables['finalStorage'] }}
```
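A note on the size gate in the test stage: `(( $(wc -c < "$whlfilename") - 300*1024*1024 < 0 ))` compares the wheel's byte count against 300 * 1024 * 1024 = 314,572,800 bytes. For an oversize wheel the arithmetic test fails, the `echo 'Wheel size bigger than 300M'; exit 1` branch runs, and because the step sets `continueOnError: true` the result is the warning mentioned in the description rather than a failed stage.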