Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
Expand Down
127 changes: 77 additions & 50 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
@@ -1,49 +1,82 @@

jobs:
- job: Default
- job: DeepSpeed_Tests
timeoutInMinutes: 360
pool:
name: 'GPU_testing'
name: 'DS_testing'

strategy:
matrix:
Python36:
PyTorch12-CUDA100:
python.version: '3.6'
#Python35:
# python.version: '3.5'
#Python37:
cuda.version: '10.0'
pytorch.version: '1.2'
torchvision.version: '0.4.0'
runmodeltests: true
#PyTorch15-CUDA101:
# python.version: '3.7'
#Python38:
# python.version: '3.8'
# cuda.version: '10.1'
# pytorch.version: '1.5'
# torchvision.version: '0.6.1'
# runmodeltests: true
##PyTorch15-CUDA102:
# python.version: '3.7'
# cuda.version: '10.2'
# pytorch.version: '1.5'
# torchvision.version: '0.6.1'
# runmodeltests: true

variables:
conda_env: 'ds_test_py$(python.version)_cuda$(cuda.version)_pytorch$(pytorch.version)'

steps:
- task: UsePythonVersion@0
inputs:
versionSpec: '$(python.version)'
addToPath: true
architecture: 'x64'
displayName: 'Use Python $(python.version)'
# Unfortunately nvidia's nvcc_linux-64=<version> seems to install 10.1 regardless?
# Most of this complexity is a workaround to get the compiler toolchain to match the
# cudatoolkit runtime
- script: |
conda create --force --yes -n $(conda_env) python=$(python.version) cudatoolkit=$(cuda.version)
source activate $(conda_env)
conda install -q --yes conda
conda install -q --yes pip
conda install -q --yes gxx_linux-64
if [[ $(cuda.version) != "10.2" ]]; then conda install --yes -c conda-forge cudatoolkit-dev=$(cuda.version) ; fi
displayName: 'Setup environment python=$(python.version) pytorch=$(pytorch.version) cuda=$(cuda.version)'

# Manually install torch/torchvision first to enforce versioning.
- script: |
python -m pip install --upgrade pip
pip install --user -r requirements.txt
./install.sh --pip_sudo
displayName: 'Install dependencies'
source activate $(conda_env)
pip install --progress-bar=off torch==$(pytorch.version) torchvision==$(torchvision.version)
./install.sh
python basic_install_test.py
displayName: 'Install DeepSpeed'

- script: |
pre-commit run --all-files
displayName: 'Formatting checks'
source activate $(conda_env)
which python
python --version
which nvcc
nvcc --version
which deepspeed
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
python -c "import deepspeed; print('deepspeed:', deepspeed.__version__)"
displayName: 'Show environment'


- script: |
pytest --forked --verbose tests/unit/
source activate $(conda_env)
pytest --durations=0 --forked --verbose tests/unit/
displayName: 'Unit tests'

- script: |
source activate $(conda_env)
ln -s /data/Megatron-LM/data DeepSpeedExamples/Megatron-LM/
pip install --user -r DeepSpeedExamples/Megatron-LM/requirements.txt
pip install --progress-bar=off -r DeepSpeedExamples/Megatron-LM/requirements.txt
cd tests/model/
pytest -s run_sanity_check.py
rm -rf BingBertSquad/baseline
rm -rf Megatron_GPT2/baseline
pytest --durations=0 -s run_sanity_check.py
condition: and(succeeded(), eq(variables['runmodeltests'], true))
displayName: 'Model tests'

#BingBertSquad logs
Expand All @@ -52,35 +85,29 @@ jobs:
targetPath: '$(Build.SourcesDirectory)/tests/model/BingBertSquad/test/'
artifactName: BingBertSquad_logs
displayName: 'BingBertSquad log uploads'
condition: always()
condition: eq(variables['runmodeltests'], true)

# Megatron test logs
#- task: PublishPipelineArtifact@1
# inputs:
# targetPath: '$(Build.SourcesDirectory)/tests/model/Megatron_GPT2/test/'
# artifactName: Megatron_GPT2_logs
# displayName: 'Megatron GPT2 log uploads'
# condition: always()

#- task: PublishPipelineArtifact@1
# inputs:
# targetPath: '$(Build.SourcesDirectory)/tests/model/Megatron_GPT2/checkpoint_test_logs/'
# artifactName: Megatron_GPT2_checkpoint_logs
# displayName: 'Megatron GPT2 checkpoint log uploads'
# condition: always()
- job: Code_Quality_Checks
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we move this up to run before the actual tests so we can fail fast?

pool:
name: 'DS_testing'
variables:
conda_env: 'ds_codetest'

steps:
- script: |
conda create --force --yes -n $(conda_env) python=3.7
source activate $(conda_env)
displayName: 'Create code test environment'

#BingBert logs
#- task: PublishPipelineArtifact@1
# inputs:
# targetPath: '$(Build.SourcesDirectory)/tests/model/bing_bert/pretrain_test/'
# artifactName: BingBert_pretrain_logs
# displayName: 'BingBert pretrain logs'
# condition: always()
- script: |
source activate $(conda_env)
pip install pre-commit
pre-commit run --all-files
displayName: 'Formatting checks'

#- task: PublishPipelineArtifact@1
# inputs:
# targetPath: '$(Build.SourcesDirectory)/tests/model/bing_bert/checkpoint_test_logs/'
# artifactName: BingBert_checkpoint_logs
# displayName: 'BingBert checkpoint logs'
# condition: always()
- script: |
source activate $(conda_env)
pip install pylint
pylint --exit-zero deepspeed/
displayName: 'Code linter'
4 changes: 2 additions & 2 deletions install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,9 @@ else
fi

if [ "$pip_mirror" != "" ]; then
PIP_INSTALL="pip install -v -i $pip_mirror"
PIP_INSTALL="pip install --use-feature=2020-resolver -v -i $pip_mirror"
else
PIP_INSTALL="pip install -v"
PIP_INSTALL="pip install --use-feature=2020-resolver -v"
fi

if [ ! -f $hostfile ]; then
Expand Down