Merged
Changes from all commits
34 commits
ef3afa8
batch of refactored tests
mrwyattii Aug 12, 2022
c60f640
more test refactoring
mrwyattii Aug 13, 2022
cd2c6fe
fp16 test refactor
mrwyattii Aug 15, 2022
f9f5dc3
more refactors
mrwyattii Aug 15, 2022
fb077b6
added DistributedFixture class
mrwyattii Aug 19, 2022
f97dd20
applied DistributedFixture to first batch of tests as a trial
mrwyattii Aug 19, 2022
27ed180
added DistributedFixture test and documentation
mrwyattii Aug 19, 2022
fb59b1c
last tests
mrwyattii Aug 22, 2022
8c64612
Merge branch 'master' into refactor-more-tests
mrwyattii Aug 22, 2022
db57b66
fixes for refactored tests
mrwyattii Aug 23, 2022
9c27579
Merge branch 'master' into refactor-more-tests
mrwyattii Aug 23, 2022
5eab66a
remove subdirs in workflow files
mrwyattii Aug 23, 2022
5baa6db
fix pytest syntax error
mrwyattii Aug 23, 2022
88d262d
Merge branch 'master' into refactor-more-tests
mrwyattii Aug 23, 2022
70e1871
fix another syntax error
mrwyattii Aug 24, 2022
18ec909
Merge branch 'master' into refactor-more-tests
mrwyattii Sep 9, 2022
45f7ebb
update imports
mrwyattii Sep 9, 2022
c0a3955
use DistFixture with elastic checkpoint test
mrwyattii Sep 9, 2022
1ae3993
missing import
mrwyattii Sep 9, 2022
59b557b
update to shared class tmpdir for elastic test
mrwyattii Sep 9, 2022
be41b63
Merge branch 'master' into refactor-more-tests
mrwyattii Sep 9, 2022
eab78bc
moved test files
mrwyattii Sep 9, 2022
261b257
Merge branch 'master' into refactor-more-tests
mrwyattii Sep 9, 2022
c8c10e6
avoid duplicate test file name
mrwyattii Sep 9, 2022
3e372db
Merge branch 'master' into refactor-more-tests
mrwyattii Sep 26, 2022
4441c4f
last refactor and moving test files
mrwyattii Sep 28, 2022
b4acb36
Merge branch 'master' into refactor-more-tests
mrwyattii Sep 28, 2022
20403a4
formatting
mrwyattii Sep 28, 2022
03fd172
fix broken import
mrwyattii Sep 28, 2022
e107307
Merge branch 'master' into refactor-more-tests
tjruwase Sep 30, 2022
a24a4ac
testing forked AMD tests
mrwyattii Oct 3, 2022
9ce06a7
update abstract method
mrwyattii Oct 3, 2022
b825324
use blob storage for accelerate and transformers tests
mrwyattii Oct 3, 2022
23790d5
upgrade torch for accelerate CI
mrwyattii Oct 3, 2022
4 changes: 2 additions & 2 deletions .github/workflows/amd.yml
@@ -67,5 +67,5 @@ jobs:
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose unit/{autotuning,checkpoint,comm,compression,elasticity,inference,launcher,monitor,ops,profiling,runtime,utils}
#TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'sequential' unit/{autotuning,checkpoint,comm,compression,elasticity,inference,launcher,monitor,ops,profiling,runtime,utils}
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked -n 4 --verbose unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
4 changes: 2 additions & 2 deletions .github/workflows/nv-accelerate-v100.yml
@@ -32,7 +32,7 @@ jobs:
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html
pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

@@ -58,4 +58,4 @@ jobs:
# tmp fix: force newer datasets version
pip install "datasets>=2.0.0"
pip list
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose tests/deepspeed
HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose tests/deepspeed
4 changes: 2 additions & 2 deletions .github/workflows/nv-torch-latest-v100.yml
@@ -61,5 +61,5 @@ jobs:
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/{autotuning,checkpoint,comm,compression,elasticity,inference,launcher,monitor,ops,profiling,runtime,utils} --torch_ver="1.12" --cuda_ver="11.3"
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/{autotuning,checkpoint,comm,compression,elasticity,inference,launcher,monitor,ops,profiling,runtime,utils} --torch_ver="1.12" --cuda_ver="11.3"
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.12" --cuda_ver="11.3"
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/ --torch_ver="1.12" --cuda_ver="11.3"
2 changes: 1 addition & 1 deletion .github/workflows/nv-transformers-v100.yml
@@ -65,4 +65,4 @@ jobs:
# force protobuf version due to issues
pip install "protobuf<4.21.0"
pip list
WANDB_DISABLED=true TORCH_EXTENSIONS_DIR=./torch-extensions RUN_SLOW=1 pytest --color=yes --durations=0 --verbose tests/deepspeed
HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ WANDB_DISABLED=true TORCH_EXTENSIONS_DIR=./torch-extensions RUN_SLOW=1 pytest --color=yes --durations=0 --verbose tests/deepspeed
11 changes: 10 additions & 1 deletion tests/conftest.py
@@ -52,5 +52,14 @@ def pytest_runtest_call(item):
# We want to use our own launching function for distributed tests
if getattr(item.cls, "is_dist_test", False):
dist_test_class = item.cls()
dist_test_class._run_test(item._request)
dist_test_class(item._request)
item.runtest = lambda: True # Dummy function so test is not run twice


@pytest.hookimpl(tryfirst=True)
def pytest_fixture_setup(fixturedef, request):
if getattr(fixturedef.func, "is_dist_fixture", False):
#for val in dir(request):
# print(val.upper(), getattr(request, val), "\n")
dist_fixture_class = fixturedef.func()
dist_fixture_class(request)
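
Note: the hook above only dispatches; the DistributedFixture base class itself lives in tests/unit/common.py and is not shown in this diff. Below is a minimal sketch of the contract the hook appears to assume. The class name DistributedFixtureSketch, its body, and the use of request.getfixturevalue to resolve run()'s arguments are illustrative assumptions, not the actual implementation; only the is_dist_fixture flag, world_size, and run() are taken from the hook and the demonstration test in this PR.

import inspect


class DistributedFixtureSketch:
    # Flag checked by pytest_fixture_setup in conftest.py.
    is_dist_fixture = True
    # Number of ranks the fixture should launch its work on.
    world_size = 2

    def __call__(self, request):
        # Resolve run()'s parameters (e.g. class_tmpdir or parametrized
        # fixtures) through pytest's request object (assumed mechanism).
        args = {
            name: request.getfixturevalue(name)
            for name in inspect.signature(self.run).parameters
        }
        # The real base class would spawn `world_size` processes and invoke
        # run() on every rank; this sketch simply calls it in-process.
        self.run(**args)

    def run(self, **kwargs):
        # Subclasses define named parameters and implement the distributed
        # setup work (e.g. saving a checkpoint with a larger world size).
        raise NotImplementedError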
Empty file modified tests/unit/alexnet_model.py
100755 → 100644
Empty file.
103 changes: 52 additions & 51 deletions tests/unit/checkpoint/test_zero_optimizer.py
@@ -1,13 +1,12 @@
import deepspeed
from deepspeed.ops.op_builder import CPUAdamBuilder

from unit.common import DistributedTest
from unit.common import DistributedTest, DistributedFixture
from unit.simple_model import *
from unit.util import required_minimum_torch_version

from unit.checkpoint.common import *

import itertools
import pytest


@@ -192,18 +191,52 @@ def test_load_module_only(self, tmpdir, zero_stage):
load_module_only=True)


class ws4_model_checkpoint(DistributedFixture):
world_size = 4

def run(self, class_tmpdir, elastic_save, load_optim):
ds_config = {
"train_batch_size": 4,
"optimizer": {
"type": 'Adam'
},
"fp16": {
"enabled": True,
"initial_scale_power": 8
},
"zero_optimization": {
"stage": 2,
"elastic_checkpoint": elastic_save
}
}
hidden_dim = 10
model = SimpleModel(hidden_dim)

model, _, _, _ = deepspeed.initialize(config=ds_config,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=8,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()

if load_optim:
torch.save(model.optimizer.optimizer.state_dict(),
os.path.join(class_tmpdir,
'opt-state-dict'))
model.save_checkpoint(class_tmpdir)


@pytest.mark.parametrize("elastic_save", [True, False])
@pytest.mark.parametrize("elastic_load", [True, False])
@pytest.mark.parametrize("load_optim", [True, False])
class TestZeROElasticCheckpoint(DistributedTest):
world_size = 2

@pytest.mark.parametrize(["elastic_save",
"elastic_load",
"load_optim"],
itertools.product(*[[True,
False],
[True,
False],
[True,
False]]))
def test_elastic_checkpoint_fixed_dp(self,
tmpdir,
elastic_save,
@@ -271,22 +304,12 @@ def test_elastic_checkpoint_fixed_dp(self,
model.backward(loss)
model.step()

@pytest.mark.parametrize(["elastic_save",
"elastic_load",
"load_optim"],
itertools.product(*[[True,
False],
[True,
False],
[True,
False]]))
def test_elastic_checkpoint_change_dp(self,
tmpdir,
ws4_model_checkpoint,
class_tmpdir,
elastic_save,
elastic_load,
load_optim):
pytest.skip(
'skip until DistributedTest can support changing world size within a test')
ds_config = {
"train_batch_size": 4,
"optimizer": {
@@ -298,43 +321,21 @@ def test_elastic_checkpoint_change_dp(self,
},
"zero_optimization": {
"stage": 2,
"elastic_checkpoint": elastic_save
"elastic_checkpoint": elastic_load
}
}
hidden_dim = 10
models = [SimpleModel(hidden_dim) for _ in range(2)]

# Save checkpoint with dp world size = 4
#TODO - remove this line @distributed_test(world_size=[4])
model, _, _, _ = deepspeed.initialize(config=ds_config,
model=models[0],
model_parameters=models[0].parameters())
data_loader = random_dataloader(model=model,
total_samples=8,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()

if load_optim:
torch.save(model.optimizer.optimizer.state_dict(),
os.path.join(tmpdir,
'opt-state-dict'))
model.save_checkpoint(tmpdir)
model = SimpleModel(hidden_dim)

# Load checkpoint with dp world size = 2
#TODO - remove this line @distributed_test(world_size=[2])
ds_config["zero_optimization"]["elastic_checkpoint"] = elastic_load
model, _, _, _ = deepspeed.initialize(config=ds_config,
model=models[1],
model_parameters=models[1].parameters())
model=model,
model_parameters=model.parameters())
if load_optim:
with pytest.raises(deepspeed.runtime.zero.utils.ZeRORuntimeException):
model.load_checkpoint(tmpdir, load_optimizer_states=load_optim)
model.load_checkpoint(class_tmpdir, load_optimizer_states=load_optim)
else:
model.load_checkpoint(tmpdir, load_optimizer_states=load_optim)
model.load_checkpoint(class_tmpdir, load_optimizer_states=load_optim)


class TestZeROSaveLoadEdgeCase(DistributedTest):
37 changes: 36 additions & 1 deletion tests/unit/comm/test_dist.py
@@ -1,8 +1,9 @@
import os
import torch
import deepspeed.comm as dist
import deepspeed

from unit.common import DistributedTest, get_master_port
from unit.common import DistributedTest, DistributedFixture, get_master_port
from unit.simple_model import SimpleModel

import pytest
@@ -64,6 +65,40 @@ def test_world_size_1(self):
assert dist.get_world_size() == 1


# Demonstration of the DistributedFixture class
@pytest.fixture(params=[2, 4])
def val1(request):
return request.param


@pytest.fixture(params=[16, 32])
def val2(request):
return request.param


class distributed_fixture(DistributedFixture):
world_size = 2

def run(self, class_tmpdir, val1, val2):
assert int(os.environ["WORLD_SIZE"]) == self.world_size
local_rank = os.environ["LOCAL_RANK"]
file_path = os.path.join(class_tmpdir, f"checkpoint-{local_rank}.pt")
with open(file_path, "w") as f:
f.write(f"{local_rank},{val1},{val2}")


class TestDistributedFixture(DistributedTest):
world_size = 1

def test(self, distributed_fixture, class_tmpdir, val1, val2):
for rank in range(2):
file_path = os.path.join(class_tmpdir, f"checkpoint-{rank}.pt")
with open(file_path, "r") as f:
chkpt = f.read()
assert chkpt == f"{rank},{val1},{val2}"
assert int(os.environ["WORLD_SIZE"]) == 1


class TestDistAllReduce(DistributedTest):
world_size = [1, 2, 4]
