From ef3afa84b7e852f995629d2b21aaa8f0540f46aa Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 12 Aug 2022 15:15:59 -0700 Subject: [PATCH 01/25] batch of refactored tests --- tests/unit/test_aio.py | 154 ++------ tests/unit/test_config.py | 183 +++------ tests/unit/test_ds_initialize.py | 215 +++++------ tests/unit/test_dynamic_loss_scale.py | 244 +++++------- tests/unit/test_lr_schedulers.py | 537 ++++++++++---------------- tests/unit/test_moe.py | 6 +- tests/unit/test_moe_tp.py | 151 +++----- tests/unit/test_multi_output_model.py | 56 +-- tests/unit/test_partition.py | 58 +-- tests/unit/test_zero_context.py | 106 ++--- 10 files changed, 644 insertions(+), 1066 deletions(-) diff --git a/tests/unit/test_aio.py b/tests/unit/test_aio.py index 389d422bbc91..247c6a201537 100755 --- a/tests/unit/test_aio.py +++ b/tests/unit/test_aio.py @@ -5,7 +5,7 @@ import deepspeed import deepspeed.comm as dist from deepspeed.ops.aio import AsyncIOBuilder -from .common import distributed_test +from tests.unit.common import DistributedTest MEGA_BYTE = 1024**2 BLOCK_SIZE = MEGA_BYTE @@ -13,10 +13,8 @@ IO_SIZE = 16 * MEGA_BYTE IO_PARALLEL = 2 - -def _skip_if_no_aio(): - if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]: - pytest.skip('Skip tests since async-io is not compatible') +if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]: + pytest.skip('Skip tests since async-io is not compatible', allow_module_level=True) def _do_ref_write(tmpdir, index=0): @@ -48,20 +46,12 @@ def _validate_handle_state(handle, single_submit, overlap_events): assert handle.get_queue_depth() == QUEUE_DEPTH -@pytest.mark.parametrize('single_submit, overlap_events', - [(False, - False), - (False, - True), - (True, - False), - (True, - True)]) -def test_parallel_read(tmpdir, single_submit, overlap_events): - _skip_if_no_aio() - - @distributed_test(world_size=[2]) - def _test_parallel_read(single_submit, overlap_events): +@pytest.mark.parametrize("single_submit", [True, False]) +@pytest.mark.parametrize("overlap_events", [True, False]) +class TestRead(DistributedTest): + world_size = 2 + + def test_parallel_read(self, tmpdir, single_submit, overlap_events): ref_file, _ = _do_ref_write(tmpdir) aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu').pin_memory() @@ -80,34 +70,8 @@ def _test_parallel_read(single_submit, overlap_events): ref_buffer = list(f.read()) assert ref_buffer == aio_buffer.tolist() - _test_parallel_read(single_submit, overlap_events) - - -@pytest.mark.parametrize('single_submit, overlap_events, cuda_device', - [(False, - False, - False), - (False, - True, - False), - (True, - False, - False), - (True, - True, - False), - (False, - False, - True), - (True, - True, - True)]) -def test_async_read(tmpdir, single_submit, overlap_events, cuda_device): - - _skip_if_no_aio() - - @distributed_test(world_size=[2]) - def _test_async_read(single_submit, overlap_events, cuda_device): + @pytest.mark.parametrize("cuda_device", [True, False]) + def test_async_read(self, tmpdir, single_submit, overlap_events, cuda_device): ref_file, _ = _do_ref_write(tmpdir) if cuda_device: @@ -135,26 +99,14 @@ def _test_async_read(single_submit, overlap_events, cuda_device): ref_buffer = list(f.read()) assert ref_buffer == aio_buffer.tolist() - _test_async_read(single_submit, overlap_events, cuda_device) - - -@pytest.mark.parametrize('single_submit, overlap_events', - [(False, - False), - (False, - True), - (True, - False), - (True, - True)]) -def test_parallel_write(tmpdir, single_submit, overlap_events): - 
_skip_if_no_aio() +@pytest.mark.parametrize("single_submit", [True, False]) +@pytest.mark.parametrize("overlap_events", [True, False]) +class TestWrite(DistributedTest): + world_size = 2 - @distributed_test(world_size=[2]) - def _test_parallel_write(single_submit, overlap_events): + def test_parallel_write(self, tmpdir, single_submit, overlap_events): ref_file, ref_buffer = _do_ref_write(tmpdir) - aio_file, aio_buffer = _get_test_file_and_buffer(tmpdir, ref_buffer, False) h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, @@ -173,36 +125,9 @@ def _test_parallel_write(single_submit, overlap_events): filecmp.clear_cache() assert filecmp.cmp(ref_file, aio_file, shallow=False) - _test_parallel_write(single_submit, overlap_events) - - -@pytest.mark.parametrize('single_submit, overlap_events, cuda_device', - [(False, - False, - False), - (False, - True, - False), - (True, - False, - False), - (True, - True, - False), - (False, - False, - True), - (True, - True, - True)]) -def test_async_write(tmpdir, single_submit, overlap_events, cuda_device): - - _skip_if_no_aio() - - @distributed_test(world_size=[2]) - def _test_async_write(single_submit, overlap_events, cuda_device): + @pytest.mark.parametrize("cuda_device", [True, False]) + def test_async_write(tmpdir, single_submit, overlap_events, cuda_device): ref_file, ref_buffer = _do_ref_write(tmpdir) - aio_file, aio_buffer = _get_test_file_and_buffer(tmpdir, ref_buffer, cuda_device) h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, @@ -224,24 +149,13 @@ def _test_async_write(single_submit, overlap_events, cuda_device): filecmp.clear_cache() assert filecmp.cmp(ref_file, aio_file, shallow=False) - _test_async_write(single_submit, overlap_events, cuda_device) - -@pytest.mark.parametrize('async_queue, cuda_device', - [(2, - False), - (4, - False), - (2, - True), - (4, - True)]) -def test_async_queue_read(tmpdir, async_queue, cuda_device): +@pytest.mark.parametrize("cuda_device", [True, False]) +class TestAsyncQueue(DistributedTest): + world_size = 2 - _skip_if_no_aio() - - @distributed_test(world_size=[2]) - def _test_async_queue_read(async_queue, cuda_device): + @pytest.mark.parametrize("async_queue", [2, 4]) + def test_read(self, tmpdir, async_queue, cuda_device): ref_files = [] for i in range(async_queue): f, _ = _do_ref_write(tmpdir, i) @@ -277,24 +191,8 @@ def _test_async_queue_read(async_queue, cuda_device): ref_buffer = list(f.read()) assert ref_buffer == aio_buffers[i].tolist() - _test_async_queue_read(async_queue, cuda_device) - - -@pytest.mark.parametrize('async_queue, cuda_device', - [(2, - False), - (7, - False), - (2, - True), - (7, - True)]) -def test_async_queue_write(tmpdir, async_queue, cuda_device): - - _skip_if_no_aio() - - @distributed_test(world_size=[2]) - def _test_async_queue_write(async_queue, cuda_device): + @pytest.mark.parametrize("async_queue", [2, 7]) + def test_write(self, tmpdir, async_queue, cuda_device): ref_files = [] ref_buffers = [] for i in range(async_queue): @@ -331,5 +229,3 @@ def _test_async_queue_write(async_queue, cuda_device): filecmp.clear_cache() assert filecmp.cmp(ref_files[i], aio_files[i], shallow=False) - - _test_async_queue_write(async_queue, cuda_device) diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index feae74eef9c0..9d5cc3460ea7 100755 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -6,8 +6,8 @@ from deepspeed.runtime.zero.config import DeepSpeedZeroConfig -from .common import distributed_test, get_test_path -from .simple_model import SimpleModel, 
create_config_from_dict, random_dataloader +from tests.unit.common import DistributedTest, get_test_path +from tests.unit.simple_model import SimpleModel, create_config_from_dict, random_dataloader import deepspeed.comm as dist # A test on its own @@ -15,6 +15,23 @@ from deepspeed.runtime.config import DeepSpeedConfig, get_bfloat16_enabled +@pytest.fixture +def base_config(): + config_dict = { + "train_batch_size": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } + return config_dict + + def test_cuda(): assert (torch.cuda.is_available()) @@ -59,9 +76,10 @@ def _batch_assert(status, ds_config, batch, micro_batch, gas, success): (2,32,8,2,True), (2,33,17,2,False), (2,32,18,1,False)]) # yapf: disable -def test_batch_config(num_ranks, batch, micro_batch, gas, success): - @distributed_test(world_size=2) - def _test_batch_config(num_ranks, batch, micro_batch, gas, success): +class TestBatchConfig(DistributedTest): + world_size = 2 + + def test(self, num_ranks, batch, micro_batch, gas, success): assert dist.get_world_size() == num_ranks, \ 'The test assumes a world size of f{num_ranks}' @@ -104,9 +122,6 @@ def _test_batch_config(num_ranks, batch, micro_batch, gas, success): status = _run_batch_config(ds_config, train_batch=batch, gas=gas) _batch_assert(status, ds_config, batch, micro_batch, gas, success) - """Run batch config test """ - _test_batch_config(num_ranks, batch, micro_batch, gas, success) - def test_temp_config_json(tmpdir): config_dict = { @@ -141,32 +156,20 @@ def test_get_bfloat16_enabled(bf16_key): assert get_bfloat16_enabled(cfg) == True -def test_deprecated_deepscale_config(tmpdir): - config_dict = { - "train_batch_size": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True - } - } +class TestDeprecatedDeepScaleConfig(DistributedTest): + world_size = 1 - config_path = create_config_from_dict(tmpdir, config_dict) - parser = argparse.ArgumentParser() - args = parser.parse_args(args='') - args.deepscale_config = config_path - args.local_rank = 0 + def test(self, base_config, tmpdir): - hidden_dim = 10 + config_path = create_config_from_dict(tmpdir, base_config) + parser = argparse.ArgumentParser() + args = parser.parse_args(args='') + args.deepscale_config = config_path + args.local_rank = 0 - model = SimpleModel(hidden_dim) + hidden_dim = 10 - @distributed_test(world_size=[1]) - def _test_deprecated_deepscale_config(args, model, hidden_dim): + model = SimpleModel(hidden_dim) model, _, _,_ = deepspeed.initialize(args=args, model=model, model_parameters=model.parameters()) @@ -179,36 +182,15 @@ def _test_deprecated_deepscale_config(args, model, hidden_dim): model.backward(loss) model.step() - _test_deprecated_deepscale_config(args=args, model=model, hidden_dim=hidden_dim) - - -def test_dist_init_true(tmpdir): - config_dict = { - "train_batch_size": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True - } - } - - config_path = create_config_from_dict(tmpdir, config_dict) - parser = argparse.ArgumentParser() - args = parser.parse_args(args='') - args.deepscale_config = config_path - args.local_rank = 0 - hidden_dim = 10 +class TestDistInit(DistributedTest): + world_size = 1 - model = SimpleModel(hidden_dim) + def test(self, base_config): + hidden_dim = 10 - @distributed_test(world_size=[1]) - def _test_dist_init_true(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, + model = 
SimpleModel(hidden_dim) + model, _, _,_ = deepspeed.initialize(config=base_config, model=model, model_parameters=model.parameters(), dist_init_required=True) @@ -221,26 +203,17 @@ def _test_dist_init_true(args, model, hidden_dim): model.backward(loss) model.step() - _test_dist_init_true(args=args, model=model, hidden_dim=hidden_dim) - -def test_init_no_optimizer(tmpdir): - - config_dict = {"train_batch_size": 1, "fp16": {"enabled": True}} - config_path = create_config_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _helper(): - parser = argparse.ArgumentParser() - args = parser.parse_args(args='') - args.deepscale_config = config_path - args.local_rank = 0 +class TestInitNoOptimizer(DistributedTest): + world_size = 1 + def test(self, base_config): + del base_config["optimizer"] hidden_dim = 10 model = SimpleModel(hidden_dim=hidden_dim) - model, _, _, _ = deepspeed.initialize(args=args, model=model) + model, _, _, _ = deepspeed.initialize(config=base_config, model=model) data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=hidden_dim, @@ -252,27 +225,13 @@ def _helper(): with pytest.raises(AssertionError): model.step() - _helper() +class TestArgs(DistributedTest): + world_size = 1 -def test_none_args(tmpdir): - config = { - "train_batch_size": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True - } - } - - @distributed_test(world_size=1) - def _helper(): + def test_none_args(self, base_config): model = SimpleModel(hidden_dim=10) - model, _, _, _ = deepspeed.initialize(args=None, model=model, config=config) + model, _, _, _ = deepspeed.initialize(args=None, model=model, config=base_config) data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, @@ -280,27 +239,9 @@ def _helper(): for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) - _helper() - - -def test_no_args(tmpdir): - config = { - "train_batch_size": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True - } - } - - @distributed_test(world_size=1) - def _helper(): + def test_no_args(self, base_config): model = SimpleModel(hidden_dim=10) - model, _, _, _ = deepspeed.initialize(model=model, config=config) + model, _, _, _ = deepspeed.initialize(model=model, config=base_config) data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, @@ -308,28 +249,14 @@ def _helper(): for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) - _helper() +class TestNoModel(DistributedTest): + world_size = 1 -def test_no_model(tmpdir): - config = { - "train_batch_size": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True - } - } - - @distributed_test(world_size=1) - def _helper(): + def test(self, base_config): model = SimpleModel(hidden_dim=10) with pytest.raises(AssertionError): - model, _, _, _ = deepspeed.initialize(model=None, config=config) + model, _, _, _ = deepspeed.initialize(model=None, config=base_config) with pytest.raises(AssertionError): - model, _, _, _ = deepspeed.initialize(model, config=config) + model, _, _, _ = deepspeed.initialize(model, config=base_config) diff --git a/tests/unit/test_ds_initialize.py b/tests/unit/test_ds_initialize.py index a9756af62200..5d1baf7ccf40 100644 --- a/tests/unit/test_ds_initialize.py +++ b/tests/unit/test_ds_initialize.py @@ -4,9 +4,9 @@ from torch.optim import Optimizer, Adam, AdamW from torch.optim.lr_scheduler import 
_LRScheduler, LambdaLR -from .simple_model import args_from_dict, SimpleModel, random_dataloader -from .common import distributed_test -from .util import required_torch_version +from tests.unit.simple_model import SimpleModel, random_dataloader +from tests.unit.common import DistributedTest +from tests.unit.util import required_torch_version import deepspeed from deepspeed.ops.adam import FusedAdam @@ -15,33 +15,32 @@ from deepspeed.runtime.utils import see_memory_usage -@pytest.mark.parametrize('zero_stage,world_size', [(0, 1), (3, 1)]) -def test_no_optim(zero_stage, world_size): - if zero_stage == 3 and not required_torch_version(): - pytest.skip("zero-3 param offload requires at least torch 1.8") - - ds_config = { - 'train_batch_size': world_size, - 'fp16': { - 'enabled': True - }, - 'zero_optimization': { - "stage": zero_stage, - "offload_param": { - "device": "cpu" +@pytest.mark.parametrize('zero_stage', [0, 3]) +class TestNoOptim(DistributedTest): + world_size = 1 + + def test(self, zero_stage): + if zero_stage == 3 and not required_torch_version(): + pytest.skip("zero-3 param offload requires at least torch 1.8") + + ds_config = { + 'train_batch_size': self.world_size, + 'fp16': { + 'enabled': True + }, + 'zero_optimization': { + "stage": zero_stage, + "offload_param": { + "device": "cpu" + } } } - } - # 20B test - #hidden_dim = 16 * 1024 - hidden_dim = 4 + # 20B test + #hidden_dim = 16 * 1024 + hidden_dim = 4 - @distributed_test(world_size=[world_size]) - def _go(hidden_dim): with deepspeed.zero.Init(enabled=zero_stage == 3, config_dict_or_path=ds_config): model = SimpleModel(hidden_dim, nlayers=78) - print('total number of parameters:', - sum([p.numel() for p in model.parameters()])) see_memory_usage('pre-init', force=True) model, _, _, _ = deepspeed.initialize(model=model, config=ds_config) see_memory_usage('post-init', force=True) @@ -50,36 +49,32 @@ def _go(hidden_dim): hidden_dim=hidden_dim, device=model.device, dtype=torch.half) - print(f"optimizer={model.optimizer}") for batch in data_loader: model(batch[0], batch[1]) see_memory_usage('post-fwds', force=True) - _go(hidden_dim) - @pytest.mark.parametrize('optimizer_type', [None, Optimizer, Callable]) -def test_client_optimizer(tmpdir, optimizer_type): - def _optimizer_callable(params) -> Optimizer: - return AdamW(params=params) - - hidden_dim = 10 - model = SimpleModel(hidden_dim) - - config_dict = {'train_batch_size': 1} - if optimizer_type is None: - client_optimizer = None - config_dict['optimizer'] = {'type': ADAM_OPTIMIZER} - elif optimizer_type is Optimizer: - client_optimizer = Adam(model.parameters()) - else: - client_optimizer = _optimizer_callable - - args = args_from_dict(tmpdir, config_dict) +class TestClientOptimizer(DistributedTest): + world_size = 1 + + def test(self, optimizer_type): + def _optimizer_callable(params) -> Optimizer: + return AdamW(params=params) + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + config_dict = {'train_batch_size': 1} + if optimizer_type is None: + client_optimizer = None + config_dict['optimizer'] = {'type': ADAM_OPTIMIZER} + elif optimizer_type is Optimizer: + client_optimizer = Adam(model.parameters()) + else: + client_optimizer = _optimizer_callable - @distributed_test(world_size=[1]) - def _test_client_optimizer(args, model, client_optimizer): - _, ds_optimizer, _, _ = deepspeed.initialize(args=args, + _, ds_optimizer, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=list(model.parameters()), optimizer=client_optimizer) @@ -90,91 +85,65 @@ 
def _test_client_optimizer(args, model, client_optimizer): else: assert isinstance(ds_optimizer, AdamW) - _test_client_optimizer(args=args, model=model, client_optimizer=client_optimizer) - - -@pytest.mark.parametrize('scheduler_type, optimizer_type', - [(None, - None), - (None, - Optimizer), - (None, - Callable), - (_LRScheduler, - None), - (_LRScheduler, - Optimizer), - (_LRScheduler, - Callable), - (Callable, - None), - (Callable, - Optimizer), - (Callable, - Callable)]) -def test_client_lr_scheduler(tmpdir, scheduler_type, optimizer_type): - def _my_lambda(epoch): - return epoch // 10 - - def _optimizer_callable(params) -> Optimizer: - return torch.optim.AdamW(params=params) - - def _lr_scheduler_callable(optimizer) -> _LRScheduler: - return LambdaLR(optimizer, _my_lambda) - - hidden_dim = 10 - model = SimpleModel(hidden_dim) - - config_dict = {'train_batch_size': 1} - - client_optimizer = None - client_scheduler = None - - if optimizer_type is None: - config_dict['optimizer'] = {'type': ADAM_OPTIMIZER} - elif optimizer_type is Optimizer: - client_optimizer = torch.optim.Adam(model.parameters()) - else: - client_optimizer = _optimizer_callable - - if scheduler_type is None: - config_dict['scheduler'] = {'type': WARMUP_LR, 'params': {}} - elif scheduler_type == _LRScheduler: - if isinstance(client_optimizer, Optimizer): - client_scheduler = LambdaLR(client_optimizer, _my_lambda) + +@pytest.mark.parametrize("scheduler_type", [None, _LRScheduler, Callable]) +@pytest.mark.parametrize("optimizer_type", [None, Optimizer, Callable]) +class TestClientLrScheduler(DistributedTest): + def test(self, scheduler_type, optimizer_type): + def _my_lambda(epoch): + return epoch // 10 + + def _optimizer_callable(params) -> Optimizer: + return torch.optim.AdamW(params=params) + + def _lr_scheduler_callable(optimizer) -> _LRScheduler: + return LambdaLR(optimizer, _my_lambda) + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + config_dict = {'train_batch_size': 1} + + client_optimizer = None + client_scheduler = None + + if optimizer_type is None: + config_dict['optimizer'] = {'type': ADAM_OPTIMIZER} + elif optimizer_type is Optimizer: + client_optimizer = torch.optim.Adam(model.parameters()) else: - # Verify invalid combination is correctly handled - client_scheduler = LambdaLR(torch.optim.Adam(model.parameters()), _my_lambda) - else: - client_scheduler = _lr_scheduler_callable + client_optimizer = _optimizer_callable - args = args_from_dict(tmpdir, config_dict) + if scheduler_type is None: + config_dict['scheduler'] = {'type': WARMUP_LR, 'params': {}} + elif scheduler_type == _LRScheduler: + if isinstance(client_optimizer, Optimizer): + client_scheduler = LambdaLR(client_optimizer, _my_lambda) + else: + # Verify invalid combination is correctly handled + client_scheduler = LambdaLR(torch.optim.Adam(model.parameters()), + _my_lambda) + else: + client_scheduler = _lr_scheduler_callable - @distributed_test(world_size=[1]) - def _test_client_lr_scheduler(args, model, optimizer, lr_scheduler): - if isinstance(lr_scheduler, - _LRScheduler) and not isinstance(optimizer, + if isinstance(client_scheduler, + _LRScheduler) and not isinstance(client_optimizer, Optimizer): with pytest.raises(AssertionError): - _, _, _, _ = deepspeed.initialize(args=args, + _, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=list(model.parameters()), - optimizer=optimizer, - lr_scheduler=lr_scheduler) + optimizer=client_optimizer, + lr_scheduler=client_scheduler) else: - _, _, _, ds_lr_scheduler = 
deepspeed.initialize(args=args, + _, _, _, ds_lr_scheduler = deepspeed.initialize(config=config_dict, model=model, model_parameters=list(model.parameters()), - optimizer=optimizer, - lr_scheduler=lr_scheduler) - if lr_scheduler is None: + optimizer=client_optimizer, + lr_scheduler=client_scheduler) + if client_scheduler is None: assert isinstance(ds_lr_scheduler, WarmupLR) - elif isinstance(lr_scheduler, _LRScheduler): - assert ds_lr_scheduler == lr_scheduler + elif isinstance(client_scheduler, _LRScheduler): + assert ds_lr_scheduler == client_scheduler else: assert isinstance(ds_lr_scheduler, LambdaLR) - - _test_client_lr_scheduler(args=args, - model=model, - optimizer=client_optimizer, - lr_scheduler=client_scheduler) diff --git a/tests/unit/test_dynamic_loss_scale.py b/tests/unit/test_dynamic_loss_scale.py index 3d9209fcc76a..4682e2bd749d 100755 --- a/tests/unit/test_dynamic_loss_scale.py +++ b/tests/unit/test_dynamic_loss_scale.py @@ -1,8 +1,8 @@ import torch import deepspeed import numpy as np -from .common import distributed_test -from .simple_model import SimpleModel, args_from_dict +from tests.unit.common import DistributedTest +from tests.unit.simple_model import SimpleModel def run_model_step(model, gradient_list): @@ -13,30 +13,29 @@ def run_model_step(model, gradient_list): model.step() -def test_fused_no_overflow(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 +class TestFused(DistributedTest): + world_size = 1 + + def test_no_overflow(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 8, + "loss_scale_window": 2 } - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 8, - "loss_scale_window": 2 } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _test_fused_no_overflow(args): hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -54,33 +53,26 @@ def _test_fused_no_overflow(args): if optim.cur_iter % expected_scale_window == 0: expected_loss_scale *= 2 - _test_fused_no_overflow(args) - - -def test_fused_all_overflow(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 + def test_all_overflow(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 4, + "loss_scale_window": 2 } - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 4, - "loss_scale_window": 2 } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _test_fused_all_overflow(args): hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -96,33 +88,26 @@ def _test_fused_all_overflow(args): assert optim.cur_scale == expected_loss_scale assert optim.cur_iter == (i + 1) - _test_fused_all_overflow(args) - - -def test_fused_some_overflow(tmpdir): - config_dict = 
{ - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 + def test_some_overflow(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 8, + "loss_scale_window": 2 } - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 8, - "loss_scale_window": 2 } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _test_fused_some_overflow(args): hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -158,33 +143,30 @@ def _test_fused_some_overflow(args): assert optim.cur_scale == expected_loss_scale assert optim.cur_iter == expected_iteration - _test_fused_some_overflow(args) - -def test_unfused_no_overflow(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 +class TestUnfused(DistributedTest): + world_size = 1 + + def test_no_overflow(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 8, + "loss_scale_window": 2 } - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 8, - "loss_scale_window": 2 } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _test_unfused_no_overflow(args): hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) expected_loss_scale = 2**8 @@ -201,34 +183,27 @@ def _test_unfused_no_overflow(args): if optim.cur_iter % expected_scale_window == 0: expected_loss_scale *= 2 - _test_unfused_no_overflow(args) - - -def test_unfused_all_overflow(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 + def test_all_overflow(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 4, + "loss_scale_window": 2, + "min_loss_scale": 0.25 } - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 4, - "loss_scale_window": 2, - "min_loss_scale": 0.25 } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _test_unfused_all_overflow(args): hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -246,33 +221,26 @@ def _test_unfused_all_overflow(args): assert optim.cur_scale == expected_loss_scale assert optim.cur_iter == (i + 1) - _test_unfused_all_overflow(args) - - -def test_unfused_some_overflow(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 + def test_some_overflow(self): + config_dict = { + "train_batch_size": 
1, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 8, + "loss_scale_window": 2 } - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 8, - "loss_scale_window": 2 } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _test_unfused_some_overflow(args): hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -307,5 +275,3 @@ def _test_unfused_some_overflow(args): expected_loss_scale /= (2**len(overflow_gradients)) assert optim.cur_scale == expected_loss_scale assert optim.cur_iter == expected_iteration - - _test_unfused_some_overflow(args) diff --git a/tests/unit/test_lr_schedulers.py b/tests/unit/test_lr_schedulers.py index 49da0111d985..ca2dd849cb97 100755 --- a/tests/unit/test_lr_schedulers.py +++ b/tests/unit/test_lr_schedulers.py @@ -1,8 +1,8 @@ import torch import deepspeed import pytest -from .common import distributed_test -from .simple_model import SimpleModel, random_dataloader, args_from_dict +from tests.unit.common import DistributedTest +from tests.unit.simple_model import SimpleModel, random_dataloader from deepspeed.runtime.lr_schedules import LR_RANGE_TEST, LR_RANGE_TEST_MIN_LR, LR_RANGE_TEST_STEP_RATE, LR_RANGE_TEST_STEP_SIZE, LR_RANGE_TEST_STAIRCASE from deepspeed.runtime.lr_schedules import WARMUP_LR, WARMUP_MIN_LR, WARMUP_MAX_LR, WARMUP_NUM_STEPS, WARMUP_TYPE, WARMUP_LOG_RATE, WARMUP_LINEAR_RATE from deepspeed.runtime.lr_schedules import ONE_CYCLE, CYCLE_MIN_LR, CYCLE_MAX_LR, CYCLE_FIRST_STEP_SIZE, DECAY_LR_RATE, DECAY_STEP_SIZE @@ -42,30 +42,29 @@ def _verify_staircase_increase(values, step_size): }), (LR_RANGE_TEST, {})]) -def test_get_lr_before_train(tmpdir, scheduler_type, params): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 +class TestGetLrBeforeTrain(DistributedTest): + world_size = 1 + + def test(self, scheduler_type, params): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, }, - }, - "scheduler": { - "type": scheduler_type, - "params": params - }, - "gradient_clipping": 1.0 - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_get_lr_before_train(args, model, hidden_dim): - model, _, _, lr_scheduler = deepspeed.initialize(args=args, + "scheduler": { + "type": scheduler_type, + "params": params + }, + "gradient_clipping": 1.0 + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -80,65 +79,44 @@ def _test_get_lr_before_train(args, model, hidden_dim): model.backward(loss) model.step() - _test_get_lr_before_train(args=args, model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize("warmup_num_steps, warmup_type", - [ - (10, - WARMUP_LOG_RATE), - (15, - WARMUP_LOG_RATE), - (19, - WARMUP_LOG_RATE), - (33, - WARMUP_LOG_RATE), - (10, - WARMUP_LINEAR_RATE), - (15, - WARMUP_LINEAR_RATE), - (19, - WARMUP_LINEAR_RATE), - (33, - 
WARMUP_LINEAR_RATE),
-                         ])
-def test_lr_warmup_schedule(tmpdir, warmup_num_steps, warmup_type):
-    config_dict = {
-        "train_batch_size": 2,
-        "steps_per_print": 1,
-        "optimizer": {
-            "type": "Adam",
-            "params": {
-                "lr": 0.00015
+
+@pytest.mark.parametrize("warmup_num_steps", [10, 15, 19, 33])
+@pytest.mark.parametrize("warmup_type", [WARMUP_LOG_RATE, WARMUP_LINEAR_RATE])
+class TestLrSchedule(DistributedTest):
+    world_size = 1
+
+    def test_lr_warmup_schedule(self, warmup_num_steps, warmup_type):
+        config_dict = {
+            "train_batch_size": 2,
+            "steps_per_print": 1,
+            "optimizer": {
+                "type": "Adam",
+                "params": {
+                    "lr": 0.00015
+                },
             },
-        },
-        "scheduler": {
-            "type": WARMUP_LR,
-            "params": {
-                WARMUP_MIN_LR: 0.1,
-                WARMUP_MAX_LR: 0.2,
-                WARMUP_NUM_STEPS: warmup_num_steps,
-                WARMUP_TYPE: warmup_type,
-            }
-        },
-        "gradient_clipping": 1.0
-    }
-
-    total_num_steps = 2 * warmup_num_steps
-
-    args = args_from_dict(tmpdir, config_dict)
-    hidden_dim = 10
-
-    model = SimpleModel(hidden_dim, empty_grad=False)
-
-    @distributed_test(world_size=[1])
-    def _test_lr_warmup_schedule(args, model, hidden_dim, schedule_params, num_steps):
-        model, _, _, lr_scheduler = deepspeed.initialize(args=args,
+            "scheduler": {
+                "type": WARMUP_LR,
+                "params": {
+                    WARMUP_MIN_LR: 0.1,
+                    WARMUP_MAX_LR: 0.2,
+                    WARMUP_NUM_STEPS: warmup_num_steps,
+                    WARMUP_TYPE: warmup_type,
+                }
+            },
+            "gradient_clipping": 1.0
+        }
+        schedule_params = config_dict["scheduler"]["params"]
+        total_num_steps = 2 * warmup_num_steps
+        hidden_dim = 10
+
+        model = SimpleModel(hidden_dim, empty_grad=False)
+        model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict,
                                                          model=model,
                                                          model_parameters=model.parameters())
         data_loader = random_dataloader(model=model,
-                                        total_samples=num_steps * 2,
+                                        total_samples=total_num_steps * 2,
                                         hidden_dim=hidden_dim,
                                         device=model.device,
                                         dtype=torch.float)
@@ -160,72 +138,39 @@ def _test_lr_warmup_schedule(args, model, hidden_dim, schedule_params, num_steps
         # Verify post-warmup completion
         assert all([warmup_max_lr == lr for lr in step_lrs[warmup_num_steps:]])
 
-    _test_lr_warmup_schedule(args=args,
-                             model=model,
-                             hidden_dim=hidden_dim,
-                             schedule_params=config_dict["scheduler"]["params"],
-                             num_steps=total_num_steps)
-
-
-@pytest.mark.parametrize("warmup_num_steps, warmup_type",
-                         [
-                             (10,
-                              WARMUP_LOG_RATE),
-                             (15,
-                              WARMUP_LOG_RATE),
-                             (19,
-                              WARMUP_LOG_RATE),
-                             (33,
-                              WARMUP_LOG_RATE),
-                             (10,
-                              WARMUP_LINEAR_RATE),
-                             (15,
-                              WARMUP_LINEAR_RATE),
-                             (19,
-                              WARMUP_LINEAR_RATE),
-                             (33,
-                              WARMUP_LINEAR_RATE),
-                         ])
-def test_lr_warmup_decay_schedule(tmpdir, warmup_num_steps, warmup_type):
-    config_dict = {
-        "train_batch_size": 2,
-        "steps_per_print": 1,
-        "optimizer": {
-            "type": "Adam",
-            "params": {
-                "lr": 0.00015
+    def test_lr_warmup_decay_schedule(self, warmup_num_steps, warmup_type):
+        config_dict = {
+            "train_batch_size": 2,
+            "steps_per_print": 1,
+            "optimizer": {
+                "type": "Adam",
+                "params": {
+                    "lr": 0.00015
+                },
             },
-        },
-        "scheduler": {
-            "type": WARMUP_DECAY_LR,
-            "params": {
-                WARMUP_MIN_LR: 0.1,
-                WARMUP_MAX_LR: 0.2,
-                WARMUP_NUM_STEPS: warmup_num_steps,
-                TOTAL_NUM_STEPS: warmup_num_steps * 2,
-                WARMUP_TYPE: warmup_type
-            }
-        },
-        "gradient_clipping": 1.0
-    }
-
-    args = args_from_dict(tmpdir, config_dict)
-    hidden_dim = 10
-
-    model = SimpleModel(hidden_dim, empty_grad=False)
-
- 
@distributed_test(world_size=[1]) - def _test_lr_warmup_decay_schedule(args, - model, - hidden_dim, - schedule_params, - num_steps): - model, _, _, lr_scheduler = deepspeed.initialize(args=args, + "gradient_clipping": 1.0 + } + schedule_params = config_dict["scheduler"]["params"] + total_num_steps = schedule_params[TOTAL_NUM_STEPS] + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, - total_samples=num_steps * 2, + total_samples=total_num_steps * 2, hidden_dim=hidden_dim, device=model.device, dtype=torch.float) @@ -250,16 +195,6 @@ def _test_lr_warmup_decay_schedule(args, assert lr < previous_lr previous_lr = lr - schedule_params = config_dict["scheduler"]["params"] - - total_num_steps = schedule_params[TOTAL_NUM_STEPS] - - _test_lr_warmup_decay_schedule(args=args, - model=model, - hidden_dim=hidden_dim, - schedule_params=schedule_params, - num_steps=total_num_steps) - @pytest.mark.parametrize("scheduler_type,params", [(WARMUP_LR, @@ -281,30 +216,29 @@ def _test_lr_warmup_decay_schedule(args, LR_RANGE_TEST_MIN_LR: 1e-4, LR_RANGE_TEST_STEP_SIZE: 1 })]) -def test_scheduler_optimizer_parity(tmpdir, scheduler_type, params): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 +class TestSchedulerOptimizerParity(DistributedTest): + world_size = 1 + + def test(self, scheduler_type, params): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, }, - }, - "scheduler": { - "type": scheduler_type, - "params": params - }, - "gradient_clipping": 1.0 - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_scheduler_optimizer_parity(args, model, hidden_dim): - model, _, _, lr_scheduler = deepspeed.initialize(args=args, + "scheduler": { + "type": scheduler_type, + "params": params + }, + "gradient_clipping": 1.0 + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -318,8 +252,6 @@ def _test_scheduler_optimizer_parity(args, model, hidden_dim): model.step() assert lr_scheduler.get_lr() == model.get_lr() - _test_scheduler_optimizer_parity(args=args, model=model, hidden_dim=hidden_dim) - @pytest.mark.parametrize("min_lr, step_rate, step_size, staircase", [(1e-4, 1e-5, 1, True), @@ -329,35 +261,34 @@ def _test_scheduler_optimizer_parity(args, model, hidden_dim): (1e-2, 1e-2, 19, True), (1e-2, 1e-2, 19, False) ])# yapf: disable -def test_lr_range_test(tmpdir, min_lr, step_rate, step_size, staircase): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 +class TestLrRange(DistributedTest): + world_size = 1 + + def test(self, min_lr, step_rate, step_size, staircase): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": LR_RANGE_TEST, + "params": { + LR_RANGE_TEST_MIN_LR: min_lr, + LR_RANGE_TEST_STEP_RATE: step_rate, + LR_RANGE_TEST_STEP_SIZE: step_size, + 
LR_RANGE_TEST_STAIRCASE: staircase + } }, - }, - "scheduler": { - "type": LR_RANGE_TEST, - "params": { - LR_RANGE_TEST_MIN_LR: min_lr, - LR_RANGE_TEST_STEP_RATE: step_rate, - LR_RANGE_TEST_STEP_SIZE: step_size, - LR_RANGE_TEST_STAIRCASE: staircase - } - }, - "gradient_clipping": 1.0 - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_lr_range_test(args, model, hidden_dim, min_lr, step_size, staircase): - model, _, _, lr_scheduler = deepspeed.initialize(args=args, + "gradient_clipping": 1.0 + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -384,68 +315,49 @@ def _test_lr_range_test(args, model, hidden_dim, min_lr, step_size, staircase): # Verify continuous increasing lr _verify_continuous_increase(step_lrs) - _test_lr_range_test(args=args, - model=model, - hidden_dim=hidden_dim, - min_lr=[min_lr], - step_size=step_size, - staircase=staircase) - - -@pytest.mark.parametrize("min_lr, max_lr, decay_rate, cycle_step_size, decay_step_size", - [ - (1e-5, 1e-2, 1e-3, 10, 10), - (1e-3, 1e-1, 0, 21, 21), - (1e-5, 1e-2, 1e-3, 10, 10), - (1e-3, 1e-1, 1e-1, 21, 21), - (1e-5, 1e-1, 0, 10, 0), - ]) # yapf: disable -def test_onecycle_lr(tmpdir, - min_lr, - max_lr, - decay_rate, - cycle_step_size, - decay_step_size): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 + +class TestOneCycle(DistributedTest): + world_size = 1 + + @pytest.mark.parametrize("min_lr, max_lr, decay_rate, cycle_step_size, decay_step_size", + [ + (1e-5, 1e-2, 1e-3, 10, 10), + (1e-3, 1e-1, 0, 21, 21), + (1e-5, 1e-2, 1e-3, 10, 10), + (1e-3, 1e-1, 1e-1, 21, 21), + (1e-5, 1e-1, 0, 10, 0), + ]) # yapf: disable + def test_lr(self, min_lr, max_lr, decay_rate, cycle_step_size, decay_step_size): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": ONE_CYCLE, + "params": { + CYCLE_MIN_LR: min_lr, + CYCLE_MAX_LR: max_lr, + DECAY_LR_RATE: decay_rate, + CYCLE_FIRST_STEP_SIZE: cycle_step_size, + DECAY_STEP_SIZE: decay_step_size + } }, - }, - "scheduler": { - "type": ONE_CYCLE, - "params": { - CYCLE_MIN_LR: min_lr, - CYCLE_MAX_LR: max_lr, - DECAY_LR_RATE: decay_rate, - CYCLE_FIRST_STEP_SIZE: cycle_step_size, - DECAY_STEP_SIZE: decay_step_size - } - }, - "gradient_clipping": 1.0 - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_onecycle_lr(args, - model, - hidden_dim, - min_lr, - max_lr, - step_size, - decay_rate): - model, _, _, lr_scheduler = deepspeed.initialize(args=args, + "gradient_clipping": 1.0 + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=max(50, - step_size * 3), + cycle_step_size * 3), hidden_dim=hidden_dim, device=model.device, dtype=torch.float) @@ -461,72 +373,53 @@ def _test_onecycle_lr(args, assert step_lrs[0] == min_lr # Verify peak lr - assert step_lrs[step_size] == max_lr + assert 
step_lrs[cycle_step_size] == max_lr # Verify increasing phase - _verify_continuous_increase(step_lrs[:step_size]) + _verify_continuous_increase(step_lrs[:cycle_step_size]) # Verify decreasing phase - _verify_continuous_decrease(step_lrs[step_size:(step_size * 2)]) + _verify_continuous_decrease(step_lrs[cycle_step_size:(cycle_step_size * 2)]) # Verify decay phase if decay_rate > 0: - _verify_continuous_decrease(step_lrs[(step_size * 2):]) - - _test_onecycle_lr(args=args, - model=model, - hidden_dim=hidden_dim, - min_lr=[min_lr], - max_lr=[max_lr], - step_size=cycle_step_size, - decay_rate=decay_rate) - - -@pytest.mark.parametrize("min_mom, max_mom, decay_rate, step_size", - [ - (0.08, 0.09, 1e-3, 10), - (0.08, 0.09, 0, 21), - (0.08, 0.09, 1e-3, 10), - (0.08, 0.09, 0, 21), - ]) # yapf: disable -def test_onecycle_mom(tmpdir, min_mom, max_mom, decay_rate, step_size): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 + _verify_continuous_decrease(step_lrs[(cycle_step_size * 2):]) + + @pytest.mark.parametrize("min_mom, max_mom, decay_rate, step_size", + [ + (0.08, 0.09, 1e-3, 10), + (0.08, 0.09, 0, 21), + (0.08, 0.09, 1e-3, 10), + (0.08, 0.09, 0, 21), + ]) # yapf: disable + def test_mom(self, min_mom, max_mom, decay_rate, step_size): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": ONE_CYCLE, + "params": { + CYCLE_MIN_LR: 1e-3, + CYCLE_MAX_LR: 1e-2, + CYCLE_MIN_MOM: min_mom, + CYCLE_MAX_MOM: max_mom, + DECAY_MOM_RATE: decay_rate, + CYCLE_FIRST_STEP_SIZE: step_size, + DECAY_STEP_SIZE: step_size + } }, - }, - "scheduler": { - "type": ONE_CYCLE, - "params": { - CYCLE_MIN_LR: 1e-3, - CYCLE_MAX_LR: 1e-2, - CYCLE_MIN_MOM: min_mom, - CYCLE_MAX_MOM: max_mom, - DECAY_MOM_RATE: decay_rate, - CYCLE_FIRST_STEP_SIZE: step_size, - DECAY_STEP_SIZE: step_size - } - }, - "gradient_clipping": 1.0 - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_onecycle_mom(args, - model, - hidden_dim, - min_mom, - max_mom, - step_size, - decay_rate): - model, _, _, lr_scheduler = deepspeed.initialize(args=args, + "gradient_clipping": 1.0 + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + model, _, _, lr_scheduler = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -558,11 +451,3 @@ def _test_onecycle_mom(args, # Verify decay phase if decay_rate > 0: _verify_continuous_increase(step_moms[(step_size * 2):]) - - _test_onecycle_mom(args=args, - model=model, - hidden_dim=hidden_dim, - min_mom=min_mom, - max_mom=max_mom, - step_size=step_size, - decay_rate=decay_rate) diff --git a/tests/unit/test_moe.py b/tests/unit/test_moe.py index cb1a89b9a1eb..22bf8ba2caf7 100644 --- a/tests/unit/test_moe.py +++ b/tests/unit/test_moe.py @@ -1,9 +1,9 @@ import torch import deepspeed import pytest -from .common import distributed_test -from .simple_model import SimplePRMoEModel, args_from_dict, SimpleMoEModel, sequence_dataloader -from .util import required_torch_version +from tests.unit.common import distributed_test +from tests.unit.simple_model import SimplePRMoEModel, SimpleMoEModel, sequence_dataloader, args_from_dict +from tests.unit.util import required_torch_version try: from apex import amp # noqa: F401 diff 
--git a/tests/unit/test_moe_tp.py b/tests/unit/test_moe_tp.py index 60fbe9697da9..f8586d4daaea 100644 --- a/tests/unit/test_moe_tp.py +++ b/tests/unit/test_moe_tp.py @@ -1,118 +1,68 @@ import torch import deepspeed import pytest -from .common import distributed_test -from .simple_model import args_from_dict -from .util import required_torch_version +from tests.unit.common import DistributedTest +from tests.unit.util import required_torch_version from deepspeed.moe.layer import MoE -@pytest.mark.parametrize("ep_size, tp_size, enable_expert_tp, use_residual", - [ - (1, - 2, - False, - False), - (1, - 2, - True, - False), - (1, - 2, - False, - True), - (1, - 2, - True, - True), - (1, - 4, - False, - False), - (1, - 4, - True, - False), - (1, - 4, - False, - True), - (1, - 4, - True, - True), - (2, - 2, - False, - False), - (2, - 2, - True, - False), - (2, - 2, - False, - True), - (2, - 2, - True, - True), - ]) -def test_moe_tensor_parallel(tmpdir, ep_size, tp_size, enable_expert_tp, use_residual): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 16 +class MPU(): + def __init__(self, tp_world_size): + self.rank = deepspeed.comm.get_rank() + self.world_size = deepspeed.comm.get_world_size() + self.tp_world_size = tp_world_size - class MPU(): - def __init__(self, tp_world_size): - self.rank = deepspeed.comm.get_rank() - self.world_size = deepspeed.comm.get_world_size() - self.tp_world_size = tp_world_size + for i in range(0, self.world_size, tp_world_size): + ranks = range(i, i + tp_world_size) + group = deepspeed.comm.new_group(ranks) + if self.rank in ranks: + self.tp_group = group - for i in range(0, self.world_size, tp_world_size): - ranks = range(i, i + tp_world_size) - group = deepspeed.comm.new_group(ranks) - if self.rank in ranks: - self.tp_group = group + for i in range(0, tp_world_size): + ranks = range(i, self.world_size, tp_world_size) + group = deepspeed.comm.new_group(ranks) + if self.rank in ranks: + self.dp_group = group - for i in range(0, tp_world_size): - ranks = range(i, self.world_size, tp_world_size) - group = deepspeed.comm.new_group(ranks) - if self.rank in ranks: - self.dp_group = group + def get_model_parallel_rank(self): + return self.rank % self.tp_world_size - def get_model_parallel_rank(self): - return self.rank % self.tp_world_size + def get_model_parallel_world_size(self): + return self.tp_world_size - def get_model_parallel_world_size(self): - return self.tp_world_size + def get_data_parallel_rank(self): + return self.rank // self.tp_world_size - def get_data_parallel_rank(self): - return self.rank // self.tp_world_size + def get_data_parallel_world_size(self): + return self.world_size // self.tp_world_size - def get_data_parallel_world_size(self): - return self.world_size // self.tp_world_size + def get_data_parallel_group(self): + return self.dp_group - def get_data_parallel_group(self): - return self.dp_group + def get_model_parallel_group(self): + return self.tp_group - def get_model_parallel_group(self): - return self.tp_group - @distributed_test(world_size=[4]) - def _test_moe(args, hidden_dim, ep_size, tp_size, enable_expert_tp, use_residual): +@pytest.mark.parametrize("ep_size, tp_size", [(1, 2), (1, 4), (2, 2)]) +@pytest.mark.parametrize("enable_expert_tp", [True, False]) +@pytest.mark.parametrize("use_residual", [True, 
False]) +class TestMOETensorParallel(DistributedTest): + world_size = 4 + def test(self, ep_size, tp_size, enable_expert_tp, use_residual): # TODO: replace this with a true parallel mlp in the future # and run convergence tests + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + config_dict = { + "train_batch_size": 8, + "steps_per_print": 1, + "fp16": { + "enabled": True + } + } + hidden_dim = 16 tensor_parallel_expert = torch.nn.Sequential( torch.nn.Linear(hidden_dim, @@ -132,7 +82,7 @@ def _test_moe(args, hidden_dim, ep_size, tp_size, enable_expert_tp, use_residual enable_expert_tensor_parallelism=enable_expert_tp, ) optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer, dist_init_required=False, @@ -144,10 +94,3 @@ def _test_moe(args, hidden_dim, ep_size, tp_size, enable_expert_tp, use_residual ) == tp_size else: assert deepspeed.utils.groups._get_expert_model_parallel_world_size() == 1 - - _test_moe(args=args, - hidden_dim=hidden_dim, - ep_size=ep_size, - tp_size=tp_size, - enable_expert_tp=enable_expert_tp, - use_residual=use_residual) diff --git a/tests/unit/test_multi_output_model.py b/tests/unit/test_multi_output_model.py index deef776c0815..c8add428c0a4 100755 --- a/tests/unit/test_multi_output_model.py +++ b/tests/unit/test_multi_output_model.py @@ -1,9 +1,8 @@ import torch import deepspeed from pytest import approx -from .common import distributed_test -from .simple_model import args_from_dict -from .multi_output_model import MultiOutputModel, multi_output_dataloader +from tests.unit.common import DistributedTest +from tests.unit.multi_output_model import MultiOutputModel, multi_output_dataloader def create_config_dict(micro_batch_size, grad_accumulation_steps, world_size): @@ -24,23 +23,18 @@ def create_config_dict(micro_batch_size, grad_accumulation_steps, world_size): } -def test_two_output_model(tmpdir): - gradient_accumulation_steps = 2 - micro_batch_size = 1 +class TestMultiModelOutput(DistributedTest): world_size = 1 - config_dict = create_config_dict(micro_batch_size, - gradient_accumulation_steps, - world_size) - hidden_dim = 10 - weight_value = 0.1 - args = args_from_dict(tmpdir, config_dict) + def test_two(self, gradient_accumulation_steps=2, micro_batch_size=1): + config_dict = create_config_dict(micro_batch_size, + gradient_accumulation_steps, + self.world_size) + hidden_dim = 10 + weight_value = 0.1 - model = MultiOutputModel(hidden_dim, weight_value) - - @distributed_test(world_size=[1]) - def _test_two_output_model(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, + model = MultiOutputModel(hidden_dim, weight_value) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) total_samples = 4 @@ -74,26 +68,16 @@ def _test_two_output_model(args, model, hidden_dim): model.step() - _test_two_output_model(args=args, model=model, hidden_dim=hidden_dim) - - -def test_three_output_model(tmpdir): - gradient_accumulation_steps = 3 - micro_batch_size = 1 - world_size = 1 - config_dict = create_config_dict(micro_batch_size, - gradient_accumulation_steps, - world_size) - - hidden_dim = 10 - weight_value = 0.1 - args = args_from_dict(tmpdir, config_dict) + def test_three(self, gradient_accumulation_steps=3, micro_batch_size=1): + config_dict = create_config_dict(micro_batch_size, + 
gradient_accumulation_steps, + self.world_size) - model = MultiOutputModel(hidden_dim, weight_value) + hidden_dim = 10 + weight_value = 0.1 - @distributed_test(world_size=[1]) - def _test_three_output_model(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, + model = MultiOutputModel(hidden_dim, weight_value) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -131,5 +115,3 @@ def _test_three_output_model(args, model, hidden_dim): assert scaled_loss.item() == approx(expected_scaled_loss.item()) model.step() - - _test_three_output_model(args=args, model=model, hidden_dim=hidden_dim) diff --git a/tests/unit/test_partition.py b/tests/unit/test_partition.py index cf4852e477e2..df9b8d619a4f 100644 --- a/tests/unit/test_partition.py +++ b/tests/unit/test_partition.py @@ -8,46 +8,50 @@ from deepspeed.runtime.utils import prefix_sum_inc from deepspeed.runtime.utils import PartitionedTensor -from .common import distributed_test +from tests.unit.common import DistributedTest -@distributed_test(world_size=4) -def test_partitioned_tensor(): - world = dist.get_world_size() - rank = dist.get_rank() +class TestPartitionedTensor(DistributedTest): + world_size = 4 - group = dist.new_group(ranks=list(range(world))) + def test(self): + world = dist.get_world_size() + rank = dist.get_rank() - rows = world * 4 - cols = 3 + group = dist.new_group(ranks=list(range(world))) - full = torch.rand(rows, cols).cuda() - dist.broadcast(full, src=0, group=group) - part = PartitionedTensor(full, group=group) + rows = world * 4 + cols = 3 - assert len(part.local_size()) == 1 - assert part.local_size()[0] * world == full.numel() + full = torch.rand(rows, cols).cuda() + dist.broadcast(full, src=0, group=group) + part = PartitionedTensor(full, group=group) - reconstructed = part.full() - assert torch.equal(full, reconstructed) + assert len(part.local_size()) == 1 + assert part.local_size()[0] * world == full.numel() + reconstructed = part.full() + assert torch.equal(full, reconstructed) -@distributed_test(world_size=4) -def test_partitioned_tensor_meta(): - world = dist.get_world_size() - rank = dist.get_rank() - group = dist.new_group(ranks=list(range(world))) +class TestPartitionedTensorMeta(DistributedTest): + world_size = 4 - rows = world * 7 - cols = 3 + def test(self): + world = dist.get_world_size() + rank = dist.get_rank() - full = torch.rand(rows, cols).cuda() - dist.broadcast(full, src=0, group=group) - part = PartitionedTensor(full, group=group) + group = dist.new_group(ranks=list(range(world))) - my_meta = PartitionedTensor.from_meta(part.to_meta(), part.local_data, group) - assert torch.equal(full, my_meta.full()) + rows = world * 7 + cols = 3 + + full = torch.rand(rows, cols).cuda() + dist.broadcast(full, src=0, group=group) + part = PartitionedTensor(full, group=group) + + my_meta = PartitionedTensor.from_meta(part.to_meta(), part.local_data, group) + assert torch.equal(full, my_meta.full()) def assert_valid_partition(weights, parts, P): diff --git a/tests/unit/test_zero_context.py b/tests/unit/test_zero_context.py index a8fb31a8c8e5..5db48e3e0df9 100644 --- a/tests/unit/test_zero_context.py +++ b/tests/unit/test_zero_context.py @@ -8,7 +8,7 @@ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus, partitioned_param_data_shape import deepspeed.comm as dist -from .common import distributed_test, get_master_port +from tests.unit.common import DistributedTest, get_master_port def setup_serial_env(): @@ -27,42 
+27,46 @@ def test_scattered_init_dist(): assert dist.is_initialized() -@distributed_test(world_size=2) -def test_scatter_gather(): - with deepspeed.zero.Init(): - l = torch.nn.Linear(6, 3) - assert l.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE - assert l.weight.shape == torch.Size(partitioned_param_data_shape) +class TestScatterGather(DistributedTest): + world_size = 2 - # Ensure there is no impact outside the context - l2 = torch.nn.Linear(6, 3) - assert not hasattr(l2.weight, 'ds_status') - assert l2.weight.numel() == l2.in_features * l2.out_features + def test(self): + with deepspeed.zero.Init(): + l = torch.nn.Linear(6, 3) + assert l.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE + assert l.weight.shape == torch.Size(partitioned_param_data_shape) - with deepspeed.zero.GatheredParameters(l.weight): - assert l.weight.ds_status == ZeroParamStatus.AVAILABLE - assert l.weight.numel() == l.in_features * l.out_features + # Ensure there is no impact outside the context + l2 = torch.nn.Linear(6, 3) + assert not hasattr(l2.weight, 'ds_status') + assert l2.weight.numel() == l2.in_features * l2.out_features + with deepspeed.zero.GatheredParameters(l.weight): + assert l.weight.ds_status == ZeroParamStatus.AVAILABLE + assert l.weight.numel() == l.in_features * l.out_features -@distributed_test(world_size=2) -def test_gather_update(): - with deepspeed.zero.Init(): - l = torch.nn.Linear(4, 2) - assert l.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE - # Gather and make a change - with deepspeed.zero.GatheredParameters(l.weight, modifier_rank=1): - assert l.weight.ds_status == ZeroParamStatus.AVAILABLE - if dist.get_rank() == 1: - with torch.no_grad(): - l.weight.zero_() +class TestGatherUpdate(DistributedTest): + world_size = 2 + + def test(self): + with deepspeed.zero.Init(): + l = torch.nn.Linear(4, 2) + assert l.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE + + # Gather and make a change + with deepspeed.zero.GatheredParameters(l.weight, modifier_rank=1): + assert l.weight.ds_status == ZeroParamStatus.AVAILABLE + if dist.get_rank() == 1: + with torch.no_grad(): + l.weight.zero_() - # should now be scattered again + # should now be scattered again - # Now gather again and ensure the change is global - with deepspeed.zero.GatheredParameters(l.weight): - # all ranks compare - assert torch.equal(l.weight, torch.zeros_like(l.weight)) + # Now gather again and ensure the change is global + with deepspeed.zero.GatheredParameters(l.weight): + # all ranks compare + assert torch.equal(l.weight, torch.zeros_like(l.weight)) config = { @@ -362,28 +366,30 @@ def test_subclass_param_init(): assert torch.equal(model.param_grandpa, ones + 3) -@distributed_test(world_size=2) -def test_ds_init_w_zinit(): - ds_config = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 +class TestDSInitWZinit(DistributedTest): + world_size = 2 + + def test(self): + ds_config = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } } } - } - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - self.linear = torch.nn.Linear(4, 4) + class Model(torch.nn.Module): + def __init__(self): + super(Model, self).__init__() + self.linear = torch.nn.Linear(4, 4) - def magic(self): - return 42 + def magic(self): + return 42 - with deepspeed.zero.Init(): - model = Model() - engine, *_ = deepspeed.initialize(model=model, config=ds_config, model_parameters=model.parameters()) 
- assert engine.magic() == 42 + with deepspeed.zero.Init(): + model = Model() + engine, *_ = deepspeed.initialize(model=model, config=ds_config, model_parameters=model.parameters()) + assert engine.magic() == 42 From c60f640dfe3b959c6e61ad416488a4095ee3a802 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 12 Aug 2022 17:15:00 -0700 Subject: [PATCH 02/25] more test refactoring --- tests/unit/test_averaging.py | 98 +-- tests/unit/test_configurable_parallel.py | 6 +- tests/unit/test_curriculum_learning.py | 162 ++--- tests/unit/test_ignore_unused_parameters.py | 68 +- tests/unit/test_zero.py | 705 +++++++++----------- 5 files changed, 489 insertions(+), 550 deletions(-) diff --git a/tests/unit/test_averaging.py b/tests/unit/test_averaging.py index 35c39f4257af..e178554c1aa9 100644 --- a/tests/unit/test_averaging.py +++ b/tests/unit/test_averaging.py @@ -1,49 +1,57 @@ import torch import deepspeed -from .common import distributed_test - - -def test_sparse_adam(tmpdir): - config_dict = {"train_batch_size": 2, "steps_per_print": 1, "sparse_gradients": True} - - class Model(torch.nn.Module): - def __init__(self): - super().__init__() - self.emb = torch.nn.EmbeddingBag(10, 3, mode="sum", sparse=True) - self.linear = torch.nn.Linear(3, 1) - - def forward(self, x, offsets): - return self.linear(self.emb(x, offsets)) - - class Adam(torch.optim.Optimizer): - def __init__(self, dense_params, sparse_params): - super().__init__(dense_params + sparse_params, defaults={}) - self.adam = torch.optim.Adam(dense_params) - self.adam_sparse = torch.optim.SparseAdam(sparse_params) - - @torch.no_grad() - def step(self, closure=None): - loss_1 = self.adam.step(closure) - loss_2 = self.adam_sparse.step(closure) - - if loss_1 is not None and loss_2 is not None: - return loss_1 + loss_2 - return loss_1 or loss_2 - - def get_model_optimizer(): - torch.manual_seed(0) - model = Model() - optimizer = Adam(list(model.linear.parameters()), list(model.emb.parameters())) - return model, optimizer - - def get_data(device): - x = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long, device=device) - offsets = torch.tensor([0, 4], dtype=torch.long, device=device) - y = torch.tensor([[1.0], [0.0]], device=device) - return x, offsets, y - - @distributed_test(world_size=2) - def _test(): +from tests.unit.common import DistributedTest + + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.emb = torch.nn.EmbeddingBag(10, 3, mode="sum", sparse=True) + self.linear = torch.nn.Linear(3, 1) + + def forward(self, x, offsets): + return self.linear(self.emb(x, offsets)) + + +class Adam(torch.optim.Optimizer): + def __init__(self, dense_params, sparse_params): + super().__init__(dense_params + sparse_params, defaults={}) + self.adam = torch.optim.Adam(dense_params) + self.adam_sparse = torch.optim.SparseAdam(sparse_params) + + @torch.no_grad() + def step(self, closure=None): + loss_1 = self.adam.step(closure) + loss_2 = self.adam_sparse.step(closure) + + if loss_1 is not None and loss_2 is not None: + return loss_1 + loss_2 + return loss_1 or loss_2 + + +def get_model_optimizer(): + torch.manual_seed(0) + model = Model() + optimizer = Adam(list(model.linear.parameters()), list(model.emb.parameters())) + return model, optimizer + + +def get_data(device): + x = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long, device=device) + offsets = torch.tensor([0, 4], dtype=torch.long, device=device) + y = torch.tensor([[1.0], [0.0]], device=device) + return x, offsets, y + + +class 
TestSparseAdam(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "sparse_gradients": True + } model, optimizer = get_model_optimizer() loss = torch.nn.BCEWithLogitsLoss() engine, _, _, _ = deepspeed.initialize(model=model, @@ -69,5 +77,3 @@ def _test(): for k, v in engine.named_parameters(): grad = v.grad.to_dense() if v.grad.is_sparse else v.grad assert torch.allclose(grad, averaged_grads[k] * engine.world_size) - - _test() diff --git a/tests/unit/test_configurable_parallel.py b/tests/unit/test_configurable_parallel.py index f9ff67f578e0..c7ce77743312 100755 --- a/tests/unit/test_configurable_parallel.py +++ b/tests/unit/test_configurable_parallel.py @@ -5,9 +5,9 @@ import numpy as np import torch.multiprocessing as mp import deepspeed.comm as dist -from .common import distributed_test -from .megatron_model import get_gpt2_model, get_megatron_version -from .megatron_model import MockGPT2ModelPipe as GPT2ModelPipe +from tests.unit.common import distributed_test +from tests.unit.megatron_model import get_gpt2_model, get_megatron_version +from tests.unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe from deepspeed.utils import RepeatingLoader TORCH_MAJOR = int(torch.__version__.split('.')[0]) diff --git a/tests/unit/test_curriculum_learning.py b/tests/unit/test_curriculum_learning.py index cb7af95b1edb..d46753b50ea7 100644 --- a/tests/unit/test_curriculum_learning.py +++ b/tests/unit/test_curriculum_learning.py @@ -1,52 +1,52 @@ import deepspeed -from .common import distributed_test -from .simple_model import Curriculum_SimpleModel, random_dataloader, args_from_dict +from tests.unit.common import DistributedTest +from tests.unit.simple_model import Curriculum_SimpleModel, random_dataloader -def test_curriculum_scheduler_fixed_discrete(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01 - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - }, - "curriculum_learning": { - "enabled": True, - "curriculum_type": "seqlen", - "min_difficulty": 1, - "max_difficulty": 5, - "schedule_type": "fixed_discrete", - "schedule_config": { - "difficulty": [1, - 2, - 3, - 4, - 5], - "max_step": [2, - 4, - 6, - 8] +class TestCurriculumScheduler(DistributedTest): + world_size = 2 + + def test_fixed_discrete(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + "curriculum_learning": { + "enabled": True, + "curriculum_type": "seqlen", + "min_difficulty": 1, + "max_difficulty": 5, + "schedule_type": "fixed_discrete", + "schedule_config": { + "difficulty": [1, + 2, + 3, + 4, + 5], + "max_step": [2, + 4, + 6, + 8] + } } } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - ground_truths = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 4, 8: 4} - model = Curriculum_SimpleModel(hidden_dim) + hidden_dim = 10 + ground_truths = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 4, 8: 4} - @distributed_test(world_size=[1, 2]) - def _test_curriculum_scheduler_fixed_discrete(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, + model = Curriculum_SimpleModel(hidden_dim) + model, _, _, _ = 
deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -60,51 +60,42 @@ def _test_curriculum_scheduler_fixed_discrete(args, model, hidden_dim): true_seqlen = 5 if n + 1 in ground_truths: true_seqlen = ground_truths[n + 1] - print('at step {} the seqlen is {}'.format(n + 1, seqlen)) assert seqlen == true_seqlen, f"Incorrect curriculum schedule" - _test_curriculum_scheduler_fixed_discrete(args=args, - model=model, - hidden_dim=hidden_dim) - - -def test_curriculum_scheduler_fixed_linear(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01 - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - }, - "curriculum_learning": { - "enabled": True, - "curriculum_type": "seqlen", - "min_difficulty": 2, - "max_difficulty": 10, - "schedule_type": "fixed_linear", - "schedule_config": { - "total_curriculum_step": 8, - "difficulty_step": 2 + def test_fixed_linear(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + "curriculum_learning": { + "enabled": True, + "curriculum_type": "seqlen", + "min_difficulty": 2, + "max_difficulty": 10, + "schedule_type": "fixed_linear", + "schedule_config": { + "total_curriculum_step": 8, + "difficulty_step": 2 + } } } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - ground_truths = {1: 2, 2: 4, 3: 4, 4: 6, 5: 6, 6: 8, 7: 8, 8: 10, 9: 10, 10: 10} - model = Curriculum_SimpleModel(hidden_dim) + hidden_dim = 10 + ground_truths = {1: 2, 2: 4, 3: 4, 4: 6, 5: 6, 6: 8, 7: 8, 8: 10, 9: 10, 10: 10} - @distributed_test(world_size=[1, 2]) - def _test_curriculum_scheduler_fixed_linear(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, + model = Curriculum_SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -117,9 +108,4 @@ def _test_curriculum_scheduler_fixed_linear(args, model, hidden_dim): model.step() if n + 1 in ground_truths: true_seqlen = ground_truths[n + 1] - print('at step {} the seqlen is {}'.format(n + 1, seqlen)) assert seqlen == true_seqlen, f"Incorrect curriculum schedule" - - _test_curriculum_scheduler_fixed_linear(args=args, - model=model, - hidden_dim=hidden_dim) diff --git a/tests/unit/test_ignore_unused_parameters.py b/tests/unit/test_ignore_unused_parameters.py index fd1f427d1220..a10d0c614ffb 100644 --- a/tests/unit/test_ignore_unused_parameters.py +++ b/tests/unit/test_ignore_unused_parameters.py @@ -1,47 +1,45 @@ import pytest -from .common import distributed_test -from .simple_model import UnusedParametersModel, random_dataloader, args_from_dict +from tests.unit.common import DistributedTest +from tests.unit.simple_model import UnusedParametersModel, random_dataloader from deepspeed.ops.op_builder import CPUAdamBuilder import deepspeed @pytest.mark.parametrize('ignore_unused_parameters', [False, True]) -def test_stage2_ignore_unused_parameters(tmpdir, ignore_unused_parameters): - use_cpu_offload = True - - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - 
pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": use_cpu_offload, - "ignore_unused_parameters": ignore_unused_parameters - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 +class TestStage2IgnoreUnusedParameters(DistributedTest): + world_size = 1 + + def test(self, ignore_unused_parameters): + use_cpu_offload = True + + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": use_cpu_offload, + "ignore_unused_parameters": ignore_unused_parameters + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 4 + hidden_dim = 4 - model = UnusedParametersModel(hidden_dim=hidden_dim) - - @distributed_test(world_size=[1]) - def _test_stage2_ignore_unused_parameters(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, + model = UnusedParametersModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -62,5 +60,3 @@ def _loop(): with pytest.raises(AssertionError) as e: _loop() assert e.value.args and 'ignore_unused_parameters' in e.value.args[0] - - _test_stage2_ignore_unused_parameters(args=args, model=model, hidden_dim=hidden_dim) diff --git a/tests/unit/test_zero.py b/tests/unit/test_zero.py index b580fc4eaaa5..be4ade46ecf0 100755 --- a/tests/unit/test_zero.py +++ b/tests/unit/test_zero.py @@ -9,8 +9,8 @@ from torch.nn.modules.loss import L1Loss from torch.nn.parameter import Parameter -from .common import distributed_test -from .simple_model import SimpleModel, random_dataloader +from tests.unit.common import DistributedTest +from tests.unit.simple_model import SimpleModel, random_dataloader import deepspeed from deepspeed.runtime.engine import DeepSpeedEngine @@ -44,32 +44,31 @@ def dump_state_dict(model): @pytest.mark.parametrize('zero_stage', [1, 2, 3]) -def test_zero_unbalanced_gradients(tmpdir, zero_stage): - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": zero_stage - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 +class TestZeroUnbalancedGradients(DistributedTest): + world_size = 1 + + def test(self, zero_stage): + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 } - } + hidden_dim = 4 - hidden_dim = 4 - - model = SimpleModel(hidden_dim=hidden_dim) - - @distributed_test(world_size=[1]) - def _test_zero_unbalanced_gradients(model, hidden_dim): + model = SimpleModel(hidden_dim=hidden_dim) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -80,53 +79,48 @@ def 
_test_zero_unbalanced_gradients(model, hidden_dim): run_unbalanced_gradients(model, data_loader) - _test_zero_unbalanced_gradients(model=model, hidden_dim=hidden_dim) - # testing the fix https://github.com/microsoft/DeepSpeed/pull/1227 -@pytest.mark.parametrize('zero_stage', [3]) -def test_zero3_repeat_forward_loop(tmpdir, zero_stage): - - # force all params to be partitioned by forcing threshold=0 - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": zero_stage, - "stage3_param_persistence_threshold": 0 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 +class TestZero3RepeatForwardLoop(DistributedTest): + world_size = 1 + + def test(self, zero_stage=3): + # force all params to be partitioned by forcing threshold=0 + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage, + "stage3_param_persistence_threshold": 0 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 } - } - - hidden_dim = 4 - - class AlbertLikeModel(torch.nn.Module): - def __init__(self, hidden_dim): - super().__init__() - self.linear = torch.nn.Linear(hidden_dim, hidden_dim) - self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + hidden_dim = 4 - def forward(self, x, y): - # run the same layer multiple times in a loop - to test a stack of forwards, followed by a stack of backwards - hidden = x - for i in range(3): - hidden = hidden + self.linear(hidden) - return self.cross_entropy_loss(hidden, y) + class AlbertLikeModel(torch.nn.Module): + def __init__(self, hidden_dim): + super().__init__() + self.linear = torch.nn.Linear(hidden_dim, hidden_dim) + self.cross_entropy_loss = torch.nn.CrossEntropyLoss() - model = AlbertLikeModel(hidden_dim=hidden_dim) + def forward(self, x, y): + # run the same layer multiple times in a loop - to test a stack of forwards, followed by a stack of backwards + hidden = x + for i in range(3): + hidden = hidden + self.linear(hidden) + return self.cross_entropy_loss(hidden, y) - @distributed_test(world_size=[1]) - def _test_zero3_repeat_forward_loop(model, hidden_dim): + model = AlbertLikeModel(hidden_dim=hidden_dim) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -140,39 +134,36 @@ def _test_zero3_repeat_forward_loop(model, hidden_dim): model.backward(loss) model.step() - _test_zero3_repeat_forward_loop(model=model, hidden_dim=hidden_dim) - # testing the fix https://github.com/microsoft/DeepSpeed/pull/1227 # also reproduces the https://github.com/microsoft/DeepSpeed/pull/1372 @pytest.mark.parametrize('zero_stage', [2, 3]) -def test_zero_to_fp32_1_param_group(tmpdir, zero_stage): - - # XXX: ideally refactor with the 2_param_group test as 75% is the same - - # force all params to be partitioned by forcing threshold=0 - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": zero_stage, - "stage3_param_persistence_threshold": 0 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 +class TestZeroToFP32(DistributedTest): + world_size = 2 + + def test_1_param_group(self, tmpdir, zero_stage): + # XXX: ideally refactor with the 2_param_group test as 75% is the same + # 
force all params to be partitioned by forcing threshold=0 + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage, + "stage3_param_persistence_threshold": 0 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 } - } - @distributed_test(world_size=[2]) - def _test_zero_to_fp32(): class MyModel(torch.nn.Module): def __init__(self, hidden_dim, n_layers): super().__init__() @@ -240,39 +231,31 @@ def forward(self, x, y): assert torch.allclose(orig_state_dict[name].float(), fp32_state_dict[name].float()) - _test_zero_to_fp32() - - -@pytest.mark.parametrize('zero_stage', [2, 3]) -def test_zero_to_fp32_2_param_groups(tmpdir, zero_stage): - - # TODO: - # - need to test with multiple param groups - - # force all params to be partitioned by forcing threshold=0 - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_allow_untested_optimizer": 1, - "zero_optimization": { - "stage": zero_stage, - "stage3_param_persistence_threshold": 0 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 + def test_2_param_groups(self, tmpdir, zero_stage): + # TODO: + # - need to test with multiple param groups + # force all params to be partitioned by forcing threshold=0 + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_allow_untested_optimizer": 1, + "zero_optimization": { + "stage": zero_stage, + "stage3_param_persistence_threshold": 0 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 } - } - @distributed_test(world_size=[2]) - def _test_zero_to_fp32(): class MyModel(torch.nn.Module): def __init__(self, hidden_dim, n_layers): super().__init__() @@ -347,37 +330,34 @@ def forward(self, x, y): assert torch.allclose(orig_state_dict[name].float(), fp32_state_dict[name].float()) - _test_zero_to_fp32() - - -@pytest.mark.parametrize('zero_stage, allgather_bucket_size', [(2, 1000), (2, 1001)]) -def test_incorrect_allgather_bucket_size(tmpdir, zero_stage, allgather_bucket_size): - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": zero_stage, - "allgather_bucket_size": allgather_bucket_size - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 - } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } - } - hidden_dim = 4 +@pytest.mark.parametrize("allgather_bucket_size", [1000, 1001]) +class TestIncorectAllgatherBucketSize(DistributedTest): + world_size = 1 - model = SimpleModel(hidden_dim=hidden_dim) + def test(self, allgather_bucket_size, zero_stage=2): + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage, + "allgather_bucket_size": allgather_bucket_size + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + hidden_dim = 4 - @distributed_test(world_size=[1]) - def _test_incorrect_allgather_bucket_size(model, hidden_dim): + model = 
SimpleModel(hidden_dim=hidden_dim) if allgather_bucket_size % 2 == 0: model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, @@ -390,36 +370,32 @@ def _test_incorrect_allgather_bucket_size(model, hidden_dim): assert "allgather_bucket_size must be a multiple of nccl_start_alignment_factor" in str( assertinfo) - _test_incorrect_allgather_bucket_size(model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('zero_stage, world_size', [(2, 2), (2, 3), (2, 4)]) -def test_partition_nccl_alignment(tmpdir, zero_stage, world_size): - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": zero_stage - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 - } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } - } - hidden_dim = 4 +class TestPartitionNcclAlignment(DistributedTest): + world_size = 4 - model = SimpleModel(hidden_dim=hidden_dim) + def test(self, zero_stage=2): + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + hidden_dim = 4 - @distributed_test(world_size=world_size) - def _test_partition_nccl_alignment(model, hidden_dim): + model = SimpleModel(hidden_dim=hidden_dim) model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -434,8 +410,6 @@ def _test_partition_nccl_alignment(model, hidden_dim): assert (partitioned_data.data_ptr() % (2 * nccl_start_alignment_factor) == 0) - _test_partition_nccl_alignment(model=model, hidden_dim=hidden_dim) - def _ds_initialize_for_param_partitioning_testing(model: Module, cfg: dict) -> DeepSpeedEngine: @@ -547,16 +521,17 @@ def forward(self, @pytest.mark.parametrize("offload_optimizer", [True, False]) @pytest.mark.parametrize("zero_grad", [True, False]) @pytest.mark.parametrize("prefetching", [True, False]) -def test_zero3_param_partitioning_base( - param_persistence_threshold: int, - fp16_enabled: bool, - contiguous_gradients: bool, - offload_optimizer: bool, - zero_grad: bool, - prefetching: bool, -) -> None: - @distributed_test(world_size=[2]) - def _test_zero3_param_partitioning(): +class TestZero3ParamPartitioningBase(DistributedTest): + world_size = 2 + + def test( + param_persistence_threshold: int, + fp16_enabled: bool, + contiguous_gradients: bool, + offload_optimizer: bool, + zero_grad: bool, + prefetching: bool, + ) -> None: if offload_optimizer and not contiguous_gradients: return @@ -752,35 +727,30 @@ def create_tensor(vals, dtype: torch.dtype = None) -> Tensor: _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE}) assert not math.isclose(ds_engine.optimizer._global_grad_norm, 0.0) - _test_zero3_param_partitioning() - -@pytest.mark.parametrize("world_sz", [1, 2, 4]) -@pytest.mark.parametrize("param_sz", [8100]) @pytest.mark.parametrize("init_context_manager", [True, False]) -def test_zero3_param_partitioning_large_param(world_sz: int, - param_sz: int, - init_context_manager: bool) -> None: - class LargeParamModel(Module): - def __init__(self): - super().__init__() - self.param = Parameter(torch.zeros((param_sz, ), dtype=torch.float32)) - - # only do weight initialization on root rank to - # make sure we are broadcasting correctly from rank 0 - if dist.get_rank() == 0: - 
partition_sz = math.ceil(self.param.numel() / dist.get_world_size()) - offset = 0 - for rank in range(dist.get_world_size()): - with torch.no_grad(): - self.param[offset:offset + partition_sz].fill_(rank) - offset += partition_sz +class TestZero3ParamPartitioningLargeParam(DistributedTest): + world_size = 4 - def forward(self, x: Tensor) -> Tensor: - return x * self.param + def test(self, init_context_manager: bool, param_sz: int = 8100) -> None: + class LargeParamModel(Module): + def __init__(self): + super().__init__() + self.param = Parameter(torch.zeros((param_sz, ), dtype=torch.float32)) + + # only do weight initialization on root rank to + # make sure we are broadcasting correctly from rank 0 + if dist.get_rank() == 0: + partition_sz = math.ceil(self.param.numel() / dist.get_world_size()) + offset = 0 + for rank in range(dist.get_world_size()): + with torch.no_grad(): + self.param[offset:offset + partition_sz].fill_(rank) + offset += partition_sz + + def forward(self, x: Tensor) -> Tensor: + return x * self.param - @distributed_test(world_size=[world_sz]) - def _distributed_test(): ds_config = { "train_micro_batch_size_per_gpu": 1, "zero_optimization": { @@ -811,7 +781,7 @@ def _distributed_test(): dtype=torch.float16, device=ds_engine.device)) - partition_sz = math.ceil(param_sz / world_sz) + partition_sz = math.ceil(param_sz / self.world_size) for rank_idx, start_idx in enumerate(range(0, param_sz, partition_sz)): activation_from_partition = activation[start_idx:start_idx + partition_sz] @@ -832,50 +802,46 @@ def _distributed_test(): assert torch.allclose(weight_gradient, expected_weight_gradient) - _distributed_test() - -@pytest.mark.parametrize("world_sz", [1, 2, 4]) @pytest.mark.parametrize("param_sz", [100, 1_000, 10_000]) @pytest.mark.parametrize("n_layers", [100, 1_000]) @pytest.mark.parametrize("init_context_manager", [True, False]) -def test_zero3_param_partitioning_many_params(world_sz: int, - param_sz: int, - n_layers: int, - init_context_manager: bool) -> None: - class ManyParamModel(Module): - def __init__(self) -> None: - super().__init__() - - self.modulelist = ModuleList( - EltwiseMultiplicationModule( - weight=Parameter(torch.empty((param_sz, - ), - dtype=torch.float32))) - for _ in range(n_layers)) - - for layer_num, module in enumerate(self.modulelist): - with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0): - param: Parameter = module.weight - partition_sz = math.ceil(param.numel() / dist.get_world_size()) - offset = 0 - for rank in range(dist.get_world_size()): - with torch.no_grad(): - param[offset:offset + partition_sz].fill_(2 * layer_num * - rank) - offset += partition_sz - - def forward(self, x: Tensor) -> Tensor: - activations = [] +class TestZero3ParamPartitioningManyParams(DistributedTest): + world_size = 4 - for module in self.modulelist: - x = module(x) - activations.append(x) + def test(self, param_sz: int, n_layers: int, init_context_manager: bool) -> None: + class ManyParamModel(Module): + def __init__(self) -> None: + super().__init__() - return activations + self.modulelist = ModuleList( + EltwiseMultiplicationModule( + weight=Parameter(torch.empty((param_sz, + ), + dtype=torch.float32))) + for _ in range(n_layers)) + + for layer_num, module in enumerate(self.modulelist): + with deepspeed.zero.GatheredParameters(module.weight, + modifier_rank=0): + param: Parameter = module.weight + partition_sz = math.ceil(param.numel() / dist.get_world_size()) + offset = 0 + for rank in range(dist.get_world_size()): + with torch.no_grad(): + 
param[offset:offset + partition_sz].fill_(2 * layer_num * + rank) + offset += partition_sz + + def forward(self, x: Tensor) -> Tensor: + activations = [] + + for module in self.modulelist: + x = module(x) + activations.append(x) + + return activations - @distributed_test(world_size=[world_sz]) - def _distributed_test(): ds_cfg = { "train_micro_batch_size_per_gpu": 1, "zero_optimization": { @@ -911,7 +877,7 @@ def _distributed_test(): device=ds_engine.device)) assert len(activations) == n_layers - partition_sz = math.ceil(param_sz / world_sz) + partition_sz = math.ceil(param_sz / self.world_size) expected_activations = torch.empty(param_sz, dtype=torch.float16, device=ds_engine.device) @@ -933,26 +899,24 @@ def _distributed_test(): for layer_num, activation in enumerate(weight_gradients): pass - _distributed_test() +class TestZero3InitForParentWeightInitialization(DistributedTest): + world_size = 4 -@pytest.mark.parametrize("world_sz", [1, 2, 4]) -def test_zero3_init_for_parent_weight_initialization(world_sz): - class ModelWhereParentInitializesChildWeights(Module): - def __init__(self) -> None: - super().__init__() + def test(self): + class ModelWhereParentInitializesChildWeights(Module): + def __init__(self) -> None: + super().__init__() - self.linear = Linear(12, 1) + self.linear = Linear(12, 1) - self.apply(self.__init_weights) + self.apply(self.__init_weights) - def __init_weights(self, module): - if isinstance(module, Linear): - with torch.no_grad(): - module.weight.fill_(1 + dist.get_rank()) + def __init_weights(self, module): + if isinstance(module, Linear): + with torch.no_grad(): + module.weight.fill_(1 + dist.get_rank()) - @distributed_test(world_size=[world_sz]) - def _distributed_test(): ds_cfg = { "train_micro_batch_size_per_gpu": 1, "zero_optimization": { @@ -978,30 +942,26 @@ def _distributed_test(): enabled=True): model = ModelWhereParentInitializesChildWeights() - assert model.linear.weight.ds_tensor.numel() == math.ceil(12 / world_sz) + assert model.linear.weight.ds_tensor.numel() == math.ceil(12 / self.world_size) assert torch.allclose(model.linear.weight.ds_tensor, torch.full_like(model.linear.weight.ds_tensor, 1)) - _distributed_test() - -@pytest.mark.skip( - reason="depends on upgraded pytorch and nccl that isn't always available") @pytest.mark.parametrize("param_persistence_threshold", [0, 10]) @pytest.mark.parametrize("contiguous_gradients", [True, False]) @pytest.mark.parametrize("offload_optimizer", [True, False]) -@pytest.mark.parametrize("zero_grad", [True]) -@pytest.mark.parametrize("iteration", list(range(1))) -def test_zero3_param_partitioning_base_bf16( - param_persistence_threshold: int, - contiguous_gradients: bool, - offload_optimizer: bool, - zero_grad: bool, - iteration: int, -) -> None: - @distributed_test(world_size=[2]) - def _test_zero3_param_partitioning(): +class TestZero3ParamPartitioningBaseBF16(DistributedTest): + world_size = 2 + + def test( + self, + param_persistence_threshold: int, + contiguous_gradients: bool, + offload_optimizer: bool, + zero_grad: bool = True, + iteration: int = 0, + ) -> None: if offload_optimizer and not contiguous_gradients: return @@ -1187,36 +1147,34 @@ def create_tensor(vals): ds_engine.optimizer.step() _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE}) - _test_zero3_param_partitioning() +class TestZeroOffloadStage1(DistributedTest): + world_size = 2 -def test_zero_offload_stage1(): - config_dict = { - "train_batch_size": 4, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - 
"optimizer": { - "type": "Adam", - "params": { - "lr": 1e-4 - } - }, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": 1, - "offload_optimizer": { - "device": "cpu" + def test(self): + config_dict = { + "train_batch_size": 4, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4 + } + }, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": 1, + "offload_optimizer": { + "device": "cpu" + } } } - } + hidden_dim = 10 - hidden_dim = 10 - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[2]) - def _go(model, hidden_dim): + model = SimpleModel(hidden_dim) model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) @@ -1230,50 +1188,49 @@ def _go(model, hidden_dim): model.backward(loss) model.step() - _go(model=model, hidden_dim=hidden_dim) - @pytest.mark.parametrize('return_type', [tuple, list, dict]) -def test_z3_dict_fwd(return_type): - config_dict = { - "train_batch_size": 4, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-4 +class TestZero3DictFwd(DistributedTest): + world_size = 1 + + def test(self, return_type): + config_dict = { + "train_batch_size": 4, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4 + } + }, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": 3 } - }, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": 3 } - } - hidden_dim = 10 - - class MyModel(torch.nn.Module): - def __init__(self, hidden_dim): - super(MyModel, self).__init__() - self.l1 = torch.nn.Linear(hidden_dim, hidden_dim) - self.cel = torch.nn.CrossEntropyLoss() - - def forward(self, x, y): - x = self.l1(x) - loss = self.cel(x, y) - if return_type == dict: - val = {'a': x, 'loss': loss, 'b': 1, 'c': None} - elif return_type == list: - val = [x, loss] - elif return_type == tuple: - val = (x, loss) - else: - raise NotImplementedError - return val + hidden_dim = 10 + + class MyModel(torch.nn.Module): + def __init__(self, hidden_dim): + super(MyModel, self).__init__() + self.l1 = torch.nn.Linear(hidden_dim, hidden_dim) + self.cel = torch.nn.CrossEntropyLoss() + + def forward(self, x, y): + x = self.l1(x) + loss = self.cel(x, y) + if return_type == dict: + val = {'a': x, 'loss': loss, 'b': 1, 'c': None} + elif return_type == list: + val = [x, loss] + elif return_type == tuple: + val = (x, loss) + else: + raise NotImplementedError + return val - @distributed_test(world_size=[1]) - def _go(hidden_dim): with deepspeed.zero.Init(): model = MyModel(hidden_dim) @@ -1294,40 +1251,36 @@ def _go(hidden_dim): model.backward(loss) model.step() - _go(hidden_dim) - @pytest.mark.parametrize('zero_stage', [1, 2, 3]) -def test_zero_adam_optimizer_step_count(tmpdir, zero_stage): - - # force all params to be partitioned by forcing threshold=0 - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": zero_stage, - "stage3_param_persistence_threshold": 0, - "sub_group_size": 4, - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 +class TestZeroAdamOptimizerStepCount(DistributedTest): + world_size = 1 + + def test(self, zero_stage): + # force all params to be partitioned by forcing threshold=0 + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": 
zero_stage, + "stage3_param_persistence_threshold": 0, + "sub_group_size": 4, + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 } - } - - hidden_dim = 4 + hidden_dim = 4 - model = SimpleModel(hidden_dim=hidden_dim, nlayers=12) - - @distributed_test(world_size=[1]) - def _test_zero_adam_optimizer_step_count_loop(model, hidden_dim): + model = SimpleModel(hidden_dim=hidden_dim, nlayers=12) model, optimizer, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -1354,5 +1307,3 @@ def _test_zero_adam_optimizer_step_count_loop(model, hidden_dim): state = optimizer.optimizer.state[param] step_counts.append(state['step']) assert all(step == step_counts[0] for step in step_counts) - - _test_zero_adam_optimizer_step_count_loop(model=model, hidden_dim=hidden_dim) From cd2c6fe9d43cfc2f30f1a9fd06ae89ce27ebbe0f Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Mon, 15 Aug 2022 09:24:54 -0700 Subject: [PATCH 03/25] fp16 test refactor --- tests/unit/test_fp16.py | 1098 +++++++++++++++++---------------------- 1 file changed, 464 insertions(+), 634 deletions(-) diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py index 43d76994b38d..66d1bb737ff0 100755 --- a/tests/unit/test_fp16.py +++ b/tests/unit/test_fp16.py @@ -3,10 +3,10 @@ import deepspeed import pytest from deepspeed.ops.adam import FusedAdam -from .common import distributed_test +from tests.unit.common import DistributedTest from deepspeed.ops.op_builder import CPUAdamBuilder -from .simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args, SimpleMoEModel, sequence_dataloader -from .util import required_torch_version +from tests.unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader, SimpleMoEModel, sequence_dataloader +from tests.unit.util import required_torch_version try: from apex import amp # noqa: F401 @@ -17,26 +17,25 @@ reason="apex/amp is not installed") -def test_lamb_fp32_grad_clip(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0 - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) +class TestLambFP32GradClip(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0 + } + hidden_dim = 10 - @distributed_test(world_size=[1, 2]) - def _test_lamb_fp32_grad_clip(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -49,32 +48,29 @@ def _test_lamb_fp32_grad_clip(args, model, hidden_dim): model.backward(loss) model.step() - _test_lamb_fp32_grad_clip(args=args, model=model, hidden_dim=hidden_dim) - -def test_lamb_fp16_basic(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 +class TestLambFP16(DistributedTest): + world_size = 2 + + def test__basic(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + 
"type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) + hidden_dim = 10 - @distributed_test(world_size=[1, 2]) - def _test_lamb_fp16_basic(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -86,32 +82,25 @@ def _test_lamb_fp16_basic(args, model, hidden_dim): model.backward(loss) model.step() - _test_lamb_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) - - -def test_lamb_fp16_empty_grad(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 + def test_empty_grad(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=True) + hidden_dim = 10 - @distributed_test(world_size=[2]) - def _test_lamb_fp16_empty_grad(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, + model = SimpleModel(hidden_dim, empty_grad=True) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -123,32 +112,29 @@ def _test_lamb_fp16_empty_grad(args, model, hidden_dim): model.backward(loss) model.step() - _test_lamb_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim) - -def test_adam_fp32_empty_grad(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 +class TestAdamFP32EmptyGrad(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": False } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": False } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=True) + hidden_dim = 10 - @distributed_test(world_size=[2]) - def _test_adam_fp32_empty_grad(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, + model = SimpleModel(hidden_dim, empty_grad=True) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -161,26 +147,23 @@ def _test_adam_fp32_empty_grad(args, model, hidden_dim): model.backward(loss) model.step() - _test_adam_fp32_empty_grad(args=args, model=model, hidden_dim=hidden_dim) +class TestAdamwFP16Basic(DistributedTest): + world_size = 1 -def test_adamw_fp16_basic(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True + def test(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "fp16": { + "enabled": True + } } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - 
model = SimpleModel(hidden_dim) + hidden_dim = 10 - @distributed_test(world_size=[1]) - def _test_adamw_fp16_basic(args, model, hidden_dim): + model = SimpleModel(hidden_dim) optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) data_loader = random_dataloader(model=model, @@ -192,38 +175,36 @@ def _test_adamw_fp16_basic(args, model, hidden_dim): model.backward(loss) model.step() - _test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) +class TestFP16OptimizerForMoE(DistributedTest): + world_size = 2 -def test_unfused_fp16_optimizer_gradnorm_for_moe(tmpdir, monkeypatch): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + def test_unfused_gradnorm(self, monkeypatch): + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "fp16": { - "enabled": True + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "fp16": { + "enabled": True + } } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - def mock_unscale_and_clip_grads(total_norm, apply_scale=True): - torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) - all_gather_results = [ - torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) - ] - dist.all_gather(all_gather_results, torch_norm_tensor) - assert len(set([x.item() for x in all_gather_results])) == 1 - return 1.0 - - @distributed_test(world_size=[2]) - def _test_unfused_fp16_optimizer(args, hidden_dim): + hidden_dim = 10 + + def mock_unscale_and_clip_grads(total_norm, apply_scale=True): + torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) + all_gather_results = [ + torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) + ] + dist.all_gather(all_gather_results, torch_norm_tensor) + assert len(set([x.item() for x in all_gather_results])) == 1 + return 1.0 + # initialize MoE model = SimpleMoEModel(hidden_dim, ep_size=2) optimizer = torch.optim.AdamW(params=model.parameters()) - engine, optimizer, _, _ = deepspeed.initialize(args=args, + engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer, dist_init_required=False) @@ -239,39 +220,33 @@ def _test_unfused_fp16_optimizer(args, hidden_dim): engine.backward(loss) engine.step() - _test_unfused_fp16_optimizer(args=args, hidden_dim=hidden_dim) + def test_fused_gradnorm(self, monkeypatch): + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "fp16": { + "enabled": True + } + } + hidden_dim = 10 -def test_fused_fp16_optimizer_gradnorm_for_moe(tmpdir, monkeypatch): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True): + torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) + all_gather_results = [ + torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) + ] + dist.all_gather(all_gather_results, torch_norm_tensor) + assert len(set([x.item() for x in all_gather_results])) == 1 + return 1.0 - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "fp16": { - "enabled": True - 
} - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True): - torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) - all_gather_results = [ - torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) - ] - dist.all_gather(all_gather_results, torch_norm_tensor) - assert len(set([x.item() for x in all_gather_results])) == 1 - return 1.0 - - @distributed_test(world_size=[2]) - def _test_fused_fp16_optimizer(args, hidden_dim): # initialize MoE model = SimpleMoEModel(hidden_dim, ep_size=2) # optimizer = torch.optim.AdamW(params=model.parameters()) optimizer = FusedAdam(params=model.parameters()) - engine, optimizer, _, _ = deepspeed.initialize(args=args, + engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer, dist_init_required=False) @@ -287,44 +262,38 @@ def _test_fused_fp16_optimizer(args, hidden_dim): engine.backward(loss) engine.step() - _test_fused_fp16_optimizer(args=args, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize("fused_lamb_legacy", [(False), (True)]) -def test_lamb_optimizer_gradnorm_for_moe(tmpdir, monkeypatch, fused_lamb_legacy: bool): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "fp16": { - "enabled": True - }, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 + @pytest.mark.parametrize("fused_lamb_legacy", [(False), (True)]) + def test_lamb_gradnorm(self, monkeypatch, fused_lamb_legacy: bool): + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "fp16": { + "enabled": True + }, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } } } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - def mock_unscale_and_clip_grads(total_norm, apply_scale=True): - torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) - all_gather_results = [ - torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) - ] - dist.all_gather(all_gather_results, torch_norm_tensor) - assert len(set([x.item() for x in all_gather_results])) == 1 - return 1.0 - - @distributed_test(world_size=[2]) - def _test_lamb_legacy_optimizer_step(args, hidden_dim, fused_lamb_legacy): + hidden_dim = 10 + + def mock_unscale_and_clip_grads(total_norm, apply_scale=True): + torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) + all_gather_results = [ + torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) + ] + dist.all_gather(all_gather_results, torch_norm_tensor) + assert len(set([x.item() for x in all_gather_results])) == 1 + return 1.0 + # initialize MoE model = SimpleMoEModel(hidden_dim, ep_size=2) - engine, optimizer, _, _ = deepspeed.initialize(args=args, + engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters(), dist_init_required=False) @@ -341,54 +310,23 @@ def _test_lamb_legacy_optimizer_step(args, hidden_dim, fused_lamb_legacy): engine.backward(loss) engine.step() - _test_lamb_legacy_optimizer_step(args=args, - hidden_dim=hidden_dim, - fused_lamb_legacy=fused_lamb_legacy) +class TestAdamwFP16EmptyGrad(DistributedTest): + world_size = 1 -def test_dict_config_adamw_fp16_basic(): - config = {"train_batch_size": 1, "steps_per_print": 1, 
"fp16": {"enabled": True}} - args = create_deepspeed_args() - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _test_adamw_fp16_basic(args, model, hidden_dim, config): - optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer, - config=config) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim, config=config) - - -def test_adamw_fp16_empty_grad(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True + def test(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "fp16": { + "enabled": True + } } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) + hidden_dim = 10 - @distributed_test(world_size=[1]) - def _test_adamw_fp16_empty_grad(args, model, hidden_dim): + model = SimpleModel(hidden_dim) optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) data_loader = random_dataloader(model=model, @@ -400,64 +338,51 @@ def _test_adamw_fp16_empty_grad(args, model, hidden_dim): model.backward(loss) model.step() - _test_adamw_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload', - [(1, - False), - (2, - False), - (2, - True), - (3, - False), - (3, - True)]) -def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "scheduler": { - "type": "OneCycle", - "params": { - "cycle_first_step_size": 16000, - "cycle_first_stair_count": 8000, - "decay_step_size": 16000, - "cycle_min_lr": 1e-06, - "cycle_max_lr": 3e-05, - "decay_lr_rate": 1e-07, - "cycle_min_mom": 0.85, - "cycle_max_mom": 0.99, - "decay_mom_rate": 0.0 +@pytest.mark.parametrize("zero_stage", [1, 2, 3]) +@pytest.mark.parametrize("use_cpu_offload", [True, False]) +class TestAdamFP16ZeroOneCycleCompatibility(DistributedTest): + world_size = 1 + + def test(self, zero_stage, use_cpu_offload): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "scheduler": { + "type": "OneCycle", + "params": { + "cycle_first_step_size": 16000, + "cycle_first_stair_count": 8000, + "decay_step_size": 16000, + "cycle_min_lr": 1e-06, + "cycle_max_lr": 3e-05, + "decay_lr_rate": 1e-07, + "cycle_min_mom": 0.85, + "cycle_max_mom": 0.99, + "decay_mom_rate": 0.0 + } + }, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage, + "cpu_offload": use_cpu_offload } - }, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload } - } - - args = args_from_dict(tmpdir, 
config_dict) - hidden_dim = 10 + hidden_dim = 10 - @distributed_test(world_size=[1]) - def _test_adam_fp16_zero_onecycle_compatibility(args, zero_stage, hidden_dim): model = SimpleModel(hidden_dim) - - model, _, _,_ = deepspeed.initialize(args=args, + model, _, _,_ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -469,53 +394,38 @@ def _test_adam_fp16_zero_onecycle_compatibility(args, zero_stage, hidden_dim): model.backward(loss) model.step() - _test_adam_fp16_zero_onecycle_compatibility(args=args, - zero_stage=zero_stage, - hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload', - [(1, - False), - (2, - False), - (2, - True), - (3, - False), - (3, - True)]) -def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - config_dict = { - "train_batch_size": 4, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 +@pytest.mark.parametrize("zero_stage", [1, 2, 3]) +@pytest.mark.parametrize("use_cpu_offload", [True, False]) +@pytest.mark.parametrize("hidden_dim", [9, 10]) +class TestZeroStaticScale(DistributedTest): + world_size = 1 + + def test(self, zero_stage, use_cpu_offload, hidden_dim): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "train_batch_size": 4, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 138. + }, + "zero_optimization": { + "stage": zero_stage, + "cpu_offload": use_cpu_offload } - }, - "fp16": { - "enabled": True, - "loss_scale": 138. - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload } - } - args = args_from_dict(tmpdir, config_dict) - @distributed_test(world_size=2) - def _test_zero_static_scale(args, zero_stage, hidden_dim): - #making hidden size not divisible by DP for covering this scenario - hidden_dim = hidden_dim model = SimpleModel(hidden_dim) - - model, optim, _, _ = deepspeed.initialize(args=args, + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -533,12 +443,9 @@ def _test_zero_static_scale(args, zero_stage, hidden_dim): model.backward(loss) model.step() - #test when hidden_dim is not aligned with world size - _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=9) - #test when hidden_dim is aligned with world size - _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=10) - +#TODO: WHAT DOES THIS TEST? 
+''' def test_zero_static_scale_deprecated_format(tmpdir): config_dict = { "train_batch_size": 4, @@ -582,99 +489,79 @@ def _test_zero_static_scale(args): model.step() _test_zero_static_scale(args) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload', - [(1, - False), - (2, - False), - (2, - True), - (3, - False), - (3, - True)]) -def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 4, - "steps_per_print": 1, - "fp16": { - "enabled": True, - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - }, - "zero_allow_untested_optimizer": False - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[1]) - def _test_zero_allow_untested_optimizer(args, zero_stage): +''' + + +@pytest.mark.parametrize("zero_stage", [1, 2, 3]) +@pytest.mark.parametrize("use_cpu_offload", [True, False]) +class TestZeroAllowUntestedOptimizer(DistributedTest): + world_size = 1 + + def test(self, zero_stage, use_cpu_offload): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + config_dict = { + "train_batch_size": 4, + "steps_per_print": 1, + "fp16": { + "enabled": True, + }, + "zero_optimization": { + "stage": zero_stage, + "cpu_offload": use_cpu_offload + }, + "zero_allow_untested_optimizer": False + } hidden_dim = 10 + model = SimpleModel(hidden_dim) optimizer = SimpleOptimizer(model.parameters()) with pytest.raises(AssertionError): - model, optim, _, _ = deepspeed.initialize(args=args, + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer, model_parameters=model.parameters()) - _test_zero_allow_untested_optimizer(args, zero_stage) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload', - [(1, - False), - (2, - False), - (2, - True), - (3, - False), - (3, - True)]) -def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - if zero_stage == 3: - pytest.skip("skip for now") - config_dict = { - "train_micro_batch_size_per_gpu": 1, - "gradient_accumulation_steps": 1, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 +@pytest.mark.parametrize("zero_stage", [1, 2, 3]) +@pytest.mark.parametrize("use_cpu_offload", [True, False]) +class TestZeroEmptyPartition(DistributedTest): + world_size = 3 + + def test(self, zero_stage, use_cpu_offload): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + + if zero_stage == 3: + pytest.skip("skip for now") + + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "zero_optimization": { + "stage": zero_stage, + "cpu_offload": use_cpu_offload, + "reduce_bucket_size": 100, + "allgather_bucket_size": 100 } - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload, - "reduce_bucket_size": 100, - "allgather_bucket_size": 100 } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[3]) - def 
_test_zero_empty_partition(args, zero_stage): hidden_dim = 1 model = SimpleModel(hidden_dim) # Ensure model has 2 parameters, to cause empty partition with DP=3 assert len(list(model.parameters())) == 2 - model, _, _, _ = deepspeed.initialize(args=args, + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -688,21 +575,24 @@ def _test_zero_empty_partition(args, zero_stage): model.backward(loss) model.step() - _test_zero_empty_partition(args=args, zero_stage=zero_stage) - @amp_available -def test_adam_amp_basic(tmpdir): - config_dict = {"train_batch_size": 1, "steps_per_print": 1, "amp": {"enabled": True}} - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) +class TestAmp(DistributedTest): + world_size = 2 + + def test_adam_basic(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "amp": { + "enabled": True + } + } + hidden_dim = 10 - @distributed_test(world_size=[1]) - def _test_adam_amp_basic(args, model, hidden_dim): + model = SimpleModel(hidden_dim) optimizer = torch.optim.Adam(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) data_loader = random_dataloader(model=model, @@ -714,33 +604,25 @@ def _test_adam_amp_basic(args, model, hidden_dim): model.backward(loss) model.step() - _test_adam_amp_basic(args=args, model=model, hidden_dim=hidden_dim) - - -@amp_available -def test_lamb_amp_basic(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 + def test_lamb_basic(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "amp": { + "enabled": True, } - }, - "gradient_clipping": 1.0, - "amp": { - "enabled": True, } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) + hidden_dim = 10 - @distributed_test(world_size=[1, 2]) - def _test_lamb_amp_basic(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -752,34 +634,26 @@ def _test_lamb_amp_basic(args, model, hidden_dim): model.backward(loss) model.step() - _test_lamb_amp_basic(args=args, model=model, hidden_dim=hidden_dim) - - -@amp_available -def test_adam_amp_o2(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 + def test_adam_O2(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "amp": { + "enabled": True, + "opt_level": "O2" } - }, - "gradient_clipping": 1.0, - "amp": { - "enabled": True, - "opt_level": "O2" } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) + hidden_dim = 10 - @distributed_test(world_size=[1, 2]) - def _test_adam_amp_o2(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) 
data_loader = random_dataloader(model=model, @@ -791,34 +665,26 @@ def _test_adam_amp_o2(args, model, hidden_dim): model.backward(loss) model.step() - _test_adam_amp_o2(args=args, model=model, hidden_dim=hidden_dim) - - -@amp_available -def test_adam_amp_o2_empty_grad(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 + def test_adam_O2_empty_grad(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "amp": { + "enabled": True, + "opt_level": "O2" } - }, - "gradient_clipping": 1.0, - "amp": { - "enabled": True, - "opt_level": "O2" } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) + hidden_dim = 10 - @distributed_test(world_size=[2]) - def _test_adam_amp_o2_empty_grad(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -830,79 +696,62 @@ def _test_adam_amp_o2_empty_grad(args, model, hidden_dim): model.backward(loss) model.step() - _test_adam_amp_o2_empty_grad(args=args, model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('zero_stage, optimizer_constructor', - [(1, - FusedAdam), - (2, - torch.optim.Adam), - (2, - FusedAdam), - (3, - torch.optim.Adam), - (3, - FusedAdam)]) -def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": zero_stage + +@pytest.mark.parametrize("zero_stage", [1, 2, 3]) +@pytest.mark.parametrize("optimizer_constructor", [FusedAdam, torch.optim.Adam]) +class TestZeroSupportedClientOptimizer(DistributedTest): + world_size = 1 + + def test(self, zero_stage, optimizer_constructor): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage + } } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 + hidden_dim = 10 - @distributed_test(world_size=[1]) - def _test_zero_supported_client_optimizer(args, zero_stage, optimizer_constructor): model = SimpleModel(hidden_dim) - client_optimizer = optimizer_constructor(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=client_optimizer) - _test_zero_supported_client_optimizer(args=args, - zero_stage=zero_stage, - optimizer_constructor=optimizer_constructor) - -def test_zero2_reduce_scatter_off(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 +class TestZero2ReduceScatterOff(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 2, + "contiguous_gradients": True, + "allgather_bucket_size": 2000000000, + "reduce_bucket_size": 200000000, + "overlap_comm": False, + "reduce_scatter": False + }, + "fp16": { + "enabled": True } - }, - "gradient_clipping": 1.0, - "zero_optimization": { 
- "stage": 2, - "contiguous_gradients": True, - "allgather_bucket_size": 2000000000, - "reduce_bucket_size": 200000000, - "overlap_comm": False, - "reduce_scatter": False - }, - "fp16": { - "enabled": True } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) + hidden_dim = 10 - @distributed_test(world_size=[2]) - def _helper(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, @@ -914,43 +763,32 @@ def _helper(args, model, hidden_dim): model.backward(loss) model.step() - _helper(args=args, model=model, hidden_dim=hidden_dim) - -@pytest.mark.parametrize('adam_type, torch_impl', - [('Adam', - True), - ('Adam', - False), - ('AdamW', - True), - ('AdamW', - False)]) -def test_fp16_adam_types(tmpdir, adam_type, torch_impl): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True, - "initial_scale_power": 10 - }, - "optimizer": { - "type": adam_type, - "torch_adam": torch_impl, - "params": { - "lr": 0.00015 +@pytest.mark.parametrize("adam_type", ["Adam", "AdamW"]) +@pytest.mark.parametrize("torch_impl", [True, False]) +class TestFP16AdamTypes(DistributedTest): + world_size = 1 + + def test(self, adam_type, torch_impl): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "fp16": { + "enabled": True, + "initial_scale_power": 10 + }, + "optimizer": { + "type": adam_type, + "torch_adam": torch_impl, + "params": { + "lr": 0.00015 + } } } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _test_fp16_adam_types(args, model, hidden_dim): + hidden_dim = 10 - model, _, _, _ = deepspeed.initialize(args=args, + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -964,35 +802,32 @@ def _test_fp16_adam_types(args, model, hidden_dim): model.backward(loss) model.step() - _test_fp16_adam_types(args=args, model=model, hidden_dim=hidden_dim) - -def test_zero3_lazyscatter(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True, - "initial_scale_power": 10 - }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": 0.00015 +class TestZero3LazyScatter(DistributedTest): + world_size = 1 + + def test(self): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "fp16": { + "enabled": True, + "initial_scale_power": 10 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 0.00015 + } + }, + "zero_optimization": { + "stage": 3 } - }, - "zero_optimization": { - "stage": 3 } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 + hidden_dim = 10 - @distributed_test(world_size=[1]) - def _go(args): model = SimpleModel(hidden_dim) - - model, _, _, _ = deepspeed.initialize(args=args, + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) @@ -1006,30 +841,27 @@ def _go(args): model.backward(loss) model.step() - _go(args=args) - @pytest.mark.parametrize('stage', [1, 2, 3]) -def test_zero_empty_grad(tmpdir, stage): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": stage +class 
TestZeroEmptyGrad(DistributedTest): + world_size = 2 + + def test(self, stage): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": stage + } } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) + hidden_dim = 10 - @distributed_test(world_size=[1]) - def _go(args, model, hidden_dim): + model = SimpleModel(hidden_dim) optimizer = torch.optim.Adam(model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) data_loader = random_dataloader(model=model, @@ -1040,5 +872,3 @@ def _go(args, model, hidden_dim): loss = model(batch[0], batch[1]) model.backward(loss) model.step() - - _go(args=args, model=model, hidden_dim=hidden_dim) From f9f5dc3f782f351b84b148dd46de188da1611691 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Mon, 15 Aug 2022 11:15:58 -0700 Subject: [PATCH 04/25] more refactors --- tests/unit/test_coalesced_collectives.py | 110 +++++----- tests/unit/test_configurable_parallel.py | 243 +++++++++++------------ 2 files changed, 170 insertions(+), 183 deletions(-) diff --git a/tests/unit/test_coalesced_collectives.py b/tests/unit/test_coalesced_collectives.py index 9597a1e8536a..3c504dd58ac3 100644 --- a/tests/unit/test_coalesced_collectives.py +++ b/tests/unit/test_coalesced_collectives.py @@ -4,57 +4,59 @@ import deepspeed.comm as dist from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced -from .common import distributed_test - - -@distributed_test(world_size=2) -def test_reduce_scatter_coalesced_single_input(): - input = torch.full((6, - ), - dist.get_rank(), - dtype=torch.half, - device=torch.cuda.current_device()) - - (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) - - assert output.shape == (3, ) - assert torch.allclose(output, torch.full_like(output, 0.5)) - - -@distributed_test(world_size=2) -def test_reduce_scatter_coalesced_two_inputs(): - tensor_kwargs = {"device": torch.cuda.current_device(), "dtype": torch.half} - inputs = [ - dist.get_rank() * torch.arange(0, - 6, - **tensor_kwargs), - dist.get_rank() * torch.arange(6, - 9, - **tensor_kwargs), - ] - - output1, output2 = reduce_scatter_coalesced(inputs, dist.get_world_group()) - - if dist.get_rank() == 0: - assert output1.shape == (3, ) - assert torch.allclose(output1, torch.arange(0, 3, **tensor_kwargs) / 2) - assert output2.shape == (2, ) - assert torch.allclose(output2, torch.arange(6, 8, **tensor_kwargs) / 2) - elif dist.get_rank() == 1: - assert output1.shape == (3, ) - assert torch.allclose(output1, torch.arange(3, 6, **tensor_kwargs) / 2) - assert output2.shape == (1, ) - assert torch.allclose(output2, torch.arange(8, 9, **tensor_kwargs) / 2) - - -@distributed_test(world_size=2) -def test_reduce_scatter_coalesced_tensor_smaller_than_world_sz(): - input = torch.zeros((1, ), dtype=torch.half, device=torch.cuda.current_device()) - - (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) - - if dist.get_rank() == 0: - assert output.shape == (1, ) - assert torch.allclose(output, torch.zeros_like(output)) - elif dist.get_rank() == 1: - assert output.shape == (0, ) +from tests.unit.common import DistributedTest + + +class TestReduceScatterCoalesced(DistributedTest): + world_size = 2 + + def test_single_input(self): + input = torch.full((6, + ), + dist.get_rank(), + dtype=torch.half, + 
device=torch.cuda.current_device()) + + (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) + + assert output.shape == (3, ) + assert torch.allclose(output, torch.full_like(output, 0.5)) + + def test_two_inputs(self): + tensor_kwargs = {"device": torch.cuda.current_device(), "dtype": torch.half} + inputs = [ + dist.get_rank() * torch.arange(0, + 6, + **tensor_kwargs), + dist.get_rank() * torch.arange(6, + 9, + **tensor_kwargs), + ] + + output1, output2 = reduce_scatter_coalesced(inputs, dist.get_world_group()) + + if dist.get_rank() == 0: + assert output1.shape == (3, ) + assert torch.allclose(output1, torch.arange(0, 3, **tensor_kwargs) / 2) + assert output2.shape == (2, ) + assert torch.allclose(output2, torch.arange(6, 8, **tensor_kwargs) / 2) + elif dist.get_rank() == 1: + assert output1.shape == (3, ) + assert torch.allclose(output1, torch.arange(3, 6, **tensor_kwargs) / 2) + assert output2.shape == (1, ) + assert torch.allclose(output2, torch.arange(8, 9, **tensor_kwargs) / 2) + + +class TestReduceScatterCoalescedTensorSmallerThanWorldSize(DistributedTest): + world_size = 2 + + def test(self): + input = torch.zeros((1, ), dtype=torch.half, device=torch.cuda.current_device()) + + (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) + + if dist.get_rank() == 0: + assert output.shape == (1, ) + assert torch.allclose(output, torch.zeros_like(output)) + elif dist.get_rank() == 1: + assert output.shape == (0, ) diff --git a/tests/unit/test_configurable_parallel.py b/tests/unit/test_configurable_parallel.py index c7ce77743312..fc6fb43c5cbe 100755 --- a/tests/unit/test_configurable_parallel.py +++ b/tests/unit/test_configurable_parallel.py @@ -5,7 +5,7 @@ import numpy as np import torch.multiprocessing as mp import deepspeed.comm as dist -from tests.unit.common import distributed_test +from tests.unit.common import distributed_test, DistributedTest from tests.unit.megatron_model import get_gpt2_model, get_megatron_version from tests.unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe from deepspeed.utils import RepeatingLoader @@ -17,6 +17,7 @@ reason='Megatron-LM package requires Pytorch version 1.5 or above') +@pytest.fixture(autouse=True) def reset_random(seed=1234): random.seed(seed) np.random.seed(seed) @@ -24,159 +25,142 @@ def reset_random(seed=1234): torch.cuda.manual_seed_all(seed) -class TestConfigurableMP: - def setup_method(self, method): - reset_random() +@pytest.mark.fixture() +def inputs(bs=1, seq_len=20): + input_ids = torch.randint(low=0, high=1000, size=(bs, seq_len)) + position_ids = torch.randint(low=0, high=2, size=(bs, seq_len)) + attention_mask = torch.randint(low=0, high=2, size=(bs, seq_len), dtype=torch.bool) + return [input_ids, position_ids, attention_mask] - def get_inputs(self, bs=1, seq_len=20): - input_ids = torch.randint(low=0, high=1000, size=(bs, seq_len)) - position_ids = torch.randint(low=0, high=2, size=(bs, seq_len)) - attention_mask = torch.randint(low=0, - high=2, - size=(bs, - seq_len), - dtype=torch.bool) - return [input_ids, position_ids, attention_mask] - def get_deepspeed_model(self, model, tmpdir): - ds_config_dict = { - "train_micro_batch_size_per_gpu": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - } +def get_deepspeed_model(model, tmpdir): + ds_config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + } - from megatron import mpu - model, _, _,_ = deepspeed.initialize(model=model, - mpu=mpu, - 
model_parameters=model.parameters(), - config=ds_config_dict) - return model + from megatron import mpu + model, _, _,_ = deepspeed.initialize(model=model, + mpu=mpu, + model_parameters=model.parameters(), + config=ds_config_dict) + return model - def test_gpt2_basic(self, tmpdir): + +class TestConfigurableMP(DistributedTest): + @pytest.mark.world_size(1) + def test_gpt2_basic(self, inputs, tmpdir): # basic test case, mp_size=1, verify ckpt saving/loading. + args_defaults = { + 'num_layers': 2, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } - @distributed_test(world_size=1) - def _run(): - inputs = self.get_inputs() - args_defaults = { - 'num_layers': 2, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } + model = get_gpt2_model(args_defaults) + model = self.get_deepspeed_model(model, tmpdir) - model = get_gpt2_model(args_defaults) - model = self.get_deepspeed_model(model, tmpdir) + model.eval() + baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - model.eval() - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + tag = 'mp_1' + state_dict = {} + state_dict['checkpoint_version'] = get_megatron_version() + model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) + dist.barrier() + model.load_checkpoint(tmpdir, + tag=tag, + load_optimizer_states=False, + load_lr_scheduler_states=False) - tag = 'mp_1' - state_dict = {} - state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) - dist.barrier() - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) + test = model(inputs[0], inputs[1], inputs[2]) + assert torch.allclose(baseline, test, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" - test = model(inputs[0], inputs[1], inputs[2]) - assert torch.allclose(baseline, test, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + @pytest.mark.world_size(2) + def test_gpt2_mp2_no_resize(self, inputs, tmpdir): + # test mp_size=2 case, verify ckpt saving/loading without resize. + args_defaults = { + 'num_layers': 2, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } - _run() + model = get_gpt2_model(args_defaults, mp_size=2) + model = self.get_deepspeed_model(model, tmpdir) - def test_gpt2_mp2_no_resize(self, tmpdir): - # test mp_size=2 case, verify ckpt saving/loading without resize. 
+ model.eval() - @distributed_test(world_size=2) - def _run(inputs): - args_defaults = { - 'num_layers': 2, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } + baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - model = get_gpt2_model(args_defaults, mp_size=2) - model = self.get_deepspeed_model(model, tmpdir) + tag = 'mp_2' + state_dict = {} + state_dict['checkpoint_version'] = get_megatron_version() + model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) + dist.barrier() + model.load_checkpoint(tmpdir, + tag=tag, + load_optimizer_states=False, + load_lr_scheduler_states=False) + + test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + + +#TODO: Figure out how to run fixtures as distributed processes +''' +class TestResizeMP(DistributedTest): + @pytest.mark.world_size(2) + def test_gpt2_mp_save_ckpt(self, class_tmpdir, inputs, mp_size=2): + args_defaults = { + 'num_layers': 2, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + model = get_gpt2_model(args_defaults, mp_size=mp_size) + model = self.get_deepspeed_model(model, class_tmpdir) - model.eval() + model.eval() + with torch.no_grad(): baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + if dist.get_rank() == 0: + output.put(baseline.cpu()) - tag = 'mp_2' state_dict = {} state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) - dist.barrier() + model.save_checkpoint(class_tmpdir, client_state=state_dict) + + @pytest.mark.world_size(1) + def test_gpt2_mp_2to1(self, class_tmpdir, inputs, resize=1): + args_defaults = { + 'num_layers': 2, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + model = get_gpt2_model(args_defaults, mp_size=resize) + model = self.get_deepspeed_model(model, class_tmpdir) + + model.eval() + + with torch.no_grad(): model.load_checkpoint(tmpdir, tag=tag, load_optimizer_states=False, load_lr_scheduler_states=False) - test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" - - inputs = self.get_inputs() - _run(inputs) - - def _test_gpt2_config_mp(self, tmpdir, mp_size, resize): - # test mp_size=2 case, verify resize=1 case for ckpt merging. 
- - @distributed_test(world_size=mp_size) - def _run_baseline(inputs, tag, output, quit_event): - reset_random() - args_defaults = { - 'num_layers': 2, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - model = get_gpt2_model(args_defaults, mp_size=mp_size) - model = self.get_deepspeed_model(model, tmpdir) - - model.eval() - - with torch.no_grad(): - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - if dist.get_rank() == 0: - output.put(baseline.cpu()) - - state_dict = {} - state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) - quit_event.wait() - - @distributed_test(world_size=resize) - def _run_resize(inputs, tag, output, quit_event): - reset_random() - args_defaults = { - 'num_layers': 2, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - model = get_gpt2_model(args_defaults, mp_size=resize) - model = self.get_deepspeed_model(model, tmpdir) - - model.eval() - - with torch.no_grad(): - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) - test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - if dist.get_rank() == 0: - output.put(test.cpu()) - quit_event.wait() + if dist.get_rank() == 0: + output.put(test.cpu()) def _verify(b_queue, t_queue, baseline_event, test_event): baseline = b_queue.get() @@ -214,6 +198,7 @@ def test_gpt2_mp_2to1(self, tmpdir): def test_gpt2_mp_2to4(self, tmpdir): # test mp_size=2 case, verify resize=4 case for ckpt splitting. self._test_gpt2_config_mp(tmpdir, mp_size=2, resize=4) +''' class TestConfigurablePP: From fb077b6d0e7f602e2b078f6c4d02016523c578b4 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 18 Aug 2022 17:30:50 -0700 Subject: [PATCH 05/25] added DistributedFixture class --- tests/conftest.py | 7 +++ tests/unit/common.py | 105 +++++++++++++++++++++++++++---------------- 2 files changed, 74 insertions(+), 38 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 6f1831c53933..12ac58f6bdf2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -54,3 +54,10 @@ def pytest_runtest_call(item): dist_test_class = item.cls() dist_test_class._run_test(item._request) item.runtest = lambda: True # Dummy function so test is not run twice + + +@pytest.hookimpl(tryfirst=True) +def pytest_fixture_setup(fixturedef, request): + if getattr(fixturedef.func, "is_dist_fixture", False): + dist_fixture_class = fixturedef.func() + dist_fixture_class(request) diff --git a/tests/unit/common.py b/tests/unit/common.py index 36a1a7743156..f7e3c63da32e 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -1,7 +1,7 @@ import os import time import inspect -from abc import ABC +from abc import ABC, abstractmethod from pathlib import Path import torch @@ -12,7 +12,7 @@ import pytest from _pytest.outcomes import Skipped -from _pytest.fixtures import FixtureLookupError +from _pytest.fixtures import FixtureLookupError, FixtureFunctionMarker # Worker timeout *after* the first worker has completed. 
DEEPSPEED_UNIT_WORKER_TIMEOUT = 120 @@ -63,51 +63,25 @@ def set_cuda_visibile(): os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(dev_id_list) -class DistributedTest(ABC): - is_dist_test = True +class DistributedExec(ABC): world_size = 2 backend = "nccl" - # Temporary directory that is shared among test methods in a class - @pytest.fixture(autouse=True, scope="class") - def class_tmpdir(self, tmpdir_factory): - fn = tmpdir_factory.mktemp(self.__class__.__name__) - return fn - - def _run_test(self, request): - self.current_test = self._get_current_test_func(request) - self.test_kwargs = self._get_test_kwargs(request) - - # Catch world_size override pytest mark - for mark in getattr(request.function, "pytestmark", []): - if mark.name == "world_size": - world_size = mark.args[0] - break - else: - world_size = self.world_size - - if isinstance(world_size, int): - world_size = [world_size] - for procs in world_size: - self._launch_procs(procs) - time.sleep(0.5) - - def _get_current_test_func(self, request): - # DistributedTest subclasses may have multiple test methods - func_name = request.function.__name__ - return getattr(self, func_name) + @abstractmethod + def run(self): + NotImplementedError("Inheriting classes must define this method") - def _get_test_kwargs(self, request): + def _get_fixture_kwargs(self, request, func): # Grab fixture / parametrize kwargs from pytest request object - test_kwargs = {} - params = inspect.getfullargspec(self.current_test).args + fixture_kwargs = {} + params = inspect.getfullargspec(func).args params.remove("self") for p in params: try: - test_kwargs[p] = request.getfixturevalue(p) + fixture_kwargs[p] = request.getfixturevalue(p) except FixtureLookupError: pass # test methods can have kwargs that are not fixtures - return test_kwargs + return fixture_kwargs def _launch_procs(self, num_procs): mp.set_start_method('forkserver', force=True) @@ -170,7 +144,7 @@ def _dist_init(self, local_rank, num_procs, skip_msg): torch.cuda.set_device(local_rank) try: - self.current_test(**self.test_kwargs) + self.run(**self.fixture_kwargs) except BaseException as e: if isinstance(e, Skipped): skip_msg.put(e.msg) @@ -183,6 +157,61 @@ def _dist_init(self, local_rank, num_procs, skip_msg): dist.destroy_process_group() +class DistributedFixture(DistributedExec): + is_dist_fixture = True + + # These values are just placeholders so that pytest recognizes this as a fixture + _pytestfixturefunction = FixtureFunctionMarker(scope="function", params=None) + __name__ = "" + + def __init__(self): + assert isinstance(self.world_size, int), "Only one world size is allowed for distributed fixtures" + self.__name__ = type(self).__name__ + _pytestfixturefunction = FixtureFunctionMarker(scope="function", + params=None, + name=self.__name__) + + def __call__(self, request): + self.fixture_kwargs = self._get_fixture_kwargs(self.run) + self._launch_procs(self.world_size) + + +class DistributedTest(DistributedExec): + is_dist_test = True + + # Temporary directory that is shared among test methods in a class + @pytest.fixture(autouse=True, scope="class") + def class_tmpdir(self, tmpdir_factory): + fn = tmpdir_factory.mktemp(self.__class__.__name__) + return fn + + def run(self, **fixture_kwargs): + self.current_test(**fixture_kwargs) + + def __call__(self, request): + self.current_test = self._get_current_test_func(request) + self.fixture_kwargs = self._get_fixture_kwargs(request, self.current_test) + + # Catch world_size override pytest mark + for mark in getattr(request.function, "pytestmark", []): + 
if mark.name == "world_size": + world_size = mark.args[0] + break + else: + world_size = self.world_size + + if isinstance(world_size, int): + world_size = [world_size] + for procs in world_size: + self._launch_procs(procs) + time.sleep(0.5) + + def _get_current_test_func(self, request): + # DistributedTest subclasses may have multiple test methods + func_name = request.function.__name__ + return getattr(self, func_name) + + def distributed_test(world_size=2, backend='nccl'): """A decorator for executing a function (e.g., a unit test) in a distributed manner. This decorator manages the spawning and joining of processes, initialization of From f97dd208ca2be569e19f7e5a19d969481d65da62 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 19 Aug 2022 15:39:10 -0700 Subject: [PATCH 06/25] applied DistributedFixture to first batch of tests as a trial --- tests/conftest.py | 4 +- tests/unit/common.py | 23 +- tests/unit/test_configurable_parallel.py | 435 -------------------- tests/unit/test_configurable_parallel_mp.py | 170 ++++++++ tests/unit/test_configurable_parallel_pp.py | 298 ++++++++++++++ 5 files changed, 486 insertions(+), 444 deletions(-) delete mode 100755 tests/unit/test_configurable_parallel.py create mode 100755 tests/unit/test_configurable_parallel_mp.py create mode 100755 tests/unit/test_configurable_parallel_pp.py diff --git a/tests/conftest.py b/tests/conftest.py index 12ac58f6bdf2..cd34c9e1bf05 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -52,12 +52,14 @@ def pytest_runtest_call(item): # We want to use our own launching function for distributed tests if getattr(item.cls, "is_dist_test", False): dist_test_class = item.cls() - dist_test_class._run_test(item._request) + dist_test_class(item._request) item.runtest = lambda: True # Dummy function so test is not run twice @pytest.hookimpl(tryfirst=True) def pytest_fixture_setup(fixturedef, request): if getattr(fixturedef.func, "is_dist_fixture", False): + #for val in dir(request): + # print(val.upper(), getattr(request, val), "\n") dist_fixture_class = fixturedef.func() dist_fixture_class(request) diff --git a/tests/unit/common.py b/tests/unit/common.py index f7e3c63da32e..63993ac7bd39 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -71,7 +71,18 @@ class DistributedExec(ABC): def run(self): NotImplementedError("Inheriting classes must define this method") + def __call__(self, request=None): + self._fixture_kwargs = self._get_fixture_kwargs(request, self.run) + world_size = self.world_size + if isinstance(world_size, int): + world_size = [world_size] + for procs in world_size: + self._launch_procs(procs) + time.sleep(0.5) + def _get_fixture_kwargs(self, request, func): + if not request: + return {} # Grab fixture / parametrize kwargs from pytest request object fixture_kwargs = {} params = inspect.getfullargspec(func).args @@ -144,7 +155,7 @@ def _dist_init(self, local_rank, num_procs, skip_msg): torch.cuda.set_device(local_rank) try: - self.run(**self.fixture_kwargs) + self.run(**self._fixture_kwargs) except BaseException as e: if isinstance(e, Skipped): skip_msg.put(e.msg) @@ -171,10 +182,6 @@ def __init__(self): params=None, name=self.__name__) - def __call__(self, request): - self.fixture_kwargs = self._get_fixture_kwargs(self.run) - self._launch_procs(self.world_size) - class DistributedTest(DistributedExec): is_dist_test = True @@ -186,11 +193,11 @@ def class_tmpdir(self, tmpdir_factory): return fn def run(self, **fixture_kwargs): - self.current_test(**fixture_kwargs) + 
self._current_test(**fixture_kwargs) def __call__(self, request): - self.current_test = self._get_current_test_func(request) - self.fixture_kwargs = self._get_fixture_kwargs(request, self.current_test) + self._current_test = self._get_current_test_func(request) + self._fixture_kwargs = self._get_fixture_kwargs(request, self._current_test) # Catch world_size override pytest mark for mark in getattr(request.function, "pytestmark", []): diff --git a/tests/unit/test_configurable_parallel.py b/tests/unit/test_configurable_parallel.py deleted file mode 100755 index fc6fb43c5cbe..000000000000 --- a/tests/unit/test_configurable_parallel.py +++ /dev/null @@ -1,435 +0,0 @@ -import torch -import deepspeed -import pytest -import random -import numpy as np -import torch.multiprocessing as mp -import deepspeed.comm as dist -from tests.unit.common import distributed_test, DistributedTest -from tests.unit.megatron_model import get_gpt2_model, get_megatron_version -from tests.unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe -from deepspeed.utils import RepeatingLoader - -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) -pytestmark = pytest.mark.skipif( - TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), - reason='Megatron-LM package requires Pytorch version 1.5 or above') - - -@pytest.fixture(autouse=True) -def reset_random(seed=1234): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - - -@pytest.mark.fixture() -def inputs(bs=1, seq_len=20): - input_ids = torch.randint(low=0, high=1000, size=(bs, seq_len)) - position_ids = torch.randint(low=0, high=2, size=(bs, seq_len)) - attention_mask = torch.randint(low=0, high=2, size=(bs, seq_len), dtype=torch.bool) - return [input_ids, position_ids, attention_mask] - - -def get_deepspeed_model(model, tmpdir): - ds_config_dict = { - "train_micro_batch_size_per_gpu": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - } - - from megatron import mpu - model, _, _,_ = deepspeed.initialize(model=model, - mpu=mpu, - model_parameters=model.parameters(), - config=ds_config_dict) - return model - - -class TestConfigurableMP(DistributedTest): - @pytest.mark.world_size(1) - def test_gpt2_basic(self, inputs, tmpdir): - # basic test case, mp_size=1, verify ckpt saving/loading. - args_defaults = { - 'num_layers': 2, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - model = get_gpt2_model(args_defaults) - model = self.get_deepspeed_model(model, tmpdir) - - model.eval() - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - - tag = 'mp_1' - state_dict = {} - state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) - dist.barrier() - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) - - test = model(inputs[0], inputs[1], inputs[2]) - assert torch.allclose(baseline, test, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" - - @pytest.mark.world_size(2) - def test_gpt2_mp2_no_resize(self, inputs, tmpdir): - # test mp_size=2 case, verify ckpt saving/loading without resize. 
- args_defaults = { - 'num_layers': 2, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - model = get_gpt2_model(args_defaults, mp_size=2) - model = self.get_deepspeed_model(model, tmpdir) - - model.eval() - - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - - tag = 'mp_2' - state_dict = {} - state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) - dist.barrier() - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) - - test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" - - -#TODO: Figure out how to run fixtures as distributed processes -''' -class TestResizeMP(DistributedTest): - @pytest.mark.world_size(2) - def test_gpt2_mp_save_ckpt(self, class_tmpdir, inputs, mp_size=2): - args_defaults = { - 'num_layers': 2, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - model = get_gpt2_model(args_defaults, mp_size=mp_size) - model = self.get_deepspeed_model(model, class_tmpdir) - - model.eval() - - with torch.no_grad(): - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - if dist.get_rank() == 0: - output.put(baseline.cpu()) - - state_dict = {} - state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(class_tmpdir, client_state=state_dict) - - @pytest.mark.world_size(1) - def test_gpt2_mp_2to1(self, class_tmpdir, inputs, resize=1): - args_defaults = { - 'num_layers': 2, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - model = get_gpt2_model(args_defaults, mp_size=resize) - model = self.get_deepspeed_model(model, class_tmpdir) - - model.eval() - - with torch.no_grad(): - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) - test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - if dist.get_rank() == 0: - output.put(test.cpu()) - - def _verify(b_queue, t_queue, baseline_event, test_event): - baseline = b_queue.get() - baseline_event.set() - - test = t_queue.get() - test_event.set() - - assert torch.allclose(baseline, test, atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" - - tag = f'mp_{mp_size}_resize_{resize}' - inputs = self.get_inputs() - - baseline = mp.Queue() - test = mp.Queue() - baseline_event = mp.Event() - test_event = mp.Event() - - verify_process = mp.Process(target=_verify, - args=(baseline, - test, - baseline_event, - test_event)) - verify_process.start() - - _run_baseline(inputs, tag, baseline, baseline_event) - _run_resize(inputs, tag, test, test_event) - - verify_process.join() - - def test_gpt2_mp_2to1(self, tmpdir): - # test mp_size=2 case, verify resize=1 case for ckpt merging. - self._test_gpt2_config_mp(tmpdir, mp_size=2, resize=1) - - def test_gpt2_mp_2to4(self, tmpdir): - # test mp_size=2 case, verify resize=4 case for ckpt splitting. 
- self._test_gpt2_config_mp(tmpdir, mp_size=2, resize=4) -''' - - -class TestConfigurablePP: - def setup_method(self, method): - reset_random() - - def get_inputs(self, bs=1, seq_len=1, hidden_size=128): - hidden_states = torch.randn(bs, seq_len, hidden_size) - attention_mask = torch.randint(low=0, - high=2, - size=(bs, - seq_len), - dtype=torch.bool) - return (hidden_states, attention_mask) - - def get_deepspeed_model(self, model, tmpdir): - ds_config_dict = { - "train_micro_batch_size_per_gpu": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - } - dist.barrier() - - model, _, _,_ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - config=ds_config_dict) - return model.cuda() - - def get_topology(self, mp, pp, world_size): - assert world_size % (pp * mp) == 0 - dp = world_size // (pp * mp) - - from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology - topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp) - - return topo - - def test_pp_basic(self, tmpdir): - # basic test case, mp_size=2, pp_size=2, verify ckpt saving/loading. - - mp_size = 2 - pp_size = 2 - world_size = mp_size * pp_size - - @distributed_test(world_size=world_size) - def _run(): - args_defaults = { - 'num_layers': 8, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - topo = self.get_topology(mp_size, pp_size, world_size) - gpt2_pipe_model = GPT2ModelPipe(num_layers=8, - num_stages=pp_size, - mp_size=mp_size, - args_others=args_defaults, - topo=topo) - model = self.get_deepspeed_model(gpt2_pipe_model, tmpdir) - - tag = 'pp_basic' - state_dict = {} - state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) - - if model.is_first_stage() or model.is_last_stage(): - inputs = self.get_inputs() - loader = RepeatingLoader([(inputs[0], 0)]) - data_iter = iter(loader) - else: - data_iter = None - - baseline = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) - - dist.barrier() - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) - dist.barrier() - - test = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) - - if test is not None: - assert len(baseline) == len(test) - # Compare outputs of each microbatch - for mb in range(len(baseline)): - for b, t in zip(baseline[mb], test[mb]): - if b.is_floating_point(): # don't compare masks - assert torch.allclose(b, t, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" - - _run() - - def _test_gpt2_config_pp(self, tmpdir, mp_size, pp_size, mp_resize, pp_resize): - @distributed_test(world_size=pp_size * mp_size) - def _run_baseline(inputs, tag, output, quit_event): - reset_random() - args_defaults = { - 'num_layers': 8, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - topo = self.get_topology(mp_size, pp_size, mp_size * pp_size) - gpt2_pipe_model = GPT2ModelPipe(num_layers=8, - num_stages=pp_size, - mp_size=mp_size, - args_others=args_defaults, - topo=topo) - model = self.get_deepspeed_model(gpt2_pipe_model, tmpdir) - - with torch.no_grad(): - inputs = [x.cuda() for x in inputs] - if model.is_first_stage() or model.is_last_stage(): - loader = RepeatingLoader([(inputs[0], 0)]) - data_iter = iter(loader) - else: - data_iter = None - - baseline = model.eval_batch(data_iter=data_iter, - compute_loss=False, - 
reduce_output=None) - - if baseline is not None: - # baseline should be [[hidden, True]]] - assert len(baseline) == 1 - assert len(baseline[0]) == 1 - assert torch.is_tensor(baseline[0][0]) - output.put(baseline[0][0].cpu()) - - state_dict = {} - state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) - quit_event.wait() - - @distributed_test(world_size=mp_resize * pp_resize) - def _run_resize(inputs, tag, output, quit_event): - reset_random() - args_defaults = { - 'num_layers': 8, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - topo = self.get_topology(mp_resize, pp_resize, mp_resize * pp_resize) - gpt2_pipe_model = GPT2ModelPipe(num_layers=8, - num_stages=pp_resize, - mp_size=mp_resize, - args_others=args_defaults, - topo=topo) - model = self.get_deepspeed_model(gpt2_pipe_model, tmpdir) - - with torch.no_grad(): - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) - inputs = [x.cuda() for x in inputs] - if model.is_first_stage() or model.is_last_stage(): - loader = RepeatingLoader([(inputs[0], 0)]) - data_iter = iter(loader) - else: - data_iter = None - - test = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) - - if test is not None: - # test should be [[hidden, True]]] - assert len(test) == 1 - assert len(test[0]) == 1 - assert torch.is_tensor(test[0][0]) - output.put(test[0][0].cpu()) - - quit_event.wait() - - def _verify(b_queue, t_queue, baseline_event, test_event): - baseline = b_queue.get() - baseline_event.set() - - test = t_queue.get() - test_event.set() - - assert torch.allclose(baseline, test, atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" - - tag = f'mp_{mp_size}to{mp_resize}_pp_{pp_size}to{pp_resize}' - - baseline = mp.Queue() - test = mp.Queue() - baseline_event = mp.Event() - test_event = mp.Event() - - verify_process = mp.Process(target=_verify, - args=(baseline, - test, - baseline_event, - test_event)) - verify_process.start() - - inputs = self.get_inputs() - _run_baseline(inputs, tag, baseline, baseline_event) - _run_resize(inputs, tag, test, test_event) - - verify_process.join() - - def test_gpt2_mp1_pp_2to1(self, tmpdir): - self._test_gpt2_config_pp(tmpdir, mp_size=1, pp_size=2, mp_resize=1, pp_resize=1) - - def test_gpt2_mp1_pp_2to4(self, tmpdir): - self._test_gpt2_config_pp(tmpdir, mp_size=1, pp_size=2, mp_resize=1, pp_resize=4) - - def test_gpt2_mp2_pp_2to1(self, tmpdir): - self._test_gpt2_config_pp(tmpdir, mp_size=2, pp_size=2, mp_resize=2, pp_resize=1) - - def test_gpt2_mp2_pp_1to2(self, tmpdir): - self._test_gpt2_config_pp(tmpdir, mp_size=2, pp_size=1, mp_resize=2, pp_resize=2) - - def test_gpt2_pp_2to1_mp_2to1(self, tmpdir): - self._test_gpt2_config_pp(tmpdir, mp_size=2, pp_size=2, mp_resize=1, pp_resize=1) - - def test_gpt2_pp_1to2_mp_1to2(self, tmpdir): - self._test_gpt2_config_pp(tmpdir, mp_size=1, pp_size=1, mp_resize=2, pp_resize=2) diff --git a/tests/unit/test_configurable_parallel_mp.py b/tests/unit/test_configurable_parallel_mp.py new file mode 100755 index 000000000000..b0a43233969b --- /dev/null +++ b/tests/unit/test_configurable_parallel_mp.py @@ -0,0 +1,170 @@ +import os +import torch +import deepspeed +import pytest +import random +import numpy as np +import deepspeed.comm as dist +from tests.unit.common import DistributedTest, DistributedFixture +from tests.unit.megatron_model import get_gpt2_model, 
get_megatron_version + +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) +pytestmark = pytest.mark.skipif( + TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), + reason='Megatron-LM package requires Pytorch version 1.5 or above') + + +def get_deepspeed_model(model): + ds_config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + } + + from megatron import mpu + model, _, _,_ = deepspeed.initialize(model=model, + mpu=mpu, + model_parameters=model.parameters(), + config=ds_config_dict) + return model + + +class ConfigurableMP(DistributedTest): + @pytest.fixture(autouse=True) + def reset_random(self, seed=1234): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + @pytest.fixture + def inputs(self, bs=1, seq_len=20): + input_ids = torch.randint(low=0, high=1000, size=(bs, seq_len)) + position_ids = torch.randint(low=0, high=2, size=(bs, seq_len)) + attention_mask = torch.randint(low=0, + high=2, + size=(bs, + seq_len), + dtype=torch.bool) + return [input_ids, position_ids, attention_mask] + + +class TestConfigurableMP(ConfigurableMP): + @pytest.mark.world_size(1) + def test_gpt2_basic(self, tmpdir, inputs): + args_defaults = { + 'num_layers': 2, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + model = get_gpt2_model(args_defaults) + model = get_deepspeed_model(model) + + model.eval() + baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + + tag = 'mp_1' + state_dict = {} + state_dict['checkpoint_version'] = get_megatron_version() + model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) + dist.barrier() + model.load_checkpoint(tmpdir, + tag=tag, + load_optimizer_states=False, + load_lr_scheduler_states=False) + + test = model(inputs[0], inputs[1], inputs[2]) + assert torch.allclose(baseline, test, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + + @pytest.mark.world_size(2) + def test_gpt2_mp2_no_resize(self, tmpdir, inputs): + args_defaults = { + 'num_layers': 2, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + model = get_gpt2_model(args_defaults, mp_size=2) + model = get_deepspeed_model(model) + + model.eval() + + baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + + tag = 'mp_2' + state_dict = {} + state_dict['checkpoint_version'] = get_megatron_version() + model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) + dist.barrier() + model.load_checkpoint(tmpdir, + tag=tag, + load_optimizer_states=False, + load_lr_scheduler_states=False) + + test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + + +# This fixture provides the baseline model with mp=2 to TestConfigurableMPResize +class baseline_mp2(DistributedFixture): + world_size = 2 + + def run(self, inputs, class_tmpdir): + args_defaults = { + 'num_layers': 2, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + model = get_gpt2_model(args_defaults, mp_size=self.world_size) + model = get_deepspeed_model(model) + + model.eval() + + with torch.no_grad(): + baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + if dist.get_rank() == 0: + save_path = 
os.path.join(class_tmpdir, "output.pt") + torch.save(baseline.cpu(), save_path) + + state_dict = {} + state_dict['checkpoint_version'] = get_megatron_version() + model.save_checkpoint(class_tmpdir, client_state=state_dict) + + +class TestConfigurableResizeMP(ConfigurableMP): + world_size = [1, 4] + + def test(self, baseline_mp2, inputs, class_tmpdir): + args_defaults = { + 'num_layers': 2, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + world_size = os.environ["WORLD_SIZE"] + model = get_gpt2_model(args_defaults, mp_size=world_size) + model = get_deepspeed_model(model) + + model.eval() + + with torch.no_grad(): + model.load_checkpoint(class_tmpdir, + load_optimizer_states=False, + load_lr_scheduler_states=False) + test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + if dist.get_rank() == 0: + load_path = os.path.join(class_tmpdir, "output.pt") + baseline = torch.load(load_path) + test = test.cpu() + assert torch.allclose(baseline, test, atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" diff --git a/tests/unit/test_configurable_parallel_pp.py b/tests/unit/test_configurable_parallel_pp.py new file mode 100755 index 000000000000..8f116424fd99 --- /dev/null +++ b/tests/unit/test_configurable_parallel_pp.py @@ -0,0 +1,298 @@ +import os +import torch +import deepspeed +import pytest +import random +import numpy as np +import deepspeed.comm as dist +from tests.unit.common import DistributedTest, DistributedFixture +from tests.unit.megatron_model import get_megatron_version +from tests.unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe +from deepspeed.utils import RepeatingLoader + +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) +pytestmark = pytest.mark.skipif( + TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), + reason='Megatron-LM package requires Pytorch version 1.5 or above') + + +def get_deepspeed_model(model): + ds_config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + } + + model, _, _,_ = deepspeed.initialize(model=model, + model_parameters=model.parameters(), + config=ds_config_dict) + return model.cuda() + + +def get_topology(mp, pp, world_size): + assert world_size % (pp * mp) == 0 + dp = world_size // (pp * mp) + + from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology + topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp) + + return topo + + +class ConfigurablePP(DistributedTest): + @pytest.fixture(autouse=True) + def reset_random(self, seed=1234): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + @pytest.fixture + def inputs(self, bs=1, seq_len=1, hidden_size=128): + hidden_states = torch.randn(bs, seq_len, hidden_size) + attention_mask = torch.randint(low=0, + high=2, + size=(bs, + seq_len), + dtype=torch.bool) + return (hidden_states, attention_mask) + + +class TestConfigurablePP(ConfigurablePP): + mp_size = 2 + pp_size = 2 + world_size = 4 # mp_size * pp_size + + def test_pp_basic(self, tmpdir): + # basic test case, mp_size=2, pp_size=2, verify ckpt saving/loading. 
+ args_defaults = { + 'num_layers': 8, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + mp_size = self.mp_size + pp_size = self.pp_size + world_size = self.world_size + + topo = get_topology(mp_size, pp_size, world_size) + gpt2_pipe_model = GPT2ModelPipe(num_layers=8, + num_stages=pp_size, + mp_size=mp_size, + args_others=args_defaults, + topo=topo) + model = get_deepspeed_model(gpt2_pipe_model, tmpdir) + + tag = 'pp_basic' + state_dict = {} + state_dict['checkpoint_version'] = get_megatron_version() + model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) + + if model.is_first_stage() or model.is_last_stage(): + inputs = self.get_inputs() + loader = RepeatingLoader([(inputs[0], 0)]) + data_iter = iter(loader) + else: + data_iter = None + + baseline = model.eval_batch(data_iter=data_iter, + compute_loss=False, + reduce_output=None) + + dist.barrier() + model.load_checkpoint(tmpdir, + tag=tag, + load_optimizer_states=False, + load_lr_scheduler_states=False) + dist.barrier() + + test = model.eval_batch(data_iter=data_iter, + compute_loss=False, + reduce_output=None) + + if test is not None: + assert len(baseline) == len(test) + # Compare outputs of each microbatch + for mb in range(len(baseline)): + for b, t in zip(baseline[mb], test[mb]): + if b.is_floating_point(): # don't compare masks + assert torch.allclose(b, t, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + + +# Base class for creating / saving model output for baseline models. This is +# not meant to be used directly as a fixture to any classes +class _baseline(DistributedFixture): + world_size = None + + def run(self, inputs, class_tmpdir, mp_size, pp_size): + assert int(os.environ["WORLD_SIZE"]) == (pp_size * mp_size), "world size does not match provided pp_size and mp_size" + args_defaults = { + 'num_layers': 8, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + topo = get_topology(mp_size, pp_size, mp_size * pp_size) + gpt2_pipe_model = GPT2ModelPipe(num_layers=8, + num_stages=pp_size, + mp_size=mp_size, + args_others=args_defaults, + topo=topo) + model = get_deepspeed_model(gpt2_pipe_model) + + with torch.no_grad(): + inputs = [x.cuda() for x in inputs] + if model.is_first_stage() or model.is_last_stage(): + loader = RepeatingLoader([(inputs[0], 0)]) + data_iter = iter(loader) + else: + data_iter = None + + baseline = model.eval_batch(data_iter=data_iter, + compute_loss=False, + reduce_output=None) + + if baseline is not None: + # baseline should be [[hidden, True]]] + assert len(baseline) == 1 + assert len(baseline[0]) == 1 + assert torch.is_tensor(baseline[0][0]) + save_path = os.path.join(class_tmpdir, "output.pt") + torch.save(baseline[0][0].cpu(), save_path) + + state_dict = {} + state_dict['checkpoint_version'] = get_megatron_version() + model.save_checkpoint(class_tmpdir, client_state=state_dict) + + +# This may look odd, but there is a limitation with DistributedFixture that +# doesn't allow us to reuse a fixture with different worldsizes. 
This could be +# implemented in conftest.py::pytest_fixture_setup and common.py::DistributedFixture +class baseline_ws1(_baseline): + world_size = 1 + + +class baseline_ws2(_baseline): + world_size = 2 + + +class baseline_ws4(_baseline): + world_size = 4 + + +class TestConfigurableResizePP(ConfigurablePP): + def _test(self, inputs, class_tmpdir, mp_size, pp_size, mp_resize, pp_resize): + args_defaults = { + 'num_layers': 8, + 'hidden_size': 128, + 'num_attention_heads': 8, + 'max_position_embeddings': 128, + } + + topo = get_topology(mp_resize, pp_resize, mp_resize * pp_resize) + gpt2_pipe_model = GPT2ModelPipe(num_layers=8, + num_stages=pp_resize, + mp_size=mp_resize, + args_others=args_defaults, + topo=topo) + model = get_deepspeed_model(gpt2_pipe_model) + + with torch.no_grad(): + model.load_checkpoint(class_tmpdir, + load_optimizer_states=False, + load_lr_scheduler_states=False) + inputs = [x.cuda() for x in inputs] + if model.is_first_stage() or model.is_last_stage(): + loader = RepeatingLoader([(inputs[0], 0)]) + data_iter = iter(loader) + else: + data_iter = None + + test = model.eval_batch(data_iter=data_iter, + compute_loss=False, + reduce_output=None) + + if test is not None: + # test should be [[hidden, True]]] + assert len(test) == 1 + assert len(test[0]) == 1 + assert torch.is_tensor(test[0][0]) + test = test[0][0].cpu() + load_path = os.path.join(class_tmpdir, "output.pt") + baseline = torch.load(load_path) + assert torch.allclose(baseline, test, atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" + + # These tests are divided by baseline model worldsize and test model worldsize + @pytest.mark.world_size(1) + @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(1, 2, 1, 1)]) + def test_world_size_2to1(self, + inputs, + class_tmpdir, + baseline_ws2, + mp_size, + pp_size, + mp_resize, + pp_resize): + self._test(inputs, class_tmpdir, mp_size, pp_size, mp_resize, pp_resize) + + @pytest.mark.world_size(1) + @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(2, 2, 1, 1)]) + def test_world_size_4to1(self, + inputs, + class_tmpdir, + baseline_ws4, + mp_size, + pp_size, + mp_resize, + pp_resize): + self._test(inputs, class_tmpdir, mp_size, pp_size, mp_resize, pp_resize) + + @pytest.mark.world_size(2) + @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(2, 2, 2, 1)]) + def test_world_size_4to2(self, + inputs, + class_tmpdir, + baseline_ws4, + mp_size, + pp_size, + mp_resize, + pp_resize): + self._test(inputs, class_tmpdir, mp_size, pp_size, mp_resize, pp_resize) + + @pytest.mark.world_size(4) + @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(1, 1, 2, 2)]) + def test_world_size_1to4(self, + inputs, + class_tmpdir, + baseline_ws1, + mp_size, + pp_size, + mp_resize, + pp_resize): + self._test(inputs, class_tmpdir, mp_size, pp_size, mp_resize, pp_resize) + + @pytest.mark.world_size(4) + @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", + [(1, + 2, + 1, + 4), + (2, + 1, + 2, + 2)]) + def test_world_size_2to4(self, + inputs, + class_tmpdir, + baseline_ws2, + mp_size, + pp_size, + mp_resize, + pp_resize): + self._test(inputs, class_tmpdir, mp_size, pp_size, mp_resize, pp_resize) From 27ed180a7618608cd90dc3b3d37a79a19c1bc2f7 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 19 Aug 2022 16:45:57 -0700 Subject: [PATCH 07/25] added DistributedFixture test and documentation --- tests/unit/comm/test_dist.py | 37 +++++++- tests/unit/common.py | 97 +++++++++++++++++++++ 
tests/unit/test_configurable_parallel_pp.py | 2 +- 3 files changed, 134 insertions(+), 2 deletions(-) diff --git a/tests/unit/comm/test_dist.py b/tests/unit/comm/test_dist.py index 32ea9126d75b..6f3ec1212393 100644 --- a/tests/unit/comm/test_dist.py +++ b/tests/unit/comm/test_dist.py @@ -1,7 +1,8 @@ +import os import torch import deepspeed.comm as dist -from tests.unit.common import DistributedTest +from tests.unit.common import DistributedTest, DistributedFixture import pytest @@ -62,6 +63,40 @@ def test_world_size_1(self): assert dist.get_world_size() == 1 +# Demonstration of the DistributedFixture class +@pytest.fixture(params=[2, 4]) +def val1(request): + return request.param + + +@pytest.fixture(params=[16, 32]) +def val2(request): + return request.param + + +class distributed_fixture(DistributedFixture): + world_size = 2 + + def run(self, class_tmpdir, val1, val2): + assert int(os.environ["WORLD_SIZE"]) == self.world_size + local_rank = os.environ["LOCAL_RANK"] + file_path = os.path.join(class_tmpdir, f"checkpoint-{local_rank}.pt") + with open(file_path, "w") as f: + f.write(f"{local_rank},{val1},{val2}") + + +class TestDistributedFixture(DistributedTest): + world_size = 1 + + def test(self, distributed_fixture, class_tmpdir, val1, val2): + for rank in range(2): + file_path = os.path.join(class_tmpdir, f"checkpoint-{rank}.pt") + with open(file_path, "r") as f: + chkpt = f.read() + assert chkpt == f"{rank},{val1},{val2}" + assert int(os.environ["WORLD_SIZE"]) == 1 + + class TestDistAllReduce(DistributedTest): world_size = [1, 2, 4] diff --git a/tests/unit/common.py b/tests/unit/common.py index 63993ac7bd39..9ea03d93bb11 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -64,6 +64,10 @@ def set_cuda_visibile(): class DistributedExec(ABC): + """ + Base class for distributed execution of functions/methods. Contains common + methods needed for DistributedTest and DistributedFixture. + """ world_size = 2 backend = "nccl" @@ -169,6 +173,55 @@ def _dist_init(self, local_rank, num_procs, skip_msg): class DistributedFixture(DistributedExec): + """ + Implementation that extends @pytest.fixture to allow for distributed execution. + This is primarily meant to be used when a test requires executing two pieces of + code with different world sizes. + + There are 2 parameters that can be modified: + - world_size: int = 2 -- the number of processes to launch + - backend: Literal['nccl','mpi','gloo'] = 'nccl' -- which backend to use + + Features: + - able to call pytest.skip() inside fixture + - can be reused by multiple tests + - can accept other fixtures as input + + Limitations: + - cannot use @pytest.mark.parametrize + - world_size cannot be modified after definition and only one world_size value is accepted + - any fixtures used must also be used in the test that uses this fixture (see example below) + - return values cannot be returned. Passing values to a DistributedTest + object can be achieved using class_tmpdir and writing to file (see example below) + + Usage: + - must implement a run(self, ...) 
method + - fixture can be used by making the class name input to a test function + + Example: + @pytest.fixture(params=[10,20]) + def regular_pytest_fixture(request): + return request.param + + class distributed_fixture_example(DistributedFixture): + world_size = 4 + + def run(self, regular_pytest_fixture, class_tmpdir): + assert int(os.environ["WORLD_SIZE"]) == self.world_size + local_rank = os.environ["LOCAL_RANK"] + print(f"Rank {local_rank} with value {regular_pytest_fixture}") + with open(os.path.join(class_tmpdir, f"{local_rank}.txt"), "w") as f: + f.write(f"{local_rank},{regular_pytest_fixture}") + + class TestExample(DistributedTest): + world_size = 1 + + def test(self, distributed_fixture_example, regular_pytest_fixture, class_tmpdir): + assert int(os.environ["WORLD_SIZE"]) == self.world_size + for rank in range(4): + with open(os.path.join(class_tmpdir, f"{rank}.txt"), "r") as f: + assert f.read() == f"{rank},{regular_pytest_fixture}" + """ is_dist_fixture = True # These values are just placeholders so that pytest recognizes this as a fixture @@ -184,6 +237,50 @@ def __init__(self): class DistributedTest(DistributedExec): + """ + Implementation for running pytest with distributed execution. + + There are 2 parameters that can be modified: + - world_size: Union[int,List[int]] = 2 -- the number of processes to launch + - backend: Literal['nccl','mpi','gloo'] = 'nccl' -- which backend to use + + Features: + - able to call pytest.skip() inside tests + - works with pytest fixtures, parametrize, mark, etc. + - can contain multiple tests (each of which can be parametrized separately) + - class methods can be fixtures (usable by tests in this class only) + - world_size can be changed for individual tests using @pytest.mark.world_size(world_size) + - class_tmpdir is a fixture that can be used to get a tmpdir shared among + all tests (including DistributedFixture) + + Usage: + - class name must start with "Test" + - must implement one or more test*(self, ...) 
methods + + Example: + @pytest.fixture(params=[10,20]) + def val1(request): + return request.param + + @pytest.mark.fast + @pytest.mark.parametrize("val2", [30,40]) + class TestExample(DistributedTest): + world_size = 2 + + @pytest.fixture(params=[50,60]) + def val3(self, request): + return request.param + + def test_1(self, val1, val2, str1="hello world"): + assert int(os.environ["WORLD_SIZE"]) == self.world_size + assert all(val1, val2, str1) + + @pytest.mark.world_size(1) + @pytest.mark.parametrize("val4", [70,80]) + def test_2(self, val1, val2, val3, val4): + assert int(os.environ["WORLD_SIZE"]) == 1 + assert all(val1, val2, val3, val4) + """ is_dist_test = True # Temporary directory that is shared among test methods in a class diff --git a/tests/unit/test_configurable_parallel_pp.py b/tests/unit/test_configurable_parallel_pp.py index 8f116424fd99..4daefeb824f4 100755 --- a/tests/unit/test_configurable_parallel_pp.py +++ b/tests/unit/test_configurable_parallel_pp.py @@ -86,7 +86,7 @@ def test_pp_basic(self, tmpdir): mp_size=mp_size, args_others=args_defaults, topo=topo) - model = get_deepspeed_model(gpt2_pipe_model, tmpdir) + model = get_deepspeed_model(gpt2_pipe_model) tag = 'pp_basic' state_dict = {} From fb59b1c6397e72d899b7d9796a065c943dca59db Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Mon, 22 Aug 2022 14:31:57 -0700 Subject: [PATCH 08/25] last tests --- tests/unit/inference/test_inference.py | 6 + tests/unit/test_activation_checkpointing.py | 190 +++++++++----------- tests/unit/test_fp16.py | 48 ----- tests/unit/test_moe.py | 89 ++++----- 4 files changed, 120 insertions(+), 213 deletions(-) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 09fcd0736af5..ab79e3d78361 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -291,6 +291,12 @@ class TestLMCorrectness(DistributedTest): world_size = 1 def test(self, model_family, model_name, task): + # imports here to avoid import errors when pytest collects tests + import lm_eval + import lm_eval.models + import lm_eval.tasks + import lm_eval.evaluator + local_rank = os.getenv("LOCAL_RANK", "0") device = torch.device(f"cuda:{local_rank}") dtype = torch.float diff --git a/tests/unit/test_activation_checkpointing.py b/tests/unit/test_activation_checkpointing.py index ad32a53385f2..1b27ff74257e 100644 --- a/tests/unit/test_activation_checkpointing.py +++ b/tests/unit/test_activation_checkpointing.py @@ -1,17 +1,13 @@ # TODO: add tests with model parallelism for activation partitioning and other features. -from copy import deepcopy - import pytest - import torch - import deepspeed +from copy import deepcopy +from tests.unit.common import DistributedTest ckpt = deepspeed.checkpointing.checkpoint -from .common import distributed_test - def _compute(module, *inputs, do_checkpoint=False): if do_checkpoint: @@ -59,9 +55,6 @@ def _match_outputs(ref, tgt): assert torch.equal(ref, tgt) -# This is distributed because checkpoint() assumes that deepspeed.comm is initialized. -# deepspeed.comm is used with activation partitioning, but not for these simple cases. -@distributed_test(world_size=1) def _test_activation_checkpoint(module, *inputs): # Move to device module.cuda() @@ -82,9 +75,6 @@ def _test_activation_checkpoint(module, *inputs): _match_outputs(b, t) -# This is distributed because checkpoint() assumes that deepspeed.comm is initialized. -# deepspeed.comm is used with activation partitioning, but not for these simple cases. 
-@distributed_test(world_size=1) def _test_activation_checkpoint_ordering(module, expected_ordering, *inputs): # Move to device module.cuda() @@ -137,6 +127,26 @@ def forward(self, x, mask): return dup, x, mask +class DropMaskLinear(torch.nn.Linear): + def forward(self, x, mask): + return super().forward(x) + + +class LinearNonTensorInput(torch.nn.Linear): + def forward(self, x, non_tensor_input): + return super().forward(x) + + +class LinearNonTensorOutput(torch.nn.Linear): + def __init__(self, non_tensor_output): + super().__init__(HIDDEN_DIM, HIDDEN_DIM) + self.non_tensor_output = non_tensor_output + + def forward(self, x): + out = super().forward(x) + return out, self.non_tensor_output + + HIDDEN_DIM = 20 @@ -159,69 +169,48 @@ def _bool_to_float(btensor, dtype=torch.float32): # -def test_ckpt_inputs1_outputs1(): - module = torch.nn.Linear(HIDDEN_DIM, HIDDEN_DIM) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - _test_activation_checkpoint(module, inputs) - - # both bool and float are important, as bool is not differentiable @pytest.mark.parametrize('mask', [ _mixed_mask(), _bool_to_float(_mixed_mask()), ]) -def test_ckpt_inputs2_outputs1(mask): - module = MaskedLinear(HIDDEN_DIM, HIDDEN_DIM) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - _test_activation_checkpoint(module, inputs, mask) - - -@pytest.mark.parametrize('mask', - [ - _mixed_mask(), - _bool_to_float(_mixed_mask()), - ]) -def test_ckpt_inputs2_outputs2(mask): - module = MaskedLinearSeq(HIDDEN_DIM, HIDDEN_DIM) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - _test_activation_checkpoint(module, inputs, mask) - - -@pytest.mark.parametrize('mask', - [ - _mixed_mask(), - _bool_to_float(_mixed_mask()), - ]) -def test_ckpt_inputs2_outputs3(mask): - module = MaskedLinearSeqDup(HIDDEN_DIM, HIDDEN_DIM) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - _test_activation_checkpoint(module, inputs, mask) - - -class DropMaskLinear(torch.nn.Linear): - def forward(self, x, mask): - return super().forward(x) - - -def test_ckpt_arg_none(): - module = DropMaskLinear(HIDDEN_DIM, HIDDEN_DIM) - inputs = (torch.rand(HIDDEN_DIM), None) - inputs[0].requires_grad = True - _test_activation_checkpoint(module, *inputs) - - -class LinearNonTensorInput(torch.nn.Linear): - def forward(self, x, non_tensor_input): - return super().forward(x) +class TestActivationCheckpoint(DistributedTest): + world_size = 1 + + def test_ckpt_inputs1_outputs1(self, mask): + module = torch.nn.Linear(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs) + + def test_ckpt_inputs2_outputs1(self, mask): + module = MaskedLinear(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs, mask) + + def test_ckpt_inputs2_outputs2(self, mask): + module = MaskedLinearSeq(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs, mask) + + def test_ckpt_inputs2_outputs3(self, mask): + module = MaskedLinearSeqDup(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs, mask) + + def test_ckpt_arg_none(self, mask): + module = DropMaskLinear(HIDDEN_DIM, HIDDEN_DIM) + inputs = (torch.rand(HIDDEN_DIM), None) + inputs[0].requires_grad = True + _test_activation_checkpoint(module, *inputs) @pytest.mark.parametrize( - 
'non_tensor_input', + 'non_tensor', [None, 2, True, @@ -230,38 +219,20 @@ def forward(self, x, non_tensor_input): (None, True, torch.randn(HIDDEN_DIM))]) -def test_ckpt_non_tensor_input(non_tensor_input): - module = LinearNonTensorInput(HIDDEN_DIM, HIDDEN_DIM) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - _test_activation_checkpoint(module, inputs, non_tensor_input) +class TestCheckpointNonTensor(DistributedTest): + world_size = 1 + def test_ckpt_non_tensor_input(self, non_tensor): + module = LinearNonTensorInput(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs, non_tensor) -class LinearNonTensorOutput(torch.nn.Linear): - def __init__(self, non_tensor_output): - super().__init__(HIDDEN_DIM, HIDDEN_DIM) - self.non_tensor_output = non_tensor_output - - def forward(self, x): - out = super().forward(x) - return out, self.non_tensor_output - - -@pytest.mark.parametrize( - 'non_tensor_output', - [None, - 2, - True, - (None, - 2.5), - (None, - True, - torch.randn(HIDDEN_DIM))]) -def test_ckpt_non_tensor_output(non_tensor_output): - module = LinearNonTensorOutput(non_tensor_output) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - _test_activation_checkpoint(module, inputs) + def test_ckpt_non_tensor_output(self, non_tensor): + module = LinearNonTensorOutput(non_tensor) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs) @pytest.mark.parametrize('non_tensor_output', @@ -276,15 +247,18 @@ def test_ckpt_non_tensor_output(non_tensor_output): True, torch.randn(HIDDEN_DIM)) ]) -def test_ckpt_non_tensor_output_ordering(non_tensor_output): - module = LinearNonTensorOutput(non_tensor_output) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - - # First return is a tensor - ordering = [True] - if type(non_tensor_output) in [list, tuple]: - ordering += [torch.is_tensor(t) for t in non_tensor_output] - else: - ordering += [torch.is_tensor(non_tensor_output)] - _test_activation_checkpoint_ordering(module, ordering, inputs) +class TestCheckpointNonTensorOutputOrdering(DistributedTest): + world_size = 1 + + def test_ckpt_non_tensor_output_ordering(self, non_tensor_output): + module = LinearNonTensorOutput(non_tensor_output) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + + # First return is a tensor + ordering = [True] + if type(non_tensor_output) in [list, tuple]: + ordering += [torch.is_tensor(t) for t in non_tensor_output] + else: + ordering += [torch.is_tensor(non_tensor_output)] + _test_activation_checkpoint_ordering(module, ordering, inputs) diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py index 66d1bb737ff0..ddfe8369f784 100755 --- a/tests/unit/test_fp16.py +++ b/tests/unit/test_fp16.py @@ -444,54 +444,6 @@ def test(self, zero_stage, use_cpu_offload, hidden_dim): model.step() -#TODO: WHAT DOES THIS TEST? -''' -def test_zero_static_scale_deprecated_format(tmpdir): - config_dict = { - "train_batch_size": 4, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True, - "loss_scale": 138. 
- }, - "zero_optimization": { - "stage": 1 - } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=2) - def _test_zero_static_scale(args): - hidden_dim = 10 - model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - # Ensure the static scaler is configured. - assert optim.dynamic_loss_scale == False - assert optim.loss_scaler.loss_scale == 138. - - # Now make sure things work.. - data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_zero_static_scale(args) -''' - - @pytest.mark.parametrize("zero_stage", [1, 2, 3]) @pytest.mark.parametrize("use_cpu_offload", [True, False]) class TestZeroAllowUntestedOptimizer(DistributedTest): diff --git a/tests/unit/test_moe.py b/tests/unit/test_moe.py index 22bf8ba2caf7..6d9e88bc21d8 100644 --- a/tests/unit/test_moe.py +++ b/tests/unit/test_moe.py @@ -1,48 +1,34 @@ import torch import deepspeed import pytest -from tests.unit.common import distributed_test -from tests.unit.simple_model import SimplePRMoEModel, SimpleMoEModel, sequence_dataloader, args_from_dict +from tests.unit.common import DistributedTest +from tests.unit.simple_model import SimplePRMoEModel, SimpleMoEModel, sequence_dataloader from tests.unit.util import required_torch_version -try: - from apex import amp # noqa: F401 - _amp_available = True -except ImportError: - _amp_available = False -amp_available = pytest.mark.skip(_amp_available, reason="apex/amp is not installed") +@pytest.mark.parametrize("ep_size", [2, 4]) +@pytest.mark.parametrize("use_residual", [True, False]) +class TestMoE(DistributedTest): + world_size = 4 -@pytest.mark.parametrize("ep_size, use_residual", - [(2, - True), - (2, - False), - (4, - True), - (4, - False)]) -def test_moe(tmpdir, ep_size, use_residual): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + def test(self, ep_size, use_residual): + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True + config_dict = { + "train_batch_size": 8, + "steps_per_print": 1, + "fp16": { + "enabled": True + } } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 16 + hidden_dim = 16 - @distributed_test(world_size=[4]) - def _test_moe(args, hidden_dim, ep_size, use_residual): # E+D -- ep_size = 2 # E only -- ep_size = 4 model = SimpleMoEModel(hidden_dim, ep_size=ep_size, use_residual=use_residual) optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer, dist_init_required=False) @@ -58,35 +44,29 @@ def _test_moe(args, hidden_dim, ep_size, use_residual): model.backward(loss) model.step() - _test_moe(args=args, - hidden_dim=hidden_dim, - ep_size=ep_size, - use_residual=use_residual) - @pytest.mark.parametrize("ep_size, use_residual", [(2, True), (2, False)]) -def test_pr_moe(tmpdir, ep_size, use_residual): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": 
True +class TestPRMoE(DistributedTest): + world_size = 4 + + def test(self, ep_size, use_residual): + if not required_torch_version(): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + config_dict = { + "train_batch_size": 8, + "steps_per_print": 1, + "fp16": { + "enabled": True + } } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 16 + hidden_dim = 16 - @distributed_test(world_size=[4]) - def _test_moe(args, hidden_dim, ep_size, use_residual): # E+D -- ep_size = 2 # E only -- ep_size = 4 - model = SimplePRMoEModel(hidden_dim, ep_size=ep_size, use_residual=use_residual) optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer, dist_init_required=False) @@ -100,8 +80,3 @@ def _test_moe(args, hidden_dim, ep_size, use_residual): loss = model(batch[0], batch[1]) model.backward(loss) model.step() - - _test_moe(args=args, - hidden_dim=hidden_dim, - ep_size=ep_size, - use_residual=use_residual) From db57b66e653b44d2fa9464cf406107be613a45c8 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 23 Aug 2022 11:22:41 -0700 Subject: [PATCH 09/25] fixes for refactored tests --- tests/unit/test_configurable_parallel_pp.py | 3 +-- tests/unit/test_ds_initialize.py | 2 ++ tests/unit/test_fp16.py | 2 +- tests/unit/test_lr_schedulers.py | 6 +++--- tests/unit/test_zero.py | 14 ++++++++++---- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/tests/unit/test_configurable_parallel_pp.py b/tests/unit/test_configurable_parallel_pp.py index 4daefeb824f4..e862ff33167a 100755 --- a/tests/unit/test_configurable_parallel_pp.py +++ b/tests/unit/test_configurable_parallel_pp.py @@ -68,7 +68,7 @@ class TestConfigurablePP(ConfigurablePP): pp_size = 2 world_size = 4 # mp_size * pp_size - def test_pp_basic(self, tmpdir): + def test_pp_basic(self, inputs, tmpdir): # basic test case, mp_size=2, pp_size=2, verify ckpt saving/loading. 
args_defaults = { 'num_layers': 8, @@ -94,7 +94,6 @@ def test_pp_basic(self, tmpdir): model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) if model.is_first_stage() or model.is_last_stage(): - inputs = self.get_inputs() loader = RepeatingLoader([(inputs[0], 0)]) data_iter = iter(loader) else: diff --git a/tests/unit/test_ds_initialize.py b/tests/unit/test_ds_initialize.py index 5d1baf7ccf40..1dcb2d623932 100644 --- a/tests/unit/test_ds_initialize.py +++ b/tests/unit/test_ds_initialize.py @@ -89,6 +89,8 @@ def _optimizer_callable(params) -> Optimizer: @pytest.mark.parametrize("scheduler_type", [None, _LRScheduler, Callable]) @pytest.mark.parametrize("optimizer_type", [None, Optimizer, Callable]) class TestClientLrScheduler(DistributedTest): + world_size = 1 + def test(self, scheduler_type, optimizer_type): def _my_lambda(epoch): return epoch // 10 diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py index ddfe8369f784..d5b8268844ed 100755 --- a/tests/unit/test_fp16.py +++ b/tests/unit/test_fp16.py @@ -796,7 +796,7 @@ def test(self): @pytest.mark.parametrize('stage', [1, 2, 3]) class TestZeroEmptyGrad(DistributedTest): - world_size = 2 + world_size = 1 def test(self, stage): config_dict = { diff --git a/tests/unit/test_lr_schedulers.py b/tests/unit/test_lr_schedulers.py index ca2dd849cb97..4f0c940a74bc 100755 --- a/tests/unit/test_lr_schedulers.py +++ b/tests/unit/test_lr_schedulers.py @@ -81,7 +81,7 @@ def test(self, scheduler_type, params): @pytest.mark.parametrize("warmup_num_steps", [10, 15, 19, 33]) -@pytest.mark.parametrize("warnup_type", [WARMUP_LOG_RATE, WARMUP_LINEAR_RATE]) +@pytest.mark.parametrize("warmup_type", [WARMUP_LOG_RATE, WARMUP_LINEAR_RATE]) class TestLrSchedule(DistributedTest): world_size = 1 @@ -300,7 +300,7 @@ def test(self, min_lr, step_rate, step_size, staircase): step_lrs = [] for _, batch in enumerate(data_loader): - step_lrs.append(lr_scheduler.get_lr()) + step_lrs.extend(lr_scheduler.get_lr()) loss = model(batch[0], batch[1]) model.backward(loss) model.step() @@ -364,7 +364,7 @@ def test_lr(self, min_lr, max_lr, decay_rate, cycle_step_size, decay_step_size): step_lrs = [] for _, batch in enumerate(data_loader): - step_lrs.append(lr_scheduler.get_lr()) + step_lrs.extend(lr_scheduler.get_lr()) loss = model(batch[0], batch[1]) model.backward(loss) model.step() diff --git a/tests/unit/test_zero.py b/tests/unit/test_zero.py index be4ade46ecf0..d4df3e925a29 100755 --- a/tests/unit/test_zero.py +++ b/tests/unit/test_zero.py @@ -525,6 +525,7 @@ class TestZero3ParamPartitioningBase(DistributedTest): world_size = 2 def test( + self, param_persistence_threshold: int, fp16_enabled: bool, contiguous_gradients: bool, @@ -948,9 +949,12 @@ def __init_weights(self, module): 1)) +@pytest.skip("not working") @pytest.mark.parametrize("param_persistence_threshold", [0, 10]) @pytest.mark.parametrize("contiguous_gradients", [True, False]) @pytest.mark.parametrize("offload_optimizer", [True, False]) +@pytest.mark.parametrize("zero_grad", [True, False]) +@pytest.mark.parametrize("prefetching", [True, False]) class TestZero3ParamPartitioningBaseBF16(DistributedTest): world_size = 2 @@ -959,8 +963,8 @@ def test( param_persistence_threshold: int, contiguous_gradients: bool, offload_optimizer: bool, - zero_grad: bool = True, - iteration: int = 0, + zero_grad: bool, + prefetching: bool, ) -> None: if offload_optimizer and not contiguous_gradients: return @@ -969,7 +973,7 @@ def test( n = 5 weights = [Parameter(torch.zeros((m, n), dtype=torch.float32)) for _ in range(3)] 
model = EltwiseMultiplicationTestNetwork(*weights) - + prefetch_bucket_size = sum([p.numel() for p in model.parameters(recurse=True)]) cfg = { "train_micro_batch_size_per_gpu": 1, "zero_optimization": { @@ -977,6 +981,7 @@ def test( "stage3_max_reuse_distance": 0, "stage3_param_persistence_threshold": param_persistence_threshold, "contiguous_gradients": contiguous_gradients, + "stage3_prefetch_bucket_size": prefetch_bucket_size if prefetching else 0 }, "optimizer": { "type": "Adam", @@ -1082,7 +1087,8 @@ def create_tensor(vals): n), dtype=torch.bfloat16, device=ds_engine.device), - prefetching=train_iter > 0, + use_module_trace=train_iter > 0, + param_prefetching=prefetching and train_iter > 0, ) assert torch.allclose(activations["hidden1"], expected_hidden1) assert torch.allclose(activations["hidden2"], expected_hidden2) From 5eab66ab664dd25f2b6a370d170508b3e091d45a Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 23 Aug 2022 14:21:40 -0700 Subject: [PATCH 10/25] remove subdirs in workflow files --- .github/workflows/amd.yml | 4 +- .github/workflows/nv-torch-latest-v100.yml | 4 +- tests/unit/common.py | 103 -- tests/unit/test_checkpointing.py | 1424 -------------------- 4 files changed, 4 insertions(+), 1531 deletions(-) delete mode 100755 tests/unit/test_checkpointing.py diff --git a/.github/workflows/amd.yml b/.github/workflows/amd.yml index 97b96f2d5875..b43098ad9f4e 100644 --- a/.github/workflows/amd.yml +++ b/.github/workflows/amd.yml @@ -66,5 +66,5 @@ jobs: run: | if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi cd tests - TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose unit/{autotuning,checkpoint,comm,compression,elasticity,inference,launcher,monitor,ops,profiling,runtime,utils} - #TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'sequential' unit/{autotuning,checkpoint,comm,compression,elasticity,inference,launcher,monitor,ops,profiling,runtime,utils} + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose unit/ + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'sequential' unit/ diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 1dc535f9b327..ca43e4653e85 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -60,5 +60,5 @@ jobs: unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi cd tests - TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/{autotuning,checkpoint,comm,compression,elasticity,inference,launcher,monitor,ops,profiling,runtime,utils} --torch_ver="1.12" --cuda_ver="11.3" - TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/{autotuning,checkpoint,comm,compression,elasticity,inference,launcher,monitor,ops,profiling,runtime,utils} --torch_ver="1.12" --cuda_ver="11.3" + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.12" --cuda_ver="11.3" + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/ --torch_ver="1.12" --cuda_ver="11.3" diff --git a/tests/unit/common.py b/tests/unit/common.py index 5c4144ebc562..2d2c8e0541ba 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -321,109 
+321,6 @@ def _get_current_test_func(self, request): return getattr(self, func_name) -def distributed_test(world_size=2, backend='nccl'): - """A decorator for executing a function (e.g., a unit test) in a distributed manner. - This decorator manages the spawning and joining of processes, initialization of - deepspeed.comm, and catching of errors. - - Usage example: - @distributed_test(worker_size=[2,3]) - def my_test(): - rank = dist.get_rank() - world_size = dist.get_world_size() - assert(rank < world_size) - - Arguments: - world_size (int or list): number of ranks to spawn. Can be a list to spawn - multiple tests. - """ - def dist_wrap(run_func): - """Second-level decorator for dist_test. This actually wraps the function. """ - def dist_init(local_rank, num_procs, *func_args, **func_kwargs): - """Initialize deepspeed.comm and execute the user function. """ - os.environ['MASTER_ADDR'] = '127.0.0.1' - os.environ['MASTER_PORT'] = get_master_port() - os.environ['LOCAL_RANK'] = str(local_rank) - # NOTE: unit tests don't support multi-node so local_rank == global rank - os.environ['RANK'] = str(local_rank) - os.environ['WORLD_SIZE'] = str(num_procs) - - # turn off NCCL logging if set - os.environ.pop('NCCL_DEBUG', None) - - set_cuda_visibile() - - deepspeed.init_distributed(dist_backend=backend) - #dist.init_process_group(backend=backend) - dist.barrier() - - if torch.cuda.is_available(): - torch.cuda.set_device(local_rank) - - run_func(*func_args, **func_kwargs) - - # make sure all ranks finish at the same time - dist.barrier() - # tear down after test completes - dist.destroy_process_group() - - def dist_launcher(num_procs, *func_args, **func_kwargs): - """Launch processes and gracefully handle failures. """ - - # Spawn all workers on subprocesses. - processes = [] - for local_rank in range(num_procs): - p = Process(target=dist_init, - args=(local_rank, - num_procs, - *func_args), - kwargs=func_kwargs) - p.start() - processes.append(p) - - # Now loop and wait for a test to complete. The spin-wait here isn't a big - # deal because the number of processes will be O(#GPUs) << O(#CPUs). - any_done = False - while not any_done: - for p in processes: - if not p.is_alive(): - any_done = True - break - - # Wait for all other processes to complete - for p in processes: - p.join(DEEPSPEED_UNIT_WORKER_TIMEOUT) - - failed = [(rank, p) for rank, p in enumerate(processes) if p.exitcode != 0] - for rank, p in failed: - # If it still hasn't terminated, kill it because it hung. - if p.exitcode is None: - p.terminate() - pytest.fail(f'Worker {rank} hung.', pytrace=False) - if p.exitcode < 0: - pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}', - pytrace=False) - if p.exitcode > 0: - pytest.fail(f'Worker {rank} exited with code {p.exitcode}', - pytrace=False) - - def run_func_decorator(*func_args, **func_kwargs): - """Entry point for @distributed_test(). 
""" - - if isinstance(world_size, int): - dist_launcher(world_size, *func_args, **func_kwargs) - elif isinstance(world_size, list): - for procs in world_size: - dist_launcher(procs, *func_args, **func_kwargs) - time.sleep(0.5) - else: - raise TypeError(f'world_size must be an integer or a list of integers.') - - return run_func_decorator - - return dist_wrap - - def get_test_path(filename): curr_path = Path(__file__).parent return str(curr_path.joinpath(filename)) diff --git a/tests/unit/test_checkpointing.py b/tests/unit/test_checkpointing.py deleted file mode 100755 index 7174ae0a0a63..000000000000 --- a/tests/unit/test_checkpointing.py +++ /dev/null @@ -1,1424 +0,0 @@ -import deepspeed -from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer -from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer -from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer -from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine -from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer - -from deepspeed.runtime.pipe.topology import * - -PipeTopo = PipeDataParallelTopology - -from deepspeed.ops.op_builder import FusedLambBuilder, CPUAdamBuilder - -from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 -from .util import required_minimum_torch_version, required_torch_version - -import itertools -import pytest -import numbers -from .common import distributed_test -from .simple_model import * - - -def compare_deepspeed_states(saved_model, loaded_model): - # These are compared in more depth in other places - assert hasattr(loaded_model, 'module') - - assert saved_model.sparse_tensor_module_names == loaded_model.sparse_tensor_module_names - assert saved_model.skipped_steps == loaded_model.skipped_steps - assert saved_model.global_steps == loaded_model.global_steps - - -def compare_model_states(saved_model, - loaded_model, - compare_optimizer=True, - load_module_only=False): - if not load_module_only: - compare_deepspeed_states(saved_model, loaded_model) - - for p0, p1 in zip(saved_model.module.named_parameters(), loaded_model.module.named_parameters()): - np0, p0 = p0 - np1, p1 = p1 - if 'deepspeed_moe.gate.wg' in np0: - # these params are converted to float at runtime, cast to half for comparison - p1 = p1.half() - p0 = p0.half() - assert id(p0) != id(p1), f'Comparing fp16 model state tensor against itself : {id(p0)} <====> {id(p1)}' - try: - assert torch.allclose(p0, p1, atol=1e-07), f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}" - except RuntimeError as err: - print(f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}") - raise err - - if not compare_optimizer: - return - - if DeepSpeedZeroOptimizer_Stage3 is not None and isinstance( - saved_model.optimizer, - DeepSpeedZeroOptimizer_Stage3): - for p0, p1 in zip(saved_model.optimizer.fp32_partitioned_groups_flat, loaded_model.optimizer.fp32_partitioned_groups_flat): - assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" - - elif isinstance(saved_model.optimizer, DeepSpeedZeroOptimizer): - for p0, p1 in zip(saved_model.optimizer.single_partition_of_fp32_groups, loaded_model.optimizer.single_partition_of_fp32_groups): - assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' - assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" - - elif isinstance(saved_model.optimizer, FP16_Optimizer): 
- for p0, p1 in zip(saved_model.optimizer.fp32_groups_flat, loaded_model.optimizer.fp32_groups_flat): - assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' - assert torch.allclose(p0, p1, atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" - - elif isinstance(saved_model.optimizer, FP16_UnfusedOptimizer): - for params0, params1 in zip(saved_model.optimizer.fp32_groups, loaded_model.optimizer.fp32_groups): - for p0, p1 in zip(params0, params1): - assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' - assert torch.allclose(p0, p1, atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" - elif isinstance(saved_model.optimizer, torch.optim.Optimizer): - pass - else: - assert False, f'Unexpected Optimizer Type: {saved_model.optimizer}' - - -def _compare_state_dicts(state0, state1, expected_mismatch_keys=[]): - for (k0, s0), (k1, s1) in zip(state0.items(), state1.items()): - assert k0 == k1, f'failure due to key mismatch {k0} != {k1}' - if k0 in expected_mismatch_keys: - continue - if isinstance(s0, torch.Tensor) and isinstance(s1, torch.Tensor): - assert id(s0) != id(s1), f'Comparing optimizer state tensor against itself: {id(s0)} <====> {id(s1)}' - assert torch.equal(s0.to('cpu'), s1.to('cpu')) - else: - assert s0 == s1, f'failures with keys = {k0}, {k1}, values = {type(s0[0])} and {type(s1[0])}' - - -def compare_optimizer_states(saved_model, loaded_model, hidden_dim, fp16=True): - saved_optimizer = saved_model.optimizer.optimizer if fp16 else saved_model.optimizer - loaded_optimizer = loaded_model.optimizer.optimizer if fp16 else loaded_model.optimizer - - for state0, state1 in zip(saved_optimizer.state.values(), - loaded_optimizer.state.values()): - _compare_state_dicts(state0, state1) - - -def compare_lr_scheduler_states(saved_model, loaded_model): - assert hasattr(saved_model, 'lr_scheduler') - assert hasattr(loaded_model, 'lr_scheduler') - - saved_scheduler = saved_model.lr_scheduler - loaded_scheduler = loaded_model.lr_scheduler - - assert hasattr(saved_scheduler, 'state_dict') - assert hasattr(loaded_scheduler, 'state_dict') - - saved_sd = saved_scheduler.state_dict() - loaded_sd = loaded_scheduler.state_dict() - - print(f"saved_sd = {saved_sd}") - print(f"loaded_sd = {loaded_sd}") - - assert saved_sd.keys() == loaded_sd.keys() - - for state0, state1 in zip(saved_sd.values(), loaded_sd.values()): - if isinstance(state0, numbers.Number) and isinstance(state1, numbers.Number): - assert state0 == state1 - - -def create_deepspeed_model(args, model, base_optimizer): - if base_optimizer is None: - ds_model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - else: - ds_model, _, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=base_optimizer) - - return ds_model - - -def checkpoint_correctness_verification(args, - models, - hidden_dim, - tmpdir, - load_optimizer_states=False, - load_lr_scheduler_states=False, - fp16=True, - train_batch=False, - base_optimizers=[None, - None], - empty_tag=False, - seq_dataloader=False, - load_module_only=False): - dtype = torch.half if fp16 else torch.float32 - ds_model = create_deepspeed_model(args=args, - model=models[0], - base_optimizer=base_optimizers[0]) - - if seq_dataloader: - data_loader = sequence_dataloader(model=ds_model, - total_samples=50, - hidden_dim=hidden_dim, - device=ds_model.device, - dtype=dtype) - else: - data_loader = random_dataloader(model=ds_model, - 
total_samples=50, - hidden_dim=hidden_dim, - device=ds_model.device, - dtype=dtype) - - if train_batch: - ds_model.set_dataloader(data_loader) - for n, batch in enumerate(data_loader): - loss = ds_model.train_batch() - else: - for n, batch in enumerate(data_loader): - loss = ds_model(batch[0], batch[1]) - ds_model.backward(loss) - ds_model.step() - - trained_model = ds_model - - save_folder = os.path.join(tmpdir, 'saved_checkpoint') - save_tag = None if empty_tag else '1' - - trained_model.save_checkpoint(save_folder, tag=save_tag) - - dist.barrier() - - loaded_model = create_deepspeed_model(args=args, - model=models[1], - base_optimizer=base_optimizers[1]) - assert list(trained_model.parameters())[0].dtype == list( - loaded_model.parameters())[0].dtype - - loaded_model.load_checkpoint(save_folder, - tag=save_tag, - load_optimizer_states=load_optimizer_states, - load_lr_scheduler_states=load_lr_scheduler_states, - load_module_only=load_module_only) - - compare_model_states(trained_model, - loaded_model, - compare_optimizer=load_optimizer_states, - load_module_only=load_module_only) - - if load_optimizer_states: - compare_optimizer_states(trained_model, loaded_model, hidden_dim, fp16) - - if load_lr_scheduler_states: - compare_lr_scheduler_states(trained_model, loaded_model) - - -@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], - reason="lamb is not compatible") -def test_checkpoint_unfused_optimizer(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True - }, - "scheduler": { - "type": "OneCycle", - "params": { - "cycle_first_step_size": 1000, - "cycle_first_stair_count": 500, - "cycle_second_step_size": 1000, - "cycle_second_stair_count": 500, - "decay_step_size": 1000, - "cycle_min_lr": 0.0001, - "cycle_max_lr": 0.0010, - "decay_lr_rate": 0.001, - "cycle_min_mom": 0.85, - "cycle_max_mom": 0.99, - "decay_mom_rate": 0.0 - } - } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - @distributed_test(world_size=[2]) - def _test_checkpoint_unfused_optimizer(args, - models, - hidden_dim, - load_optimizer_states): - checkpoint_correctness_verification(args, - models=models, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - load_optimizer_states=load_optimizer_states) - - _test_checkpoint_unfused_optimizer(args=args, - models=models, - hidden_dim=hidden_dim, - load_optimizer_states=True) - - _test_checkpoint_unfused_optimizer(args=args, - models=models, - hidden_dim=hidden_dim, - load_optimizer_states=False) - - -def test_checkpoint_fused_optimizer(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015, - "betas": [0.8, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "fp16": { - "enabled": True - } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - @distributed_test(world_size=[2]) - def _test_checkpoint_fused_optimizer(args, - models, - hidden_dim, - load_optimizer_states): - checkpoint_correctness_verification(args, - models=models, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - load_optimizer_states=load_optimizer_states) - - _test_checkpoint_fused_optimizer(args=args, - models=models, - hidden_dim=hidden_dim, - 
load_optimizer_states=True) - - _test_checkpoint_fused_optimizer(args=args, - models=models, - hidden_dim=hidden_dim, - load_optimizer_states=False) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', - [(1, - False, - 'Adam'), - (2, - False, - 'Adam'), - (2, - True, - 'deepspeed_adam'), - (3, - False, - 'Adam'), - (3, - True, - 'deepspeed_adam')]) -def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_optimizer): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": 'Adam', - "params": { - "lr": 0.00015, - "betas": [0.8, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "wall_clock_breakdown": True, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - @distributed_test(world_size=[2]) - def _test_checkpoint_zero_optimizer(args, - zero_stage, - hidden_dim, - load_optimizer_states): - if zero_stage == 3: - with deepspeed.zero.Init(): - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - else: - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - checkpoint_correctness_verification(args, - models, - hidden_dim, - tmpdir, - load_optimizer_states=load_optimizer_states) - - _test_checkpoint_zero_optimizer(args=args, - zero_stage=zero_stage, - hidden_dim=hidden_dim, - load_optimizer_states=True) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', - [(1, - False, - "Adam"), - (2, - False, - "Adam"), - (2, - True, - 'deepspeed_adam'), - (3, - False, - 'Adam'), - (3, - True, - 'deepspeed_adam')]) -def test_checkpoint_zero_no_optimizer(tmpdir, - zero_stage, - use_cpu_offload, - adam_optimizer): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": 'Adam', - "params": { - "lr": 0.00015, - "betas": [0.8, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - @distributed_test(world_size=[1]) - def _test_checkpoint_zero_no_optimizer(args, - zero_stage, - hidden_dim, - load_optimizer_states): - if zero_stage == 3: - global DeepSpeedZeroOptimizer_Stage3 - from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 - with deepspeed.zero.Init(): - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - else: - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - checkpoint_correctness_verification(args, - models, - hidden_dim, - tmpdir, - load_optimizer_states=load_optimizer_states) - - _test_checkpoint_zero_no_optimizer(args=args, - zero_stage=zero_stage, - hidden_dim=hidden_dim, - load_optimizer_states=False) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', - [(0, - False, - 'Adam'), - (1, - False, - 'Adam'), - (2, - False, - 'Adam'), - (2, - True, - 'deepspeed_adam'), - (3, - False, - 'Adam'), - (3, - True, - 'deepspeed_adam')]) -def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer): - 
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": 'Adam', - "params": { - "lr": 0.00015, - "betas": [0.8, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - }, - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 0.001, - "warmup_num_steps": 1000 - } - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - @distributed_test(world_size=[2]) - def _test_checkpoint_lr_scheduler(args, - zero_stage, - hidden_dim, - load_optimizer_states, - load_lr_scheduler_states): - if zero_stage == 3: - global DeepSpeedZeroOptimizer_Stage3 - from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 - with deepspeed.zero.Init(): - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - else: - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - checkpoint_correctness_verification( - args, - models, - hidden_dim, - tmpdir, - load_optimizer_states=load_optimizer_states, - load_lr_scheduler_states=load_lr_scheduler_states) - - _test_checkpoint_lr_scheduler(args=args, - zero_stage=zero_stage, - hidden_dim=hidden_dim, - load_optimizer_states=False, - load_lr_scheduler_states=True) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', - [(0, - False, - 'Adam'), - (1, - False, - 'Adam'), - (2, - False, - 'Adam'), - (2, - True, - 'deepspeed_adam'), - (3, - False, - 'Adam'), - (3, - True, - 'deepspeed_adam')]) -def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": 'Adam', - "params": { - "lr": 1e-5 - } - }, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - }, - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 0.001, - "warmup_num_steps": 1000 - } - }, - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - @distributed_test(world_size=[2]) - def _test_checkpoint_no_lr_scheduler(args, - zero_stage, - hidden_dim, - load_optimizer_states, - load_lr_scheduler_states): - if zero_stage == 3: - with deepspeed.zero.Init(): - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - else: - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - checkpoint_correctness_verification( - args, - models, - hidden_dim, - tmpdir, - load_optimizer_states=load_optimizer_states, - load_lr_scheduler_states=load_lr_scheduler_states) - - _test_checkpoint_no_lr_scheduler(args=args, - zero_stage=zero_stage, - hidden_dim=hidden_dim, - load_optimizer_states=False, - load_lr_scheduler_states=False) - - -def test_checkpoint_fp32_optimizer(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015, - "betas": [0.8, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "fp16": { - "enabled": False - } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - models = [SimpleModel(hidden_dim, empty_grad=False) 
for _ in range(2)] - - @distributed_test(world_size=[2]) - def _test_checkpoint_fp32_optimizer(args, models, hidden_dim): - checkpoint_correctness_verification(args, - models=models, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - fp16=False) - - _test_checkpoint_fp32_optimizer(args=args, models=models, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize("zero_stage", [0, 1]) -def test_checkpoint_pipe_engine(zero_stage, tmpdir, stages=2): - config_dict = { - "train_batch_size": 2, - "train_micro_batch_size_per_gpu": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-5 - } - }, - "zero_optimization": { - "stage": zero_stage - }, - "fp16": { - "enabled": zero_stage > 0 - }, - "scheduler": { - "type": "OneCycle", - "params": { - "cycle_first_step_size": 1000, - "cycle_first_stair_count": 500, - "cycle_second_step_size": 1000, - "cycle_second_stair_count": 500, - "decay_step_size": 1000, - "cycle_min_lr": 0.0001, - "cycle_max_lr": 0.0010, - "decay_lr_rate": 0.001, - "cycle_min_mom": 0.85, - "cycle_max_mom": 0.99, - "decay_mom_rate": 0.0 - } - } - } - - @distributed_test(world_size=4) - def _test(save_folder, num_stages): - args = args_from_dict(tmpdir, config_dict) - models = [LinearStackPipe(num_stages=num_stages) for _ in range(2)] - checkpoint_correctness_verification(args=args, - models=models, - hidden_dim=models[0].hidden_dim, - tmpdir=save_folder, - fp16=config_dict['fp16']['enabled'], - load_optimizer_states=True, - load_lr_scheduler_states=True, - train_batch=True) - - _test(tmpdir, num_stages=stages) - - -@pytest.mark.parametrize( - "base_topo,test_topo", - [ - #(PipeTopo(num_pp=1, - # num_dp=4), - # PipeTopo(num_pp=4, - # num_dp=1)), - #(PipeTopo(num_pp=2, - # num_dp=2), - # PipeTopo(num_pp=2, - # num_dp=2)), - #(PipeTopo(num_pp=4, - # num_dp=1), - # PipeTopo(num_pp=2, - # num_dp=2)), - ]) -def test_checkpoint_pipe_module(base_topo, test_topo, tmpdir): - @distributed_test(world_size=4) - def _test(base_topo, test_topo, save_folder): - checkpoint_engine = TorchCheckpointEngine() - base_model = LinearStackPipe(topology=base_topo) - base_model.save_state_dict(save_folder, checkpoint_engine=checkpoint_engine) - - dist.barrier() - - test_model = LinearStackPipe(topology=test_topo) - test_model.load_state_dir(save_folder, checkpoint_engine=checkpoint_engine) - - # Base and test can have different lengths, so make sure we map from the - # smaller to larger model - if len(base_model.forward_funcs) < len(test_model.forward_funcs): - A = base_model - B = test_model - else: - A = test_model - B = base_model - - # Compare layers individually since partitions are different - for idx, A_layer in enumerate(A.forward_funcs): - if not hasattr(A_layer, 'parameters'): - # Skip functionals, etc. 
- continue - - # Find the corresponding layer in B - global_idx = idx + A._local_start - B_local_idx = global_idx - B._local_start - B_layer = B.forward_funcs[B_local_idx] - - # Compare layer parameters - for p0, p1 in zip(A_layer.parameters(), B_layer.parameters()): - assert torch.allclose(p0, p1, atol=1e-07), f"Model state {p0} is not equal to {p1}" - - _test(base_topo, test_topo, save_folder=tmpdir) - - -@pytest.mark.parametrize('zero_stage', [1, 2]) -def test_checkpoint_zero_hybrid_optimizer_state(tmpdir, zero_stage): - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": zero_stage - }, - "zero_allow_untested_optimizer": True, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - models = [SimpleModel(hidden_dim=hidden_dim) for _ in range(2)] - optimizers = [HybridStateOptimizer(model.parameters()) for model in models] - - @distributed_test(world_size=[2]) - def _test_checkpoint_zero_hybrid_optimizer_state(args, - models, - optimizers, - hidden_dim): - checkpoint_correctness_verification(args, - models=models, - base_optimizers=optimizers, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - load_optimizer_states=True) - - _test_checkpoint_zero_hybrid_optimizer_state(args=args, - models=models, - optimizers=optimizers, - hidden_dim=hidden_dim) - - -def test_checkpoint_latest(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - } - } - hidden_dim = 10 - args = args_from_dict(tmpdir, config_dict) - models = [SimpleModel(hidden_dim=hidden_dim) for _ in range(2)] - - @distributed_test(world_size=[1]) - def _helper(args, models): - checkpoint_correctness_verification(args, - models=models, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - load_optimizer_states=True, - load_lr_scheduler_states=False, - fp16=False, - empty_tag=True) - - _helper(args, models) - - -def test_checkpoint_missing_latest(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - } - } - hidden_dim = 10 - args = args_from_dict(tmpdir, config_dict) - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _helper(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - # should be no-op, since latest doesn't exist - model.load_checkpoint(tmpdir) - - _helper(args=args, model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('valid_mode', ["FAIL", "WARN", "IGNORE"]) -def test_checkpoint_unique_tag(tmpdir, valid_mode): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "checkpoint": { - "tag_validation": valid_mode - } - } - hidden_dim = 10 - args = args_from_dict(tmpdir, config_dict) - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[2]) - def _helper(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - if valid_mode == "FAIL": - with pytest.raises(AssertionError): - model.save_checkpoint(save_dir=tmpdir, tag=f"tag-{dist.get_rank()}") - else: - model.save_checkpoint(save_dir=tmpdir, tag=f"tag-{dist.get_rank()}") - - _helper(args=args, model=model, hidden_dim=hidden_dim) - - -def 
test_checkpoint_unknown_tag_validation(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "checkpoint": { - "tag_validation": "foo" - } - } - hidden_dim = 10 - args = args_from_dict(tmpdir, config_dict) - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _helper(args, model, hidden_dim): - with pytest.raises(deepspeed.DeepSpeedConfigError): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - _helper(args=args, model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize("ep_size", [4]) -def test_checkpoint_moe(tmpdir, ep_size): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } - hidden_dim = 16 - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[4]) - def _helper(args): - models = [ - SimpleMoEModel(hidden_dim=hidden_dim, - num_experts=ep_size, - ep_size=ep_size) for _ in range(2) - ] - optimizers = [torch.optim.AdamW(params=model.parameters()) for model in models] - checkpoint_correctness_verification(args, - models=models, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - load_optimizer_states=True, - load_lr_scheduler_states=False, - fp16=config_dict["fp16"]["enabled"], - empty_tag=True, - base_optimizers=optimizers, - seq_dataloader=True) - - _helper(args) - - -@pytest.mark.parametrize("ep_size, load_optim_states", - [(4, - True), - (4, - False), - (2, - True), - (2, - False)]) -def test_checkpoint_moe_and_zero(tmpdir, ep_size, load_optim_states): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "optimizer": { - "type": 'Adam', - "params": { - "lr": 0.00015, - "betas": [0.8, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "zero_optimization": { - "stage": 2, - } - } - hidden_dim = 16 - args = args_from_dict(tmpdir, config_dict) - - def create_param_groups(model): - # param group must have a random unique name (for now) - # TODO: clean-up this requirement, the unique name should not be required here - return {'params': [p for p in model.parameters()], 'name': 'random-unique-name'} - - @distributed_test(world_size=[4]) - def _helper(args): - models = [ - SimpleMoEModel(hidden_dim=hidden_dim, - num_experts=ep_size, - ep_size=ep_size) for _ in range(2) - ] - params = [ - split_params_into_different_moe_groups_for_optimizer( - create_param_groups(model)) for model in models - ] - optimizers = [torch.optim.AdamW(params=param) for param in params] - checkpoint_correctness_verification(args, - models=models, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - load_optimizer_states=load_optim_states, - load_lr_scheduler_states=False, - fp16=config_dict["fp16"]["enabled"], - empty_tag=True, - base_optimizers=optimizers, - seq_dataloader=True) - - _helper(args) - - -@pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) -def test_checkpoint_load_module_only(tmpdir, zero_stage): - config_dict = { - "train_batch_size": 2, - "optimizer": { - "type": 'Adam' - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "zero_optimization": { - "stage": zero_stage, - } - } - args = args_from_dict(tmpdir, config_dict) - 
hidden_dim = 10 - - @distributed_test(world_size=[2]) - def _go(args, zero_stage, hidden_dim): - if zero_stage == 3: - with deepspeed.zero.Init(): - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - else: - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - checkpoint_correctness_verification(args, - models, - hidden_dim, - tmpdir, - load_module_only=True) - - _go(args, zero_stage, hidden_dim) - - -@pytest.mark.parametrize(["to_save_model_has_embedding", - "to_save_model_sparse"], - [ - [False, - False], - [True, - False], - [True, - True], - ]) -@pytest.mark.parametrize(["destination_has_embedding", - "destination_sparse"], - [ - [False, - False], - [True, - False], - [True, - True], - ]) -def test_non_strict_load_sparse(tmpdir, - to_save_model_has_embedding, - to_save_model_sparse, - destination_has_embedding, - destination_sparse): - config_dict = {"train_batch_size": 2} - - class ModelNoEmbedding(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(3, 1) - - def forward(self, x): - return self.linear(x) - - class ModelEmbedding(torch.nn.Module): - def __init__(self): - super().__init__() - self.emb = torch.nn.Embedding(10, 3) - self.linear = torch.nn.Linear(3, 1) - - def forward(self, x, offsets): - return self.linear(self.emb(x, offsets)) - - @distributed_test(world_size=[2]) - def _test(model_to_save, model_destination): - engine_to_save, _, _, _ = deepspeed.initialize( - model=model_to_save, config={"train_batch_size": 2, "sparse_gradients": to_save_model_sparse} - ) - engine_destination, _, _, _ = deepspeed.initialize( - model=model_destination, config={"train_batch_size": 2, "sparse_gradients": destination_sparse} - ) - - save_folder = os.path.join(tmpdir, 'saved_checkpoint') - save_tag = '1' - - engine_to_save.save_checkpoint(save_folder, tag=save_tag) - - is_sparse_destination = isinstance(model_destination, - ModelEmbedding) and destination_sparse - if isinstance(model_destination, - ModelEmbedding) and model_destination.emb.sparse: - assert "emb.weight" in engine_destination.sparse_tensor_module_names - engine_destination.load_checkpoint(save_folder, - tag=save_tag, - load_module_strict=False, - load_optimizer_states=False, - load_lr_scheduler_states=False, - load_module_only=False) - if isinstance(model_destination, - ModelEmbedding) and isinstance(model_to_save, - ModelEmbedding): - assert engine_destination.sparse_tensor_module_names == engine_to_save.sparse_tensor_module_names - elif isinstance(model_destination, ModelEmbedding): - assert not is_sparse_destination or "emb.weight" in engine_destination.sparse_tensor_module_names - else: - assert len(engine_destination.sparse_tensor_module_names) == 0 - - if to_save_model_has_embedding: - model_to_save = ModelEmbedding() - else: - model_to_save = ModelNoEmbedding() - if destination_has_embedding: - model_destination = ModelEmbedding() - else: - model_destination = ModelNoEmbedding() - _test(model_to_save, model_destination) - - -@pytest.mark.parametrize(["elastic_save", - "elastic_load", - "load_optim"], - itertools.product(*[[True, - False], - [True, - False], - [True, - False]])) -def test_checkpoint_zero_elastic(tmpdir, elastic_save, elastic_load, load_optim): - ds_config = { - "train_batch_size": 2, - "optimizer": { - "type": 'Adam' - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "zero_optimization": { - "stage": 2, - "elastic_checkpoint": elastic_save - } - } - hidden_dim = 10 - - 
@distributed_test(world_size=[2]) - def _go(): - # torch 1.2.* stores raw tensor id numbers in checkpoint state which leads to - # false positive mismatches in checkpoint state comparisons. - # Newer torch versions store tensor ids as 0, 1, 2, ... - expected_mismatch_keys = [] if required_minimum_torch_version(1, - 4) else ['params'] - models = [SimpleModel(hidden_dim) for _ in range(2)] - model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[0], - model_parameters=models[0].parameters()) - data_loader = random_dataloader(model=model, - total_samples=8, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - if load_optim: - torch.save(model.optimizer.optimizer.state_dict(), - os.path.join(tmpdir, - 'opt-state-dict')) - model.save_checkpoint(tmpdir) - - ds_config["zero_optimization"]["elastic_checkpoint"] = elastic_load - model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[1], - model_parameters=models[1].parameters()) - model.load_checkpoint(tmpdir, load_optimizer_states=load_optim) - - if load_optim: - saved_sd = torch.load(os.path.join(tmpdir, 'opt-state-dict')) - curr_sd = model.optimizer.optimizer.state_dict() - for curr_param_group, saved_param_group in zip(curr_sd['param_groups'], saved_sd['param_groups']): - _compare_state_dicts(curr_param_group, - saved_param_group, - expected_mismatch_keys) - - data_loader = random_dataloader(model=model, - total_samples=8, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _go() - - -@pytest.mark.parametrize(["elastic_save", - "elastic_load", - "load_optim"], - itertools.product(*[[True, - False], - [True, - False], - [True, - False]])) -def test_checkpoint_zero_elastic_dp_change(tmpdir, - elastic_save, - elastic_load, - load_optim): - ds_config = { - "train_batch_size": 4, - "optimizer": { - "type": 'Adam' - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "zero_optimization": { - "stage": 2, - "elastic_checkpoint": elastic_save - } - } - hidden_dim = 10 - models = [SimpleModel(hidden_dim) for _ in range(2)] - - @distributed_test(world_size=[4]) - def _go2(models): - model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[0], - model_parameters=models[0].parameters()) - data_loader = random_dataloader(model=model, - total_samples=8, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - if load_optim: - torch.save(model.optimizer.optimizer.state_dict(), - os.path.join(tmpdir, - 'opt-state-dict')) - model.save_checkpoint(tmpdir) - - _go2(models) - - @distributed_test(world_size=[2]) - def _go1(models): - ds_config["zero_optimization"]["elastic_checkpoint"] = elastic_load - model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[1], - model_parameters=models[1].parameters()) - if load_optim: - with pytest.raises(deepspeed.runtime.zero.utils.ZeRORuntimeException): - model.load_checkpoint(tmpdir, load_optimizer_states=load_optim) - else: - model.load_checkpoint(tmpdir, load_optimizer_states=load_optim) - - _go1(models) - - -@pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) -def test_immediate_save_load(tmpdir, zero_stage): - config_dict = { - "train_batch_size": 4, - "optimizer": { - "type": 'Adam' - }, - "fp16": { - "enabled": 
True, - "initial_scale_power": 8 - }, - "zero_optimization": { - "stage": zero_stage, - } - } - hidden_dim = 10 - model = SimpleModel(hidden_dim) - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[1]) - def _test_immediate_save_load(args, model, tmpdir): - - ds_model = create_deepspeed_model(args=args, model=model, base_optimizer=None) - ds_model.save_checkpoint(tmpdir) - ds_model.load_checkpoint(tmpdir, - load_optimizer_states=False, - load_lr_scheduler_states=False, - load_module_only=False) - - _test_immediate_save_load(args, model, tmpdir) - - -@pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) -def test_load_immediate_save(tmpdir, zero_stage): - config_dict = { - "train_batch_size": 4, - "optimizer": { - "type": 'Adam' - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "zero_optimization": { - "stage": zero_stage, - } - } - hidden_dim = 10 - model = SimpleModel(hidden_dim) - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[1]) - def _test_load_immediate_save(args, model, tmpdir): - - # 1. pretrain a model and save it - dtype = torch.half - ds_model = create_deepspeed_model(args=args, model=model, base_optimizer=None) - data_loader = random_dataloader(model=ds_model, - total_samples=1, - hidden_dim=hidden_dim, - device=ds_model.device, - dtype=dtype) - for n, batch in enumerate(data_loader): - loss = ds_model(batch[0], batch[1]) - ds_model.backward(loss) - ds_model.step() - ds_model.save_checkpoint(tmpdir) - - # 2. load and immediately save a model with a fresh ds engine - ds_model = create_deepspeed_model(args=args, model=model, base_optimizer=None) - ds_model.load_checkpoint(tmpdir, - load_optimizer_states=False, - load_lr_scheduler_states=False, - load_module_only=False) - ds_model.save_checkpoint(tmpdir) - - _test_load_immediate_save(args, model, tmpdir) - - -@pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) -def test_save_before_accum_grad_is_done(tmpdir, zero_stage): - config_dict = { - "optimizer": { - "type": 'Adam' - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "zero_optimization": { - "stage": zero_stage, - "stage3_gather_fp16_weights_on_model_save": True, - }, - "gradient_accumulation_steps": 2, - "train_micro_batch_size_per_gpu": 1, - "train_batch_size": 2, - } - hidden_dim = 10 - model = SimpleModel(hidden_dim) - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[1]) - def _test_save_before_accum_grad_is_done(args, model, tmpdir): - - # This test reproduces a bug where one tries to retrieve a 16bit model before grad_accum - # cycle was completed. 
- # So we config grad_accum=2 and step only once and save_16bit_model - ds_model = create_deepspeed_model(args=args, model=model, base_optimizer=None) - - data_loader = random_dataloader(model=ds_model, - total_samples=2, - hidden_dim=hidden_dim, - device=ds_model.device, - dtype=torch.half) - - batch = next(iter(data_loader)) - loss = ds_model(batch[0], batch[1]) - ds_model.backward(loss) - ds_model.step() - - # we stepped only once, and now save 16bit model before gradient_accumulation_steps=2 is complete - ds_model.save_16bit_model(tmpdir, "model.pt") - - # let's test just as well that we can save the checkpoint too - ds_model.save_checkpoint(tmpdir) - - _test_save_before_accum_grad_is_done(args, model, tmpdir) From 5baa6dbf6a82cd407d441d27b1587718ea5bb08b Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 23 Aug 2022 16:17:40 -0700 Subject: [PATCH 11/25] fix pytest syntax error --- tests/unit/test_zero.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_zero.py b/tests/unit/test_zero.py index d4df3e925a29..b6076236f2af 100755 --- a/tests/unit/test_zero.py +++ b/tests/unit/test_zero.py @@ -949,7 +949,7 @@ def __init_weights(self, module): 1)) -@pytest.skip("not working") +@pytest.mark.skip("not working") @pytest.mark.parametrize("param_persistence_threshold", [0, 10]) @pytest.mark.parametrize("contiguous_gradients", [True, False]) @pytest.mark.parametrize("offload_optimizer", [True, False]) From 70e18715411aa8ac5db9b3e4068ff5adc242c07b Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 23 Aug 2022 17:39:53 -0700 Subject: [PATCH 12/25] fix another syntax error --- tests/unit/test_aio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_aio.py b/tests/unit/test_aio.py index 247c6a201537..6dd626cd9224 100755 --- a/tests/unit/test_aio.py +++ b/tests/unit/test_aio.py @@ -126,7 +126,7 @@ def test_parallel_write(self, tmpdir, single_submit, overlap_events): assert filecmp.cmp(ref_file, aio_file, shallow=False) @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_write(tmpdir, single_submit, overlap_events, cuda_device): + def test_async_write(self, tmpdir, single_submit, overlap_events, cuda_device): ref_file, ref_buffer = _do_ref_write(tmpdir) aio_file, aio_buffer = _get_test_file_and_buffer(tmpdir, ref_buffer, cuda_device) From 45f7ebb34cead5d9bd2f4f12b4be503214d6e378 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 9 Sep 2022 09:35:17 -0700 Subject: [PATCH 13/25] update imports --- tests/unit/comm/test_dist.py | 2 +- tests/unit/test_activation_checkpointing.py | 2 +- tests/unit/test_aio.py | 2 +- tests/unit/test_averaging.py | 2 +- tests/unit/test_coalesced_collectives.py | 2 +- tests/unit/test_config.py | 4 ++-- tests/unit/test_configurable_parallel_mp.py | 4 ++-- tests/unit/test_configurable_parallel_pp.py | 6 +++--- tests/unit/test_curriculum_learning.py | 4 ++-- tests/unit/test_ds_initialize.py | 6 +++--- tests/unit/test_dynamic_loss_scale.py | 4 ++-- tests/unit/test_fp16.py | 6 +++--- tests/unit/test_ignore_unused_parameters.py | 4 ++-- tests/unit/test_lr_schedulers.py | 4 ++-- tests/unit/test_moe.py | 6 +++--- tests/unit/test_moe_tp.py | 4 ++-- tests/unit/test_multi_output_model.py | 4 ++-- tests/unit/test_partition.py | 2 +- tests/unit/test_zero.py | 4 ++-- tests/unit/test_zero_context.py | 2 +- 20 files changed, 37 insertions(+), 37 deletions(-) diff --git a/tests/unit/comm/test_dist.py b/tests/unit/comm/test_dist.py index b5ad7a7f72b4..2a2abeba680e 100644 --- 
a/tests/unit/comm/test_dist.py +++ b/tests/unit/comm/test_dist.py @@ -3,7 +3,7 @@ import deepspeed.comm as dist import deepspeed -from unit.common import DistributedTest, get_master_port +from unit.common import DistributedTest, DistributedFixture, get_master_port from unit.simple_model import SimpleModel import pytest diff --git a/tests/unit/test_activation_checkpointing.py b/tests/unit/test_activation_checkpointing.py index 1b27ff74257e..375ef30de0f6 100644 --- a/tests/unit/test_activation_checkpointing.py +++ b/tests/unit/test_activation_checkpointing.py @@ -4,7 +4,7 @@ import torch import deepspeed from copy import deepcopy -from tests.unit.common import DistributedTest +from unit.common import DistributedTest ckpt = deepspeed.checkpointing.checkpoint diff --git a/tests/unit/test_aio.py b/tests/unit/test_aio.py index 6dd626cd9224..ca1f1b923743 100755 --- a/tests/unit/test_aio.py +++ b/tests/unit/test_aio.py @@ -5,7 +5,7 @@ import deepspeed import deepspeed.comm as dist from deepspeed.ops.aio import AsyncIOBuilder -from tests.unit.common import DistributedTest +from unit.common import DistributedTest MEGA_BYTE = 1024**2 BLOCK_SIZE = MEGA_BYTE diff --git a/tests/unit/test_averaging.py b/tests/unit/test_averaging.py index e178554c1aa9..b137e2fbb529 100644 --- a/tests/unit/test_averaging.py +++ b/tests/unit/test_averaging.py @@ -1,6 +1,6 @@ import torch import deepspeed -from tests.unit.common import DistributedTest +from unit.common import DistributedTest class Model(torch.nn.Module): diff --git a/tests/unit/test_coalesced_collectives.py b/tests/unit/test_coalesced_collectives.py index 3c504dd58ac3..92a081fb309b 100644 --- a/tests/unit/test_coalesced_collectives.py +++ b/tests/unit/test_coalesced_collectives.py @@ -4,7 +4,7 @@ import deepspeed.comm as dist from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced -from tests.unit.common import DistributedTest +from unit.common import DistributedTest class TestReduceScatterCoalesced(DistributedTest): diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 9d5cc3460ea7..dafe5ab674e5 100755 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -6,8 +6,8 @@ from deepspeed.runtime.zero.config import DeepSpeedZeroConfig -from tests.unit.common import DistributedTest, get_test_path -from tests.unit.simple_model import SimpleModel, create_config_from_dict, random_dataloader +from unit.common import DistributedTest, get_test_path +from unit.simple_model import SimpleModel, create_config_from_dict, random_dataloader import deepspeed.comm as dist # A test on its own diff --git a/tests/unit/test_configurable_parallel_mp.py b/tests/unit/test_configurable_parallel_mp.py index b0a43233969b..dda4c22bcdae 100755 --- a/tests/unit/test_configurable_parallel_mp.py +++ b/tests/unit/test_configurable_parallel_mp.py @@ -5,8 +5,8 @@ import random import numpy as np import deepspeed.comm as dist -from tests.unit.common import DistributedTest, DistributedFixture -from tests.unit.megatron_model import get_gpt2_model, get_megatron_version +from unit.common import DistributedTest, DistributedFixture +from unit.megatron_model import get_gpt2_model, get_megatron_version TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) diff --git a/tests/unit/test_configurable_parallel_pp.py b/tests/unit/test_configurable_parallel_pp.py index e862ff33167a..164e8cea5363 100755 --- a/tests/unit/test_configurable_parallel_pp.py +++ b/tests/unit/test_configurable_parallel_pp.py @@ -5,9 
+5,9 @@ import random import numpy as np import deepspeed.comm as dist -from tests.unit.common import DistributedTest, DistributedFixture -from tests.unit.megatron_model import get_megatron_version -from tests.unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe +from unit.common import DistributedTest, DistributedFixture +from unit.megatron_model import get_megatron_version +from unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe from deepspeed.utils import RepeatingLoader TORCH_MAJOR = int(torch.__version__.split('.')[0]) diff --git a/tests/unit/test_curriculum_learning.py b/tests/unit/test_curriculum_learning.py index d46753b50ea7..641dedd8140a 100644 --- a/tests/unit/test_curriculum_learning.py +++ b/tests/unit/test_curriculum_learning.py @@ -1,6 +1,6 @@ import deepspeed -from tests.unit.common import DistributedTest -from tests.unit.simple_model import Curriculum_SimpleModel, random_dataloader +from unit.common import DistributedTest +from unit.simple_model import Curriculum_SimpleModel, random_dataloader class TestCurriculumScheduler(DistributedTest): diff --git a/tests/unit/test_ds_initialize.py b/tests/unit/test_ds_initialize.py index 1dcb2d623932..0ecb2d90a57f 100644 --- a/tests/unit/test_ds_initialize.py +++ b/tests/unit/test_ds_initialize.py @@ -4,9 +4,9 @@ from torch.optim import Optimizer, Adam, AdamW from torch.optim.lr_scheduler import _LRScheduler, LambdaLR -from tests.unit.simple_model import SimpleModel, random_dataloader -from tests.unit.common import DistributedTest -from tests.unit.util import required_torch_version +from unit.simple_model import SimpleModel, random_dataloader +from unit.common import DistributedTest +from unit.util import required_torch_version import deepspeed from deepspeed.ops.adam import FusedAdam diff --git a/tests/unit/test_dynamic_loss_scale.py b/tests/unit/test_dynamic_loss_scale.py index 4682e2bd749d..7deb3661f428 100755 --- a/tests/unit/test_dynamic_loss_scale.py +++ b/tests/unit/test_dynamic_loss_scale.py @@ -1,8 +1,8 @@ import torch import deepspeed import numpy as np -from tests.unit.common import DistributedTest -from tests.unit.simple_model import SimpleModel +from unit.common import DistributedTest +from unit.simple_model import SimpleModel def run_model_step(model, gradient_list): diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py index d5b8268844ed..a6d1b12c0349 100755 --- a/tests/unit/test_fp16.py +++ b/tests/unit/test_fp16.py @@ -3,10 +3,10 @@ import deepspeed import pytest from deepspeed.ops.adam import FusedAdam -from tests.unit.common import DistributedTest +from unit.common import DistributedTest from deepspeed.ops.op_builder import CPUAdamBuilder -from tests.unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader, SimpleMoEModel, sequence_dataloader -from tests.unit.util import required_torch_version +from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader, SimpleMoEModel, sequence_dataloader +from unit.util import required_torch_version try: from apex import amp # noqa: F401 diff --git a/tests/unit/test_ignore_unused_parameters.py b/tests/unit/test_ignore_unused_parameters.py index a10d0c614ffb..329a221bb826 100644 --- a/tests/unit/test_ignore_unused_parameters.py +++ b/tests/unit/test_ignore_unused_parameters.py @@ -1,6 +1,6 @@ import pytest -from tests.unit.common import DistributedTest -from tests.unit.simple_model import UnusedParametersModel, random_dataloader +from unit.common import DistributedTest +from unit.simple_model import 
UnusedParametersModel, random_dataloader from deepspeed.ops.op_builder import CPUAdamBuilder import deepspeed diff --git a/tests/unit/test_lr_schedulers.py b/tests/unit/test_lr_schedulers.py index 4f0c940a74bc..e17751e9e547 100755 --- a/tests/unit/test_lr_schedulers.py +++ b/tests/unit/test_lr_schedulers.py @@ -1,8 +1,8 @@ import torch import deepspeed import pytest -from tests.unit.common import DistributedTest -from tests.unit.simple_model import SimpleModel, random_dataloader +from unit.common import DistributedTest +from unit.simple_model import SimpleModel, random_dataloader from deepspeed.runtime.lr_schedules import LR_RANGE_TEST, LR_RANGE_TEST_MIN_LR, LR_RANGE_TEST_STEP_RATE, LR_RANGE_TEST_STEP_SIZE, LR_RANGE_TEST_STAIRCASE from deepspeed.runtime.lr_schedules import WARMUP_LR, WARMUP_MIN_LR, WARMUP_MAX_LR, WARMUP_NUM_STEPS, WARMUP_TYPE, WARMUP_LOG_RATE, WARMUP_LINEAR_RATE from deepspeed.runtime.lr_schedules import ONE_CYCLE, CYCLE_MIN_LR, CYCLE_MAX_LR, CYCLE_FIRST_STEP_SIZE, DECAY_LR_RATE, DECAY_STEP_SIZE diff --git a/tests/unit/test_moe.py b/tests/unit/test_moe.py index 6d9e88bc21d8..96b33657605d 100644 --- a/tests/unit/test_moe.py +++ b/tests/unit/test_moe.py @@ -1,9 +1,9 @@ import torch import deepspeed import pytest -from tests.unit.common import DistributedTest -from tests.unit.simple_model import SimplePRMoEModel, SimpleMoEModel, sequence_dataloader -from tests.unit.util import required_torch_version +from unit.common import DistributedTest +from unit.simple_model import SimplePRMoEModel, SimpleMoEModel, sequence_dataloader +from unit.util import required_torch_version @pytest.mark.parametrize("ep_size", [2, 4]) diff --git a/tests/unit/test_moe_tp.py b/tests/unit/test_moe_tp.py index f8586d4daaea..6956da228970 100644 --- a/tests/unit/test_moe_tp.py +++ b/tests/unit/test_moe_tp.py @@ -1,8 +1,8 @@ import torch import deepspeed import pytest -from tests.unit.common import DistributedTest -from tests.unit.util import required_torch_version +from unit.common import DistributedTest +from unit.util import required_torch_version from deepspeed.moe.layer import MoE diff --git a/tests/unit/test_multi_output_model.py b/tests/unit/test_multi_output_model.py index c8add428c0a4..65acf726f1c4 100755 --- a/tests/unit/test_multi_output_model.py +++ b/tests/unit/test_multi_output_model.py @@ -1,8 +1,8 @@ import torch import deepspeed from pytest import approx -from tests.unit.common import DistributedTest -from tests.unit.multi_output_model import MultiOutputModel, multi_output_dataloader +from unit.common import DistributedTest +from unit.multi_output_model import MultiOutputModel, multi_output_dataloader def create_config_dict(micro_batch_size, grad_accumulation_steps, world_size): diff --git a/tests/unit/test_partition.py b/tests/unit/test_partition.py index df9b8d619a4f..e5e6ed14c586 100644 --- a/tests/unit/test_partition.py +++ b/tests/unit/test_partition.py @@ -8,7 +8,7 @@ from deepspeed.runtime.utils import prefix_sum_inc from deepspeed.runtime.utils import PartitionedTensor -from tests.unit.common import DistributedTest +from unit.common import DistributedTest class TestPartitionedTensor(DistributedTest): diff --git a/tests/unit/test_zero.py b/tests/unit/test_zero.py index b6076236f2af..f9715bd1dc8e 100755 --- a/tests/unit/test_zero.py +++ b/tests/unit/test_zero.py @@ -9,8 +9,8 @@ from torch.nn.modules.loss import L1Loss from torch.nn.parameter import Parameter -from tests.unit.common import DistributedTest -from tests.unit.simple_model import SimpleModel, random_dataloader +from 
unit.common import DistributedTest +from unit.simple_model import SimpleModel, random_dataloader import deepspeed from deepspeed.runtime.engine import DeepSpeedEngine diff --git a/tests/unit/test_zero_context.py b/tests/unit/test_zero_context.py index 5db48e3e0df9..d45beb3a618f 100644 --- a/tests/unit/test_zero_context.py +++ b/tests/unit/test_zero_context.py @@ -8,7 +8,7 @@ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus, partitioned_param_data_shape import deepspeed.comm as dist -from tests.unit.common import DistributedTest, get_master_port +from unit.common import DistributedTest, get_master_port def setup_serial_env(): From c0a3955aa223df08d28348e0864a065d231d3d53 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 9 Sep 2022 11:39:34 -0700 Subject: [PATCH 14/25] use DistFixture with elastic checkpoint test --- tests/unit/checkpoint/test_zero_optimizer.py | 97 ++++++++++---------- 1 file changed, 49 insertions(+), 48 deletions(-) diff --git a/tests/unit/checkpoint/test_zero_optimizer.py b/tests/unit/checkpoint/test_zero_optimizer.py index e8c8c12a5495..06e2c3d794e8 100644 --- a/tests/unit/checkpoint/test_zero_optimizer.py +++ b/tests/unit/checkpoint/test_zero_optimizer.py @@ -7,7 +7,6 @@ from unit.checkpoint.common import * -import itertools import pytest @@ -192,18 +191,52 @@ def test_load_module_only(self, tmpdir, zero_stage): load_module_only=True) +class ws4_model_checkpoint(DistributedFixture): + world_size = 4 + + def run(self, class_tmpdir, elastic_save): + ds_config = { + "train_batch_size": 4, + "optimizer": { + "type": 'Adam' + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "zero_optimization": { + "stage": 2, + "elastic_checkpoint": elastic_save + } + } + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + model, _, _, _ = deepspeed.initialize(config=ds_config, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=8, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + if load_optim: + torch.save(model.optimizer.optimizer.state_dict(), + os.path.join(class_tmpdir, + 'opt-state-dict')) + model.save_checkpoint(class_tmpdir) + + +@pytest.mark.parametrize("elastic_save", [True, False]) +@pytest.mark.parametrize("elastic_load", [True, False]) +@pytest.mark.parametrize("load_optim", [True, False]) class TestZeROElasticCheckpoint(DistributedTest): world_size = 2 - @pytest.mark.parametrize(["elastic_save", - "elastic_load", - "load_optim"], - itertools.product(*[[True, - False], - [True, - False], - [True, - False]])) def test_elastic_checkpoint_fixed_dp(self, tmpdir, elastic_save, @@ -271,22 +304,12 @@ def test_elastic_checkpoint_fixed_dp(self, model.backward(loss) model.step() - @pytest.mark.parametrize(["elastic_save", - "elastic_load", - "load_optim"], - itertools.product(*[[True, - False], - [True, - False], - [True, - False]])) def test_elastic_checkpoint_change_dp(self, - tmpdir, + ws4_model_checkpoint, + class_tmpdir, elastic_save, elastic_load, load_optim): - pytest.skip( - 'skip until DistributedTest can support changing world size within a test') ds_config = { "train_batch_size": 4, "optimizer": { @@ -298,38 +321,16 @@ def test_elastic_checkpoint_change_dp(self, }, "zero_optimization": { "stage": 2, - "elastic_checkpoint": elastic_save + "elastic_checkpoint": elastic_load } } hidden_dim = 10 - models = [SimpleModel(hidden_dim) for _ in range(2)] - 
- # Save checkpoint with dp world size = 4 - #TODO - remove this line @distributed_test(world_size=[4]) - model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[0], - model_parameters=models[0].parameters()) - data_loader = random_dataloader(model=model, - total_samples=8, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - if load_optim: - torch.save(model.optimizer.optimizer.state_dict(), - os.path.join(tmpdir, - 'opt-state-dict')) - model.save_checkpoint(tmpdir) + model = SimpleModel(hidden_dim) # Load checkpoint with dp world size = 2 - #TODO - remove this line @distributed_test(world_size=[2]) - ds_config["zero_optimization"]["elastic_checkpoint"] = elastic_load model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[1], - model_parameters=models[1].parameters()) + model=model, + model_parameters=model.parameters()) if load_optim: with pytest.raises(deepspeed.runtime.zero.utils.ZeRORuntimeException): model.load_checkpoint(tmpdir, load_optimizer_states=load_optim) From 1ae399331bde45622a87e53be6c7ca560fa08063 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 9 Sep 2022 12:31:01 -0700 Subject: [PATCH 15/25] missing import --- tests/unit/checkpoint/test_zero_optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/checkpoint/test_zero_optimizer.py b/tests/unit/checkpoint/test_zero_optimizer.py index 06e2c3d794e8..67d9f90dbebb 100644 --- a/tests/unit/checkpoint/test_zero_optimizer.py +++ b/tests/unit/checkpoint/test_zero_optimizer.py @@ -1,7 +1,7 @@ import deepspeed from deepspeed.ops.op_builder import CPUAdamBuilder -from unit.common import DistributedTest +from unit.common import DistributedTest, DistributedFixture from unit.simple_model import * from unit.util import required_minimum_torch_version From 59b557bdd52ec75fb1a3179a3b39016db6271214 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 9 Sep 2022 13:19:27 -0700 Subject: [PATCH 16/25] update to shared class tmpdir for elastic test --- tests/unit/checkpoint/test_zero_optimizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/checkpoint/test_zero_optimizer.py b/tests/unit/checkpoint/test_zero_optimizer.py index 67d9f90dbebb..73bde2fda940 100644 --- a/tests/unit/checkpoint/test_zero_optimizer.py +++ b/tests/unit/checkpoint/test_zero_optimizer.py @@ -194,7 +194,7 @@ def test_load_module_only(self, tmpdir, zero_stage): class ws4_model_checkpoint(DistributedFixture): world_size = 4 - def run(self, class_tmpdir, elastic_save): + def run(self, class_tmpdir, elastic_save, load_optim): ds_config = { "train_batch_size": 4, "optimizer": { @@ -333,9 +333,9 @@ def test_elastic_checkpoint_change_dp(self, model_parameters=model.parameters()) if load_optim: with pytest.raises(deepspeed.runtime.zero.utils.ZeRORuntimeException): - model.load_checkpoint(tmpdir, load_optimizer_states=load_optim) + model.load_checkpoint(class_tmpdir, load_optimizer_states=load_optim) else: - model.load_checkpoint(tmpdir, load_optimizer_states=load_optim) + model.load_checkpoint(class_tmpdir, load_optimizer_states=load_optim) class TestZeROSaveLoadEdgeCase(DistributedTest): From eab78bcf5822cb78a1631d2d124d53dce4778015 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 9 Sep 2022 14:57:19 -0700 Subject: [PATCH 17/25] moved test files --- tests/unit/alexnet_model.py | 0 tests/unit/ds_batch_config.json | 0 tests/unit/gpt2-merges.txt | 0 
tests/unit/gpt2-vocab.json | 0 tests/unit/inference/test_inference.py | 12 ------------ tests/unit/modeling.py | 0 tests/unit/modelingpreln.py | 0 tests/unit/{ => moe}/test_moe.py | 0 tests/unit/{ => moe}/test_moe_tp.py | 0 tests/unit/multi_output_model.py | 0 .../test_activation_checkpointing.py | 0 .../{ => runtime/comm}/test_coalesced_collectives.py | 0 tests/unit/{ => runtime/fp16}/test_fp16.py | 0 tests/unit/{ => runtime}/test_lr_schedulers.py | 0 tests/unit/{ => runtime/utils}/test_partition.py | 0 tests/unit/{ => runtime/zero}/test_zero.py | 0 tests/unit/{ => runtime/zero}/test_zero_context.py | 0 tests/unit/simple_model.py | 0 tests/unit/test_aio.py | 0 tests/unit/test_config.py | 0 tests/unit/test_configurable_parallel_mp.py | 0 tests/unit/test_configurable_parallel_pp.py | 0 tests/unit/test_cuda_backward.py | 0 tests/unit/test_cuda_forward.py | 0 tests/unit/test_ds_arguments.py | 0 tests/unit/test_dynamic_loss_scale.py | 0 tests/unit/test_multi_output_model.py | 0 27 files changed, 12 deletions(-) mode change 100755 => 100644 tests/unit/alexnet_model.py mode change 100755 => 100644 tests/unit/ds_batch_config.json mode change 100755 => 100644 tests/unit/gpt2-merges.txt mode change 100755 => 100644 tests/unit/gpt2-vocab.json mode change 100755 => 100644 tests/unit/modeling.py mode change 100755 => 100644 tests/unit/modelingpreln.py rename tests/unit/{ => moe}/test_moe.py (100%) rename tests/unit/{ => moe}/test_moe_tp.py (100%) mode change 100755 => 100644 tests/unit/multi_output_model.py rename tests/unit/{ => runtime/activation_checkpointing}/test_activation_checkpointing.py (100%) rename tests/unit/{ => runtime/comm}/test_coalesced_collectives.py (100%) rename tests/unit/{ => runtime/fp16}/test_fp16.py (100%) mode change 100755 => 100644 rename tests/unit/{ => runtime}/test_lr_schedulers.py (100%) mode change 100755 => 100644 rename tests/unit/{ => runtime/utils}/test_partition.py (100%) rename tests/unit/{ => runtime/zero}/test_zero.py (100%) mode change 100755 => 100644 rename tests/unit/{ => runtime/zero}/test_zero_context.py (100%) mode change 100755 => 100644 tests/unit/simple_model.py mode change 100755 => 100644 tests/unit/test_aio.py mode change 100755 => 100644 tests/unit/test_config.py mode change 100755 => 100644 tests/unit/test_configurable_parallel_mp.py mode change 100755 => 100644 tests/unit/test_configurable_parallel_pp.py mode change 100755 => 100644 tests/unit/test_cuda_backward.py mode change 100755 => 100644 tests/unit/test_cuda_forward.py mode change 100755 => 100644 tests/unit/test_ds_arguments.py mode change 100755 => 100644 tests/unit/test_dynamic_loss_scale.py mode change 100755 => 100644 tests/unit/test_multi_output_model.py diff --git a/tests/unit/alexnet_model.py b/tests/unit/alexnet_model.py old mode 100755 new mode 100644 diff --git a/tests/unit/ds_batch_config.json b/tests/unit/ds_batch_config.json old mode 100755 new mode 100644 diff --git a/tests/unit/gpt2-merges.txt b/tests/unit/gpt2-merges.txt old mode 100755 new mode 100644 diff --git a/tests/unit/gpt2-vocab.json b/tests/unit/gpt2-vocab.json old mode 100755 new mode 100644 diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index e0ecd63e3528..17a43de59cb3 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -11,18 +11,6 @@ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer from huggingface_hub import HfApi - -# Fixture avoids problems with missing imports when pytest collects tests when 
-# running non-inference tests -@pytest.fixture(scope="module", autouse=True) -def lm_eval_imports(): - global lm_eval - import lm_eval - import lm_eval.models - import lm_eval.tasks - import lm_eval.evaluator - - rocm_version = OpBuilder.installed_rocm_version() if rocm_version != (0, 0): pytest.skip("skip inference tests on rocm for now", allow_module_level=True) diff --git a/tests/unit/modeling.py b/tests/unit/modeling.py old mode 100755 new mode 100644 diff --git a/tests/unit/modelingpreln.py b/tests/unit/modelingpreln.py old mode 100755 new mode 100644 diff --git a/tests/unit/test_moe.py b/tests/unit/moe/test_moe.py similarity index 100% rename from tests/unit/test_moe.py rename to tests/unit/moe/test_moe.py diff --git a/tests/unit/test_moe_tp.py b/tests/unit/moe/test_moe_tp.py similarity index 100% rename from tests/unit/test_moe_tp.py rename to tests/unit/moe/test_moe_tp.py diff --git a/tests/unit/multi_output_model.py b/tests/unit/multi_output_model.py old mode 100755 new mode 100644 diff --git a/tests/unit/test_activation_checkpointing.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py similarity index 100% rename from tests/unit/test_activation_checkpointing.py rename to tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py diff --git a/tests/unit/test_coalesced_collectives.py b/tests/unit/runtime/comm/test_coalesced_collectives.py similarity index 100% rename from tests/unit/test_coalesced_collectives.py rename to tests/unit/runtime/comm/test_coalesced_collectives.py diff --git a/tests/unit/test_fp16.py b/tests/unit/runtime/fp16/test_fp16.py old mode 100755 new mode 100644 similarity index 100% rename from tests/unit/test_fp16.py rename to tests/unit/runtime/fp16/test_fp16.py diff --git a/tests/unit/test_lr_schedulers.py b/tests/unit/runtime/test_lr_schedulers.py old mode 100755 new mode 100644 similarity index 100% rename from tests/unit/test_lr_schedulers.py rename to tests/unit/runtime/test_lr_schedulers.py diff --git a/tests/unit/test_partition.py b/tests/unit/runtime/utils/test_partition.py similarity index 100% rename from tests/unit/test_partition.py rename to tests/unit/runtime/utils/test_partition.py diff --git a/tests/unit/test_zero.py b/tests/unit/runtime/zero/test_zero.py old mode 100755 new mode 100644 similarity index 100% rename from tests/unit/test_zero.py rename to tests/unit/runtime/zero/test_zero.py diff --git a/tests/unit/test_zero_context.py b/tests/unit/runtime/zero/test_zero_context.py similarity index 100% rename from tests/unit/test_zero_context.py rename to tests/unit/runtime/zero/test_zero_context.py diff --git a/tests/unit/simple_model.py b/tests/unit/simple_model.py old mode 100755 new mode 100644 diff --git a/tests/unit/test_aio.py b/tests/unit/test_aio.py old mode 100755 new mode 100644 diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py old mode 100755 new mode 100644 diff --git a/tests/unit/test_configurable_parallel_mp.py b/tests/unit/test_configurable_parallel_mp.py old mode 100755 new mode 100644 diff --git a/tests/unit/test_configurable_parallel_pp.py b/tests/unit/test_configurable_parallel_pp.py old mode 100755 new mode 100644 diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py old mode 100755 new mode 100644 diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py old mode 100755 new mode 100644 diff --git a/tests/unit/test_ds_arguments.py b/tests/unit/test_ds_arguments.py old mode 100755 new mode 100644 diff --git 
a/tests/unit/test_dynamic_loss_scale.py b/tests/unit/test_dynamic_loss_scale.py old mode 100755 new mode 100644 diff --git a/tests/unit/test_multi_output_model.py b/tests/unit/test_multi_output_model.py old mode 100755 new mode 100644 From c8c10e60b351139c5f8cc96eecfd34028f4c8368 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 9 Sep 2022 16:39:00 -0700 Subject: [PATCH 18/25] avoid duplicate test file name --- tests/unit/checkpoint/{test_moe.py => test_moe_checkpoint.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/unit/checkpoint/{test_moe.py => test_moe_checkpoint.py} (100%) diff --git a/tests/unit/checkpoint/test_moe.py b/tests/unit/checkpoint/test_moe_checkpoint.py similarity index 100% rename from tests/unit/checkpoint/test_moe.py rename to tests/unit/checkpoint/test_moe_checkpoint.py From 4441c4f10fa66787f1abef43458e6caaf6df4a51 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Wed, 28 Sep 2022 12:10:41 -0700 Subject: [PATCH 19/25] last refactor and moving test files --- .../unit/{ => launcher}/test_ds_arguments.py | 0 .../test_configurable_parallel_mp.py | 0 .../test_configurable_parallel_pp.py | 0 tests/unit/multi_output_model.py | 187 ++++++++++++++---- tests/unit/{ => ops/aio}/test_aio.py | 0 .../unit/{ => ops/cuda}/test_cuda_backward.py | 0 .../unit/{ => ops/cuda}/test_cuda_forward.py | 0 .../onebit/test_onebit.py | 0 .../runtime/{ => half_precision}/test_bf16.py | 0 .../test_dynamic_loss_scale.py | 0 .../{fp16 => half_precision}/test_fp16.py | 0 .../test_averaging_sparse_gradients.py} | 0 .../sparse_tensor}/test_sparse_grads.py | 0 .../{ => runtime}/test_curriculum_learning.py | 0 .../test_ds_config_dict.py} | 0 ...t_ds_config.py => test_ds_config_model.py} | 0 .../unit/{ => runtime}/test_ds_initialize.py | 0 .../{ => runtime}/test_multi_output_model.py | 0 .../zero}/test_ignore_unused_parameters.py | 0 19 files changed, 149 insertions(+), 38 deletions(-) rename tests/unit/{ => launcher}/test_ds_arguments.py (100%) rename tests/unit/{ => model_parallelism}/test_configurable_parallel_mp.py (100%) rename tests/unit/{ => model_parallelism}/test_configurable_parallel_pp.py (100%) rename tests/unit/{ => ops/aio}/test_aio.py (100%) rename tests/unit/{ => ops/cuda}/test_cuda_backward.py (100%) rename tests/unit/{ => ops/cuda}/test_cuda_forward.py (100%) rename tests/unit/runtime/{fp16 => half_precision}/onebit/test_onebit.py (100%) rename tests/unit/runtime/{ => half_precision}/test_bf16.py (100%) rename tests/unit/{ => runtime/half_precision}/test_dynamic_loss_scale.py (100%) rename tests/unit/runtime/{fp16 => half_precision}/test_fp16.py (100%) rename tests/unit/{test_averaging.py => runtime/sparse_tensor/test_averaging_sparse_gradients.py} (100%) rename tests/unit/{ => runtime/sparse_tensor}/test_sparse_grads.py (100%) rename tests/unit/{ => runtime}/test_curriculum_learning.py (100%) rename tests/unit/{test_config.py => runtime/test_ds_config_dict.py} (100%) rename tests/unit/runtime/{test_ds_config.py => test_ds_config_model.py} (100%) rename tests/unit/{ => runtime}/test_ds_initialize.py (100%) rename tests/unit/{ => runtime}/test_multi_output_model.py (100%) rename tests/unit/{ => runtime/zero}/test_ignore_unused_parameters.py (100%) diff --git a/tests/unit/test_ds_arguments.py b/tests/unit/launcher/test_ds_arguments.py similarity index 100% rename from tests/unit/test_ds_arguments.py rename to tests/unit/launcher/test_ds_arguments.py diff --git a/tests/unit/test_configurable_parallel_mp.py 
b/tests/unit/model_parallelism/test_configurable_parallel_mp.py similarity index 100% rename from tests/unit/test_configurable_parallel_mp.py rename to tests/unit/model_parallelism/test_configurable_parallel_mp.py diff --git a/tests/unit/test_configurable_parallel_pp.py b/tests/unit/model_parallelism/test_configurable_parallel_pp.py similarity index 100% rename from tests/unit/test_configurable_parallel_pp.py rename to tests/unit/model_parallelism/test_configurable_parallel_pp.py diff --git a/tests/unit/multi_output_model.py b/tests/unit/multi_output_model.py index 240c1a4b7aa6..7d7d4e2dc718 100644 --- a/tests/unit/multi_output_model.py +++ b/tests/unit/multi_output_model.py @@ -1,41 +1,152 @@ import torch +import deepspeed +from pytest import approx +from unit.common import DistributedTest +from unit.multi_output_model import MultiOutputModel, multi_output_dataloader -class MultiOutputModel(torch.nn.Module): - def __init__(self, hidden_dim, weight_value): - super(MultiOutputModel, self).__init__() - self.linear = torch.nn.Linear(hidden_dim, hidden_dim, bias=False) - self.linear.weight.data.fill_(weight_value) - self.cross_entropy_loss = torch.nn.CrossEntropyLoss() - - def forward(self, inputs, targets): - losses = [] - for x, y in zip(inputs, targets): - hidden_dim = self.linear(x) - loss = self.cross_entropy_loss(hidden_dim, y) - losses.append(loss) - return tuple(losses) - - -def multi_output_dataloader(model, total_samples, hidden_dim, device, inputs, targets): - assert len(inputs) == len(targets) - batch_size = model.train_micro_batch_size_per_gpu() - - train_data = [ - torch.full(size=(total_samples, - hidden_dim), - fill_value=x, - device=device, - dtype=torch.half, - requires_grad=True) for x in inputs - ] - - train_label = [ - torch.empty(total_samples, - device=device, - dtype=torch.long).fill_(y) for y in targets - ] - - train_dataset = torch.utils.data.TensorDataset(*train_data, *train_label) - train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size) - return train_loader +def create_config_dict(micro_batch_size, grad_accumulation_steps, world_size): + return { + "train_micro_batch_size_per_gpu": micro_batch_size, + "gradient_accumulation_steps": grad_accumulation_steps, + "train_batch_size": micro_batch_size * grad_accumulation_steps * world_size, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } + + +class TestTwoOutputModel(DistributedTest): + world_size = 1 + + def test(self, tmpdir): + gradient_accumulation_steps = 2 + micro_batch_size = 1 + world_size = self.world_size + config_dict = { + "train_micro_batch_size_per_gpu": micro_batch_size, + "gradient_accumulation_steps": grad_accumulation_steps, + "train_batch_size": micro_batch_size * grad_accumulation_steps * world_size, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } + + hidden_dim = 10 + weight_value = 0.1 + + model = MultiOutputModel(hidden_dim, weight_value) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + total_samples = 4 + data_loader = multi_output_dataloader(model=model, + total_samples=total_samples, + hidden_dim=hidden_dim, + device=model.device, + inputs=[1.0, + 2.0], + targets=[1, + 2]) + for n, batch in enumerate(data_loader): + assert len(batch) % 2 == 0, \ + f"multi_output_dataloader failed to return even number of data samples (input+target)" 
+ + midpoint = len(batch) // 2 + inputs, targets = batch[:midpoint], batch[midpoint:] + loss_tuple = model(inputs, targets) + + expected_loss = torch.tensor(2.302734375, + dtype=torch.half, + device=model.device) + for loss in loss_tuple: + assert loss.shape == torch.Size([]) + assert loss.item() == approx(expected_loss.item()) + + summed_loss = sum(loss_tuple) + scaled_loss = model.backward(summed_loss) + expected_scaled_loss = summed_loss.float() / gradient_accumulation_steps + assert scaled_loss.item() == approx(expected_scaled_loss.item()) + + model.step() + + +class TestThreeOutputModel(DistributedTest): + world_size = 1 + + def test(self, tmpdir): + gradient_accumulation_steps = 3 + micro_batch_size = 1 + world_size = 1 + config_dict = { + "train_micro_batch_size_per_gpu": micro_batch_size, + "gradient_accumulation_steps": grad_accumulation_steps, + "train_batch_size": micro_batch_size * grad_accumulation_steps * world_size, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } + + hidden_dim = 10 + weight_value = 0.1 + + model = MultiOutputModel(hidden_dim, weight_value) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + + total_samples = gradient_accumulation_steps * micro_batch_size * 2 + data_loader = multi_output_dataloader(model=model, + total_samples=total_samples, + hidden_dim=hidden_dim, + device=model.device, + inputs=[1.0, + 2.0, + 3.0], + targets=[1, + 2, + 3]) + for n, batch in enumerate(data_loader): + assert len(batch) % 2 == 0, \ + f"multi_output_dataloader failed to return even number of data samples (input+target)" + + midpoint = len(batch) // 2 + inputs, targets = batch[:midpoint], batch[midpoint:] + loss_tuple = model(inputs, targets) + assert len(loss_tuple) == 3 + + expected_loss = torch.tensor(2.302734375, + dtype=torch.half, + device=model.device) + + for loss in loss_tuple: + assert loss.shape == torch.Size([]) + assert loss.item() == approx(expected_loss.item()) + + summed_loss = sum(loss_tuple) + scaled_loss = model.backward(summed_loss) + expected_scaled_loss = summed_loss.float() / gradient_accumulation_steps + assert scaled_loss.item() == approx(expected_scaled_loss.item()) + + model.step() diff --git a/tests/unit/test_aio.py b/tests/unit/ops/aio/test_aio.py similarity index 100% rename from tests/unit/test_aio.py rename to tests/unit/ops/aio/test_aio.py diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/ops/cuda/test_cuda_backward.py similarity index 100% rename from tests/unit/test_cuda_backward.py rename to tests/unit/ops/cuda/test_cuda_backward.py diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/ops/cuda/test_cuda_forward.py similarity index 100% rename from tests/unit/test_cuda_forward.py rename to tests/unit/ops/cuda/test_cuda_forward.py diff --git a/tests/unit/runtime/fp16/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py similarity index 100% rename from tests/unit/runtime/fp16/onebit/test_onebit.py rename to tests/unit/runtime/half_precision/onebit/test_onebit.py diff --git a/tests/unit/runtime/test_bf16.py b/tests/unit/runtime/half_precision/test_bf16.py similarity index 100% rename from tests/unit/runtime/test_bf16.py rename to tests/unit/runtime/half_precision/test_bf16.py diff --git a/tests/unit/test_dynamic_loss_scale.py b/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py similarity index 100% rename from 
tests/unit/test_dynamic_loss_scale.py rename to tests/unit/runtime/half_precision/test_dynamic_loss_scale.py diff --git a/tests/unit/runtime/fp16/test_fp16.py b/tests/unit/runtime/half_precision/test_fp16.py similarity index 100% rename from tests/unit/runtime/fp16/test_fp16.py rename to tests/unit/runtime/half_precision/test_fp16.py diff --git a/tests/unit/test_averaging.py b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py similarity index 100% rename from tests/unit/test_averaging.py rename to tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py diff --git a/tests/unit/test_sparse_grads.py b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py similarity index 100% rename from tests/unit/test_sparse_grads.py rename to tests/unit/runtime/sparse_tensor/test_sparse_grads.py diff --git a/tests/unit/test_curriculum_learning.py b/tests/unit/runtime/test_curriculum_learning.py similarity index 100% rename from tests/unit/test_curriculum_learning.py rename to tests/unit/runtime/test_curriculum_learning.py diff --git a/tests/unit/test_config.py b/tests/unit/runtime/test_ds_config_dict.py similarity index 100% rename from tests/unit/test_config.py rename to tests/unit/runtime/test_ds_config_dict.py diff --git a/tests/unit/runtime/test_ds_config.py b/tests/unit/runtime/test_ds_config_model.py similarity index 100% rename from tests/unit/runtime/test_ds_config.py rename to tests/unit/runtime/test_ds_config_model.py diff --git a/tests/unit/test_ds_initialize.py b/tests/unit/runtime/test_ds_initialize.py similarity index 100% rename from tests/unit/test_ds_initialize.py rename to tests/unit/runtime/test_ds_initialize.py diff --git a/tests/unit/test_multi_output_model.py b/tests/unit/runtime/test_multi_output_model.py similarity index 100% rename from tests/unit/test_multi_output_model.py rename to tests/unit/runtime/test_multi_output_model.py diff --git a/tests/unit/test_ignore_unused_parameters.py b/tests/unit/runtime/zero/test_ignore_unused_parameters.py similarity index 100% rename from tests/unit/test_ignore_unused_parameters.py rename to tests/unit/runtime/zero/test_ignore_unused_parameters.py From 20403a4d0e89742bf9f66fe98db10fdf1fdcde47 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Wed, 28 Sep 2022 13:44:12 -0700 Subject: [PATCH 20/25] formatting --- tests/unit/multi_output_model.py | 187 ++++-------------- tests/unit/runtime/test_multi_output_model.py | 75 ++++--- 2 files changed, 84 insertions(+), 178 deletions(-) diff --git a/tests/unit/multi_output_model.py b/tests/unit/multi_output_model.py index 7d7d4e2dc718..240c1a4b7aa6 100644 --- a/tests/unit/multi_output_model.py +++ b/tests/unit/multi_output_model.py @@ -1,152 +1,41 @@ import torch -import deepspeed -from pytest import approx -from unit.common import DistributedTest -from unit.multi_output_model import MultiOutputModel, multi_output_dataloader -def create_config_dict(micro_batch_size, grad_accumulation_steps, world_size): - return { - "train_micro_batch_size_per_gpu": micro_batch_size, - "gradient_accumulation_steps": grad_accumulation_steps, - "train_batch_size": micro_batch_size * grad_accumulation_steps * world_size, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True - } - } - - -class TestTwoOutputModel(DistributedTest): - world_size = 1 - - def test(self, tmpdir): - gradient_accumulation_steps = 2 - micro_batch_size = 1 - world_size = self.world_size - config_dict = { - "train_micro_batch_size_per_gpu": 
micro_batch_size, - "gradient_accumulation_steps": grad_accumulation_steps, - "train_batch_size": micro_batch_size * grad_accumulation_steps * world_size, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True - } - } - - hidden_dim = 10 - weight_value = 0.1 - - model = MultiOutputModel(hidden_dim, weight_value) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - total_samples = 4 - data_loader = multi_output_dataloader(model=model, - total_samples=total_samples, - hidden_dim=hidden_dim, - device=model.device, - inputs=[1.0, - 2.0], - targets=[1, - 2]) - for n, batch in enumerate(data_loader): - assert len(batch) % 2 == 0, \ - f"multi_output_dataloader failed to return even number of data samples (input+target)" - - midpoint = len(batch) // 2 - inputs, targets = batch[:midpoint], batch[midpoint:] - loss_tuple = model(inputs, targets) - - expected_loss = torch.tensor(2.302734375, - dtype=torch.half, - device=model.device) - for loss in loss_tuple: - assert loss.shape == torch.Size([]) - assert loss.item() == approx(expected_loss.item()) - - summed_loss = sum(loss_tuple) - scaled_loss = model.backward(summed_loss) - expected_scaled_loss = summed_loss.float() / gradient_accumulation_steps - assert scaled_loss.item() == approx(expected_scaled_loss.item()) - - model.step() - - -class TestThreeOutputModel(DistributedTest): - world_size = 1 - - def test(self, tmpdir): - gradient_accumulation_steps = 3 - micro_batch_size = 1 - world_size = 1 - config_dict = { - "train_micro_batch_size_per_gpu": micro_batch_size, - "gradient_accumulation_steps": grad_accumulation_steps, - "train_batch_size": micro_batch_size * grad_accumulation_steps * world_size, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True - } - } - - hidden_dim = 10 - weight_value = 0.1 - - model = MultiOutputModel(hidden_dim, weight_value) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - - total_samples = gradient_accumulation_steps * micro_batch_size * 2 - data_loader = multi_output_dataloader(model=model, - total_samples=total_samples, - hidden_dim=hidden_dim, - device=model.device, - inputs=[1.0, - 2.0, - 3.0], - targets=[1, - 2, - 3]) - for n, batch in enumerate(data_loader): - assert len(batch) % 2 == 0, \ - f"multi_output_dataloader failed to return even number of data samples (input+target)" - - midpoint = len(batch) // 2 - inputs, targets = batch[:midpoint], batch[midpoint:] - loss_tuple = model(inputs, targets) - assert len(loss_tuple) == 3 - - expected_loss = torch.tensor(2.302734375, - dtype=torch.half, - device=model.device) - - for loss in loss_tuple: - assert loss.shape == torch.Size([]) - assert loss.item() == approx(expected_loss.item()) - - summed_loss = sum(loss_tuple) - scaled_loss = model.backward(summed_loss) - expected_scaled_loss = summed_loss.float() / gradient_accumulation_steps - assert scaled_loss.item() == approx(expected_scaled_loss.item()) - - model.step() +class MultiOutputModel(torch.nn.Module): + def __init__(self, hidden_dim, weight_value): + super(MultiOutputModel, self).__init__() + self.linear = torch.nn.Linear(hidden_dim, hidden_dim, bias=False) + self.linear.weight.data.fill_(weight_value) + self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + + def forward(self, inputs, targets): + losses = [] + for x, y in zip(inputs, 
targets): + hidden_dim = self.linear(x) + loss = self.cross_entropy_loss(hidden_dim, y) + losses.append(loss) + return tuple(losses) + + +def multi_output_dataloader(model, total_samples, hidden_dim, device, inputs, targets): + assert len(inputs) == len(targets) + batch_size = model.train_micro_batch_size_per_gpu() + + train_data = [ + torch.full(size=(total_samples, + hidden_dim), + fill_value=x, + device=device, + dtype=torch.half, + requires_grad=True) for x in inputs + ] + + train_label = [ + torch.empty(total_samples, + device=device, + dtype=torch.long).fill_(y) for y in targets + ] + + train_dataset = torch.utils.data.TensorDataset(*train_data, *train_label) + train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size) + return train_loader diff --git a/tests/unit/runtime/test_multi_output_model.py b/tests/unit/runtime/test_multi_output_model.py index 65acf726f1c4..a93ba7fc212f 100644 --- a/tests/unit/runtime/test_multi_output_model.py +++ b/tests/unit/runtime/test_multi_output_model.py @@ -5,31 +5,29 @@ from unit.multi_output_model import MultiOutputModel, multi_output_dataloader -def create_config_dict(micro_batch_size, grad_accumulation_steps, world_size): - return { - "train_micro_batch_size_per_gpu": micro_batch_size, - "gradient_accumulation_steps": grad_accumulation_steps, - "train_batch_size": micro_batch_size * grad_accumulation_steps * world_size, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 +class TestTwoOutputModel(DistributedTest): + world_size = 1 + + def test(self, tmpdir): + grad_accumulation_steps = 2 + micro_batch_size = 1 + world_size = self.world_size + config_dict = { + "train_micro_batch_size_per_gpu": micro_batch_size, + "gradient_accumulation_steps": grad_accumulation_steps, + "train_batch_size": micro_batch_size * grad_accumulation_steps * world_size, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True } - }, - "fp16": { - "enabled": True } - } - - -class TestMultiModelOutput(DistributedTest): - world_size = 1 - def test_two(self, gradient_accumulation_steps=2, micro_batch_size=1): - config_dict = create_config_dict(micro_batch_size, - gradient_accumulation_steps, - self.world_size) hidden_dim = 10 weight_value = 0.1 @@ -63,15 +61,34 @@ def test_two(self, gradient_accumulation_steps=2, micro_batch_size=1): summed_loss = sum(loss_tuple) scaled_loss = model.backward(summed_loss) - expected_scaled_loss = summed_loss.float() / gradient_accumulation_steps + expected_scaled_loss = summed_loss.float() / grad_accumulation_steps assert scaled_loss.item() == approx(expected_scaled_loss.item()) model.step() - def test_three(self, gradient_accumulation_steps=3, micro_batch_size=1): - config_dict = create_config_dict(micro_batch_size, - gradient_accumulation_steps, - self.world_size) + +class TestThreeOutputModel(DistributedTest): + world_size = 1 + + def test(self, tmpdir): + grad_accumulation_steps = 3 + micro_batch_size = 1 + world_size = 1 + config_dict = { + "train_micro_batch_size_per_gpu": micro_batch_size, + "gradient_accumulation_steps": grad_accumulation_steps, + "train_batch_size": micro_batch_size * grad_accumulation_steps * world_size, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } hidden_dim = 10 weight_value = 0.1 @@ -81,7 +98,7 @@ def test_three(self, gradient_accumulation_steps=3, micro_batch_size=1): model=model, 
                                               model_parameters=model.parameters())
 
-        total_samples = gradient_accumulation_steps * micro_batch_size * 2
+        total_samples = grad_accumulation_steps * micro_batch_size * 2
         data_loader = multi_output_dataloader(model=model,
                                               total_samples=total_samples,
                                               hidden_dim=hidden_dim,
                                               device=model.device,
@@ -111,7 +128,7 @@ def test_three(self, gradient_accumulation_steps=3, micro_batch_size=1):
 
         summed_loss = sum(loss_tuple)
         scaled_loss = model.backward(summed_loss)
-        expected_scaled_loss = summed_loss.float() / gradient_accumulation_steps
+        expected_scaled_loss = summed_loss.float() / grad_accumulation_steps
         assert scaled_loss.item() == approx(expected_scaled_loss.item())
 
         model.step()

From 03fd172ad3a6874a244967f30eabfbd74c6d506b Mon Sep 17 00:00:00 2001
From: Michael Wyatt
Date: Wed, 28 Sep 2022 14:07:42 -0700
Subject: [PATCH 21/25] fix broken import

---
 tests/unit/ops/cuda/test_cuda_backward.py | 4 ++--
 tests/unit/ops/cuda/test_cuda_forward.py  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/unit/ops/cuda/test_cuda_backward.py b/tests/unit/ops/cuda/test_cuda_backward.py
index 42bf50c6ad4e..f2720ce5c1ce 100644
--- a/tests/unit/ops/cuda/test_cuda_backward.py
+++ b/tests/unit/ops/cuda/test_cuda_backward.py
@@ -6,8 +6,8 @@
 import copy
 from torch import nn
 from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
-from .modeling import BertConfig, BertLayerNorm, BertEncoder as BertEncoderPostln
-from .modelingpreln import BertEncoder as BertEncoderPreln
+from unit.modeling import BertConfig, BertLayerNorm, BertEncoder as BertEncoderPostln
+from unit.modelingpreln import BertEncoder as BertEncoderPreln
 
 #if not deepspeed.ops.__installed_ops__['transformer']:
 #pytest.skip(
diff --git a/tests/unit/ops/cuda/test_cuda_forward.py b/tests/unit/ops/cuda/test_cuda_forward.py
index 9c2b7f7afaa2..546a596523a8 100644
--- a/tests/unit/ops/cuda/test_cuda_forward.py
+++ b/tests/unit/ops/cuda/test_cuda_forward.py
@@ -5,8 +5,8 @@
 import random
 import copy
 from torch import nn
-from .modelingpreln import BertEncoder as BertEncoderPreln
-from .modeling import BertLayerNorm, BertConfig, BertEncoder as BertEncoderPostln
+from unit.modelingpreln import BertEncoder as BertEncoderPreln
+from unit.modeling import BertLayerNorm, BertConfig, BertEncoder as BertEncoderPostln
 from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
 

From a24a4ac4021b57c3a1eedaae71dbe77649fe80c7 Mon Sep 17 00:00:00 2001
From: Michael Wyatt
Date: Mon, 3 Oct 2022 11:58:02 -0700
Subject: [PATCH 22/25] testing forked AMD tests

---
 .github/workflows/amd.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/amd.yml b/.github/workflows/amd.yml
index 5af91da0b389..8a0072ef5fcc 100644
--- a/.github/workflows/amd.yml
+++ b/.github/workflows/amd.yml
@@ -67,5 +67,5 @@ jobs:
       run: |
         if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
         cd tests
-        TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose unit/
-        TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'sequential' unit/
+        TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked -n 4 --verbose unit/
+        TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/

From 9ce06a762654af6bffecf3880a7adef7e887d2fd Mon Sep 17 00:00:00 2001
From: Michael Wyatt
Date: Mon, 3 Oct 2022 12:00:54 -0700
Subject: [PATCH 23/25] update abstract method

---
 tests/unit/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/common.py b/tests/unit/common.py
index 2d2c8e0541ba..df59ed62f017 100644
--- a/tests/unit/common.py
+++ b/tests/unit/common.py
@@ -75,7 +75,7 @@ class DistributedExec(ABC):
 
     @abstractmethod
     def run(self):
-        NotImplementedError("Inheriting classes must define this method")
+        ...
 
     def __call__(self, request=None):
         self._fixture_kwargs = self._get_fixture_kwargs(request, self.run)

From b8253246114e0c9274bd3501980c41646f16807f Mon Sep 17 00:00:00 2001
From: Michael Wyatt
Date: Mon, 3 Oct 2022 13:39:04 -0700
Subject: [PATCH 24/25] use blob storage for accelerate and transformers tests

---
 .github/workflows/nv-accelerate-v100.yml   | 2 +-
 .github/workflows/nv-transformers-v100.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
index 436e11b3f822..1610a59c2c6a 100644
--- a/.github/workflows/nv-accelerate-v100.yml
+++ b/.github/workflows/nv-accelerate-v100.yml
@@ -58,4 +58,4 @@ jobs:
         # tmp fix: force newer datasets version
         pip install "datasets>=2.0.0"
         pip list
-        TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose tests/deepspeed
+        HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose tests/deepspeed
diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml
index bfc9919be1f9..144830e5e176 100644
--- a/.github/workflows/nv-transformers-v100.yml
+++ b/.github/workflows/nv-transformers-v100.yml
@@ -65,4 +65,4 @@ jobs:
         # force protobuf version due to issues
         pip install "protobuf<4.21.0"
         pip list
-        WANDB_DISABLED=true TORCH_EXTENSIONS_DIR=./torch-extensions RUN_SLOW=1 pytest --color=yes --durations=0 --verbose tests/deepspeed
+        HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ WANDB_DISABLED=true TORCH_EXTENSIONS_DIR=./torch-extensions RUN_SLOW=1 pytest --color=yes --durations=0 --verbose tests/deepspeed

From 23790d5a5293c3da1f44e165b1fb389ae3e9109c Mon Sep 17 00:00:00 2001
From: Michael Wyatt
Date: Mon, 3 Oct 2022 14:05:43 -0700
Subject: [PATCH 25/25] upgrade torch for accelerate CI

---
 .github/workflows/nv-accelerate-v100.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
index 1610a59c2c6a..c1f915ec5bee 100644
--- a/.github/workflows/nv-accelerate-v100.yml
+++ b/.github/workflows/nv-accelerate-v100.yml
@@ -32,7 +32,7 @@ jobs:
         nvcc --version
         pip install --upgrade pip
         pip uninstall --yes torch torchvision
-        pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+        pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
         python -c "import torch; print('torch:', torch.__version__, torch)"
         python -c "import torch; print('CUDA available:', torch.cuda.is_available())"