From 4778832675a7ec898da5c809391ca4052991d91e Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 23 Nov 2020 14:35:36 -0800 Subject: [PATCH 01/26] add nccl 1-bit optim. --- deepspeed/runtime/engine.py | 4 +- deepspeed/runtime/fp16/onebit_adam_nccl.py | 372 +++++++++++++++++++++ 2 files changed, 374 insertions(+), 2 deletions(-) create mode 100644 deepspeed/runtime/fp16/onebit_adam_nccl.py diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 94b0d7e48488..589db9a82691 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -658,8 +658,8 @@ def _configure_basic_optimizer(self, model_parameters): from deepspeed.ops.lamb import FusedLamb optimizer = FusedLamb(model_parameters, **optimizer_parameters) elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: - from deepspeed.runtime.fp16.onebit_adam import OnebitAdam - optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) + from deepspeed.runtime.fp16.onebit_adam_nccl import OnebitAdamNCCL + optimizer = OnebitAdamNCCL(model_parameters, self, **optimizer_parameters) else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) diff --git a/deepspeed/runtime/fp16/onebit_adam_nccl.py b/deepspeed/runtime/fp16/onebit_adam_nccl.py new file mode 100644 index 000000000000..6b19678bccf6 --- /dev/null +++ b/deepspeed/runtime/fp16/onebit_adam_nccl.py @@ -0,0 +1,372 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' +import types +import torch +import importlib +import numpy as np +import time +import cupy +from torch.utils.dlpack import to_dlpack +from torch.utils.dlpack import from_dlpack +from deepspeed.utils.logging import logger +from deepspeed.runtime.custom_collectives import gather_cuda, gather_host, allgather_cuda, allgather_host + + +class OnebitAdamNCCL(torch.optim.Optimizer): + """Implements the 1-bit Adam algorithm. Currently GPU-only. + For usage example please see, TODO DeepSpeed Tutorial + It has been proposed in APMSqueeze (https://arxiv.org/abs/2008.11343) + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + freeze_step (int, optional): Number of steps for warmup (uncompressed) + stage before we start using compressed communication. (default 100000) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square. (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability. (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0) + min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) NOT SUPPORTED in 1-bit Adam! + eps_inside_sqrt (boolean, optional): in the 'update parameters' step, + adds eps to the bias-corrected second moment estimate before + evaluating square root instead of adding it to the square root of + second moment estimate as in the original paper. (default: False) + cuda_aware (boolean, required): Set True if the underlying MPI implementation + supports CUDA-Aware communication. (default: False) + .. 
_Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + def __init__(self, + params, + deepspeed=None, + lr=1e-3, + freeze_step=100000, + bias_correction=True, + betas=(0.9, + 0.999), + eps=1e-8, + eps_inside_sqrt=False, + weight_decay=0., + max_grad_norm=0., + amsgrad=False, + cuda_aware=False): + + if amsgrad: + raise RuntimeError('1-bit Adam does not support the AMSGrad variant.') + defaults = dict(lr=lr, + bias_correction=bias_correction, + betas=betas, + eps=eps, + weight_decay=weight_decay, + max_grad_norm=max_grad_norm) + + super(OnebitAdamNCCL, self).__init__(params, defaults) + from mpi4py import MPI + self.eps_mode = 0 if eps_inside_sqrt else 1 + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + self.comm_time = 0.0 + self.step_time = 0.0 + self.ave_step = 1 + self.bk_time = 0.0 + self.divider = int(self.size * 8 / np.gcd(self.size, 8)) + self.deepspeed = deepspeed + self.adam_freeze_key = False + self.initialize = False + self.freeze_step = freeze_step + self.cuda_aware = cuda_aware + + def torch2cupy(self, tensor): + return cupy.fromDlpack(to_dlpack(tensor)) + + def cupy2torch(self, cupy_tensor): + return from_dlpack(cupy_tensor.toDlpack()) + + def compress_by_chunk(self, cupy_bool_tensor, num_chunks): + packed_sign = cupy.packbits(cupy_bool_tensor) + sign_list_packed = cupy.split(packed_sign, num_chunks) + cupy.cuda.get_current_stream().synchronize() + return sign_list_packed + + def Compressed_Allreduce(self, + buffer_m: torch.tensor, + worker_error, + server_error, + rank, + world_size, + comm, + local_rank): + + all_start_time = time.time() + original_size = buffer_m.numel() + cupy.cuda.Device(local_rank).use() + + if torch.numel(buffer_m) != torch.numel(worker_error): + empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), + device=buffer_m.device) + buffer_m = torch.cat([buffer_m, empty_tensor]) + + buffer_m.add_(worker_error) + worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) + sign_buffer_m = buffer_m.sign().add_(1).bool() + sign_buffer_m = sign_buffer_m.float() + sign_buffer_m.add_(-0.5).mul_(2.0) + worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) + sign_buffer_m = None + + compensated_buffer_m = buffer_m + compensated_buffer_m.sign_() + compensated_buffer_m = compensated_buffer_m.add_(1).bool() + cupy_worker_scale = self.torch2cupy(worker_scale) + cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m) + compensated_buffer_m = None + + cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, + world_size) + cupy_compensated_buffer_m = None + + cupy_recvbuf_sign = cupy.zeros([world_size, + cupy_sign_list_packed[rank].size], + dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_scale = cupy.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) + + # Communication Phase 1 + gather_start = time.time() + if self.cuda_aware: + gather_cuda(rank, + world_size, + comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale) + else: + cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(rank, + world_size, + comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale) + gather_end = time.time() + + cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( + world_size, + -1) + cupy_recvbuf_sign = None 
+ unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() + cupy_unpacked_sign = None + unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) + worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / world_size) + compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) + unpacked_sign = None + + compensated_server_m.add_(server_error) + server_scale = torch.norm(compensated_server_m) / np.sqrt( + compensated_server_m.numel()) + sign_server_m = compensated_server_m.sign().add_(1).bool() + sign_server_m = sign_server_m.float() + sign_server_m.add_(-0.5).mul_(2.0) + server_error.set_(compensated_server_m - server_scale * sign_server_m) + sign_server_m = None + + compensated_server_m.sign_() + compensated_server_m = compensated_server_m.add_(1).bool() + cupy_server_scale = self.torch2cupy(server_scale) + cupy_compensated_server_m = self.torch2cupy(compensated_server_m) + compensated_server_m = None + + cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) + + cupy_recvbuf_sign_server = cupy.zeros( + [world_size, + cupy_server_sign_packed[0].size], + dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_scale_server = cupy.zeros([world_size, + 1], + dtype=cupy_worker_scale.dtype) + + # Communication Phase 2 + if self.cuda_aware: + allgather_cuda(comm, + cupy_server_sign_packed[0], + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server) + else: + cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = allgather_host(comm, + cupy_server_sign_packed[0], + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server) + + cupy_server_unpacked_sign = (cupy.unpackbits( + cupy_recvbuf_sign_server.flatten())).reshape(world_size, + -1) + cupy_recvbuf_sign_server = None + + server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign) + cupy_server_unpacked_sign = None + + server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) + server_scale = self.cupy2torch(cupy_recvbuf_scale_server) + buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] + + return buffer_m + + def step(self, closure=None, grads=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + grads (list of tensors, optional): weight gradient to use for the + optimizer update. If gradients have type torch.half, parameters + are expected to be in type torch.float. (default: None) + output params (list of tensors, optional): A reduced recision copy + of the updated weights written out in addition to the regular + updated weights. Have to be of same type as gradients. (default: None) + scale (float, optional): factor to divide gradient tensor values + by before applying to weights. 
(default: 1) + """ + loss = None + if closure is not None: + loss = closure() + + gather_time = 0 + allgather_time = 0 + all_time = 0 + + if self.adam_freeze_key is False: + v_diff_buffer = 0.0 + + if grads is None: + grads_group = [None] * len(self.param_groups) + # backward compatibility + # assuming a list/generator of parameter means single group + elif isinstance(grads, types.GeneratorType): + grads_group = [grads] + elif type(grads[0]) != list: + grads_group = [grads] + else: + grads_group = grads + + for group, grads_this_group in zip(self.param_groups, grads_group): + if grads_this_group is None: + grads_this_group = [None] * len(group['params']) + + bias_correction = 1 if group['bias_correction'] else 0 + + for p, grad in zip(group['params'], grads_this_group): + if p.grad is None and grad is None: + continue + if grad is None: + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError( + 'FusedAdam does not support sparse gradients, please consider SparseAdam instead' + ) + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + state['tensor_size'] = torch.numel(p.data) + state['corrected_tensor_size'] = state['tensor_size'] + + if state['tensor_size'] % (self.size * self.divider) != 0: + state['corrected_tensor_size'] += ((self.size * self.divider) - + (state['tensor_size'] % + (self.size * self.divider))) + state['server_chunk_size'] = state[ + 'corrected_tensor_size'] // self.size + + if not self.initialize or (self.adam_freeze_key + and 'worker_error' not in state.keys()): + torch.cuda.empty_cache() + state['worker_error'] = torch.zeros(state['corrected_tensor_size'], + device=p.device) + state['server_error'] = torch.zeros(state['server_chunk_size'], + device=p.device) + torch.cuda.empty_cache() + self.adam_freeze_key = True + if not self.initialize and torch.distributed.get_rank() == 0: + print("Cupy Buffers Initialized Successfully.") + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + if self.adam_freeze_key is False: + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + grad = None + if self.initialize: + update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) + + else: + if 'non_freeze' in group.keys() and group['non_freeze'] is True: + dist.all_reduce(grad) + grad.mul_(1 / dist.get_world_size()) + exp_avg.mul_(beta1).add(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + grad = None + else: + if self.initialize is True: + exp_avg.mul_(beta1).add_(1 - beta1, grad) + grad = None + + if self.size > 1: + exp_avg.set_( + self.Compressed_Allreduce(exp_avg, + state['worker_error'], + state['server_error'], + self.rank, + self.size, + self.comm, + self.deepspeed.local_rank)) + if self.initialize: + update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) + + if self.initialize: + if group['weight_decay'] > 0.0: + update += group['weight_decay'] * p.data + with torch.no_grad(): + p.add_(-group['lr'] * update) + + if not self.initialize: + print('Pop out errors', flush=True) + state.pop('worker_error') + state.pop('server_error') + + if not self.initialize: + self.adam_freeze_key = False + self.initialize = True + print( + f"Finished the initialization step at rant {torch.distributed.get_rank()}" 
+ ) + return loss + + if self.adam_freeze_key is False: + if state['step'] >= self.freeze_step: + self.adam_freeze_key = True + self.deepspeed.enable_backward_allreduce = False + + return loss From 567232bea6de75cabe94c092deda292c39cbb6c0 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 23 Nov 2020 16:52:59 -0800 Subject: [PATCH 02/26] temporary commit to save stuff. --- deepspeed/runtime/fp16/onebit_adam_nccl.py | 40 ++++++++++------------ 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/deepspeed/runtime/fp16/onebit_adam_nccl.py b/deepspeed/runtime/fp16/onebit_adam_nccl.py index 6b19678bccf6..1d1ac7592b21 100644 --- a/deepspeed/runtime/fp16/onebit_adam_nccl.py +++ b/deepspeed/runtime/fp16/onebit_adam_nccl.py @@ -10,8 +10,11 @@ from torch.utils.dlpack import to_dlpack from torch.utils.dlpack import from_dlpack from deepspeed.utils.logging import logger + from deepspeed.runtime.custom_collectives import gather_cuda, gather_host, allgather_cuda, allgather_host +import torch.distributed as dist + class OnebitAdamNCCL(torch.optim.Optimizer): """Implements the 1-bit Adam algorithm. Currently GPU-only. @@ -70,12 +73,11 @@ def __init__(self, max_grad_norm=max_grad_norm) super(OnebitAdamNCCL, self).__init__(params, defaults) - from mpi4py import MPI self.eps_mode = 0 if eps_inside_sqrt else 1 - - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() + assert (dist.is_initialized()) + self.world_group = dist.new_group(ranks=range(dist.get_world_size())) + self.rank = dist.get_rank(group=self.world_group) + self.size = dist.get_world_size(group=self.world_group) self.comm_time = 0.0 self.step_time = 0.0 self.ave_step = 1 @@ -134,6 +136,9 @@ def Compressed_Allreduce(self, cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, world_size) + + sign_list_packed = self.cupy2torch(cupy_sign_list_packed) + cupy_compensated_buffer_m = None cupy_recvbuf_sign = cupy.zeros([world_size, @@ -143,27 +148,18 @@ def Compressed_Allreduce(self, # Communication Phase 1 gather_start = time.time() - if self.cuda_aware: - gather_cuda(rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) - else: - cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) + gather_cuda(rank, + world_size, + comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale) gather_end = time.time() cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( world_size, - -1) + 1) cupy_recvbuf_sign = None unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() cupy_unpacked_sign = None From 79f64049b76ba9251849f3aca45037bb6c904ccb Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 23 Nov 2020 18:09:18 -0800 Subject: [PATCH 03/26] Use dist collectives instead of mpi routines. 
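
Note on the approach taken here: the cupy compression buffers are handed to torch.distributed through DLPack, so the NCCL collectives operate on the same device memory without extra copies. A minimal sketch of the round trip this relies on (illustrative only, not part of this diff; assumes a CUDA tensor and an initialized process group):

    import torch
    import cupy
    from torch.utils.dlpack import to_dlpack, from_dlpack

    t = torch.ones(8, device='cuda')           # torch tensor on the GPU
    c = cupy.fromDlpack(to_dlpack(t))          # zero-copy cupy view of the same memory
    packed = cupy.packbits(c > 0)              # 1-bit sign compression, 8 values per byte
    t2 = from_dlpack(packed.toDlpack())        # back to torch for dist.gather/all_gather
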
--- deepspeed/runtime/fp16/onebit_adam_nccl.py | 50 ++++++++++++---------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/deepspeed/runtime/fp16/onebit_adam_nccl.py b/deepspeed/runtime/fp16/onebit_adam_nccl.py index 1d1ac7592b21..f18fe8c3a3dd 100644 --- a/deepspeed/runtime/fp16/onebit_adam_nccl.py +++ b/deepspeed/runtime/fp16/onebit_adam_nccl.py @@ -136,9 +136,6 @@ def Compressed_Allreduce(self, cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, world_size) - - sign_list_packed = self.cupy2torch(cupy_sign_list_packed) - cupy_compensated_buffer_m = None cupy_recvbuf_sign = cupy.zeros([world_size, @@ -146,25 +143,35 @@ def Compressed_Allreduce(self, dtype=cupy_sign_list_packed[0].dtype) cupy_recvbuf_scale = cupy.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) - # Communication Phase 1 + sign_list_packed = self.cupy2torch(cupy_sign_list_packed) + worker_scale = self.cupy2torch(cupy_worker_scale) + + recvbuf_sign = self.cupy2torch(cupy_recvbuf_sign) + recvbuf_scale = self.cupy2torch(cupy_recvbuf_scale) + + # communication phase 1 gather_start = time.time() - gather_cuda(rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) + for idx in range(self.size): + dist.gather(recvbuf_sign, sign_list_packed[idx], self.world_group, idx) + h2 = dist.gather(recvbuf_scale, worker_scale, self.world_group, idx) gather_end = time.time() + #TODO: dist.sync and try async_op=True method + + cupy_recvbuf_sign = self.torch2cupy(recvbuf_sign) + cupy_recvbuf_scale = self.torch2cupy(recvbuf_scale) + cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( world_size, 1) cupy_recvbuf_sign = None + unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() cupy_unpacked_sign = None + unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / world_size) + compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) unpacked_sign = None @@ -193,19 +200,16 @@ def Compressed_Allreduce(self, 1], dtype=cupy_worker_scale.dtype) + server_sign_packed = self.cupy2torch(cupy_server_sign_packed) + recvbuf_sign_server = self.cupy2torch(cupy_recvbuf_sign_server) + server_scale = self.cupy2torch(cupy_server_scale) + recvbuf_scale_server = self.cupy2torch(cupy_recvbuf_scale_server) + # Communication Phase 2 - if self.cuda_aware: - allgather_cuda(comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server) - else: - cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = allgather_host(comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server) + dist.all_gather(server_sign_packed[0], recvbuf_sign_server) + dist.all_gather(server_scale, recvbuf_scale_server) + + cupy_recvbuf_sign_server = self.torch2cupy(recvbuf_sign_server) cupy_server_unpacked_sign = (cupy.unpackbits( cupy_recvbuf_sign_server.flatten())).reshape(world_size, From 57ab220a9ca9a641df9729c2425f394a8ad26c69 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Tue, 24 Nov 2020 17:37:34 -0800 Subject: [PATCH 04/26] remove old code for comm. 
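
Context: with torch.distributed providing the transport, there is no MPI communicator to thread through the call chain any more; the compressed allreduce reads what it needs (rank, world size, process group) from the optimizer instance. For reference, the call site in step() after this change looks like this (copied from the updated diff):

    exp_avg.set_(
        self.Compressed_Allreduce(exp_avg,
                                  state['worker_error'],
                                  state['server_error'],
                                  self.rank,
                                  self.size,
                                  self.deepspeed.local_rank))
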
--- deepspeed/runtime/fp16/onebit_adam_nccl.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/deepspeed/runtime/fp16/onebit_adam_nccl.py b/deepspeed/runtime/fp16/onebit_adam_nccl.py index f18fe8c3a3dd..995d0e6cbbba 100644 --- a/deepspeed/runtime/fp16/onebit_adam_nccl.py +++ b/deepspeed/runtime/fp16/onebit_adam_nccl.py @@ -107,7 +107,6 @@ def Compressed_Allreduce(self, server_error, rank, world_size, - comm, local_rank): all_start_time = time.time() @@ -340,7 +339,6 @@ def step(self, closure=None, grads=None): state['server_error'], self.rank, self.size, - self.comm, self.deepspeed.local_rank)) if self.initialize: update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) From ebec1fee7e4fa496bc2b7f456a87c3962e6ad1c1 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Wed, 25 Nov 2020 19:50:34 +0000 Subject: [PATCH 05/26] Fix bugs. still does not work. --- deepspeed/runtime/fp16/onebit_adam_nccl.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/deepspeed/runtime/fp16/onebit_adam_nccl.py b/deepspeed/runtime/fp16/onebit_adam_nccl.py index 995d0e6cbbba..38dbed93bc10 100644 --- a/deepspeed/runtime/fp16/onebit_adam_nccl.py +++ b/deepspeed/runtime/fp16/onebit_adam_nccl.py @@ -141,18 +141,30 @@ def Compressed_Allreduce(self, cupy_sign_list_packed[rank].size], dtype=cupy_sign_list_packed[0].dtype) cupy_recvbuf_scale = cupy.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) + + sign_list_packed = [None] * self.size - sign_list_packed = self.cupy2torch(cupy_sign_list_packed) - worker_scale = self.cupy2torch(cupy_worker_scale) + for idx in range(self.size): + sign_list_packed[idx] = self.cupy2torch(cupy_sign_list_packed[idx]) + print(sign_list_packed[idx].shape) recvbuf_sign = self.cupy2torch(cupy_recvbuf_sign) + + worker_scale = self.cupy2torch(cupy_worker_scale) recvbuf_scale = self.cupy2torch(cupy_recvbuf_scale) + print("sendbuf = ", worker_scale) + print("recvbuf = ", recvbuf_scale) + # communication phase 1 gather_start = time.time() - for idx in range(self.size): - dist.gather(recvbuf_sign, sign_list_packed[idx], self.world_group, idx) - h2 = dist.gather(recvbuf_scale, worker_scale, self.world_group, idx) + #for idx in range(self.size): + #dist.gather(tensor=sign_list_packed[idx], gather_list=recvbuf_sign, dst=idx) + if dist.get_rank() == 0: + dist.gather(worker_scale, gather_list=recvbuf_scale) + else: + dist.gather(worker_scale, gather_list=[]) + gather_end = time.time() #TODO: dist.sync and try async_op=True method From 3e6974d147feaeb710bcf4735c9112db61d74ee4 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Wed, 25 Nov 2020 19:50:54 +0000 Subject: [PATCH 06/26] modify to test the nccl side code path --- tests/onebitadam/test_com_reduce_cuda.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/onebitadam/test_com_reduce_cuda.py b/tests/onebitadam/test_com_reduce_cuda.py index a5a87ce67232..4eda1b958c1e 100644 --- a/tests/onebitadam/test_com_reduce_cuda.py +++ b/tests/onebitadam/test_com_reduce_cuda.py @@ -4,7 +4,7 @@ import torch.distributed as dist import numpy as np import deepspeed -from deepspeed.runtime.fp16.onebit_adam import OnebitAdam +from deepspeed.runtime.fp16.onebit_adam_nccl import OnebitAdamNCCL comm = MPI.COMM_WORLD size = comm.Get_size() @@ -12,14 +12,14 @@ #TODO: Detect the hostname we are running on automatically torch.distributed.init_process_group(backend='nccl', - init_method='tcp://worker-1:2245', + init_method='tcp://worker-0:2245', world_size=size, rank=rank) dummy_model = 
[torch.nn.Parameter(torch.ones(10))] # Set cuda_aware to True to use CUDA buffers for communication -dummy_optim = OnebitAdam(dummy_model, cuda_aware=True) +dummy_optim = OnebitAdamNCCL(dummy_model, cuda_aware=True) device = torch.device('cuda', rank % torch.cuda.device_count()) @@ -65,7 +65,6 @@ def torch_sim(a): server_error, rank, size, - comm, local_rank) threshold = 1e-6 magnitude_threshold = 1e-6 From a72049b60124101bf12037885950fea2c3ce39eb Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 30 Nov 2020 19:14:13 +0000 Subject: [PATCH 07/26] Initial gather impl. Works intra-node. --- DeepSpeedExamples | 2 +- deepspeed/runtime/fp16/onebit_adam_nccl.py | 28 ++++++++++++++-------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/DeepSpeedExamples b/DeepSpeedExamples index fa1d1a71c486..896831c96266 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit fa1d1a71c48623db8a091d9cf636a5fe3b8f43c7 +Subproject commit 896831c96266e12612c3e7a923d04e68d1f4dd84 diff --git a/deepspeed/runtime/fp16/onebit_adam_nccl.py b/deepspeed/runtime/fp16/onebit_adam_nccl.py index 38dbed93bc10..344881d914da 100644 --- a/deepspeed/runtime/fp16/onebit_adam_nccl.py +++ b/deepspeed/runtime/fp16/onebit_adam_nccl.py @@ -101,6 +101,16 @@ def compress_by_chunk(self, cupy_bool_tensor, num_chunks): cupy.cuda.get_current_stream().synchronize() return sign_list_packed + def my_igather(self, rank, size, sendbuf, recvbuf, root): + if rank == root: + for idx in range(size): + if idx != rank: + dist.recv(recvbuf[idx], src=idx, group=self.world_group, tag=987) + else: + recvbuf[rank] = sendbuf + else: + dist.send(sendbuf, group=self.world_group, dst=root, tag=987) + def Compressed_Allreduce(self, buffer_m: torch.tensor, worker_error, @@ -153,28 +163,26 @@ def Compressed_Allreduce(self, worker_scale = self.cupy2torch(cupy_worker_scale) recvbuf_scale = self.cupy2torch(cupy_recvbuf_scale) + print("sendbuf = ", worker_scale) print("recvbuf = ", recvbuf_scale) # communication phase 1 gather_start = time.time() - #for idx in range(self.size): - #dist.gather(tensor=sign_list_packed[idx], gather_list=recvbuf_sign, dst=idx) - if dist.get_rank() == 0: - dist.gather(worker_scale, gather_list=recvbuf_scale) - else: - dist.gather(worker_scale, gather_list=[]) - + for idx in range(self.size): + self.my_igather(self.rank, self.size, sign_list_packed[idx], recvbuf_sign, root=idx) + self.my_igather(self.rank, self.size, worker_scale, recvbuf_scale, root=idx) gather_end = time.time() #TODO: dist.sync and try async_op=True method + print("sendbuf = ", worker_scale) + print("recvbuf = ", recvbuf_scale) + cupy_recvbuf_sign = self.torch2cupy(recvbuf_sign) cupy_recvbuf_scale = self.torch2cupy(recvbuf_scale) - cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( - world_size, - 1) + cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(self.size,-1) cupy_recvbuf_sign = None unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() From 1bf1c275358950924c26597df6bbdfdaf28f9b16 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 30 Nov 2020 22:33:55 +0000 Subject: [PATCH 08/26] Updates to comm. phase 2. nccl comm. passed the tests. 
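
Background for phase 2: torch.distributed.all_gather fills a list of tensors rather than one contiguous buffer, so the receive buffer is exposed as one tensor per rank and stacked back into a single tensor before unpacking. A minimal sketch of that pattern (illustrative names, assuming the default process group is already initialized):

    import torch
    import torch.distributed as dist

    def allgather_packed(sendbuf, world_size):
        recv_list = [torch.empty_like(sendbuf) for _ in range(world_size)]
        dist.all_gather(recv_list, sendbuf)    # NCCL all_gather over a tensor list
        return torch.stack(recv_list)          # back to a [world_size, ...] tensor
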
--- deepspeed/runtime/fp16/onebit_adam_nccl.py | 43 +++++++++++----------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/deepspeed/runtime/fp16/onebit_adam_nccl.py b/deepspeed/runtime/fp16/onebit_adam_nccl.py index 344881d914da..29495588b016 100644 --- a/deepspeed/runtime/fp16/onebit_adam_nccl.py +++ b/deepspeed/runtime/fp16/onebit_adam_nccl.py @@ -156,29 +156,20 @@ def Compressed_Allreduce(self, for idx in range(self.size): sign_list_packed[idx] = self.cupy2torch(cupy_sign_list_packed[idx]) - print(sign_list_packed[idx].shape) recvbuf_sign = self.cupy2torch(cupy_recvbuf_sign) worker_scale = self.cupy2torch(cupy_worker_scale) recvbuf_scale = self.cupy2torch(cupy_recvbuf_scale) - - print("sendbuf = ", worker_scale) - print("recvbuf = ", recvbuf_scale) - # communication phase 1 + #TODO: dist.sync and try async_op=True method gather_start = time.time() for idx in range(self.size): self.my_igather(self.rank, self.size, sign_list_packed[idx], recvbuf_sign, root=idx) self.my_igather(self.rank, self.size, worker_scale, recvbuf_scale, root=idx) gather_end = time.time() - #TODO: dist.sync and try async_op=True method - print("sendbuf = ", worker_scale) - print("recvbuf = ", recvbuf_scale) - - cupy_recvbuf_sign = self.torch2cupy(recvbuf_sign) cupy_recvbuf_scale = self.torch2cupy(recvbuf_scale) @@ -211,22 +202,30 @@ def Compressed_Allreduce(self, cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) - cupy_recvbuf_sign_server = cupy.zeros( - [world_size, - cupy_server_sign_packed[0].size], - dtype=cupy_sign_list_packed[0].dtype) - cupy_recvbuf_scale_server = cupy.zeros([world_size, - 1], - dtype=cupy_worker_scale.dtype) + cupy_recvbuf_sign_server = cupy.zeros([world_size, cupy_server_sign_packed[0].size], dtype=cupy_sign_list_packed[0].dtype) + + server_sign_packed = [None] * 1 + recvbuf_sign_server = [None] * self.size + + for idx in range(self.size): + recvbuf_sign_server[idx] = self.cupy2torch(cupy_recvbuf_sign_server[idx]) + + server_sign_packed[0] = self.cupy2torch(cupy_server_sign_packed[0]) - server_sign_packed = self.cupy2torch(cupy_server_sign_packed) - recvbuf_sign_server = self.cupy2torch(cupy_recvbuf_sign_server) server_scale = self.cupy2torch(cupy_server_scale) - recvbuf_scale_server = self.cupy2torch(cupy_recvbuf_scale_server) + cupy_recvbuf_scale_server = cupy.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) + recvbuf_scale_server = [None] * self.size + for idx in range(self.size): + recvbuf_scale_server[idx] = self.cupy2torch(cupy_recvbuf_scale_server[idx]) + # Communication Phase 2 - dist.all_gather(server_sign_packed[0], recvbuf_sign_server) - dist.all_gather(server_scale, recvbuf_scale_server) + dist.all_gather(recvbuf_sign_server, server_sign_packed[0]) + dist.all_gather(recvbuf_scale_server, server_scale) + + # need to convert from a tensor list to a single tensor + # dist.all_gather only provides a tensor list as the recv/output buffer + recvbuf_sign_server = torch.stack(recvbuf_sign_server) cupy_recvbuf_sign_server = self.torch2cupy(recvbuf_sign_server) From 886ebb52915f78df5bce1aef8be334273cad7916 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Wed, 2 Dec 2020 16:16:49 -0800 Subject: [PATCH 09/26] refactor code to introduce nccl/mpi as backends for onebit adam. 
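
After this refactor the optimizer lives at deepspeed.runtime.fp16.onebit.adam and the transport is selected at construction time. Illustrative usage only (keyword names follow the new signature in this diff, values are examples, and torch.distributed must already be initialized because the constructor asserts it):

    import torch
    from deepspeed.runtime.fp16.onebit.adam import Adam

    params = [torch.nn.Parameter(torch.ones(10))]
    optimizer = Adam(params,
                     lr=1e-3,
                     freeze_step=100000,
                     communication_backend='nccl',   # or 'mpi' for the CUDA-aware MPI path
                     compression_backend='cupy')
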
--- deepspeed/runtime/custom_collectives.py | 23 +- .../{onebit_adam_nccl.py => onebit/adam.py} | 252 +++++++++--- deepspeed/runtime/fp16/onebit_adam.py | 374 ------------------ 3 files changed, 220 insertions(+), 429 deletions(-) rename deepspeed/runtime/fp16/{onebit_adam_nccl.py => onebit/adam.py} (61%) delete mode 100644 deepspeed/runtime/fp16/onebit_adam.py diff --git a/deepspeed/runtime/custom_collectives.py b/deepspeed/runtime/custom_collectives.py index cb77edcaf60d..20232b96c31f 100644 --- a/deepspeed/runtime/custom_collectives.py +++ b/deepspeed/runtime/custom_collectives.py @@ -1,10 +1,18 @@ ''' -Copyright 2019 The Microsoft DeepSpeed Team +Copyright 2020 The Microsoft DeepSpeed Team ''' -from mpi4py import MPI -import numpy as np -import cupy + +def my_igather_nccl(self, rank, size, sendbuf, recvbuf, root): + import torch.distributed as dist + if rank == root: + for idx in range(size): + if idx != rank: + dist.recv(recvbuf[idx], src=idx, group=self.world_group, tag=987) + else: + recvbuf[rank] = sendbuf + else: + dist.send(sendbuf, group=self.world_group, dst=root, tag=987) def my_igather(rank, size, comm, sendbuf, recbuf, root): @@ -47,6 +55,7 @@ def gather_cuda(rank, root=idx) requests += req_scale + from mpi4py import MPI MPI.Request.Waitall(requests) @@ -57,6 +66,10 @@ def gather_host(rank, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale): + import cupy + import numpy as np + from mpi4py import MPI + # In-place operations are not possible for newly created cupy arrays # so we need to return the new buffers numpy_recvbuf_sign = np.zeros([world_size, @@ -124,6 +137,8 @@ def allgather_host(comm, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server): + import cupy + import numpy as np # 1. Convert cupy to numpy numpy_recvbuf_sign_server = np.zeros([comm.Get_size(), diff --git a/deepspeed/runtime/fp16/onebit_adam_nccl.py b/deepspeed/runtime/fp16/onebit/adam.py similarity index 61% rename from deepspeed/runtime/fp16/onebit_adam_nccl.py rename to deepspeed/runtime/fp16/onebit/adam.py index 29495588b016..563ce5a635c9 100644 --- a/deepspeed/runtime/fp16/onebit_adam_nccl.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -6,19 +6,16 @@ import importlib import numpy as np import time -import cupy from torch.utils.dlpack import to_dlpack from torch.utils.dlpack import from_dlpack from deepspeed.utils.logging import logger -from deepspeed.runtime.custom_collectives import gather_cuda, gather_host, allgather_cuda, allgather_host - import torch.distributed as dist -class OnebitAdamNCCL(torch.optim.Optimizer): +class Adam(torch.optim.Optimizer): """Implements the 1-bit Adam algorithm. Currently GPU-only. - For usage example please see, TODO DeepSpeed Tutorial + For usage example please see, https://www.deepspeed.ai/tutorials/onebit-adam/ It has been proposed in APMSqueeze (https://arxiv.org/abs/2008.11343) Arguments: @@ -43,6 +40,9 @@ class OnebitAdamNCCL(torch.optim.Optimizer): second moment estimate as in the original paper. (default: False) cuda_aware (boolean, required): Set True if the underlying MPI implementation supports CUDA-Aware communication. (default: False) + communication_backend (string, optional): Set to 'mpi' if needed. (default: 'nccl') + compression_backend (string, optional): Set to 'cupy' to test out compression kernels + from cupy. (default: 'deepspeed') .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 .. 
_On the Convergence of Adam and Beyond: @@ -61,10 +61,13 @@ def __init__(self, weight_decay=0., max_grad_norm=0., amsgrad=False, - cuda_aware=False): + cuda_aware=False, + communication_backend='nccl', + compression_backend='deepspeed'): if amsgrad: raise RuntimeError('1-bit Adam does not support the AMSGrad variant.') + defaults = dict(lr=lr, bias_correction=bias_correction, betas=betas, @@ -72,12 +75,10 @@ def __init__(self, weight_decay=weight_decay, max_grad_norm=max_grad_norm) - super(OnebitAdamNCCL, self).__init__(params, defaults) + super(Adam, self).__init__(params, defaults) self.eps_mode = 0 if eps_inside_sqrt else 1 assert (dist.is_initialized()) - self.world_group = dist.new_group(ranks=range(dist.get_world_size())) - self.rank = dist.get_rank(group=self.world_group) - self.size = dist.get_world_size(group=self.world_group) + self.comm_time = 0.0 self.step_time = 0.0 self.ave_step = 1 @@ -89,6 +90,23 @@ def __init__(self, self.freeze_step = freeze_step self.cuda_aware = cuda_aware + self.communication_backend = communication_backend + self.compression_backend = compression_backend + + if self.communication_backend == 'nccl': + assert dist.is_initialized() == True, "Please initialize the torch distributed backend." + self.world_group = dist.new_group(ranks=range(dist.get_world_size())) + self.rank = dist.get_rank(group=self.world_group) + self.size = dist.get_world_size(group=self.world_group) + from deepspeed.runtime.custom_collectives import my_igather_nccl + + elif self.communication_backend == 'mpi': + from mpi4py import MPI + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + from deepspeed.runtime.custom_collectives import gather_cuda, gather_host, allgather_cuda, allgather_host + def torch2cupy(self, tensor): return cupy.fromDlpack(to_dlpack(tensor)) @@ -101,23 +119,11 @@ def compress_by_chunk(self, cupy_bool_tensor, num_chunks): cupy.cuda.get_current_stream().synchronize() return sign_list_packed - def my_igather(self, rank, size, sendbuf, recvbuf, root): - if rank == root: - for idx in range(size): - if idx != rank: - dist.recv(recvbuf[idx], src=idx, group=self.world_group, tag=987) - else: - recvbuf[rank] = sendbuf - else: - dist.send(sendbuf, group=self.world_group, dst=root, tag=987) - - def Compressed_Allreduce(self, - buffer_m: torch.tensor, - worker_error, - server_error, - rank, - world_size, - local_rank): + def compressed_nccl_allreduce(self, + buffer_m: torch.tensor, + worker_error, + server_error, + local_rank): all_start_time = time.time() original_size = buffer_m.numel() @@ -139,26 +145,28 @@ def Compressed_Allreduce(self, compensated_buffer_m = buffer_m compensated_buffer_m.sign_() compensated_buffer_m = compensated_buffer_m.add_(1).bool() + cupy_worker_scale = self.torch2cupy(worker_scale) cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m) compensated_buffer_m = None cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, - world_size) + self.size) cupy_compensated_buffer_m = None - cupy_recvbuf_sign = cupy.zeros([world_size, - cupy_sign_list_packed[rank].size], - dtype=cupy_sign_list_packed[0].dtype) - cupy_recvbuf_scale = cupy.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) - + cupy_recvbuf_sign = cupy.zeros( + [self.size, + cupy_sign_list_packed[self.rank].size], + dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) + sign_list_packed = [None] * self.size for idx in range(self.size): 
sign_list_packed[idx] = self.cupy2torch(cupy_sign_list_packed[idx]) recvbuf_sign = self.cupy2torch(cupy_recvbuf_sign) - + worker_scale = self.cupy2torch(cupy_worker_scale) recvbuf_scale = self.cupy2torch(cupy_recvbuf_scale) @@ -166,21 +174,27 @@ def Compressed_Allreduce(self, #TODO: dist.sync and try async_op=True method gather_start = time.time() for idx in range(self.size): - self.my_igather(self.rank, self.size, sign_list_packed[idx], recvbuf_sign, root=idx) + self.my_igather(self.rank, + self.size, + sign_list_packed[idx], + recvbuf_sign, + root=idx) self.my_igather(self.rank, self.size, worker_scale, recvbuf_scale, root=idx) gather_end = time.time() cupy_recvbuf_sign = self.torch2cupy(recvbuf_sign) cupy_recvbuf_scale = self.torch2cupy(recvbuf_scale) - cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(self.size,-1) + cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( + self.size, + -1) cupy_recvbuf_sign = None unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() cupy_unpacked_sign = None unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) - worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / world_size) + worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / self.size) compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) unpacked_sign = None @@ -202,23 +216,28 @@ def Compressed_Allreduce(self, cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) - cupy_recvbuf_sign_server = cupy.zeros([world_size, cupy_server_sign_packed[0].size], dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_sign_server = cupy.zeros( + [self.size, + cupy_server_sign_packed[0].size], + dtype=cupy_sign_list_packed[0].dtype) server_sign_packed = [None] * 1 recvbuf_sign_server = [None] * self.size for idx in range(self.size): recvbuf_sign_server[idx] = self.cupy2torch(cupy_recvbuf_sign_server[idx]) - + server_sign_packed[0] = self.cupy2torch(cupy_server_sign_packed[0]) server_scale = self.cupy2torch(cupy_server_scale) - cupy_recvbuf_scale_server = cupy.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) + cupy_recvbuf_scale_server = cupy.zeros([self.size, + 1], + dtype=cupy_worker_scale.dtype) recvbuf_scale_server = [None] * self.size for idx in range(self.size): recvbuf_scale_server[idx] = self.cupy2torch(cupy_recvbuf_scale_server[idx]) - + # Communication Phase 2 dist.all_gather(recvbuf_sign_server, server_sign_packed[0]) dist.all_gather(recvbuf_scale_server, server_scale) @@ -230,7 +249,131 @@ def Compressed_Allreduce(self, cupy_recvbuf_sign_server = self.torch2cupy(recvbuf_sign_server) cupy_server_unpacked_sign = (cupy.unpackbits( - cupy_recvbuf_sign_server.flatten())).reshape(world_size, + cupy_recvbuf_sign_server.flatten())).reshape(self.size, + -1) + cupy_recvbuf_sign_server = None + + server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign) + cupy_server_unpacked_sign = None + + server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) + server_scale = self.cupy2torch(cupy_recvbuf_scale_server) + buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] + + return buffer_m + + def compressed_mpi_allreduce(self, + buffer_m: torch.tensor, + worker_error, + server_error, + local_rank): + + all_start_time = time.time() + original_size = buffer_m.numel() + cupy.cuda.Device(local_rank).use() + + if torch.numel(buffer_m) != torch.numel(worker_error): + empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), + device=buffer_m.device) + 
buffer_m = torch.cat([buffer_m, empty_tensor]) + + buffer_m.add_(worker_error) + worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) + sign_buffer_m = buffer_m.sign().add_(1).bool() + sign_buffer_m = sign_buffer_m.float() + sign_buffer_m.add_(-0.5).mul_(2.0) + worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) + sign_buffer_m = None + + compensated_buffer_m = buffer_m + compensated_buffer_m.sign_() + compensated_buffer_m = compensated_buffer_m.add_(1).bool() + cupy_worker_scale = self.torch2cupy(worker_scale) + cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m) + compensated_buffer_m = None + + cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, + self.size) + cupy_compensated_buffer_m = None + + cupy_recvbuf_sign = cupy.zeros( + [self.size, + cupy_sign_list_packed[self.rank].size], + dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) + + # Communication Phase 1 + gather_start = time.time() + if self.cuda_aware: + gather_cuda(self.rank, + self.size, + self.comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale) + else: + cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(self.rank, + self.size, + self.comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale) + gather_end = time.time() + + cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( + self.size, + -1) + cupy_recvbuf_sign = None + unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() + cupy_unpacked_sign = None + unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) + worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / self.size) + compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) + unpacked_sign = None + + compensated_server_m.add_(server_error) + server_scale = torch.norm(compensated_server_m) / np.sqrt( + compensated_server_m.numel()) + sign_server_m = compensated_server_m.sign().add_(1).bool() + sign_server_m = sign_server_m.float() + sign_server_m.add_(-0.5).mul_(2.0) + server_error.set_(compensated_server_m - server_scale * sign_server_m) + sign_server_m = None + + compensated_server_m.sign_() + compensated_server_m = compensated_server_m.add_(1).bool() + cupy_server_scale = self.torch2cupy(server_scale) + cupy_compensated_server_m = self.torch2cupy(compensated_server_m) + compensated_server_m = None + + cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) + + cupy_recvbuf_sign_server = cupy.zeros( + [self.size, + cupy_server_sign_packed[0].size], + dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_scale_server = cupy.zeros([self.size, + 1], + dtype=cupy_worker_scale.dtype) + + # Communication Phase 2 + if self.cuda_aware: + allgather_cuda(self.comm, + cupy_server_sign_packed[0], + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server) + else: + cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = allgather_host(self.comm, + cupy_server_sign_packed[0], + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server) + + cupy_server_unpacked_sign = (cupy.unpackbits( + cupy_recvbuf_sign_server.flatten())).reshape(self.size, -1) cupy_recvbuf_sign_server = None @@ -342,7 +485,7 @@ def step(self, closure=None, grads=None): else: if 'non_freeze' in group.keys() and group['non_freeze'] is True: dist.all_reduce(grad) - grad.mul_(1 / 
dist.get_world_size()) + grad.mul_(1 / dist.get_self.size()) exp_avg.mul_(beta1).add(1 - beta1, grad) exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) grad = None @@ -352,13 +495,20 @@ def step(self, closure=None, grads=None): grad = None if self.size > 1: - exp_avg.set_( - self.Compressed_Allreduce(exp_avg, - state['worker_error'], - state['server_error'], - self.rank, - self.size, - self.deepspeed.local_rank)) + if self.communication_backend == 'nccl': + exp_avg.set_( + self.compressed_nccl_allreduce( + exp_avg, + state['worker_error'], + state['server_error'], + self.deepspeed.local_rank)) + elif self.communication_backend == 'mpi': + exp_avg.set_( + self.compressed_mpi_allreduce( + exp_avg, + state['worker_error'], + state['server_error'], + self.deepspeed.local_rank)) if self.initialize: update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) @@ -377,7 +527,7 @@ def step(self, closure=None, grads=None): self.adam_freeze_key = False self.initialize = True print( - f"Finished the initialization step at rant {torch.distributed.get_rank()}" + f"Finished the initialization step at rank {torch.distributed.get_rank()}" ) return loss diff --git a/deepspeed/runtime/fp16/onebit_adam.py b/deepspeed/runtime/fp16/onebit_adam.py deleted file mode 100644 index c6566c28777b..000000000000 --- a/deepspeed/runtime/fp16/onebit_adam.py +++ /dev/null @@ -1,374 +0,0 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' -import types -import torch -import importlib -import numpy as np -import time -import cupy -from torch.utils.dlpack import to_dlpack -from torch.utils.dlpack import from_dlpack -from deepspeed.utils.logging import logger - -from mpi4py import MPI -from deepspeed.runtime.custom_collectives import gather_cuda, gather_host, allgather_cuda, allgather_host - - -class OnebitAdam(torch.optim.Optimizer): - """Implements the 1-bit Adam algorithm. Currently GPU-only. - For usage example please see, TODO DeepSpeed Tutorial - It has been proposed in APMSqueeze (https://arxiv.org/abs/2008.11343) - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups. - lr (float, optional): learning rate. (default: 1e-3) - freeze_step (int, optional): Number of steps for warmup (uncompressed) - stage before we start using compressed communication. (default 100000) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square. (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability. (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0) - min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) NOT SUPPORTED in 1-bit Adam! - eps_inside_sqrt (boolean, optional): in the 'update parameters' step, - adds eps to the bias-corrected second moment estimate before - evaluating square root instead of adding it to the square root of - second moment estimate as in the original paper. (default: False) - cuda_aware (boolean, required): Set True if the underlying MPI implementation - supports CUDA-Aware communication. (default: False) - .. _Adam\: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. 
_On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - def __init__(self, - params, - deepspeed=None, - lr=1e-3, - freeze_step=100000, - bias_correction=True, - betas=(0.9, - 0.999), - eps=1e-8, - eps_inside_sqrt=False, - weight_decay=0., - max_grad_norm=0., - amsgrad=False, - cuda_aware=False): - - if amsgrad: - raise RuntimeError('1-bit Adam does not support the AMSGrad variant.') - defaults = dict(lr=lr, - bias_correction=bias_correction, - betas=betas, - eps=eps, - weight_decay=weight_decay, - max_grad_norm=max_grad_norm) - - super(OnebitAdam, self).__init__(params, defaults) - from mpi4py import MPI - self.eps_mode = 0 if eps_inside_sqrt else 1 - - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() - self.comm_time = 0.0 - self.step_time = 0.0 - self.ave_step = 1 - self.bk_time = 0.0 - self.divider = int(self.size * 8 / np.gcd(self.size, 8)) - self.deepspeed = deepspeed - self.adam_freeze_key = False - self.initialize = False - self.freeze_step = freeze_step - self.cuda_aware = cuda_aware - - def torch2cupy(self, tensor): - return cupy.fromDlpack(to_dlpack(tensor)) - - def cupy2torch(self, cupy_tensor): - return from_dlpack(cupy_tensor.toDlpack()) - - def compress_by_chunk(self, cupy_bool_tensor, num_chunks): - packed_sign = cupy.packbits(cupy_bool_tensor) - sign_list_packed = cupy.split(packed_sign, num_chunks) - cupy.cuda.get_current_stream().synchronize() - return sign_list_packed - - def Compressed_Allreduce(self, - buffer_m: torch.tensor, - worker_error, - server_error, - rank, - world_size, - comm, - local_rank): - - all_start_time = time.time() - original_size = buffer_m.numel() - cupy.cuda.Device(local_rank).use() - - if torch.numel(buffer_m) != torch.numel(worker_error): - empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), - device=buffer_m.device) - buffer_m = torch.cat([buffer_m, empty_tensor]) - - buffer_m.add_(worker_error) - worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) - sign_buffer_m = buffer_m.sign().add_(1).bool() - sign_buffer_m = sign_buffer_m.float() - sign_buffer_m.add_(-0.5).mul_(2.0) - worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) - sign_buffer_m = None - - compensated_buffer_m = buffer_m - compensated_buffer_m.sign_() - compensated_buffer_m = compensated_buffer_m.add_(1).bool() - cupy_worker_scale = self.torch2cupy(worker_scale) - cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m) - compensated_buffer_m = None - - cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, - world_size) - cupy_compensated_buffer_m = None - - cupy_recvbuf_sign = cupy.zeros([world_size, - cupy_sign_list_packed[rank].size], - dtype=cupy_sign_list_packed[0].dtype) - cupy_recvbuf_scale = cupy.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) - - # Communication Phase 1 - gather_start = time.time() - if self.cuda_aware: - gather_cuda(rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) - else: - cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) - gather_end = time.time() - - cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( - world_size, - -1) - cupy_recvbuf_sign = None - unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() - cupy_unpacked_sign = None 
- unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) - worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / world_size) - compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) - unpacked_sign = None - - compensated_server_m.add_(server_error) - server_scale = torch.norm(compensated_server_m) / np.sqrt( - compensated_server_m.numel()) - sign_server_m = compensated_server_m.sign().add_(1).bool() - sign_server_m = sign_server_m.float() - sign_server_m.add_(-0.5).mul_(2.0) - server_error.set_(compensated_server_m - server_scale * sign_server_m) - sign_server_m = None - - compensated_server_m.sign_() - compensated_server_m = compensated_server_m.add_(1).bool() - cupy_server_scale = self.torch2cupy(server_scale) - cupy_compensated_server_m = self.torch2cupy(compensated_server_m) - compensated_server_m = None - - cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) - - cupy_recvbuf_sign_server = cupy.zeros( - [world_size, - cupy_server_sign_packed[0].size], - dtype=cupy_sign_list_packed[0].dtype) - cupy_recvbuf_scale_server = cupy.zeros([world_size, - 1], - dtype=cupy_worker_scale.dtype) - - # Communication Phase 2 - if self.cuda_aware: - allgather_cuda(comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server) - else: - cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = allgather_host(comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server) - - cupy_server_unpacked_sign = (cupy.unpackbits( - cupy_recvbuf_sign_server.flatten())).reshape(world_size, - -1) - cupy_recvbuf_sign_server = None - - server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign) - cupy_server_unpacked_sign = None - - server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) - server_scale = self.cupy2torch(cupy_recvbuf_scale_server) - buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] - - return buffer_m - - def step(self, closure=None, grads=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - grads (list of tensors, optional): weight gradient to use for the - optimizer update. If gradients have type torch.half, parameters - are expected to be in type torch.float. (default: None) - output params (list of tensors, optional): A reduced recision copy - of the updated weights written out in addition to the regular - updated weights. Have to be of same type as gradients. (default: None) - scale (float, optional): factor to divide gradient tensor values - by before applying to weights. 
(default: 1) - """ - loss = None - if closure is not None: - loss = closure() - - gather_time = 0 - allgather_time = 0 - all_time = 0 - - if self.adam_freeze_key is False: - v_diff_buffer = 0.0 - - if grads is None: - grads_group = [None] * len(self.param_groups) - # backward compatibility - # assuming a list/generator of parameter means single group - elif isinstance(grads, types.GeneratorType): - grads_group = [grads] - elif type(grads[0]) != list: - grads_group = [grads] - else: - grads_group = grads - - for group, grads_this_group in zip(self.param_groups, grads_group): - if grads_this_group is None: - grads_this_group = [None] * len(group['params']) - - bias_correction = 1 if group['bias_correction'] else 0 - - for p, grad in zip(group['params'], grads_this_group): - if p.grad is None and grad is None: - continue - if grad is None: - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError( - 'FusedAdam does not support sparse gradients, please consider SparseAdam instead' - ) - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - - state['tensor_size'] = torch.numel(p.data) - state['corrected_tensor_size'] = state['tensor_size'] - - if state['tensor_size'] % (self.size * self.divider) != 0: - state['corrected_tensor_size'] += ((self.size * self.divider) - - (state['tensor_size'] % - (self.size * self.divider))) - state['server_chunk_size'] = state[ - 'corrected_tensor_size'] // self.size - - if not self.initialize or (self.adam_freeze_key - and 'worker_error' not in state.keys()): - torch.cuda.empty_cache() - state['worker_error'] = torch.zeros(state['corrected_tensor_size'], - device=p.device) - state['server_error'] = torch.zeros(state['server_chunk_size'], - device=p.device) - torch.cuda.empty_cache() - self.adam_freeze_key = True - if not self.initialize and torch.distributed.get_rank() == 0: - print("Cupy Buffers Initialized Successfully.") - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - if self.adam_freeze_key is False: - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - grad = None - if self.initialize: - update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) - - else: - if 'non_freeze' in group.keys() and group['non_freeze'] is True: - dist.all_reduce(grad) - grad.mul_(1 / dist.get_world_size()) - exp_avg.mul_(beta1).add(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - grad = None - else: - if self.initialize is True: - exp_avg.mul_(beta1).add_(1 - beta1, grad) - grad = None - - if self.size > 1: - exp_avg.set_( - self.Compressed_Allreduce(exp_avg, - state['worker_error'], - state['server_error'], - self.rank, - self.size, - self.comm, - self.deepspeed.local_rank)) - if self.initialize: - update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) - - if self.initialize: - if group['weight_decay'] > 0.0: - update += group['weight_decay'] * p.data - with torch.no_grad(): - p.add_(-group['lr'] * update) - - if not self.initialize: - print('Pop out errors', flush=True) - state.pop('worker_error') - state.pop('server_error') - - if not self.initialize: - self.adam_freeze_key = False - self.initialize = True - print( - f"Finished the initialization step at rant {torch.distributed.get_rank()}" 
- ) - return loss - - if self.adam_freeze_key is False: - if state['step'] >= self.freeze_step: - self.adam_freeze_key = True - self.deepspeed.enable_backward_allreduce = False - - return loss From a38351ecbe44903afce7acf4125ae6af1052f97f Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Thu, 3 Dec 2020 10:40:43 -0800 Subject: [PATCH 10/26] Refactor updates to test/engine. --- deepspeed/runtime/engine.py | 4 ++-- tests/onebitadam/test_com_reduce_cuda.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index d18401c57b5a..e99faa6efb81 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -658,8 +658,8 @@ def _configure_basic_optimizer(self, model_parameters): from deepspeed.ops.lamb import FusedLamb optimizer = FusedLamb(model_parameters, **optimizer_parameters) elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: - from deepspeed.runtime.fp16.onebit_adam_nccl import OnebitAdamNCCL - optimizer = OnebitAdamNCCL(model_parameters, self, **optimizer_parameters) + from deepspeed.runtime.fp16.onebit.adam import Adam + optimizer = Adam(model_parameters, self, **optimizer_parameters) else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) diff --git a/tests/onebitadam/test_com_reduce_cuda.py b/tests/onebitadam/test_com_reduce_cuda.py index 4eda1b958c1e..b0114abe3806 100644 --- a/tests/onebitadam/test_com_reduce_cuda.py +++ b/tests/onebitadam/test_com_reduce_cuda.py @@ -4,7 +4,7 @@ import torch.distributed as dist import numpy as np import deepspeed -from deepspeed.runtime.fp16.onebit_adam_nccl import OnebitAdamNCCL +from deepspeed.runtime.fp16.onebit.adam import Adam comm = MPI.COMM_WORLD size = comm.Get_size() @@ -19,7 +19,10 @@ dummy_model = [torch.nn.Parameter(torch.ones(10))] # Set cuda_aware to True to use CUDA buffers for communication -dummy_optim = OnebitAdamNCCL(dummy_model, cuda_aware=True) +dummy_optim = Adam(dummy_model, + cuda_aware=True, + communication_backend='nccl', + compression_backend='cupy') device = torch.device('cuda', rank % torch.cuda.device_count()) From be75d8858182798b495f364a29035e15e24bff74 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Thu, 3 Dec 2020 19:23:56 +0000 Subject: [PATCH 11/26] Fix compile/runtime errors. 
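my_igather_nccl becomes a free function that receives the process group as an argument instead of reading self.world_group, and cupy is now imported lazily so the optimizer can be constructed when the cupy compression backend is not selected. A minimal sketch of the lazy-import pattern used here, with an illustrative helper name (_load_cupy is not part of this patch):

    # Module-level placeholder, resolved on first use.
    cupy = None

    def _load_cupy():
        # Bind the real module to the module-level name only when the
        # cupy compression backend is actually requested.
        global cupy
        if cupy is None:
            import cupy
        return cupy
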
--- deepspeed/runtime/custom_collectives.py | 6 +++--- deepspeed/runtime/fp16/onebit/adam.py | 21 ++++++++++++++++----- tests/onebitadam/test_com_reduce_cuda.py | 4 +--- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/deepspeed/runtime/custom_collectives.py b/deepspeed/runtime/custom_collectives.py index 20232b96c31f..45699be5abf7 100644 --- a/deepspeed/runtime/custom_collectives.py +++ b/deepspeed/runtime/custom_collectives.py @@ -3,16 +3,16 @@ ''' -def my_igather_nccl(self, rank, size, sendbuf, recvbuf, root): +def my_igather_nccl(rank, size, group, sendbuf, recvbuf, root): import torch.distributed as dist if rank == root: for idx in range(size): if idx != rank: - dist.recv(recvbuf[idx], src=idx, group=self.world_group, tag=987) + dist.recv(recvbuf[idx], src=idx, group=group, tag=987) else: recvbuf[rank] = sendbuf else: - dist.send(sendbuf, group=self.world_group, dst=root, tag=987) + dist.send(sendbuf, group=group, dst=root, tag=987) def my_igather(rank, size, comm, sendbuf, recbuf, root): diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index 563ce5a635c9..a0f4cc2d603b 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -6,12 +6,14 @@ import importlib import numpy as np import time +import torch.distributed as dist from torch.utils.dlpack import to_dlpack from torch.utils.dlpack import from_dlpack from deepspeed.utils.logging import logger -import torch.distributed as dist - +# Delayed/lazy imports +my_igather_nccl = None +cupy = None class Adam(torch.optim.Optimizer): """Implements the 1-bit Adam algorithm. Currently GPU-only. @@ -83,7 +85,7 @@ def __init__(self, self.step_time = 0.0 self.ave_step = 1 self.bk_time = 0.0 - self.divider = int(self.size * 8 / np.gcd(self.size, 8)) + self.deepspeed = deepspeed self.adam_freeze_key = False self.initialize = False @@ -93,6 +95,12 @@ def __init__(self, self.communication_backend = communication_backend self.compression_backend = compression_backend + global my_igather_nccl + global cupy + + if self.compression_backend == 'cupy': + import cupy + if self.communication_backend == 'nccl': assert dist.is_initialized() == True, "Please initialize the torch distributed backend." 
self.world_group = dist.new_group(ranks=range(dist.get_world_size())) @@ -106,6 +114,8 @@ def __init__(self, self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() from deepspeed.runtime.custom_collectives import gather_cuda, gather_host, allgather_cuda, allgather_host + + self.divider = int(self.size * 8 / np.gcd(self.size, 8)) def torch2cupy(self, tensor): return cupy.fromDlpack(to_dlpack(tensor)) @@ -174,12 +184,13 @@ def compressed_nccl_allreduce(self, #TODO: dist.sync and try async_op=True method gather_start = time.time() for idx in range(self.size): - self.my_igather(self.rank, + my_igather_nccl(self.rank, self.size, + self.world_group, sign_list_packed[idx], recvbuf_sign, root=idx) - self.my_igather(self.rank, self.size, worker_scale, recvbuf_scale, root=idx) + my_igather_nccl(self.rank, self.size, self.world_group, worker_scale, recvbuf_scale, root=idx) gather_end = time.time() cupy_recvbuf_sign = self.torch2cupy(recvbuf_sign) diff --git a/tests/onebitadam/test_com_reduce_cuda.py b/tests/onebitadam/test_com_reduce_cuda.py index b0114abe3806..7cba9ca85e5a 100644 --- a/tests/onebitadam/test_com_reduce_cuda.py +++ b/tests/onebitadam/test_com_reduce_cuda.py @@ -63,11 +63,9 @@ def torch_sim(a): a_torch, worker_error_torch, server_error_torch = torch_sim(a) torch.cuda.empty_cache() local_rank = rank % torch.cuda.device_count() -a_after = dummy_optim.Compressed_Allreduce(a, +a_after = dummy_optim.compressed_nccl_allreduce(a, worker_error, server_error, - rank, - size, local_rank) threshold = 1e-6 magnitude_threshold = 1e-6 From 7b7f122bdbb7a94f77a4e861005d83b6e105c1fc Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Thu, 3 Dec 2020 20:35:51 +0000 Subject: [PATCH 12/26] simplify support for nccl/mpi backends. --- deepspeed/runtime/fp16/onebit/adam.py | 37 +++++++++++++++--------- tests/onebitadam/test_com_reduce_cuda.py | 6 ++-- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index a0f4cc2d603b..53fbe8b269cb 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -14,6 +14,10 @@ # Delayed/lazy imports my_igather_nccl = None cupy = None +gather_cuda = None +gather_host = None +allgather_cuda = None +allgather_host = None class Adam(torch.optim.Optimizer): """Implements the 1-bit Adam algorithm. Currently GPU-only. @@ -97,6 +101,7 @@ def __init__(self, global my_igather_nccl global cupy + global gather_cuda, gather_host, allgather_cuda, allgather_host if self.compression_backend == 'cupy': import cupy @@ -129,6 +134,7 @@ def compress_by_chunk(self, cupy_bool_tensor, num_chunks): cupy.cuda.get_current_stream().synchronize() return sign_list_packed + def compressed_nccl_allreduce(self, buffer_m: torch.tensor, worker_error, @@ -397,6 +403,17 @@ def compressed_mpi_allreduce(self, return buffer_m + def compressed_allreduce(self, + buffer_m: torch.tensor, + worker_error, + server_error, + local_rank): + + if self.communication_backend == 'nccl': + return self.compressed_nccl_allreduce(buffer_m, worker_error, server_error, local_rank) + elif self.communication_backend == 'mpi': + return self.compressed_mpi_allreduce(buffer_m, worker_error, server_error, local_rank) + def step(self, closure=None, grads=None): """Performs a single optimization step. 
Arguments: @@ -506,20 +523,12 @@ def step(self, closure=None, grads=None): grad = None if self.size > 1: - if self.communication_backend == 'nccl': - exp_avg.set_( - self.compressed_nccl_allreduce( - exp_avg, - state['worker_error'], - state['server_error'], - self.deepspeed.local_rank)) - elif self.communication_backend == 'mpi': - exp_avg.set_( - self.compressed_mpi_allreduce( - exp_avg, - state['worker_error'], - state['server_error'], - self.deepspeed.local_rank)) + exp_avg.set_(self.compressed_allreduce( + exp_avg, + state['worker_error'], + state['server_error'], + self.deepspeed.local_rank)) + if self.initialize: update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) diff --git a/tests/onebitadam/test_com_reduce_cuda.py b/tests/onebitadam/test_com_reduce_cuda.py index 7cba9ca85e5a..423fd7bfcf60 100644 --- a/tests/onebitadam/test_com_reduce_cuda.py +++ b/tests/onebitadam/test_com_reduce_cuda.py @@ -20,8 +20,8 @@ # Set cuda_aware to True to use CUDA buffers for communication dummy_optim = Adam(dummy_model, - cuda_aware=True, - communication_backend='nccl', + cuda_aware=False, + communication_backend='mpi', compression_backend='cupy') device = torch.device('cuda', rank % torch.cuda.device_count()) @@ -63,7 +63,7 @@ def torch_sim(a): a_torch, worker_error_torch, server_error_torch = torch_sim(a) torch.cuda.empty_cache() local_rank = rank % torch.cuda.device_count() -a_after = dummy_optim.compressed_nccl_allreduce(a, +a_after = dummy_optim.compressed_allreduce(a, worker_error, server_error, local_rank) From fd2c366fc623f8ec29668cade1e5b134ea5b084d Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Thu, 3 Dec 2020 20:36:20 +0000 Subject: [PATCH 13/26] Add missign file --- deepspeed/runtime/fp16/onebit/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 deepspeed/runtime/fp16/onebit/__init__.py diff --git a/deepspeed/runtime/fp16/onebit/__init__.py b/deepspeed/runtime/fp16/onebit/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 From df8c40d3105e9f2542a8aa6619e80d675a09753f Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Fri, 4 Dec 2020 17:39:23 +0000 Subject: [PATCH 14/26] Add compression backend in constructor. Revert later. --- deepspeed/runtime/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index b39e1cc89d81..6f9c2f5cd54b 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -664,7 +664,7 @@ def _configure_basic_optimizer(self, model_parameters): optimizer = FusedLamb(model_parameters, **optimizer_parameters) elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: from deepspeed.runtime.fp16.onebit.adam import Adam - optimizer = Adam(model_parameters, self, **optimizer_parameters) + optimizer = Adam(model_parameters, self, **optimizer_parameters, compression_backend='cupy') else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) From f29ea3f31c3a7e562ac9eb7c18d3f0021cbe284a Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Fri, 4 Dec 2020 17:40:18 +0000 Subject: [PATCH 15/26] modify test with some perf counting. 
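The perf counting added here follows the usual warmup-then-measure pattern: run the collective a number of times first to absorb one-time costs (cupy buffer allocation, communicator setup), then average the steady-state time over many iterations. The test itself uses DeepSpeed's SynchronizedWallClockTimer as shown in the diff; a simplified, self-contained sketch of the same idea with plain timers (names illustrative):

    import time
    import torch

    def average_time(op, warmup=5, iters=100):
        # Warmup iterations are excluded from the measurement.
        for _ in range(warmup):
            op()
        torch.cuda.synchronize()  # drain queued GPU work before starting the clock
        start = time.time()
        for _ in range(iters):
            op()
        torch.cuda.synchronize()
        return (time.time() - start) / iters

    # e.g. average_time(lambda: dummy_optim.compressed_allreduce(
    #          a, worker_error, server_error, local_rank))
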
--- tests/onebitadam/test_com_reduce_cuda.py | 53 +++++++++++++++++++----- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/tests/onebitadam/test_com_reduce_cuda.py b/tests/onebitadam/test_com_reduce_cuda.py index 423fd7bfcf60..a0dc1ee2a1bb 100644 --- a/tests/onebitadam/test_com_reduce_cuda.py +++ b/tests/onebitadam/test_com_reduce_cuda.py @@ -6,6 +6,11 @@ import deepspeed from deepspeed.runtime.fp16.onebit.adam import Adam +# Configure wall clock timer +from deepspeed.utils.timer import SynchronizedWallClockTimer + +timers = SynchronizedWallClockTimer() + comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() @@ -58,15 +63,38 @@ def torch_sim(a): # Adding bias to the initialization of the gradient we are communicating # In order to get rid of the case where some elements in the gradient are too small a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank +print("size of the momentum buffer =", a.shape) + worker_error = torch.zeros(right_tensor_size, device=device) server_error = torch.zeros(right_server_size, device=device) + +iters = 100 + +# Warmup +for i in range (iters): + torch_sim(a) + +timers('simulated').start() +for i in range (iters): + torch_sim(a) +timers('simulated').stop() + a_torch, worker_error_torch, server_error_torch = torch_sim(a) + torch.cuda.empty_cache() local_rank = rank % torch.cuda.device_count() -a_after = dummy_optim.compressed_allreduce(a, - worker_error, - server_error, - local_rank) + +# Warmup +for i in range (iters): + dummy_optim.compressed_allreduce(a, worker_error, server_error, local_rank) + +timers('compressed').start() +for i in range(iters): + dummy_optim.compressed_allreduce(a, worker_error, server_error, local_rank) +timers('compressed').stop() + +a_after = dummy_optim.compressed_allreduce(a, worker_error, server_error, local_rank) + threshold = 1e-6 magnitude_threshold = 1e-6 diff_mask = (a_after - a_torch) > threshold @@ -74,13 +102,18 @@ def torch_sim(a): mpi_server = torch.chunk(a_after, size)[rank] + server_error torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch +test_correctness = False # If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic # The test would skip those numbers that are too small in compensated_server_m -if torch.sum(diff_server_mask) == 0: - print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) -else: - check_mag_mask = mpi_server[diff_mask] > magnitude_threshold - if torch.sum(check_mag_mask) == 0: +if test_correctness: + if torch.sum(diff_server_mask) == 0: print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) else: - print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) + check_mag_mask = mpi_server[diff_mask] > magnitude_threshold + if torch.sum(check_mag_mask) == 0: + print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) + else: + print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) + +timer_names = ['simulated', 'compressed'] +timers.log(names=timer_names, normalizer=iters, memory_breakdown=None) From 170ef020b00256c731d463957f0f6c6440c2cbf6 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 7 Dec 2020 19:24:51 +0000 Subject: [PATCH 16/26] Implement a true non-blocking gather for nccl side. 
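The earlier gather helper used blocking dist.send/dist.recv, so each of the size gather operations had to complete before the next one started. Returning the handles from dist.irecv/dist.isend lets the caller post every gather first and wait on all outstanding requests once. A condensed sketch of the pattern (it mirrors the helper added in the diff below; buffer names are illustrative):

    import torch.distributed as dist

    def igather(rank, size, group, sendbuf, recvbuf, root):
        # Post non-blocking point-to-point ops and hand the requests
        # back to the caller instead of completing them here.
        reqs = []
        if rank == root:
            for src in range(size):
                if src != rank:
                    reqs.append(dist.irecv(recvbuf[src], src=src, group=group))
                else:
                    recvbuf[rank] = sendbuf
        else:
            reqs.append(dist.isend(sendbuf, dst=root, group=group))
        return reqs

    # Caller: launch all gathers, then wait once so communication can overlap.
    # requests = []
    # for root in range(size):
    #     requests += igather(rank, size, group, send_bufs[root], recv_buf, root)
    # for req in requests:
    #     req.wait()
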
--- deepspeed/runtime/custom_collectives.py | 15 +++++++++++++-- deepspeed/runtime/fp16/onebit/adam.py | 11 +++++++---- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/deepspeed/runtime/custom_collectives.py b/deepspeed/runtime/custom_collectives.py index 45699be5abf7..2cb76a181a67 100644 --- a/deepspeed/runtime/custom_collectives.py +++ b/deepspeed/runtime/custom_collectives.py @@ -2,8 +2,20 @@ Copyright 2020 The Microsoft DeepSpeed Team ''' - def my_igather_nccl(rank, size, group, sendbuf, recvbuf, root): + req = [] + import torch.distributed as dist + if rank == root: + for idx in range(size): + if idx != rank: + req.append(dist.irecv(recvbuf[idx], src=idx, group=group)) + else: + recvbuf[rank] = sendbuf + else: + req.append(dist.isend(sendbuf, group=group, dst=root)) + return req + +def my_gather_nccl(rank, size, group, sendbuf, recvbuf, root): import torch.distributed as dist if rank == root: for idx in range(size): @@ -14,7 +26,6 @@ def my_igather_nccl(rank, size, group, sendbuf, recvbuf, root): else: dist.send(sendbuf, group=group, dst=root, tag=987) - def my_igather(rank, size, comm, sendbuf, recbuf, root): req = [] if rank == root: diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index 53fbe8b269cb..668a4bd436f3 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -187,16 +187,20 @@ def compressed_nccl_allreduce(self, recvbuf_scale = self.cupy2torch(cupy_recvbuf_scale) # communication phase 1 - #TODO: dist.sync and try async_op=True method gather_start = time.time() + requests = [] for idx in range(self.size): - my_igather_nccl(self.rank, + requests += my_igather_nccl(self.rank, self.size, self.world_group, sign_list_packed[idx], recvbuf_sign, root=idx) - my_igather_nccl(self.rank, self.size, self.world_group, worker_scale, recvbuf_scale, root=idx) + requests += my_igather_nccl(self.rank, self.size, self.world_group, worker_scale, recvbuf_scale, root=idx) + + for i in range(len(requests)): + requests[i].wait() + gather_end = time.time() cupy_recvbuf_sign = self.torch2cupy(recvbuf_sign) @@ -408,7 +412,6 @@ def compressed_allreduce(self, worker_error, server_error, local_rank): - if self.communication_backend == 'nccl': return self.compressed_nccl_allreduce(buffer_m, worker_error, server_error, local_rank) elif self.communication_backend == 'mpi': From e2ddf4893aa594643ad10fc85d71a40eff04e23f Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 7 Dec 2020 19:25:35 +0000 Subject: [PATCH 17/26] Revert "Add compression backend in constructor. Revert later." This reverts commit df8c40d3105e9f2542a8aa6619e80d675a09753f. 
--- deepspeed/runtime/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 6f9c2f5cd54b..b39e1cc89d81 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -664,7 +664,7 @@ def _configure_basic_optimizer(self, model_parameters): optimizer = FusedLamb(model_parameters, **optimizer_parameters) elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: from deepspeed.runtime.fp16.onebit.adam import Adam - optimizer = Adam(model_parameters, self, **optimizer_parameters, compression_backend='cupy') + optimizer = Adam(model_parameters, self, **optimizer_parameters) else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) From dbd3cff504c2f6119280e0eb3db0791221082ac9 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 7 Dec 2020 19:35:04 +0000 Subject: [PATCH 18/26] improve the 1-bit adam test. --- tests/onebitadam/test_com_reduce_cuda.py | 49 ++++++++++++------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/tests/onebitadam/test_com_reduce_cuda.py b/tests/onebitadam/test_com_reduce_cuda.py index a0dc1ee2a1bb..03f2cca8862c 100644 --- a/tests/onebitadam/test_com_reduce_cuda.py +++ b/tests/onebitadam/test_com_reduce_cuda.py @@ -26,12 +26,12 @@ # Set cuda_aware to True to use CUDA buffers for communication dummy_optim = Adam(dummy_model, cuda_aware=False, - communication_backend='mpi', + communication_backend='nccl', compression_backend='cupy') device = torch.device('cuda', rank % torch.cuda.device_count()) - +# A simulated compression function using torch.distributed def torch_sim(a): a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) scale = a.norm() / np.sqrt(a.numel()) @@ -52,7 +52,6 @@ def torch_sim(a): torch.distributed.barrier() return a_server_compressed, worker_error, server_error - tensor_size = 100 * 2**20 server_size = int(tensor_size / size) if tensor_size % (8 * size) != 0: @@ -62,36 +61,36 @@ def torch_sim(a): right_server_size = right_tensor_size // size # Adding bias to the initialization of the gradient we are communicating # In order to get rid of the case where some elements in the gradient are too small -a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank -print("size of the momentum buffer =", a.shape) - worker_error = torch.zeros(right_tensor_size, device=device) server_error = torch.zeros(right_server_size, device=device) +test_performance = False + iters = 100 -# Warmup -for i in range (iters): - torch_sim(a) +if test_performance: + # Warmup + for i in range (iters): + torch_sim(a) -timers('simulated').start() -for i in range (iters): - torch_sim(a) -timers('simulated').stop() + timers('simulated').start() + for i in range (iters): + torch_sim(a) + timers('simulated').stop() a_torch, worker_error_torch, server_error_torch = torch_sim(a) - torch.cuda.empty_cache() local_rank = rank % torch.cuda.device_count() -# Warmup -for i in range (iters): - dummy_optim.compressed_allreduce(a, worker_error, server_error, local_rank) +if test_performance: + # Warmup + for i in range (iters): + dummy_optim.compressed_allreduce(a, worker_error, server_error, local_rank) -timers('compressed').start() -for i in range(iters): - dummy_optim.compressed_allreduce(a, worker_error, server_error, local_rank) -timers('compressed').stop() + timers('compressed').start() + for i in range(iters): + dummy_optim.compressed_allreduce(a, worker_error, server_error, 
local_rank) + timers('compressed').stop() a_after = dummy_optim.compressed_allreduce(a, worker_error, server_error, local_rank) @@ -102,7 +101,8 @@ def torch_sim(a): mpi_server = torch.chunk(a_after, size)[rank] + server_error torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch -test_correctness = False +test_correctness = True + # If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic # The test would skip those numbers that are too small in compensated_server_m if test_correctness: @@ -115,5 +115,6 @@ def torch_sim(a): else: print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) -timer_names = ['simulated', 'compressed'] -timers.log(names=timer_names, normalizer=iters, memory_breakdown=None) +if test_performance: + timer_names = ['simulated', 'compressed'] + timers.log(names=timer_names, normalizer=iters, memory_breakdown=None) From 7edc3ab2cade95bf2f222812341f168a13c9900e Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Tue, 8 Dec 2020 12:53:07 -0800 Subject: [PATCH 19/26] Refactor comm. and compression backend in 1-bit adam. --- deepspeed/runtime/comm/__init__.py | 0 deepspeed/runtime/comm/mpi.py | 284 ++++++++++++++++++ deepspeed/runtime/comm/nccl.py | 187 ++++++++++++ deepspeed/runtime/compression/__init__.py | 0 deepspeed/runtime/compression/cupy.py | 24 ++ deepspeed/runtime/fp16/onebit/adam.py | 349 ++-------------------- 6 files changed, 513 insertions(+), 331 deletions(-) create mode 100644 deepspeed/runtime/comm/__init__.py create mode 100644 deepspeed/runtime/comm/mpi.py create mode 100644 deepspeed/runtime/comm/nccl.py create mode 100644 deepspeed/runtime/compression/__init__.py create mode 100644 deepspeed/runtime/compression/cupy.py diff --git a/deepspeed/runtime/comm/__init__.py b/deepspeed/runtime/comm/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/runtime/comm/mpi.py b/deepspeed/runtime/comm/mpi.py new file mode 100644 index 000000000000..948e719fb7af --- /dev/null +++ b/deepspeed/runtime/comm/mpi.py @@ -0,0 +1,284 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' + +import cupy +import numpy as np +from mpi4py import MPI + + +class MpiBackend(object): + def __init__(self, cuda_aware): + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + self.cuda_aware = cuda_aware + + def my_igather(self, rank, size, comm, sendbuf, recbuf, root): + req = [] + if rank == root: + for idx in range(size): + if idx != rank: + req.append(comm.Irecv(recbuf[idx], source=idx)) + else: + recbuf[rank] = sendbuf + else: + req.append(comm.Isend(sendbuf, dest=root)) + return req + + def gather_cuda(self, + rank, + world_size, + comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale): + # We do in-place operations on cupy buffers so we do not return any buffers + requests = [] + for idx in range(world_size): + req_sign = my_igather(rank, + world_size, + comm, + cupy_sign_list_packed[idx], + cupy_recvbuf_sign, + root=idx) + requests += req_sign + + for idx in range(world_size): + req_scale = my_igather(rank, + world_size, + comm, + cupy_worker_scale, + cupy_recvbuf_scale, + root=idx) + requests += req_scale + + MPI.Request.Waitall(requests) + + def gather_host(rank, + world_size, + comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale): + + # In-place operations are not possible for newly created cupy arrays + # so we need to return the new 
buffers + numpy_recvbuf_sign = np.zeros([world_size, + cupy_sign_list_packed[rank].size], + dtype=cupy_sign_list_packed[0].dtype) + numpy_recvbuf_scale = np.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) + + # 1. convert from cupy to numpy + numpy_sign_list_packed = cupy_sign_list_packed + + for idx in range(world_size): + numpy_sign_list_packed[idx] = cupy.asnumpy(cupy_sign_list_packed[idx]) + + numpy_worker_scale = cupy.asnumpy(cupy_worker_scale) + numpy_recvbuf_scale = cupy.asnumpy(cupy_recvbuf_scale) + + cupy.cuda.get_current_stream().synchronize() + + # 2. use numpy buffers for communication + requests = [] + + for idx in range(world_size): + req_sign = my_igather(rank, + world_size, + comm, + numpy_sign_list_packed[idx], + numpy_recvbuf_sign, + root=idx) + requests += req_sign + + for idx in range(world_size): + req_scale = my_igather(rank, + world_size, + comm, + numpy_worker_scale, + numpy_recvbuf_scale, + root=idx) + requests += req_scale + + MPI.Request.Waitall(requests) + + # 3. Convert back from numpy to cupy + cupy_recvbuf_sign = cupy.asarray(numpy_recvbuf_sign) + for idx in range(world_size): + cupy_sign_list_packed[idx] = cupy.asarray(numpy_sign_list_packed[idx]) + + cupy_worker_scale = cupy.asarray(numpy_worker_scale) + cupy_recvbuf_scale = cupy.asarray(numpy_recvbuf_scale) + cupy.cuda.get_current_stream().synchronize() + + return cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale + + def allgather_cuda(comm, + cupy_server_sign_packed, + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server): + comm.Allgather(cupy_server_sign_packed, cupy_recvbuf_sign_server) + comm.Allgather(cupy_server_scale, cupy_recvbuf_scale_server) + + def allgather_host(comm, + cupy_server_sign_packed, + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server): + + # 1. Convert cupy to numpy + numpy_recvbuf_sign_server = np.zeros( + [comm.Get_size(), + cupy_server_sign_packed.size], + dtype=cupy_server_sign_packed.dtype) + numpy_recvbuf_scale_server = np.zeros([comm.Get_size(), + 1], + dtype=cupy_server_scale.dtype) + + numpy_server_sign_packed = cupy.asnumpy(cupy_server_sign_packed) + numpy_recvbuf_sign_server = cupy.asnumpy(cupy_recvbuf_sign_server) + numpy_server_scale = cupy.asnumpy(cupy_server_scale) + numpy_recvbuf_scale_server = cupy.asnumpy(cupy_recvbuf_scale_server) + cupy.cuda.get_current_stream().synchronize() + + # 2. Communicate numpy buffers + comm.Allgather(numpy_server_sign_packed, numpy_recvbuf_sign_server) + comm.Allgather(numpy_server_scale, numpy_recvbuf_scale_server) + comm.Barrier() + + # 3. 
Convert numpy back to cupy + cupy_server_sign_packed = cupy.asarray(numpy_server_sign_packed) + cupy_recvbuf_sign_server = cupy.asarray(numpy_recvbuf_sign_server) + cupy_server_scale = cupy.asarray(numpy_server_scale) + cupy_recvbuf_scale_server = cupy.asarray(numpy_recvbuf_scale_server) + cupy.cuda.get_current_stream().synchronize() + + return cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server + + def compressed_allreduce(self, + buffer_m: torch.tensor, + worker_error, + server_error, + local_rank): + + all_start_time = time.time() + original_size = buffer_m.numel() + cupy.cuda.Device(local_rank).use() + + if torch.numel(buffer_m) != torch.numel(worker_error): + empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), + device=buffer_m.device) + buffer_m = torch.cat([buffer_m, empty_tensor]) + + buffer_m.add_(worker_error) + worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) + sign_buffer_m = buffer_m.sign().add_(1).bool() + sign_buffer_m = sign_buffer_m.float() + sign_buffer_m.add_(-0.5).mul_(2.0) + worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) + sign_buffer_m = None + + compensated_buffer_m = buffer_m + compensated_buffer_m.sign_() + compensated_buffer_m = compensated_buffer_m.add_(1).bool() + cupy_worker_scale = self.torch2cupy(worker_scale) + cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m) + compensated_buffer_m = None + + cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, + self.size) + cupy_compensated_buffer_m = None + + cupy_recvbuf_sign = cupy.zeros( + [self.size, + cupy_sign_list_packed[self.rank].size], + dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) + + # Communication Phase 1 + gather_start = time.time() + if self.cuda_aware: + gather_cuda(self.rank, + self.size, + self.comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale) + else: + cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(self.rank, + self.size, + self.comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale) + gather_end = time.time() + + cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( + self.size, + -1) + cupy_recvbuf_sign = None + unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() + cupy_unpacked_sign = None + unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) + worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / self.size) + compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) + unpacked_sign = None + + compensated_server_m.add_(server_error) + server_scale = torch.norm(compensated_server_m) / np.sqrt( + compensated_server_m.numel()) + sign_server_m = compensated_server_m.sign().add_(1).bool() + sign_server_m = sign_server_m.float() + sign_server_m.add_(-0.5).mul_(2.0) + server_error.set_(compensated_server_m - server_scale * sign_server_m) + sign_server_m = None + + compensated_server_m.sign_() + compensated_server_m = compensated_server_m.add_(1).bool() + cupy_server_scale = self.torch2cupy(server_scale) + cupy_compensated_server_m = self.torch2cupy(compensated_server_m) + compensated_server_m = None + + cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) + + cupy_recvbuf_sign_server = cupy.zeros( + [self.size, + cupy_server_sign_packed[0].size], + dtype=cupy_sign_list_packed[0].dtype) + 
cupy_recvbuf_scale_server = cupy.zeros([self.size, + 1], + dtype=cupy_worker_scale.dtype) + + # Communication Phase 2 + if self.cuda_aware: + allgather_cuda(self.comm, + cupy_server_sign_packed[0], + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server) + else: + cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = allgather_host(self.comm, + cupy_server_sign_packed[0], + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server) + + cupy_server_unpacked_sign = (cupy.unpackbits( + cupy_recvbuf_sign_server.flatten())).reshape(self.size, + -1) + cupy_recvbuf_sign_server = None + + server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign) + cupy_server_unpacked_sign = None + + server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) + server_scale = self.cupy2torch(cupy_recvbuf_scale_server) + buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] + + return buffer_m diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py new file mode 100644 index 000000000000..8c71a0fa70f8 --- /dev/null +++ b/deepspeed/runtime/comm/nccl.py @@ -0,0 +1,187 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' + +import torch.distributed as dist + + +class NcclBackend(object): + def __init__(self, group, size, rank): + self.world_group = dist.new_group(ranks=range(dist.get_world_size())) + self.rank = dist.get_rank(group=self.world_group) + self.size = dist.get_world_size(group=self.world_group) + + def my_igather(self, rank, size, group, sendbuf, recvbuf, root): + req = [] + if rank == root: + for idx in range(size): + if idx != rank: + req.append(dist.irecv(recvbuf[idx], src=idx, group=group)) + else: + recvbuf[rank] = sendbuf + else: + req.append(dist.isend(sendbuf, group=group, dst=root)) + return req + + def my_gather(self, rank, size, group, sendbuf, recvbuf, root): + if rank == root: + for idx in range(size): + if idx != rank: + dist.recv(recvbuf[idx], src=idx, group=group) + else: + recvbuf[rank] = sendbuf + else: + dist.send(sendbuf, group=group, dst=root) + + def compressed_allreduce(self, + buffer_m: torch.tensor, + worker_error, + server_error, + local_rank): + + all_start_time = time.time() + original_size = buffer_m.numel() + cupy.cuda.Device(local_rank).use() + + if torch.numel(buffer_m) != torch.numel(worker_error): + empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), + device=buffer_m.device) + buffer_m = torch.cat([buffer_m, empty_tensor]) + + buffer_m.add_(worker_error) + worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) + sign_buffer_m = buffer_m.sign().add_(1).bool() + sign_buffer_m = sign_buffer_m.float() + sign_buffer_m.add_(-0.5).mul_(2.0) + worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) + sign_buffer_m = None + + compensated_buffer_m = buffer_m + compensated_buffer_m.sign_() + compensated_buffer_m = compensated_buffer_m.add_(1).bool() + + cupy_worker_scale = self.torch2cupy(worker_scale) + cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m) + compensated_buffer_m = None + + cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, + self.size) + cupy_compensated_buffer_m = None + + cupy_recvbuf_sign = cupy.zeros( + [self.size, + cupy_sign_list_packed[self.rank].size], + dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) + + sign_list_packed = [None] * self.size + + for idx 
in range(self.size): + sign_list_packed[idx] = self.cupy2torch(cupy_sign_list_packed[idx]) + + recvbuf_sign = self.cupy2torch(cupy_recvbuf_sign) + + worker_scale = self.cupy2torch(cupy_worker_scale) + recvbuf_scale = self.cupy2torch(cupy_recvbuf_scale) + + # communication phase 1 + gather_start = time.time() + requests = [] + for idx in range(self.size): + requests += self.my_igather(self.rank, + self.size, + self.world_group, + sign_list_packed[idx], + recvbuf_sign, + root=idx) + requests += self.my_igather(self.rank, + self.size, + self.world_group, + worker_scale, + recvbuf_scale, + root=idx) + + for i in range(len(requests)): + requests[i].wait() + + gather_end = time.time() + + cupy_recvbuf_sign = self.torch2cupy(recvbuf_sign) + cupy_recvbuf_scale = self.torch2cupy(recvbuf_scale) + + cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( + self.size, + -1) + cupy_recvbuf_sign = None + + unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() + cupy_unpacked_sign = None + + unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) + worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / self.size) + + compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) + unpacked_sign = None + + compensated_server_m.add_(server_error) + server_scale = torch.norm(compensated_server_m) / np.sqrt( + compensated_server_m.numel()) + sign_server_m = compensated_server_m.sign().add_(1).bool() + sign_server_m = sign_server_m.float() + sign_server_m.add_(-0.5).mul_(2.0) + server_error.set_(compensated_server_m - server_scale * sign_server_m) + sign_server_m = None + + compensated_server_m.sign_() + compensated_server_m = compensated_server_m.add_(1).bool() + cupy_server_scale = self.torch2cupy(server_scale) + cupy_compensated_server_m = self.torch2cupy(compensated_server_m) + compensated_server_m = None + + cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) + + cupy_recvbuf_sign_server = cupy.zeros( + [self.size, + cupy_server_sign_packed[0].size], + dtype=cupy_sign_list_packed[0].dtype) + + server_sign_packed = [None] * 1 + recvbuf_sign_server = [None] * self.size + + for idx in range(self.size): + recvbuf_sign_server[idx] = self.cupy2torch(cupy_recvbuf_sign_server[idx]) + + server_sign_packed[0] = self.cupy2torch(cupy_server_sign_packed[0]) + + server_scale = self.cupy2torch(cupy_server_scale) + cupy_recvbuf_scale_server = cupy.zeros([self.size, + 1], + dtype=cupy_worker_scale.dtype) + + recvbuf_scale_server = [None] * self.size + for idx in range(self.size): + recvbuf_scale_server[idx] = self.cupy2torch(cupy_recvbuf_scale_server[idx]) + + # Communication Phase 2 + dist.all_gather(recvbuf_sign_server, server_sign_packed[0]) + dist.all_gather(recvbuf_scale_server, server_scale) + + # need to convert from a tensor list to a single tensor + # dist.all_gather only provides a tensor list as the recv/output buffer + recvbuf_sign_server = torch.stack(recvbuf_sign_server) + + cupy_recvbuf_sign_server = self.torch2cupy(recvbuf_sign_server) + + cupy_server_unpacked_sign = (cupy.unpackbits( + cupy_recvbuf_sign_server.flatten())).reshape(self.size, + -1) + cupy_recvbuf_sign_server = None + + server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign) + cupy_server_unpacked_sign = None + + server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) + server_scale = self.cupy2torch(cupy_recvbuf_scale_server) + buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] + + return buffer_m diff --git 
a/deepspeed/runtime/compression/__init__.py b/deepspeed/runtime/compression/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/runtime/compression/cupy.py b/deepspeed/runtime/compression/cupy.py new file mode 100644 index 000000000000..68e56c68e9d0 --- /dev/null +++ b/deepspeed/runtime/compression/cupy.py @@ -0,0 +1,24 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' + +import cupy +from torch.utils.dlpack import to_dlpack +from torch.utils.dlpack import from_dlpack + + +class CupyBackend(object): + def __init__(self): + pass + + def torch2cupy(self, tensor): + return cupy.fromDlpack(to_dlpack(tensor)) + + def cupy2torch(self, cupy_tensor): + return from_dlpack(cupy_tensor.toDlpack()) + + def compress_by_chunk(self, cupy_bool_tensor, num_chunks): + packed_sign = cupy.packbits(cupy_bool_tensor) + sign_list_packed = cupy.split(packed_sign, num_chunks) + cupy.cuda.get_current_stream().synchronize() + return sign_list_packed diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index 668a4bd436f3..231caaf36246 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -7,17 +7,9 @@ import numpy as np import time import torch.distributed as dist -from torch.utils.dlpack import to_dlpack -from torch.utils.dlpack import from_dlpack + from deepspeed.utils.logging import logger -# Delayed/lazy imports -my_igather_nccl = None -cupy = None -gather_cuda = None -gather_host = None -allgather_cuda = None -allgather_host = None class Adam(torch.optim.Optimizer): """Implements the 1-bit Adam algorithm. Currently GPU-only. @@ -68,8 +60,7 @@ def __init__(self, max_grad_norm=0., amsgrad=False, cuda_aware=False, - communication_backend='nccl', - compression_backend='deepspeed'): + comm_backend_name='nccl'): if amsgrad: raise RuntimeError('1-bit Adam does not support the AMSGrad variant.') @@ -96,326 +87,21 @@ def __init__(self, self.freeze_step = freeze_step self.cuda_aware = cuda_aware - self.communication_backend = communication_backend - self.compression_backend = compression_backend - - global my_igather_nccl - global cupy - global gather_cuda, gather_host, allgather_cuda, allgather_host + self.comm_backend_name = comm_backend_name - if self.compression_backend == 'cupy': - import cupy + # Empty initializer. Set handle based on the comm backend as follows. + self.comm_backend_handle = None - if self.communication_backend == 'nccl': + if self.comm_backend_name == 'nccl': assert dist.is_initialized() == True, "Please initialize the torch distributed backend." 
- self.world_group = dist.new_group(ranks=range(dist.get_world_size())) - self.rank = dist.get_rank(group=self.world_group) - self.size = dist.get_world_size(group=self.world_group) - from deepspeed.runtime.custom_collectives import my_igather_nccl - - elif self.communication_backend == 'mpi': - from mpi4py import MPI - self.comm = MPI.COMM_WORLD - self.rank = self.comm.Get_rank() - self.size = self.comm.Get_size() - from deepspeed.runtime.custom_collectives import gather_cuda, gather_host, allgather_cuda, allgather_host - - self.divider = int(self.size * 8 / np.gcd(self.size, 8)) - - def torch2cupy(self, tensor): - return cupy.fromDlpack(to_dlpack(tensor)) - - def cupy2torch(self, cupy_tensor): - return from_dlpack(cupy_tensor.toDlpack()) - - def compress_by_chunk(self, cupy_bool_tensor, num_chunks): - packed_sign = cupy.packbits(cupy_bool_tensor) - sign_list_packed = cupy.split(packed_sign, num_chunks) - cupy.cuda.get_current_stream().synchronize() - return sign_list_packed - - - def compressed_nccl_allreduce(self, - buffer_m: torch.tensor, - worker_error, - server_error, - local_rank): - - all_start_time = time.time() - original_size = buffer_m.numel() - cupy.cuda.Device(local_rank).use() - - if torch.numel(buffer_m) != torch.numel(worker_error): - empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), - device=buffer_m.device) - buffer_m = torch.cat([buffer_m, empty_tensor]) - - buffer_m.add_(worker_error) - worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) - sign_buffer_m = buffer_m.sign().add_(1).bool() - sign_buffer_m = sign_buffer_m.float() - sign_buffer_m.add_(-0.5).mul_(2.0) - worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) - sign_buffer_m = None - - compensated_buffer_m = buffer_m - compensated_buffer_m.sign_() - compensated_buffer_m = compensated_buffer_m.add_(1).bool() - - cupy_worker_scale = self.torch2cupy(worker_scale) - cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m) - compensated_buffer_m = None - - cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, - self.size) - cupy_compensated_buffer_m = None - - cupy_recvbuf_sign = cupy.zeros( - [self.size, - cupy_sign_list_packed[self.rank].size], - dtype=cupy_sign_list_packed[0].dtype) - cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) - - sign_list_packed = [None] * self.size + from deepspeed.runtime.comm.nccl import NcclBackend + self.comm_backend_handle = NcclBackend() - for idx in range(self.size): - sign_list_packed[idx] = self.cupy2torch(cupy_sign_list_packed[idx]) + elif self.comm_backend_name == 'mpi': + from deepspeed.runtime.comm.mpi import MpiBackend + self.comm_backend_handle = MpiBackend(cuda_aware) - recvbuf_sign = self.cupy2torch(cupy_recvbuf_sign) - - worker_scale = self.cupy2torch(cupy_worker_scale) - recvbuf_scale = self.cupy2torch(cupy_recvbuf_scale) - - # communication phase 1 - gather_start = time.time() - requests = [] - for idx in range(self.size): - requests += my_igather_nccl(self.rank, - self.size, - self.world_group, - sign_list_packed[idx], - recvbuf_sign, - root=idx) - requests += my_igather_nccl(self.rank, self.size, self.world_group, worker_scale, recvbuf_scale, root=idx) - - for i in range(len(requests)): - requests[i].wait() - - gather_end = time.time() - - cupy_recvbuf_sign = self.torch2cupy(recvbuf_sign) - cupy_recvbuf_scale = self.torch2cupy(recvbuf_scale) - - cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( - self.size, - -1) - 
cupy_recvbuf_sign = None - - unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() - cupy_unpacked_sign = None - - unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) - worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / self.size) - - compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) - unpacked_sign = None - - compensated_server_m.add_(server_error) - server_scale = torch.norm(compensated_server_m) / np.sqrt( - compensated_server_m.numel()) - sign_server_m = compensated_server_m.sign().add_(1).bool() - sign_server_m = sign_server_m.float() - sign_server_m.add_(-0.5).mul_(2.0) - server_error.set_(compensated_server_m - server_scale * sign_server_m) - sign_server_m = None - - compensated_server_m.sign_() - compensated_server_m = compensated_server_m.add_(1).bool() - cupy_server_scale = self.torch2cupy(server_scale) - cupy_compensated_server_m = self.torch2cupy(compensated_server_m) - compensated_server_m = None - - cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) - - cupy_recvbuf_sign_server = cupy.zeros( - [self.size, - cupy_server_sign_packed[0].size], - dtype=cupy_sign_list_packed[0].dtype) - - server_sign_packed = [None] * 1 - recvbuf_sign_server = [None] * self.size - - for idx in range(self.size): - recvbuf_sign_server[idx] = self.cupy2torch(cupy_recvbuf_sign_server[idx]) - - server_sign_packed[0] = self.cupy2torch(cupy_server_sign_packed[0]) - - server_scale = self.cupy2torch(cupy_server_scale) - cupy_recvbuf_scale_server = cupy.zeros([self.size, - 1], - dtype=cupy_worker_scale.dtype) - - recvbuf_scale_server = [None] * self.size - for idx in range(self.size): - recvbuf_scale_server[idx] = self.cupy2torch(cupy_recvbuf_scale_server[idx]) - - # Communication Phase 2 - dist.all_gather(recvbuf_sign_server, server_sign_packed[0]) - dist.all_gather(recvbuf_scale_server, server_scale) - - # need to convert from a tensor list to a single tensor - # dist.all_gather only provides a tensor list as the recv/output buffer - recvbuf_sign_server = torch.stack(recvbuf_sign_server) - - cupy_recvbuf_sign_server = self.torch2cupy(recvbuf_sign_server) - - cupy_server_unpacked_sign = (cupy.unpackbits( - cupy_recvbuf_sign_server.flatten())).reshape(self.size, - -1) - cupy_recvbuf_sign_server = None - - server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign) - cupy_server_unpacked_sign = None - - server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) - server_scale = self.cupy2torch(cupy_recvbuf_scale_server) - buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] - - return buffer_m - - def compressed_mpi_allreduce(self, - buffer_m: torch.tensor, - worker_error, - server_error, - local_rank): - - all_start_time = time.time() - original_size = buffer_m.numel() - cupy.cuda.Device(local_rank).use() - - if torch.numel(buffer_m) != torch.numel(worker_error): - empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), - device=buffer_m.device) - buffer_m = torch.cat([buffer_m, empty_tensor]) - - buffer_m.add_(worker_error) - worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) - sign_buffer_m = buffer_m.sign().add_(1).bool() - sign_buffer_m = sign_buffer_m.float() - sign_buffer_m.add_(-0.5).mul_(2.0) - worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) - sign_buffer_m = None - - compensated_buffer_m = buffer_m - compensated_buffer_m.sign_() - compensated_buffer_m = compensated_buffer_m.add_(1).bool() - cupy_worker_scale = self.torch2cupy(worker_scale) - 
cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m) - compensated_buffer_m = None - - cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, - self.size) - cupy_compensated_buffer_m = None - - cupy_recvbuf_sign = cupy.zeros( - [self.size, - cupy_sign_list_packed[self.rank].size], - dtype=cupy_sign_list_packed[0].dtype) - cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) - - # Communication Phase 1 - gather_start = time.time() - if self.cuda_aware: - gather_cuda(self.rank, - self.size, - self.comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) - else: - cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(self.rank, - self.size, - self.comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) - gather_end = time.time() - - cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( - self.size, - -1) - cupy_recvbuf_sign = None - unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() - cupy_unpacked_sign = None - unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) - worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / self.size) - compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) - unpacked_sign = None - - compensated_server_m.add_(server_error) - server_scale = torch.norm(compensated_server_m) / np.sqrt( - compensated_server_m.numel()) - sign_server_m = compensated_server_m.sign().add_(1).bool() - sign_server_m = sign_server_m.float() - sign_server_m.add_(-0.5).mul_(2.0) - server_error.set_(compensated_server_m - server_scale * sign_server_m) - sign_server_m = None - - compensated_server_m.sign_() - compensated_server_m = compensated_server_m.add_(1).bool() - cupy_server_scale = self.torch2cupy(server_scale) - cupy_compensated_server_m = self.torch2cupy(compensated_server_m) - compensated_server_m = None - - cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) - - cupy_recvbuf_sign_server = cupy.zeros( - [self.size, - cupy_server_sign_packed[0].size], - dtype=cupy_sign_list_packed[0].dtype) - cupy_recvbuf_scale_server = cupy.zeros([self.size, - 1], - dtype=cupy_worker_scale.dtype) - - # Communication Phase 2 - if self.cuda_aware: - allgather_cuda(self.comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server) - else: - cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = allgather_host(self.comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server) - - cupy_server_unpacked_sign = (cupy.unpackbits( - cupy_recvbuf_sign_server.flatten())).reshape(self.size, - -1) - cupy_recvbuf_sign_server = None - - server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign) - cupy_server_unpacked_sign = None - - server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) - server_scale = self.cupy2torch(cupy_recvbuf_scale_server) - buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] - - return buffer_m - - def compressed_allreduce(self, - buffer_m: torch.tensor, - worker_error, - server_error, - local_rank): - if self.communication_backend == 'nccl': - return self.compressed_nccl_allreduce(buffer_m, worker_error, server_error, local_rank) - elif self.communication_backend == 'mpi': - return self.compressed_mpi_allreduce(buffer_m, worker_error, server_error, 
local_rank) + self.divider = int(self.size * 8 / np.gcd(self.size, 8)) def step(self, closure=None, grads=None): """Performs a single optimization step. @@ -526,11 +212,12 @@ def step(self, closure=None, grads=None): grad = None if self.size > 1: - exp_avg.set_(self.compressed_allreduce( - exp_avg, - state['worker_error'], - state['server_error'], - self.deepspeed.local_rank)) + exp_avg.set_( + self.comm_backend_handle.compressed_allreduce( + exp_avg, + state['worker_error'], + state['server_error'], + self.deepspeed.local_rank)) if self.initialize: update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) From 0813d117071f962cc0e4f2697b18ccb768805bff Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Tue, 8 Dec 2020 13:10:41 -0800 Subject: [PATCH 20/26] Fix the test. --- tests/onebitadam/test_com_reduce_cuda.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/onebitadam/test_com_reduce_cuda.py b/tests/onebitadam/test_com_reduce_cuda.py index 03f2cca8862c..2eaba4dcb8bd 100644 --- a/tests/onebitadam/test_com_reduce_cuda.py +++ b/tests/onebitadam/test_com_reduce_cuda.py @@ -31,6 +31,7 @@ device = torch.device('cuda', rank % torch.cuda.device_count()) + # A simulated compression function using torch.distributed def torch_sim(a): a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) @@ -52,6 +53,7 @@ def torch_sim(a): torch.distributed.barrier() return a_server_compressed, worker_error, server_error + tensor_size = 100 * 2**20 server_size = int(tensor_size / size) if tensor_size % (8 * size) != 0: @@ -59,8 +61,11 @@ def torch_sim(a): else: right_tensor_size = tensor_size right_server_size = right_tensor_size // size + # Adding bias to the initialization of the gradient we are communicating # In order to get rid of the case where some elements in the gradient are too small +a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + worker_error = torch.zeros(right_tensor_size, device=device) server_error = torch.zeros(right_server_size, device=device) @@ -70,11 +75,11 @@ def torch_sim(a): if test_performance: # Warmup - for i in range (iters): + for i in range(iters): torch_sim(a) timers('simulated').start() - for i in range (iters): + for i in range(iters): torch_sim(a) timers('simulated').stop() @@ -84,7 +89,7 @@ def torch_sim(a): if test_performance: # Warmup - for i in range (iters): + for i in range(iters): dummy_optim.compressed_allreduce(a, worker_error, server_error, local_rank) timers('compressed').start() From 4c3c7772180eecefee67570d50a6f99f57b86e77 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Tue, 8 Dec 2020 23:17:08 +0000 Subject: [PATCH 21/26] Fix runtime errors and typos in nccl backend --- deepspeed/runtime/comm/mpi.py | 2 +- deepspeed/runtime/comm/nccl.py | 51 ++++++++++++++---------- deepspeed/runtime/fp16/onebit/adam.py | 2 + tests/onebitadam/test_com_reduce_cuda.py | 13 ++++-- 4 files changed, 41 insertions(+), 27 deletions(-) diff --git a/deepspeed/runtime/comm/mpi.py b/deepspeed/runtime/comm/mpi.py index 948e719fb7af..32eacee0009f 100644 --- a/deepspeed/runtime/comm/mpi.py +++ b/deepspeed/runtime/comm/mpi.py @@ -2,11 +2,11 @@ Copyright 2020 The Microsoft DeepSpeed Team ''' +import torch import cupy import numpy as np from mpi4py import MPI - class MpiBackend(object): def __init__(self, cuda_aware): self.comm = MPI.COMM_WORLD diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index 8c71a0fa70f8..49f57f57df36 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ 
-2,14 +2,21 @@ Copyright 2020 The Microsoft DeepSpeed Team ''' +import torch import torch.distributed as dist +import time +import cupy +import numpy as np +from deepspeed.runtime.compression.cupy import CupyBackend class NcclBackend(object): - def __init__(self, group, size, rank): + + def __init__(self): self.world_group = dist.new_group(ranks=range(dist.get_world_size())) self.rank = dist.get_rank(group=self.world_group) self.size = dist.get_world_size(group=self.world_group) + self.compression_backend = CupyBackend() def my_igather(self, rank, size, group, sendbuf, recvbuf, root): req = [] @@ -60,11 +67,11 @@ def compressed_allreduce(self, compensated_buffer_m.sign_() compensated_buffer_m = compensated_buffer_m.add_(1).bool() - cupy_worker_scale = self.torch2cupy(worker_scale) - cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m) + cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale) + cupy_compensated_buffer_m = self.compression_backend.torch2cupy(compensated_buffer_m) compensated_buffer_m = None - cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, + cupy_sign_list_packed = self.compression_backend.compress_by_chunk(cupy_compensated_buffer_m, self.size) cupy_compensated_buffer_m = None @@ -77,12 +84,12 @@ def compressed_allreduce(self, sign_list_packed = [None] * self.size for idx in range(self.size): - sign_list_packed[idx] = self.cupy2torch(cupy_sign_list_packed[idx]) + sign_list_packed[idx] = self.compression_backend.cupy2torch(cupy_sign_list_packed[idx]) - recvbuf_sign = self.cupy2torch(cupy_recvbuf_sign) + recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign) - worker_scale = self.cupy2torch(cupy_worker_scale) - recvbuf_scale = self.cupy2torch(cupy_recvbuf_scale) + worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale) + recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale) # communication phase 1 gather_start = time.time() @@ -106,19 +113,19 @@ def compressed_allreduce(self, gather_end = time.time() - cupy_recvbuf_sign = self.torch2cupy(recvbuf_sign) - cupy_recvbuf_scale = self.torch2cupy(recvbuf_scale) + cupy_recvbuf_sign = self.compression_backend.torch2cupy(recvbuf_sign) + cupy_recvbuf_scale = self.compression_backend.torch2cupy(recvbuf_scale) cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( self.size, -1) cupy_recvbuf_sign = None - unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() + unpacked_sign = self.compression_backend.cupy2torch(cupy_unpacked_sign).float() cupy_unpacked_sign = None unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) - worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / self.size) + worker_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_(1 / self.size) compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) unpacked_sign = None @@ -134,11 +141,11 @@ def compressed_allreduce(self, compensated_server_m.sign_() compensated_server_m = compensated_server_m.add_(1).bool() - cupy_server_scale = self.torch2cupy(server_scale) - cupy_compensated_server_m = self.torch2cupy(compensated_server_m) + cupy_server_scale = self.compression_backend.torch2cupy(server_scale) + cupy_compensated_server_m = self.compression_backend.torch2cupy(compensated_server_m) compensated_server_m = None - cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) + cupy_server_sign_packed = self.compression_backend.compress_by_chunk(cupy_compensated_server_m, 1) cupy_recvbuf_sign_server = 
cupy.zeros( [self.size, @@ -149,18 +156,18 @@ def compressed_allreduce(self, recvbuf_sign_server = [None] * self.size for idx in range(self.size): - recvbuf_sign_server[idx] = self.cupy2torch(cupy_recvbuf_sign_server[idx]) + recvbuf_sign_server[idx] = self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx]) - server_sign_packed[0] = self.cupy2torch(cupy_server_sign_packed[0]) + server_sign_packed[0] = self.compression_backend.cupy2torch(cupy_server_sign_packed[0]) - server_scale = self.cupy2torch(cupy_server_scale) + server_scale = self.compression_backend.cupy2torch(cupy_server_scale) cupy_recvbuf_scale_server = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) recvbuf_scale_server = [None] * self.size for idx in range(self.size): - recvbuf_scale_server[idx] = self.cupy2torch(cupy_recvbuf_scale_server[idx]) + recvbuf_scale_server[idx] = self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx]) # Communication Phase 2 dist.all_gather(recvbuf_sign_server, server_sign_packed[0]) @@ -170,18 +177,18 @@ def compressed_allreduce(self, # dist.all_gather only provides a tensor list as the recv/output buffer recvbuf_sign_server = torch.stack(recvbuf_sign_server) - cupy_recvbuf_sign_server = self.torch2cupy(recvbuf_sign_server) + cupy_recvbuf_sign_server = self.compression_backend.torch2cupy(recvbuf_sign_server) cupy_server_unpacked_sign = (cupy.unpackbits( cupy_recvbuf_sign_server.flatten())).reshape(self.size, -1) cupy_recvbuf_sign_server = None - server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign) + server_unpacked_sign = self.compression_backend.cupy2torch(cupy_server_unpacked_sign) cupy_server_unpacked_sign = None server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) - server_scale = self.cupy2torch(cupy_recvbuf_scale_server) + server_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale_server) buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] return buffer_m diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index 231caaf36246..5f0e485e87b9 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -101,6 +101,8 @@ def __init__(self, from deepspeed.runtime.comm.mpi import MpiBackend self.comm_backend_handle = MpiBackend(cuda_aware) + self.size = self.comm_backend_handle.size + self.divider = int(self.size * 8 / np.gcd(self.size, 8)) def step(self, closure=None, grads=None): diff --git a/tests/onebitadam/test_com_reduce_cuda.py b/tests/onebitadam/test_com_reduce_cuda.py index 2eaba4dcb8bd..2c35900b6b55 100644 --- a/tests/onebitadam/test_com_reduce_cuda.py +++ b/tests/onebitadam/test_com_reduce_cuda.py @@ -4,7 +4,10 @@ import torch.distributed as dist import numpy as np import deepspeed + from deepspeed.runtime.fp16.onebit.adam import Adam +from deepspeed.runtime.comm.nccl import NcclBackend + # Configure wall clock timer from deepspeed.utils.timer import SynchronizedWallClockTimer @@ -24,10 +27,12 @@ dummy_model = [torch.nn.Parameter(torch.ones(10))] # Set cuda_aware to True to use CUDA buffers for communication -dummy_optim = Adam(dummy_model, - cuda_aware=False, - communication_backend='nccl', - compression_backend='cupy') + +#dummy_optim = Adam(dummy_model, +# cuda_aware=False, +# comm_backend_name='nccl') + +dummy_optim = NcclBackend() device = torch.device('cuda', rank % torch.cuda.device_count()) From d495c7a258f157f8bcba1b055de661f3a2086028 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Tue, 8 Dec 2020 
15:47:39 -0800 Subject: [PATCH 22/26] fix mpi backend. modify tests. --- deepspeed/runtime/comm/mpi.py | 43 ++++++--- tests/onebitadam/test_mpi_backend.py | 88 +++++++++++++++++++ ...om_reduce_cuda.py => test_nccl_backend.py} | 52 ++--------- tests/onebitadam/test_nccl_perf.py | 63 +++++++++++++ 4 files changed, 185 insertions(+), 61 deletions(-) create mode 100644 tests/onebitadam/test_mpi_backend.py rename tests/onebitadam/{test_com_reduce_cuda.py => test_nccl_backend.py} (68%) create mode 100644 tests/onebitadam/test_nccl_perf.py diff --git a/deepspeed/runtime/comm/mpi.py b/deepspeed/runtime/comm/mpi.py index 32eacee0009f..450c4522c935 100644 --- a/deepspeed/runtime/comm/mpi.py +++ b/deepspeed/runtime/comm/mpi.py @@ -4,15 +4,20 @@ import torch import cupy +import time import numpy as np from mpi4py import MPI +from deepspeed.runtime.compression.cupy import CupyBackend + + class MpiBackend(object): def __init__(self, cuda_aware): self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() self.size = self.comm.Get_size() self.cuda_aware = cuda_aware + self.compression_backend = CupyBackend() def my_igather(self, rank, size, comm, sendbuf, recbuf, root): req = [] @@ -56,7 +61,8 @@ def gather_cuda(self, MPI.Request.Waitall(requests) - def gather_host(rank, + def gather_host(self, + rank, world_size, comm, cupy_sign_list_packed, @@ -116,7 +122,8 @@ def gather_host(rank, return cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale - def allgather_cuda(comm, + def allgather_cuda(self, + comm, cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, @@ -124,7 +131,8 @@ def allgather_cuda(comm, comm.Allgather(cupy_server_sign_packed, cupy_recvbuf_sign_server) comm.Allgather(cupy_server_scale, cupy_recvbuf_scale_server) - def allgather_host(comm, + def allgather_host(self, + comm, cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, @@ -185,12 +193,14 @@ def compressed_allreduce(self, compensated_buffer_m = buffer_m compensated_buffer_m.sign_() compensated_buffer_m = compensated_buffer_m.add_(1).bool() - cupy_worker_scale = self.torch2cupy(worker_scale) - cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m) + cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale) + cupy_compensated_buffer_m = self.compression_backend.torch2cupy( + compensated_buffer_m) compensated_buffer_m = None - cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, - self.size) + cupy_sign_list_packed = self.compression_backend.compress_by_chunk( + cupy_compensated_buffer_m, + self.size) cupy_compensated_buffer_m = None cupy_recvbuf_sign = cupy.zeros( @@ -223,10 +233,11 @@ def compressed_allreduce(self, self.size, -1) cupy_recvbuf_sign = None - unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() + unpacked_sign = self.compression_backend.cupy2torch(cupy_unpacked_sign).float() cupy_unpacked_sign = None unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) - worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / self.size) + worker_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_( + 1 / self.size) compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) unpacked_sign = None @@ -241,11 +252,14 @@ def compressed_allreduce(self, compensated_server_m.sign_() compensated_server_m = compensated_server_m.add_(1).bool() - cupy_server_scale = self.torch2cupy(server_scale) - cupy_compensated_server_m = self.torch2cupy(compensated_server_m) + cupy_server_scale = 
self.compression_backend.torch2cupy(server_scale) + cupy_compensated_server_m = self.compression_backend.torch2cupy( + compensated_server_m) compensated_server_m = None - cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) + cupy_server_sign_packed = self.compression_backend.compress_by_chunk( + cupy_compensated_server_m, + 1) cupy_recvbuf_sign_server = cupy.zeros( [self.size, @@ -274,11 +288,12 @@ def compressed_allreduce(self, -1) cupy_recvbuf_sign_server = None - server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign) + server_unpacked_sign = self.compression_backend.cupy2torch( + cupy_server_unpacked_sign) cupy_server_unpacked_sign = None server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) - server_scale = self.cupy2torch(cupy_recvbuf_scale_server) + server_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale_server) buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] return buffer_m diff --git a/tests/onebitadam/test_mpi_backend.py b/tests/onebitadam/test_mpi_backend.py new file mode 100644 index 000000000000..7c1b59737532 --- /dev/null +++ b/tests/onebitadam/test_mpi_backend.py @@ -0,0 +1,88 @@ +from mpi4py import MPI +import time +import torch +import torch.distributed as dist +import numpy as np +import deepspeed + +from deepspeed.runtime.comm.mpi import MpiBackend + +comm = MPI.COMM_WORLD +size = comm.Get_size() +rank = comm.Get_rank() + +#TODO: Detect the hostname we are running on automatically +torch.distributed.init_process_group(backend='nccl', + init_method='tcp://worker-0:2245', + world_size=size, + rank=rank) + +# Change cuda_aware to True to test out CUDA-Aware MPI communication +backend = MpiBackend(cuda_aware=False) + +device = torch.device('cuda', rank % torch.cuda.device_count()) + + +# A simulated compression function using torch.distributed +def torch_sim(a): + a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + scale = a.norm() / np.sqrt(a.numel()) + a_compressed = scale * a_sign + a_sign = None + worker_error = a - a_compressed + dist.all_reduce(a_compressed) + a_compressed.mul_(1 / dist.get_world_size()) + a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) + server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] + a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) + a_server_compressed = torch.cat( + [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + rank = dist.get_rank() + server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] + torch.cuda.synchronize() + torch.distributed.barrier() + return a_server_compressed, worker_error, server_error + + +tensor_size = 100 * 2**20 +server_size = int(tensor_size / size) +if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) +else: + right_tensor_size = tensor_size +right_server_size = right_tensor_size // size + +# Adding bias to the initialization of the gradient we are communicating +# In order to get rid of the case where some elements in the gradient are too small +a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + +worker_error = torch.zeros(right_tensor_size, device=device) +server_error = torch.zeros(right_server_size, device=device) + +a_torch, worker_error_torch, server_error_torch = torch_sim(a) +torch.cuda.empty_cache() +local_rank = rank % torch.cuda.device_count() + 
+a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) + +threshold = 1e-6 +magnitude_threshold = 1e-6 +diff_mask = (a_after - a_torch) > threshold +diff_server_mask = torch.chunk(diff_mask, size)[rank] +mpi_server = torch.chunk(a_after, size)[rank] + server_error +torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch + +test_correctness = True + +# If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic +# The test would skip those numbers that are too small in compensated_server_m +if test_correctness: + if torch.sum(diff_server_mask) == 0: + print('Successfully passed the test for MPI Backend at Rank {}'.format(rank)) + else: + check_mag_mask = mpi_server[diff_mask] > magnitude_threshold + if torch.sum(check_mag_mask) == 0: + print('Successfully passed the test for MPI Backend at Rank {}'.format(rank)) + else: + print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) diff --git a/tests/onebitadam/test_com_reduce_cuda.py b/tests/onebitadam/test_nccl_backend.py similarity index 68% rename from tests/onebitadam/test_com_reduce_cuda.py rename to tests/onebitadam/test_nccl_backend.py index 2c35900b6b55..be4acc8a31d8 100644 --- a/tests/onebitadam/test_com_reduce_cuda.py +++ b/tests/onebitadam/test_nccl_backend.py @@ -5,15 +5,8 @@ import numpy as np import deepspeed -from deepspeed.runtime.fp16.onebit.adam import Adam from deepspeed.runtime.comm.nccl import NcclBackend - -# Configure wall clock timer -from deepspeed.utils.timer import SynchronizedWallClockTimer - -timers = SynchronizedWallClockTimer() - comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() @@ -24,15 +17,7 @@ world_size=size, rank=rank) -dummy_model = [torch.nn.Parameter(torch.ones(10))] - -# Set cuda_aware to True to use CUDA buffers for communication - -#dummy_optim = Adam(dummy_model, -# cuda_aware=False, -# comm_backend_name='nccl') - -dummy_optim = NcclBackend() +backend = NcclBackend() device = torch.device('cuda', rank % torch.cuda.device_count()) @@ -74,35 +59,11 @@ def torch_sim(a): worker_error = torch.zeros(right_tensor_size, device=device) server_error = torch.zeros(right_server_size, device=device) -test_performance = False - -iters = 100 - -if test_performance: - # Warmup - for i in range(iters): - torch_sim(a) - - timers('simulated').start() - for i in range(iters): - torch_sim(a) - timers('simulated').stop() - a_torch, worker_error_torch, server_error_torch = torch_sim(a) torch.cuda.empty_cache() local_rank = rank % torch.cuda.device_count() -if test_performance: - # Warmup - for i in range(iters): - dummy_optim.compressed_allreduce(a, worker_error, server_error, local_rank) - - timers('compressed').start() - for i in range(iters): - dummy_optim.compressed_allreduce(a, worker_error, server_error, local_rank) - timers('compressed').stop() - -a_after = dummy_optim.compressed_allreduce(a, worker_error, server_error, local_rank) +a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) threshold = 1e-6 magnitude_threshold = 1e-6 @@ -117,14 +78,11 @@ def torch_sim(a): # The test would skip those numbers that are too small in compensated_server_m if test_correctness: if torch.sum(diff_server_mask) == 0: - print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) + print('Successfully passed the test for NCCL Backend at Rank {}'.format(rank)) else: check_mag_mask = mpi_server[diff_mask] > magnitude_threshold if torch.sum(check_mag_mask) == 0: - 
print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) + print( + 'Successfully passed the test for NCCL Backend at Rank {}'.format(rank)) else: print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) - -if test_performance: - timer_names = ['simulated', 'compressed'] - timers.log(names=timer_names, normalizer=iters, memory_breakdown=None) diff --git a/tests/onebitadam/test_nccl_perf.py b/tests/onebitadam/test_nccl_perf.py new file mode 100644 index 000000000000..4b2cd667d7bb --- /dev/null +++ b/tests/onebitadam/test_nccl_perf.py @@ -0,0 +1,63 @@ +from mpi4py import MPI +import time +import torch +import torch.distributed as dist +import numpy as np +import deepspeed + +from deepspeed.runtime.comm.nccl import NcclBackend + +# Configure wall clock timer +from deepspeed.utils.timer import SynchronizedWallClockTimer + +timers = SynchronizedWallClockTimer() + +comm = MPI.COMM_WORLD +size = comm.Get_size() +rank = comm.Get_rank() + +#TODO: Detect the hostname we are running on automatically +torch.distributed.init_process_group(backend='nccl', + init_method='tcp://worker-0:2245', + world_size=size, + rank=rank) + +backend = NcclBackend() + +device = torch.device('cuda', rank % torch.cuda.device_count()) + +tensor_size = 100 * 2**20 +server_size = int(tensor_size / size) +if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) +else: + right_tensor_size = tensor_size +right_server_size = right_tensor_size // size + +# Adding bias to the initialization of the gradient we are communicating +# In order to get rid of the case where some elements in the gradient are too small +a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + +worker_error = torch.zeros(right_tensor_size, device=device) +server_error = torch.zeros(right_server_size, device=device) + +iters = 100 + +local_rank = rank % torch.cuda.device_count() + +# Warmup +for i in range(iters): + backend.compressed_allreduce(a, worker_error, server_error, local_rank) + +time_list = [] + +for i in range(iters): + timers('compressed_allreduce').start() + backend.compressed_allreduce(a, worker_error, server_error, local_rank) + timers('compressed_allreduce').stop() + time_list += timers('compressed_allreduce').elapsed() + +timer_names = ['compressed_allreduce'] +timers.log(names=timer_names, normalizer=iters, memory_breakdown=None) + +print(time_list) From 60f3344b52260d4d49de30765062b2e41107ade5 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Wed, 9 Dec 2020 00:48:48 +0000 Subject: [PATCH 23/26] modify nccl perf test. 
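This revision reports per-iteration latency for the compressed allreduce. A rough algorithmic-bandwidth figure can be derived from that number; the following is a minimal sketch, not part of the patch, that reuses the test's tensor_size together with the float_size, convert, and meanlat values this patch introduces, and treats the uncompressed fp32 tensor as the payload:

    # Hedged sketch: algorithmic bandwidth from the mean latency, assuming
    # meanlat is in milliseconds (as computed above) and the payload is the
    # full uncompressed tensor rather than the 1-bit wire format.
    payload_bytes = tensor_size * float_size
    algo_bw_gb_s = payload_bytes / (meanlat / convert) / 1e9
    print("algorithmic bandwidth = {:.2f} GB/s".format(algo_bw_gb_s))
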
--- tests/onebitadam/test_nccl_perf.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/tests/onebitadam/test_nccl_perf.py b/tests/onebitadam/test_nccl_perf.py index 4b2cd667d7bb..e079838288a0 100644 --- a/tests/onebitadam/test_nccl_perf.py +++ b/tests/onebitadam/test_nccl_perf.py @@ -10,6 +10,8 @@ # Configure wall clock timer from deepspeed.utils.timer import SynchronizedWallClockTimer +from statistics import mean + timers = SynchronizedWallClockTimer() comm = MPI.COMM_WORLD @@ -26,7 +28,7 @@ device = torch.device('cuda', rank % torch.cuda.device_count()) -tensor_size = 100 * 2**20 +tensor_size = 300 * 2**20 server_size = int(tensor_size / size) if tensor_size % (8 * size) != 0: right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) @@ -41,12 +43,13 @@ worker_error = torch.zeros(right_tensor_size, device=device) server_error = torch.zeros(right_server_size, device=device) +warmup = 10 iters = 100 local_rank = rank % torch.cuda.device_count() # Warmup -for i in range(iters): +for i in range(warmup): backend.compressed_allreduce(a, worker_error, server_error, local_rank) time_list = [] @@ -55,9 +58,21 @@ timers('compressed_allreduce').start() backend.compressed_allreduce(a, worker_error, server_error, local_rank) timers('compressed_allreduce').stop() - time_list += timers('compressed_allreduce').elapsed() + time_list.append(timers('compressed_allreduce').elapsed()) timer_names = ['compressed_allreduce'] -timers.log(names=timer_names, normalizer=iters, memory_breakdown=None) +timers.log(names=timer_names, normalizer=1, memory_breakdown=None) + +places = 2 +convert = 1e3 +float_size = 4 + +if rank == 0: + for i in range(iters): + lat = time_list[i] + print("latency = ", lat * convert) -print(time_list) +minlat = round(min(time_list) * convert) +maxlat = round(max(time_list) * convert) +meanlat = round(mean(time_list) * convert, places) +print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, maxlat, meanlat)) From c1ab39e09e7768a63d9be3649a18620bfa04ee41 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Wed, 9 Dec 2020 01:24:20 +0000 Subject: [PATCH 24/26] fix mpi side errors. 
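The runtime errors fixed here share one cause: gather_cuda, gather_host, allgather_cuda, allgather_host, and my_igather became methods of MpiBackend earlier in this series but were still being called as free functions. A standalone illustration of that failure mode (a sketch, not code taken from the series):

    class Backend:
        def helper(self):
            return 42

        def broken(self):
            return helper()        # NameError at call time: 'helper' is not defined

        def fixed(self):
            return self.helper()   # resolved through the instance
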
--- deepspeed/runtime/comm/mpi.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/deepspeed/runtime/comm/mpi.py b/deepspeed/runtime/comm/mpi.py index 450c4522c935..532f03d940cd 100644 --- a/deepspeed/runtime/comm/mpi.py +++ b/deepspeed/runtime/comm/mpi.py @@ -42,7 +42,7 @@ def gather_cuda(self, # We do in-place operations on cupy buffers so we do not return any buffers requests = [] for idx in range(world_size): - req_sign = my_igather(rank, + req_sign = self.my_igather(rank, world_size, comm, cupy_sign_list_packed[idx], @@ -51,7 +51,7 @@ def gather_cuda(self, requests += req_sign for idx in range(world_size): - req_scale = my_igather(rank, + req_scale = self.my_igather(rank, world_size, comm, cupy_worker_scale, @@ -92,7 +92,7 @@ def gather_host(self, requests = [] for idx in range(world_size): - req_sign = my_igather(rank, + req_sign = self.my_igather(rank, world_size, comm, numpy_sign_list_packed[idx], @@ -101,7 +101,7 @@ def gather_host(self, requests += req_sign for idx in range(world_size): - req_scale = my_igather(rank, + req_scale = self.my_igather(rank, world_size, comm, numpy_worker_scale, @@ -212,7 +212,7 @@ def compressed_allreduce(self, # Communication Phase 1 gather_start = time.time() if self.cuda_aware: - gather_cuda(self.rank, + self.gather_cuda(self.rank, self.size, self.comm, cupy_sign_list_packed, @@ -220,7 +220,7 @@ def compressed_allreduce(self, cupy_worker_scale, cupy_recvbuf_scale) else: - cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(self.rank, + cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = self.gather_host(self.rank, self.size, self.comm, cupy_sign_list_packed, @@ -271,13 +271,13 @@ def compressed_allreduce(self, # Communication Phase 2 if self.cuda_aware: - allgather_cuda(self.comm, + self.allgather_cuda(self.comm, cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server) else: - cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = allgather_host(self.comm, + cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = self.allgather_host(self.comm, cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, From 70938e17ebba6bcf7ca2f0cb60731f955b91974e Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Wed, 9 Dec 2020 01:24:57 +0000 Subject: [PATCH 25/26] Add an mpi perf test --- tests/onebitadam/test_mpi_perf.py | 78 +++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 tests/onebitadam/test_mpi_perf.py diff --git a/tests/onebitadam/test_mpi_perf.py b/tests/onebitadam/test_mpi_perf.py new file mode 100644 index 000000000000..63e445e89c50 --- /dev/null +++ b/tests/onebitadam/test_mpi_perf.py @@ -0,0 +1,78 @@ +from mpi4py import MPI +import time +import torch +import torch.distributed as dist +import numpy as np +import deepspeed + +from deepspeed.runtime.comm.mpi import MpiBackend + +# Configure wall clock timer +from deepspeed.utils.timer import SynchronizedWallClockTimer + +from statistics import mean + +timers = SynchronizedWallClockTimer() + +comm = MPI.COMM_WORLD +size = comm.Get_size() +rank = comm.Get_rank() + +#TODO: Detect the hostname we are running on automatically +torch.distributed.init_process_group(backend='nccl', + init_method='tcp://worker-0:2245', + world_size=size, + rank=rank) + +backend = MpiBackend(cuda_aware=False) + +device = torch.device('cuda', 
rank % torch.cuda.device_count()) + +tensor_size = 300 * 2**20 +server_size = int(tensor_size / size) +if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) +else: + right_tensor_size = tensor_size +right_server_size = right_tensor_size // size + +# Adding bias to the initialization of the gradient we are communicating +# In order to get rid of the case where some elements in the gradient are too small +a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + +worker_error = torch.zeros(right_tensor_size, device=device) +server_error = torch.zeros(right_server_size, device=device) + +warmup = 10 +iters = 100 + +local_rank = rank % torch.cuda.device_count() + +# Warmup +for i in range(warmup): + backend.compressed_allreduce(a, worker_error, server_error, local_rank) + +time_list = [] + +for i in range(iters): + timers('compressed_allreduce').start() + backend.compressed_allreduce(a, worker_error, server_error, local_rank) + timers('compressed_allreduce').stop() + time_list.append(timers('compressed_allreduce').elapsed()) + +timer_names = ['compressed_allreduce'] +timers.log(names=timer_names, normalizer=1, memory_breakdown=None) + +places = 2 +convert = 1e3 +float_size = 4 + +if rank == 0: + for i in range(iters): + lat = time_list[i] + print("latency = ", lat * convert) + +minlat = round(min(time_list) * convert) +maxlat = round(max(time_list) * convert) +meanlat = round(mean(time_list) * convert, places) +print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, maxlat, meanlat)) From 7aac01889a7c14a03ec66de9abbd45ab839f90f2 Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Thu, 10 Dec 2020 11:46:48 -0800 Subject: [PATCH 26/26] Sync DSE. --- DeepSpeedExamples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DeepSpeedExamples b/DeepSpeedExamples index 896831c96266..fa1d1a71c486 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit 896831c96266e12612c3e7a923d04e68d1f4dd84 +Subproject commit fa1d1a71c48623db8a091d9cf636a5fe3b8f43c7
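
Taken together, the series leaves the 1-bit Adam optimizer (deepspeed.runtime.fp16.onebit.adam.Adam) with two interchangeable compressed-allreduce backends. A minimal sketch of the selection logic implied by the tests follows; it is an assumption based on the comm_backend_name and cuda_aware arguments referenced above, not code copied from the patches, and only the constructor signatures shown in this series are used:

    # Hedged sketch: pick a backend the way the tests suggest the optimizer does.
    # comm_backend_name and cuda_aware stand in for the optimizer's arguments.
    if comm_backend_name == 'nccl':
        from deepspeed.runtime.comm.nccl import NcclBackend
        comm_backend_handle = NcclBackend()
    elif comm_backend_name == 'mpi':
        from deepspeed.runtime.comm.mpi import MpiBackend
        comm_backend_handle = MpiBackend(cuda_aware)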