diff --git a/benchmarks/communication/all_gather.py b/benchmarks/communication/all_gather.py index 101f129cca45..dc97267b3840 100644 --- a/benchmarks/communication/all_gather.py +++ b/benchmarks/communication/all_gather.py @@ -2,6 +2,7 @@ from benchmarks.communication.utils import * from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator import time @@ -85,16 +86,20 @@ def run_all_gather(local_rank, args): try: mat = torch.ones(world_size, M, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) # Delete original mat to avoid OOM del mat - torch.cuda.empty_cache() + get_accelerator().empty_cache() output = torch.zeros(input.nelement() * world_size, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) except RuntimeError as e: if 'out of memory' in str(e): if dist.get_rank() == 0: @@ -123,15 +128,17 @@ def run_all_gather(local_rank, args): try: mat = torch.ones(elements_per_gpu, dtype=getattr(torch, - args.dtype)).cuda(local_rank) + args.dtype)).to( + get_accelerator().device_name(local_rank)) # multiply each GPU's tensor by the rank to ease debugging input = ((mat.mul_(float(global_rank))).view(-1)) # Delete original mat to avoid OOM del mat - torch.cuda.empty_cache() - output = torch.zeros(elements_per_gpu * world_size, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + get_accelerator().empty_cache() + output = torch.zeros( + elements_per_gpu * world_size, + dtype=getattr(torch, + args.dtype)).to(get_accelerator().device_name(local_rank)) except RuntimeError as e: if 'out of memory' in str(e): if dist.get_rank() == 0: diff --git a/benchmarks/communication/all_reduce.py b/benchmarks/communication/all_reduce.py index 9d46f70c93c2..edc1b99301c0 100644 --- a/benchmarks/communication/all_reduce.py +++ b/benchmarks/communication/all_reduce.py @@ -2,6 +2,7 @@ from benchmarks.communication.utils import * from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator import time @@ -64,8 +65,10 @@ def run_all_reduce(local_rank, args): try: mat = torch.ones(world_size, M, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: @@ -88,7 +91,8 @@ def run_all_reduce(local_rank, args): try: mat = torch.ones(elements_per_gpu, dtype=getattr(torch, - args.dtype)).cuda(local_rank) + args.dtype)).to( + get_accelerator().device_name(local_rank)) input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: if 'out of memory' in str(e): diff --git a/benchmarks/communication/all_to_all.py b/benchmarks/communication/all_to_all.py index f5ce3b37d514..bd35cf290e4c 100644 --- a/benchmarks/communication/all_to_all.py +++ b/benchmarks/communication/all_to_all.py @@ -2,6 +2,7 @@ from benchmarks.communication.utils import * from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator import time @@ -63,8 +64,10 @@ def run_all_to_all(local_rank, args): try: mat = torch.ones(world_size, M, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) assert mat.numel() % world_size == 0, 
f"tensor cannot be divided in {world_size} chunks" sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) @@ -88,15 +91,17 @@ def run_all_to_all(local_rank, args): try: mat = torch.ones(elements_per_gpu, dtype=getattr(torch, - args.dtype)).cuda(local_rank) + args.dtype)).to( + get_accelerator().device_name(local_rank)) assert mat.numel() % world_size == 0, f"tensor with {mat.numel()} elements cannot be divided in {world_size} chunks" input = ((mat.mul_(float(global_rank))).view(-1)) # Delete original mat to avoid OOM del mat - torch.cuda.empty_cache() - output = torch.zeros(elements_per_gpu, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + get_accelerator().empty_cache() + output = torch.zeros( + elements_per_gpu, + dtype=getattr(torch, + args.dtype)).to(get_accelerator().device_name(local_rank)) except RuntimeError as e: if 'out of memory' in str(e): if dist.get_rank() == 0: diff --git a/benchmarks/communication/broadcast.py b/benchmarks/communication/broadcast.py index d0480cb15b5a..633e46638fac 100644 --- a/benchmarks/communication/broadcast.py +++ b/benchmarks/communication/broadcast.py @@ -3,6 +3,7 @@ import torch from benchmarks.communication.utils import * from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator import time @@ -65,8 +66,10 @@ def run_broadcast(local_rank, args): try: mat = torch.ones(world_size, M, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: @@ -89,7 +92,8 @@ def run_broadcast(local_rank, args): try: mat = torch.ones(elements_per_gpu, dtype=getattr(torch, - args.dtype)).cuda(local_rank) + args.dtype)).to( + get_accelerator().device_name(local_rank)) input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: if 'out of memory' in str(e): diff --git a/benchmarks/communication/constants.py b/benchmarks/communication/constants.py index 71416b16f084..935927acd174 100644 --- a/benchmarks/communication/constants.py +++ b/benchmarks/communication/constants.py @@ -1,9 +1,10 @@ '''Copyright The Microsoft DeepSpeed Team''' +from deepspeed.accelerator import get_accelerator DEFAULT_WARMUPS = 5 DEFAULT_TRIALS = 50 DEFAULT_TYPE = 'float' -DEFAULT_BACKEND = 'nccl' +DEFAULT_BACKEND = get_accelerator().communication_backend_name() DEFAULT_UNIT = 'Gbps' DEFAULT_DIST = 'deepspeed' DEFAULT_MAXSIZE = 24 diff --git a/benchmarks/communication/pt2pt.py b/benchmarks/communication/pt2pt.py index 89a2ec045b39..1c890fc42e93 100644 --- a/benchmarks/communication/pt2pt.py +++ b/benchmarks/communication/pt2pt.py @@ -2,6 +2,7 @@ from benchmarks.communication.utils import * from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator import time @@ -83,8 +84,10 @@ def run_pt2pt(local_rank, args): try: mat = torch.ones(world_size, M, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: @@ -107,7 +110,8 @@ def run_pt2pt(local_rank, args): try: mat = torch.ones(elements_per_gpu, dtype=getattr(torch, - args.dtype)).cuda(local_rank) + args.dtype)).to( + get_accelerator().device_name(local_rank)) input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: if 'out of memory' in str(e): diff --git 
a/benchmarks/communication/utils.py b/benchmarks/communication/utils.py index b2f7367bb703..b913dda14fe5 100644 --- a/benchmarks/communication/utils.py +++ b/benchmarks/communication/utils.py @@ -5,6 +5,7 @@ import math import argparse from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator global dist @@ -14,7 +15,7 @@ def init_torch_distributed(backend): import torch.distributed as dist torch.distributed.init_process_group(backend) local_rank = int(os.environ['LOCAL_RANK']) - torch.cuda.set_device(local_rank) + get_accelerator().set_device(local_rank) def init_deepspeed_comm(backend): @@ -23,7 +24,7 @@ def init_deepspeed_comm(backend): import deepspeed.comm as dist deepspeed.init_distributed(dist_backend=backend) local_rank = int(os.environ['LOCAL_RANK']) - torch.cuda.set_device(local_rank) + get_accelerator().set_device(local_rank) def init_processes(local_rank, args): @@ -101,14 +102,13 @@ def get_metric_strings(args, tput, busbw, duration): def sync_all(): - torch.cuda.synchronize() + get_accelerator().synchronize() dist.barrier() def max_numel(comm_op, dtype, mem_factor, local_rank, args): dtype_size = _element_size(dtype) - max_memory_per_gpu = torch.cuda.get_device_properties( - local_rank).total_memory * mem_factor + max_memory_per_gpu = get_accelerator().total_memory(local_rank) * mem_factor if comm_op == 'all_reduce' or comm_op == 'pt2pt' or comm_op == 'broadcast': elements_per_gpu = int(max_memory_per_gpu // dtype_size) elif comm_op == 'all_gather': @@ -185,7 +185,8 @@ def benchmark_parser(): parser.add_argument("--backend", type=str, default=DEFAULT_BACKEND, - choices=['nccl'], + choices=['nccl', + 'ccl'], help='Communication library to use') parser.add_argument("--dist", type=str, diff --git a/benchmarks/inference/bert-bench.py b/benchmarks/inference/bert-bench.py index 5b88ba235e3f..9d586d033cd7 100644 --- a/benchmarks/inference/bert-bench.py +++ b/benchmarks/inference/bert-bench.py @@ -5,6 +5,7 @@ import deepspeed import argparse from transformers import pipeline +from deepspeed.accelerator import get_accelerator parser = argparse.ArgumentParser() parser.add_argument("--model", "-m", type=str, help="hf model name") @@ -46,7 +47,7 @@ def print_latency(latency_set, title, warmup=3): print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000)) -deepspeed.init_distributed("nccl") +deepspeed.init_distributed() print(args.model, args.max_tokens, args.dtype) @@ -75,10 +76,10 @@ def print_latency(latency_set, title, warmup=3): times = [] mtimes = [] for i in range(args.trials): - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.time() r = pipe(f"Hello I'm a {mask} model") - torch.cuda.synchronize() + get_accelerator().synchronize() end = time.time() responses.append(r) times.append((end - start)) diff --git a/benchmarks/inference/gpt-bench.py b/benchmarks/inference/gpt-bench.py index b41ca921064e..29578b30cf1f 100644 --- a/benchmarks/inference/gpt-bench.py +++ b/benchmarks/inference/gpt-bench.py @@ -6,6 +6,7 @@ import deepspeed import argparse from transformers import pipeline +from deepspeed.accelerator import get_accelerator parser = argparse.ArgumentParser() parser.add_argument("--model", "-m", type=str, help="hf model name") @@ -63,7 +64,7 @@ def print_latency(latency_set, title, warmup=3): print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000)) -deepspeed.init_distributed("nccl") +deepspeed.init_distributed() if args.local_rank == 0: print("BENCHMARK SETTINGS:") @@ -102,10 +103,10 @@ def print_latency(latency_set, 
title, warmup=3): times = [] mtimes = [] for i in range(args.trials): - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.time() r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=args.max_tokens) - torch.cuda.synchronize() + get_accelerator().synchronize() end = time.time() responses.append(r) times.append(end - start) # / (args.max_tokens - 3)) diff --git a/deepspeed/module_inject/containers/base.py b/deepspeed/module_inject/containers/base.py index ca5f0e23ffe3..45faeb477951 100644 --- a/deepspeed/module_inject/containers/base.py +++ b/deepspeed/module_inject/containers/base.py @@ -5,6 +5,7 @@ import torch from deepspeed.ops.transformer.inference.config import DeepSpeedInferenceConfig +from deepspeed.accelerator import get_accelerator class BaseConvolutionContainer(ABC): @@ -216,12 +217,14 @@ def copy_data_to_new_module(self): self.module.mlp.attn_nb = self.attn_nb else: self.module.mlp.attn_nw.data.copy_( - self.attn_nw.to(torch.cuda.current_device())) + self.attn_nw.to(get_accelerator().current_device_name())) self.module.mlp.attn_nb.data.copy_( - self.attn_nb.to(torch.cuda.current_device())) + self.attn_nb.to(get_accelerator().current_device_name())) - self.module.norm_w.data.copy_(self.input_nw.to(torch.cuda.current_device())) - self.module.norm_b.data.copy_(self.input_nb.to(torch.cuda.current_device())) + self.module.norm_w.data.copy_( + self.input_nw.to(get_accelerator().current_device_name())) + self.module.norm_b.data.copy_( + self.input_nb.to(get_accelerator().current_device_name())) def transpose(self): self.transpose_attention() @@ -241,5 +244,5 @@ def transpose_impl(self, data): data = data.contiguous() data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1)) data = data.reshape(data.shape[-1], data.shape[-2]) - data.to(torch.cuda.current_device()) + data.to(get_accelerator().current_device_name()) return data diff --git a/deepspeed/module_inject/containers/base_moe.py b/deepspeed/module_inject/containers/base_moe.py index f120a5ef6006..4139b08d9091 100644 --- a/deepspeed/module_inject/containers/base_moe.py +++ b/deepspeed/module_inject/containers/base_moe.py @@ -2,9 +2,9 @@ # Create a container object to save model-specific tensors using the policy file above. 
from .base import * -import torch from deepspeed import comm as dist import deepspeed.ops.transformer as transformer_inference +from deepspeed.accelerator import get_accelerator class BaseTransformerMoEContainer(BaseTransformerContainer): @@ -104,33 +104,38 @@ def mlp_mp(self): # mlp inter self.module.mlp[ep_index].inter_w.data = self._h4h_w[ gpu_index * self.local_ep_size + ep_index].to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) self.module.mlp[ep_index].inter_b.data = self._h4h_b[ gpu_index * self.local_ep_size + ep_index].to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) # mlp output self.module.mlp[ep_index].output_w.data = self._4hh_w[ gpu_index * self.local_ep_size + ep_index].to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) self.module.mlp[ep_index].output_b.data = self._4hh_b[ gpu_index * self.local_ep_size + ep_index].to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) def copy_data_to_new_module(self): - self.module.attn_nw.data = self.attn_nw.to(torch.cuda.current_device()) - self.module.attn_nb.data = self.attn_nb.to(torch.cuda.current_device()) + self.module.attn_nw.data = self.attn_nw.to( + get_accelerator().current_device_name()) + self.module.attn_nb.data = self.attn_nb.to( + get_accelerator().current_device_name()) - self.module.norm_w.data.copy_(self.input_nw.to(torch.cuda.current_device())) - self.module.norm_b.data.copy_(self.input_nb.to(torch.cuda.current_device())) + self.module.norm_w.data.copy_( + self.input_nw.to(get_accelerator().current_device_name())) + self.module.norm_b.data.copy_( + self.input_nb.to(get_accelerator().current_device_name())) if self.config.moe.type == 'residual': self.module.res_mlp.inter_w.data = self._res_h4h_w.to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) self.module.res_mlp.inter_b.data = self._res_h4h_b.to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) self.module.res_mlp.output_w.data = self._res_4hh_w.to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) self.module.res_mlp.output_b.data = self._res_4hh_b.to( - torch.cuda.current_device()) - self.module.res_coef.data = self._res_coef.to(torch.cuda.current_device()) + get_accelerator().current_device_name()) + self.module.res_coef.data = self._res_coef.to( + get_accelerator().current_device_name()) diff --git a/deepspeed/module_inject/policy.py b/deepspeed/module_inject/policy.py index bbf8f59e9222..dfd3343e12a3 100644 --- a/deepspeed/module_inject/policy.py +++ b/deepspeed/module_inject/policy.py @@ -4,6 +4,7 @@ from abc import ABC, abstractmethod from deepspeed.utils.types import ActivationFuncType import torch +from deepspeed.accelerator import get_accelerator transformer_param_names = ( 'attn_qkvw', \ @@ -196,9 +197,9 @@ def maybe_copy_qkv(module, dst = mp_replace.copy(dst, qkv_data) else: if split_qkv: - dst = mp_replace.qkv_copy(dst, weight_quantizer.quantize(qkv_data.cuda() if weight_quantizer.q_int8 else \ + dst = mp_replace.qkv_copy(dst, weight_quantizer.quantize(qkv_data.to(get_accelerator().device_name()) if weight_quantizer.q_int8 else \ ((transpose(qkv_data)).contiguous())), int8=weight_quantizer.q_int8) else: - dst = mp_replace.copy(dst, weight_quantizer.quantize(qkv_data.cuda() if weight_quantizer.q_int8 else \ + dst = mp_replace.copy(dst, weight_quantizer.quantize(qkv_data.to(get_accelerator().device_name()) if weight_quantizer.q_int8 else \ transpose(qkv_data)), 
int8=weight_quantizer.q_int8) setattr(module, dst_name, dst) diff --git a/docs/_tutorials/cifar-10.md b/docs/_tutorials/cifar-10.md index 11a05a78a749..74ee04502f18 100644 --- a/docs/_tutorials/cifar-10.md +++ b/docs/_tutorials/cifar-10.md @@ -140,7 +140,8 @@ Here we initialize DeepSpeed with CIFAR-10 model (`net`), `args`, `parameters` a After initializing DeepSpeed, the original `device` and `optimizer` are removed: ```python - #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + #from deepspeed.accelerator import get_accelerator + #device = torch.device(get_accelerator().device_name(0) if get_accelerator().is_available() else "cpu") #net.to(device) #optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) diff --git a/tests/accelerator/ds_config.json b/tests/accelerator/ds_config.json new file mode 100644 index 000000000000..8e9ac6b889ea --- /dev/null +++ b/tests/accelerator/ds_config.json @@ -0,0 +1,19 @@ +{ + "train_batch_size": 1, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + "weight_decay": 1e-2 + } + }, + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + } +} diff --git a/tests/accelerator/test_ds_init.py b/tests/accelerator/test_ds_init.py new file mode 100644 index 000000000000..6c4e90e2aa63 --- /dev/null +++ b/tests/accelerator/test_ds_init.py @@ -0,0 +1,43 @@ +'''Copyright The Microsoft DeepSpeed Team''' +import os +import torch +import deepspeed +from deepspeed.accelerator import get_accelerator + + +class OneLayerNet(torch.nn.Module): + def __init__(self, D_in, D_out): + """ + In the constructor we instantiate a single nn.Linear module and assign it as + a member variable. + """ + super(OneLayerNet, self).__init__() + self.linear1 = torch.nn.Linear(D_in, D_out) + + def forward(self, x): + """ + In the forward function we accept a Variable of input data and we must return + a Variable of output data. We can use Modules defined in the constructor as + well as arbitrary operators on Variables. 
+ """ + h_relu = self.linear1(x).clamp(min=0) + y_pred = self.linear1(h_relu) + return y_pred + + +def test_literal_device(): + model = OneLayerNet(128, 128) + + os.environ['RANK'] = '0' + os.environ['WORLD_SIZE'] = '1' + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = '8088' + os.environ['LOCAL_RANK'] = '0' + deepspeed.init_distributed(get_accelerator().communication_backend_name()) + deepspeed.initialize(model=model, config='ds_config.json') + string = get_accelerator().device_name() #'xpu' or 'cuda' + string0 = get_accelerator().device_name(0) #'xpu:0' or 'cuda:0' + string1 = get_accelerator().device_name(1) #'xpu:1' or 'cuda:1' + assert string == 'xpu' or string == 'cuda' + assert string0 == 'xpu:0' or string0 == 'cuda:0' + assert string1 == 'xpu:1' or string1 == 'cuda:1' diff --git a/tests/benchmarks/flatten_bench.py b/tests/benchmarks/flatten_bench.py index fdf53df0b131..1082554f81d1 100755 --- a/tests/benchmarks/flatten_bench.py +++ b/tests/benchmarks/flatten_bench.py @@ -14,6 +14,7 @@ import torch from torch._utils import _flatten_dense_tensors +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import UtilsBuilder from apex_C import flatten as flatten_apex @@ -26,11 +27,11 @@ # emulate a small typical model weights x = [ torch.rand((512, - 512)).cuda(), + 512)).to(get_accelerator().device_name()), torch.rand((512, - 1024)).cuda(), + 1024)).to(get_accelerator().device_name()), torch.rand((512, - 30000)).cuda() + 30000)).to(get_accelerator().device_name()) ] t = x * 30 @@ -71,15 +72,15 @@ def cprofileme(): print("py") cProfile.run("py()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("cpp") cProfile.run("cpp()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("apex") cProfile.run("apex()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() #### timeit #### @@ -91,13 +92,13 @@ def timeme(): print("--------------- timeit -----------------") print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() #### line_profiler #### @@ -111,15 +112,15 @@ def line_profileme(): print("py") profile(py)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("cpp") profile(cpp)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("apex") profile(apex)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() if __name__ == "__main__": diff --git a/tests/benchmarks/unflatten_bench.py b/tests/benchmarks/unflatten_bench.py index 0c60528891ea..a4a1b63b3dd0 100755 --- a/tests/benchmarks/unflatten_bench.py +++ b/tests/benchmarks/unflatten_bench.py @@ -13,6 +13,7 @@ import gc import torch from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import UtilsBuilder from apex_C import flatten as flatten_apex @@ -26,11 +27,11 @@ # emulate a small typical model weights x = [ torch.rand((512, - 512)).cuda(), + 512)).to(get_accelerator().device_name()), 
torch.rand((512, - 1024)).cuda(), + 1024)).to(get_accelerator().device_name()), torch.rand((512, - 30000)).cuda() + 30000)).to(get_accelerator().device_name()) ] unflat_t = x * 30 @@ -80,15 +81,15 @@ def cprofileme(): print("py") cProfile.run("py()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("cpp") cProfile.run("cpp()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("apex") cProfile.run("apex()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() #### timeit #### @@ -100,13 +101,13 @@ def timeme(): print("--------------- timeit -----------------") print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() #### line_profiler #### @@ -120,15 +121,15 @@ def line_profileme(): print("py") profile(py)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("cpp") profile(cpp)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("apex") profile(apex)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() if __name__ == "__main__": diff --git a/tests/onebit/test_mpi_backend.py b/tests/onebit/test_mpi_backend.py index 3b9f67cce9ca..bb8915f2c001 100644 --- a/tests/onebit/test_mpi_backend.py +++ b/tests/onebit/test_mpi_backend.py @@ -7,17 +7,19 @@ import deepspeed from deepspeed.runtime.comm.mpi import MpiBackend +from deepspeed.accelerator import get_accelerator comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() -deepspeed.init_distributed(dist_backend='nccl') +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) # Change cuda_aware to True to test out CUDA-Aware MPI communication backend = MpiBackend(cuda_aware=False) -device = torch.device('cuda', rank % torch.cuda.device_count()) +local_rank = rank % get_accelerator().device_count() +device = torch.device(get_accelerator().device_name(), local_rank) # A simulated compression function using deepspeed.comm @@ -37,7 +39,7 @@ def torch_sim(a): [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] - torch.cuda.synchronize() + get_accelerator().synchronize() dist.barrier() return a_server_compressed, worker_error, server_error @@ -58,8 +60,7 @@ def torch_sim(a): server_error = torch.zeros(right_server_size, device=device) a_torch, worker_error_torch, server_error_torch = torch_sim(a) -torch.cuda.empty_cache() -local_rank = rank % torch.cuda.device_count() +get_accelerator().empty_cache() a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) diff --git a/tests/onebit/test_mpi_perf.py b/tests/onebit/test_mpi_perf.py index 3345c20e5008..dd67fdb615e8 100644 --- a/tests/onebit/test_mpi_perf.py +++ b/tests/onebit/test_mpi_perf.py @@ -8,6 +8,7 @@ # Configure wall clock timer from deepspeed.utils.timer import SynchronizedWallClockTimer +from deepspeed.accelerator import get_accelerator from statistics import mean @@ -17,11 +18,12 @@ size = comm.Get_size() rank 
= comm.Get_rank() -deepspeed.init_distributed(dist_backend='nccl') +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) # Change cuda_aware to True to test out CUDA-Aware MPI communication backend = MpiBackend(cuda_aware=False) -device = torch.device('cuda', rank % torch.cuda.device_count()) +local_rank = rank % get_accelerator().device_count() +device = torch.device(get_accelerator().device_name(), local_rank) tensor_size = 300 * 2**20 server_size = int(tensor_size / size) @@ -41,8 +43,6 @@ warmup = 10 iters = 10 -local_rank = rank % torch.cuda.device_count() - # Warmup for i in range(warmup): backend.compressed_allreduce(a, worker_error, server_error, local_rank) diff --git a/tests/onebit/test_nccl_backend.py b/tests/onebit/test_nccl_backend.py index d569c7272f7b..e544865b7685 100644 --- a/tests/onebit/test_nccl_backend.py +++ b/tests/onebit/test_nccl_backend.py @@ -8,16 +8,17 @@ import os from deepspeed.runtime.comm.nccl import NcclBackend +from deepspeed.accelerator import get_accelerator parser = argparse.ArgumentParser() parser.add_argument('--local_rank', type=int, default=-1) args = parser.parse_args() -deepspeed.init_distributed(dist_backend='nccl') +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) args.local_rank = int(os.environ['LOCAL_RANK']) -torch.cuda.set_device(args.local_rank) -device = torch.device("cuda", args.local_rank) +get_accelerator().set_device(args.local_rank) +device = torch.device(get_accelerator().device_name(), args.local_rank) size = dist.get_world_size() rank = dist.get_rank() @@ -43,7 +44,7 @@ def torch_sim(a): [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] - torch.cuda.synchronize() + get_accelerator().synchronize() dist.barrier() return a_server_compressed, worker_error, server_error @@ -64,7 +65,7 @@ def torch_sim(a): server_error = torch.zeros(right_server_size, device=device) a_torch, worker_error_torch, server_error_torch = torch_sim(a) -torch.cuda.empty_cache() +get_accelerator().empty_cache() a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) diff --git a/tests/onebit/test_nccl_perf.py b/tests/onebit/test_nccl_perf.py index dcdb13acd4c2..aab93efac851 100644 --- a/tests/onebit/test_nccl_perf.py +++ b/tests/onebit/test_nccl_perf.py @@ -9,6 +9,7 @@ from deepspeed.runtime.comm.nccl import NcclBackend from deepspeed.utils.timer import SynchronizedWallClockTimer +from deepspeed.accelerator import get_accelerator from statistics import mean timers = SynchronizedWallClockTimer() @@ -17,11 +18,11 @@ parser.add_argument('--local_rank', type=int, default=-1) args = parser.parse_args() -deepspeed.init_distributed(dist_backend='nccl') +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) args.local_rank = int(os.environ['LOCAL_RANK']) -torch.cuda.set_device(args.local_rank) -device = torch.device("cuda", args.local_rank) +get_accelerator().set_device(args.local_rank) +device = torch.device(get_accelerator().device_name(), args.local_rank) size = dist.get_world_size() rank = dist.get_rank() diff --git a/tests/perf/adam_test1.py b/tests/perf/adam_test1.py index 7ee6c71b509d..13d486d4d855 100755 --- a/tests/perf/adam_test1.py +++ b/tests/perf/adam_test1.py @@ -3,13 +3,15 @@ import torch from deepspeed.ops.adam import DeepSpeedCPUAdam import time +from deepspeed.accelerator import get_accelerator device = 
'cpu' model_size = 1 * 1024**3 param = torch.nn.Parameter(torch.ones(model_size, device=device)) -param_fp16 = torch.nn.Parameter(torch.ones(model_size, - dtype=torch.half, - device='cuda:0')) +param_fp16 = torch.nn.Parameter( + torch.ones(model_size, + dtype=torch.half, + device=get_accelerator().device_name(0))) optimizer = DeepSpeedCPUAdam([param]) #torch.set_num_threads(128) diff --git a/tests/small_model_debugging/test.py b/tests/small_model_debugging/test.py index 799fa9872d74..a97792df56ac 100644 --- a/tests/small_model_debugging/test.py +++ b/tests/small_model_debugging/test.py @@ -3,6 +3,7 @@ import torch from deepspeed.pt.deepspeed_linear import LinearModuleForZeroStage3 from deepspeed.pt.log_utils import logger +from deepspeed.accelerator import get_accelerator def see_memory_usage(message): @@ -11,37 +12,42 @@ def see_memory_usage(message): logger.info(message) logger.info( "Memory Allocated %s GigaBytes ", - torch.cuda.memory_allocated() / (1024 * 1024 * 1024), + get_accelerator().memory_allocated() / (1024 * 1024 * 1024), ) logger.info( "Max Memory Allocated %s GigaBytes", - torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), + get_accelerator().max_memory_allocated() / (1024 * 1024 * 1024), ) logger.info( "Cache Allocated %s GigaBytes", - torch.cuda.memory_cached() / (1024 * 1024 * 1024), + get_accelerator().memory_cached() / (1024 * 1024 * 1024), ) logger.info( "Max cache Allocated %s GigaBytes", - torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), + get_accelerator().max_memory_cached() / (1024 * 1024 * 1024), ) -tens = torch.rand(1024, 16384, dtype=torch.half, device=torch.device('cuda')) +tens = torch.rand(1024, + 16384, + dtype=torch.half, + device=torch.device(get_accelerator().device_name())) tens_back = tens.detach().clone() #linear_bk = torch.nn.functional.linear #torch.nn.functional.linear = deepspeed.pt.deepspeed_linear.LinearFunctionForZeroStage3.apply model = LinearModuleForZeroStage3(16384, 16384) -model.cuda().half() +model.to(get_accelerator().device_name()).half() see_memory_usage("Before forward") y = model(tens) see_memory_usage("After forward") -model.weight.data = torch.zeros(1, dtype=torch.half, device=torch.device('cuda')) +model.weight.data = torch.zeros(1, + dtype=torch.half, + device=torch.device(get_accelerator().device_name())) see_memory_usage("After weight zero") diff --git a/tests/unit/alexnet_model.py b/tests/unit/alexnet_model.py index f7038f6a2072..bdbaf02922e2 100644 --- a/tests/unit/alexnet_model.py +++ b/tests/unit/alexnet_model.py @@ -7,6 +7,7 @@ import deepspeed import deepspeed.comm as dist import deepspeed.runtime.utils as ds_utils +from deepspeed.accelerator import get_accelerator from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec @@ -110,7 +111,7 @@ def cifar_trainset(fp16=False): transform = transforms.Compose(transform_list) - local_rank = torch.cuda.current_device() + local_rank = get_accelerator().current_device() # Only one rank per machine downloads. 
dist.barrier() @@ -131,7 +132,8 @@ def train_cifar(model, average_dp_losses=True, fp16=True, seed=123): - with torch.random.fork_rng(devices=[torch.cuda.current_device()]): + with get_accelerator().random().fork_rng( + devices=[get_accelerator().current_device_name()]): ds_utils.set_random_seed(seed) # disable dropout @@ -154,7 +156,7 @@ def train_cifar(model, print(f'STEP={step} LOSS={loss.item()}') if average_dp_losses: - loss_tensor = torch.tensor(losses).cuda() + loss_tensor = torch.tensor(losses).to(get_accelerator().device_name()) dist.all_reduce(loss_tensor) loss_tensor /= dist.get_world_size() losses = loss_tensor.tolist() diff --git a/tests/unit/comm/test_dist.py b/tests/unit/comm/test_dist.py index 7ade69a2ef5a..6005c926f793 100644 --- a/tests/unit/comm/test_dist.py +++ b/tests/unit/comm/test_dist.py @@ -7,6 +7,7 @@ from unit.common import DistributedTest, DistributedFixture, get_master_port from unit.simple_model import SimpleModel +from deepspeed.accelerator import get_accelerator import pytest @@ -105,9 +106,9 @@ class TestDistAllReduce(DistributedTest): world_size = [1, 2, 4] def test(self): - x = torch.ones(1, 3).cuda() * (dist.get_rank() + 1) + x = torch.ones(1, 3).to(get_accelerator().device_name()) * (dist.get_rank() + 1) sum_of_ranks = (dist.get_world_size() * (dist.get_world_size() + 1)) // 2 - result = torch.ones(1, 3).cuda() * sum_of_ranks + result = torch.ones(1, 3).to(get_accelerator().device_name()) * sum_of_ranks dist.all_reduce(x) assert torch.all(x == result) @@ -117,16 +118,21 @@ class TestDistInit(DistributedTest): init_distributed = False def test_already_init(self, dist_init_required): - torch.distributed.init_process_group('nccl') - deepspeed.init_distributed('nccl', dist_init_required=dist_init_required) + torch.distributed.init_process_group( + get_accelerator().communication_backend_name()) + deepspeed.init_distributed(get_accelerator().communication_backend_name(), + dist_init_required=dist_init_required) def test_no_init(self, dist_init_required): if dist_init_required or dist_init_required is None: - deepspeed.init_distributed('nccl', dist_init_required=dist_init_required) + deepspeed.init_distributed(get_accelerator().communication_backend_name(), + dist_init_required=dist_init_required) else: # torch.dist is not done and for some reason the user says they don't want it done with pytest.raises(Exception): - deepspeed.init_distributed('nccl', dist_init_required=dist_init_required) + deepspeed.init_distributed( + get_accelerator().communication_backend_name(), + dist_init_required=dist_init_required) class TestDistInitNoEnv(DistributedTest): @@ -136,12 +142,13 @@ class TestDistInitNoEnv(DistributedTest): def test(self): torch.distributed.init_process_group( - backend='nccl', + backend=get_accelerator().communication_backend_name(), init_method=f"tcp://127.0.0.1:{get_master_port()}", world_size=1, rank=0) assert torch.distributed.is_initialized() - deepspeed.init_distributed('nccl', auto_mpi_discovery=True) + deepspeed.init_distributed(get_accelerator().communication_backend_name(), + auto_mpi_discovery=True) @pytest.mark.parametrize("dist_init_required", [True, False]) @@ -149,7 +156,8 @@ class TestDistInitWithModel(DistributedTest): init_distributed = False def test_already_init(self, dist_init_required): - torch.distributed.init_process_group('nccl') + torch.distributed.init_process_group( + get_accelerator().communication_backend_name()) model = SimpleModel(4) config_dict = { "train_micro_batch_size_per_gpu": 1, diff --git a/tests/unit/common.py 
b/tests/unit/common.py index 15c98911f519..35e8f3983072 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -9,6 +9,7 @@ import torch import torch.multiprocessing as mp import deepspeed +from deepspeed.accelerator import get_accelerator import deepspeed.comm as dist from torch.multiprocessing import Process @@ -39,23 +40,36 @@ def get_master_port(): return master_port -def set_cuda_visibile(): +def set_accelerator_visible(): cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", None) xdist_worker_id = get_xdist_worker_id() if xdist_worker_id is None: xdist_worker_id = 0 if cuda_visible is None: - # CUDA_VISIBLE_DEVICES is not set, discover it from nvidia-smi instead + # CUDA_VISIBLE_DEVICES is not set, discover it using accelerator specific command instead import subprocess - is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None - if is_rocm_pytorch: - rocm_smi = subprocess.check_output(['rocm-smi', '--showid']) - gpu_ids = filter(lambda s: 'GPU' in s, - rocm_smi.decode('utf-8').strip().split('\n')) - num_gpus = len(list(gpu_ids)) + if get_accelerator().device_name() == 'cuda': + is_rocm_pytorch = hasattr(torch.version, + 'hip') and torch.version.hip is not None + if is_rocm_pytorch: + rocm_smi = subprocess.check_output(['rocm-smi', '--showid']) + gpu_ids = filter(lambda s: 'GPU' in s, + rocm_smi.decode('utf-8').strip().split('\n')) + num_gpus = len(list(gpu_ids)) + else: + nvidia_smi = subprocess.check_output(['nvidia-smi', '--list-gpus']) + num_gpus = len(nvidia_smi.decode('utf-8').strip().split('\n')) else: - nvidia_smi = subprocess.check_output(['nvidia-smi', '--list-gpus']) - num_gpus = len(nvidia_smi.decode('utf-8').strip().split('\n')) + assert get_accelerator().device_name() == 'xpu' + import re + clinfo = subprocess.check_output(['clinfo']) + lines = clinfo.decode('utf-8').strip().split('\n') + num_gpus = 0 + for line in lines: + match = re.search('Device Type.*GPU', line) + if match: + num_gpus += 1 + cuda_visible = ",".join(map(str, range(num_gpus))) # rotate list based on xdist worker id, example below @@ -74,7 +88,7 @@ class DistributedExec(ABC): methods needed for DistributedTest and DistributedFixture. 
""" world_size = 2 - backend = "nccl" + backend = get_accelerator().communication_backend_name() init_distributed = True set_dist_env = True requires_cuda_env = True @@ -86,8 +100,8 @@ def run(self): def __call__(self, request=None): self._fixture_kwargs = self._get_fixture_kwargs(request, self.run) world_size = self.world_size - if self.requires_cuda_env and not torch.cuda.is_available(): - pytest.skip("only supported in CUDA environments.") + if self.requires_cuda_env and not get_accelerator().is_available(): + pytest.skip("only supported in accelerator environments.") if isinstance(world_size, int): world_size = [world_size] @@ -174,15 +188,15 @@ def _dist_init(self, local_rank, num_procs, skip_msg): # turn off NCCL logging if set os.environ.pop('NCCL_DEBUG', None) - if torch.cuda.is_available(): - set_cuda_visibile() + if get_accelerator().is_available(): + set_accelerator_visible() if self.init_distributed: deepspeed.init_distributed(dist_backend=self.backend) dist.barrier() - if torch.cuda.is_available(): - torch.cuda.set_device(local_rank) + if get_accelerator().is_available(): + get_accelerator().set_device(local_rank) try: self.run(**self._fixture_kwargs) @@ -323,8 +337,8 @@ def __call__(self, request): self._current_test = self._get_current_test_func(request) self._fixture_kwargs = self._get_fixture_kwargs(request, self._current_test) - if self.requires_cuda_env and not torch.cuda.is_available(): - pytest.skip("only supported in CUDA environments.") + if self.requires_cuda_env and not get_accelerator().is_available(): + pytest.skip("only supported in accelerator environments.") # Catch world_size override pytest mark for mark in getattr(request.function, "pytestmark", []): diff --git a/tests/unit/compression/test_compression.py b/tests/unit/compression/test_compression.py index 8b3617424225..829161ea072c 100644 --- a/tests/unit/compression/test_compression.py +++ b/tests/unit/compression/test_compression.py @@ -10,6 +10,7 @@ from unit.modelingpreln import BertEncoder as BertEncoderPreln from deepspeed.compression.basic_layer import LinearLayer_Compress, ColumnParallelLinear_Compress, RowParallelLinear_Compress from deepspeed.compression.helper import convert_conv1d_to_linear +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest TORCH_MAJOR = int(torch.__version__.split('.')[0]) @@ -23,7 +24,7 @@ def reset_random(seed=1234): random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) + get_accelerator().manual_seed_all(seed) def create_bert_model(): diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index ec791028deb4..371ecda710b2 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -16,6 +16,7 @@ from huggingface_hub import HfApi from deepspeed.model_implementations import DeepSpeedTransformerInference from torch import nn +from deepspeed.accelerator import get_accelerator rocm_version = OpBuilder.installed_rocm_version() if rocm_version != (0, 0): @@ -288,17 +289,17 @@ def test( pipe.model.half() # Switch device to GPU after converting to half - device = torch.device(f"cuda:{local_rank}") + device = torch.device(get_accelerator().device_name(local_rank)) pipe.device = device pipe.model.to(device) # Warm-up queries for perf measurement #for i in range(10): # _ = pipe(query, **inf_kwargs) - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.time() bs_output = pipe(query, **inf_kwargs) - 
torch.cuda.synchronize() + get_accelerator().synchronize() bs_time = time.time() - start pipe.model = deepspeed.init_inference( @@ -312,10 +313,10 @@ def test( # Warm-up queries for perf measurement #for i in range(10): # _ = pipe(query, **inf_kwargs) - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.time() ds_output = pipe(query, **inf_kwargs) - torch.cuda.synchronize() + get_accelerator().synchronize() ds_time = time.time() - start # facebook/opt* and some bigscient/bloom* models are not matching @@ -372,7 +373,7 @@ def test( replace_with_kernel_inject=True) check_injection(pipe.model) # Switch device to GPU so that input tensors are not on CPU - pipe.device = torch.device(f"cuda:{local_rank}") + pipe.device = torch.device(get_accelerator().device_name(local_rank)) ds_output = pipe(query, **inf_kwargs) print(local_rank, "baseline", bs_output) @@ -433,7 +434,7 @@ def test( dtype=dtype, injection_policy=injection_policy) # Switch device to GPU so that input tensors are not on CPU - pipe.device = torch.device(f"cuda:{local_rank}") + pipe.device = torch.device(get_accelerator().device_name(local_rank)) ds_output = pipe(query, **inf_kwargs) print(local_rank, "baseline", bs_output) @@ -483,7 +484,7 @@ def test( mp_size=world_size, dtype=dtype) # Switch device to GPU so that input tensors are not on CPU - pipe.device = torch.device(f"cuda:{local_rank}") + pipe.device = torch.device(get_accelerator().device_name(local_rank)) ds_output = pipe(query, **inf_kwargs) print(local_rank, "baseline", bs_output) @@ -515,7 +516,7 @@ def test(self, model_family, model_name, task): import lm_eval.evaluator local_rank = os.getenv("LOCAL_RANK", "0") - device = torch.device(f"cuda:{local_rank}") + device = torch.device(get_accelerator().device_name(local_rank)) dtype = torch.float task_dict = lm_eval.tasks.get_task_dict([task]) @@ -529,12 +530,12 @@ def test(self, model_family, model_name, task): else: lm = lm_eval.models.get_model(model_family).create_from_arg_string( f"pretrained={model_name}", - {"device": "cuda"}) + {"device": get_accelerator().device_name()}) - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.time() bs_output = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict) - torch.cuda.synchronize() + get_accelerator().synchronize() bs_time = time.time() - start ds_model = deepspeed.init_inference( @@ -547,10 +548,10 @@ def test(self, model_family, model_name, task): ) check_injection(ds_model) setattr(lm, model_family, ds_model) - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.time() ds_output = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict) - torch.cuda.synchronize() + get_accelerator().synchronize() ds_time = time.time() - start ppl_diff = abs(bs_output["results"][task]["ppl"] - diff --git a/tests/unit/inference/test_model_profiling.py b/tests/unit/inference/test_model_profiling.py index 0259ce09a1d4..07ce839306a6 100644 --- a/tests/unit/inference/test_model_profiling.py +++ b/tests/unit/inference/test_model_profiling.py @@ -7,6 +7,7 @@ import deepspeed from transformers import pipeline from unit.common import DistributedTest +from deepspeed.accelerator import get_accelerator @pytest.fixture @@ -74,12 +75,12 @@ def test(self, e2e_times = [] model_times = [] for _ in range(10): - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.perf_counter_ns() r = pipe(query, **inf_kwargs) - torch.cuda.synchronize() + get_accelerator().synchronize() end = time.perf_counter_ns() e2e_times.append((end - start) 
/ 1e6) # convert ns to ms diff --git a/tests/unit/megatron_model.py b/tests/unit/megatron_model.py index 00212a853570..32faf2244940 100644 --- a/tests/unit/megatron_model.py +++ b/tests/unit/megatron_model.py @@ -7,6 +7,7 @@ from .common import get_test_path from deepspeed.pipe import PipelineModule, LayerSpec +from deepspeed.accelerator import get_accelerator def get_megatron_version(): @@ -39,10 +40,10 @@ def get_gpt2_model(args_others, mp_size=1): initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True) model = GPT2Model(num_tokentypes=0, parallel_output=False) - model.cuda() + model.to(get_accelerator().device_name()) from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron import mpu - i = torch.cuda.current_device() + i = get_accelerator().current_device_name() model = torchDDP(model, device_ids=[i], output_device=i, @@ -78,8 +79,9 @@ def __init__(self, num_layers, mp_size, args_others, topo, **kwargs): class ParallelTransformerLayerPipe(ParallelTransformerLayer): def forward(self, args): # hardcode attn mask for testing, PP requires the attn_mask to be stashed - attention_mask = torch.tensor([[True]], - device=torch.cuda.current_device()) + attention_mask = torch.tensor( + [[True]], + device=get_accelerator().current_device_name()) return super().forward(args, attention_mask) layers = [] diff --git a/tests/unit/model_parallelism/test_configurable_parallel_mp.py b/tests/unit/model_parallelism/test_configurable_parallel_mp.py index 93b66cee225a..d17f45c0b526 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_mp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_mp.py @@ -7,6 +7,7 @@ import random import numpy as np import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest, DistributedFixture from unit.megatron_model import get_gpt2_model, get_megatron_version @@ -42,7 +43,7 @@ def reset_random(self, seed=1234): random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) + get_accelerator().manual_seed_all(seed) @pytest.fixture def inputs(self, bs=1, seq_len=20): @@ -70,7 +71,10 @@ def test_gpt2_basic(self, tmpdir, inputs): model = get_deepspeed_model(model) model.eval() - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + device_name = get_accelerator().device_name() + baseline = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) tag = 'mp_1' state_dict = {} @@ -99,7 +103,10 @@ def test_gpt2_mp2_no_resize(self, tmpdir, inputs): model.eval() - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + device_name = get_accelerator().device_name() + baseline = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) tag = 'mp_2' state_dict = {} @@ -111,7 +118,10 @@ def test_gpt2_mp2_no_resize(self, tmpdir, inputs): load_optimizer_states=False, load_lr_scheduler_states=False) - test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + device_name = get_accelerator().device_name() + test = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" @@ -133,7 +143,10 @@ def run(self, inputs, class_tmpdir): model.eval() with torch.no_grad(): - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + 
device_name = get_accelerator().device_name() + baseline = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) if dist.get_rank() == 0: save_path = os.path.join(class_tmpdir, "output.pt") torch.save(baseline.cpu(), save_path) @@ -164,7 +177,10 @@ def test(self, baseline_mp2, inputs, class_tmpdir): model.load_checkpoint(class_tmpdir, load_optimizer_states=False, load_lr_scheduler_states=False) - test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + device_name = get_accelerator().device_name() + test = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) if dist.get_rank() == 0: load_path = os.path.join(class_tmpdir, "output.pt") baseline = torch.load(load_path) diff --git a/tests/unit/model_parallelism/test_configurable_parallel_pp.py b/tests/unit/model_parallelism/test_configurable_parallel_pp.py index 33fd33ff2975..af091d68c411 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_pp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_pp.py @@ -11,6 +11,7 @@ from unit.megatron_model import get_megatron_version from unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe from deepspeed.utils import RepeatingLoader +from deepspeed.accelerator import get_accelerator TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) @@ -33,7 +34,7 @@ def get_deepspeed_model(model): model, _, _,_ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=ds_config_dict) - return model.cuda() + return model.to(get_accelerator().device_name()) def get_topology(mp, pp, world_size): @@ -52,7 +53,7 @@ def reset_random(self, seed=1234): random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) + get_accelerator().manual_seed_all(seed) @pytest.fixture def inputs(self, bs=1, seq_len=1, hidden_size=128): @@ -155,7 +156,7 @@ def run(self, inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size): model = get_deepspeed_model(gpt2_pipe_model) with torch.no_grad(): - inputs = [x.cuda() for x in inputs] + inputs = [x.to(get_accelerator().device_name()) for x in inputs] if model.is_first_stage() or model.is_last_stage(): loader = RepeatingLoader([(inputs[0], 0)]) data_iter = iter(loader) @@ -225,7 +226,7 @@ def _test(self, tag=checkpoint_tag, load_optimizer_states=False, load_lr_scheduler_states=False) - inputs = [x.cuda() for x in inputs] + inputs = [x.to(get_accelerator().device_name()) for x in inputs] if model.is_first_stage() or model.is_last_stage(): loader = RepeatingLoader([(inputs[0], 0)]) data_iter = iter(loader) diff --git a/tests/unit/modeling.py b/tests/unit/modeling.py index 2624e0e2eedb..94dea45468bc 100644 --- a/tests/unit/modeling.py +++ b/tests/unit/modeling.py @@ -43,6 +43,7 @@ #from numba import cuda #from deepspeed_cuda import DeepSpeedSoftmaxConfig, DeepSpeedSoftmax +from deepspeed.accelerator import get_accelerator logger = logging.getLogger(__name__) @@ -184,8 +185,8 @@ def swish(x): class GPUTimer: def __init__(self): super().__init__() - self.start = cuda.event() # noqa: F821 - self.stop = cuda.event() # noqa: F821 + self.start = get_accelerator().Event() # noqa: F821 + self.stop = get_accelerator().Event() # noqa: F821 def record(self): self.start.record() @@ -749,12 +750,12 @@ def __init__(self, config, bert_model_embedding_weights): def forward(self, hidden_states): hidden_states = self.transform(hidden_states) - torch.cuda.nvtx.range_push( + 
get_accelerator().range_push( "decoder input.size() = {}, weight.size() = {}".format( hidden_states.size(), self.decoder.weight.size())) hidden_states = self.decoder(hidden_states) + self.bias - torch.cuda.nvtx.range_pop() + get_accelerator().range_pop() return hidden_states @@ -884,7 +885,7 @@ def from_pretrained(cls, weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) state_dict = torch.load( weights_path, - map_location='cpu' if not torch.cuda.is_available() else None) + map_location='cpu' if not get_accelerator().is_available() else None) if tempdir: # Clean up temp dir shutil.rmtree(tempdir) diff --git a/tests/unit/modelingpreln.py b/tests/unit/modelingpreln.py index e9947b2079b3..0069add9aa4d 100644 --- a/tests/unit/modelingpreln.py +++ b/tests/unit/modelingpreln.py @@ -39,6 +39,7 @@ from torch.nn import Module import torch.nn.functional as F import torch.nn.init as init +from deepspeed.accelerator import get_accelerator #from numba import cuda @@ -184,8 +185,8 @@ def swish(x): class GPUTimer: def __init__(self): super().__init__() - self.start = cuda.event() # noqa: F821 - self.stop = cuda.event() # noqa: F821 + self.start = get_accelerator().Event() # noqa: F821 + self.stop = get_accelerator().Event() # noqa: F821 def record(self): self.start.record() @@ -844,12 +845,12 @@ def __init__(self, config, bert_model_embedding_weights): def forward(self, hidden_states): hidden_states = self.transform(hidden_states) - torch.cuda.nvtx.range_push( + get_accelerator().range_push( "decoder input.size() = {}, weight.size() = {}".format( hidden_states.size(), self.decoder.weight.size())) hidden_states = self.decoder(hidden_states) + self.bias - torch.cuda.nvtx.range_pop() + get_accelerator().range_pop() return hidden_states @@ -979,7 +980,7 @@ def from_pretrained(cls, weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) state_dict = torch.load( weights_path, - map_location='cpu' if not torch.cuda.is_available() else None) + map_location='cpu' if not get_accelerator().is_available() else None) if tempdir: # Clean up temp dir shutil.rmtree(tempdir) diff --git a/tests/unit/ops/cuda/test_cuda_backward.py b/tests/unit/ops/accelerators/test_accelerator_backward.py similarity index 95% rename from tests/unit/ops/cuda/test_cuda_backward.py rename to tests/unit/ops/accelerators/test_accelerator_backward.py index c7a460161792..ad26daeb698c 100644 --- a/tests/unit/ops/cuda/test_cuda_backward.py +++ b/tests/unit/ops/accelerators/test_accelerator_backward.py @@ -8,6 +8,7 @@ import copy from torch import nn from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +from deepspeed.accelerator import get_accelerator from unit.modeling import BertConfig, BertLayerNorm, BertEncoder as BertEncoderPostln from unit.modelingpreln import BertEncoder as BertEncoderPreln from unit.common import DistributedTest @@ -84,7 +85,7 @@ def zero_grad(variables): variable.grad.zero_() -device = torch.device("cuda") +device = torch.device(get_accelerator().device_name()) kwargs_fp32 = {'dtype': torch.float, 'device': device, 'requires_grad': True} kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} @@ -210,8 +211,8 @@ def create_models(ds_config): bert_encoder.half() ds_encoder.half() - bert_encoder.cuda() - ds_encoder.cuda() + bert_encoder.to(get_accelerator().device_name()) + ds_encoder.to(get_accelerator().device_name()) return bert_encoder, ds_encoder @@ -288,9 +289,9 @@ def test_backward(self, is_preln, use_fp16, atol): - # Only run fp16 test cases on devices with 7+ 
capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and (use_fp16 is True or is_preln is False): + # Only run fp16 test cases on devices with FP16 capability. + if not get_accelerator().is_fp16_supported() and (use_fp16 is True + or is_preln is False): return ds_config = DeepSpeedTransformerConfig() @@ -322,9 +323,8 @@ def test_backward(self, # is_preln, # use_fp16, # atol): - # # Only run fp16 test cases on devices with 7+ capability. - # major, _ = torch.cuda.get_device_capability() - # if major < 7 and (use_fp16 is True or is_preln is False): + # # Only run fp16 test cases on devices with FP16 capability. + # if not get_accelerator().is_fp16_supported() and use_fp16 is True: # return # # ds_config = DeepSpeedTransformerConfig() diff --git a/tests/unit/ops/cuda/test_cuda_forward.py b/tests/unit/ops/accelerators/test_accelerator_forward.py similarity index 95% rename from tests/unit/ops/cuda/test_cuda_forward.py rename to tests/unit/ops/accelerators/test_accelerator_forward.py index 3958a220b35b..317e2fe3cb45 100644 --- a/tests/unit/ops/cuda/test_cuda_forward.py +++ b/tests/unit/ops/accelerators/test_accelerator_forward.py @@ -10,6 +10,7 @@ from unit.modelingpreln import BertEncoder as BertEncoderPreln from unit.modeling import BertLayerNorm, BertConfig, BertEncoder as BertEncoderPostln from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -31,7 +32,7 @@ def zero_grad(variables): variable.grad.zero_() -device = torch.device("cuda") +device = torch.device(get_accelerator().device_name()) kwargs_fp32 = {'dtype': torch.float, 'device': device, 'requires_grad': True} kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} @@ -150,8 +151,8 @@ def create_models(ds_config): bert_encoder.half() ds_encoder.half() - bert_encoder.cuda() - ds_encoder.cuda() + bert_encoder.to(get_accelerator().device_name()) + ds_encoder.to(get_accelerator().device_name()) return bert_encoder, ds_encoder @@ -241,9 +242,8 @@ def test_forward(self, num_layers, is_preln, use_fp16): - # Only run fp16 test cases on devices with 7+ capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: + # Only run fp16 test cases on devices with FP16 capability. + if not get_accelerator().is_fp16_supported() and use_fp16 is True: return ds_config = DeepSpeedTransformerConfig() @@ -281,9 +281,8 @@ def test_forward_with_small_bsz(self, num_layers, is_preln, use_fp16): - # Only run fp16 test cases on devices with 7+ capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: + # Only run fp16 test cases on devices with FP16 capability. + if not get_accelerator().is_fp16_supported() and use_fp16 is True: return ds_config = DeepSpeedTransformerConfig() @@ -319,9 +318,8 @@ def test_forward_stochastic(self, num_layers, is_preln, use_fp16): - # Only run fp16 test cases on devices with 7+ capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: + # Only run fp16 test cases on devices with FP16 capability. 
+ if not get_accelerator().is_fp16_supported() and use_fp16 is True: return ds_config = DeepSpeedTransformerConfig() diff --git a/tests/unit/ops/adagrad/test_cpu_adagrad.py b/tests/unit/ops/adagrad/test_cpu_adagrad.py index b5dc7dea1be3..17001e6bd021 100644 --- a/tests/unit/ops/adagrad/test_cpu_adagrad.py +++ b/tests/unit/ops/adagrad/test_cpu_adagrad.py @@ -6,6 +6,7 @@ import deepspeed from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import CPUAdagradBuilder from unit.common import DistributedTest @@ -26,7 +27,7 @@ def check_equal(first, second, atol=1e-2, verbose=False): class TestCPUAdagrad(DistributedTest): world_size = 1 requires_cuda_env = False - if not torch.cuda.is_available(): + if not get_accelerator().is_available(): init_distributed = False set_dist_env = False @@ -138,7 +139,7 @@ def gen_sparse_grad(vocabulary_size, dim, num_indices, dtype, device): class TestCPUAdagradGPUError(DistributedTest): def test_cpu_adagrad_gpu_error(self): model_size = 64 - device = 'cuda:0' + device = get_accelerator().device_name(0) # 'cuda:0' or 'xpu:0' param = torch.nn.Parameter(torch.randn(model_size, device=device)) optimizer = DeepSpeedCPUAdagrad([param]) diff --git a/tests/unit/ops/adam/test_cpu_adam.py b/tests/unit/ops/adam/test_cpu_adam.py index a9a235d57612..d10fb98105a8 100644 --- a/tests/unit/ops/adam/test_cpu_adam.py +++ b/tests/unit/ops/adam/test_cpu_adam.py @@ -6,6 +6,7 @@ from cpuinfo import get_cpu_info import deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.adam import FusedAdam from deepspeed.ops.op_builder import CPUAdamBuilder from unit.common import DistributedTest @@ -56,11 +57,11 @@ def _compare_optimizers(model_size, param1, optimizer1, param2, optimizer2): class TestCPUAdam(DistributedTest): world_size = 1 requires_cuda_env = False - if not torch.cuda.is_available(): + if not get_accelerator().is_available(): init_distributed = False set_dist_env = False - @pytest.mark.skipif(not torch.cuda.is_available(), + @pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.") def test_fused_adam_equal(self, dtype, model_size): if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): @@ -70,7 +71,7 @@ def test_fused_adam_equal(self, dtype, model_size): cpu_data = torch.randn(model_size, device='cpu').to(dtype) cpu_param = torch.nn.Parameter(cpu_data) - cuda_param = torch.nn.Parameter(cpu_data.cuda()) + cuda_param = torch.nn.Parameter(cpu_data.to(get_accelerator().device_name())) # tolerance = cpu_param.float().norm().detach().numpy() * 1e-2 # check_equal(cpu_param.float().norm(), @@ -88,10 +89,10 @@ def test_fused_adam_equal(self, dtype, model_size): optimizer2=cuda_optimizer) def test_torch_adamw_equal(self, dtype, model_size): - if torch.cuda.is_available(): + if get_accelerator().is_available(): if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): pytest.skip("cpu-adam with half precision not supported on AMD CPUs") - ref_param_device = 'cuda' + ref_param_device = get_accelerator().device_name() else: if dtype == torch.half: pytest.skip( @@ -119,7 +120,7 @@ class TestCPUAdamGPUError(DistributedTest): def test_cpu_adam_gpu_error(self): model_size = 64 from deepspeed.ops.adam import DeepSpeedCPUAdam - device = 'cuda:0' + device = get_accelerator().device_name(0) # 'cuda:0' or 'xpu:0' param = torch.nn.Parameter(torch.randn(model_size, device=device)) optimizer = DeepSpeedCPUAdam([param]) diff --git 
a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py index 886354c38935..86265ab15ef9 100644 --- a/tests/unit/ops/aio/test_aio.py +++ b/tests/unit/ops/aio/test_aio.py @@ -6,7 +6,8 @@ import torch import deepspeed import deepspeed.comm as dist -from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import AsyncIOBuilder from unit.common import DistributedTest KILO_BYTE = 1024 @@ -20,7 +21,7 @@ def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True): - if not torch.cuda.is_available(): + if not get_accelerator().is_available(): if use_cuda_device: pytest.skip("GPU tensors only supported in CUDA environments.") if use_cuda_pinned_tensor: @@ -28,7 +29,7 @@ def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=T def _get_local_rank(): - if torch.cuda.is_available(): + if get_accelerator().is_available(): return dist.get_rank() return 0 @@ -50,14 +51,14 @@ def _get_test_write_file(tmpdir, index): def _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer, index=0): test_file = _get_test_write_file(tmpdir, index) - test_buffer = torch.cuda.ByteTensor(list(ref_buffer)) + test_buffer = get_accelerator().ByteTensor(list(ref_buffer)) return test_file, test_buffer def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, index=0): test_file = _get_test_write_file(tmpdir, index) if aio_handle is None: - test_buffer = torch.ByteTensor(list(ref_buffer)).pin_memory() + test_buffer = get_accelerator().pin_memory(torch.ByteTensor(list(ref_buffer))) else: tmp_buffer = torch.ByteTensor(list(ref_buffer)) test_buffer = aio_handle.new_cpu_locked_tensor(len(ref_buffer), tmp_buffer) @@ -80,7 +81,7 @@ def _validate_handle_state(handle, single_submit, overlap_events): class TestRead(DistributedTest): world_size = 1 requires_cuda_env = False - if not torch.cuda.is_available(): + if not get_accelerator().is_available(): init_distributed = False set_dist_env = False @@ -99,9 +100,10 @@ def test_parallel_read(self, IO_PARALLEL) if use_cuda_pinned_tensor: - aio_buffer = torch.empty(IO_SIZE, - dtype=torch.uint8, - device='cpu').pin_memory() + aio_buffer = get_accelerator().pin_memory( + torch.empty(IO_SIZE, + dtype=torch.uint8, + device='cpu')) else: aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, @@ -138,11 +140,14 @@ def test_async_read(self, IO_PARALLEL) if cuda_device: - aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device='cuda') - elif use_cuda_pinned_tensor: aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, - device='cpu').pin_memory() + device=get_accelerator().device_name()) + elif use_cuda_pinned_tensor: + aio_buffer = get_accelerator().pin_memory( + torch.empty(IO_SIZE, + dtype=torch.uint8, + device='cpu')) else: aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, @@ -172,7 +177,7 @@ def test_async_read(self, class TestWrite(DistributedTest): world_size = 1 requires_cuda_env = False - if not torch.cuda.is_available(): + if not get_accelerator().is_available(): init_distributed = False set_dist_env = False @@ -258,7 +263,7 @@ def test_async_write(self, class TestAsyncQueue(DistributedTest): world_size = 1 requires_cuda_env = False - if not torch.cuda.is_available(): + if not get_accelerator().is_available(): init_distributed = False set_dist_env = False @@ -285,13 +290,15 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): aio_buffers = [ torch.empty(IO_SIZE, dtype=torch.uint8, - device='cuda') 
for _ in range(async_queue) + device=get_accelerator().device_name()) + for _ in range(async_queue) ] elif use_cuda_pinned_tensor: aio_buffers = [ - torch.empty(IO_SIZE, - dtype=torch.uint8, - device='cpu').pin_memory() for _ in range(async_queue) + get_accelerator().pin_memory( + torch.empty(IO_SIZE, + dtype=torch.uint8, + device='cpu')) for _ in range(async_queue) ] else: tmp_tensor = torch.empty(0, dtype=torch.uint8) diff --git a/tests/unit/ops/quantizer/test_dequantize.py b/tests/unit/ops/quantizer/test_dequantize.py index c211b3411a29..5dc2f7d68a70 100644 --- a/tests/unit/ops/quantizer/test_dequantize.py +++ b/tests/unit/ops/quantizer/test_dequantize.py @@ -5,6 +5,7 @@ import pytest import torch from deepspeed.ops import op_builder +from deepspeed.accelerator import get_accelerator quantize_module = None @@ -75,7 +76,7 @@ def test_dequantize(num_elems, num_groups, is_symmetric_quant, q_bits): activations = torch.randn((num_groups, num_elems), dtype=torch.float16, - device='cuda') + device=get_accelerator().device_name()) quantized_data, params = run_quantize(activations, num_groups, q_bits, is_symmetric_quant) ds_dequant = run_dequantize(quantized_data, diff --git a/tests/unit/ops/quantizer/test_fake_quantization.py b/tests/unit/ops/quantizer/test_fake_quantization.py index 62b154d34ff8..c5304f7694ee 100644 --- a/tests/unit/ops/quantizer/test_fake_quantization.py +++ b/tests/unit/ops/quantizer/test_fake_quantization.py @@ -2,6 +2,7 @@ import torch import pytest +from deepspeed.accelerator import get_accelerator from deepspeed.ops import op_builder quantizer_cuda_module = None @@ -44,7 +45,8 @@ def run_quant_dequant(inputs, groups, bits): # Note that we have an explicit boundary for groups as ((size / groups) - 1) / 4096 + 1) <= MAX_REG. def test_fake_quant_dequant(tensor_shape, groups): - input_tensor = torch.rand((tensor_shape), dtype=torch.float16).cuda() + input_tensor = torch.rand((tensor_shape), + dtype=torch.float16).to(get_accelerator().device_name()) # 8-bit quantization. 
ref_input_8bit = input_tensor.clone().detach() diff --git a/tests/unit/ops/quantizer/test_quantize.py b/tests/unit/ops/quantizer/test_quantize.py index 1aa2c79643e6..3cfd812e63f9 100644 --- a/tests/unit/ops/quantizer/test_quantize.py +++ b/tests/unit/ops/quantizer/test_quantize.py @@ -5,6 +5,7 @@ import pytest import torch from deepspeed.ops import op_builder +from deepspeed.accelerator import get_accelerator inference_module = None @@ -27,8 +28,8 @@ def get_q_props(q_bits): q_min = -(2**(q_bits - 1)) q_max = (2**(q_bits - 1) - 1) - q_min = torch.IntTensor([q_min]).to(device='cuda') - q_max = torch.IntTensor([q_max]).to(device='cuda') + q_min = torch.IntTensor([q_min]).to(device=get_accelerator().device_name()) + q_max = torch.IntTensor([q_max]).to(device=get_accelerator().device_name()) return q_range, q_max, q_min @@ -46,7 +47,9 @@ def get_scale_zero_point(q_bits, scale = torch.empty_like(absmax) for i, x in enumerate(absmax): scale[i] = torch.ones_like(x) if x == 0 else q_range / (2 * x) - zero_point = torch.zeros(scale.shape, dtype=torch.float32, device='cuda') + zero_point = torch.zeros(scale.shape, + dtype=torch.float32, + device=get_accelerator().device_name()) else: scale = torch.empty_like(max) for i, x in enumerate(max): @@ -125,12 +128,12 @@ def test_float_quantize(num_elems, activations_ds = torch.zeros((num_groups, num_elems), dtype=torch.float16, - device='cuda') + device=get_accelerator().device_name()) else: activations_ds = torch.randn((num_groups, num_elems), dtype=torch.float16, - device='cuda') + device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() ref_out_tensor, ref_params = run_float_quantize(q_bits, is_symmetric_quant, activations_ref, num_groups) diff --git a/tests/unit/ops/sparse_attention/test_sparse_attention.py b/tests/unit/ops/sparse_attention/test_sparse_attention.py index d663698beb3e..a4fc49354739 100644 --- a/tests/unit/ops/sparse_attention/test_sparse_attention.py +++ b/tests/unit/ops/sparse_attention/test_sparse_attention.py @@ -8,6 +8,7 @@ import pytest import torch import deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import SparseAttnBuilder if not deepspeed.ops.__compatible_ops__[SparseAttnBuilder.NAME]: @@ -94,7 +95,13 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo if layout is None: layout = make_layout(rho, (H, M // block, N // block)) if dense_x: - x = torch.rand((Z, H, M, N), dtype=dtype, requires_grad=True, device='cuda') + x = torch.rand((Z, + H, + M, + N), + dtype=dtype, + requires_grad=True, + device=get_accelerator().device_name()) else: x = torch.rand((Z, layout.sum(), @@ -102,7 +109,7 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo block), dtype=dtype, requires_grad=True, - device='cuda') + device=get_accelerator().device_name()) dx = torch.rand_like(x) bool_attn_mask = torch.randint(low=0, high=2, @@ -110,7 +117,7 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo N), dtype=torch.bool, requires_grad=False, - device='cuda') + device=get_accelerator().device_name()) fp_attn_mask = bool_attn_mask.type(dtype) kp_mask = torch.randint(low=0, high=2, @@ -118,20 +125,24 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo N), dtype=dtype, requires_grad=False, - device='cuda') + device=get_accelerator().device_name()) kp_mask[kp_mask == 1.] 
= float('-inf') return layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask def _skip_on_cuda_compatability(): - if torch.cuda.get_device_capability()[0] < 7: - pytest.skip("needs higher compute capability than 7") - cuda_major = int(torch.version.cuda.split('.')[0]) * 10 - cuda_minor = int(torch.version.cuda.split('.')[1]) - cuda_version = cuda_major + cuda_minor - if (cuda_version != 101 and cuda_version != 102) and \ - (cuda_version != 111 and cuda_version != 110): - pytest.skip("requires cuda 10.1 or 10.2 or 11.0 or 11.1") + if deepspeed.accelerator.get_accelerator().device_name() == 'cuda': + if torch.cuda.get_device_capability()[0] < 7: + pytest.skip("needs higher compute capability than 7") + cuda_major = int(torch.version.cuda.split('.')[0]) * 10 + cuda_minor = int(torch.version.cuda.split('.')[1]) + cuda_version = cuda_major + cuda_minor + if (cuda_version != 101 and cuda_version != 102) and \ + (cuda_version != 111 and cuda_version != 110): + pytest.skip("requires cuda 10.1 or 10.2 or 11.0 or 11.1") + else: + assert deepspeed.accelerator.get_accelerator().device_name() == 'xpu' + return @pytest.mark.parametrize("block", [16, 32]) @@ -195,9 +206,21 @@ def init_matmul_inputs(Z, H, M, N, K, rho, mode, trans_a, trans_b, block, dtype, BS0 = N if trans_b else K BS1 = K if trans_b else N shape = {'sdd': (M, N), 'dsd': (AS0, AS1), 'dds': (BS0, BS1)}[mode] - x = torch.rand((Z, H, AS0, AS1), dtype=dtype, requires_grad=True, device='cuda') - w = torch.rand((Z, H, BS0, BS1), dtype=dtype, requires_grad=True, device='cuda') - dy = torch.rand((Z, H, M, N), dtype=dtype, device='cuda') + x = torch.rand((Z, + H, + AS0, + AS1), + dtype=dtype, + requires_grad=True, + device=get_accelerator().device_name()) + w = torch.rand((Z, + H, + BS0, + BS1), + dtype=dtype, + requires_grad=True, + device=get_accelerator().device_name()) + dy = torch.rand((Z, H, M, N), dtype=dtype, device=get_accelerator().device_name()) if layout is None: layout = make_layout(rho, (H, shape[0] // block, shape[1] // block)) else: diff --git a/tests/unit/ops/spatial/test_nhwc_bias_add.py b/tests/unit/ops/spatial/test_nhwc_bias_add.py index c863d0f6f1d9..f3a31cf47ba4 100644 --- a/tests/unit/ops/spatial/test_nhwc_bias_add.py +++ b/tests/unit/ops/spatial/test_nhwc_bias_add.py @@ -5,6 +5,7 @@ import pytest import torch from deepspeed.ops.transformer.inference.bias_add import nhwc_bias_add +from deepspeed.accelerator import get_accelerator def allclose(x, y): @@ -40,13 +41,16 @@ def ref_bias_add(activations, bias): @pytest.mark.parametrize("image_size", [16, 32, 64]) @pytest.mark.parametrize("channels", channels_list) def test_bias_add(batch, image_size, channels): - activations = torch.randn((batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device="cuda").to(memory_format=torch.channels_last) - bias = torch.randn((channels), dtype=torch.float16, device="cuda") + activations = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), + dtype=torch.float16, + device=get_accelerator().device_name()) ref_vals = ref_bias_add(activations.clone().detach(), bias) ds_vals = nhwc_bias_add(activations, bias) @@ -63,19 +67,23 @@ def ref_bias_add_add(activations, bias, other): @pytest.mark.parametrize("image_size", [16, 32, 64]) @pytest.mark.parametrize("channels", channels_list) def test_bias_add_add(batch, image_size, channels): - activations = torch.randn((batch, - channels, - 
image_size, - image_size), - dtype=torch.float16, - device="cuda").to(memory_format=torch.channels_last) - other = torch.randn((batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device="cuda").to(memory_format=torch.channels_last) - bias = torch.randn((channels), dtype=torch.float16, device="cuda") + activations = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + other = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), + dtype=torch.float16, + device=get_accelerator().device_name()) ref_vals = ref_bias_add_add(activations.clone().detach(), bias, other) ds_vals = nhwc_bias_add(activations, bias, other=other) @@ -98,20 +106,26 @@ def ref_bias_add_bias_add(activations, bias, other, other_bias): @pytest.mark.parametrize("image_size", [16, 32, 64]) @pytest.mark.parametrize("channels", channels_list) def test_bias_add_bias_add(batch, image_size, channels): - activations = torch.randn((batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device="cuda").to(memory_format=torch.channels_last) - other = torch.randn((batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device="cuda").to(memory_format=torch.channels_last) - bias = torch.randn((channels), dtype=torch.float16, device="cuda") - other_bias = torch.randn((channels), dtype=torch.float16, device="cuda") + activations = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + other = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), + dtype=torch.float16, + device=get_accelerator().device_name()) + other_bias = torch.randn((channels), + dtype=torch.float16, + device=get_accelerator().device_name()) ref_vals = ref_bias_add_bias_add(activations.clone().detach(), bias, diff --git a/tests/unit/ops/transformer/inference/test_bias_add.py b/tests/unit/ops/transformer/inference/test_bias_add.py index f8d759c3f567..f3475a14766d 100644 --- a/tests/unit/ops/transformer/inference/test_bias_add.py +++ b/tests/unit/ops/transformer/inference/test_bias_add.py @@ -3,6 +3,7 @@ import pytest import torch import deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: @@ -39,8 +40,14 @@ def run_bias_add_ds(activations, bias): @pytest.mark.parametrize("channels", [512, 1232, 4096]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) def test_bias_add(batch, sequence, channels, dtype): - activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') - bias_ds = torch.randn((channels), dtype=dtype, device='cuda') + activations_ds = torch.randn((batch, + sequence, + channels), + dtype=dtype, + device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), + dtype=dtype, + device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() bias_ref = bias_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_bias_geglu.py 
b/tests/unit/ops/transformer/inference/test_bias_geglu.py index a9f221488a4c..c70974e51d94 100644 --- a/tests/unit/ops/transformer/inference/test_bias_geglu.py +++ b/tests/unit/ops/transformer/inference/test_bias_geglu.py @@ -6,6 +6,7 @@ import torch import deepspeed from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.accelerator import get_accelerator if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", @@ -43,8 +44,14 @@ def run_bias_geglu_ds(activation, bias): @pytest.mark.parametrize("channels", [512, 1232, 4096]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_bias_geglu(batch, sequence, channels, dtype): - activation = torch.randn((batch, sequence, channels * 2), dtype=dtype, device='cuda') - bias = torch.randn((channels * 2), dtype=dtype, device='cuda') + activation = torch.randn((batch, + sequence, + channels * 2), + dtype=dtype, + device=get_accelerator().device_name()) + bias = torch.randn((channels * 2), + dtype=dtype, + device=get_accelerator().device_name()) ds_out = run_bias_geglu_ds(activation, bias) ref_out = run_bias_geglu_reference(activation, bias) diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py index a58512c9c83a..3c1762179ead 100644 --- a/tests/unit/ops/transformer/inference/test_bias_gelu.py +++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py @@ -5,6 +5,7 @@ import pytest import torch import deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder from packaging import version as pkg_version @@ -48,8 +49,14 @@ def test_bias_gelu(batch, sequence, channels, dtype): if pkg_version.parse(torch.__version__) < pkg_version.parse("1.12"): pytest.skip("gelu implementation matches only after torch 1.12") - activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') - bias_ds = torch.randn((channels), dtype=dtype, device='cuda') + activations_ds = torch.randn((batch, + sequence, + channels), + dtype=dtype, + device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), + dtype=dtype, + device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() bias_ref = bias_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_bias_relu.py b/tests/unit/ops/transformer/inference/test_bias_relu.py index 932b02c01bfa..e2b66f6bd2e1 100644 --- a/tests/unit/ops/transformer/inference/test_bias_relu.py +++ b/tests/unit/ops/transformer/inference/test_bias_relu.py @@ -5,6 +5,7 @@ import pytest import torch import deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: @@ -43,8 +44,14 @@ def run_bias_relu_ds(activations, bias): @pytest.mark.parametrize("channels", [512, 1232, 4096]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_bias_relu(batch, sequence, channels, dtype): - activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') - bias_ds = torch.randn((channels), dtype=dtype, device='cuda') + activations_ds = torch.randn((batch, + sequence, + channels), + dtype=dtype, + device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), + dtype=dtype, + device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() bias_ref = 
bias_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_layer_norm.py b/tests/unit/ops/transformer/inference/test_layer_norm.py index 3f8ddcd9615b..61f6455629e6 100644 --- a/tests/unit/ops/transformer/inference/test_layer_norm.py +++ b/tests/unit/ops/transformer/inference/test_layer_norm.py @@ -5,6 +5,7 @@ import deepspeed import torch import pytest +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: @@ -48,9 +49,13 @@ def test_layer_norm(batch, seq_len, channels, dtype): seq_len, channels), dtype=dtype, - device=torch.cuda.current_device()) - gamma = torch.randn((channels), dtype=dtype, device=torch.cuda.current_device()) - beta = torch.rand((channels), dtype=dtype, device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + beta = torch.rand((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) epsilon = 1e-5 ref_output = ref_implementation(vals, gamma, beta, epsilon, channels, dtype) @@ -89,15 +94,21 @@ def test_layer_norm_residual(batch, seq_len, channels, dtype): seq_len, channels), dtype=dtype, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) residual = torch.randn((batch, seq_len, channels), dtype=dtype, - device=torch.cuda.current_device()) - bias = torch.randn((channels), dtype=dtype, device=torch.cuda.current_device()) - gamma = torch.randn((channels), dtype=dtype, device=torch.cuda.current_device()) - beta = torch.rand((channels), dtype=dtype, device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) + bias = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + beta = torch.rand((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) epsilon = 1e-5 new_output = residual_ds_implementation(vals, bias, residual, gamma, beta, epsilon) @@ -158,15 +169,21 @@ def test_layer_norm_residual_store_pre_ln_res(batch, seq_len, channels, dtype): seq_len, channels), dtype=dtype, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) residual = torch.randn((batch, seq_len, channels), dtype=dtype, - device=torch.cuda.current_device()) - bias = torch.randn((channels), dtype=dtype, device=torch.cuda.current_device()) - gamma = torch.randn((channels), dtype=dtype, device=torch.cuda.current_device()) - beta = torch.rand((channels), dtype=dtype, device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) + bias = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + beta = torch.rand((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) epsilon = 1e-5 # Need to run the reference first since there's an in-place component to ours diff --git a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py index defdc99ac508..fdd6e8607c71 100644 --- a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py +++ b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py @@ -5,6 +5,7 @@ import pytest import torch import 
deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: @@ -41,10 +42,22 @@ def test_moe_residual_matmul(hidden_dim, c, dtype): hidden_dim * c, hidden_dim), dtype=dtype, - device='cuda') - coeff1 = torch.randn((1, 1, hidden_dim), dtype=dtype, device='cuda') - coeff2 = torch.randn((1, 1, hidden_dim), dtype=dtype, device='cuda') - out_ds = torch.randn((c, hidden_dim * c, hidden_dim), dtype=dtype, device='cuda') + device=get_accelerator().device_name()) + coeff1 = torch.randn((1, + 1, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + coeff2 = torch.randn((1, + 1, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + out_ds = torch.randn((c, + hidden_dim * c, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) coeff_ds = torch.cat((coeff1, coeff2), dim=-1) residual_ref = residual_ds.clone().detach() coeff_ref = coeff_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_residual_add.py b/tests/unit/ops/transformer/inference/test_residual_add.py index e5d4f08c50f4..0dacee355369 100644 --- a/tests/unit/ops/transformer/inference/test_residual_add.py +++ b/tests/unit/ops/transformer/inference/test_residual_add.py @@ -5,6 +5,7 @@ import pytest import torch import deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: @@ -95,11 +96,27 @@ def test_residual_add(inference_module, add_bias, mp_size, pre_attn_norm): - ds_out = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device='cuda') - residual = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device='cuda') - attn_output = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device='cuda') - final_bias = torch.randn((hidden_dim), dtype=dtype, device='cuda') - attn_bias = torch.randn((hidden_dim), dtype=dtype, device='cuda') + ds_out = torch.randn((batch, + sequence, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + residual = torch.randn((batch, + sequence, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + attn_output = torch.randn((batch, + sequence, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + final_bias = torch.randn((hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + attn_bias = torch.randn((hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) ref_out = ds_out.clone() ref_out = run_residual_add_reference(ref_out, diff --git a/tests/unit/pipe/test_pipe_module.py b/tests/unit/pipe/test_pipe_module.py index 5454c7f8969e..e8404b0d5a17 100644 --- a/tests/unit/pipe/test_pipe_module.py +++ b/tests/unit/pipe/test_pipe_module.py @@ -11,6 +11,7 @@ import deepspeed from deepspeed.pipe import PipelineModule from deepspeed.utils import RepeatingLoader +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -72,7 +73,8 @@ def test(self, sequential_model, simple_config, batch_input): # Ensure all parameters are accounted for. 
my_params = sum(p.numel() for p in pipe_model.parameters()) - total_pipe_params = torch.LongTensor([my_params]).to('cuda') + total_pipe_params = torch.LongTensor([my_params + ]).to(get_accelerator().device_name()) dist.all_reduce(total_pipe_params) total_pipe_params = total_pipe_params.item() assert total_pipe_params == base_params @@ -83,7 +85,7 @@ def test(self, sequential_model, simple_config, batch_input): model_parameters=[p for p in pipe_model.parameters()]) if pipe_model.is_first_stage or pipe_model.is_last_stage: - pipe_input = base_input.clone().detach().to('cuda') + pipe_input = base_input.clone().detach().to(get_accelerator().device_name()) # label 0 is meaningless dataset = [(pipe_input, 0)] loader = RepeatingLoader(dataset) diff --git a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py index f1d0c79d05d5..af354fe1caa6 100644 --- a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py +++ b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py @@ -5,6 +5,7 @@ import pytest import torch import deepspeed +from deepspeed.accelerator import get_accelerator from copy import deepcopy from unit.common import DistributedTest @@ -38,7 +39,7 @@ def _prep_inputs(*inputs): for inp in inputs: inp = deepcopy(inp) if torch.is_tensor(inp): - inp = inp.cuda() + inp = inp.to(get_accelerator().device_name()) _inputs.append(inp) return tuple(_inputs) @@ -59,7 +60,7 @@ def _match_outputs(ref, tgt): def _test_activation_checkpoint(module, *inputs): # Move to device - module.cuda() + module.to(get_accelerator().device_name()) # Get rid of dropouts until we fork the RNG between tests. module.eval() @@ -79,7 +80,7 @@ def _test_activation_checkpoint(module, *inputs): def _test_activation_checkpoint_ordering(module, expected_ordering, *inputs): # Move to device - module.cuda() + module.to(get_accelerator().device_name()) # Get rid of dropouts until we fork the RNG between tests. 
module.eval() diff --git a/tests/unit/runtime/comm/test_coalesced_collectives.py b/tests/unit/runtime/comm/test_coalesced_collectives.py index a072eb0803c4..fa1041379a6b 100644 --- a/tests/unit/runtime/comm/test_coalesced_collectives.py +++ b/tests/unit/runtime/comm/test_coalesced_collectives.py @@ -4,6 +4,7 @@ import torch import deepspeed.comm as dist from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -16,7 +17,7 @@ def test_single_input(self): ), dist.get_rank(), dtype=torch.half, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) @@ -24,7 +25,10 @@ def test_single_input(self): assert torch.allclose(output, torch.full_like(output, 0.5)) def test_two_inputs(self): - tensor_kwargs = {"device": torch.cuda.current_device(), "dtype": torch.half} + tensor_kwargs = { + "device": get_accelerator().current_device_name(), + "dtype": torch.half + } inputs = [ dist.get_rank() * torch.arange(0, 6, @@ -52,7 +56,10 @@ class TestReduceScatterCoalescedTensorSmallerThanWorldSize(DistributedTest): world_size = 2 def test(self): - input = torch.zeros((1, ), dtype=torch.half, device=torch.cuda.current_device()) + input = torch.zeros((1, + ), + dtype=torch.half, + device=get_accelerator().current_device_name()) (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py index 6b4a2f5d7989..84a36768174a 100644 --- a/tests/unit/runtime/half_precision/onebit/test_onebit.py +++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py @@ -15,6 +15,7 @@ from unit.common import DistributedTest from unit.simple_model import SimpleModel, random_dataloader from unit.alexnet_model import AlexNetPipe, train_cifar +from deepspeed.accelerator import get_accelerator PipeTopo = PipeDataParallelTopology @@ -48,7 +49,7 @@ def test(self, dtype): "weight_decay": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -91,7 +92,7 @@ def test(self): "weight_decay": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -158,7 +159,7 @@ def test(self, tmpdir): "weight_decay": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -314,7 +315,7 @@ def test_overflow(self, tmpdir): "weight_decay": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -381,7 +382,7 @@ def test(self, topo_config): "weight_decay": 3e-7, "freeze_step": 200, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -436,7 +437,7 @@ def test(self, dtype): "local_step_scaler": 1, "local_step_clipper": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -482,7 +483,7 @@ def test(self): 
"local_step_scaler": 1, "local_step_clipper": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -552,7 +553,7 @@ def test(self, tmpdir): "local_step_scaler": 1, "local_step_clipper": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -707,7 +708,7 @@ def test_overflow(self, tmpdir): "local_step_scaler": 1, "local_step_clipper": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -777,7 +778,7 @@ def test(self, topo_config): "local_step_scaler": 1, "local_step_clipper": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -831,7 +832,7 @@ def test(self, dtype): "min_coeff": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), "coeff_beta": 0.9, "factor_max": 1.0, "factor_min": 0.5, @@ -880,7 +881,7 @@ def test(self): "min_coeff": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), "coeff_beta": 0.9, "factor_max": 1.0, "factor_min": 0.5, @@ -952,7 +953,7 @@ def test(self, tmpdir): "min_coeff": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), "coeff_beta": 0.9, "factor_max": 1.0, "factor_min": 0.5, @@ -1127,7 +1128,7 @@ def test_overflow(self, tmpdir): "min_coeff": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), "coeff_beta": 0.9, "factor_max": 1.0, "factor_min": 0.5, @@ -1198,7 +1199,7 @@ def test(self, topo_config): "weight_decay": 3e-7, "freeze_step": 200, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -1246,7 +1247,7 @@ def test(self, tmpdir): rank = dist.get_rank() backend = NcclBackend() local_rank = dist.get_rank() - device = torch.device("cuda", dist.get_rank()) + device = torch.device(get_accelerator().device_name(), dist.get_rank()) # A simulated compression function using deepspeed.comm def torch_sim(a): @@ -1268,7 +1269,7 @@ def torch_sim(a): [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] - torch.cuda.synchronize() + get_accelerator().synchronize() dist.barrier() return a_server_compressed, worker_error, server_error @@ -1288,7 +1289,7 @@ def torch_sim(a): server_error = torch.zeros(right_server_size, device=device) a_torch, worker_error_torch, server_error_torch = torch_sim(a) - torch.cuda.empty_cache() + get_accelerator().empty_cache() a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) diff --git a/tests/unit/runtime/half_precision/test_fp16.py b/tests/unit/runtime/half_precision/test_fp16.py index 57f6de1be09a..c3c933fca144 100644 --- a/tests/unit/runtime/half_precision/test_fp16.py +++ b/tests/unit/runtime/half_precision/test_fp16.py @@ -6,9 +6,10 @@ import pytest from deepspeed.ops.adam import FusedAdam from 
unit.common import DistributedTest -from deepspeed.ops.op_builder import CPUAdamBuilder from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader, SimpleMoEModel, sequence_dataloader from unit.util import required_torch_version +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import CPUAdamBuilder try: from apex import amp # noqa: F401 @@ -195,7 +196,7 @@ def test_unfused_gradnorm(self, monkeypatch): hidden_dim = 10 def mock_unscale_and_clip_grads(total_norm, apply_scale=True): - torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) + torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) all_gather_results = [ torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) ] @@ -236,7 +237,7 @@ def test_fused_gradnorm(self, monkeypatch): hidden_dim = 10 def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True): - torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) + torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) all_gather_results = [ torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) ] @@ -285,7 +286,7 @@ def test_lamb_gradnorm(self, monkeypatch, fused_lamb_legacy: bool): hidden_dim = 10 def mock_unscale_and_clip_grads(total_norm, apply_scale=True): - torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) + torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) all_gather_results = [ torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) ] diff --git a/tests/unit/runtime/pipe/test_topology.py b/tests/unit/runtime/pipe/test_topology.py index 9c71ce7d72d1..4b0cc42d4336 100644 --- a/tests/unit/runtime/pipe/test_topology.py +++ b/tests/unit/runtime/pipe/test_topology.py @@ -9,6 +9,7 @@ from deepspeed.runtime.pipe.topology import ProcessTopology as Topo from deepspeed.runtime.pipe.topology import _prime_factors +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -175,13 +176,13 @@ def test_grid_pipe_data(self): grid.get_stage_id() == grid.get_pipe_parallel_world_size() - 1) # Test collectives along the pipeline parallel process groups - rank_tensor = torch.LongTensor(data=[rank]).cuda() + rank_tensor = torch.LongTensor(data=[rank]).to(get_accelerator().device_name()) dist.all_reduce(rank_tensor, group=grid.get_pipe_parallel_group()) pipe_group = grid.pp_group assert torch.all(rank_tensor == sum(pipe_group)) # Test collectives along the data parallel process groups - rank_tensor = torch.LongTensor(data=[rank]).cuda() + rank_tensor = torch.LongTensor(data=[rank]).to(get_accelerator().device_name()) dist.all_reduce(rank_tensor, group=grid.get_data_parallel_group()) data_group = grid.dp_group assert torch.all(rank_tensor == sum(data_group)) diff --git a/tests/unit/runtime/test_autocast.py b/tests/unit/runtime/test_autocast.py index 307feb106572..b0d8d8696cb8 100644 --- a/tests/unit/runtime/test_autocast.py +++ b/tests/unit/runtime/test_autocast.py @@ -3,6 +3,7 @@ import pytest import torch from deepspeed.runtime.zero.linear import LinearModuleForZeroStage3 +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -11,31 +12,40 @@ class TestAutoCastDisable(DistributedTest): def test_missing_amp_autocast(self, half_op): hidden_dim = 4 if half_op: - input = torch.randn(hidden_dim).cuda().half() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda().half() + input = torch.randn(hidden_dim).to(get_accelerator().device_name()).half() + 
ds_linear = LinearModuleForZeroStage3( + hidden_dim, + hidden_dim).to(get_accelerator().device_name()).half() else: - input = torch.randn(hidden_dim).cuda() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda() + input = torch.randn(hidden_dim).to(get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, + hidden_dim).to( + get_accelerator().device_name()) output = ds_linear(input) assert output.dtype == ds_linear.weight.dtype def test_disable_autocast_linear(self, half_op): - amp = pytest.importorskip("torch.cuda.amp") + amp = get_accelerator().amp() hidden_dim = 4 if half_op: - input = torch.randn(hidden_dim).cuda().half() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda().half() + input = torch.randn(hidden_dim).to(get_accelerator().device_name()).half() + ds_linear = LinearModuleForZeroStage3( + hidden_dim, + hidden_dim).to(get_accelerator().device_name()).half() else: - input = torch.randn(hidden_dim).cuda() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda() + input = torch.randn(hidden_dim).to(get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, + hidden_dim).to( + get_accelerator().device_name()) with amp.autocast(False): output = ds_linear(input) assert output.dtype == ds_linear.weight.dtype +@pytest.mark.skipif(get_accelerator().amp() is None, reason='amp is not installed') @pytest.mark.parametrize('half_input, half_weight', [(False, False), @@ -47,11 +57,13 @@ def test_disable_autocast_linear(self, half_op): True)]) class TestAutoCastEnable(DistributedTest): def test_autocast_linear(self, tmpdir, half_input, half_weight): - amp = pytest.importorskip("torch.cuda.amp") + amp = get_accelerator().amp() hidden_dim = 4 - input = torch.randn(hidden_dim).cuda() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda() + input = torch.randn(hidden_dim).to(get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, + hidden_dim).to( + get_accelerator().device_name()) if half_input: input = input.half() @@ -61,4 +73,4 @@ def test_autocast_linear(self, tmpdir, half_input, half_weight): with amp.autocast(): output = ds_linear(input) - assert output.dtype == torch.half + assert output.dtype == torch.half or output.dtype == torch.bfloat16 diff --git a/tests/unit/runtime/test_data.py b/tests/unit/runtime/test_data.py index 3bee3dc2d471..ed2fee950bc3 100644 --- a/tests/unit/runtime/test_data.py +++ b/tests/unit/runtime/test_data.py @@ -4,6 +4,7 @@ import torch import pytest import deepspeed +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest from unit.simple_model import SimpleModel, random_dataset @@ -51,8 +52,8 @@ def test(self, train_batch_size, drop_last): training_data=train_dataset, optimizer=optimizer) for n, batch in enumerate(training_dataloader): - x = batch[0].to(torch.cuda.current_device()) - y = batch[1].to(torch.cuda.current_device()) + x = batch[0].to(get_accelerator().current_device_name()) + y = batch[1].to(get_accelerator().current_device_name()) loss = model(x, y) model.backward(loss) model.step() diff --git a/tests/unit/runtime/test_data_efficiency.py b/tests/unit/runtime/test_data_efficiency.py index 74e1222997d2..993e4aa66e20 100644 --- a/tests/unit/runtime/test_data_efficiency.py +++ b/tests/unit/runtime/test_data_efficiency.py @@ -3,6 +3,7 @@ import torch import os import deepspeed +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest from 
unit.simple_model import Curriculum_SimpleModel, SimpleModel, random_dataloader, random_dataset @@ -110,8 +111,8 @@ def data_post_process(data, data_sampler_state_dict): os.makedirs('/tmp') model.set_data_post_process_func(data_post_process) for n, batch in enumerate(data_loader): - x = batch[0].to(torch.cuda.current_device()) - y = batch[1].to(torch.cuda.current_device()) + x = batch[0].to(get_accelerator().current_device_name()) + y = batch[1].to(get_accelerator().current_device_name()) loss = model(x, y) model.backward(loss) model.step() diff --git a/tests/unit/runtime/test_ds_config_dict.py b/tests/unit/runtime/test_ds_config_dict.py index 311517b3e052..54c91a6fc3e6 100644 --- a/tests/unit/runtime/test_ds_config_dict.py +++ b/tests/unit/runtime/test_ds_config_dict.py @@ -2,13 +2,13 @@ # A test on its own import os -import torch import pytest import json import hjson import argparse from deepspeed.runtime.zero.config import DeepSpeedZeroConfig +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest, get_test_path from unit.simple_model import SimpleModel, create_config_from_dict, random_dataloader @@ -22,8 +22,8 @@ class TestBasicConfig(DistributedTest): world_size = 1 - def test_cuda(self): - assert (torch.cuda.is_available()) + def test_accelerator(self): + assert (get_accelerator().is_available()) def test_check_version(self): assert hasattr(deepspeed, "__git_hash__") diff --git a/tests/unit/runtime/test_runtime_utils.py b/tests/unit/runtime/test_runtime_utils.py index 751fae11971b..18a8bb77a5b6 100644 --- a/tests/unit/runtime/test_runtime_utils.py +++ b/tests/unit/runtime/test_runtime_utils.py @@ -7,6 +7,7 @@ import deepspeed.runtime.utils as ds_utils import deepspeed.utils.groups as groups +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -38,10 +39,11 @@ def test(self): groups._create_expert_and_data_parallel(2) norm = ds_utils.clip_grad_norm_(parameters, max_norm=0.1) - norm = torch.Tensor([norm]).to(dist.get_rank()) - + norm = torch.Tensor([norm]).to(get_accelerator().device_name(dist.get_rank())) world_size = dist.get_world_size() - gathered_norm = [torch.zeros(1).cuda() for i in range(world_size)] + gathered_norm = [ + torch.zeros(1).to(get_accelerator().device_name()) for i in range(world_size) + ] dist.all_gather(gathered_norm, norm) diff --git a/tests/unit/runtime/utils/test_partition.py b/tests/unit/runtime/utils/test_partition.py index 04fa5c94374d..58b62825de3f 100644 --- a/tests/unit/runtime/utils/test_partition.py +++ b/tests/unit/runtime/utils/test_partition.py @@ -9,6 +9,7 @@ from deepspeed.runtime.utils import partition_balanced from deepspeed.runtime.utils import prefix_sum_inc from deepspeed.runtime.utils import PartitionedTensor +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -25,7 +26,7 @@ def test(self): rows = world * 4 cols = 3 - full = torch.rand(rows, cols).cuda() + full = torch.rand(rows, cols).to(get_accelerator().device_name()) dist.broadcast(full, src=0, group=group) part = PartitionedTensor(full, group=group) @@ -48,7 +49,7 @@ def test(self): rows = world * 7 cols = 3 - full = torch.rand(rows, cols).cuda() + full = torch.rand(rows, cols).to(get_accelerator().device_name()) dist.broadcast(full, src=0, group=group) part = PartitionedTensor(full, group=group) diff --git a/tests/unit/runtime/zero/test_zero.py b/tests/unit/runtime/zero/test_zero.py index 958998441a9e..5de3ffca27df 100644 --- a/tests/unit/runtime/zero/test_zero.py +++ 
b/tests/unit/runtime/zero/test_zero.py @@ -18,6 +18,7 @@ from deepspeed.runtime.engine import DeepSpeedEngine from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint +from deepspeed.accelerator import get_accelerator def run_unbalanced_gradients(model, data_loader): @@ -698,30 +699,30 @@ def create_tensor(vals, dtype: torch.dtype = None) -> Tensor: grad_multiplier = 1 if zero_grad else (train_iter + 1) if dist.get_rank() == 0: assert torch.allclose( - dloss_wrt_layer3.cuda(), + dloss_wrt_layer3.to(get_accelerator().device_name()), grad_multiplier * create_tensor([2] * 8, torch.float)) assert torch.allclose( - dloss_wrt_layer2.cuda(), + dloss_wrt_layer2.to(get_accelerator().device_name()), grad_multiplier * create_tensor([3 * 1] * 8, torch.float)) assert torch.allclose( - dloss_wrt_layer1.cuda(), + dloss_wrt_layer1.to(get_accelerator().device_name()), grad_multiplier * create_tensor([3 * 2 * 1] * 8, torch.float)) elif dist.get_rank() == 1: # parameters dont split evenly across ranks so rank 1 has a zero-padded # partition assert torch.allclose( - dloss_wrt_layer3.cuda(), + dloss_wrt_layer3.to(get_accelerator().device_name()), grad_multiplier * create_tensor(([8] * 7) + [0], torch.float)) assert torch.allclose( - dloss_wrt_layer2.cuda(), + dloss_wrt_layer2.to(get_accelerator().device_name()), grad_multiplier * create_tensor(([6 * 2] * 7) + [0], torch.float)) assert torch.allclose( - dloss_wrt_layer1.cuda(), + dloss_wrt_layer1.to(get_accelerator().device_name()), grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0], torch.float)) else: @@ -1128,28 +1129,28 @@ def create_tensor(vals): grad_multiplier = 1 if zero_grad else (train_iter + 1) if dist.get_rank() == 0: assert torch.allclose( - dloss_wrt_layer3.cuda(), + dloss_wrt_layer3.to(get_accelerator().device_name()), grad_multiplier * create_tensor([2] * 8).to(expected_grad_dtype)) assert torch.allclose( - dloss_wrt_layer2.cuda(), + dloss_wrt_layer2.to(get_accelerator().device_name()), grad_multiplier * create_tensor([3 * 1] * 8).to(expected_grad_dtype)) assert torch.allclose( - dloss_wrt_layer1.cuda(), + dloss_wrt_layer1.to(get_accelerator().device_name()), grad_multiplier * create_tensor([3 * 2 * 1] * 8).to(expected_grad_dtype)) elif dist.get_rank() == 1: # parameters dont split evenly across ranks so rank 1 has a zero-padded # partition assert torch.allclose( - dloss_wrt_layer3.cuda(), + dloss_wrt_layer3.to(get_accelerator().device_name()), grad_multiplier * create_tensor(([8] * 7) + [0]).to(expected_grad_dtype)) assert torch.allclose( - dloss_wrt_layer2.cuda(), + dloss_wrt_layer2.to(get_accelerator().device_name()), grad_multiplier * create_tensor(([6 * 2] * 7) + [0]).to(expected_grad_dtype)) assert torch.allclose( - dloss_wrt_layer1.cuda(), + dloss_wrt_layer1.to(get_accelerator().device_name()), grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0]).to(expected_grad_dtype)) else: diff --git a/tests/unit/runtime/zero/test_zero_context_ancestry.py b/tests/unit/runtime/zero/test_zero_context_ancestry.py index 6035efcff111..38ae524906d5 100644 --- a/tests/unit/runtime/zero/test_zero_context_ancestry.py +++ b/tests/unit/runtime/zero/test_zero_context_ancestry.py @@ -3,6 +3,7 @@ import torch import deepspeed from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +from deepspeed.accelerator import get_accelerator from utils import setup_serial_env from unit.common import DistributedTest @@ -74,7 +75,7 @@ def 
test_subclass_param_init(self): assert model.param.ds_status == ZeroParamStatus.NOT_AVAILABLE # test that the weights manipulation during each __init__ worked in all w/o needing gathering - ones = torch.ones(5).half().cuda() + ones = torch.ones(5).half().to(get_accelerator().device_name()) with deepspeed.zero.GatheredParameters(list(model.parameters(recurse=False))): assert torch.equal(model.param, ones + 1) assert torch.equal(model.param_pa, ones + 2) diff --git a/tests/unit/simple_model.py b/tests/unit/simple_model.py index dcc4958b26e9..481aae0bfdcd 100644 --- a/tests/unit/simple_model.py +++ b/tests/unit/simple_model.py @@ -7,6 +7,7 @@ from deepspeed.pipe import PipelineModule, LayerSpec from deepspeed.moe.layer import MoE +from deepspeed.accelerator import get_accelerator import deepspeed.comm as dist @@ -274,7 +275,7 @@ def create_deepspeed_args(): args.deepspeed = True if dist.is_initialized(): # We assume up to one full node executing unit tests - assert dist.get_world_size() <= torch.cuda.device_count() + assert dist.get_world_size() <= get_accelerator().device_count() args.local_rank = dist.get_rank() return args diff --git a/tests/unit/utils/test_init_on_device.py b/tests/unit/utils/test_init_on_device.py index 9b4f9970b365..25d102fd05a7 100644 --- a/tests/unit/utils/test_init_on_device.py +++ b/tests/unit/utils/test_init_on_device.py @@ -5,10 +5,11 @@ from unit.simple_model import SimpleModel from deepspeed import OnDevice from packaging import version as pkg_version +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest -@pytest.mark.parametrize('device', ['meta', 'cuda:0']) +@pytest.mark.parametrize('device', ['meta', get_accelerator().device_name(0)]) class TestOnDevice(DistributedTest): world_size = 1
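
Note (editor's addendum, not part of the patch): the hunks above apply one recurring substitution, replacing hard-coded torch.cuda / 'cuda' usage with the deepspeed.accelerator.get_accelerator() interface. The short sketch below collects the idioms used in this diff in one place; only the accelerator calls are taken from the patch itself, while the tensor shapes, variable names, and surrounding flow are illustrative assumptions.

# Minimal sketch of the device-agnostic idioms this PR introduces (illustrative only).
import torch
from deepspeed.accelerator import get_accelerator

accel = get_accelerator()

# Tensor placement: the device string ('cuda:0', 'xpu:0', ...) comes from the accelerator.
x = torch.randn(4, 4).to(accel.device_name())                     # replaces .cuda()
y = torch.zeros(4, device=accel.current_device_name())            # replaces torch.cuda.current_device()

# Host staging buffers are pinned through the same interface.
pinned = accel.pin_memory(torch.empty(1024, dtype=torch.uint8))   # replaces tensor.pin_memory()

# Capability and availability checks replace compute-capability probing.
if accel.is_fp16_supported():                                     # replaces get_device_capability()[0] >= 7
    x = x.half()
if not accel.is_available():                                      # replaces torch.cuda.is_available()
    pass  # CPU-only fallback, e.g. map_location='cpu' in torch.load

# Timing, profiling markers, synchronization, and cache management.
start, stop = accel.Event(), accel.Event()                        # replaces cuda.event()
accel.range_push("profiled region")                               # replaces torch.cuda.nvtx.range_push
accel.range_pop()                                                 # replaces torch.cuda.nvtx.range_pop
accel.synchronize()                                               # replaces torch.cuda.synchronize()
accel.empty_cache()                                               # replaces torch.cuda.empty_cache()

# Collective backend selection replaces the literal "nccl".
backend_name = accel.communication_backend_name()

Because every test obtains its device, events, and communication backend through this single interface, the same files can exercise CUDA and non-CUDA builds, which is why tests/unit/ops/cuda/* is renamed to tests/unit/ops/accelerators/* in this diff.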