diff --git a/benchmarks/communication/all_gather.py b/benchmarks/communication/all_gather.py index 101f129cca45..dc97267b3840 100644 --- a/benchmarks/communication/all_gather.py +++ b/benchmarks/communication/all_gather.py @@ -2,6 +2,7 @@ from benchmarks.communication.utils import * from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator import time @@ -85,16 +86,20 @@ def run_all_gather(local_rank, args): try: mat = torch.ones(world_size, M, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) # Delete original mat to avoid OOM del mat - torch.cuda.empty_cache() + get_accelerator().empty_cache() output = torch.zeros(input.nelement() * world_size, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) except RuntimeError as e: if 'out of memory' in str(e): if dist.get_rank() == 0: @@ -123,15 +128,17 @@ def run_all_gather(local_rank, args): try: mat = torch.ones(elements_per_gpu, dtype=getattr(torch, - args.dtype)).cuda(local_rank) + args.dtype)).to( + get_accelerator().device_name(local_rank)) # multiply each GPU's tensor by the rank to ease debugging input = ((mat.mul_(float(global_rank))).view(-1)) # Delete original mat to avoid OOM del mat - torch.cuda.empty_cache() - output = torch.zeros(elements_per_gpu * world_size, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + get_accelerator().empty_cache() + output = torch.zeros( + elements_per_gpu * world_size, + dtype=getattr(torch, + args.dtype)).to(get_accelerator().device_name(local_rank)) except RuntimeError as e: if 'out of memory' in str(e): if dist.get_rank() == 0: diff --git a/benchmarks/communication/all_reduce.py b/benchmarks/communication/all_reduce.py index 9d46f70c93c2..edc1b99301c0 100644 --- a/benchmarks/communication/all_reduce.py +++ b/benchmarks/communication/all_reduce.py @@ -2,6 +2,7 @@ from benchmarks.communication.utils import * from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator import time @@ -64,8 +65,10 @@ def run_all_reduce(local_rank, args): try: mat = torch.ones(world_size, M, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: @@ -88,7 +91,8 @@ def run_all_reduce(local_rank, args): try: mat = torch.ones(elements_per_gpu, dtype=getattr(torch, - args.dtype)).cuda(local_rank) + args.dtype)).to( + get_accelerator().device_name(local_rank)) input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: if 'out of memory' in str(e): diff --git a/benchmarks/communication/all_to_all.py b/benchmarks/communication/all_to_all.py index f5ce3b37d514..bd35cf290e4c 100644 --- a/benchmarks/communication/all_to_all.py +++ b/benchmarks/communication/all_to_all.py @@ -2,6 +2,7 @@ from benchmarks.communication.utils import * from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator import time @@ -63,8 +64,10 @@ def run_all_to_all(local_rank, args): try: mat = torch.ones(world_size, M, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) assert mat.numel() % world_size == 0, 
f"tensor cannot be divided in {world_size} chunks" sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) @@ -88,15 +91,17 @@ def run_all_to_all(local_rank, args): try: mat = torch.ones(elements_per_gpu, dtype=getattr(torch, - args.dtype)).cuda(local_rank) + args.dtype)).to( + get_accelerator().device_name(local_rank)) assert mat.numel() % world_size == 0, f"tensor with {mat.numel()} elements cannot be divided in {world_size} chunks" input = ((mat.mul_(float(global_rank))).view(-1)) # Delete original mat to avoid OOM del mat - torch.cuda.empty_cache() - output = torch.zeros(elements_per_gpu, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + get_accelerator().empty_cache() + output = torch.zeros( + elements_per_gpu, + dtype=getattr(torch, + args.dtype)).to(get_accelerator().device_name(local_rank)) except RuntimeError as e: if 'out of memory' in str(e): if dist.get_rank() == 0: diff --git a/benchmarks/communication/broadcast.py b/benchmarks/communication/broadcast.py index d0480cb15b5a..633e46638fac 100644 --- a/benchmarks/communication/broadcast.py +++ b/benchmarks/communication/broadcast.py @@ -3,6 +3,7 @@ import torch from benchmarks.communication.utils import * from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator import time @@ -65,8 +66,10 @@ def run_broadcast(local_rank, args): try: mat = torch.ones(world_size, M, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: @@ -89,7 +92,8 @@ def run_broadcast(local_rank, args): try: mat = torch.ones(elements_per_gpu, dtype=getattr(torch, - args.dtype)).cuda(local_rank) + args.dtype)).to( + get_accelerator().device_name(local_rank)) input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: if 'out of memory' in str(e): diff --git a/benchmarks/communication/constants.py b/benchmarks/communication/constants.py index 71416b16f084..935927acd174 100644 --- a/benchmarks/communication/constants.py +++ b/benchmarks/communication/constants.py @@ -1,9 +1,10 @@ '''Copyright The Microsoft DeepSpeed Team''' +from deepspeed.accelerator import get_accelerator DEFAULT_WARMUPS = 5 DEFAULT_TRIALS = 50 DEFAULT_TYPE = 'float' -DEFAULT_BACKEND = 'nccl' +DEFAULT_BACKEND = get_accelerator().communication_backend_name() DEFAULT_UNIT = 'Gbps' DEFAULT_DIST = 'deepspeed' DEFAULT_MAXSIZE = 24 diff --git a/benchmarks/communication/pt2pt.py b/benchmarks/communication/pt2pt.py index 89a2ec045b39..1c890fc42e93 100644 --- a/benchmarks/communication/pt2pt.py +++ b/benchmarks/communication/pt2pt.py @@ -2,6 +2,7 @@ from benchmarks.communication.utils import * from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator import time @@ -83,8 +84,10 @@ def run_pt2pt(local_rank, args): try: mat = torch.ones(world_size, M, - dtype=getattr(torch, - args.dtype)).cuda(local_rank) + dtype=getattr( + torch, + args.dtype)).to( + get_accelerator().device_name(local_rank)) sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: @@ -107,7 +110,8 @@ def run_pt2pt(local_rank, args): try: mat = torch.ones(elements_per_gpu, dtype=getattr(torch, - args.dtype)).cuda(local_rank) + args.dtype)).to( + get_accelerator().device_name(local_rank)) input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: if 'out of memory' in str(e): diff --git 
a/benchmarks/communication/utils.py b/benchmarks/communication/utils.py index b2f7367bb703..b913dda14fe5 100644 --- a/benchmarks/communication/utils.py +++ b/benchmarks/communication/utils.py @@ -5,6 +5,7 @@ import math import argparse from benchmarks.communication.constants import * +from deepspeed.accelerator import get_accelerator global dist @@ -14,7 +15,7 @@ def init_torch_distributed(backend): import torch.distributed as dist torch.distributed.init_process_group(backend) local_rank = int(os.environ['LOCAL_RANK']) - torch.cuda.set_device(local_rank) + get_accelerator().set_device(local_rank) def init_deepspeed_comm(backend): @@ -23,7 +24,7 @@ def init_deepspeed_comm(backend): import deepspeed.comm as dist deepspeed.init_distributed(dist_backend=backend) local_rank = int(os.environ['LOCAL_RANK']) - torch.cuda.set_device(local_rank) + get_accelerator().set_device(local_rank) def init_processes(local_rank, args): @@ -101,14 +102,13 @@ def get_metric_strings(args, tput, busbw, duration): def sync_all(): - torch.cuda.synchronize() + get_accelerator().synchronize() dist.barrier() def max_numel(comm_op, dtype, mem_factor, local_rank, args): dtype_size = _element_size(dtype) - max_memory_per_gpu = torch.cuda.get_device_properties( - local_rank).total_memory * mem_factor + max_memory_per_gpu = get_accelerator().total_memory(local_rank) * mem_factor if comm_op == 'all_reduce' or comm_op == 'pt2pt' or comm_op == 'broadcast': elements_per_gpu = int(max_memory_per_gpu // dtype_size) elif comm_op == 'all_gather': @@ -185,7 +185,8 @@ def benchmark_parser(): parser.add_argument("--backend", type=str, default=DEFAULT_BACKEND, - choices=['nccl'], + choices=['nccl', + 'ccl'], help='Communication library to use') parser.add_argument("--dist", type=str, diff --git a/benchmarks/inference/bert-bench.py b/benchmarks/inference/bert-bench.py index 5b88ba235e3f..9d586d033cd7 100644 --- a/benchmarks/inference/bert-bench.py +++ b/benchmarks/inference/bert-bench.py @@ -5,6 +5,7 @@ import deepspeed import argparse from transformers import pipeline +from deepspeed.accelerator import get_accelerator parser = argparse.ArgumentParser() parser.add_argument("--model", "-m", type=str, help="hf model name") @@ -46,7 +47,7 @@ def print_latency(latency_set, title, warmup=3): print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000)) -deepspeed.init_distributed("nccl") +deepspeed.init_distributed() print(args.model, args.max_tokens, args.dtype) @@ -75,10 +76,10 @@ def print_latency(latency_set, title, warmup=3): times = [] mtimes = [] for i in range(args.trials): - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.time() r = pipe(f"Hello I'm a {mask} model") - torch.cuda.synchronize() + get_accelerator().synchronize() end = time.time() responses.append(r) times.append((end - start)) diff --git a/benchmarks/inference/gpt-bench.py b/benchmarks/inference/gpt-bench.py index b41ca921064e..29578b30cf1f 100644 --- a/benchmarks/inference/gpt-bench.py +++ b/benchmarks/inference/gpt-bench.py @@ -6,6 +6,7 @@ import deepspeed import argparse from transformers import pipeline +from deepspeed.accelerator import get_accelerator parser = argparse.ArgumentParser() parser.add_argument("--model", "-m", type=str, help="hf model name") @@ -63,7 +64,7 @@ def print_latency(latency_set, title, warmup=3): print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000)) -deepspeed.init_distributed("nccl") +deepspeed.init_distributed() if args.local_rank == 0: print("BENCHMARK SETTINGS:") @@ -102,10 +103,10 @@ def print_latency(latency_set, 
title, warmup=3): times = [] mtimes = [] for i in range(args.trials): - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.time() r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=args.max_tokens) - torch.cuda.synchronize() + get_accelerator().synchronize() end = time.time() responses.append(r) times.append(end - start) # / (args.max_tokens - 3)) diff --git a/deepspeed/module_inject/containers/base.py b/deepspeed/module_inject/containers/base.py index ca5f0e23ffe3..45faeb477951 100644 --- a/deepspeed/module_inject/containers/base.py +++ b/deepspeed/module_inject/containers/base.py @@ -5,6 +5,7 @@ import torch from deepspeed.ops.transformer.inference.config import DeepSpeedInferenceConfig +from deepspeed.accelerator import get_accelerator class BaseConvolutionContainer(ABC): @@ -216,12 +217,14 @@ def copy_data_to_new_module(self): self.module.mlp.attn_nb = self.attn_nb else: self.module.mlp.attn_nw.data.copy_( - self.attn_nw.to(torch.cuda.current_device())) + self.attn_nw.to(get_accelerator().current_device_name())) self.module.mlp.attn_nb.data.copy_( - self.attn_nb.to(torch.cuda.current_device())) + self.attn_nb.to(get_accelerator().current_device_name())) - self.module.norm_w.data.copy_(self.input_nw.to(torch.cuda.current_device())) - self.module.norm_b.data.copy_(self.input_nb.to(torch.cuda.current_device())) + self.module.norm_w.data.copy_( + self.input_nw.to(get_accelerator().current_device_name())) + self.module.norm_b.data.copy_( + self.input_nb.to(get_accelerator().current_device_name())) def transpose(self): self.transpose_attention() @@ -241,5 +244,5 @@ def transpose_impl(self, data): data = data.contiguous() data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1)) data = data.reshape(data.shape[-1], data.shape[-2]) - data.to(torch.cuda.current_device()) + data.to(get_accelerator().current_device_name()) return data diff --git a/deepspeed/module_inject/containers/base_moe.py b/deepspeed/module_inject/containers/base_moe.py index f120a5ef6006..4139b08d9091 100644 --- a/deepspeed/module_inject/containers/base_moe.py +++ b/deepspeed/module_inject/containers/base_moe.py @@ -2,9 +2,9 @@ # Create a container object to save model-specific tensors using the policy file above. 
from .base import * -import torch from deepspeed import comm as dist import deepspeed.ops.transformer as transformer_inference +from deepspeed.accelerator import get_accelerator class BaseTransformerMoEContainer(BaseTransformerContainer): @@ -104,33 +104,38 @@ def mlp_mp(self): # mlp inter self.module.mlp[ep_index].inter_w.data = self._h4h_w[ gpu_index * self.local_ep_size + ep_index].to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) self.module.mlp[ep_index].inter_b.data = self._h4h_b[ gpu_index * self.local_ep_size + ep_index].to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) # mlp output self.module.mlp[ep_index].output_w.data = self._4hh_w[ gpu_index * self.local_ep_size + ep_index].to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) self.module.mlp[ep_index].output_b.data = self._4hh_b[ gpu_index * self.local_ep_size + ep_index].to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) def copy_data_to_new_module(self): - self.module.attn_nw.data = self.attn_nw.to(torch.cuda.current_device()) - self.module.attn_nb.data = self.attn_nb.to(torch.cuda.current_device()) + self.module.attn_nw.data = self.attn_nw.to( + get_accelerator().current_device_name()) + self.module.attn_nb.data = self.attn_nb.to( + get_accelerator().current_device_name()) - self.module.norm_w.data.copy_(self.input_nw.to(torch.cuda.current_device())) - self.module.norm_b.data.copy_(self.input_nb.to(torch.cuda.current_device())) + self.module.norm_w.data.copy_( + self.input_nw.to(get_accelerator().current_device_name())) + self.module.norm_b.data.copy_( + self.input_nb.to(get_accelerator().current_device_name())) if self.config.moe.type == 'residual': self.module.res_mlp.inter_w.data = self._res_h4h_w.to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) self.module.res_mlp.inter_b.data = self._res_h4h_b.to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) self.module.res_mlp.output_w.data = self._res_4hh_w.to( - torch.cuda.current_device()) + get_accelerator().current_device_name()) self.module.res_mlp.output_b.data = self._res_4hh_b.to( - torch.cuda.current_device()) - self.module.res_coef.data = self._res_coef.to(torch.cuda.current_device()) + get_accelerator().current_device_name()) + self.module.res_coef.data = self._res_coef.to( + get_accelerator().current_device_name()) diff --git a/deepspeed/module_inject/policy.py b/deepspeed/module_inject/policy.py index bbf8f59e9222..dfd3343e12a3 100644 --- a/deepspeed/module_inject/policy.py +++ b/deepspeed/module_inject/policy.py @@ -4,6 +4,7 @@ from abc import ABC, abstractmethod from deepspeed.utils.types import ActivationFuncType import torch +from deepspeed.accelerator import get_accelerator transformer_param_names = ( 'attn_qkvw', \ @@ -196,9 +197,9 @@ def maybe_copy_qkv(module, dst = mp_replace.copy(dst, qkv_data) else: if split_qkv: - dst = mp_replace.qkv_copy(dst, weight_quantizer.quantize(qkv_data.cuda() if weight_quantizer.q_int8 else \ + dst = mp_replace.qkv_copy(dst, weight_quantizer.quantize(qkv_data.to(get_accelerator().device_name()) if weight_quantizer.q_int8 else \ ((transpose(qkv_data)).contiguous())), int8=weight_quantizer.q_int8) else: - dst = mp_replace.copy(dst, weight_quantizer.quantize(qkv_data.cuda() if weight_quantizer.q_int8 else \ + dst = mp_replace.copy(dst, weight_quantizer.quantize(qkv_data.to(get_accelerator().device_name()) if weight_quantizer.q_int8 else \ transpose(qkv_data)), 
int8=weight_quantizer.q_int8) setattr(module, dst_name, dst) diff --git a/docs/_tutorials/cifar-10.md b/docs/_tutorials/cifar-10.md index 11a05a78a749..74ee04502f18 100644 --- a/docs/_tutorials/cifar-10.md +++ b/docs/_tutorials/cifar-10.md @@ -140,7 +140,8 @@ Here we initialize DeepSpeed with CIFAR-10 model (`net`), `args`, `parameters` a After initializing DeepSpeed, the original `device` and `optimizer` are removed: ```python - #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + #from deepspeed.accelerator import get_accelerator + #device = torch.device(get_accelerator().device_name(0) if get_accelerator().is_available() else "cpu") #net.to(device) #optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) diff --git a/tests/accelerator/ds_config.json b/tests/accelerator/ds_config.json new file mode 100644 index 000000000000..8e9ac6b889ea --- /dev/null +++ b/tests/accelerator/ds_config.json @@ -0,0 +1,19 @@ +{ + "train_batch_size": 1, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + "weight_decay": 1e-2 + } + }, + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + } +} diff --git a/tests/accelerator/test_ds_init.py b/tests/accelerator/test_ds_init.py new file mode 100644 index 000000000000..6c4e90e2aa63 --- /dev/null +++ b/tests/accelerator/test_ds_init.py @@ -0,0 +1,43 @@ +'''Copyright The Microsoft DeepSpeed Team''' +import os +import torch +import deepspeed +from deepspeed.accelerator import get_accelerator + + +class OneLayerNet(torch.nn.Module): + def __init__(self, D_in, D_out): + """ + In the constructor we instantiate a single nn.Linear module and assign it as + a member variable. + """ + super(OneLayerNet, self).__init__() + self.linear1 = torch.nn.Linear(D_in, D_out) + + def forward(self, x): + """ + In the forward function we accept a Variable of input data and we must return + a Variable of output data. We can use Modules defined in the constructor as + well as arbitrary operators on Variables. 
+ """ + h_relu = self.linear1(x).clamp(min=0) + y_pred = self.linear1(h_relu) + return y_pred + + +def test_literal_device(): + model = OneLayerNet(128, 128) + + os.environ['RANK'] = '0' + os.environ['WORLD_SIZE'] = '1' + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = '8088' + os.environ['LOCAL_RANK'] = '0' + deepspeed.init_distributed(get_accelerator().communication_backend_name()) + deepspeed.initialize(model=model, config='ds_config.json') + string = get_accelerator().device_name() #'xpu' or 'cuda' + string0 = get_accelerator().device_name(0) #'xpu:0' or 'cuda:0' + string1 = get_accelerator().device_name(1) #'xpu:1' or 'cuda:1' + assert string == 'xpu' or string == 'cuda' + assert string0 == 'xpu:0' or string0 == 'cuda:0' + assert string1 == 'xpu:1' or string1 == 'cuda:1' diff --git a/tests/benchmarks/flatten_bench.py b/tests/benchmarks/flatten_bench.py index fdf53df0b131..1082554f81d1 100755 --- a/tests/benchmarks/flatten_bench.py +++ b/tests/benchmarks/flatten_bench.py @@ -14,6 +14,7 @@ import torch from torch._utils import _flatten_dense_tensors +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import UtilsBuilder from apex_C import flatten as flatten_apex @@ -26,11 +27,11 @@ # emulate a small typical model weights x = [ torch.rand((512, - 512)).cuda(), + 512)).to(get_accelerator().device_name()), torch.rand((512, - 1024)).cuda(), + 1024)).to(get_accelerator().device_name()), torch.rand((512, - 30000)).cuda() + 30000)).to(get_accelerator().device_name()) ] t = x * 30 @@ -71,15 +72,15 @@ def cprofileme(): print("py") cProfile.run("py()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("cpp") cProfile.run("cpp()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("apex") cProfile.run("apex()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() #### timeit #### @@ -91,13 +92,13 @@ def timeme(): print("--------------- timeit -----------------") print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() #### line_profiler #### @@ -111,15 +112,15 @@ def line_profileme(): print("py") profile(py)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("cpp") profile(cpp)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("apex") profile(apex)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() if __name__ == "__main__": diff --git a/tests/benchmarks/unflatten_bench.py b/tests/benchmarks/unflatten_bench.py index 0c60528891ea..a4a1b63b3dd0 100755 --- a/tests/benchmarks/unflatten_bench.py +++ b/tests/benchmarks/unflatten_bench.py @@ -13,6 +13,7 @@ import gc import torch from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import UtilsBuilder from apex_C import flatten as flatten_apex @@ -26,11 +27,11 @@ # emulate a small typical model weights x = [ torch.rand((512, - 512)).cuda(), + 512)).to(get_accelerator().device_name()), 
torch.rand((512, - 1024)).cuda(), + 1024)).to(get_accelerator().device_name()), torch.rand((512, - 30000)).cuda() + 30000)).to(get_accelerator().device_name()) ] unflat_t = x * 30 @@ -80,15 +81,15 @@ def cprofileme(): print("py") cProfile.run("py()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("cpp") cProfile.run("cpp()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("apex") cProfile.run("apex()", sort=-1) gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() #### timeit #### @@ -100,13 +101,13 @@ def timeme(): print("--------------- timeit -----------------") print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}') gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() #### line_profiler #### @@ -120,15 +121,15 @@ def line_profileme(): print("py") profile(py)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("cpp") profile(cpp)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() print("apex") profile(apex)() # noqa: F821 gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() if __name__ == "__main__": diff --git a/tests/onebit/test_mpi_backend.py b/tests/onebit/test_mpi_backend.py index 3b9f67cce9ca..bb8915f2c001 100644 --- a/tests/onebit/test_mpi_backend.py +++ b/tests/onebit/test_mpi_backend.py @@ -7,17 +7,19 @@ import deepspeed from deepspeed.runtime.comm.mpi import MpiBackend +from deepspeed.accelerator import get_accelerator comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() -deepspeed.init_distributed(dist_backend='nccl') +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) # Change cuda_aware to True to test out CUDA-Aware MPI communication backend = MpiBackend(cuda_aware=False) -device = torch.device('cuda', rank % torch.cuda.device_count()) +local_rank = rank % get_accelerator().device_count() +device = torch.device(get_accelerator().device_name(), local_rank) # A simulated compression function using deepspeed.comm @@ -37,7 +39,7 @@ def torch_sim(a): [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] - torch.cuda.synchronize() + get_accelerator().synchronize() dist.barrier() return a_server_compressed, worker_error, server_error @@ -58,8 +60,7 @@ def torch_sim(a): server_error = torch.zeros(right_server_size, device=device) a_torch, worker_error_torch, server_error_torch = torch_sim(a) -torch.cuda.empty_cache() -local_rank = rank % torch.cuda.device_count() +get_accelerator().empty_cache() a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) diff --git a/tests/onebit/test_mpi_perf.py b/tests/onebit/test_mpi_perf.py index 3345c20e5008..dd67fdb615e8 100644 --- a/tests/onebit/test_mpi_perf.py +++ b/tests/onebit/test_mpi_perf.py @@ -8,6 +8,7 @@ # Configure wall clock timer from deepspeed.utils.timer import SynchronizedWallClockTimer +from deepspeed.accelerator import get_accelerator from statistics import mean @@ -17,11 +18,12 @@ size = comm.Get_size() rank 
= comm.Get_rank() -deepspeed.init_distributed(dist_backend='nccl') +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) # Change cuda_aware to True to test out CUDA-Aware MPI communication backend = MpiBackend(cuda_aware=False) -device = torch.device('cuda', rank % torch.cuda.device_count()) +local_rank = rank % get_accelerator().device_count() +device = torch.device(get_accelerator().device_name(), local_rank) tensor_size = 300 * 2**20 server_size = int(tensor_size / size) @@ -41,8 +43,6 @@ warmup = 10 iters = 10 -local_rank = rank % torch.cuda.device_count() - # Warmup for i in range(warmup): backend.compressed_allreduce(a, worker_error, server_error, local_rank) diff --git a/tests/onebit/test_nccl_backend.py b/tests/onebit/test_nccl_backend.py index d569c7272f7b..e544865b7685 100644 --- a/tests/onebit/test_nccl_backend.py +++ b/tests/onebit/test_nccl_backend.py @@ -8,16 +8,17 @@ import os from deepspeed.runtime.comm.nccl import NcclBackend +from deepspeed.accelerator import get_accelerator parser = argparse.ArgumentParser() parser.add_argument('--local_rank', type=int, default=-1) args = parser.parse_args() -deepspeed.init_distributed(dist_backend='nccl') +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) args.local_rank = int(os.environ['LOCAL_RANK']) -torch.cuda.set_device(args.local_rank) -device = torch.device("cuda", args.local_rank) +get_accelerator().set_device(args.local_rank) +device = torch.device(get_accelerator().device_name(), args.local_rank) size = dist.get_world_size() rank = dist.get_rank() @@ -43,7 +44,7 @@ def torch_sim(a): [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] - torch.cuda.synchronize() + get_accelerator().synchronize() dist.barrier() return a_server_compressed, worker_error, server_error @@ -64,7 +65,7 @@ def torch_sim(a): server_error = torch.zeros(right_server_size, device=device) a_torch, worker_error_torch, server_error_torch = torch_sim(a) -torch.cuda.empty_cache() +get_accelerator().empty_cache() a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) diff --git a/tests/onebit/test_nccl_perf.py b/tests/onebit/test_nccl_perf.py index dcdb13acd4c2..aab93efac851 100644 --- a/tests/onebit/test_nccl_perf.py +++ b/tests/onebit/test_nccl_perf.py @@ -9,6 +9,7 @@ from deepspeed.runtime.comm.nccl import NcclBackend from deepspeed.utils.timer import SynchronizedWallClockTimer +from deepspeed.accelerator import get_accelerator from statistics import mean timers = SynchronizedWallClockTimer() @@ -17,11 +18,11 @@ parser.add_argument('--local_rank', type=int, default=-1) args = parser.parse_args() -deepspeed.init_distributed(dist_backend='nccl') +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) args.local_rank = int(os.environ['LOCAL_RANK']) -torch.cuda.set_device(args.local_rank) -device = torch.device("cuda", args.local_rank) +get_accelerator().set_device(args.local_rank) +device = torch.device(get_accelerator().device_name(), args.local_rank) size = dist.get_world_size() rank = dist.get_rank() diff --git a/tests/perf/adam_test1.py b/tests/perf/adam_test1.py index 7ee6c71b509d..13d486d4d855 100755 --- a/tests/perf/adam_test1.py +++ b/tests/perf/adam_test1.py @@ -3,13 +3,15 @@ import torch from deepspeed.ops.adam import DeepSpeedCPUAdam import time +from deepspeed.accelerator import get_accelerator device = 
'cpu' model_size = 1 * 1024**3 param = torch.nn.Parameter(torch.ones(model_size, device=device)) -param_fp16 = torch.nn.Parameter(torch.ones(model_size, - dtype=torch.half, - device='cuda:0')) +param_fp16 = torch.nn.Parameter( + torch.ones(model_size, + dtype=torch.half, + device=get_accelerator().device_name(0))) optimizer = DeepSpeedCPUAdam([param]) #torch.set_num_threads(128) diff --git a/tests/small_model_debugging/test.py b/tests/small_model_debugging/test.py index 799fa9872d74..a97792df56ac 100644 --- a/tests/small_model_debugging/test.py +++ b/tests/small_model_debugging/test.py @@ -3,6 +3,7 @@ import torch from deepspeed.pt.deepspeed_linear import LinearModuleForZeroStage3 from deepspeed.pt.log_utils import logger +from deepspeed.accelerator import get_accelerator def see_memory_usage(message): @@ -11,37 +12,42 @@ def see_memory_usage(message): logger.info(message) logger.info( "Memory Allocated %s GigaBytes ", - torch.cuda.memory_allocated() / (1024 * 1024 * 1024), + get_accelerator().memory_allocated() / (1024 * 1024 * 1024), ) logger.info( "Max Memory Allocated %s GigaBytes", - torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), + get_accelerator().max_memory_allocated() / (1024 * 1024 * 1024), ) logger.info( "Cache Allocated %s GigaBytes", - torch.cuda.memory_cached() / (1024 * 1024 * 1024), + get_accelerator().memory_cached() / (1024 * 1024 * 1024), ) logger.info( "Max cache Allocated %s GigaBytes", - torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), + get_accelerator().max_memory_cached() / (1024 * 1024 * 1024), ) -tens = torch.rand(1024, 16384, dtype=torch.half, device=torch.device('cuda')) +tens = torch.rand(1024, + 16384, + dtype=torch.half, + device=torch.device(get_accelerator().device_name())) tens_back = tens.detach().clone() #linear_bk = torch.nn.functional.linear #torch.nn.functional.linear = deepspeed.pt.deepspeed_linear.LinearFunctionForZeroStage3.apply model = LinearModuleForZeroStage3(16384, 16384) -model.cuda().half() +model.to(get_accelerator().device_name()).half() see_memory_usage("Before forward") y = model(tens) see_memory_usage("After forward") -model.weight.data = torch.zeros(1, dtype=torch.half, device=torch.device('cuda')) +model.weight.data = torch.zeros(1, + dtype=torch.half, + device=torch.device(get_accelerator().device_name())) see_memory_usage("After weight zero") diff --git a/tests/unit/alexnet_model.py b/tests/unit/alexnet_model.py index f7038f6a2072..bdbaf02922e2 100644 --- a/tests/unit/alexnet_model.py +++ b/tests/unit/alexnet_model.py @@ -7,6 +7,7 @@ import deepspeed import deepspeed.comm as dist import deepspeed.runtime.utils as ds_utils +from deepspeed.accelerator import get_accelerator from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec @@ -110,7 +111,7 @@ def cifar_trainset(fp16=False): transform = transforms.Compose(transform_list) - local_rank = torch.cuda.current_device() + local_rank = get_accelerator().current_device() # Only one rank per machine downloads. 
dist.barrier() @@ -131,7 +132,8 @@ def train_cifar(model, average_dp_losses=True, fp16=True, seed=123): - with torch.random.fork_rng(devices=[torch.cuda.current_device()]): + with get_accelerator().random().fork_rng( + devices=[get_accelerator().current_device_name()]): ds_utils.set_random_seed(seed) # disable dropout @@ -154,7 +156,7 @@ def train_cifar(model, print(f'STEP={step} LOSS={loss.item()}') if average_dp_losses: - loss_tensor = torch.tensor(losses).cuda() + loss_tensor = torch.tensor(losses).to(get_accelerator().device_name()) dist.all_reduce(loss_tensor) loss_tensor /= dist.get_world_size() losses = loss_tensor.tolist() diff --git a/tests/unit/comm/test_dist.py b/tests/unit/comm/test_dist.py index 7ade69a2ef5a..6005c926f793 100644 --- a/tests/unit/comm/test_dist.py +++ b/tests/unit/comm/test_dist.py @@ -7,6 +7,7 @@ from unit.common import DistributedTest, DistributedFixture, get_master_port from unit.simple_model import SimpleModel +from deepspeed.accelerator import get_accelerator import pytest @@ -105,9 +106,9 @@ class TestDistAllReduce(DistributedTest): world_size = [1, 2, 4] def test(self): - x = torch.ones(1, 3).cuda() * (dist.get_rank() + 1) + x = torch.ones(1, 3).to(get_accelerator().device_name()) * (dist.get_rank() + 1) sum_of_ranks = (dist.get_world_size() * (dist.get_world_size() + 1)) // 2 - result = torch.ones(1, 3).cuda() * sum_of_ranks + result = torch.ones(1, 3).to(get_accelerator().device_name()) * sum_of_ranks dist.all_reduce(x) assert torch.all(x == result) @@ -117,16 +118,21 @@ class TestDistInit(DistributedTest): init_distributed = False def test_already_init(self, dist_init_required): - torch.distributed.init_process_group('nccl') - deepspeed.init_distributed('nccl', dist_init_required=dist_init_required) + torch.distributed.init_process_group( + get_accelerator().communication_backend_name()) + deepspeed.init_distributed(get_accelerator().communication_backend_name(), + dist_init_required=dist_init_required) def test_no_init(self, dist_init_required): if dist_init_required or dist_init_required is None: - deepspeed.init_distributed('nccl', dist_init_required=dist_init_required) + deepspeed.init_distributed(get_accelerator().communication_backend_name(), + dist_init_required=dist_init_required) else: # torch.dist is not done and for some reason the user says they don't want it done with pytest.raises(Exception): - deepspeed.init_distributed('nccl', dist_init_required=dist_init_required) + deepspeed.init_distributed( + get_accelerator().communication_backend_name(), + dist_init_required=dist_init_required) class TestDistInitNoEnv(DistributedTest): @@ -136,12 +142,13 @@ class TestDistInitNoEnv(DistributedTest): def test(self): torch.distributed.init_process_group( - backend='nccl', + backend=get_accelerator().communication_backend_name(), init_method=f"tcp://127.0.0.1:{get_master_port()}", world_size=1, rank=0) assert torch.distributed.is_initialized() - deepspeed.init_distributed('nccl', auto_mpi_discovery=True) + deepspeed.init_distributed(get_accelerator().communication_backend_name(), + auto_mpi_discovery=True) @pytest.mark.parametrize("dist_init_required", [True, False]) @@ -149,7 +156,8 @@ class TestDistInitWithModel(DistributedTest): init_distributed = False def test_already_init(self, dist_init_required): - torch.distributed.init_process_group('nccl') + torch.distributed.init_process_group( + get_accelerator().communication_backend_name()) model = SimpleModel(4) config_dict = { "train_micro_batch_size_per_gpu": 1, diff --git a/tests/unit/common.py 
b/tests/unit/common.py index 15c98911f519..35e8f3983072 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -9,6 +9,7 @@ import torch import torch.multiprocessing as mp import deepspeed +from deepspeed.accelerator import get_accelerator import deepspeed.comm as dist from torch.multiprocessing import Process @@ -39,23 +40,36 @@ def get_master_port(): return master_port -def set_cuda_visibile(): +def set_accelerator_visible(): cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", None) xdist_worker_id = get_xdist_worker_id() if xdist_worker_id is None: xdist_worker_id = 0 if cuda_visible is None: - # CUDA_VISIBLE_DEVICES is not set, discover it from nvidia-smi instead + # CUDA_VISIBLE_DEVICES is not set, discover it using accelerator specific command instead import subprocess - is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None - if is_rocm_pytorch: - rocm_smi = subprocess.check_output(['rocm-smi', '--showid']) - gpu_ids = filter(lambda s: 'GPU' in s, - rocm_smi.decode('utf-8').strip().split('\n')) - num_gpus = len(list(gpu_ids)) + if get_accelerator().device_name() == 'cuda': + is_rocm_pytorch = hasattr(torch.version, + 'hip') and torch.version.hip is not None + if is_rocm_pytorch: + rocm_smi = subprocess.check_output(['rocm-smi', '--showid']) + gpu_ids = filter(lambda s: 'GPU' in s, + rocm_smi.decode('utf-8').strip().split('\n')) + num_gpus = len(list(gpu_ids)) + else: + nvidia_smi = subprocess.check_output(['nvidia-smi', '--list-gpus']) + num_gpus = len(nvidia_smi.decode('utf-8').strip().split('\n')) else: - nvidia_smi = subprocess.check_output(['nvidia-smi', '--list-gpus']) - num_gpus = len(nvidia_smi.decode('utf-8').strip().split('\n')) + assert get_accelerator().device_name() == 'xpu' + import re + clinfo = subprocess.check_output(['clinfo']) + lines = clinfo.decode('utf-8').strip().split('\n') + num_gpus = 0 + for line in lines: + match = re.search('Device Type.*GPU', line) + if match: + num_gpus += 1 + cuda_visible = ",".join(map(str, range(num_gpus))) # rotate list based on xdist worker id, example below @@ -74,7 +88,7 @@ class DistributedExec(ABC): methods needed for DistributedTest and DistributedFixture. 
""" world_size = 2 - backend = "nccl" + backend = get_accelerator().communication_backend_name() init_distributed = True set_dist_env = True requires_cuda_env = True @@ -86,8 +100,8 @@ def run(self): def __call__(self, request=None): self._fixture_kwargs = self._get_fixture_kwargs(request, self.run) world_size = self.world_size - if self.requires_cuda_env and not torch.cuda.is_available(): - pytest.skip("only supported in CUDA environments.") + if self.requires_cuda_env and not get_accelerator().is_available(): + pytest.skip("only supported in accelerator environments.") if isinstance(world_size, int): world_size = [world_size] @@ -174,15 +188,15 @@ def _dist_init(self, local_rank, num_procs, skip_msg): # turn off NCCL logging if set os.environ.pop('NCCL_DEBUG', None) - if torch.cuda.is_available(): - set_cuda_visibile() + if get_accelerator().is_available(): + set_accelerator_visible() if self.init_distributed: deepspeed.init_distributed(dist_backend=self.backend) dist.barrier() - if torch.cuda.is_available(): - torch.cuda.set_device(local_rank) + if get_accelerator().is_available(): + get_accelerator().set_device(local_rank) try: self.run(**self._fixture_kwargs) @@ -323,8 +337,8 @@ def __call__(self, request): self._current_test = self._get_current_test_func(request) self._fixture_kwargs = self._get_fixture_kwargs(request, self._current_test) - if self.requires_cuda_env and not torch.cuda.is_available(): - pytest.skip("only supported in CUDA environments.") + if self.requires_cuda_env and not get_accelerator().is_available(): + pytest.skip("only supported in accelerator environments.") # Catch world_size override pytest mark for mark in getattr(request.function, "pytestmark", []): diff --git a/tests/unit/compression/test_compression.py b/tests/unit/compression/test_compression.py index 8b3617424225..829161ea072c 100644 --- a/tests/unit/compression/test_compression.py +++ b/tests/unit/compression/test_compression.py @@ -10,6 +10,7 @@ from unit.modelingpreln import BertEncoder as BertEncoderPreln from deepspeed.compression.basic_layer import LinearLayer_Compress, ColumnParallelLinear_Compress, RowParallelLinear_Compress from deepspeed.compression.helper import convert_conv1d_to_linear +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest TORCH_MAJOR = int(torch.__version__.split('.')[0]) @@ -23,7 +24,7 @@ def reset_random(seed=1234): random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) + get_accelerator().manual_seed_all(seed) def create_bert_model(): diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index ec791028deb4..371ecda710b2 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -16,6 +16,7 @@ from huggingface_hub import HfApi from deepspeed.model_implementations import DeepSpeedTransformerInference from torch import nn +from deepspeed.accelerator import get_accelerator rocm_version = OpBuilder.installed_rocm_version() if rocm_version != (0, 0): @@ -288,17 +289,17 @@ def test( pipe.model.half() # Switch device to GPU after converting to half - device = torch.device(f"cuda:{local_rank}") + device = torch.device(get_accelerator().device_name(local_rank)) pipe.device = device pipe.model.to(device) # Warm-up queries for perf measurement #for i in range(10): # _ = pipe(query, **inf_kwargs) - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.time() bs_output = pipe(query, **inf_kwargs) - 
torch.cuda.synchronize() + get_accelerator().synchronize() bs_time = time.time() - start pipe.model = deepspeed.init_inference( @@ -312,10 +313,10 @@ def test( # Warm-up queries for perf measurement #for i in range(10): # _ = pipe(query, **inf_kwargs) - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.time() ds_output = pipe(query, **inf_kwargs) - torch.cuda.synchronize() + get_accelerator().synchronize() ds_time = time.time() - start # facebook/opt* and some bigscient/bloom* models are not matching @@ -372,7 +373,7 @@ def test( replace_with_kernel_inject=True) check_injection(pipe.model) # Switch device to GPU so that input tensors are not on CPU - pipe.device = torch.device(f"cuda:{local_rank}") + pipe.device = torch.device(get_accelerator().device_name(local_rank)) ds_output = pipe(query, **inf_kwargs) print(local_rank, "baseline", bs_output) @@ -433,7 +434,7 @@ def test( dtype=dtype, injection_policy=injection_policy) # Switch device to GPU so that input tensors are not on CPU - pipe.device = torch.device(f"cuda:{local_rank}") + pipe.device = torch.device(get_accelerator().device_name(local_rank)) ds_output = pipe(query, **inf_kwargs) print(local_rank, "baseline", bs_output) @@ -483,7 +484,7 @@ def test( mp_size=world_size, dtype=dtype) # Switch device to GPU so that input tensors are not on CPU - pipe.device = torch.device(f"cuda:{local_rank}") + pipe.device = torch.device(get_accelerator().device_name(local_rank)) ds_output = pipe(query, **inf_kwargs) print(local_rank, "baseline", bs_output) @@ -515,7 +516,7 @@ def test(self, model_family, model_name, task): import lm_eval.evaluator local_rank = os.getenv("LOCAL_RANK", "0") - device = torch.device(f"cuda:{local_rank}") + device = torch.device(get_accelerator().device_name(local_rank)) dtype = torch.float task_dict = lm_eval.tasks.get_task_dict([task]) @@ -529,12 +530,12 @@ def test(self, model_family, model_name, task): else: lm = lm_eval.models.get_model(model_family).create_from_arg_string( f"pretrained={model_name}", - {"device": "cuda"}) + {"device": get_accelerator().device_name()}) - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.time() bs_output = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict) - torch.cuda.synchronize() + get_accelerator().synchronize() bs_time = time.time() - start ds_model = deepspeed.init_inference( @@ -547,10 +548,10 @@ def test(self, model_family, model_name, task): ) check_injection(ds_model) setattr(lm, model_family, ds_model) - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.time() ds_output = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict) - torch.cuda.synchronize() + get_accelerator().synchronize() ds_time = time.time() - start ppl_diff = abs(bs_output["results"][task]["ppl"] - diff --git a/tests/unit/inference/test_model_profiling.py b/tests/unit/inference/test_model_profiling.py index 0259ce09a1d4..07ce839306a6 100644 --- a/tests/unit/inference/test_model_profiling.py +++ b/tests/unit/inference/test_model_profiling.py @@ -7,6 +7,7 @@ import deepspeed from transformers import pipeline from unit.common import DistributedTest +from deepspeed.accelerator import get_accelerator @pytest.fixture @@ -74,12 +75,12 @@ def test(self, e2e_times = [] model_times = [] for _ in range(10): - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.perf_counter_ns() r = pipe(query, **inf_kwargs) - torch.cuda.synchronize() + get_accelerator().synchronize() end = time.perf_counter_ns() e2e_times.append((end - start) 
/ 1e6) # convert ns to ms diff --git a/tests/unit/megatron_model.py b/tests/unit/megatron_model.py index 00212a853570..32faf2244940 100644 --- a/tests/unit/megatron_model.py +++ b/tests/unit/megatron_model.py @@ -7,6 +7,7 @@ from .common import get_test_path from deepspeed.pipe import PipelineModule, LayerSpec +from deepspeed.accelerator import get_accelerator def get_megatron_version(): @@ -39,10 +40,10 @@ def get_gpt2_model(args_others, mp_size=1): initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True) model = GPT2Model(num_tokentypes=0, parallel_output=False) - model.cuda() + model.to(get_accelerator().device_name()) from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron import mpu - i = torch.cuda.current_device() + i = get_accelerator().current_device_name() model = torchDDP(model, device_ids=[i], output_device=i, @@ -78,8 +79,9 @@ def __init__(self, num_layers, mp_size, args_others, topo, **kwargs): class ParallelTransformerLayerPipe(ParallelTransformerLayer): def forward(self, args): # hardcode attn mask for testing, PP requires the attn_mask to be stashed - attention_mask = torch.tensor([[True]], - device=torch.cuda.current_device()) + attention_mask = torch.tensor( + [[True]], + device=get_accelerator().current_device_name()) return super().forward(args, attention_mask) layers = [] diff --git a/tests/unit/model_parallelism/test_configurable_parallel_mp.py b/tests/unit/model_parallelism/test_configurable_parallel_mp.py index 93b66cee225a..d17f45c0b526 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_mp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_mp.py @@ -7,6 +7,7 @@ import random import numpy as np import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest, DistributedFixture from unit.megatron_model import get_gpt2_model, get_megatron_version @@ -42,7 +43,7 @@ def reset_random(self, seed=1234): random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) + get_accelerator().manual_seed_all(seed) @pytest.fixture def inputs(self, bs=1, seq_len=20): @@ -70,7 +71,10 @@ def test_gpt2_basic(self, tmpdir, inputs): model = get_deepspeed_model(model) model.eval() - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + device_name = get_accelerator().device_name() + baseline = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) tag = 'mp_1' state_dict = {} @@ -99,7 +103,10 @@ def test_gpt2_mp2_no_resize(self, tmpdir, inputs): model.eval() - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + device_name = get_accelerator().device_name() + baseline = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) tag = 'mp_2' state_dict = {} @@ -111,7 +118,10 @@ def test_gpt2_mp2_no_resize(self, tmpdir, inputs): load_optimizer_states=False, load_lr_scheduler_states=False) - test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + device_name = get_accelerator().device_name() + test = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" @@ -133,7 +143,10 @@ def run(self, inputs, class_tmpdir): model.eval() with torch.no_grad(): - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + 
device_name = get_accelerator().device_name() + baseline = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) if dist.get_rank() == 0: save_path = os.path.join(class_tmpdir, "output.pt") torch.save(baseline.cpu(), save_path) @@ -164,7 +177,10 @@ def test(self, baseline_mp2, inputs, class_tmpdir): model.load_checkpoint(class_tmpdir, load_optimizer_states=False, load_lr_scheduler_states=False) - test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) + device_name = get_accelerator().device_name() + test = model(inputs[0].to(device_name), + inputs[1].to(device_name), + inputs[2].to(device_name)) if dist.get_rank() == 0: load_path = os.path.join(class_tmpdir, "output.pt") baseline = torch.load(load_path) diff --git a/tests/unit/model_parallelism/test_configurable_parallel_pp.py b/tests/unit/model_parallelism/test_configurable_parallel_pp.py index 33fd33ff2975..af091d68c411 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_pp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_pp.py @@ -11,6 +11,7 @@ from unit.megatron_model import get_megatron_version from unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe from deepspeed.utils import RepeatingLoader +from deepspeed.accelerator import get_accelerator TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) @@ -33,7 +34,7 @@ def get_deepspeed_model(model): model, _, _,_ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=ds_config_dict) - return model.cuda() + return model.to(get_accelerator().device_name()) def get_topology(mp, pp, world_size): @@ -52,7 +53,7 @@ def reset_random(self, seed=1234): random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) + get_accelerator().manual_seed_all(seed) @pytest.fixture def inputs(self, bs=1, seq_len=1, hidden_size=128): @@ -155,7 +156,7 @@ def run(self, inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size): model = get_deepspeed_model(gpt2_pipe_model) with torch.no_grad(): - inputs = [x.cuda() for x in inputs] + inputs = [x.to(get_accelerator().device_name()) for x in inputs] if model.is_first_stage() or model.is_last_stage(): loader = RepeatingLoader([(inputs[0], 0)]) data_iter = iter(loader) @@ -225,7 +226,7 @@ def _test(self, tag=checkpoint_tag, load_optimizer_states=False, load_lr_scheduler_states=False) - inputs = [x.cuda() for x in inputs] + inputs = [x.to(get_accelerator().device_name()) for x in inputs] if model.is_first_stage() or model.is_last_stage(): loader = RepeatingLoader([(inputs[0], 0)]) data_iter = iter(loader) diff --git a/tests/unit/modeling.py b/tests/unit/modeling.py index 2624e0e2eedb..94dea45468bc 100644 --- a/tests/unit/modeling.py +++ b/tests/unit/modeling.py @@ -43,6 +43,7 @@ #from numba import cuda #from deepspeed_cuda import DeepSpeedSoftmaxConfig, DeepSpeedSoftmax +from deepspeed.accelerator import get_accelerator logger = logging.getLogger(__name__) @@ -184,8 +185,8 @@ def swish(x): class GPUTimer: def __init__(self): super().__init__() - self.start = cuda.event() # noqa: F821 - self.stop = cuda.event() # noqa: F821 + self.start = get_accelerator().Event() # noqa: F821 + self.stop = get_accelerator().Event() # noqa: F821 def record(self): self.start.record() @@ -749,12 +750,12 @@ def __init__(self, config, bert_model_embedding_weights): def forward(self, hidden_states): hidden_states = self.transform(hidden_states) - torch.cuda.nvtx.range_push( + 
get_accelerator().range_push( "decoder input.size() = {}, weight.size() = {}".format( hidden_states.size(), self.decoder.weight.size())) hidden_states = self.decoder(hidden_states) + self.bias - torch.cuda.nvtx.range_pop() + get_accelerator().range_pop() return hidden_states @@ -884,7 +885,7 @@ def from_pretrained(cls, weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) state_dict = torch.load( weights_path, - map_location='cpu' if not torch.cuda.is_available() else None) + map_location='cpu' if not get_accelerator().is_available() else None) if tempdir: # Clean up temp dir shutil.rmtree(tempdir) diff --git a/tests/unit/modelingpreln.py b/tests/unit/modelingpreln.py index e9947b2079b3..0069add9aa4d 100644 --- a/tests/unit/modelingpreln.py +++ b/tests/unit/modelingpreln.py @@ -39,6 +39,7 @@ from torch.nn import Module import torch.nn.functional as F import torch.nn.init as init +from deepspeed.accelerator import get_accelerator #from numba import cuda @@ -184,8 +185,8 @@ def swish(x): class GPUTimer: def __init__(self): super().__init__() - self.start = cuda.event() # noqa: F821 - self.stop = cuda.event() # noqa: F821 + self.start = get_accelerator().Event() # noqa: F821 + self.stop = get_accelerator().Event() # noqa: F821 def record(self): self.start.record() @@ -844,12 +845,12 @@ def __init__(self, config, bert_model_embedding_weights): def forward(self, hidden_states): hidden_states = self.transform(hidden_states) - torch.cuda.nvtx.range_push( + get_accelerator().range_push( "decoder input.size() = {}, weight.size() = {}".format( hidden_states.size(), self.decoder.weight.size())) hidden_states = self.decoder(hidden_states) + self.bias - torch.cuda.nvtx.range_pop() + get_accelerator().range_pop() return hidden_states @@ -979,7 +980,7 @@ def from_pretrained(cls, weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) state_dict = torch.load( weights_path, - map_location='cpu' if not torch.cuda.is_available() else None) + map_location='cpu' if not get_accelerator().is_available() else None) if tempdir: # Clean up temp dir shutil.rmtree(tempdir) diff --git a/tests/unit/ops/cuda/test_cuda_backward.py b/tests/unit/ops/accelerators/test_accelerator_backward.py similarity index 95% rename from tests/unit/ops/cuda/test_cuda_backward.py rename to tests/unit/ops/accelerators/test_accelerator_backward.py index c7a460161792..ad26daeb698c 100644 --- a/tests/unit/ops/cuda/test_cuda_backward.py +++ b/tests/unit/ops/accelerators/test_accelerator_backward.py @@ -8,6 +8,7 @@ import copy from torch import nn from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +from deepspeed.accelerator import get_accelerator from unit.modeling import BertConfig, BertLayerNorm, BertEncoder as BertEncoderPostln from unit.modelingpreln import BertEncoder as BertEncoderPreln from unit.common import DistributedTest @@ -84,7 +85,7 @@ def zero_grad(variables): variable.grad.zero_() -device = torch.device("cuda") +device = torch.device(get_accelerator().device_name()) kwargs_fp32 = {'dtype': torch.float, 'device': device, 'requires_grad': True} kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} @@ -210,8 +211,8 @@ def create_models(ds_config): bert_encoder.half() ds_encoder.half() - bert_encoder.cuda() - ds_encoder.cuda() + bert_encoder.to(get_accelerator().device_name()) + ds_encoder.to(get_accelerator().device_name()) return bert_encoder, ds_encoder @@ -288,9 +289,9 @@ def test_backward(self, is_preln, use_fp16, atol): - # Only run fp16 test cases on devices with 7+ 
capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and (use_fp16 is True or is_preln is False): + # Only run fp16 test cases on devices with FP16 capability. + if not get_accelerator().is_fp16_supported() and (use_fp16 is True + or is_preln is False): return ds_config = DeepSpeedTransformerConfig() @@ -322,9 +323,8 @@ def test_backward(self, # is_preln, # use_fp16, # atol): - # # Only run fp16 test cases on devices with 7+ capability. - # major, _ = torch.cuda.get_device_capability() - # if major < 7 and (use_fp16 is True or is_preln is False): + # # Only run fp16 test cases on devices with FP16 capability. + # if not get_accelerator().is_fp16_supported() and use_fp16 is True: # return # # ds_config = DeepSpeedTransformerConfig() diff --git a/tests/unit/ops/cuda/test_cuda_forward.py b/tests/unit/ops/accelerators/test_accelerator_forward.py similarity index 95% rename from tests/unit/ops/cuda/test_cuda_forward.py rename to tests/unit/ops/accelerators/test_accelerator_forward.py index 3958a220b35b..317e2fe3cb45 100644 --- a/tests/unit/ops/cuda/test_cuda_forward.py +++ b/tests/unit/ops/accelerators/test_accelerator_forward.py @@ -10,6 +10,7 @@ from unit.modelingpreln import BertEncoder as BertEncoderPreln from unit.modeling import BertLayerNorm, BertConfig, BertEncoder as BertEncoderPostln from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -31,7 +32,7 @@ def zero_grad(variables): variable.grad.zero_() -device = torch.device("cuda") +device = torch.device(get_accelerator().device_name()) kwargs_fp32 = {'dtype': torch.float, 'device': device, 'requires_grad': True} kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} @@ -150,8 +151,8 @@ def create_models(ds_config): bert_encoder.half() ds_encoder.half() - bert_encoder.cuda() - ds_encoder.cuda() + bert_encoder.to(get_accelerator().device_name()) + ds_encoder.to(get_accelerator().device_name()) return bert_encoder, ds_encoder @@ -241,9 +242,8 @@ def test_forward(self, num_layers, is_preln, use_fp16): - # Only run fp16 test cases on devices with 7+ capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: + # Only run fp16 test cases on devices with FP16 capability. + if not get_accelerator().is_fp16_supported() and use_fp16 is True: return ds_config = DeepSpeedTransformerConfig() @@ -281,9 +281,8 @@ def test_forward_with_small_bsz(self, num_layers, is_preln, use_fp16): - # Only run fp16 test cases on devices with 7+ capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: + # Only run fp16 test cases on devices with FP16 capability. + if not get_accelerator().is_fp16_supported() and use_fp16 is True: return ds_config = DeepSpeedTransformerConfig() @@ -319,9 +318,8 @@ def test_forward_stochastic(self, num_layers, is_preln, use_fp16): - # Only run fp16 test cases on devices with 7+ capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: + # Only run fp16 test cases on devices with FP16 capability. 
+ if not get_accelerator().is_fp16_supported() and use_fp16 is True: return ds_config = DeepSpeedTransformerConfig() diff --git a/tests/unit/ops/adagrad/test_cpu_adagrad.py b/tests/unit/ops/adagrad/test_cpu_adagrad.py index b5dc7dea1be3..17001e6bd021 100644 --- a/tests/unit/ops/adagrad/test_cpu_adagrad.py +++ b/tests/unit/ops/adagrad/test_cpu_adagrad.py @@ -6,6 +6,7 @@ import deepspeed from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import CPUAdagradBuilder from unit.common import DistributedTest @@ -26,7 +27,7 @@ def check_equal(first, second, atol=1e-2, verbose=False): class TestCPUAdagrad(DistributedTest): world_size = 1 requires_cuda_env = False - if not torch.cuda.is_available(): + if not get_accelerator().is_available(): init_distributed = False set_dist_env = False @@ -138,7 +139,7 @@ def gen_sparse_grad(vocabulary_size, dim, num_indices, dtype, device): class TestCPUAdagradGPUError(DistributedTest): def test_cpu_adagrad_gpu_error(self): model_size = 64 - device = 'cuda:0' + device = get_accelerator().device_name(0) # 'cuda:0' or 'xpu:0' param = torch.nn.Parameter(torch.randn(model_size, device=device)) optimizer = DeepSpeedCPUAdagrad([param]) diff --git a/tests/unit/ops/adam/test_cpu_adam.py b/tests/unit/ops/adam/test_cpu_adam.py index a9a235d57612..d10fb98105a8 100644 --- a/tests/unit/ops/adam/test_cpu_adam.py +++ b/tests/unit/ops/adam/test_cpu_adam.py @@ -6,6 +6,7 @@ from cpuinfo import get_cpu_info import deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.adam import FusedAdam from deepspeed.ops.op_builder import CPUAdamBuilder from unit.common import DistributedTest @@ -56,11 +57,11 @@ def _compare_optimizers(model_size, param1, optimizer1, param2, optimizer2): class TestCPUAdam(DistributedTest): world_size = 1 requires_cuda_env = False - if not torch.cuda.is_available(): + if not get_accelerator().is_available(): init_distributed = False set_dist_env = False - @pytest.mark.skipif(not torch.cuda.is_available(), + @pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.") def test_fused_adam_equal(self, dtype, model_size): if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): @@ -70,7 +71,7 @@ def test_fused_adam_equal(self, dtype, model_size): cpu_data = torch.randn(model_size, device='cpu').to(dtype) cpu_param = torch.nn.Parameter(cpu_data) - cuda_param = torch.nn.Parameter(cpu_data.cuda()) + cuda_param = torch.nn.Parameter(cpu_data.to(get_accelerator().device_name())) # tolerance = cpu_param.float().norm().detach().numpy() * 1e-2 # check_equal(cpu_param.float().norm(), @@ -88,10 +89,10 @@ def test_fused_adam_equal(self, dtype, model_size): optimizer2=cuda_optimizer) def test_torch_adamw_equal(self, dtype, model_size): - if torch.cuda.is_available(): + if get_accelerator().is_available(): if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): pytest.skip("cpu-adam with half precision not supported on AMD CPUs") - ref_param_device = 'cuda' + ref_param_device = get_accelerator().device_name() else: if dtype == torch.half: pytest.skip( @@ -119,7 +120,7 @@ class TestCPUAdamGPUError(DistributedTest): def test_cpu_adam_gpu_error(self): model_size = 64 from deepspeed.ops.adam import DeepSpeedCPUAdam - device = 'cuda:0' + device = get_accelerator().device_name(0) # 'cuda:0' or 'xpu:0' param = torch.nn.Parameter(torch.randn(model_size, device=device)) optimizer = DeepSpeedCPUAdam([param]) diff --git 
a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py index 886354c38935..86265ab15ef9 100644 --- a/tests/unit/ops/aio/test_aio.py +++ b/tests/unit/ops/aio/test_aio.py @@ -6,7 +6,8 @@ import torch import deepspeed import deepspeed.comm as dist -from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import AsyncIOBuilder from unit.common import DistributedTest KILO_BYTE = 1024 @@ -20,7 +21,7 @@ def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True): - if not torch.cuda.is_available(): + if not get_accelerator().is_available(): if use_cuda_device: pytest.skip("GPU tensors only supported in CUDA environments.") if use_cuda_pinned_tensor: @@ -28,7 +29,7 @@ def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=T def _get_local_rank(): - if torch.cuda.is_available(): + if get_accelerator().is_available(): return dist.get_rank() return 0 @@ -50,14 +51,14 @@ def _get_test_write_file(tmpdir, index): def _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer, index=0): test_file = _get_test_write_file(tmpdir, index) - test_buffer = torch.cuda.ByteTensor(list(ref_buffer)) + test_buffer = get_accelerator().ByteTensor(list(ref_buffer)) return test_file, test_buffer def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, index=0): test_file = _get_test_write_file(tmpdir, index) if aio_handle is None: - test_buffer = torch.ByteTensor(list(ref_buffer)).pin_memory() + test_buffer = get_accelerator().pin_memory(torch.ByteTensor(list(ref_buffer))) else: tmp_buffer = torch.ByteTensor(list(ref_buffer)) test_buffer = aio_handle.new_cpu_locked_tensor(len(ref_buffer), tmp_buffer) @@ -80,7 +81,7 @@ def _validate_handle_state(handle, single_submit, overlap_events): class TestRead(DistributedTest): world_size = 1 requires_cuda_env = False - if not torch.cuda.is_available(): + if not get_accelerator().is_available(): init_distributed = False set_dist_env = False @@ -99,9 +100,10 @@ def test_parallel_read(self, IO_PARALLEL) if use_cuda_pinned_tensor: - aio_buffer = torch.empty(IO_SIZE, - dtype=torch.uint8, - device='cpu').pin_memory() + aio_buffer = get_accelerator().pin_memory( + torch.empty(IO_SIZE, + dtype=torch.uint8, + device='cpu')) else: aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, @@ -138,11 +140,14 @@ def test_async_read(self, IO_PARALLEL) if cuda_device: - aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device='cuda') - elif use_cuda_pinned_tensor: aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, - device='cpu').pin_memory() + device=get_accelerator().device_name()) + elif use_cuda_pinned_tensor: + aio_buffer = get_accelerator().pin_memory( + torch.empty(IO_SIZE, + dtype=torch.uint8, + device='cpu')) else: aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, @@ -172,7 +177,7 @@ def test_async_read(self, class TestWrite(DistributedTest): world_size = 1 requires_cuda_env = False - if not torch.cuda.is_available(): + if not get_accelerator().is_available(): init_distributed = False set_dist_env = False @@ -258,7 +263,7 @@ def test_async_write(self, class TestAsyncQueue(DistributedTest): world_size = 1 requires_cuda_env = False - if not torch.cuda.is_available(): + if not get_accelerator().is_available(): init_distributed = False set_dist_env = False @@ -285,13 +290,15 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): aio_buffers = [ torch.empty(IO_SIZE, dtype=torch.uint8, - device='cuda') 
for _ in range(async_queue) + device=get_accelerator().device_name()) + for _ in range(async_queue) ] elif use_cuda_pinned_tensor: aio_buffers = [ - torch.empty(IO_SIZE, - dtype=torch.uint8, - device='cpu').pin_memory() for _ in range(async_queue) + get_accelerator().pin_memory( + torch.empty(IO_SIZE, + dtype=torch.uint8, + device='cpu')) for _ in range(async_queue) ] else: tmp_tensor = torch.empty(0, dtype=torch.uint8) diff --git a/tests/unit/ops/quantizer/test_dequantize.py b/tests/unit/ops/quantizer/test_dequantize.py index c211b3411a29..5dc2f7d68a70 100644 --- a/tests/unit/ops/quantizer/test_dequantize.py +++ b/tests/unit/ops/quantizer/test_dequantize.py @@ -5,6 +5,7 @@ import pytest import torch from deepspeed.ops import op_builder +from deepspeed.accelerator import get_accelerator quantize_module = None @@ -75,7 +76,7 @@ def test_dequantize(num_elems, num_groups, is_symmetric_quant, q_bits): activations = torch.randn((num_groups, num_elems), dtype=torch.float16, - device='cuda') + device=get_accelerator().device_name()) quantized_data, params = run_quantize(activations, num_groups, q_bits, is_symmetric_quant) ds_dequant = run_dequantize(quantized_data, diff --git a/tests/unit/ops/quantizer/test_fake_quantization.py b/tests/unit/ops/quantizer/test_fake_quantization.py index 62b154d34ff8..c5304f7694ee 100644 --- a/tests/unit/ops/quantizer/test_fake_quantization.py +++ b/tests/unit/ops/quantizer/test_fake_quantization.py @@ -2,6 +2,7 @@ import torch import pytest +from deepspeed.accelerator import get_accelerator from deepspeed.ops import op_builder quantizer_cuda_module = None @@ -44,7 +45,8 @@ def run_quant_dequant(inputs, groups, bits): # Note that we have an explicit boundary for groups as ((size / groups) - 1) / 4096 + 1) <= MAX_REG. def test_fake_quant_dequant(tensor_shape, groups): - input_tensor = torch.rand((tensor_shape), dtype=torch.float16).cuda() + input_tensor = torch.rand((tensor_shape), + dtype=torch.float16).to(get_accelerator().device_name()) # 8-bit quantization. 
ref_input_8bit = input_tensor.clone().detach() diff --git a/tests/unit/ops/quantizer/test_quantize.py b/tests/unit/ops/quantizer/test_quantize.py index 1aa2c79643e6..3cfd812e63f9 100644 --- a/tests/unit/ops/quantizer/test_quantize.py +++ b/tests/unit/ops/quantizer/test_quantize.py @@ -5,6 +5,7 @@ import pytest import torch from deepspeed.ops import op_builder +from deepspeed.accelerator import get_accelerator inference_module = None @@ -27,8 +28,8 @@ def get_q_props(q_bits): q_min = -(2**(q_bits - 1)) q_max = (2**(q_bits - 1) - 1) - q_min = torch.IntTensor([q_min]).to(device='cuda') - q_max = torch.IntTensor([q_max]).to(device='cuda') + q_min = torch.IntTensor([q_min]).to(device=get_accelerator().device_name()) + q_max = torch.IntTensor([q_max]).to(device=get_accelerator().device_name()) return q_range, q_max, q_min @@ -46,7 +47,9 @@ def get_scale_zero_point(q_bits, scale = torch.empty_like(absmax) for i, x in enumerate(absmax): scale[i] = torch.ones_like(x) if x == 0 else q_range / (2 * x) - zero_point = torch.zeros(scale.shape, dtype=torch.float32, device='cuda') + zero_point = torch.zeros(scale.shape, + dtype=torch.float32, + device=get_accelerator().device_name()) else: scale = torch.empty_like(max) for i, x in enumerate(max): @@ -125,12 +128,12 @@ def test_float_quantize(num_elems, activations_ds = torch.zeros((num_groups, num_elems), dtype=torch.float16, - device='cuda') + device=get_accelerator().device_name()) else: activations_ds = torch.randn((num_groups, num_elems), dtype=torch.float16, - device='cuda') + device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() ref_out_tensor, ref_params = run_float_quantize(q_bits, is_symmetric_quant, activations_ref, num_groups) diff --git a/tests/unit/ops/sparse_attention/test_sparse_attention.py b/tests/unit/ops/sparse_attention/test_sparse_attention.py index d663698beb3e..a4fc49354739 100644 --- a/tests/unit/ops/sparse_attention/test_sparse_attention.py +++ b/tests/unit/ops/sparse_attention/test_sparse_attention.py @@ -8,6 +8,7 @@ import pytest import torch import deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import SparseAttnBuilder if not deepspeed.ops.__compatible_ops__[SparseAttnBuilder.NAME]: @@ -94,7 +95,13 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo if layout is None: layout = make_layout(rho, (H, M // block, N // block)) if dense_x: - x = torch.rand((Z, H, M, N), dtype=dtype, requires_grad=True, device='cuda') + x = torch.rand((Z, + H, + M, + N), + dtype=dtype, + requires_grad=True, + device=get_accelerator().device_name()) else: x = torch.rand((Z, layout.sum(), @@ -102,7 +109,7 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo block), dtype=dtype, requires_grad=True, - device='cuda') + device=get_accelerator().device_name()) dx = torch.rand_like(x) bool_attn_mask = torch.randint(low=0, high=2, @@ -110,7 +117,7 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo N), dtype=torch.bool, requires_grad=False, - device='cuda') + device=get_accelerator().device_name()) fp_attn_mask = bool_attn_mask.type(dtype) kp_mask = torch.randint(low=0, high=2, @@ -118,20 +125,24 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo N), dtype=dtype, requires_grad=False, - device='cuda') + device=get_accelerator().device_name()) kp_mask[kp_mask == 1.] 
= float('-inf') return layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask def _skip_on_cuda_compatability(): - if torch.cuda.get_device_capability()[0] < 7: - pytest.skip("needs higher compute capability than 7") - cuda_major = int(torch.version.cuda.split('.')[0]) * 10 - cuda_minor = int(torch.version.cuda.split('.')[1]) - cuda_version = cuda_major + cuda_minor - if (cuda_version != 101 and cuda_version != 102) and \ - (cuda_version != 111 and cuda_version != 110): - pytest.skip("requires cuda 10.1 or 10.2 or 11.0 or 11.1") + if deepspeed.accelerator.get_accelerator().device_name() == 'cuda': + if torch.cuda.get_device_capability()[0] < 7: + pytest.skip("needs higher compute capability than 7") + cuda_major = int(torch.version.cuda.split('.')[0]) * 10 + cuda_minor = int(torch.version.cuda.split('.')[1]) + cuda_version = cuda_major + cuda_minor + if (cuda_version != 101 and cuda_version != 102) and \ + (cuda_version != 111 and cuda_version != 110): + pytest.skip("requires cuda 10.1 or 10.2 or 11.0 or 11.1") + else: + assert deepspeed.accelerator.get_accelerator().device_name() == 'xpu' + return @pytest.mark.parametrize("block", [16, 32]) @@ -195,9 +206,21 @@ def init_matmul_inputs(Z, H, M, N, K, rho, mode, trans_a, trans_b, block, dtype, BS0 = N if trans_b else K BS1 = K if trans_b else N shape = {'sdd': (M, N), 'dsd': (AS0, AS1), 'dds': (BS0, BS1)}[mode] - x = torch.rand((Z, H, AS0, AS1), dtype=dtype, requires_grad=True, device='cuda') - w = torch.rand((Z, H, BS0, BS1), dtype=dtype, requires_grad=True, device='cuda') - dy = torch.rand((Z, H, M, N), dtype=dtype, device='cuda') + x = torch.rand((Z, + H, + AS0, + AS1), + dtype=dtype, + requires_grad=True, + device=get_accelerator().device_name()) + w = torch.rand((Z, + H, + BS0, + BS1), + dtype=dtype, + requires_grad=True, + device=get_accelerator().device_name()) + dy = torch.rand((Z, H, M, N), dtype=dtype, device=get_accelerator().device_name()) if layout is None: layout = make_layout(rho, (H, shape[0] // block, shape[1] // block)) else: diff --git a/tests/unit/ops/spatial/test_nhwc_bias_add.py b/tests/unit/ops/spatial/test_nhwc_bias_add.py index c863d0f6f1d9..f3a31cf47ba4 100644 --- a/tests/unit/ops/spatial/test_nhwc_bias_add.py +++ b/tests/unit/ops/spatial/test_nhwc_bias_add.py @@ -5,6 +5,7 @@ import pytest import torch from deepspeed.ops.transformer.inference.bias_add import nhwc_bias_add +from deepspeed.accelerator import get_accelerator def allclose(x, y): @@ -40,13 +41,16 @@ def ref_bias_add(activations, bias): @pytest.mark.parametrize("image_size", [16, 32, 64]) @pytest.mark.parametrize("channels", channels_list) def test_bias_add(batch, image_size, channels): - activations = torch.randn((batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device="cuda").to(memory_format=torch.channels_last) - bias = torch.randn((channels), dtype=torch.float16, device="cuda") + activations = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), + dtype=torch.float16, + device=get_accelerator().device_name()) ref_vals = ref_bias_add(activations.clone().detach(), bias) ds_vals = nhwc_bias_add(activations, bias) @@ -63,19 +67,23 @@ def ref_bias_add_add(activations, bias, other): @pytest.mark.parametrize("image_size", [16, 32, 64]) @pytest.mark.parametrize("channels", channels_list) def test_bias_add_add(batch, image_size, channels): - activations = torch.randn((batch, - channels, - 
image_size, - image_size), - dtype=torch.float16, - device="cuda").to(memory_format=torch.channels_last) - other = torch.randn((batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device="cuda").to(memory_format=torch.channels_last) - bias = torch.randn((channels), dtype=torch.float16, device="cuda") + activations = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + other = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), + dtype=torch.float16, + device=get_accelerator().device_name()) ref_vals = ref_bias_add_add(activations.clone().detach(), bias, other) ds_vals = nhwc_bias_add(activations, bias, other=other) @@ -98,20 +106,26 @@ def ref_bias_add_bias_add(activations, bias, other, other_bias): @pytest.mark.parametrize("image_size", [16, 32, 64]) @pytest.mark.parametrize("channels", channels_list) def test_bias_add_bias_add(batch, image_size, channels): - activations = torch.randn((batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device="cuda").to(memory_format=torch.channels_last) - other = torch.randn((batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device="cuda").to(memory_format=torch.channels_last) - bias = torch.randn((channels), dtype=torch.float16, device="cuda") - other_bias = torch.randn((channels), dtype=torch.float16, device="cuda") + activations = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + other = torch.randn( + (batch, + channels, + image_size, + image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), + dtype=torch.float16, + device=get_accelerator().device_name()) + other_bias = torch.randn((channels), + dtype=torch.float16, + device=get_accelerator().device_name()) ref_vals = ref_bias_add_bias_add(activations.clone().detach(), bias, diff --git a/tests/unit/ops/transformer/inference/test_bias_add.py b/tests/unit/ops/transformer/inference/test_bias_add.py index f8d759c3f567..f3475a14766d 100644 --- a/tests/unit/ops/transformer/inference/test_bias_add.py +++ b/tests/unit/ops/transformer/inference/test_bias_add.py @@ -3,6 +3,7 @@ import pytest import torch import deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: @@ -39,8 +40,14 @@ def run_bias_add_ds(activations, bias): @pytest.mark.parametrize("channels", [512, 1232, 4096]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) def test_bias_add(batch, sequence, channels, dtype): - activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') - bias_ds = torch.randn((channels), dtype=dtype, device='cuda') + activations_ds = torch.randn((batch, + sequence, + channels), + dtype=dtype, + device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), + dtype=dtype, + device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() bias_ref = bias_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_bias_geglu.py 
b/tests/unit/ops/transformer/inference/test_bias_geglu.py index a9f221488a4c..c70974e51d94 100644 --- a/tests/unit/ops/transformer/inference/test_bias_geglu.py +++ b/tests/unit/ops/transformer/inference/test_bias_geglu.py @@ -6,6 +6,7 @@ import torch import deepspeed from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.accelerator import get_accelerator if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", @@ -43,8 +44,14 @@ def run_bias_geglu_ds(activation, bias): @pytest.mark.parametrize("channels", [512, 1232, 4096]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_bias_geglu(batch, sequence, channels, dtype): - activation = torch.randn((batch, sequence, channels * 2), dtype=dtype, device='cuda') - bias = torch.randn((channels * 2), dtype=dtype, device='cuda') + activation = torch.randn((batch, + sequence, + channels * 2), + dtype=dtype, + device=get_accelerator().device_name()) + bias = torch.randn((channels * 2), + dtype=dtype, + device=get_accelerator().device_name()) ds_out = run_bias_geglu_ds(activation, bias) ref_out = run_bias_geglu_reference(activation, bias) diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py index a58512c9c83a..3c1762179ead 100644 --- a/tests/unit/ops/transformer/inference/test_bias_gelu.py +++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py @@ -5,6 +5,7 @@ import pytest import torch import deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder from packaging import version as pkg_version @@ -48,8 +49,14 @@ def test_bias_gelu(batch, sequence, channels, dtype): if pkg_version.parse(torch.__version__) < pkg_version.parse("1.12"): pytest.skip("gelu implementation matches only after torch 1.12") - activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') - bias_ds = torch.randn((channels), dtype=dtype, device='cuda') + activations_ds = torch.randn((batch, + sequence, + channels), + dtype=dtype, + device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), + dtype=dtype, + device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() bias_ref = bias_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_bias_relu.py b/tests/unit/ops/transformer/inference/test_bias_relu.py index 932b02c01bfa..e2b66f6bd2e1 100644 --- a/tests/unit/ops/transformer/inference/test_bias_relu.py +++ b/tests/unit/ops/transformer/inference/test_bias_relu.py @@ -5,6 +5,7 @@ import pytest import torch import deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: @@ -43,8 +44,14 @@ def run_bias_relu_ds(activations, bias): @pytest.mark.parametrize("channels", [512, 1232, 4096]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_bias_relu(batch, sequence, channels, dtype): - activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') - bias_ds = torch.randn((channels), dtype=dtype, device='cuda') + activations_ds = torch.randn((batch, + sequence, + channels), + dtype=dtype, + device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), + dtype=dtype, + device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() bias_ref = 
bias_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_layer_norm.py b/tests/unit/ops/transformer/inference/test_layer_norm.py index 3f8ddcd9615b..61f6455629e6 100644 --- a/tests/unit/ops/transformer/inference/test_layer_norm.py +++ b/tests/unit/ops/transformer/inference/test_layer_norm.py @@ -5,6 +5,7 @@ import deepspeed import torch import pytest +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: @@ -48,9 +49,13 @@ def test_layer_norm(batch, seq_len, channels, dtype): seq_len, channels), dtype=dtype, - device=torch.cuda.current_device()) - gamma = torch.randn((channels), dtype=dtype, device=torch.cuda.current_device()) - beta = torch.rand((channels), dtype=dtype, device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + beta = torch.rand((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) epsilon = 1e-5 ref_output = ref_implementation(vals, gamma, beta, epsilon, channels, dtype) @@ -89,15 +94,21 @@ def test_layer_norm_residual(batch, seq_len, channels, dtype): seq_len, channels), dtype=dtype, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) residual = torch.randn((batch, seq_len, channels), dtype=dtype, - device=torch.cuda.current_device()) - bias = torch.randn((channels), dtype=dtype, device=torch.cuda.current_device()) - gamma = torch.randn((channels), dtype=dtype, device=torch.cuda.current_device()) - beta = torch.rand((channels), dtype=dtype, device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) + bias = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + beta = torch.rand((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) epsilon = 1e-5 new_output = residual_ds_implementation(vals, bias, residual, gamma, beta, epsilon) @@ -158,15 +169,21 @@ def test_layer_norm_residual_store_pre_ln_res(batch, seq_len, channels, dtype): seq_len, channels), dtype=dtype, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) residual = torch.randn((batch, seq_len, channels), dtype=dtype, - device=torch.cuda.current_device()) - bias = torch.randn((channels), dtype=dtype, device=torch.cuda.current_device()) - gamma = torch.randn((channels), dtype=dtype, device=torch.cuda.current_device()) - beta = torch.rand((channels), dtype=dtype, device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) + bias = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) + beta = torch.rand((channels), + dtype=dtype, + device=get_accelerator().current_device_name()) epsilon = 1e-5 # Need to run the reference first since there's an in-place component to ours diff --git a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py index defdc99ac508..fdd6e8607c71 100644 --- a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py +++ b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py @@ -5,6 +5,7 @@ import pytest import torch import 
deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: @@ -41,10 +42,22 @@ def test_moe_residual_matmul(hidden_dim, c, dtype): hidden_dim * c, hidden_dim), dtype=dtype, - device='cuda') - coeff1 = torch.randn((1, 1, hidden_dim), dtype=dtype, device='cuda') - coeff2 = torch.randn((1, 1, hidden_dim), dtype=dtype, device='cuda') - out_ds = torch.randn((c, hidden_dim * c, hidden_dim), dtype=dtype, device='cuda') + device=get_accelerator().device_name()) + coeff1 = torch.randn((1, + 1, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + coeff2 = torch.randn((1, + 1, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + out_ds = torch.randn((c, + hidden_dim * c, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) coeff_ds = torch.cat((coeff1, coeff2), dim=-1) residual_ref = residual_ds.clone().detach() coeff_ref = coeff_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_residual_add.py b/tests/unit/ops/transformer/inference/test_residual_add.py index e5d4f08c50f4..0dacee355369 100644 --- a/tests/unit/ops/transformer/inference/test_residual_add.py +++ b/tests/unit/ops/transformer/inference/test_residual_add.py @@ -5,6 +5,7 @@ import pytest import torch import deepspeed +from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: @@ -95,11 +96,27 @@ def test_residual_add(inference_module, add_bias, mp_size, pre_attn_norm): - ds_out = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device='cuda') - residual = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device='cuda') - attn_output = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device='cuda') - final_bias = torch.randn((hidden_dim), dtype=dtype, device='cuda') - attn_bias = torch.randn((hidden_dim), dtype=dtype, device='cuda') + ds_out = torch.randn((batch, + sequence, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + residual = torch.randn((batch, + sequence, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + attn_output = torch.randn((batch, + sequence, + hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + final_bias = torch.randn((hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) + attn_bias = torch.randn((hidden_dim), + dtype=dtype, + device=get_accelerator().device_name()) ref_out = ds_out.clone() ref_out = run_residual_add_reference(ref_out, diff --git a/tests/unit/pipe/test_pipe_module.py b/tests/unit/pipe/test_pipe_module.py index 5454c7f8969e..e8404b0d5a17 100644 --- a/tests/unit/pipe/test_pipe_module.py +++ b/tests/unit/pipe/test_pipe_module.py @@ -11,6 +11,7 @@ import deepspeed from deepspeed.pipe import PipelineModule from deepspeed.utils import RepeatingLoader +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -72,7 +73,8 @@ def test(self, sequential_model, simple_config, batch_input): # Ensure all parameters are accounted for. 
my_params = sum(p.numel() for p in pipe_model.parameters()) - total_pipe_params = torch.LongTensor([my_params]).to('cuda') + total_pipe_params = torch.LongTensor([my_params + ]).to(get_accelerator().device_name()) dist.all_reduce(total_pipe_params) total_pipe_params = total_pipe_params.item() assert total_pipe_params == base_params @@ -83,7 +85,7 @@ def test(self, sequential_model, simple_config, batch_input): model_parameters=[p for p in pipe_model.parameters()]) if pipe_model.is_first_stage or pipe_model.is_last_stage: - pipe_input = base_input.clone().detach().to('cuda') + pipe_input = base_input.clone().detach().to(get_accelerator().device_name()) # label 0 is meaningless dataset = [(pipe_input, 0)] loader = RepeatingLoader(dataset) diff --git a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py index f1d0c79d05d5..af354fe1caa6 100644 --- a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py +++ b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py @@ -5,6 +5,7 @@ import pytest import torch import deepspeed +from deepspeed.accelerator import get_accelerator from copy import deepcopy from unit.common import DistributedTest @@ -38,7 +39,7 @@ def _prep_inputs(*inputs): for inp in inputs: inp = deepcopy(inp) if torch.is_tensor(inp): - inp = inp.cuda() + inp = inp.to(get_accelerator().device_name()) _inputs.append(inp) return tuple(_inputs) @@ -59,7 +60,7 @@ def _match_outputs(ref, tgt): def _test_activation_checkpoint(module, *inputs): # Move to device - module.cuda() + module.to(get_accelerator().device_name()) # Get rid of dropouts until we fork the RNG between tests. module.eval() @@ -79,7 +80,7 @@ def _test_activation_checkpoint(module, *inputs): def _test_activation_checkpoint_ordering(module, expected_ordering, *inputs): # Move to device - module.cuda() + module.to(get_accelerator().device_name()) # Get rid of dropouts until we fork the RNG between tests. 
module.eval() diff --git a/tests/unit/runtime/comm/test_coalesced_collectives.py b/tests/unit/runtime/comm/test_coalesced_collectives.py index a072eb0803c4..fa1041379a6b 100644 --- a/tests/unit/runtime/comm/test_coalesced_collectives.py +++ b/tests/unit/runtime/comm/test_coalesced_collectives.py @@ -4,6 +4,7 @@ import torch import deepspeed.comm as dist from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -16,7 +17,7 @@ def test_single_input(self): ), dist.get_rank(), dtype=torch.half, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) @@ -24,7 +25,10 @@ def test_single_input(self): assert torch.allclose(output, torch.full_like(output, 0.5)) def test_two_inputs(self): - tensor_kwargs = {"device": torch.cuda.current_device(), "dtype": torch.half} + tensor_kwargs = { + "device": get_accelerator().current_device_name(), + "dtype": torch.half + } inputs = [ dist.get_rank() * torch.arange(0, 6, @@ -52,7 +56,10 @@ class TestReduceScatterCoalescedTensorSmallerThanWorldSize(DistributedTest): world_size = 2 def test(self): - input = torch.zeros((1, ), dtype=torch.half, device=torch.cuda.current_device()) + input = torch.zeros((1, + ), + dtype=torch.half, + device=get_accelerator().current_device_name()) (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py index 6b4a2f5d7989..84a36768174a 100644 --- a/tests/unit/runtime/half_precision/onebit/test_onebit.py +++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py @@ -15,6 +15,7 @@ from unit.common import DistributedTest from unit.simple_model import SimpleModel, random_dataloader from unit.alexnet_model import AlexNetPipe, train_cifar +from deepspeed.accelerator import get_accelerator PipeTopo = PipeDataParallelTopology @@ -48,7 +49,7 @@ def test(self, dtype): "weight_decay": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -91,7 +92,7 @@ def test(self): "weight_decay": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -158,7 +159,7 @@ def test(self, tmpdir): "weight_decay": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -314,7 +315,7 @@ def test_overflow(self, tmpdir): "weight_decay": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -381,7 +382,7 @@ def test(self, topo_config): "weight_decay": 3e-7, "freeze_step": 200, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -436,7 +437,7 @@ def test(self, dtype): "local_step_scaler": 1, "local_step_clipper": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -482,7 +483,7 @@ def test(self): 
"local_step_scaler": 1, "local_step_clipper": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -552,7 +553,7 @@ def test(self, tmpdir): "local_step_scaler": 1, "local_step_clipper": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -707,7 +708,7 @@ def test_overflow(self, tmpdir): "local_step_scaler": 1, "local_step_clipper": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -777,7 +778,7 @@ def test(self, topo_config): "local_step_scaler": 1, "local_step_clipper": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -831,7 +832,7 @@ def test(self, dtype): "min_coeff": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), "coeff_beta": 0.9, "factor_max": 1.0, "factor_min": 0.5, @@ -880,7 +881,7 @@ def test(self): "min_coeff": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), "coeff_beta": 0.9, "factor_max": 1.0, "factor_min": 0.5, @@ -952,7 +953,7 @@ def test(self, tmpdir): "min_coeff": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), "coeff_beta": 0.9, "factor_max": 1.0, "factor_min": 0.5, @@ -1127,7 +1128,7 @@ def test_overflow(self, tmpdir): "min_coeff": 0.01, "freeze_step": 2, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), "coeff_beta": 0.9, "factor_max": 1.0, "factor_min": 0.5, @@ -1198,7 +1199,7 @@ def test(self, topo_config): "weight_decay": 3e-7, "freeze_step": 200, "cuda_aware": False, - "comm_backend_name": "nccl", + "comm_backend_name": get_accelerator().communication_backend_name(), }, }, "gradient_clipping": 1.0, @@ -1246,7 +1247,7 @@ def test(self, tmpdir): rank = dist.get_rank() backend = NcclBackend() local_rank = dist.get_rank() - device = torch.device("cuda", dist.get_rank()) + device = torch.device(get_accelerator().device_name(), dist.get_rank()) # A simulated compression function using deepspeed.comm def torch_sim(a): @@ -1268,7 +1269,7 @@ def torch_sim(a): [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] - torch.cuda.synchronize() + get_accelerator().synchronize() dist.barrier() return a_server_compressed, worker_error, server_error @@ -1288,7 +1289,7 @@ def torch_sim(a): server_error = torch.zeros(right_server_size, device=device) a_torch, worker_error_torch, server_error_torch = torch_sim(a) - torch.cuda.empty_cache() + get_accelerator().empty_cache() a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) diff --git a/tests/unit/runtime/half_precision/test_fp16.py b/tests/unit/runtime/half_precision/test_fp16.py index 57f6de1be09a..c3c933fca144 100644 --- a/tests/unit/runtime/half_precision/test_fp16.py +++ b/tests/unit/runtime/half_precision/test_fp16.py @@ -6,9 +6,10 @@ import pytest from deepspeed.ops.adam import FusedAdam from 
unit.common import DistributedTest -from deepspeed.ops.op_builder import CPUAdamBuilder from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader, SimpleMoEModel, sequence_dataloader from unit.util import required_torch_version +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import CPUAdamBuilder try: from apex import amp # noqa: F401 @@ -195,7 +196,7 @@ def test_unfused_gradnorm(self, monkeypatch): hidden_dim = 10 def mock_unscale_and_clip_grads(total_norm, apply_scale=True): - torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) + torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) all_gather_results = [ torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) ] @@ -236,7 +237,7 @@ def test_fused_gradnorm(self, monkeypatch): hidden_dim = 10 def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True): - torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) + torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) all_gather_results = [ torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) ] @@ -285,7 +286,7 @@ def test_lamb_gradnorm(self, monkeypatch, fused_lamb_legacy: bool): hidden_dim = 10 def mock_unscale_and_clip_grads(total_norm, apply_scale=True): - torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) + torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) all_gather_results = [ torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) ] diff --git a/tests/unit/runtime/pipe/test_topology.py b/tests/unit/runtime/pipe/test_topology.py index 9c71ce7d72d1..4b0cc42d4336 100644 --- a/tests/unit/runtime/pipe/test_topology.py +++ b/tests/unit/runtime/pipe/test_topology.py @@ -9,6 +9,7 @@ from deepspeed.runtime.pipe.topology import ProcessTopology as Topo from deepspeed.runtime.pipe.topology import _prime_factors +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -175,13 +176,13 @@ def test_grid_pipe_data(self): grid.get_stage_id() == grid.get_pipe_parallel_world_size() - 1) # Test collectives along the pipeline parallel process groups - rank_tensor = torch.LongTensor(data=[rank]).cuda() + rank_tensor = torch.LongTensor(data=[rank]).to(get_accelerator().device_name()) dist.all_reduce(rank_tensor, group=grid.get_pipe_parallel_group()) pipe_group = grid.pp_group assert torch.all(rank_tensor == sum(pipe_group)) # Test collectives along the data parallel process groups - rank_tensor = torch.LongTensor(data=[rank]).cuda() + rank_tensor = torch.LongTensor(data=[rank]).to(get_accelerator().device_name()) dist.all_reduce(rank_tensor, group=grid.get_data_parallel_group()) data_group = grid.dp_group assert torch.all(rank_tensor == sum(data_group)) diff --git a/tests/unit/runtime/test_autocast.py b/tests/unit/runtime/test_autocast.py index 307feb106572..b0d8d8696cb8 100644 --- a/tests/unit/runtime/test_autocast.py +++ b/tests/unit/runtime/test_autocast.py @@ -3,6 +3,7 @@ import pytest import torch from deepspeed.runtime.zero.linear import LinearModuleForZeroStage3 +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -11,31 +12,40 @@ class TestAutoCastDisable(DistributedTest): def test_missing_amp_autocast(self, half_op): hidden_dim = 4 if half_op: - input = torch.randn(hidden_dim).cuda().half() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda().half() + input = torch.randn(hidden_dim).to(get_accelerator().device_name()).half() + 
ds_linear = LinearModuleForZeroStage3( + hidden_dim, + hidden_dim).to(get_accelerator().device_name()).half() else: - input = torch.randn(hidden_dim).cuda() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda() + input = torch.randn(hidden_dim).to(get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, + hidden_dim).to( + get_accelerator().device_name()) output = ds_linear(input) assert output.dtype == ds_linear.weight.dtype def test_disable_autocast_linear(self, half_op): - amp = pytest.importorskip("torch.cuda.amp") + amp = get_accelerator().amp() hidden_dim = 4 if half_op: - input = torch.randn(hidden_dim).cuda().half() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda().half() + input = torch.randn(hidden_dim).to(get_accelerator().device_name()).half() + ds_linear = LinearModuleForZeroStage3( + hidden_dim, + hidden_dim).to(get_accelerator().device_name()).half() else: - input = torch.randn(hidden_dim).cuda() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda() + input = torch.randn(hidden_dim).to(get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, + hidden_dim).to( + get_accelerator().device_name()) with amp.autocast(False): output = ds_linear(input) assert output.dtype == ds_linear.weight.dtype +@pytest.mark.skipif(get_accelerator().amp() is None, reason='amp is not installed') @pytest.mark.parametrize('half_input, half_weight', [(False, False), @@ -47,11 +57,13 @@ def test_disable_autocast_linear(self, half_op): True)]) class TestAutoCastEnable(DistributedTest): def test_autocast_linear(self, tmpdir, half_input, half_weight): - amp = pytest.importorskip("torch.cuda.amp") + amp = get_accelerator().amp() hidden_dim = 4 - input = torch.randn(hidden_dim).cuda() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda() + input = torch.randn(hidden_dim).to(get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, + hidden_dim).to( + get_accelerator().device_name()) if half_input: input = input.half() @@ -61,4 +73,4 @@ def test_autocast_linear(self, tmpdir, half_input, half_weight): with amp.autocast(): output = ds_linear(input) - assert output.dtype == torch.half + assert output.dtype == torch.half or output.dtype == torch.bfloat16 diff --git a/tests/unit/runtime/test_data.py b/tests/unit/runtime/test_data.py index 3bee3dc2d471..ed2fee950bc3 100644 --- a/tests/unit/runtime/test_data.py +++ b/tests/unit/runtime/test_data.py @@ -4,6 +4,7 @@ import torch import pytest import deepspeed +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest from unit.simple_model import SimpleModel, random_dataset @@ -51,8 +52,8 @@ def test(self, train_batch_size, drop_last): training_data=train_dataset, optimizer=optimizer) for n, batch in enumerate(training_dataloader): - x = batch[0].to(torch.cuda.current_device()) - y = batch[1].to(torch.cuda.current_device()) + x = batch[0].to(get_accelerator().current_device_name()) + y = batch[1].to(get_accelerator().current_device_name()) loss = model(x, y) model.backward(loss) model.step() diff --git a/tests/unit/runtime/test_data_efficiency.py b/tests/unit/runtime/test_data_efficiency.py index 74e1222997d2..993e4aa66e20 100644 --- a/tests/unit/runtime/test_data_efficiency.py +++ b/tests/unit/runtime/test_data_efficiency.py @@ -3,6 +3,7 @@ import torch import os import deepspeed +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest from 
unit.simple_model import Curriculum_SimpleModel, SimpleModel, random_dataloader, random_dataset @@ -110,8 +111,8 @@ def data_post_process(data, data_sampler_state_dict): os.makedirs('/tmp') model.set_data_post_process_func(data_post_process) for n, batch in enumerate(data_loader): - x = batch[0].to(torch.cuda.current_device()) - y = batch[1].to(torch.cuda.current_device()) + x = batch[0].to(get_accelerator().current_device_name()) + y = batch[1].to(get_accelerator().current_device_name()) loss = model(x, y) model.backward(loss) model.step() diff --git a/tests/unit/runtime/test_ds_config_dict.py b/tests/unit/runtime/test_ds_config_dict.py index 311517b3e052..54c91a6fc3e6 100644 --- a/tests/unit/runtime/test_ds_config_dict.py +++ b/tests/unit/runtime/test_ds_config_dict.py @@ -2,13 +2,13 @@ # A test on its own import os -import torch import pytest import json import hjson import argparse from deepspeed.runtime.zero.config import DeepSpeedZeroConfig +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest, get_test_path from unit.simple_model import SimpleModel, create_config_from_dict, random_dataloader @@ -22,8 +22,8 @@ class TestBasicConfig(DistributedTest): world_size = 1 - def test_cuda(self): - assert (torch.cuda.is_available()) + def test_accelerator(self): + assert (get_accelerator().is_available()) def test_check_version(self): assert hasattr(deepspeed, "__git_hash__") diff --git a/tests/unit/runtime/test_runtime_utils.py b/tests/unit/runtime/test_runtime_utils.py index 751fae11971b..18a8bb77a5b6 100644 --- a/tests/unit/runtime/test_runtime_utils.py +++ b/tests/unit/runtime/test_runtime_utils.py @@ -7,6 +7,7 @@ import deepspeed.runtime.utils as ds_utils import deepspeed.utils.groups as groups +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -38,10 +39,11 @@ def test(self): groups._create_expert_and_data_parallel(2) norm = ds_utils.clip_grad_norm_(parameters, max_norm=0.1) - norm = torch.Tensor([norm]).to(dist.get_rank()) - + norm = torch.Tensor([norm]).to(get_accelerator().device_name(dist.get_rank())) world_size = dist.get_world_size() - gathered_norm = [torch.zeros(1).cuda() for i in range(world_size)] + gathered_norm = [ + torch.zeros(1).to(get_accelerator().device_name()) for i in range(world_size) + ] dist.all_gather(gathered_norm, norm) diff --git a/tests/unit/runtime/utils/test_partition.py b/tests/unit/runtime/utils/test_partition.py index 04fa5c94374d..58b62825de3f 100644 --- a/tests/unit/runtime/utils/test_partition.py +++ b/tests/unit/runtime/utils/test_partition.py @@ -9,6 +9,7 @@ from deepspeed.runtime.utils import partition_balanced from deepspeed.runtime.utils import prefix_sum_inc from deepspeed.runtime.utils import PartitionedTensor +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest @@ -25,7 +26,7 @@ def test(self): rows = world * 4 cols = 3 - full = torch.rand(rows, cols).cuda() + full = torch.rand(rows, cols).to(get_accelerator().device_name()) dist.broadcast(full, src=0, group=group) part = PartitionedTensor(full, group=group) @@ -48,7 +49,7 @@ def test(self): rows = world * 7 cols = 3 - full = torch.rand(rows, cols).cuda() + full = torch.rand(rows, cols).to(get_accelerator().device_name()) dist.broadcast(full, src=0, group=group) part = PartitionedTensor(full, group=group) diff --git a/tests/unit/runtime/zero/test_zero.py b/tests/unit/runtime/zero/test_zero.py index 958998441a9e..5de3ffca27df 100644 --- a/tests/unit/runtime/zero/test_zero.py +++ 
b/tests/unit/runtime/zero/test_zero.py @@ -18,6 +18,7 @@ from deepspeed.runtime.engine import DeepSpeedEngine from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint +from deepspeed.accelerator import get_accelerator def run_unbalanced_gradients(model, data_loader): @@ -698,30 +699,30 @@ def create_tensor(vals, dtype: torch.dtype = None) -> Tensor: grad_multiplier = 1 if zero_grad else (train_iter + 1) if dist.get_rank() == 0: assert torch.allclose( - dloss_wrt_layer3.cuda(), + dloss_wrt_layer3.to(get_accelerator().device_name()), grad_multiplier * create_tensor([2] * 8, torch.float)) assert torch.allclose( - dloss_wrt_layer2.cuda(), + dloss_wrt_layer2.to(get_accelerator().device_name()), grad_multiplier * create_tensor([3 * 1] * 8, torch.float)) assert torch.allclose( - dloss_wrt_layer1.cuda(), + dloss_wrt_layer1.to(get_accelerator().device_name()), grad_multiplier * create_tensor([3 * 2 * 1] * 8, torch.float)) elif dist.get_rank() == 1: # parameters dont split evenly across ranks so rank 1 has a zero-padded # partition assert torch.allclose( - dloss_wrt_layer3.cuda(), + dloss_wrt_layer3.to(get_accelerator().device_name()), grad_multiplier * create_tensor(([8] * 7) + [0], torch.float)) assert torch.allclose( - dloss_wrt_layer2.cuda(), + dloss_wrt_layer2.to(get_accelerator().device_name()), grad_multiplier * create_tensor(([6 * 2] * 7) + [0], torch.float)) assert torch.allclose( - dloss_wrt_layer1.cuda(), + dloss_wrt_layer1.to(get_accelerator().device_name()), grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0], torch.float)) else: @@ -1128,28 +1129,28 @@ def create_tensor(vals): grad_multiplier = 1 if zero_grad else (train_iter + 1) if dist.get_rank() == 0: assert torch.allclose( - dloss_wrt_layer3.cuda(), + dloss_wrt_layer3.to(get_accelerator().device_name()), grad_multiplier * create_tensor([2] * 8).to(expected_grad_dtype)) assert torch.allclose( - dloss_wrt_layer2.cuda(), + dloss_wrt_layer2.to(get_accelerator().device_name()), grad_multiplier * create_tensor([3 * 1] * 8).to(expected_grad_dtype)) assert torch.allclose( - dloss_wrt_layer1.cuda(), + dloss_wrt_layer1.to(get_accelerator().device_name()), grad_multiplier * create_tensor([3 * 2 * 1] * 8).to(expected_grad_dtype)) elif dist.get_rank() == 1: # parameters dont split evenly across ranks so rank 1 has a zero-padded # partition assert torch.allclose( - dloss_wrt_layer3.cuda(), + dloss_wrt_layer3.to(get_accelerator().device_name()), grad_multiplier * create_tensor(([8] * 7) + [0]).to(expected_grad_dtype)) assert torch.allclose( - dloss_wrt_layer2.cuda(), + dloss_wrt_layer2.to(get_accelerator().device_name()), grad_multiplier * create_tensor(([6 * 2] * 7) + [0]).to(expected_grad_dtype)) assert torch.allclose( - dloss_wrt_layer1.cuda(), + dloss_wrt_layer1.to(get_accelerator().device_name()), grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0]).to(expected_grad_dtype)) else: diff --git a/tests/unit/runtime/zero/test_zero_context_ancestry.py b/tests/unit/runtime/zero/test_zero_context_ancestry.py index 6035efcff111..38ae524906d5 100644 --- a/tests/unit/runtime/zero/test_zero_context_ancestry.py +++ b/tests/unit/runtime/zero/test_zero_context_ancestry.py @@ -3,6 +3,7 @@ import torch import deepspeed from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +from deepspeed.accelerator import get_accelerator from utils import setup_serial_env from unit.common import DistributedTest @@ -74,7 +75,7 @@ def 
test_subclass_param_init(self): assert model.param.ds_status == ZeroParamStatus.NOT_AVAILABLE # test that the weights manipulation during each __init__ worked in all w/o needing gathering - ones = torch.ones(5).half().cuda() + ones = torch.ones(5).half().to(get_accelerator().device_name()) with deepspeed.zero.GatheredParameters(list(model.parameters(recurse=False))): assert torch.equal(model.param, ones + 1) assert torch.equal(model.param_pa, ones + 2) diff --git a/tests/unit/simple_model.py b/tests/unit/simple_model.py index dcc4958b26e9..481aae0bfdcd 100644 --- a/tests/unit/simple_model.py +++ b/tests/unit/simple_model.py @@ -7,6 +7,7 @@ from deepspeed.pipe import PipelineModule, LayerSpec from deepspeed.moe.layer import MoE +from deepspeed.accelerator import get_accelerator import deepspeed.comm as dist @@ -274,7 +275,7 @@ def create_deepspeed_args(): args.deepspeed = True if dist.is_initialized(): # We assume up to one full node executing unit tests - assert dist.get_world_size() <= torch.cuda.device_count() + assert dist.get_world_size() <= get_accelerator().device_count() args.local_rank = dist.get_rank() return args diff --git a/tests/unit/utils/test_init_on_device.py b/tests/unit/utils/test_init_on_device.py index 9b4f9970b365..25d102fd05a7 100644 --- a/tests/unit/utils/test_init_on_device.py +++ b/tests/unit/utils/test_init_on_device.py @@ -5,10 +5,11 @@ from unit.simple_model import SimpleModel from deepspeed import OnDevice from packaging import version as pkg_version +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest -@pytest.mark.parametrize('device', ['meta', 'cuda:0']) +@pytest.mark.parametrize('device', ['meta', get_accelerator().device_name(0)]) class TestOnDevice(DistributedTest): world_size = 1
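
Note (editor's addendum, not part of the patch): the hunks above apply one recurring substitution, replacing hard-coded torch.cuda / 'cuda' usage with the deepspeed.accelerator.get_accelerator() interface. The short sketch below collects the idioms used in this diff in one place; only the accelerator calls are taken from the patch itself, while the tensor shapes, variable names, and surrounding flow are illustrative assumptions.

# Minimal sketch of the device-agnostic idioms this PR introduces (illustrative only).
import torch
from deepspeed.accelerator import get_accelerator

accel = get_accelerator()

# Tensor placement: the device string ('cuda:0', 'xpu:0', ...) comes from the accelerator.
x = torch.randn(4, 4).to(accel.device_name())                     # replaces .cuda()
y = torch.zeros(4, device=accel.current_device_name())            # replaces torch.cuda.current_device()

# Host staging buffers are pinned through the same interface.
pinned = accel.pin_memory(torch.empty(1024, dtype=torch.uint8))   # replaces tensor.pin_memory()

# Capability and availability checks replace compute-capability probing.
if accel.is_fp16_supported():                                     # replaces get_device_capability()[0] >= 7
    x = x.half()
if not accel.is_available():                                      # replaces torch.cuda.is_available()
    pass  # CPU-only fallback, e.g. map_location='cpu' in torch.load

# Timing, profiling markers, synchronization, and cache management.
start, stop = accel.Event(), accel.Event()                        # replaces cuda.event()
accel.range_push("profiled region")                               # replaces torch.cuda.nvtx.range_push
accel.range_pop()                                                 # replaces torch.cuda.nvtx.range_pop
accel.synchronize()                                               # replaces torch.cuda.synchronize()
accel.empty_cache()                                               # replaces torch.cuda.empty_cache()

# Collective backend selection replaces the literal "nccl".
backend_name = accel.communication_backend_name()

Because every test obtains its device, events, and communication backend through this single interface, the same files can exercise CUDA and non-CUDA builds, which is why tests/unit/ops/cuda/* is renamed to tests/unit/ops/accelerators/* in this diff.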