From e45b5e4cd0550ef84b54e651e746e182d7b131b2 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Wed, 2 Sep 2020 10:17:47 -0700 Subject: [PATCH 01/44] ZeRO-Offload v1 (squash) (#345) * update DSE to point to ZeRO-Offload staging * ZeRO-2 enable CPU offload (#313) * cpu-offload * update * deleted: deepspeed/pt/deepspeed_zero_optimizer_cpuoffload.py modified: deepspeed/pt/fp16_unfused_optimizer.py new file: install_output.txt modified: tests/unit/test_dynamic_loss_scale.py * modified: deepspeed/pt/deepspeed_zero_optimizer.py * update * modified: deepspeed/pt/deepspeed_cpu_adam.py modified: deepspeed/pt/deepspeed_zero_optimizer.py modified: tests/unit/test_checkpointing.py modified: tests/unit/test_fp16.py * deleted: install_output.txt * modified: deepspeed/pt/fp16_unfused_optimizer.py modified: tests/unit/test_dynamic_loss_scale.py * modified: deepspeed/pt/deepspeed_cpu_adam.py * modified: deepspeed/pt/deepspeed_zero_optimizer.py * modified: deepspeed/pt/deepspeed_cpu_adam.py modified: deepspeed/pt/deepspeed_zero_optimizer.py * deleted: deepspeed_cpu_adam.py modified: deepspeed_light.py modified: deepspeed_zero_optimizer.py ../../deepspeed_zero_optimizer_cpu_offload.py * modified: deepspeed/pt/deepspeed_light.py * modified: deepspeed/pt/deepspeed_light.py modified: deepspeed/pt/deepspeed_zero_optimizer.py modified: deepspeed/pt/deepspeed_zero_utils.py modified: tests/unit/test_fp16.py * modified: deepspeed/pt/deepspeed_config.py modified: deepspeed/pt/deepspeed_light.py modified: deepspeed/pt/deepspeed_zero_optimizer.py modified: tests/unit/test_checkpointing.py modified: tests/unit/test_fp16.py * modified: deepspeed/pt/deepspeed_checkpointing.py * update DSE to ZeRO-Offload commit Co-authored-by: Jeff Rasley * Enable ZeRO checkpointing for ZeRO-Offload (#337) * Enable ZeRO checkpointing for ZeRO-Offload Fix unit tests Bump DSE to 33b9fb77c8cecdb49118188890f662526d8e9397 * Fix accidental revert * Add ZeRO-Offload checkpointing model tests (#344) * Enable ZeRO checkpointing for ZeRO-Offload Fix unit tests Bump DSE to 33b9fb77c8cecdb49118188890f662526d8e9397 * Fix accidental revert * Fix ZeRO-Offload checkpointing bug when change gpu count Add checkpointing model tests for ZeRO-Offload Remove optimizer key from Megatron model tests Use different deepspeed master port for Megatron model tests Co-authored-by: Jie <37380896+jren73@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- DeepSpeedExamples | 2 +- deepspeed/pt/deepspeed_zero_utils.py | 33 +++ deepspeed/runtime/config.py | 4 +- deepspeed/runtime/engine.py | 24 +- deepspeed/runtime/zero/config.py | 22 +- deepspeed/runtime/zero/stage2.py | 213 ++++++++++++++---- .../ds_config_func_bs4_zero1.json | 9 +- .../ds_config_func_bs4_zero2.json | 9 +- .../ds_config_func_bs4_zero2_offload.json | 21 ++ .../ds_config_func_bs8_no_zero.json | 8 +- .../ds_config_func_bs8_zero1.json | 11 +- .../ds_config_func_bs8_zero2.json | 10 +- .../ds_config_func_bs8_zero2_offload.json | 25 ++ .../ds_config_func_scheduler.json | 10 +- .../Megatron_GPT2/ds_config_perf_bs16.json | 5 +- .../Megatron_GPT2/ds_config_perf_bs32.json | 3 +- .../Megatron_GPT2/ds_config_perf_bs8.json | 5 +- tests/model/Megatron_GPT2/ds_gpt2_test.sh | 4 +- .../Megatron_GPT2/run_checkpoint_test.py | 154 ++++++++++++- tests/unit/test_checkpointing.py | 92 +++++--- tests/unit/test_dynamic_loss_scale.py | 1 - tests/unit/test_fp16.py | 60 ++++- 22 files changed, 564 insertions(+), 161 deletions(-) create mode 100755 deepspeed/pt/deepspeed_zero_utils.py create mode 100755 
tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json create mode 100755 tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json diff --git a/DeepSpeedExamples b/DeepSpeedExamples index 896831c96266..33b9fb77c8ce 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit 896831c96266e12612c3e7a923d04e68d1f4dd84 +Subproject commit 33b9fb77c8cecdb49118188890f662526d8e9397 diff --git a/deepspeed/pt/deepspeed_zero_utils.py b/deepspeed/pt/deepspeed_zero_utils.py new file mode 100755 index 000000000000..034a9f4ccb22 --- /dev/null +++ b/deepspeed/pt/deepspeed_zero_utils.py @@ -0,0 +1,33 @@ +import torch +from torch.autograd import Variable +import collections + + +def async_migrate_to(obj, dev, main_stream=None): + if torch.is_tensor(obj): + obj = Variable(obj) + if isinstance(obj, Variable): + v = obj.cuda(dev, async=True) + if main_stream is not None: + v.data.record_stream(main_stream) + return v + elif isinstance(obj, collections.Mapping): + return {k: async_copy_to(o, dev, main_stream) for k, o in obj.items()} + elif isinstance(obj, collections.Sequence): + return [async_copy_to(o, dev, main_stream) for o in obj] + else: + return obj + + +def async_copy_to(obj, dev, main_stream=None): + if torch.is_tensor(obj): + obj = Variable(obj) + if isinstance(obj, Variable): + target = torch.empty_like(obj, device=dev).copy_(obj) + if main_stream is not None: + target.data.record_stream(main_stream) + return target + elif isinstance(obj, collections.Mapping): + return {k: async_copy_to(o, dev, main_stream) for k, o in obj.items()} + elif isinstance(obj, collections.Sequence): + return [async_copy_to(o, dev, main_stream) for o in obj] diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 78e9eaaf1b3f..90d1d6d42757 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -630,10 +630,12 @@ def _do_error_check(self): if self.zero_enabled: assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled" assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION) + if self.zero_config.cpu_offload is True: + assert self.zero_optimization_stage == ZERO_OPTIMIZATION_GRADIENTS, "DeepSpeedConfig: cpu-offload supported ZeRO stage is {}".format(ZERO_OPTIMIZATION_GRADIENTS) assert self.train_micro_batch_size_per_gpu, "DeepSpeedConfig: {} is not defined".format(TRAIN_MICRO_BATCH_SIZE_PER_GPU) - assert self.gradient_accumulation_steps, 'DeepSpeedConfig: {} is not defined'.format( + assert self.gradient_accumulation_steps, "DeepSpeedConfig: {} is not defined".format( GRADIENT_ACCUMULATION_STEPS) def _do_warning_check(self): diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 0a02bf6dd7d0..2382503952ea 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -106,7 +106,6 @@ def __init__(self, collate_fn=None, config_params=None): super(DeepSpeedEngine, self).__init__() - self.client_optimizer = optimizer self.client_model_parameters = model_parameters self.client_lr_scheduler = lr_scheduler @@ -292,6 +291,9 @@ def zero_reduce_scatter(self): def zero_overlap_comm(self): return self._config.zero_config.overlap_comm + def zero_cpu_offload(self): + return self._config.zero_config.cpu_offload + def zero_optimization_stage(self): return self._config.zero_optimization_stage @@ -491,6 +493,7 @@ def _configure_distributed_model(self, model): # Configure optimizer def 
_configure_optimizer(self, client_optimizer, model_parameters): + if client_optimizer is not None: basic_optimizer = client_optimizer logger.info('Using client Optimizer as basic optimizer') @@ -504,13 +507,14 @@ def _configure_optimizer(self, client_optimizer, model_parameters): if self.zero_optimization(): assert not self.amp_enabled(), "Amp and ZeRO are not currently compatible, please use (legacy) fp16 mode which performs similar to amp opt_mode=O2" - if self.optimizer_name() != ADAM_OPTIMIZER: + if self.optimizer_name() not in [ADAM_OPTIMIZER]: assert self.zero_allow_untested_optimizer(), \ 'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.' logger.warning( "**** You are using ZeRO with an untested optimizer, proceed with caution *****" ) + self.optimizer = self._configure_zero_optimizer(basic_optimizer) elif self.amp_enabled(): assert not self.fp16_enabled(), "Cannot enable both amp with (legacy) fp16 mode" @@ -522,8 +526,8 @@ def _configure_optimizer(self, client_optimizer, model_parameters): self.optimizer = self._configure_fp16_optimizer(basic_optimizer) else: self.optimizer = basic_optimizer - - # logger.info('DeepSpeed Final Optimizer = {}'.format(self.optimizer.state_dict())) + logger.info('DeepSpeed Final Optimizer = {}'.format(self.optimizer)) + logger.info('DeepSpeed Final Optimizer = {}'.format(self.optimizer.state_dict())) def _configure_basic_optimizer(self, model_parameters): optimizer_parameters = self.optimizer_params() @@ -532,8 +536,11 @@ def _configure_basic_optimizer(self, model_parameters): "'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details" ) if self.optimizer_name() == ADAM_OPTIMIZER: - from apex.optimizers.fused_adam import FusedAdam - optimizer = FusedAdam(model_parameters, **optimizer_parameters) + if self.zero_cpu_offload(): + optimizer = torch.optim.Adam(model_parameters, **optimizer_parameters) + else: + from apex.optimizers.fused_adam import FusedAdam + optimizer = FusedAdam(model_parameters, **optimizer_parameters) elif self.optimizer_name() == LAMB_OPTIMIZER: optimizer = FusedLamb(model_parameters, **optimizer_parameters) else: @@ -610,6 +617,7 @@ def _configure_zero_optimizer(self, optimizer): dp_process_group=self.data_parallel_group, reduce_scatter=self.zero_reduce_scatter(), overlap_comm=self.zero_overlap_comm(), + cpu_offload=self.zero_cpu_offload(), mpu=self.mpu, postscale_gradients=self.postscale_gradients(), gradient_predivide_factor=self.gradient_predivide_factor()) @@ -844,7 +852,6 @@ def step(self): master_params = amp.master_params(self.optimizer) torch.nn.utils.clip_grad_norm_(parameters=master_params, max_norm=self.gradient_clipping()) - self.optimizer.step() #zero grad in basic optimizer could be unreliable and may not exhibit @@ -947,6 +954,9 @@ def _get_optimizer_param(self, param_name): def get_lr(self): return self._get_optimizer_param('lr') + def get_type(self): + return self._get_optimizer_param('type') + def get_mom(self): return self._get_optimizer_param('betas') diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index 3ad10a811fe1..fd95d9e3b8ff 100755 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -23,6 +23,7 @@ "overlap_comm": [true|false], "reduce_bucket_size": 500000000 "load_from_fp32_weights": [true|false] + "cpu_offload": 
[true|false] } } ''' @@ -62,21 +63,22 @@ ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS = 'load_from_fp32_weights' ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT = True +ZERO_OPTIMIZATION_CPU_OFFLOAD = 'cpu_offload' +ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT = False + ZERO_OPTIMIZATION_DEFAULT = { - ZERO_OPTIMIZATION_STAGE: - ZERO_OPTIMIZATION_STAGE_DEFAULT, + ZERO_OPTIMIZATION_STAGE: ZERO_OPTIMIZATION_STAGE_DEFAULT, ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS: ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT, - ZERO_OPTIMIZATION_REDUCE_SCATTER: - ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT, - ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE: - ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT, + ZERO_OPTIMIZATION_REDUCE_SCATTER: ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT, + ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE: ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT, ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS: ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT, ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE: ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT, ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS: - ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT + ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT, + ZERO_OPTIMIZATION_CPU_OFFLOAD: ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT } @@ -92,6 +94,7 @@ def __init__(self, param_dict): self.allgather_bucket_size = None self.overlap_comm = None self.load_from_fp32_weights = None + self.cpu_offload = None if ZERO_OPTIMIZATION in param_dict.keys(): zero_config_dict = param_dict[ZERO_OPTIMIZATION] @@ -156,7 +159,12 @@ def _initialize(self, zero_config_dict): zero_config_dict, ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE, ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT) + self.load_from_fp32_weights = get_scalar_param( zero_config_dict, ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS, ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT) + + self.cpu_offload = get_scalar_param(zero_config_dict, + ZERO_OPTIMIZATION_CPU_OFFLOAD, + ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT) diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index e1521c3752d9..edfd15be3221 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -9,6 +9,7 @@ import math from torch._six import inf from torch.autograd import Variable +import collections from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter @@ -98,6 +99,39 @@ def print_rank_msg(msg): print(f"rank {dist.get_rank()} - {msg}") +#jie:asyn move to target device +def async_migrate_to(obj, dev, main_stream=None): + if torch.is_tensor(obj): + obj = Variable(obj) + if isinstance(obj, Variable): + v = obj.cuda(dev, async=True) + if main_stream is not None: + v.data.record_stream(main_stream) + return v + elif isinstance(obj, collections.Mapping): + return {k: async_copy_to(o, dev, main_stream) for k, o in obj.items()} + elif isinstance(obj, collections.Sequence): + return [async_copy_to(o, dev, main_stream) for o in obj] + else: + return obj + + +def async_copy_to(obj, dev, main_stream=None): + if torch.is_tensor(obj): + obj = Variable(obj) + if isinstance(obj, Variable): + target = torch.empty_like(obj, device=dev).copy_(obj) + if main_stream is not None: + target.data.record_stream(main_stream) + return target + elif isinstance(obj, collections.Mapping): + return {k: async_copy_to(o, dev, main_stream) for k, o in obj.items()} + elif isinstance(obj, collections.Sequence): + return [async_copy_to(o, dev, main_stream) for o in obj] + else: + return obj + + class 
FP16_DeepSpeedZeroOptimizer(object): """ DeepSpeedZeroOptimizer designed to reduce the memory footprint @@ -122,6 +156,7 @@ def __init__(self, dp_process_group=None, reduce_scatter=True, overlap_comm=False, + cpu_offload=False, mpu=None, clip_grad=0.0, allreduce_always_fp32=False, @@ -131,6 +166,7 @@ def __init__(self, if dist.get_rank() == 0: logger.info(f"Reduce bucket size {reduce_bucket_size}") logger.info(f"Allgather bucket size {allgather_bucket_size}") + logger.info(f"CPU Offload: {cpu_offload}") # The fused optimizer does all the work. We need this layer for two reason: # 1. maintain same user API from apex.fp16_utils # 2. keep common stuff here in case we need to add ne552w fused optimizer later @@ -150,6 +186,8 @@ def __init__(self, self.overlap_comm = overlap_comm + self.cpu_offload = cpu_offload + self.dp_process_group = dp_process_group self.partition_count = dist.get_world_size(group=self.dp_process_group) @@ -184,7 +222,8 @@ def __init__(self, #a single 32-bit partition of the parallel partitioned parameters #that this process will update self.single_partition_of_fp32_groups = [] - + if cpu_offload: + self.averaged_gradients_on_cpu = {} #param partition info #These are the parameters in each group that will not be updated by this process directly @@ -207,6 +246,7 @@ def __init__(self, # padding on each partition for alignment purposes self.groups_padding = [] + # loop to deal with groups for i, param_group in enumerate(self.optimizer.param_groups): # push this group to list before modify @@ -252,13 +292,24 @@ def __init__(self, self.parallel_partitioned_fp16_groups.append(data_parallel_partitions) # a partition of the fp32 master weights that will be updated by this process - self.single_partition_of_fp32_groups.append( - self.parallel_partitioned_fp16_groups[i] - [partition_id].clone().float().detach()) + if self.cpu_offload: + self.single_partition_of_fp32_groups.append( + async_copy_to(self.parallel_partitioned_fp16_groups[i][partition_id], + 'cpu').float()) + self.averaged_gradients_on_cpu[i] = [ + torch.empty_like( + self.parallel_partitioned_fp16_groups[i][partition_id], + device='cpu') + ] + else: + self.single_partition_of_fp32_groups.append( + self.parallel_partitioned_fp16_groups[i] + [partition_id].clone().float().detach()) # modify optimizer of have flat master weight self.single_partition_of_fp32_groups[ i].requires_grad = True # keep this in case internal optimizer uses it + param_group['params'] = [self.single_partition_of_fp32_groups[i]] partition_size = len(self.fp16_groups_flat[i]) / dist.get_world_size( @@ -275,6 +326,8 @@ def __init__(self, self.reduction_event = torch.cuda.Event(enable_timing=False, blocking=False) self.reduction_stream = torch.cuda.Stream() + self.cpu_computation_stream = torch.cuda.Stream() + self.migration_stream = torch.cuda.Stream() self.callback_queued = False self.param_dict = {} @@ -379,10 +432,16 @@ def _release_ipg_buffers(self): def initialize_optimizer_states(self): for i, group in enumerate(self.fp16_groups): - single_grad_partition = torch.zeros( - int(self.partition_size[i]), - dtype=self.single_partition_of_fp32_groups[i].dtype, - device=torch.cuda.current_device()) + if self.cpu_offload: + single_grad_partition = torch.zeros( + int(self.partition_size[i]), + dtype=self.single_partition_of_fp32_groups[i].dtype, + device='cpu') + else: + single_grad_partition = torch.zeros( + int(self.partition_size[i]), + dtype=self.single_partition_of_fp32_groups[i].dtype, + device=torch.cuda.current_device()) 
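
Note on the copy helpers introduced above (in deepspeed/pt/deepspeed_zero_utils.py and again in deepspeed/runtime/zero/stage2.py): async_migrate_to calls obj.cuda(dev, async=True), but "async" has been a reserved keyword since Python 3.7, so that branch is a SyntaxError on current interpreters; the offload path in this patch only exercises async_copy_to, which sidesteps the problem, and the Variable wrapper is a no-op on PyTorch >= 0.4. A minimal modern equivalent might look like the sketch below; the use of non_blocking=True and the CUDA-only record_stream guard are assumptions of this sketch, not part of the patch.

    import collections.abc
    import torch

    def async_copy_to(obj, dev, main_stream=None):
        """Recursively copy tensors (or containers of tensors) onto dev."""
        if torch.is_tensor(obj):
            # non_blocking replaces the pre-Python-3.7 `async=True` keyword; the copy only
            # truly overlaps with compute when the source lives in pinned (page-locked) memory.
            target = torch.empty_like(obj, device=dev)
            target.copy_(obj, non_blocking=True)
            if main_stream is not None and target.is_cuda:
                # record_stream is only defined for CUDA tensors; it keeps `target` alive
                # until work already queued on `main_stream` has consumed it.
                target.record_stream(main_stream)
            return target
        if isinstance(obj, collections.abc.Mapping):
            return {k: async_copy_to(v, dev, main_stream) for k, v in obj.items()}
        if isinstance(obj, collections.abc.Sequence) and not isinstance(obj, (str, bytes)):
            return [async_copy_to(v, dev, main_stream) for v in obj]
        return obj
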
self.single_partition_of_fp32_groups[i].grad = single_grad_partition self.optimizer.step() @@ -444,28 +503,29 @@ def independent_gradient_partition_epilogue(self): if self.overlap_comm: torch.cuda.synchronize() - for i, _ in enumerate(self.fp16_groups): - if not i in self.averaged_gradients or self.averaged_gradients[i] is None: - self.averaged_gradients[i] = self.get_flat_partition( - self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i], - dtype=torch.half, - device=torch.cuda.current_device(), - return_tensor_list=True) - else: - #When gradient accumulation is greater that 1 - #This code path will be triggered and will add - #to the accumulated averaged gradients - avg_new = self.get_flat_partition(self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i], - dtype=torch.half, - device=torch.cuda.current_device(), - return_tensor_list=True) - - for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i],avg_new): - accumulated_grad.add_(new_avg_grad) + if self.cpu_offload is False: + for i, _ in enumerate(self.fp16_groups): + if not i in self.averaged_gradients or self.averaged_gradients[i] is None: + self.averaged_gradients[i] = self.get_flat_partition( + self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i], + dtype=torch.half, + device=torch.cuda.current_device(), + return_tensor_list=True) + else: + #When gradient accumulation is greater that 1 + #This code path will be triggered and will add + #to the accumulated averaged gradients + avg_new = self.get_flat_partition(self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i], + dtype=torch.half, + device=torch.cuda.current_device(), + return_tensor_list=True) + + for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i],avg_new): + accumulated_grad.add_(new_avg_grad) self._release_ipg_buffers() @@ -578,6 +638,7 @@ def report_ipg_memory_usage(self, tag, param_elems): ###############Idependent Partition Gradient ######################## def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): + if self.elements_in_ipg_bucket + param.numel() > self.reduce_bucket_size: self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", param.numel()) @@ -602,7 +663,15 @@ def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): self.elements_in_ipg_bucket, param.numel()) new_grad_tensor.copy_(param.grad.view(-1)) - param.grad.data = new_grad_tensor.data.view_as(param.grad) + #jie: + if self.cpu_offload: + with stream(self.migration_stream): + averaged_gradients_on_cpu[i][param_id] = async_copy_to( + new_grad_tensor.data.view_as(param.grad), + 'cpu', + self.cpu_computation_stream) + else: + param.grad.data = new_grad_tensor.data.view_as(param.grad) self.elements_in_ipg_bucket += param.numel() self.grads_in_ipg_bucket.append(param.grad) @@ -714,6 +783,15 @@ def copy_grads_in_partition(self, param): total_size += param_in_partition.numel() see_memory_usage(f"before copying {total_size} gradients into partition") + #jie: + ''' + if self.cpu_offload: + self.grads_in_partition = torch.empty(int(total_size), + dtype=torch.half, + device=torch.cuda.current_device()) + '' + else: + ''' self.grads_in_partition = torch.empty(int(total_size), dtype=torch.half, device=torch.cuda.current_device()) @@ -724,6 +802,11 @@ def copy_grads_in_partition(self, param): self.grads_in_partition_offset, param.numel()) new_grad_tensor.copy_(param.grad.view(-1)) + ''' + if self.cpu_offload: + 
averaged_gradients_on_cpu[self.get_param_id()] = new_grad_tensor.data.view_as(param.grad) + else: + ''' param.grad.data = new_grad_tensor.data.view_as(param.grad) self.grads_in_partition_offset += param.numel() @@ -1142,6 +1225,11 @@ def step(self, closure=None): partition_id = dist.get_rank(group=self.dp_process_group) for i, group in enumerate(self.fp16_groups): + if self.cpu_offload: + self.averaged_gradients[i] = self.averaged_gradients_on_cpu[i] + #norm_groups.append( + # self.get_grad_norm_direct(self.averaged_gradients[i], + # self.params_in_partition[i])) norm_groups.append( self.get_grad_norm_direct(self.averaged_gradients[i], self.params_in_partition[i])) @@ -1167,20 +1255,31 @@ def step(self, closure=None): #release all the gradient since we have already created a necessary copy in dp_grad_partition self.free_grad_in_param_list(self.params_in_partition[i]) - self.averaged_gradients[i] = None + if self.cpu_offload is False: + self.averaged_gradients[i] = None single_partition_grad_groups.append(single_grad_partition) self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) + self.cpu_computation_stream.wait_stream(self.migration_stream) timers('optimizer_step').start() - self.optimizer.step() - #get rid of the fp32 gradients. Not needed anymore - for group in self.single_partition_of_fp32_groups: - group.grad = None + with torch.cuda.stream(self.cpu_computation_stream): + self.optimizer.step() - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - fp16_partitions[partition_id].data.copy_(fp32_partition.data) + if self.cpu_offload: + stream = torch.cuda.current_stream() + with torch.cuda.stream(self.migration_stream): + for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): + fp16_partitions[partition_id] = async_copy_to( + fp32_partition, + torch.cuda.current_device(), + stream) + #for averaged_gradients_cpu, fp32_partition in zip(self.averaged_gradients_on_cpu, self.single_partition_of_fp32_groups): + # averaged_gradients_cpu = [fp32_partition] + else: + for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): + fp16_partitions[partition_id].data.copy_(fp32_partition.data) timers('optimizer_step').stop() timers('optimizer_allgather').start() @@ -1263,6 +1362,8 @@ def has_overflow_serial(self, params, is_grad_list=False): def has_overflow_partitioned_grads_serial(self): for i in range(len(self.fp16_groups)): + if self.cpu_offload: + self.averaged_gradients[i] = self.averaged_gradients_on_cpu[i] for j, grad in enumerate(self.averaged_gradients[i]): if grad is not None and self._has_inf_or_nan(grad.data, j): return True @@ -1337,7 +1438,7 @@ def backward(self, loss, retain_graph=False): device=torch.cuda.current_device()) self.ipg_buffer.append(buf_1) self.ipg_index = 0 - + torch.cuda.empty_cache() self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) def check_overflow(self, partition_gradients=True): @@ -1389,8 +1490,11 @@ def _get_groups_without_padding(self, groups_with_padding): def _get_state_without_padding(self, state_with_padding, padding): lean_state = {} for key, value in state_with_padding.items(): - lean_length = value.numel() - padding - lean_state[key] = value[:lean_length] + if torch.is_tensor(value): + lean_length = value.numel() - padding + lean_state[key] = value[:lean_length] + else: + lean_state[key] = value return lean_state @@ -1432,6 
+1536,12 @@ def state_dict(self): self.single_partition_of_fp32_groups) state_dict['single_partition_of_fp32_groups'] = fp32_groups_without_padding + if self.cpu_offload: + state_dict_tmp = async_copy_to(state_dict, + 'cpu', + torch.cuda.current_stream()) + state_dict = state_dict_tmp + return state_dict # Restore base optimizer fp32 weights from checkpoint by: @@ -1468,11 +1578,16 @@ def refresh_fp32_params(self): def _partition_base_optimizer_state(self, state_key, all_partition_states): partition_id = dist.get_rank(group=self.dp_process_group) alignment = dist.get_world_size(group=self.dp_process_group) - flat_merged_partitions = flatten_dense_tensors_aligned( - all_partition_states, - alignment) - dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) - return dp_partitions[partition_id] + + if torch.is_tensor(all_partition_states[0]): + flat_merged_partitions = flatten_dense_tensors_aligned( + all_partition_states, + alignment) + dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) + return dp_partitions[partition_id] + else: + # Assume non-tensor states are not partitioned and equal across ranks, so return first one + return all_partition_states[0] # Restore base optimizer state from checkpoint by # 1) Merging optimizer state from checkpoints of all partitions @@ -1497,8 +1612,10 @@ def _restore_base_optimizer_state(self, all_state_dict): for i, group in enumerate(self.optimizer.param_groups): p = group['params'][0] for key, saved in base_optimizer_group_states[i].items(): - current = self.optimizer.state[p][key] - current.data.copy_(saved.data) + if torch.is_tensor(self.optimizer.state[p][key]): + self.optimizer.state[p][key].data.copy_(saved.data) + else: + self.optimizer.state[p][key] = saved def load_state_dict(self, state_dict_list, diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json index 2a3b9ca5a0be..c3322eca8138 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json @@ -3,14 +3,9 @@ "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { - "stage":1 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } + "stage": 1 }, + "zero_allow_untested_optimizer": true, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json index fde222a3cca2..f6a6db57daf2 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json @@ -3,17 +3,12 @@ "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { - "stage":2, + "stage": 2, "reduce_bucket_size": 7000000, "allgather_bucket_size": 7000000, "reduce_scatter": true }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, + "zero_allow_untested_optimizer": true, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json new file mode 100755 index 000000000000..ad054d31bb66 --- /dev/null +++ b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json @@ -0,0 +1,21 @@ +{ + "train_batch_size": 4, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "zero_optimization": { + "stage": 2, + "reduce_bucket_size": 7000000, + 
"allgather_bucket_size": 7000000, + "reduce_scatter": true, + "cpu_offload": true + }, + "zero_allow_untested_optimizer": true, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + } +} diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json index 99637973cd60..63b30c225753 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json @@ -3,13 +3,7 @@ "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { - "stage":0 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } + "stage": 0 }, "gradient_clipping": 1.0, "fp16": { diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json index 8d44659a9ee3..342fd665ccae 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json @@ -2,15 +2,10 @@ "train_batch_size": 8, "gradient_accumulation_steps": 1, "steps_per_print": 1, - "zero_optimization":{ - "stage":1 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } + "zero_optimization": { + "stage": 1 }, + "zero_allow_untested_optimizer": true, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json index fde90e8274b8..0e2582fa102f 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json @@ -3,17 +3,12 @@ "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { - "stage":2, + "stage": 2, "reduce_bucket_size": 7000000, "allgather_bucket_size": 7000000, "reduce_scatter": true }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, + "zero_allow_untested_optimizer": true, "gradient_clipping": 1.0, "fp16": { "enabled": true, @@ -26,5 +21,4 @@ "partition_activations": true, "contiguous_memory_optimization": true } - } diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json new file mode 100755 index 000000000000..5c66ed7cc585 --- /dev/null +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json @@ -0,0 +1,25 @@ +{ + "train_batch_size": 8, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "zero_optimization": { + "stage": 2, + "reduce_bucket_size": 7000000, + "allgather_bucket_size": 7000000, + "reduce_scatter": true, + "cpu_offload": true + }, + "zero_allow_untested_optimizer": true, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": true, + "contiguous_memory_optimization": true + } +} diff --git a/tests/model/Megatron_GPT2/ds_config_func_scheduler.json b/tests/model/Megatron_GPT2/ds_config_func_scheduler.json index 60c810786bf0..2d2ab356e57c 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_scheduler.json +++ b/tests/model/Megatron_GPT2/ds_config_func_scheduler.json @@ -3,14 +3,9 @@ "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { - "stage":2 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } + "stage": 
2 }, + "zero_allow_untested_optimizer": true, "gradient_clipping": 1.0, "scheduler": { "type": "WarmupLR", @@ -20,7 +15,6 @@ "warmup_num_steps": 10 } }, - "fp16": { "enabled": true, "loss_scale": 0, diff --git a/tests/model/Megatron_GPT2/ds_config_perf_bs16.json b/tests/model/Megatron_GPT2/ds_config_perf_bs16.json index f160ccd8e610..a40f3e4c7d44 100644 --- a/tests/model/Megatron_GPT2/ds_config_perf_bs16.json +++ b/tests/model/Megatron_GPT2/ds_config_perf_bs16.json @@ -2,7 +2,10 @@ "train_batch_size": 16, "gradient_accumulation_steps": 1, "steps_per_print": 1, - "zero_optimization": 1, + "zero_optimization": { + "stage": 1 + }, + "zero_allow_untested_optimizer": true, "disable_allgather": true, "optimizer": { "type": "Adam", diff --git a/tests/model/Megatron_GPT2/ds_config_perf_bs32.json b/tests/model/Megatron_GPT2/ds_config_perf_bs32.json index 6e23fe687bc8..096a0d3645cd 100755 --- a/tests/model/Megatron_GPT2/ds_config_perf_bs32.json +++ b/tests/model/Megatron_GPT2/ds_config_perf_bs32.json @@ -3,8 +3,9 @@ "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { - "stage":1 + "stage": 1 }, + "zero_allow_untested_optimizer": true, "disable_allgather": true, "optimizer": { "type": "Adam", diff --git a/tests/model/Megatron_GPT2/ds_config_perf_bs8.json b/tests/model/Megatron_GPT2/ds_config_perf_bs8.json index 514496958e14..e793e221e1e7 100644 --- a/tests/model/Megatron_GPT2/ds_config_perf_bs8.json +++ b/tests/model/Megatron_GPT2/ds_config_perf_bs8.json @@ -2,7 +2,10 @@ "train_batch_size": 8, "gradient_accumulation_steps": 1, "steps_per_print": 1, - "zero_optimization": 1, + "zero_optimization": { + "stage": 1 + }, + "zero_allow_untested_optimizer": true, "disable_allgather": true, "optimizer": { "type": "Adam", diff --git a/tests/model/Megatron_GPT2/ds_gpt2_test.sh b/tests/model/Megatron_GPT2/ds_gpt2_test.sh index 5c901f855a33..a8af44df9c7e 100755 --- a/tests/model/Megatron_GPT2/ds_gpt2_test.sh +++ b/tests/model/Megatron_GPT2/ds_gpt2_test.sh @@ -91,9 +91,9 @@ gpt_options=" \ ${ds_opt} \ ${zero_opt} \ " - +DEEPSPEED_PORT=29600 work_dir="../../../DeepSpeedExamples/Megatron-LM/" -run_cmd="(cd ${work_dir} && deepspeed --num_nodes $nodes --num_gpus $gpus pretrain_gpt2.py ${gpt_options})" +run_cmd="(cd ${work_dir} && deepspeed --master_port ${DEEPSPEED_PORT} --num_nodes $nodes --num_gpus $gpus pretrain_gpt2.py ${gpt_options})" echo ${run_cmd} eval ${run_cmd} diff --git a/tests/model/Megatron_GPT2/run_checkpoint_test.py b/tests/model/Megatron_GPT2/run_checkpoint_test.py index 116e58b98fa2..cf11af6c2ae4 100755 --- a/tests/model/Megatron_GPT2/run_checkpoint_test.py +++ b/tests/model/Megatron_GPT2/run_checkpoint_test.py @@ -97,6 +97,29 @@ def test_mp2_gpu4_node1_with_zero2(self): succ = self.run_test(test_config, 0.01) self.assertTrue(succ) + def test_mp2_gpu4_node1_with_zero2_offload(self): + test_config = { + "mp": 2, + "gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1100, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": 256, + "heads": ATTN_HEADS, + "deepspeed": True, + "tag": "ds_zero2_offload", + "zero": True, + "other_args": "", + "checkpoint_name": "ckpt_mp2_gpu8_w_zero2_offload", + "checkpoint_interval": 1000, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + def test_mp1_gpu2_load_gpu1_node1_with_zero1(self): test_config = { "mp": 1, @@ -110,7 +133,7 @@ def test_mp1_gpu2_load_gpu1_node1_with_zero1(self): "seq_length": 256, "heads": ATTN_HEADS, "deepspeed": 
True, - "tag": "ds_zero2", + "tag": "ds_zero1", "zero": True, "other_args": "", "checkpoint_name": "ckpt_mp1_gpu2_gpu1_w_zero1", @@ -133,7 +156,7 @@ def test_mp1_gpu2_load_gpu4_node1_with_zero1(self): "seq_length": 256, "heads": ATTN_HEADS, "deepspeed": True, - "tag": "ds_zero2", + "tag": "ds_zero1", "zero": True, "other_args": "", "checkpoint_name": "ckpt_mp1_gpu2_gpu4_w_zero1", @@ -166,6 +189,30 @@ def test_mp1_gpu2_load_gpu1_node1_with_zero2(self): succ = self.run_test(test_config, 0.01) self.assertTrue(succ) + def test_mp1_gpu2_load_gpu1_node1_with_zero2_offload(self): + test_config = { + "mp": 1, + "gpus": 2, + "load_gpus": 1, + "nodes": 1, + "bs": 8, + "steps": 1100, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": 256, + "heads": ATTN_HEADS, + "deepspeed": True, + "tag": "ds_zero2_offload", + "zero": True, + "other_args": "", + "checkpoint_name": "ckpt_mp1_gpu2_gpu1_w_zero2_offload", + "checkpoint_interval": 1000, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + def test_mp1_gpu2_load_gpu4_node1_with_zero2(self): test_config = { "mp": 1, @@ -189,6 +236,30 @@ def test_mp1_gpu2_load_gpu4_node1_with_zero2(self): succ = self.run_test(test_config, 0.01) self.assertTrue(succ) + def test_mp1_gpu2_load_gpu4_node1_with_zero2_offload(self): + test_config = { + "mp": 1, + "gpus": 2, + "load_gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1100, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": 256, + "heads": ATTN_HEADS, + "deepspeed": True, + "tag": "ds_zero2_offload", + "zero": True, + "other_args": "", + "checkpoint_name": "ckpt_mp1_gpu2_gpu4_w_zero2_offload", + "checkpoint_interval": 1000, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + def test_mp2_gpu4_load_gpu2_node1_with_zero1(self): test_config = { "mp": 2, @@ -258,6 +329,30 @@ def test_mp2_gpu4_load_gpu2_node1_with_zero2(self): succ = self.run_test(test_config, 0.01) self.assertTrue(succ) + def test_mp2_gpu4_load_gpu2_node1_with_zero2_offload(self): + test_config = { + "mp": 2, + "gpus": 4, + "load_gpus": 2, + "nodes": 1, + "bs": 8, + "steps": 1100, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": 256, + "heads": ATTN_HEADS, + "deepspeed": True, + "tag": "ds_zero2_offload", + "zero": True, + "other_args": "", + "checkpoint_name": "ckpt_mp2_gpu4_gpu2_w_zero2_offload", + "checkpoint_interval": 1000, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + def test_mp2_gpu2_load_gpu4_node1_with_zero2(self): test_config = { "mp": 2, @@ -281,6 +376,30 @@ def test_mp2_gpu2_load_gpu4_node1_with_zero2(self): succ = self.run_test(test_config, 0.01) self.assertTrue(succ) + def test_mp2_gpu2_load_gpu4_node1_with_zero2_offload(self): + test_config = { + "mp": 2, + "gpus": 2, + "load_gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1100, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": 256, + "heads": ATTN_HEADS, + "deepspeed": True, + "tag": "ds_zero2_offload", + "zero": True, + "other_args": "", + "checkpoint_name": "ckpt_mp2_gpu2_gpu4_w_zero2_offload", + "checkpoint_interval": 1000, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + def test_mp2_gpu4_node1_without_zero(self): test_config = { "mp": 2, @@ 
-306,7 +425,8 @@ def test_mp2_gpu4_node1_without_zero(self): def gen_name(self, test_config, prefix): save_dir = "checkpoint_test_logs" tag = test_config["tag"] - file_name = f"_{tag}.log" + checkpoint_name = test_config["checkpoint_name"] + file_name = f"_{tag}_{checkpoint_name}.log" return os.path.join(save_dir, prefix + file_name) def run_test(self, test_config, r_tol): @@ -334,10 +454,15 @@ def run_test(self, test_config, r_tol): except: print("No old checkpoint") + if "cpu_optimizer" in test_config and test_config["cpu_optimizer"]: + cpu_optimizer_flag = " --cpu-optimizer" + else: + cpu_optimizer_flag = "" + #-----------------Saving Checkpoint-----------------# - #building checkpoint arguments + # building checkpoint arguments test_config[ - "other_args"] = f"\"--save {checkpoint_folder} --save-interval {checkpoint_interval}\"" + "other_args"] = f"\"--save {checkpoint_folder} --save-interval {checkpoint_interval} {cpu_optimizer_flag}\"" prefix = "gpt2_saving_checkpoint" @@ -356,10 +481,11 @@ def run_test(self, test_config, r_tol): #-----------------Loading Checkpoint-----------------# - #building checkpoint arguments - test_config["other_args"] = f"\"--load {checkpoint_folder}\"" + # building checkpoint arguments + test_config[ + "other_args"] = f"\"--load {checkpoint_folder} {cpu_optimizer_flag} \"" - #set checkpoint load iteration + # set checkpoint load iteration try: cmd = f"echo {checkpoint_interval} > {checkpoint_name}/latest_checkpointed_iteration.txt" print(f"{self.id()} running cmd: {cmd}") @@ -411,20 +537,32 @@ def check_parity(self, base_file, test_file, r_tol): def checkpoint_suite(): suite = unittest.TestSuite() + suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero2')) + suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero2_offload')) # Shrink DP suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2')) + suite.addTest( + GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2_offload')) + suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2')) + suite.addTest( + GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2_offload')) # Expand DP suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2')) + suite.addTest( + GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2_offload')) + suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2')) + suite.addTest( + GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2_offload')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_without_zero')) diff --git a/tests/unit/test_checkpointing.py b/tests/unit/test_checkpointing.py index ac2aee4ead69..4ce0d36284e0 100755 --- a/tests/unit/test_checkpointing.py +++ b/tests/unit/test_checkpointing.py @@ -28,25 +28,25 @@ def compare_model_states(saved_model, loaded_model): compare_deepspeed_states(saved_model, loaded_model) for p0, p1 in zip(saved_model.module.parameters(), loaded_model.module.parameters()): - assert torch.allclose(p0,p1,atol=1e-07), f"FP16 model state {p0} is not equal to {p1}" + assert torch.allclose(p0, p1, 
atol=1e-07), f"FP16 model state {p0} is not equal to {p1}" if isinstance(saved_model.optimizer, FP16_DeepSpeedZeroOptimizer): for p0, p1 in zip(saved_model.optimizer.single_partition_of_fp32_groups, loaded_model.optimizer.single_partition_of_fp32_groups): - assert torch.allclose(p0,p1,atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" + assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" elif isinstance(saved_model.optimizer, FP16_DeepSpeedZeroOptimizer_Stage1): for partition0, partition1 in zip(saved_model.optimizer.local_sub_partitions_of_fp32_groups, loaded_model.optimizer.local_sub_partitions_of_fp32_groups): for p0, p1 in zip(partition0, partition1): - assert torch.allclose(p0,p1,atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" + assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" elif isinstance(saved_model.optimizer, FP16_Optimizer): for p0, p1 in zip(saved_model.optimizer.fp32_groups_flat, loaded_model.optimizer.fp32_groups_flat): - assert torch.allclose(p0,p1,atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" + assert torch.allclose(p0, p1, atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" elif isinstance(saved_model.optimizer, FP16_UnfusedOptimizer): for params0, params1 in zip(saved_model.optimizer.fp32_groups, loaded_model.optimizer.fp32_groups): for p0, p1 in zip(params0, params1): - assert torch.allclose(p0,p1,atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" + assert torch.allclose(p0, p1, atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" elif isinstance(saved_model.optimizer, torch.optim.Optimizer): pass else: @@ -97,9 +97,9 @@ def checkpoint_correctness_verification(args, load_lr_scheduler_states=False, fp16=True): dtype = torch.half if fp16 else torch.float32 - ds_model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + ds_model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) data_loader = random_dataloader(model=ds_model, total_samples=50, hidden_dim=hidden_dim, @@ -117,9 +117,9 @@ def checkpoint_correctness_verification(args, trained_model.save_checkpoint(save_folder, save_tag) - loaded_model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + loaded_model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) loaded_model.load_checkpoint(save_folder, save_tag, @@ -235,8 +235,16 @@ def _test_checkpoint_fused_optimizer(args, model, hidden_dim, load_optimizer_sta load_optimizer_states=False) -@pytest.mark.parametrize("zero_stage", [1, 2]) -def test_checkpoint_zero_optimizer(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (1, + False), + (2, + False), + (2, + True), + ]) +def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload): config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -254,8 +262,9 @@ def test_checkpoint_zero_optimizer(tmpdir, zero_stage): "enabled": True }, "zero_optimization": { - "stage": zero_stage - }, + "stage": zero_stage, + "cpu_offload": use_cpu_offload + } } args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 @@ -276,8 +285,16 @@ def _test_checkpoint_zero_optimizer(args, model, hidden_dim, load_optimizer_stat load_optimizer_states=True) -@pytest.mark.parametrize("zero_stage", [1, 2]) -def test_checkpoint_zero_no_optimizer(tmpdir, zero_stage): 
+@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (1, + False), + (2, + False), + (2, + True), + ]) +def test_checkpoint_zero_no_optimizer(tmpdir, zero_stage, use_cpu_offload): config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -295,8 +312,9 @@ def test_checkpoint_zero_no_optimizer(tmpdir, zero_stage): "enabled": True }, "zero_optimization": { - "stage": zero_stage - }, + "stage": zero_stage, + "cpu_offload": use_cpu_offload + } } args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 @@ -320,8 +338,18 @@ def _test_checkpoint_zero_no_optimizer(args, load_optimizer_states=False) -@pytest.mark.parametrize("zero_stage", [0, 1, 2]) -def test_checkpoint_lr_scheduler(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (0, + False), + (1, + False), + (2, + False), + (2, + True), + ]) +def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload): config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -339,7 +367,8 @@ def test_checkpoint_lr_scheduler(tmpdir, zero_stage): "enabled": True }, "zero_optimization": { - "stage": zero_stage + "stage": zero_stage, + "cpu_offload": use_cpu_offload }, "scheduler": { "type": "WarmupLR", @@ -376,8 +405,18 @@ def _test_checkpoint_lr_scheduler(args, load_lr_scheduler_states=True) -@pytest.mark.parametrize("zero_stage", [0, 1, 2]) -def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (0, + False), + (1, + False), + (2, + False), + (2, + True), + ]) +def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage, use_cpu_offload): config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -391,7 +430,8 @@ def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage): "enabled": True }, "zero_optimization": { - "stage": zero_stage + "stage": zero_stage, + "cpu_offload": use_cpu_offload }, "scheduler": { "type": "WarmupLR", @@ -400,7 +440,7 @@ def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage): "warmup_max_lr": 0.001, "warmup_num_steps": 1000 } - } + }, } args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 diff --git a/tests/unit/test_dynamic_loss_scale.py b/tests/unit/test_dynamic_loss_scale.py index e12386271450..7575d6b49454 100755 --- a/tests/unit/test_dynamic_loss_scale.py +++ b/tests/unit/test_dynamic_loss_scale.py @@ -191,7 +191,6 @@ def _test_unfused_no_overflow(args): model, optim, _, _ = deepspeed.initialize(args=args, model=model, model_parameters=model.parameters()) - expected_loss_scale = 2**8 expected_scale_window = 2 # Ensure the dynamic loss scaler is correctly configured. 
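
The test changes above (tests/unit/test_checkpointing.py) and the test_fp16.py changes that follow all apply the same pattern: tests that were parametrized on zero_stage alone are now parametrized on (zero_stage, use_cpu_offload) pairs, so every existing ZeRO unit test also runs with cpu_offload enabled under stage 2. A compact illustration of the pattern is sketched below; the test itself is hypothetical (it is not one of the tests added by this patch) and assumes only the DeepSpeedZeroConfig class touched earlier in the diff.

    import pytest
    from deepspeed.runtime.zero.config import DeepSpeedZeroConfig

    # (zero_stage, use_cpu_offload); cpu_offload is only exercised with ZeRO stage 2
    ZERO_STAGE_PARAMS = [(1, False), (2, False), (2, True)]

    @pytest.mark.parametrize('zero_stage, use_cpu_offload', ZERO_STAGE_PARAMS)
    def test_zero_offload_config_parsing(zero_stage, use_cpu_offload):
        # Hypothetical round-trip check: the cpu_offload flag should survive config parsing.
        param_dict = {
            "zero_optimization": {
                "stage": zero_stage,
                "cpu_offload": use_cpu_offload
            }
        }
        zero_config = DeepSpeedZeroConfig(param_dict)
        assert zero_config.stage == zero_stage
        assert zero_config.cpu_offload == use_cpu_offload
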
diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py index 320d026bdd83..69a76a85830d 100755 --- a/tests/unit/test_fp16.py +++ b/tests/unit/test_fp16.py @@ -217,8 +217,16 @@ def _test_adamw_fp16_empty_grad(args, model, hidden_dim): _test_adamw_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim) -@pytest.mark.parametrize("zero_stage", [0, 1, 2]) -def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (1, + False), + (2, + False), + (2, + True), + ]) +def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload): config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -246,7 +254,8 @@ def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage): "enabled": True }, "zero_optimization": { - "stage": zero_stage + "stage": zero_stage, + "cpu_offload": use_cpu_offload } } @@ -274,8 +283,16 @@ def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim): hidden_dim=hidden_dim) -@pytest.mark.parametrize("zero_stage", [1, 2]) -def test_zero_static_scale(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (1, + False), + (2, + False), + (2, + True), + ]) +def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload): config_dict = { "train_batch_size": 4, "steps_per_print": 1, @@ -290,7 +307,8 @@ def test_zero_static_scale(tmpdir, zero_stage): "loss_scale": 138. }, "zero_optimization": { - "stage": zero_stage + "stage": zero_stage, + "cpu_offload": use_cpu_offload } } args = args_from_dict(tmpdir, config_dict) @@ -363,8 +381,16 @@ def _test_zero_static_scale(args): _test_zero_static_scale(args) -@pytest.mark.parametrize("zero_stage", [1, 2]) -def test_zero_allow_untested_optimizer(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (1, + False), + (2, + False), + (2, + True), + ]) +def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload): config_dict = { "train_batch_size": 4, "steps_per_print": 1, @@ -372,7 +398,8 @@ def test_zero_allow_untested_optimizer(tmpdir, zero_stage): "enabled": True, }, "zero_optimization": { - "stage": zero_stage + "stage": zero_stage, + "cpu_offload": use_cpu_offload }, "zero_allow_untested_optimizer": False } @@ -392,8 +419,16 @@ def _test_zero_allow_untested_optimizer(args): _test_zero_allow_untested_optimizer(args) -@pytest.mark.parametrize("zero_stage", [1, 2]) -def test_zero_empty_partition(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (1, + False), + (2, + False), + (2, + True), + ]) +def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload): config_dict = { "train_micro_batch_size_per_gpu": 1, "gradient_accumulation_steps": 1, @@ -408,7 +443,8 @@ def test_zero_empty_partition(tmpdir, zero_stage): } }, "zero_optimization": { - "stage": zero_stage + "stage": zero_stage, + "cpu_offload": use_cpu_offload } } args = args_from_dict(tmpdir, config_dict) From 75d70a9fe2dbf483d373b924fd98a9ea521ffc35 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Wed, 2 Sep 2020 12:48:15 -0700 Subject: [PATCH 02/44] update DSE to staging for zero-dual --- DeepSpeedExamples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DeepSpeedExamples b/DeepSpeedExamples index 33b9fb77c8ce..8774a178169f 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit 33b9fb77c8cecdb49118188890f662526d8e9397 +Subproject commit 8774a178169f31f7139a3b1a1b937c2a91aac53f From 
1ebcd6c50af7bc5c41507c60961f9031c729ec8c Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Wed, 2 Sep 2020 22:05:04 -0700 Subject: [PATCH 03/44] Update test_sparse_attention.py --- tests/unit/test_sparse_attention.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_sparse_attention.py b/tests/unit/test_sparse_attention.py index 3f2078946297..f113e3b72ecd 100644 --- a/tests/unit/test_sparse_attention.py +++ b/tests/unit/test_sparse_attention.py @@ -232,6 +232,7 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo def _skip_on_cuda_compatability(): + pytest.skip("Skip these tests for now until we get our docker image fixed.") if torch.cuda.get_device_capability()[0] != 7: pytest.skip("needs compute capability 7; v100") cuda_major = int(torch.version.cuda.split('.')[0]) * 10 From 0159ebba0bd357b7666987534553848bf70a4a95 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 3 Sep 2020 11:08:24 -0700 Subject: [PATCH 04/44] Assert ZeRO-Offload+gradient accumulation (#347) --- deepspeed/runtime/config.py | 31 ++++------- deepspeed/runtime/constants.py | 29 ----------- deepspeed/runtime/engine.py | 6 +-- deepspeed/runtime/zero/config.py | 76 +-------------------------- deepspeed/runtime/zero/constants.py | 79 +++++++++++++++++++++++++++++ 5 files changed, 91 insertions(+), 130 deletions(-) create mode 100755 deepspeed/runtime/zero/constants.py diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 90d1d6d42757..206993dbe4c9 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -10,6 +10,7 @@ from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, DELAYED_SHIFT, MIN_LOSS_SCALE from deepspeed.runtime.config_utils import get_scalar_param, dict_raise_error_on_duplicate_keys from deepspeed.runtime.zero.config import DeepSpeedZeroConfig +from deepspeed.runtime.zero.constants import * from deepspeed.runtime.activation_checkpointing.config import DeepSpeedActivationCheckpointingConfig from deepspeed.utils import logger @@ -110,22 +111,9 @@ def get_zero_optimization(param_dict): def get_zero_reduce_scatter(param_dict): - return get_scalar_param(param_dict, ZERO_REDUCE_SCATTER, ZERO_REDUCE_SCATTER_DEFAULT) - - -def get_zero_max_elements_per_comm(param_dict): - return get_scalar_param(param_dict, - ZERO_MAX_ELEMENTS_PER_COMM, - ZERO_MAX_ELEMENTS_PER_COMM_DEFAULT) - - -def get_allgather_size(param_dict): return get_scalar_param(param_dict, - ALLGATHER_SIZE, - ALLGATHER_SIZE_DEFAULT) if get_scalar_param( - param_dict, - ALLGATHER_SIZE, - ALLGATHER_SIZE_DEFAULT) > 0 else ALLGATHER_SIZE_DEFAULT + ZERO_OPTIMIZATION_REDUCE_SCATTER, + ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT) def get_allreduce_always_fp32(param_dict): @@ -492,8 +480,6 @@ def _initialize_params(self, param_dict): self.gradient_predivide_factor = get_gradient_predivide_factor(param_dict) self.sparse_gradients_enabled = get_sparse_gradients_enabled(param_dict) - self.allgather_size = get_allgather_size(param_dict) - self.zero_config = DeepSpeedZeroConfig(param_dict) self.zero_optimization_stage = self.zero_config.stage self.zero_enabled = self.zero_optimization_stage > 0 @@ -627,16 +613,17 @@ def print(self, name): ':')))) def _do_error_check(self): + assert self.train_micro_batch_size_per_gpu, "DeepSpeedConfig: {} is not defined".format(TRAIN_MICRO_BATCH_SIZE_PER_GPU) + + assert self.gradient_accumulation_steps, "DeepSpeedConfig: {} is not defined".format( + GRADIENT_ACCUMULATION_STEPS) + if self.zero_enabled: assert 
self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled" assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION) if self.zero_config.cpu_offload is True: assert self.zero_optimization_stage == ZERO_OPTIMIZATION_GRADIENTS, "DeepSpeedConfig: cpu-offload supported ZeRO stage is {}".format(ZERO_OPTIMIZATION_GRADIENTS) - - assert self.train_micro_batch_size_per_gpu, "DeepSpeedConfig: {} is not defined".format(TRAIN_MICRO_BATCH_SIZE_PER_GPU) - - assert self.gradient_accumulation_steps, "DeepSpeedConfig: {} is not defined".format( - GRADIENT_ACCUMULATION_STEPS) + assert self.gradient_accumulation_steps == 1, "DeepSpeedConfig: {}is not supported for {}".format(GRADIENT_ACCUMULATION_STEPS, ZERO_OPTIMIZATION_CPU_OFFLOAD) def _do_warning_check(self): fp16_enabled = self.fp16_enabled or self.zero_enabled diff --git a/deepspeed/runtime/constants.py b/deepspeed/runtime/constants.py index 57bd5ecf81fc..83249ade0de6 100755 --- a/deepspeed/runtime/constants.py +++ b/deepspeed/runtime/constants.py @@ -183,35 +183,6 @@ GRADIENT_CLIPPING = 'gradient_clipping' GRADIENT_CLIPPING_DEFAULT = 0. -######################################### -# ZeRO optimization -######################################### -# ZeRO optimization. By default, this optimization is not enabled. -# Users have to configure the desired optimization (0 means disabled) in params.json as below example: -ZERO_FORMAT = ''' -ZeRO optimization should be enabled as: -"session_params": { - "zero_optimization": [0|1|2], - "zero_all_gather_size": 200 -} -''' - -ZERO_OPTIMIZATION = 'zero_optimization' -ZERO_OPTIMIZATION_DEFAULT = 0 -ZERO_OPTIMIZATION_OPTIMIZER_STATES = 1 -ZERO_OPTIMIZATION_GRADIENTS = 2 -ZERO_OPTIMIZATION_WEIGHTS = 3 -MAX_STAGE_ZERO_OPTIMIZATION = ZERO_OPTIMIZATION_GRADIENTS - -ZERO_REDUCE_SCATTER = "zero_reduce_scatter" -ZERO_REDUCE_SCATTER_DEFAULT = True - -ZERO_MAX_ELEMENTS_PER_COMM = "zero_max_elements_per_comm" -ZERO_MAX_ELEMENTS_PER_COMM_DEFAULT = 5e8 - -ALLGATHER_SIZE = 'allgather_size' -ALLGATHER_SIZE_DEFAULT = 500000000 - ######################################### # FP32 AllReduce ######################################### diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 2382503952ea..96c591bd3c2a 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -22,7 +22,8 @@ from deepspeed.runtime.dataloader import DeepSpeedDataLoader from deepspeed.runtime.constants import \ ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \ - TORCH_DISTRIBUTED_DEFAULT_PORT, \ + TORCH_DISTRIBUTED_DEFAULT_PORT +from deepspeed.runtime.zero.constants import \ ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS from deepspeed.runtime.csr_tensor import CSRTensor import deepspeed.runtime.lr_schedules as lr_schedules @@ -312,9 +313,6 @@ def zero_contiguous_gradients(self): def zero_load_from_fp32_weights(self): return self._config.zero_config.load_from_fp32_weights - def allgather_size(self): - return self._config.allgather_size - def fp16_enabled(self): return self._config.fp16_enabled diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index fd95d9e3b8ff..c0954690250d 100755 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -5,81 +5,7 @@ from deepspeed.runtime.config_utils import get_scalar_param from deepspeed.utils import logger - -######################################### -# ZeRO optimization 
-######################################### -# ZeRO optimization. By default, this optimization is not enabled. -# Users have to configure the desired optimization (0 means disabled) in params.json as below example: -ZERO_FORMAT = ''' -ZeRO optimization should be enabled as: -"session_params": { - "zero_optimization": { - "stage": [0|1|2], - "allgather_partitions": [true|false], - "allgather_bucket_size": 500000000, - "reduce_scatter": [true|false], - "contiguous_gradients" : [true|false] - "overlap_comm": [true|false], - "reduce_bucket_size": 500000000 - "load_from_fp32_weights": [true|false] - "cpu_offload": [true|false] - } -} -''' - -ZERO_OPTIMIZATION = 'zero_optimization' -ZERO_OPTIMIZATION_DISABLED = 0 -ZERO_OPTIMIZATION_OPTIMIZER_STATES = 1 -ZERO_OPTIMIZATION_GRADIENTS = 2 -ZERO_OPTIMIZATION_WEIGHTS = 3 -MAX_STAGE_ZERO_OPTIMIZATION = ZERO_OPTIMIZATION_GRADIENTS - -ZERO_OPTIMIZATION_STAGE = 'stage' -ZERO_OPTIMIZATION_STAGE_1 = 'stage_1' -ZERO_OPTIMIZATION_STAGE_2 = 'stage_2' -ZERO_OPTIMIZATION_STAGE_3 = 'stage_3' - -ZERO_OPTIMIZATION_STAGE_DEFAULT = ZERO_OPTIMIZATION_DISABLED - -ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS = 'allgather_partitions' -ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT = True - -ZERO_OPTIMIZATION_REDUCE_SCATTER = 'reduce_scatter' -ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT = True - -ZERO_OPTIMIZATION_OVERLAP_COMM = 'overlap_comm' -ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT = False - -ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS = 'contiguous_gradients' -ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT = False - -ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE = 'reduce_bucket_size' -ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT = 500000000 - -ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE = 'allgather_bucket_size' -ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT = 500000000 -ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEPRECATED = 'allgather_size' -ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS = 'load_from_fp32_weights' -ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT = True - -ZERO_OPTIMIZATION_CPU_OFFLOAD = 'cpu_offload' -ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT = False - -ZERO_OPTIMIZATION_DEFAULT = { - ZERO_OPTIMIZATION_STAGE: ZERO_OPTIMIZATION_STAGE_DEFAULT, - ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS: - ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT, - ZERO_OPTIMIZATION_REDUCE_SCATTER: ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT, - ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE: ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT, - ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS: - ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT, - ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE: - ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT, - ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS: - ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT, - ZERO_OPTIMIZATION_CPU_OFFLOAD: ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT -} +from deepspeed.runtime.zero.constants import * class DeepSpeedZeroConfig(object): diff --git a/deepspeed/runtime/zero/constants.py b/deepspeed/runtime/zero/constants.py new file mode 100755 index 000000000000..423d899748ab --- /dev/null +++ b/deepspeed/runtime/zero/constants.py @@ -0,0 +1,79 @@ +""" +Copyright (c) Microsoft Corporation +Licensed under the MIT license. +""" + +######################################### +# ZeRO optimization +######################################### +# ZeRO optimization. By default, this optimization is not enabled. 
+# Users have to configure the desired optimization (0 means disabled) in params.json as below example: +ZERO_FORMAT = ''' +ZeRO optimization should be enabled as: +"session_params": { + "zero_optimization": { + "stage": [0|1|2], + "allgather_partitions": [true|false], + "allgather_bucket_size": 500000000, + "reduce_scatter": [true|false], + "contiguous_gradients" : [true|false] + "overlap_comm": [true|false], + "reduce_bucket_size": 500000000 + "load_from_fp32_weights": [true|false] + "cpu_offload": [true|false] + } +} +''' + +ZERO_OPTIMIZATION = 'zero_optimization' +ZERO_OPTIMIZATION_DISABLED = 0 +ZERO_OPTIMIZATION_OPTIMIZER_STATES = 1 +ZERO_OPTIMIZATION_GRADIENTS = 2 +ZERO_OPTIMIZATION_WEIGHTS = 3 +MAX_STAGE_ZERO_OPTIMIZATION = ZERO_OPTIMIZATION_GRADIENTS + +ZERO_OPTIMIZATION_STAGE = 'stage' +ZERO_OPTIMIZATION_STAGE_1 = 'stage_1' +ZERO_OPTIMIZATION_STAGE_2 = 'stage_2' +ZERO_OPTIMIZATION_STAGE_3 = 'stage_3' + +ZERO_OPTIMIZATION_STAGE_DEFAULT = ZERO_OPTIMIZATION_DISABLED + +ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS = 'allgather_partitions' +ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT = True + +ZERO_OPTIMIZATION_REDUCE_SCATTER = 'reduce_scatter' +ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT = True + +ZERO_OPTIMIZATION_OVERLAP_COMM = 'overlap_comm' +ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT = False + +ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS = 'contiguous_gradients' +ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT = False + +ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE = 'reduce_bucket_size' +ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT = 500000000 + +ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE = 'allgather_bucket_size' +ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT = 500000000 +ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEPRECATED = 'allgather_size' +ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS = 'load_from_fp32_weights' +ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT = True + +ZERO_OPTIMIZATION_CPU_OFFLOAD = 'cpu_offload' +ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT = False + +ZERO_OPTIMIZATION_DEFAULT = { + ZERO_OPTIMIZATION_STAGE: ZERO_OPTIMIZATION_STAGE_DEFAULT, + ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS: + ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT, + ZERO_OPTIMIZATION_REDUCE_SCATTER: ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT, + ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE: ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT, + ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS: + ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT, + ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE: + ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT, + ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS: + ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT, + ZERO_OPTIMIZATION_CPU_OFFLOAD: ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT +} From 6deac82ca60b0a0e409f5c4206d9865a696c2891 Mon Sep 17 00:00:00 2001 From: Arash Ashari Date: Thu, 3 Sep 2020 15:01:23 -0700 Subject: [PATCH 05/44] Adding link to Sparse Attention in Navigation page (#355) * adding link to Sparse Attention in Navigation page --- docs/_data/navigation.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index b04769742041..7a98032b863c 100644 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -64,5 +64,7 @@ lnav: url: /tutorials/1Cycle/ - title: "Learning Rate Range Test" url: /tutorials/lrrt/ + - title: "DeepSpeed Sparse Attention" + url: /tutorials/sparse_attention/ - title: "Contributing" url: /contributing/ From 6604a5da1fccea629b6107ece52af77aaccbc7a0 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 3 Sep 2020 15:12:57 -0700 Subject: 
[PATCH 06/44] Correctness and perf fixes (#354) * Update test_sparse_attention.py * jren changes * Merge with correctness/perf fixes * Formatting fixes Co-authored-by: Jeff Rasley --- deepspeed/runtime/engine.py | 9 +- deepspeed/runtime/zero/stage2.py | 446 ++++++++++++++++++---------- tests/unit/test_sparse_attention.py | 1 + 3 files changed, 296 insertions(+), 160 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 96c591bd3c2a..f77fc64de7d2 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -6,7 +6,6 @@ import torch import warnings import torch.distributed as dist - from apex import amp from torch.nn.modules import Module from torch.distributed.distributed_c10d import _get_global_rank @@ -27,9 +26,7 @@ ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS from deepspeed.runtime.csr_tensor import CSRTensor import deepspeed.runtime.lr_schedules as lr_schedules - from deepspeed.ops.lamb import FusedLamb - from deepspeed.utils import logger from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer @@ -618,7 +615,8 @@ def _configure_zero_optimizer(self, optimizer): cpu_offload=self.zero_cpu_offload(), mpu=self.mpu, postscale_gradients=self.postscale_gradients(), - gradient_predivide_factor=self.gradient_predivide_factor()) + gradient_predivide_factor=self.gradient_predivide_factor(), + gradient_accumulation_steps=self.gradient_accumulation_steps()) else: raise NotImplementedError("ZeRO stage {} not implemented".format(zero_stage)) @@ -724,7 +722,6 @@ def forward(self, *inputs, **kwargs): return loss def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): - #Zero stage 2 communicates during non gradient accumulation boundaries as well if self.zero_optimization_partition_gradients(): self.optimizer.overlapping_partition_gradients_reduce_epilogue() @@ -780,6 +777,8 @@ def backward(self, loss, allreduce_gradients=True): self.timers('backward_inner').start() if self.zero_optimization(): + self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary( + ) self.optimizer.backward(loss) elif self.amp_enabled(): # AMP requires delaying unscale when inside gradient accumulation boundaries diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index edfd15be3221..69cdcf34adf1 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -9,13 +9,13 @@ import math from torch._six import inf from torch.autograd import Variable + import collections from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS from deepspeed.utils import logger - #Toggle this to true to enable correctness test #with gradient partitioning and without pg_correctness_test = False @@ -99,39 +99,6 @@ def print_rank_msg(msg): print(f"rank {dist.get_rank()} - {msg}") -#jie:asyn move to target device -def async_migrate_to(obj, dev, main_stream=None): - if torch.is_tensor(obj): - obj = Variable(obj) - if isinstance(obj, Variable): - v = obj.cuda(dev, async=True) - if main_stream is not None: - v.data.record_stream(main_stream) - return v - elif isinstance(obj, collections.Mapping): - return {k: async_copy_to(o, dev, main_stream) for k, o in obj.items()} - elif isinstance(obj, collections.Sequence): - return [async_copy_to(o, dev, main_stream) for o in obj] - else: - return obj - - 
-def async_copy_to(obj, dev, main_stream=None): - if torch.is_tensor(obj): - obj = Variable(obj) - if isinstance(obj, Variable): - target = torch.empty_like(obj, device=dev).copy_(obj) - if main_stream is not None: - target.data.record_stream(main_stream) - return target - elif isinstance(obj, collections.Mapping): - return {k: async_copy_to(o, dev, main_stream) for k, o in obj.items()} - elif isinstance(obj, collections.Sequence): - return [async_copy_to(o, dev, main_stream) for o in obj] - else: - return obj - - class FP16_DeepSpeedZeroOptimizer(object): """ DeepSpeedZeroOptimizer designed to reduce the memory footprint @@ -161,7 +128,8 @@ def __init__(self, clip_grad=0.0, allreduce_always_fp32=False, postscale_gradients=True, - gradient_predivide_factor=1.0): + gradient_predivide_factor=1.0, + gradient_accumulation_steps=1): if dist.get_rank() == 0: logger.info(f"Reduce bucket size {reduce_bucket_size}") @@ -187,6 +155,7 @@ def __init__(self, self.overlap_comm = overlap_comm self.cpu_offload = cpu_offload + self.device = torch.cuda.current_device() if not self.cpu_offload else 'cpu' self.dp_process_group = dp_process_group @@ -204,6 +173,8 @@ def __init__(self, self.allreduce_always_fp32 = allreduce_always_fp32 self.gradient_predivide_factor = gradient_predivide_factor self.postscale_gradients = postscale_gradients + self.gradient_accumulation_steps = gradient_accumulation_steps + self.micro_step_id = 0 if self.reduce_scatter: assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled" @@ -222,8 +193,7 @@ def __init__(self, #a single 32-bit partition of the parallel partitioned parameters #that this process will update self.single_partition_of_fp32_groups = [] - if cpu_offload: - self.averaged_gradients_on_cpu = {} + #param partition info #These are the parameters in each group that will not be updated by this process directly @@ -246,7 +216,6 @@ def __init__(self, # padding on each partition for alignment purposes self.groups_padding = [] - # loop to deal with groups for i, param_group in enumerate(self.optimizer.param_groups): # push this group to list before modify @@ -292,24 +261,13 @@ def __init__(self, self.parallel_partitioned_fp16_groups.append(data_parallel_partitions) # a partition of the fp32 master weights that will be updated by this process - if self.cpu_offload: - self.single_partition_of_fp32_groups.append( - async_copy_to(self.parallel_partitioned_fp16_groups[i][partition_id], - 'cpu').float()) - self.averaged_gradients_on_cpu[i] = [ - torch.empty_like( - self.parallel_partitioned_fp16_groups[i][partition_id], - device='cpu') - ] - else: - self.single_partition_of_fp32_groups.append( - self.parallel_partitioned_fp16_groups[i] - [partition_id].clone().float().detach()) + self.single_partition_of_fp32_groups.append( + self.parallel_partitioned_fp16_groups[i] + [partition_id].clone().float().detach().to(self.device)) # modify optimizer of have flat master weight self.single_partition_of_fp32_groups[ i].requires_grad = True # keep this in case internal optimizer uses it - param_group['params'] = [self.single_partition_of_fp32_groups[i]] partition_size = len(self.fp16_groups_flat[i]) / dist.get_world_size( @@ -346,6 +304,7 @@ def __init__(self, #simplified param id self.param_id = {} + largest_param_numel = 0 count = 0 for i, params_group in enumerate(self.fp16_groups): for param in params_group: @@ -353,6 +312,8 @@ def __init__(self, self.param_id[unique_id] = count self.param_dict[count] = param 
self.params_already_reduced.append(False) + if param.numel() > largest_param_numel: + largest_param_numel = param.numel() count = count + 1 for param_group in self.params_in_partition: @@ -363,6 +324,23 @@ def __init__(self, for param in param_group: self.is_param_in_current_partition[self.get_param_id(param)] = False + if self.cpu_offload: + self.accumulated_grads_in_cpu = {} + self.norm_for_param_grads = {} + self.local_overflow = False + self.grad_position = {} + self.temp_grad_buffer_for_cpu_offload = torch.zeros( + largest_param_numel).half().pin_memory() + self.temp_grad_buffer_for_gpu_offload = torch.zeros( + largest_param_numel, + device=torch.cuda.current_device()).half() + + for i, params_group in enumerate(self.fp16_groups): + self.get_grad_position(i, + self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i]) + #mapping from parameter to partition that it belongs to self.param_to_partition_ids = {} @@ -432,22 +410,19 @@ def _release_ipg_buffers(self): def initialize_optimizer_states(self): for i, group in enumerate(self.fp16_groups): - if self.cpu_offload: - single_grad_partition = torch.zeros( - int(self.partition_size[i]), - dtype=self.single_partition_of_fp32_groups[i].dtype, - device='cpu') - else: - single_grad_partition = torch.zeros( - int(self.partition_size[i]), - dtype=self.single_partition_of_fp32_groups[i].dtype, - device=torch.cuda.current_device()) - self.single_partition_of_fp32_groups[i].grad = single_grad_partition + single_grad_partition = torch.zeros( + int(self.partition_size[i]), + dtype=self.single_partition_of_fp32_groups[i].dtype, + device=self.device) + self.single_partition_of_fp32_groups[ + i].grad = single_grad_partition.pin_memory( + ) if self.cpu_offload else single_grad_partition self.optimizer.step() - for group in self.single_partition_of_fp32_groups: - group.grad = None + if not self.cpu_offload: + for group in self.single_partition_of_fp32_groups: + group.grad = None return @@ -514,9 +489,6 @@ def independent_gradient_partition_epilogue(self): device=torch.cuda.current_device(), return_tensor_list=True) else: - #When gradient accumulation is greater that 1 - #This code path will be triggered and will add - #to the accumulated averaged gradients avg_new = self.get_flat_partition(self.params_in_partition[i], self.first_offset[i], self.partition_size[i], @@ -526,7 +498,6 @@ def independent_gradient_partition_epilogue(self): for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i],avg_new): accumulated_grad.add_(new_avg_grad) - self._release_ipg_buffers() # No need to keep the gradients anymore. 
@@ -638,7 +609,6 @@ def report_ipg_memory_usage(self, tag, param_elems): ###############Idependent Partition Gradient ######################## def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): - if self.elements_in_ipg_bucket + param.numel() > self.reduce_bucket_size: self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", param.numel()) @@ -663,15 +633,7 @@ def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): self.elements_in_ipg_bucket, param.numel()) new_grad_tensor.copy_(param.grad.view(-1)) - #jie: - if self.cpu_offload: - with stream(self.migration_stream): - averaged_gradients_on_cpu[i][param_id] = async_copy_to( - new_grad_tensor.data.view_as(param.grad), - 'cpu', - self.cpu_computation_stream) - else: - param.grad.data = new_grad_tensor.data.view_as(param.grad) + param.grad.data = new_grad_tensor.data.view_as(param.grad) self.elements_in_ipg_bucket += param.numel() self.grads_in_ipg_bucket.append(param.grad) @@ -774,7 +736,188 @@ def average_tensor(self, tensor): for handle in async_handles: handle.wait() + ############################################################################## + ############################# CPU Offload Methods############################# + ############################################################################## + def get_grad_position(self, group_id, tensor_list, first_offset, partition_size): + current_offset = 0 + + for i, tensor in enumerate(tensor_list): + param_id = self.get_param_id(tensor) + param_start_offset = 0 + + num_elements = tensor.numel() + tensor_offset = 0 + + #we need to offset to get to the right element + if i == 0 and first_offset > 0: + tensor_offset = first_offset + num_elements = num_elements - tensor_offset + param_start_offset = first_offset + + #we dont need all elements of the tensor + if num_elements > (partition_size - current_offset): + num_elements = partition_size - current_offset + + self.grad_position[param_id] = [ + int(group_id), + int(param_start_offset), + int(current_offset), + int(num_elements) + ] + current_offset += num_elements + + def update_overflow_tracker_for_param_grad(self, param): + if param.grad is not None and self._has_inf_or_nan(param.grad.data): + self.local_overflow = True + + def async_accumulate_grad_in_cpu(self, param): + param_id = self.get_param_id(param) + + #copy to a preexisiting buffer to avoid memory allocation penalty + dest_buffer = self.temp_grad_buffer_for_cpu_offload.view(-1).narrow( + 0, + 0, + param.numel()) + dest_buffer.copy_(param.grad.view(-1), non_blocking=True) + + if param_id not in self.accumulated_grads_in_cpu: + self.accumulated_grads_in_cpu[param_id] = torch.zeros( + param.numel(), + dtype=param.dtype).pin_memory() + + self.accumulated_grads_in_cpu[param_id].add_(dest_buffer) + + def async_accumulate_grad_in_cpu_via_gpu(self, param): + param_id = self.get_param_id(param) + + #copy to a preexisiting buffer to avoid memory allocation penalty + dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow( + 0, + 0, + param.numel()) + + if param_id not in self.accumulated_grads_in_cpu: + self.accumulated_grads_in_cpu[param_id] = torch.zeros( + param.numel(), + dtype=param.dtype).pin_memory() + + if self.micro_step_id > 0: + dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1), + non_blocking=True) + param.grad.data.view(-1).add_(dest_buffer) + + #at the boundary we will send 32bit directly + if not self.is_gradient_accumulation_boundary: + 
self.accumulated_grads_in_cpu[param_id].data.copy_(param.grad.data.view(-1), + non_blocking=True) + + def set_norm_for_param_grad(self, param): + param_id = self.get_param_id(param) + accumulated_grad = self.accumulated_grads_in_cpu[ + param_id] if self.gradient_accumulation_steps > 1 else param.grad + + [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] + + start = source_offset + accumulated_grad = accumulated_grad.view(-1).narrow(0, start, num_elements) + + self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm(2) + + def set_norm_for_param_grad_in_gpu(self, param): + param_id = self.get_param_id(param) + accumulated_grad = param.grad + + [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] + + start = source_offset + accumulated_grad = accumulated_grad.view(-1).narrow(0, start, num_elements) + + self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm(2) + + def async_inplace_copy_grad_to_fp32_buffer(self, param): + param_id = self.get_param_id(param) + + [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] + + dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( + 0, + dest_offset, + num_elements) + if self.gradient_accumulation_steps > 1: + src_tensor = self.accumulated_grads_in_cpu[param_id].view(-1).narrow( + 0, + source_offset, + num_elements) + else: + src_tensor = param.grad.view(-1).narrow(0, + source_offset, + num_elements).float() + dest_tensor.copy_(src_tensor, non_blocking=True) + + def async_inplace_copy_grad_to_fp32_buffer_from_gpu(self, param): + param_id = self.get_param_id(param) + + [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] + + dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( + 0, + dest_offset, + num_elements) + + src_tensor = param.grad.view(-1).narrow(0, source_offset, num_elements).float() + dest_tensor.copy_(src_tensor, non_blocking=True) + + def complete_grad_norm_calculation_for_cpu_offload(self, params): + total_norm = 0.0 + norm_type = 2.0 + for p in params: + if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): + param_id = self.get_param_id(p) + param_norm = self.norm_for_param_grads[param_id] + total_norm += param_norm.item()**2 + + # Sum across all model parallel GPUs. + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + + torch.distributed.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.SUM, + group=self.dp_process_group) + + self._model_parallel_all_reduce(tensor=total_norm_cuda, + op=torch.distributed.ReduceOp.SUM) + + total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) + + if total_norm == float( + 'inf') or total_norm == -float('inf') or total_norm != total_norm: + total_norm = -1 + + return total_norm + + ############################################################################################ + def copy_grads_in_partition(self, param): + if self.cpu_offload: + #print(f"GAS: {self.gradient_accumulation_steps}") + #print(f"GAS: {self.is_gradient_accumulation_boundary}") + #with torch.cuda.stream(torch.cuda.current_stream()): + + self.update_overflow_tracker_for_param_grad(param) + + if self.gradient_accumulation_steps > 1: + self.async_accumulate_grad_in_cpu_via_gpu(param) + + if self.is_gradient_accumulation_boundary: + self.set_norm_for_param_grad_in_gpu(param) + self.async_inplace_copy_grad_to_fp32_buffer_from_gpu(param) + + #new_grad_tensor = async_copy_to(param.grad.view(-1), + # 'cpu', + # self.cpu_computation_stream) + #param.grad.data = new_grad_tensor.data.view_as(param.grad) + return + if self.grads_in_partition is None: self.grads_in_partition_offset = 0 total_size = 0 @@ -783,30 +926,17 @@ def copy_grads_in_partition(self, param): total_size += param_in_partition.numel() see_memory_usage(f"before copying {total_size} gradients into partition") - #jie: - ''' - if self.cpu_offload: - self.grads_in_partition = torch.empty(int(total_size), - dtype=torch.half, - device=torch.cuda.current_device()) - '' - else: - ''' self.grads_in_partition = torch.empty(int(total_size), dtype=torch.half, device=torch.cuda.current_device()) see_memory_usage(f"after copying {total_size} gradients into partition") #The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer - new_grad_tensor = self.grads_in_partition.narrow(0, - self.grads_in_partition_offset, - param.numel()) + new_grad_tensor = self.grads_in_partition.view(-1).narrow( + 0, + self.grads_in_partition_offset, + param.numel()) new_grad_tensor.copy_(param.grad.view(-1)) - ''' - if self.cpu_offload: - averaged_gradients_on_cpu[self.get_param_id()] = new_grad_tensor.data.view_as(param.grad) - else: - ''' param.grad.data = new_grad_tensor.data.view_as(param.grad) self.grads_in_partition_offset += param.numel() @@ -1187,10 +1317,21 @@ def free_grad_in_param_list(self, param_list): for p in param_list: p.grad = None + def reset_cpu_buffers(self): + self.norm_for_param_grads = {} + with torch.cuda.stream(self.migration_stream): + for key, value in self.accumulated_grads_in_cpu.items(): + value.mul_(0.0) + def step(self, closure=None): """ Not supporting closure. 
""" + self.micro_step_id = 0 + + if self.cpu_offload: + torch.cuda.current_stream().wait_stream(self.migration_stream) + see_memory_usage(f"In step before checking overflow") # First compute norm for all group so we know if there is overflow @@ -1203,8 +1344,8 @@ def step(self, closure=None): if self.overflow: see_memory_usage('After overflow before clearing gradients') self.zero_grad() - for key in self.averaged_gradients: - self.averaged_gradients[key] = None + if self.cpu_offload: + self.reset_cpu_buffers() see_memory_usage('After overflow after clearing gradients') @@ -1224,64 +1365,57 @@ def step(self, closure=None): skip = False partition_id = dist.get_rank(group=self.dp_process_group) for i, group in enumerate(self.fp16_groups): - if self.cpu_offload: - self.averaged_gradients[i] = self.averaged_gradients_on_cpu[i] - #norm_groups.append( - # self.get_grad_norm_direct(self.averaged_gradients[i], - # self.params_in_partition[i])) - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.params_in_partition[i])) - - #free gradients for all the prameters that are not updated by this process - self.free_grad_in_param_list(self.params_not_in_partition[i]) - - #create a flat gradients for parameters updated by this process - # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors - if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: - single_grad_partition = flatten_dense_tensors_aligned( - self.averaged_gradients[i], - int(self.partition_size[i])).to( - self.single_partition_of_fp32_groups[i].dtype) + norm_groups.append( + self.complete_grad_norm_calculation_for_cpu_offload( + self.params_in_partition[i])) + single_grad_partition = self.single_partition_of_fp32_groups[i].grad else: - single_grad_partition = _flatten_dense_tensors( - self.averaged_gradients[i]).to( - self.single_partition_of_fp32_groups[i].dtype) - assert single_grad_partition.numel() == self.partition_size[i], \ - "averaged gradients have different number of elements that partition size {} {} {} {}".format(single_grad_partition.numel(), self.partition_size[i], i, partition_id) + norm_groups.append( + self.get_grad_norm_direct(self.averaged_gradients[i], + self.params_in_partition[i])) + + #free gradients for all the prameters that are not updated by this process + self.free_grad_in_param_list(self.params_not_in_partition[i]) + + #create a flat gradients for parameters updated by this process + # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors + if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: + single_grad_partition = flatten_dense_tensors_aligned( + self.averaged_gradients[i], + int(self.partition_size[i])).to( + self.single_partition_of_fp32_groups[i].dtype) + else: + single_grad_partition = _flatten_dense_tensors( + self.averaged_gradients[i]).to( + self.single_partition_of_fp32_groups[i].dtype) + assert single_grad_partition.numel() == self.partition_size[i], \ + "averaged gradients have different number of elements that partition size {} {} {} {}".format(single_grad_partition.numel(), self.partition_size[i], i, partition_id) - self.single_partition_of_fp32_groups[i].grad = single_grad_partition - #release all the gradient since we have already created a necessary copy in dp_grad_partition - self.free_grad_in_param_list(self.params_in_partition[i]) + self.single_partition_of_fp32_groups[i].grad = single_grad_partition + #release all the 
gradient since we have already created a necessary copy in dp_grad_partition + self.free_grad_in_param_list(self.params_in_partition[i]) - if self.cpu_offload is False: self.averaged_gradients[i] = None single_partition_grad_groups.append(single_grad_partition) self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) - - self.cpu_computation_stream.wait_stream(self.migration_stream) + #torch.set_num_threads(12) timers('optimizer_step').start() - with torch.cuda.stream(self.cpu_computation_stream): - self.optimizer.step() + self.optimizer.step() + #get rid of the fp32 gradients. Not needed anymore + if not self.cpu_offload: + for group in self.single_partition_of_fp32_groups: + group.grad = None - if self.cpu_offload: - stream = torch.cuda.current_stream() - with torch.cuda.stream(self.migration_stream): - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - fp16_partitions[partition_id] = async_copy_to( - fp32_partition, - torch.cuda.current_device(), - stream) - #for averaged_gradients_cpu, fp32_partition in zip(self.averaged_gradients_on_cpu, self.single_partition_of_fp32_groups): - # averaged_gradients_cpu = [fp32_partition] - else: - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - fp16_partitions[partition_id].data.copy_(fp32_partition.data) + for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): + fp16_partitions[partition_id].data.copy_(fp32_partition.data) timers('optimizer_step').stop() + if self.cpu_offload: + self.reset_cpu_buffers() + timers('optimizer_allgather').start() #gather the updated weights from everyone for group_id, partitioned_params in enumerate(self.parallel_partitioned_fp16_groups): @@ -1362,8 +1496,6 @@ def has_overflow_serial(self, params, is_grad_list=False): def has_overflow_partitioned_grads_serial(self): for i in range(len(self.fp16_groups)): - if self.cpu_offload: - self.averaged_gradients[i] = self.averaged_gradients_on_cpu[i] for j, grad in enumerate(self.averaged_gradients[i]): if grad is not None and self._has_inf_or_nan(grad.data, j): return True @@ -1371,7 +1503,8 @@ def has_overflow_partitioned_grads_serial(self): def has_overflow(self, partition_gradients=True): if partition_gradients: - overflow = self.has_overflow_partitioned_grads_serial() + overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial( + ) overflow_gpu = torch.cuda.ByteTensor([overflow]) torch.distributed.all_reduce(overflow_gpu, op=torch.distributed.ReduceOp.MAX, @@ -1424,22 +1557,26 @@ def backward(self, loss, retain_graph=False): 2. scaled_loss = fp32_loss*loss_scale 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves """ + if self.cpu_offload: + torch.cuda.current_stream().wait_stream(self.migration_stream) + if self.contiguous_gradients: self.ipg_buffer = [] - buf_0 = torch.empty(self.reduce_bucket_size, + buf_0 = torch.empty(int(self.reduce_bucket_size * 4.5), dtype=torch.half, device=torch.cuda.current_device()) self.ipg_buffer.append(buf_0) # Use double buffers to avoid data access conflict when overlap_comm is enabled. 
if self.overlap_comm: - buf_1 = torch.empty(self.reduce_bucket_size, + buf_1 = torch.empty(int(self.reduce_bucket_size * 4.5), dtype=torch.half, device=torch.cuda.current_device()) self.ipg_buffer.append(buf_1) self.ipg_index = 0 - torch.cuda.empty_cache() + self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) + self.micro_step_id += 1 def check_overflow(self, partition_gradients=True): self._check_overflow(partition_gradients) @@ -1536,11 +1673,11 @@ def state_dict(self): self.single_partition_of_fp32_groups) state_dict['single_partition_of_fp32_groups'] = fp32_groups_without_padding - if self.cpu_offload: - state_dict_tmp = async_copy_to(state_dict, - 'cpu', - torch.cuda.current_stream()) - state_dict = state_dict_tmp + # if self.cpu_offload: + # state_dict_tmp = async_copy_to(state_dict, + # 'cpu', + # torch.cuda.current_stream()) + # state_dict = state_dict_tmp return state_dict @@ -1578,7 +1715,6 @@ def refresh_fp32_params(self): def _partition_base_optimizer_state(self, state_key, all_partition_states): partition_id = dist.get_rank(group=self.dp_process_group) alignment = dist.get_world_size(group=self.dp_process_group) - if torch.is_tensor(all_partition_states[0]): flat_merged_partitions = flatten_dense_tensors_aligned( all_partition_states, diff --git a/tests/unit/test_sparse_attention.py b/tests/unit/test_sparse_attention.py index 3f2078946297..f113e3b72ecd 100644 --- a/tests/unit/test_sparse_attention.py +++ b/tests/unit/test_sparse_attention.py @@ -232,6 +232,7 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo def _skip_on_cuda_compatability(): + pytest.skip("Skip these tests for now until we get our docker image fixed.") if torch.cuda.get_device_capability()[0] != 7: pytest.skip("needs compute capability 7; v100") cuda_major = int(torch.version.cuda.split('.')[0]) * 10 From 20c414d0ed97b7c3cdded0a2ce8e09595f098126 Mon Sep 17 00:00:00 2001 From: RezaYazdaniAminabadi <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Thu, 3 Sep 2020 15:46:52 -0700 Subject: [PATCH 07/44] add cpu adam optimizer (#356) * add cpu adam optimizer * run precommit * clean adam_test * add accuracy test for adam --- csrc/adam/cpu_adam.cpp | 324 ++++++++++++++++++++++++++++++++ csrc/includes/cpu_adam.h | 70 +++++++ deepspeed/ops/adam/__init__.py | 1 + deepspeed/ops/adam/cpu_adam.py | 70 +++++++ setup.py | 28 ++- tests/unit/adam_test.py | 18 ++ tests/unit/test_adam_acuracy.py | 39 ++++ 7 files changed, 548 insertions(+), 2 deletions(-) create mode 100644 csrc/adam/cpu_adam.cpp create mode 100644 csrc/includes/cpu_adam.h create mode 100755 deepspeed/ops/adam/__init__.py create mode 100755 deepspeed/ops/adam/cpu_adam.py create mode 100755 tests/unit/adam_test.py create mode 100755 tests/unit/test_adam_acuracy.py diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp new file mode 100644 index 000000000000..2d3b521bea1c --- /dev/null +++ b/csrc/adam/cpu_adam.cpp @@ -0,0 +1,324 @@ +#include "cpu_adam.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "cublas_v2.h" +#include "cuda.h" +#include "curand.h" + +static std::unordered_map> s_optimizers; + +// C++ interface + +void Adam_Optimizer::Step(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t _param_size, + __half* dev_params) +{ + _betta1_t *= _betta1; + _betta2_t *= _betta2; + + bool buf_index = 0; + + __m512 betta1_4 = _mm512_set1_ps(_betta1); + __m512 betta2_4 = _mm512_set1_ps(_betta2); + + float betta1_minus1 = 1 - 
_betta1; + float betta2_minus1 = 1 - _betta2; + __m512 betta1_minus1_4 = _mm512_set1_ps(betta1_minus1); + __m512 betta2_minus1_4 = _mm512_set1_ps(betta2_minus1); + + float bias_correction1 = 1 - _betta1_t; + float bias_correction2 = 1 - _betta2_t; + //__m512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); + __m512 bias_correction2_4 = _mm512_set1_ps(bias_correction2); + + __m512 eps_4 = _mm512_set1_ps(_eps); + + float step_size = -1 * _alpha / bias_correction1; + __m512 step_size_4 = _mm512_set1_ps(step_size); + + __m512 bias2_sqrt = _mm512_sqrt_ps(bias_correction2_4); + + size_t tile = 0; + +#pragma omp parallel for + for (size_t i = 0; i < _param_size; i += SIMD_WIDTH) { + __m512 grad_4 = _mm512_loadu_ps(grads + i); + + __m512 momntum_4 = _mm512_loadu_ps(_exp_avg + i); + __m512 varianc_4 = _mm512_loadu_ps(_exp_avg_sq + i); + + __m512 param_4 = _mm512_loadu_ps(_params + i); + + if (_weight_decay > 0) { + __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); + grad_4 = _mm512_fmadd_ps(param_4, weight_decay4, grad_4); + } + + momntum_4 = _mm512_mul_ps(momntum_4, betta1_4); + momntum_4 = _mm512_fmadd_ps(grad_4, betta1_minus1_4, momntum_4); + + varianc_4 = _mm512_mul_ps(varianc_4, betta2_4); + grad_4 = _mm512_mul_ps(grad_4, grad_4); + varianc_4 = _mm512_fmadd_ps(grad_4, betta2_minus1_4, varianc_4); + + grad_4 = _mm512_sqrt_ps(varianc_4) / bias2_sqrt; + grad_4 = _mm512_add_ps(grad_4, eps_4); + grad_4 = _mm512_div_ps(momntum_4, grad_4); + + param_4 = _mm512_fmadd_ps(grad_4, step_size_4, param_4); + + _mm512_storeu_ps(_params + i, param_4); + _mm512_storeu_ps(_exp_avg + i, momntum_4); + _mm512_storeu_ps(_exp_avg_sq + i, varianc_4); + if (dev_params) { + if ((i + SIMD_WIDTH) % TILE == 0) { + size_t offset = tile * TILE; +#pragma omp parallel for + for (size_t j = 0; j < TILE; j += 4) { + _doubled_buffer[buf_index][j] = (__half)_params[offset + j]; + _doubled_buffer[buf_index][j + 1] = (__half)_params[offset + j + 1]; + _doubled_buffer[buf_index][j + 2] = (__half)_params[offset + j + 2]; + _doubled_buffer[buf_index][j + 3] = (__half)_params[offset + j + 3]; + } + CUDA_CHECK(cudaMemcpyAsync(dev_params + tile * TILE, + _doubled_buffer[buf_index], + TILE * sizeof(__half), + cudaMemcpyHostToDevice, + Context::Instance().GetCurrentStream())); + buf_index = !buf_index; + tile++; + } + } + } +} + +void Adam_Optimizer::Step_4(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t _param_size, + __half* dev_params) +{ + _betta1_t *= _betta1; + _betta2_t *= _betta2; + + __m512 betta1_4 = _mm512_set1_ps(_betta1); + __m512 betta2_4 = _mm512_set1_ps(_betta2); + + bool buf_index = 0; + size_t tile = 0; + + float betta1_minus1 = 1 - _betta1; + float betta2_minus1 = 1 - _betta2; + __m512 betta1_minus1_4 = _mm512_set1_ps(betta1_minus1); + __m512 betta2_minus1_4 = _mm512_set1_ps(betta2_minus1); + + float bias_correction1 = 1 - _betta1_t; + float bias_correction2 = 1 - _betta2_t; + //__m512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); + __m512 bias_correction2_4 = _mm512_set1_ps(bias_correction2); + + __m512 eps_4 = _mm512_set1_ps(_eps); + + float step_size = -1 * _alpha / bias_correction1; + __m512 step_size_4 = _mm512_set1_ps(step_size); + + __m512 bias2_sqrt = _mm512_sqrt_ps(bias_correction2_4); + +#pragma omp parallel for + for (size_t i = 0; i < _param_size; i += (SIMD_WIDTH << 2)) { + __m512 grad_4[4]; + grad_4[0] = _mm512_loadu_ps(grads + i); + grad_4[1] = _mm512_loadu_ps(grads + i + SIMD_WIDTH); + grad_4[2] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); + 
grad_4[3] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); + + __m512 momntum_4[4]; + momntum_4[0] = _mm512_loadu_ps(_exp_avg + i); + momntum_4[1] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); + momntum_4[2] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); + momntum_4[3] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); + + __m512 varianc_4[4]; + varianc_4[0] = _mm512_loadu_ps(_exp_avg_sq + i); + varianc_4[1] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); + varianc_4[2] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); + varianc_4[3] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); + + __m512 param_4[4]; + param_4[0] = _mm512_loadu_ps(_params + i); + param_4[1] = _mm512_loadu_ps(_params + i + SIMD_WIDTH); + param_4[2] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); + param_4[3] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); + + if (_weight_decay > 0) { + __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); + grad_4[0] = _mm512_fmadd_ps(param_4[0], weight_decay4, grad_4[0]); + grad_4[1] = _mm512_fmadd_ps(param_4[1], weight_decay4, grad_4[1]); + grad_4[2] = _mm512_fmadd_ps(param_4[2], weight_decay4, grad_4[2]); + grad_4[3] = _mm512_fmadd_ps(param_4[3], weight_decay4, grad_4[3]); + } + + momntum_4[0] = _mm512_mul_ps(momntum_4[0], betta1_4); + momntum_4[0] = _mm512_fmadd_ps(grad_4[0], betta1_minus1_4, momntum_4[0]); + momntum_4[1] = _mm512_mul_ps(momntum_4[1], betta1_4); + momntum_4[1] = _mm512_fmadd_ps(grad_4[1], betta1_minus1_4, momntum_4[1]); + momntum_4[2] = _mm512_mul_ps(momntum_4[2], betta1_4); + momntum_4[2] = _mm512_fmadd_ps(grad_4[2], betta1_minus1_4, momntum_4[2]); + momntum_4[3] = _mm512_mul_ps(momntum_4[3], betta1_4); + momntum_4[3] = _mm512_fmadd_ps(grad_4[3], betta1_minus1_4, momntum_4[3]); + + varianc_4[0] = _mm512_mul_ps(varianc_4[0], betta2_4); + varianc_4[1] = _mm512_mul_ps(varianc_4[1], betta2_4); + varianc_4[2] = _mm512_mul_ps(varianc_4[2], betta2_4); + varianc_4[3] = _mm512_mul_ps(varianc_4[3], betta2_4); + grad_4[0] = _mm512_mul_ps(grad_4[0], grad_4[0]); + grad_4[1] = _mm512_mul_ps(grad_4[1], grad_4[1]); + grad_4[2] = _mm512_mul_ps(grad_4[2], grad_4[2]); + grad_4[3] = _mm512_mul_ps(grad_4[3], grad_4[3]); + varianc_4[0] = _mm512_fmadd_ps(grad_4[0], betta2_minus1_4, varianc_4[0]); + varianc_4[1] = _mm512_fmadd_ps(grad_4[1], betta2_minus1_4, varianc_4[1]); + varianc_4[2] = _mm512_fmadd_ps(grad_4[2], betta2_minus1_4, varianc_4[2]); + varianc_4[3] = _mm512_fmadd_ps(grad_4[3], betta2_minus1_4, varianc_4[3]); + + grad_4[0] = _mm512_sqrt_ps(varianc_4[0]) / bias2_sqrt; + grad_4[1] = _mm512_sqrt_ps(varianc_4[1]) / bias2_sqrt; + grad_4[2] = _mm512_sqrt_ps(varianc_4[2]) / bias2_sqrt; + grad_4[3] = _mm512_sqrt_ps(varianc_4[3]) / bias2_sqrt; + + grad_4[0] = _mm512_add_ps(grad_4[0], eps_4); + grad_4[1] = _mm512_add_ps(grad_4[1], eps_4); + grad_4[2] = _mm512_add_ps(grad_4[2], eps_4); + grad_4[3] = _mm512_add_ps(grad_4[3], eps_4); + grad_4[0] = _mm512_div_ps(momntum_4[0], grad_4[0]); + grad_4[1] = _mm512_div_ps(momntum_4[1], grad_4[1]); + grad_4[2] = _mm512_div_ps(momntum_4[2], grad_4[2]); + grad_4[3] = _mm512_div_ps(momntum_4[3], grad_4[3]); + + param_4[0] = _mm512_fmadd_ps(grad_4[0], step_size_4, param_4[0]); + param_4[1] = _mm512_fmadd_ps(grad_4[1], step_size_4, param_4[1]); + param_4[2] = _mm512_fmadd_ps(grad_4[2], step_size_4, param_4[2]); + param_4[3] = _mm512_fmadd_ps(grad_4[3], step_size_4, param_4[3]); + + _mm512_storeu_ps(_params + i, param_4[0]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1]); + _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), 
param_4[2]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3]); + + _mm512_storeu_ps(_exp_avg + i, momntum_4[0]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1]); + _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3]); + + _mm512_storeu_ps(_exp_avg_sq + i, varianc_4[0]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH, varianc_4[1]); + _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1), varianc_4[2]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3]); + if (dev_params) { + if ((i + (SIMD_WIDTH << 2)) % TILE == 0) { + size_t offset = tile * TILE; +#pragma omp parallel for + for (size_t j = 0; j < TILE; j += 4) { + _doubled_buffer[buf_index][j] = (__half)_params[offset + j]; + _doubled_buffer[buf_index][j + 1] = (__half)_params[offset + j + 1]; + _doubled_buffer[buf_index][j + 2] = (__half)_params[offset + j + 2]; + _doubled_buffer[buf_index][j + 3] = (__half)_params[offset + j + 3]; + } + CUDA_CHECK(cudaMemcpyAsync(dev_params + tile * TILE, + _doubled_buffer[buf_index], + TILE * sizeof(__half), + cudaMemcpyHostToDevice, + Context::Instance().GetCurrentStream())); + buf_index = !buf_index; + tile++; + } + } + } +} + +int create_adam_optimizer(int optimizer_id, + float alpha = 1e-3, + float betta1 = 0.9, + float betta2 = 0.999, + float eps = 1e-8, + float weight_decay = 0) + +{ + auto opt = std::make_shared(alpha, betta1, betta2, eps, weight_decay); + + s_optimizers[optimizer_id] = opt; + + std::cout << "Adam Optimizer #" << optimizer_id << " is created." << std::endl; + + return 0; +} + +int ds_adam_step(int optimizer_id, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg, + torch::Tensor& exp_avg_sq) +{ + auto params_c = params.contiguous(); + auto grads_c = grads.contiguous(); + auto exp_avg_c = exp_avg.contiguous(); + auto exp_avg_sq_c = exp_avg_sq.contiguous(); + + float* params_ptr = (float*)params_c.data_ptr(); + float* grads_ptr = (float*)grads_c.data_ptr(); + float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); + float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); + + std::shared_ptr opt = + std::static_pointer_cast(s_optimizers[optimizer_id]); + + opt->Step_4(params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0)); + + return 0; +} + +int ds_adam_step_plus_copy(int optimizer_id, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg, + torch::Tensor& exp_avg_sq, + torch::Tensor& gpu_params) +{ + auto params_c = params.contiguous(); + auto gpu_params_c = gpu_params.contiguous(); + auto exp_avg_c = exp_avg.contiguous(); + auto exp_avg_sq_c = exp_avg_sq.contiguous(); + auto grads_c = grads.contiguous(); + + float* params_ptr = (float*)params_c.data_ptr(); + float* grads_ptr = (float*)grads_c.data_ptr(); + __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr(); + float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); + float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); + + std::shared_ptr opt = + std::static_pointer_cast(s_optimizers[optimizer_id]); + + opt->Step_4( + params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0), gpu_params_ptr); + + return 0; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)"); + m.def("adam_update_copy", + &ds_adam_step_plus_copy, + "DeepSpeed CPU Adam update and param copy (C++)"); + m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)"); +} diff --git 
a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h new file mode 100644 index 000000000000..d1a2aa088731 --- /dev/null +++ b/csrc/includes/cpu_adam.h @@ -0,0 +1,70 @@ +#pragma once + +#include +#include +#include +#include "context.h" +#include "cublas_v2.h" +#include "cuda.h" +#include "curand.h" + +#define CUDA_CHECK(callstr) \ + { \ + cudaError_t error_code = callstr; \ + if (error_code != cudaSuccess) { \ + std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ + assert(0); \ + } \ + } + +#define SIMD_WIDTH 16 +#define TILE 1048576 + +class Adam_Optimizer { +public: + Adam_Optimizer(float alpha = 1e-3, + float betta1 = 0.9, + float betta2 = 0.999, + float eps = 1e-8, + float weight_decay = 0) + : _alpha(alpha), + _betta1(betta1), + _betta2(betta2), + _eps(eps), + _weight_decay(weight_decay), + _betta1_t(1.0), + _betta2_t(1.0) + { + cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(__half)); + cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(__half)); + } + ~Adam_Optimizer() + { + cudaFreeHost(_doubled_buffer[0]); + cudaFreeHost(_doubled_buffer[1]); + } + void Step(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t param_size, + __half* dev_param = nullptr); + void Step_4(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sa, + size_t param_size, + __half* dev_param = nullptr); + +private: + float _alpha; + float _betta1; + float _betta2; + float _eps; + float _weight_decay; + + float _betta1_t; + float _betta2_t; + + __half* _doubled_buffer[2]; +}; diff --git a/deepspeed/ops/adam/__init__.py b/deepspeed/ops/adam/__init__.py new file mode 100755 index 000000000000..13883dc7b590 --- /dev/null +++ b/deepspeed/ops/adam/__init__.py @@ -0,0 +1 @@ +from deepspeed.ops.adam.cpu_adam import DeepSpeedCPUAdam diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py new file mode 100755 index 000000000000..5387090e3815 --- /dev/null +++ b/deepspeed/ops/adam/cpu_adam.py @@ -0,0 +1,70 @@ +import math +import torch +import deepspeed.ops.adam.cpu_adam_op as ds_opt_adam + + +class DeepSpeedCPUAdam(torch.optim.Optimizer): + + optimizer_id = 0 + + def __init__(self, + model_params, + lr=1e-3, + bettas=(0.9, + 0.999), + eps=1e-8, + weight_decay=0, + amsgrad=False): + + default_args = dict(lr=lr, + betas=bettas, + eps=eps, + weight_decay=weight_decay, + amsgrad=amsgrad) + super(DeepSpeedCPUAdam, self).__init__(model_params, default_args) + self.opt_id = DeepSpeedCPUAdam.optimizer_id + DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1 + ds_opt_adam.create_adam(self.opt_id, lr, bettas[0], bettas[1], eps, weight_decay) + + def __setstate__(self, state): + super(DeepSpeedCPUAdam, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None, fp16_param_groups=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for i, group in enumerate(self.param_groups): + for gid, p in enumerate(group['params']): + + if p.grad is None: + continue + + grad = p.grad + state = self.state[p] + # State initialization + if len(state) == 0: + state['step'] = 0 + # gradient momentums + state['exp_avg'] = torch.zeros_like(p, device='cpu') + # gradient variances + state['exp_avg_sq'] = torch.zeros_like(p, device='cpu') + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + state['step'] += 1 + + if fp16_param_groups is not None: + p_fp16 = fp16_param_groups[i][gid] 
+ ds_opt_adam.adam_update_copy(self.opt_id, + p, + grad, + exp_avg, + exp_avg_sq, + p_fp16) + else: + ds_opt_adam.adam_update(self.opt_id, p, grad, exp_avg, exp_avg_sq) + return loss diff --git a/setup.py b/setup.py index 3d57e8791da8..02f2f7fd005a 100755 --- a/setup.py +++ b/setup.py @@ -31,13 +31,15 @@ def fetch_requirements(path): DS_BUILD_LAMB_MASK = 1 DS_BUILD_TRANSFORMER_MASK = 10 DS_BUILD_SPARSE_ATTN_MASK = 100 +DS_BUILD_ADAM_MASK = 1000 # Allow for build_cuda to turn on or off all ops -DS_BUILD_ALL_OPS = DS_BUILD_LAMB_MASK | DS_BUILD_TRANSFORMER_MASK | DS_BUILD_SPARSE_ATTN_MASK +DS_BUILD_ALL_OPS = DS_BUILD_LAMB_MASK | DS_BUILD_TRANSFORMER_MASK | DS_BUILD_SPARSE_ATTN_MASK | DS_BUILD_ADAM_MASK DS_BUILD_CUDA = int(os.environ.get('DS_BUILD_CUDA', 1)) * DS_BUILD_ALL_OPS # Set default of each op based on if build_cuda is set OP_DEFAULT = DS_BUILD_CUDA == DS_BUILD_ALL_OPS +DS_BUILD_ADAM = int(os.environ.get('DS_BUILD_ADAM', OP_DEFAULT)) * DS_BUILD_ADAM_MASK DS_BUILD_LAMB = int(os.environ.get('DS_BUILD_LAMB', OP_DEFAULT)) * DS_BUILD_LAMB_MASK DS_BUILD_TRANSFORMER = int(os.environ.get('DS_BUILD_TRANSFORMER', OP_DEFAULT)) * DS_BUILD_TRANSFORMER_MASK @@ -45,11 +47,14 @@ def fetch_requirements(path): 0)) * DS_BUILD_SPARSE_ATTN_MASK # Final effective mask is the bitwise OR of each op -BUILD_MASK = (DS_BUILD_LAMB | DS_BUILD_TRANSFORMER | DS_BUILD_SPARSE_ATTN) +BUILD_MASK = (DS_BUILD_LAMB | DS_BUILD_TRANSFORMER | DS_BUILD_SPARSE_ATTN + | DS_BUILD_ADAM) install_ops = [] if BUILD_MASK & DS_BUILD_LAMB: install_ops.append('lamb') +if BUILD_MASK & DS_BUILD_ADAM: + install_ops.append('adam') if BUILD_MASK & DS_BUILD_TRANSFORMER: install_ops.append('transformer') if BUILD_MASK & DS_BUILD_SPARSE_ATTN: @@ -104,6 +109,25 @@ def fetch_requirements(path): '--use_fast_math'] + version_dependent_macros })) +## Adam ## +if BUILD_MASK & DS_BUILD_ADAM: + ext_modules.append( + CUDAExtension( + name='deepspeed.ops.adam.cpu_adam_op', + sources=[ + 'csrc/adam/cpu_adam.cpp', + ], + include_dirs=['csrc/includes'], + extra_compile_args={ + 'cxx': + ['-O3', + '-std=c++14', + '-g', + '-Wno-reorder', + '-march=native', + '-fopenmp'] + })) + ## Transformer ## if BUILD_MASK & DS_BUILD_TRANSFORMER: ext_modules.append( diff --git a/tests/unit/adam_test.py b/tests/unit/adam_test.py new file mode 100755 index 000000000000..8e6caa382c12 --- /dev/null +++ b/tests/unit/adam_test.py @@ -0,0 +1,18 @@ +import torch +from deepspeed import DeepSpeedCPUAdam +import time + +device = 'cpu' +model_size = 1 * 1024**3 +param = torch.nn.Parameter(torch.ones(model_size, device=device)) +optimizer = DeepSpeedCPUAdam([param]) +#torch.set_num_threads(128) +param.grad = torch.ones(model_size, device=device) +avg = 0 +for i in range(100): + start = time.time() + optimizer.step() + stop = time.time() + avg += (stop - start) + param.grad = torch.ones(model_size, device=device) * 2 +print("Elapsed Time is ", avg / 100) diff --git a/tests/unit/test_adam_acuracy.py b/tests/unit/test_adam_acuracy.py new file mode 100755 index 000000000000..c41585f00ea3 --- /dev/null +++ b/tests/unit/test_adam_acuracy.py @@ -0,0 +1,39 @@ +import argparse +import torch + +import time +import numpy as np +import pytest +import copy + +from deepspeed import DeepSpeedCPUAdam + +def check_equal(first, second, atol=1e-2, verbose=False): + x = first.detach().numpy() + y = second.detach().numpy() + if verbose: + print("x = {}".format(x.flatten())) + print("y = {}".format(y.flatten())) + print('-' * 80) + np.testing.assert_allclose(x, y, err_msg="param-update dismatch!", atol=atol) 
+ +@pytest.mark.parametrize('model_size', + [ + (1048576), + ]) # yapf: disable +def test_adam_opt(model_size): + device = 'cpu' + + param = torch.nn.Parameter(torch.ones(model_size, device=device)) + param1 = torch.nn.Parameter(torch.ones(model_size, device=device)) + + optimizer1 = torch.optim.Adam([param1]) + optimizer = DeepSpeedCPUAdam([param]) + + param.grad=torch.ones(model_size, device=device) + param1.grad=torch.ones(model_size, device=device) + + optimizer.step() + optimizer1.step() + + check_equal(param, param1) From 504a643b0337fe4c110ba5da7e4743cce8d75bc1 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Thu, 3 Sep 2020 23:23:02 +0000 Subject: [PATCH 08/44] make the adam unit test work with random params and grads and for more steps --- tests/unit/test_adam_acuracy.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/unit/test_adam_acuracy.py b/tests/unit/test_adam_acuracy.py index c41585f00ea3..410543f516d1 100755 --- a/tests/unit/test_adam_acuracy.py +++ b/tests/unit/test_adam_acuracy.py @@ -9,6 +9,9 @@ from deepspeed import DeepSpeedCPUAdam def check_equal(first, second, atol=1e-2, verbose=False): + if verbose: + print(first) + print(second) x = first.detach().numpy() y = second.detach().numpy() if verbose: @@ -23,17 +26,21 @@ def check_equal(first, second, atol=1e-2, verbose=False): ]) # yapf: disable def test_adam_opt(model_size): device = 'cpu' - - param = torch.nn.Parameter(torch.ones(model_size, device=device)) - param1 = torch.nn.Parameter(torch.ones(model_size, device=device)) + rng_state = torch.get_rng_state() + param = torch.nn.Parameter(torch.randn(model_size, device=device)) + torch.set_rng_state(rng_state) + param1 = torch.nn.Parameter(torch.randn(model_size, device=device)) optimizer1 = torch.optim.Adam([param1]) optimizer = DeepSpeedCPUAdam([param]) - param.grad=torch.ones(model_size, device=device) - param1.grad=torch.ones(model_size, device=device) + for i in range(10): + rng_state = torch.get_rng_state() + param.grad=torch.randn(model_size, device=device) + torch.set_rng_state(rng_state) + param1.grad=torch.randn(model_size, device=device) - optimizer.step() - optimizer1.step() + optimizer.step() + optimizer1.step() - check_equal(param, param1) + check_equal(param, param1, atol = 1e-2, verbose=True) From af51211745d89fb6088598f4aac2ba057a38133f Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Thu, 3 Sep 2020 20:39:31 -0700 Subject: [PATCH 09/44] Samyamr/zero offload correctness (#359) * fixing gradient accumulation for zero offload * Bug fixes. 
ZeRO Stage 1,2 and Offload all produce the same loss with gradient accumulation step of 2 --- deepspeed/runtime/config.py | 2 +- deepspeed/runtime/zero/stage1.py | 9 ++++++--- deepspeed/runtime/zero/stage2.py | 30 ++++++++++++++++-------------- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 206993dbe4c9..7637b091efe9 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -623,7 +623,7 @@ def _do_error_check(self): assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION) if self.zero_config.cpu_offload is True: assert self.zero_optimization_stage == ZERO_OPTIMIZATION_GRADIENTS, "DeepSpeedConfig: cpu-offload supported ZeRO stage is {}".format(ZERO_OPTIMIZATION_GRADIENTS) - assert self.gradient_accumulation_steps == 1, "DeepSpeedConfig: {}is not supported for {}".format(GRADIENT_ACCUMULATION_STEPS, ZERO_OPTIMIZATION_CPU_OFFLOAD) + #assert self.gradient_accumulation_steps == 1, "DeepSpeedConfig: {}is not supported for {}".format(GRADIENT_ACCUMULATION_STEPS, ZERO_OPTIMIZATION_CPU_OFFLOAD) def _do_warning_check(self): fp16_enabled = self.fp16_enabled or self.zero_enabled diff --git a/deepspeed/runtime/zero/stage1.py b/deepspeed/runtime/zero/stage1.py index 1acef4898e36..6f8953daa4cc 100755 --- a/deepspeed/runtime/zero/stage1.py +++ b/deepspeed/runtime/zero/stage1.py @@ -793,9 +793,12 @@ def _get_groups_without_padding(self, groups_with_padding): def _get_state_without_padding(self, state_with_padding, padding): lean_state = {} for key, value in state_with_padding.items(): - lean_length = value.numel() - padding - lean_state[key] = value[:lean_length] - + if torch.is_tensor(value): + lean_length = value.numel() - padding + lean_state[key] = value[:lean_length] + else: + lean_state[key] = value + return lean_state # Return base optimizer states. diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index 69cdcf34adf1..f10d16122320 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -477,9 +477,10 @@ def independent_gradient_partition_epilogue(self): if self.overlap_comm: torch.cuda.synchronize() - + if self.cpu_offload is False: for i, _ in enumerate(self.fp16_groups): + if not i in self.averaged_gradients or self.averaged_gradients[i] is None: self.averaged_gradients[i] = self.get_flat_partition( self.params_in_partition[i], @@ -498,6 +499,8 @@ def independent_gradient_partition_epilogue(self): for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i],avg_new): accumulated_grad.add_(new_avg_grad) + + self._release_ipg_buffers() # No need to keep the gradients anymore. 
@@ -867,6 +870,7 @@ def async_inplace_copy_grad_to_fp32_buffer_from_gpu(self, param): src_tensor = param.grad.view(-1).narrow(0, source_offset, num_elements).float() dest_tensor.copy_(src_tensor, non_blocking=True) + param.grad=None def complete_grad_norm_calculation_for_cpu_offload(self, params): total_norm = 0.0 @@ -899,25 +903,19 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params): def copy_grads_in_partition(self, param): if self.cpu_offload: - #print(f"GAS: {self.gradient_accumulation_steps}") - #print(f"GAS: {self.is_gradient_accumulation_boundary}") - #with torch.cuda.stream(torch.cuda.current_stream()): - - self.update_overflow_tracker_for_param_grad(param) - + if self.gradient_accumulation_steps > 1: self.async_accumulate_grad_in_cpu_via_gpu(param) if self.is_gradient_accumulation_boundary: self.set_norm_for_param_grad_in_gpu(param) + + self.update_overflow_tracker_for_param_grad(param) + self.async_inplace_copy_grad_to_fp32_buffer_from_gpu(param) - #new_grad_tensor = async_copy_to(param.grad.view(-1), - # 'cpu', - # self.cpu_computation_stream) - #param.grad.data = new_grad_tensor.data.view_as(param.grad) return - + #print(f"ID {self.get_param_id(param)} grad norm {param.grad.norm()}") if self.grads_in_partition is None: self.grads_in_partition_offset = 0 total_size = 0 @@ -938,6 +936,7 @@ def copy_grads_in_partition(self, param): param.numel()) new_grad_tensor.copy_(param.grad.view(-1)) param.grad.data = new_grad_tensor.data.view_as(param.grad) + #print(f"Grad norm after copy to contiguous_buffer {param.grad.data.norm()}") self.grads_in_partition_offset += param.numel() def reduce_ipg_grads(self): @@ -1319,6 +1318,7 @@ def free_grad_in_param_list(self, param_list): def reset_cpu_buffers(self): self.norm_for_param_grads = {} + self.local_overflow = False with torch.cuda.stream(self.migration_stream): for key, value in self.accumulated_grads_in_cpu.items(): value.mul_(0.0) @@ -1327,7 +1327,7 @@ def step(self, closure=None): """ Not supporting closure. """ - self.micro_step_id = 0 + self.micro_step_id = -1 if self.cpu_offload: torch.cuda.current_stream().wait_stream(self.migration_stream) @@ -1346,6 +1346,8 @@ def step(self, closure=None): self.zero_grad() if self.cpu_offload: self.reset_cpu_buffers() + else: + self.averaged_gradients = {} see_memory_usage('After overflow after clearing gradients') @@ -1557,6 +1559,7 @@ def backward(self, loss, retain_graph=False): 2. scaled_loss = fp32_loss*loss_scale 3. 
scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves """ + self.micro_step_id += 1 if self.cpu_offload: torch.cuda.current_stream().wait_stream(self.migration_stream) @@ -1576,7 +1579,6 @@ def backward(self, loss, retain_graph=False): self.ipg_index = 0 self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) - self.micro_step_id += 1 def check_overflow(self, partition_gradients=True): self._check_overflow(partition_gradients) From 130dd70bfe289406f9a46dce74c77842cd0d405b Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Thu, 3 Sep 2020 20:39:47 -0700 Subject: [PATCH 10/44] Import path fixes + conditional imports (#358) * use relative imports and add support for conditional op imports * formatting and llvm command check change * fix remaining absolute import * hide the isntalled ops var * fix unit tests Co-authored-by: Reza Yazdani --- deepspeed/__init__.py | 18 +++++++++-------- deepspeed/ops/__init__.py | 7 +++++++ deepspeed/ops/adam/__init__.py | 2 +- deepspeed/ops/adam/cpu_adam.py | 6 +++++- setup.py | 34 +++++++++++++++++++++++++-------- tests/unit/adam_test.py | 2 +- tests/unit/test_adam_acuracy.py | 4 ++-- 7 files changed, 52 insertions(+), 21 deletions(-) diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index de5d1efcd1aa..a6c906f1119a 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -4,16 +4,18 @@ import sys import types -from deepspeed.runtime.engine import DeepSpeedEngine -from deepspeed.runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER -from deepspeed.runtime.lr_schedules import add_tuning_arguments -from deepspeed.runtime.config import DeepSpeedConfig -from deepspeed.runtime.activation_checkpointing import checkpointing -from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig -from deepspeed.utils import logger +from . import ops + +from .runtime.engine import DeepSpeedEngine +from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER +from .runtime.lr_schedules import add_tuning_arguments +from .runtime.config import DeepSpeedConfig +from .runtime.activation_checkpointing import checkpointing +from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +from .utils import logger try: - from deepspeed.git_version_info import version, git_hash, git_branch + from .git_version_info import version, git_hash, git_branch except ImportError: version = "0.0.0+unknown" git_hash = None diff --git a/deepspeed/ops/__init__.py b/deepspeed/ops/__init__.py index e69de29bb2d1..6c4187415aae 100644 --- a/deepspeed/ops/__init__.py +++ b/deepspeed/ops/__init__.py @@ -0,0 +1,7 @@ +from ..git_version_info import installed_ops as __installed_ops__ +from . import lamb +from . import transformer +if __installed_ops__['sparse-attn']: + from . import sparse_attention +if __installed_ops__['cpu-adam']: + from . 
import adam diff --git a/deepspeed/ops/adam/__init__.py b/deepspeed/ops/adam/__init__.py index 13883dc7b590..1d8844409374 100755 --- a/deepspeed/ops/adam/__init__.py +++ b/deepspeed/ops/adam/__init__.py @@ -1 +1 @@ -from deepspeed.ops.adam.cpu_adam import DeepSpeedCPUAdam +from .cpu_adam import DeepSpeedCPUAdam diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py index 5387090e3815..888be4dc9ad4 100755 --- a/deepspeed/ops/adam/cpu_adam.py +++ b/deepspeed/ops/adam/cpu_adam.py @@ -1,7 +1,8 @@ import math import torch -import deepspeed.ops.adam.cpu_adam_op as ds_opt_adam +import importlib +ds_opt_adam = None class DeepSpeedCPUAdam(torch.optim.Optimizer): @@ -24,6 +25,9 @@ def __init__(self, super(DeepSpeedCPUAdam, self).__init__(model_params, default_args) self.opt_id = DeepSpeedCPUAdam.optimizer_id DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1 + + global ds_opt_adam + ds_opt_adam = importlib.import_module('deepspeed.ops.adam.cpu_adam_op') ds_opt_adam.create_adam(self.opt_id, lr, bettas[0], bettas[1], eps, weight_decay) def __setstate__(self, state): diff --git a/setup.py b/setup.py index 02f2f7fd005a..ae5010a65b11 100755 --- a/setup.py +++ b/setup.py @@ -27,6 +27,12 @@ def fetch_requirements(path): dev_requires = fetch_requirements('requirements/requirements-dev.txt') sparse_attn_requires = fetch_requirements('requirements/requirements-sparse-attn.txt') +# Constants for each op +LAMB = "lamb" +TRANSFORMER = "transformer" +SPARSE_ATTN = "sparse-attn" +ADAM = "cpu-adam" + # Build environment variables for custom builds DS_BUILD_LAMB_MASK = 1 DS_BUILD_TRANSFORMER_MASK = 10 @@ -50,15 +56,15 @@ def fetch_requirements(path): BUILD_MASK = (DS_BUILD_LAMB | DS_BUILD_TRANSFORMER | DS_BUILD_SPARSE_ATTN | DS_BUILD_ADAM) -install_ops = [] +install_ops = dict.fromkeys([LAMB, TRANSFORMER, SPARSE_ATTN, ADAM], False) if BUILD_MASK & DS_BUILD_LAMB: - install_ops.append('lamb') + install_ops[LAMB] = True if BUILD_MASK & DS_BUILD_ADAM: - install_ops.append('adam') + install_ops[ADAM] = True if BUILD_MASK & DS_BUILD_TRANSFORMER: - install_ops.append('transformer') + install_ops[TRANSFORMER] = True if BUILD_MASK & DS_BUILD_SPARSE_ATTN: - install_ops.append('sparse-attn') + install_ops[SPARSE_ATTN] = True if len(install_ops) == 0: print("Building without any cuda/cpp extensions") print(f'BUILD_MASK={BUILD_MASK}, install_ops={install_ops}') @@ -196,14 +202,21 @@ def fetch_requirements(path): def command_exists(cmd): - result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) - return result.wait() == 0 + if '|' in cmd: + cmds = cmd.split("|") + else: + cmds = [cmd] + valid = False + for cmd in cmds: + result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + valid = valid or result.wait() == 0 + return valid ## Sparse transformer ## if BUILD_MASK & DS_BUILD_SPARSE_ATTN: # Check to see if llvm and cmake are installed since they are dependencies - required_commands = ['llc-9', 'cmake'] + required_commands = ['llvm-config|llvm-config-9', 'cmake'] command_status = list(map(command_exists, required_commands)) if not all(command_status): @@ -213,6 +226,8 @@ def command_exists(cmd): ) warnings.warn( 'Skipping sparse attention installation due to missing required packages') + # remove from installed ops list + install_ops[SPARSE_ATTN] = False elif TORCH_MAJOR == 1 and TORCH_MINOR >= 5: ext_modules.append( CppExtension(name='deepspeed.ops.sparse_attention.cpp_utils', @@ -223,6 +238,8 @@ def command_exists(cmd): install_requires += 
sparse_attn_requires else: warnings.warn('Unable to meet requirements to install sparse attention') + # remove from installed ops list + install_ops[SPARSE_ATTN] = False # Add development requirements install_requires += dev_requires @@ -243,6 +260,7 @@ def command_exists(cmd): fd.write(f"version='{VERSION}+{git_hash}'\n") fd.write(f"git_hash='{git_hash}'\n") fd.write(f"git_branch='{git_branch}'\n") + fd.write(f"installed_ops={install_ops}\n") print(f'install_requires={install_requires}') diff --git a/tests/unit/adam_test.py b/tests/unit/adam_test.py index 8e6caa382c12..0fedfeb18e42 100755 --- a/tests/unit/adam_test.py +++ b/tests/unit/adam_test.py @@ -1,5 +1,5 @@ import torch -from deepspeed import DeepSpeedCPUAdam +from deepspeed.ops.adam import DeepSpeedCPUAdam import time device = 'cpu' diff --git a/tests/unit/test_adam_acuracy.py b/tests/unit/test_adam_acuracy.py index 410543f516d1..b002231622f4 100755 --- a/tests/unit/test_adam_acuracy.py +++ b/tests/unit/test_adam_acuracy.py @@ -6,10 +6,10 @@ import pytest import copy -from deepspeed import DeepSpeedCPUAdam +from deepspeed.ops.adam import DeepSpeedCPUAdam def check_equal(first, second, atol=1e-2, verbose=False): - if verbos: + if verbose: print(first) print(second) x = first.detach().numpy() From ea5b991802bc68f68f245ba746d7a5ca18a8b76e Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Fri, 4 Sep 2020 05:06:51 +0000 Subject: [PATCH 11/44] Enable contiguous gradients for cpu_offload --- deepspeed/runtime/zero/stage2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index 69cdcf34adf1..40910d9cff0c 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -293,7 +293,8 @@ def __init__(self, #map between param_id and bool to specify if a param is in this partition self.is_param_in_current_partition = {} - self.contiguous_gradients = contiguous_gradients + # CPU-Offload requires contiguous gradients + self.contiguous_gradients = contiguous_gradients or cpu_offload self.grads_in_ipg_bucket = [] self.params_in_ipg_bucket = [] self.elements_in_ipg_bucket = 0 From 7be128afcc06b29b041b4dbabfee64978d152eb6 Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Thu, 3 Sep 2020 22:48:41 -0700 Subject: [PATCH 12/44] Allocating CPU memory directly on CPU without transfering them from GPU (#360) * Allocating CPU memory directly on CPU without transfering them from GPU * formatting fixes --- deepspeed/ops/adam/cpu_adam.py | 1 + deepspeed/runtime/zero/stage1.py | 2 +- deepspeed/runtime/zero/stage2.py | 24 +++++++++++++----------- tests/unit/test_adam_acuracy.py | 7 ++++--- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py index 888be4dc9ad4..8407da73128d 100755 --- a/deepspeed/ops/adam/cpu_adam.py +++ b/deepspeed/ops/adam/cpu_adam.py @@ -4,6 +4,7 @@ ds_opt_adam = None + class DeepSpeedCPUAdam(torch.optim.Optimizer): optimizer_id = 0 diff --git a/deepspeed/runtime/zero/stage1.py b/deepspeed/runtime/zero/stage1.py index 6f8953daa4cc..a8e50c591e1f 100755 --- a/deepspeed/runtime/zero/stage1.py +++ b/deepspeed/runtime/zero/stage1.py @@ -798,7 +798,7 @@ def _get_state_without_padding(self, state_with_padding, padding): lean_state[key] = value[:lean_length] else: lean_state[key] = value - + return lean_state # Return base optimizer states. 
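The stage2.py hunks that follow change where several offload buffers are materialized: instead of building a tensor on the GPU (or upcasting it there) and then shipping it to the host, the pinned host buffer and the fp32 master partition are created on the CPU directly. A minimal before/after sketch of that allocation pattern, assuming a CUDA device is available; `fp16_partition` is a placeholder tensor, not a name from the code:

```python
import torch

fp16_partition = torch.randn(4096, dtype=torch.half, device='cuda')  # placeholder shard

# Old pattern: clone and upcast on the GPU, then transfer the fp32 copy to the
# host. This briefly allocates an extra fp32 tensor on the GPU and moves
# 4 bytes per element over PCIe.
master_old = fp16_partition.clone().float().detach().to('cpu')

# New pattern: move the fp16 shard to the host first and build the fp32 master
# copy there, avoiding the GPU-side fp32 temporary and halving the transfer.
master_new = fp16_partition.to('cpu').clone().float().detach()

# Pinned CPU scratch buffers are likewise allocated host-side up front.
pinned_buf = torch.zeros(4096, dtype=torch.half, device='cpu').pin_memory()
```

The same reordering is applied to `single_partition_of_fp32_groups` in the diff below, and the temporary and accumulated CPU gradient buffers gain an explicit `device=self.device` on their pinned allocations.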
diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index c0abe6c13f02..f91677499ce5 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -262,8 +262,8 @@ def __init__(self, # a partition of the fp32 master weights that will be updated by this process self.single_partition_of_fp32_groups.append( - self.parallel_partitioned_fp16_groups[i] - [partition_id].clone().float().detach().to(self.device)) + self.parallel_partitioned_fp16_groups[i][partition_id].to( + self.device).clone().float().detach()) # modify optimizer of have flat master weight self.single_partition_of_fp32_groups[ @@ -331,7 +331,8 @@ def __init__(self, self.local_overflow = False self.grad_position = {} self.temp_grad_buffer_for_cpu_offload = torch.zeros( - largest_param_numel).half().pin_memory() + largest_param_numel, + device=self.device).half().pin_memory() self.temp_grad_buffer_for_gpu_offload = torch.zeros( largest_param_numel, device=torch.cuda.current_device()).half() @@ -478,10 +479,10 @@ def independent_gradient_partition_epilogue(self): if self.overlap_comm: torch.cuda.synchronize() - + if self.cpu_offload is False: for i, _ in enumerate(self.fp16_groups): - + if not i in self.averaged_gradients or self.averaged_gradients[i] is None: self.averaged_gradients[i] = self.get_flat_partition( self.params_in_partition[i], @@ -500,8 +501,7 @@ def independent_gradient_partition_epilogue(self): for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i],avg_new): accumulated_grad.add_(new_avg_grad) - - + self._release_ipg_buffers() # No need to keep the gradients anymore. @@ -788,7 +788,8 @@ def async_accumulate_grad_in_cpu(self, param): if param_id not in self.accumulated_grads_in_cpu: self.accumulated_grads_in_cpu[param_id] = torch.zeros( param.numel(), - dtype=param.dtype).pin_memory() + dtype=param.dtype, + device=self.device).pin_memory() self.accumulated_grads_in_cpu[param_id].add_(dest_buffer) @@ -804,7 +805,8 @@ def async_accumulate_grad_in_cpu_via_gpu(self, param): if param_id not in self.accumulated_grads_in_cpu: self.accumulated_grads_in_cpu[param_id] = torch.zeros( param.numel(), - dtype=param.dtype).pin_memory() + dtype=param.dtype, + device=self.device).pin_memory() if self.micro_step_id > 0: dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1), @@ -871,7 +873,7 @@ def async_inplace_copy_grad_to_fp32_buffer_from_gpu(self, param): src_tensor = param.grad.view(-1).narrow(0, source_offset, num_elements).float() dest_tensor.copy_(src_tensor, non_blocking=True) - param.grad=None + param.grad = None def complete_grad_norm_calculation_for_cpu_offload(self, params): total_norm = 0.0 @@ -904,7 +906,7 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params): def copy_grads_in_partition(self, param): if self.cpu_offload: - + if self.gradient_accumulation_steps > 1: self.async_accumulate_grad_in_cpu_via_gpu(param) diff --git a/tests/unit/test_adam_acuracy.py b/tests/unit/test_adam_acuracy.py index b002231622f4..e616e140bdde 100755 --- a/tests/unit/test_adam_acuracy.py +++ b/tests/unit/test_adam_acuracy.py @@ -8,6 +8,7 @@ from deepspeed.ops.adam import DeepSpeedCPUAdam + def check_equal(first, second, atol=1e-2, verbose=False): if verbose: print(first) @@ -36,11 +37,11 @@ def test_adam_opt(model_size): for i in range(10): rng_state = torch.get_rng_state() - param.grad=torch.randn(model_size, device=device) + param.grad = torch.randn(model_size, device=device) torch.set_rng_state(rng_state) - param1.grad=torch.randn(model_size, 
device=device) + param1.grad = torch.randn(model_size, device=device) optimizer.step() optimizer1.step() - check_equal(param, param1, atol = 1e-2, verbose=True) + check_equal(param, param1, atol=1e-2, verbose=True) From 1a4a82b6cd439891167dae2bfc452d9608d2d674 Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Thu, 3 Sep 2020 23:00:27 -0700 Subject: [PATCH 13/44] change gpt2 pretrain to have DeepSpeed adam (#361) Co-authored-by: Reza Yazdani --- deepspeed/runtime/engine.py | 10 +++++++++- deepspeed/runtime/zero/stage2.py | 6 +++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index f77fc64de7d2..f358be2b8223 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -13,6 +13,7 @@ from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1 +from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.runtime.activation_checkpointing import checkpointing as activation_checkpointing from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer @@ -292,6 +293,9 @@ def zero_overlap_comm(self): def zero_cpu_offload(self): return self._config.zero_config.cpu_offload + def deepspeed_adam(self): + return self._config.zero_config.deepspeed_adam + def zero_optimization_stage(self): return self._config.zero_optimization_stage @@ -532,7 +536,11 @@ def _configure_basic_optimizer(self, model_parameters): ) if self.optimizer_name() == ADAM_OPTIMIZER: if self.zero_cpu_offload(): - optimizer = torch.optim.Adam(model_parameters, **optimizer_parameters) + if False: #self.deepspeed_adam(): + optimizer = DeepSpeedCPUAdam(model_parameters, **optimizer_parameters) + else: + optimizer = torch.optim.Adam(model_parameters, **optimizer_parameters) + else: from apex.optimizers.fused_adam import FusedAdam optimizer = FusedAdam(model_parameters, **optimizer_parameters) diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index f91677499ce5..63fd2d80d3e2 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -1408,14 +1408,14 @@ def step(self, closure=None): self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) #torch.set_num_threads(12) timers('optimizer_step').start() - self.optimizer.step() + self.optimizer.step(fp16_param_groups=self.parallel_partitioned_fp16_groups) #get rid of the fp32 gradients. 
Not needed anymore if not self.cpu_offload: for group in self.single_partition_of_fp32_groups: group.grad = None - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - fp16_partitions[partition_id].data.copy_(fp32_partition.data) + #for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): + # fp16_partitions[partition_id].data.copy_(fp32_partition.data) timers('optimizer_step').stop() if self.cpu_offload: From ac12833ea790932eb2c23747375279ebc9aecfdf Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Thu, 3 Sep 2020 23:35:03 -0700 Subject: [PATCH 14/44] Jekyll installation instructions (#351) --- docs/Gemfile.lock | 6 +++--- docs/README.md | 40 +++++++++++++++++++++++++++++++++------- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 5261407799ec..728ca8f3f1bb 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -81,7 +81,7 @@ GEM octokit (~> 4.0) public_suffix (~> 3.0) typhoeus (~> 1.3) - html-pipeline (2.13.0) + html-pipeline (2.14.0) activesupport (>= 2) nokogiri (>= 1.4) http_parser.rb (0.6.0) @@ -94,7 +94,7 @@ GEM i18n (~> 0.7) jekyll-sass-converter (~> 1.0) jekyll-watch (~> 2.0) - kramdown (>= 2.30, < 3) + kramdown (>= 1.17, < 3) liquid (~> 4.0) mercenary (~> 0.3.3) pathutil (~> 0.9) @@ -215,7 +215,7 @@ GEM jekyll-include-cache (~> 0.1) jekyll-paginate (~> 1.1) jekyll-sitemap (~> 1.3) - minitest (5.14.1) + minitest (5.14.2) multipart-post (2.1.1) nokogiri (1.10.10) mini_portile2 (~> 2.4.0) diff --git a/docs/README.md b/docs/README.md index c0095845b625..be535b3ab8a2 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,15 +1,41 @@ # DeepSpeed Documentation -This directory includes the documentation of DeepSpeed. There are three ways to read it: +This directory includes the source code for the website and documentation of DeepSpeed. The `code-docs/` directory is used to build [deepspeed.readthedocs.io](https://deepspeed.readthedocs.io/en/latest/). -## 1. Access [deepspeed.ai](https://www.deepspeed.ai/) +[deepspeed.ai](https://www.deepspeed.ai/) is the recommended way to read all DeepSpeed documentation. Directly viewing the Markdown files in this directory will not include images and other features. -This is the most recommended way to read the documentation. +## Building the documentation locally +You can serve the DeepSpeed website locally. This is especially useful for development. -## 2. Directly read files in this directory +### Prerequisites +The DeepSpeed website relies on [Jekyll](https://jekyllrb.com/). There are several [guides for installation](https://jekyllrb.com/docs/installation/). The instructions below assume you are in an Ubuntu environment and have been tested on WSL. -We do not recommend this way because this directory is organized to build the [deepspeed.ai](https://www.deepspeed.ai/) website using Jekyll. Thus some of the files actually are not DeepSpeed documentation. In addition, some of the url links in the documentation only work through the webpages generated by Jekyll. +First ensure that you have the necessary packages (e.g., `make` and `zlib`). +``` +sudo apt-get install build-essential zlib1g-dev ruby-full +``` -## 3. Build [deepspeed.ai](https://www.deepspeed.ai/) website locally using Jekyll +Add these lines to your `.bashrc` or equivalent to ensure you have permissions to install Ruby packages without `sudo`. 
+``` +export GEM_HOME="$HOME/gems" +export PATH="$HOME/gems/bin:$PATH" +``` +Don't forget to `source ~/.bashrc` afterwards 😊. -This is recommended for local website development or when you do not have internet access. You can follow the instruction at [here](https://help.github.com/en/github/working-with-github-pages/testing-your-github-pages-site-locally-with-jekyll) to install Ruby, Bundler, and Jekyll. Then run `bundle exec jekyll serve` at this directory so that you can view the website in your web browser at `http://localhost:4000`. + +Now we can install Jekyll and [Bundler](https://bundler.io/): +``` +gem install jekyll bundler +``` + +### Start a local webserver +We now need to install the required Ruby packages for the website. **NOTE**: this step frequently hangs when connected to a VPN (including MSVPN). Simply disconnect for the package installation. +``` +bundle install +``` + +You can now start a local webserver via: +``` +bundle exec jekyll serve +``` +The website should now be accessible at [http://localhost:4000](http://localhost:4000) From 253b04479301715e6be156f780766c1e5b1a9dbe Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 4 Sep 2020 06:44:56 -0700 Subject: [PATCH 15/44] Generalize detection of ZeRO supported optimizers (#349) * Improve test for ZeRO supported optimizers * Rename test function * Format fixes * Add model tests that wraps client FusedAdam with fused fp16 optimizer * Format fixes --- deepspeed/runtime/engine.py | 22 +++++++----- deepspeed/runtime/zero/utils.py | 9 ++++- .../ds_config_func_bs4_zero1.json | 1 - .../ds_config_func_bs4_zero2.json | 1 - .../ds_config_func_bs4_zero2_offload.json | 1 - .../ds_config_func_bs8_zero0_gas3.json | 11 ++---- .../ds_config_func_bs8_zero1.json | 1 - .../ds_config_func_bs8_zero2_gas3.json | 11 ++---- .../Megatron_GPT2/ds_config_perf_bs16.json | 7 ---- .../Megatron_GPT2/ds_config_perf_bs32.json | 7 ---- .../Megatron_GPT2/ds_config_perf_bs8.json | 7 ---- tests/model/Megatron_GPT2/run_func_test.py | 21 +++++++++++ tests/unit/test_fp16.py | 36 +++++++++++++++++++ 13 files changed, 83 insertions(+), 52 deletions(-) mode change 100644 => 100755 deepspeed/runtime/zero/utils.py mode change 100644 => 100755 tests/model/Megatron_GPT2/ds_config_perf_bs16.json mode change 100644 => 100755 tests/model/Megatron_GPT2/ds_config_perf_bs8.json diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index f358be2b8223..c61f53999126 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -6,6 +6,8 @@ import torch import warnings import torch.distributed as dist + +import apex from apex import amp from torch.nn.modules import Module from torch.distributed.distributed_c10d import _get_global_rank @@ -13,6 +15,7 @@ from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1 +from deepspeed.runtime.zero.utils import is_zero_supported_optimizer from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.runtime.activation_checkpointing import checkpointing as activation_checkpointing from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer @@ -264,7 +267,7 @@ def train_micro_batch_size_per_gpu(self): return self._config.train_micro_batch_size_per_gpu def optimizer_name(self): - return self._config.optimizer_name + return self.client_optimizer.__class__.__name__ if self.client_optimizer else self._config.optimizer_name def optimizer_params(self): return self._config.optimizer_params @@ -295,7 +298,7 @@ 
def zero_cpu_offload(self): def deepspeed_adam(self): return self._config.zero_config.deepspeed_adam - + def zero_optimization_stage(self): return self._config.zero_optimization_stage @@ -506,7 +509,7 @@ def _configure_optimizer(self, client_optimizer, model_parameters): if self.zero_optimization(): assert not self.amp_enabled(), "Amp and ZeRO are not currently compatible, please use (legacy) fp16 mode which performs similar to amp opt_mode=O2" - if self.optimizer_name() not in [ADAM_OPTIMIZER]: + if not is_zero_supported_optimizer(basic_optimizer): assert self.zero_allow_untested_optimizer(), \ 'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.' @@ -536,11 +539,13 @@ def _configure_basic_optimizer(self, model_parameters): ) if self.optimizer_name() == ADAM_OPTIMIZER: if self.zero_cpu_offload(): - if False: #self.deepspeed_adam(): - optimizer = DeepSpeedCPUAdam(model_parameters, **optimizer_parameters) + if False: #self.deepspeed_adam(): + optimizer = DeepSpeedCPUAdam(model_parameters, + **optimizer_parameters) else: - optimizer = torch.optim.Adam(model_parameters, **optimizer_parameters) - + optimizer = torch.optim.Adam(model_parameters, + **optimizer_parameters) + else: from apex.optimizers.fused_adam import FusedAdam optimizer = FusedAdam(model_parameters, **optimizer_parameters) @@ -555,7 +560,8 @@ def _configure_fp16_optimizer(self, optimizer): initial_dynamic_scale = self.initial_dynamic_scale() dynamic_loss_args = self.dynamic_loss_scale_args() clip_grad = self.gradient_clipping() - if self.optimizer_name() == ADAM_OPTIMIZER: + + if isinstance(optimizer, apex.optimizers.FusedAdam): if self.dynamic_loss_scale(): logger.info('Creating fp16 optimizer with dynamic loss scale') timers = self.timers if self.wall_clock_breakdown() else None diff --git a/deepspeed/runtime/zero/utils.py b/deepspeed/runtime/zero/utils.py old mode 100644 new mode 100755 index 86d1930a9223..ba7cf8e87ebe --- a/deepspeed/runtime/zero/utils.py +++ b/deepspeed/runtime/zero/utils.py @@ -1,6 +1,6 @@ import torch import torch.distributed as dist - +import apex from deepspeed.utils import logger @@ -20,3 +20,10 @@ def _initialize_parameter_parallel_groups(parameter_parallel_size=None): if rank in ranks: my_group = group return my_group + + +ZERO_SUPPORTED_OPTIMIZERS = [torch.optim.Adam, apex.optimizers.FusedAdam] + + +def is_zero_supported_optimizer(optimizer): + return type(optimizer) in ZERO_SUPPORTED_OPTIMIZERS diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json index c3322eca8138..33562614aa9a 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json @@ -5,7 +5,6 @@ "zero_optimization": { "stage": 1 }, - "zero_allow_untested_optimizer": true, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json index f6a6db57daf2..afc33ee7ea0d 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json @@ -8,7 +8,6 @@ "allgather_bucket_size": 7000000, "reduce_scatter": true }, - "zero_allow_untested_optimizer": true, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json 
index ad054d31bb66..a436fe9f04c6 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json @@ -9,7 +9,6 @@ "reduce_scatter": true, "cpu_offload": true }, - "zero_allow_untested_optimizer": true, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas3.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas3.json index 5c5efb0a7e57..166db751361a 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas3.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas3.json @@ -1,19 +1,13 @@ { - "train_micro_batch_size_per_gpu":8, + "train_micro_batch_size_per_gpu": 8, "gradient_accumulation_steps": 3, "steps_per_print": 1, "zero_optimization": { - "stage":0, + "stage": 0, "reduce_bucket_size": 7000000, "allgather_bucket_size": 7000000, "reduce_scatter": true }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, "gradient_clipping": 1.0, "fp16": { "enabled": true, @@ -26,5 +20,4 @@ "partition_activations": true, "contiguous_memory_optimization": true } - } diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json index 342fd665ccae..ca4d5ed9dfef 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json @@ -5,7 +5,6 @@ "zero_optimization": { "stage": 1 }, - "zero_allow_untested_optimizer": true, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas3.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas3.json index e245309603b8..531975c499f5 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas3.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas3.json @@ -1,19 +1,13 @@ { - "train_micro_batch_size_per_gpu":8, + "train_micro_batch_size_per_gpu": 8, "gradient_accumulation_steps": 3, "steps_per_print": 1, "zero_optimization": { - "stage":2, + "stage": 2, "reduce_bucket_size": 7000000, "allgather_bucket_size": 7000000, "reduce_scatter": true }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, "gradient_clipping": 1.0, "fp16": { "enabled": true, @@ -26,5 +20,4 @@ "partition_activations": true, "contiguous_memory_optimization": true } - } diff --git a/tests/model/Megatron_GPT2/ds_config_perf_bs16.json b/tests/model/Megatron_GPT2/ds_config_perf_bs16.json old mode 100644 new mode 100755 index a40f3e4c7d44..ae22cf625c69 --- a/tests/model/Megatron_GPT2/ds_config_perf_bs16.json +++ b/tests/model/Megatron_GPT2/ds_config_perf_bs16.json @@ -5,14 +5,7 @@ "zero_optimization": { "stage": 1 }, - "zero_allow_untested_optimizer": true, "disable_allgather": true, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_perf_bs32.json b/tests/model/Megatron_GPT2/ds_config_perf_bs32.json index 096a0d3645cd..64fb73e9bec8 100755 --- a/tests/model/Megatron_GPT2/ds_config_perf_bs32.json +++ b/tests/model/Megatron_GPT2/ds_config_perf_bs32.json @@ -5,14 +5,7 @@ "zero_optimization": { "stage": 1 }, - "zero_allow_untested_optimizer": true, "disable_allgather": true, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_perf_bs8.json 
b/tests/model/Megatron_GPT2/ds_config_perf_bs8.json old mode 100644 new mode 100755 index e793e221e1e7..c274b15becf2 --- a/tests/model/Megatron_GPT2/ds_config_perf_bs8.json +++ b/tests/model/Megatron_GPT2/ds_config_perf_bs8.json @@ -5,14 +5,7 @@ "zero_optimization": { "stage": 1 }, - "zero_allow_untested_optimizer": true, "disable_allgather": true, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/run_func_test.py b/tests/model/Megatron_GPT2/run_func_test.py index 77545f1e15a4..68264ca8bc65 100755 --- a/tests/model/Megatron_GPT2/run_func_test.py +++ b/tests/model/Megatron_GPT2/run_func_test.py @@ -48,6 +48,24 @@ def setUp(self): def tearDown(self): os.chdir(self.save_dir) + def test_mp1_gpu2_node1_fp16(self): + test_config = { + "mp": 1, + "gpus": 2, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_no_zero.json", + } + + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + def test_mp1_gpu1_node1_zero1(self): test_config = { "mp": 1, @@ -348,6 +366,9 @@ def check_parity(self, base_file, test_file, r_tol): def suite(): suite = unittest.TestSuite() + + suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_fp16')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero1')) suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero1')) suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero1')) diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py index 69a76a85830d..94a36efc3087 100755 --- a/tests/unit/test_fp16.py +++ b/tests/unit/test_fp16.py @@ -1,4 +1,5 @@ import torch +import apex import deepspeed import argparse import pytest @@ -608,3 +609,38 @@ def _test_adam_amp_o2_empty_grad(args, model, hidden_dim): model.step() _test_adam_amp_o2_empty_grad(args=args, model=model, hidden_dim=hidden_dim) + + +@pytest.mark.parametrize('zero_stage, optimizer_constructor', + [(1, + apex.optimizers.FusedAdam), + (2, + torch.optim.Adam), + (2, + apex.optimizers.FusedAdam)]) +def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1]) + def _test_zero_supported_client_optimizer(args, model, optimizer_constructor): + client_optimizer = optimizer_constructor(params=model.parameters()) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + optimizer=client_optimizer) + + _test_zero_supported_client_optimizer(args=args, + model=model, + optimizer_constructor=optimizer_constructor) From 9ba232aa4e583c894d5d1ddf4ac964870672be1f Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Fri, 4 Sep 2020 17:22:23 +0000 Subject: [PATCH 16/44] everything is working --- deepspeed/runtime/zero/stage2.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index 63fd2d80d3e2..de2d995e7a13 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -1408,14 +1408,15 @@ def step(self, closure=None): self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) 
#torch.set_num_threads(12) timers('optimizer_step').start() - self.optimizer.step(fp16_param_groups=self.parallel_partitioned_fp16_groups) + self.optimizer.step() #fp16_param_groups=self.parallel_partitioned_fp16_groups) #get rid of the fp32 gradients. Not needed anymore if not self.cpu_offload: for group in self.single_partition_of_fp32_groups: group.grad = None - #for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - # fp16_partitions[partition_id].data.copy_(fp32_partition.data) + for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): + fp16_partitions[partition_id].data.copy_(fp32_partition.data) + timers('optimizer_step').stop() if self.cpu_offload: From 606543de7f2921235cb2e01a94e2710ab44d6448 Mon Sep 17 00:00:00 2001 From: RezaYazdaniAminabadi <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Fri, 4 Sep 2020 14:55:27 -0700 Subject: [PATCH 17/44] fixing the cpu_adam API and add deepspeed_adam flag in config.py (#365) * fixing the cpu_adam API and add deepspeed_adam flag in config.py * run precommit --- deepspeed/__init__.py | 2 +- deepspeed/ops/adam/cpu_adam.py | 21 +++++++++++++-------- deepspeed/runtime/config.py | 3 ++- deepspeed/runtime/engine.py | 17 +++++------------ deepspeed/runtime/zero/stage2.py | 6 ++++-- 5 files changed, 25 insertions(+), 24 deletions(-) diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index a6c906f1119a..1e5965461f02 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -7,7 +7,7 @@ from . import ops from .runtime.engine import DeepSpeedEngine -from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER +from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER, DEEPSPEED_ADAM from .runtime.lr_schedules import add_tuning_arguments from .runtime.config import DeepSpeedConfig from .runtime.activation_checkpointing import checkpointing diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py index 8407da73128d..d51e2126cda2 100755 --- a/deepspeed/ops/adam/cpu_adam.py +++ b/deepspeed/ops/adam/cpu_adam.py @@ -24,6 +24,7 @@ def __init__(self, weight_decay=weight_decay, amsgrad=amsgrad) super(DeepSpeedCPUAdam, self).__init__(model_params, default_args) + self.opt_id = DeepSpeedCPUAdam.optimizer_id DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1 @@ -43,33 +44,37 @@ def step(self, closure=None, fp16_param_groups=None): with torch.enable_grad(): loss = closure() - for i, group in enumerate(self.param_groups): - for gid, p in enumerate(group['params']): + for group_id, group in enumerate(self.param_groups): + for param_id, p in enumerate(group['params']): if p.grad is None: continue - grad = p.grad + grad = p.grad.data state = self.state[p] # State initialization if len(state) == 0: state['step'] = 0 # gradient momentums - state['exp_avg'] = torch.zeros_like(p, device='cpu') + state['exp_avg'] = torch.zeros_like(p.data, device='cpu') # gradient variances - state['exp_avg_sq'] = torch.zeros_like(p, device='cpu') + state['exp_avg_sq'] = torch.zeros_like(p.data, device='cpu') exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] state['step'] += 1 if fp16_param_groups is not None: - p_fp16 = fp16_param_groups[i][gid] + p_fp16 = fp16_param_groups[group_id][param_id] ds_opt_adam.adam_update_copy(self.opt_id, - p, + p.data, grad, exp_avg, exp_avg_sq, p_fp16) else: - ds_opt_adam.adam_update(self.opt_id, p, grad, exp_avg, exp_avg_sq) + 
ds_opt_adam.adam_update(self.opt_id, + p.data, + grad, + exp_avg, + exp_avg_sq) return loss diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 7637b091efe9..483209fdffbc 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -17,7 +17,8 @@ TENSOR_CORE_ALIGN_SIZE = 8 ADAM_OPTIMIZER = 'adam' LAMB_OPTIMIZER = 'lamb' -DEEPSPEED_OPTIMIZERS = [ADAM_OPTIMIZER, LAMB_OPTIMIZER] +DEEPSPEED_ADAM = 'deepspeed_adam' +DEEPSPEED_OPTIMIZERS = [ADAM_OPTIMIZER, LAMB_OPTIMIZER, DEEPSPEED_ADAM] def get_amp_enabled(param_dict): diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index c61f53999126..a3d9a442ae4c 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -16,12 +16,11 @@ from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1 from deepspeed.runtime.zero.utils import is_zero_supported_optimizer -from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.runtime.activation_checkpointing import checkpointing as activation_checkpointing from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer from deepspeed.runtime.config import DeepSpeedConfig, \ - ADAM_OPTIMIZER, LAMB_OPTIMIZER, DEEPSPEED_OPTIMIZERS + ADAM_OPTIMIZER, DEEPSPEED_ADAM, LAMB_OPTIMIZER, DEEPSPEED_OPTIMIZERS from deepspeed.runtime.dataloader import DeepSpeedDataLoader from deepspeed.runtime.constants import \ ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \ @@ -296,9 +295,6 @@ def zero_overlap_comm(self): def zero_cpu_offload(self): return self._config.zero_config.cpu_offload - def deepspeed_adam(self): - return self._config.zero_config.deepspeed_adam - def zero_optimization_stage(self): return self._config.zero_optimization_stage @@ -539,16 +535,13 @@ def _configure_basic_optimizer(self, model_parameters): ) if self.optimizer_name() == ADAM_OPTIMIZER: if self.zero_cpu_offload(): - if False: #self.deepspeed_adam(): - optimizer = DeepSpeedCPUAdam(model_parameters, - **optimizer_parameters) - else: - optimizer = torch.optim.Adam(model_parameters, - **optimizer_parameters) - + optimizer = torch.optim.Adam(model_parameters, **optimizer_parameters) else: from apex.optimizers.fused_adam import FusedAdam optimizer = FusedAdam(model_parameters, **optimizer_parameters) + elif self.optimizer_name() == DEEPSPEED_ADAM: + from deepspeed.ops.adam import DeepSpeedCPUAdam + optimizer = DeepSpeedCPUAdam(model_parameters, **optimizer_parameters) elif self.optimizer_name() == LAMB_OPTIMIZER: optimizer = FusedLamb(model_parameters, **optimizer_parameters) else: diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index de2d995e7a13..ae5722fc318d 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -1408,7 +1408,9 @@ def step(self, closure=None): self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) #torch.set_num_threads(12) timers('optimizer_step').start() - self.optimizer.step() #fp16_param_groups=self.parallel_partitioned_fp16_groups) + + #self.optimizer.step(fp16_param_groups=self.parallel_partitioned_fp16_groups) + self.optimizer.step() #get rid of the fp32 gradients. 
Not needed anymore if not self.cpu_offload: for group in self.single_partition_of_fp32_groups: @@ -1416,7 +1418,7 @@ def step(self, closure=None): for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): fp16_partitions[partition_id].data.copy_(fp32_partition.data) - + timers('optimizer_step').stop() if self.cpu_offload: From 1d4b41fe21b4278fa994d1ee72df8eff601b7e8b Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 5 Sep 2020 01:56:43 +0000 Subject: [PATCH 18/44] fixing adam copy fp16-param-add more compile flags for cpu_adam --- csrc/adam/cpu_adam.cpp | 526 ++++++++++++++++++++++++++++----------- csrc/includes/cpu_adam.h | 9 +- setup.py | 6 +- tests/unit/adam_test.py | 10 +- 4 files changed, 398 insertions(+), 153 deletions(-) mode change 100644 => 100755 csrc/adam/cpu_adam.cpp mode change 100644 => 100755 csrc/includes/cpu_adam.h diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp old mode 100644 new mode 100755 index 2d3b521bea1c..ab5b6c0b7054 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -49,54 +49,60 @@ void Adam_Optimizer::Step(float* _params, size_t tile = 0; -#pragma omp parallel for - for (size_t i = 0; i < _param_size; i += SIMD_WIDTH) { - __m512 grad_4 = _mm512_loadu_ps(grads + i); - - __m512 momntum_4 = _mm512_loadu_ps(_exp_avg + i); - __m512 varianc_4 = _mm512_loadu_ps(_exp_avg_sq + i); - - __m512 param_4 = _mm512_loadu_ps(_params + i); - - if (_weight_decay > 0) { - __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); - grad_4 = _mm512_fmadd_ps(param_4, weight_decay4, grad_4); - } + for (size_t t = 0; t < _param_size; t += TILE) + { + size_t copy_size = TILE; + if((t + TILE) > _param_size)copy_size = _param_size - t; + size_t offset = copy_size + t; + #pragma omp parallel for + for(size_t i = t; i < offset;i += SIMD_WIDTH) + { + __m512 grad_4 = _mm512_loadu_ps(grads + i); + + __m512 momntum_4 = _mm512_loadu_ps(_exp_avg + i); + __m512 varianc_4 = _mm512_loadu_ps(_exp_avg_sq + i); + + __m512 param_4 = _mm512_loadu_ps(_params + i); + + if (_weight_decay > 0) { + __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); + grad_4 = _mm512_fmadd_ps(param_4, weight_decay4, grad_4); + } - momntum_4 = _mm512_mul_ps(momntum_4, betta1_4); - momntum_4 = _mm512_fmadd_ps(grad_4, betta1_minus1_4, momntum_4); + momntum_4 = _mm512_mul_ps(momntum_4, betta1_4); + momntum_4 = _mm512_fmadd_ps(grad_4, betta1_minus1_4, momntum_4); - varianc_4 = _mm512_mul_ps(varianc_4, betta2_4); - grad_4 = _mm512_mul_ps(grad_4, grad_4); - varianc_4 = _mm512_fmadd_ps(grad_4, betta2_minus1_4, varianc_4); + varianc_4 = _mm512_mul_ps(varianc_4, betta2_4); + grad_4 = _mm512_mul_ps(grad_4, grad_4); + varianc_4 = _mm512_fmadd_ps(grad_4, betta2_minus1_4, varianc_4); - grad_4 = _mm512_sqrt_ps(varianc_4) / bias2_sqrt; - grad_4 = _mm512_add_ps(grad_4, eps_4); - grad_4 = _mm512_div_ps(momntum_4, grad_4); + grad_4 = _mm512_sqrt_ps(varianc_4) / bias2_sqrt; + grad_4 = _mm512_add_ps(grad_4, eps_4); + grad_4 = _mm512_div_ps(momntum_4, grad_4); - param_4 = _mm512_fmadd_ps(grad_4, step_size_4, param_4); + param_4 = _mm512_fmadd_ps(grad_4, step_size_4, param_4); - _mm512_storeu_ps(_params + i, param_4); - _mm512_storeu_ps(_exp_avg + i, momntum_4); - _mm512_storeu_ps(_exp_avg_sq + i, varianc_4); - if (dev_params) { - if ((i + SIMD_WIDTH) % TILE == 0) { - size_t offset = tile * TILE; -#pragma omp parallel for - for (size_t j = 0; j < TILE; j += 4) { - _doubled_buffer[buf_index][j] = (__half)_params[offset + j]; - _doubled_buffer[buf_index][j + 1] 
= (__half)_params[offset + j + 1]; - _doubled_buffer[buf_index][j + 2] = (__half)_params[offset + j + 2]; - _doubled_buffer[buf_index][j + 3] = (__half)_params[offset + j + 3]; - } - CUDA_CHECK(cudaMemcpyAsync(dev_params + tile * TILE, - _doubled_buffer[buf_index], - TILE * sizeof(__half), - cudaMemcpyHostToDevice, - Context::Instance().GetCurrentStream())); - buf_index = !buf_index; - tile++; + _mm512_storeu_ps(_params + i, param_4); + _mm512_storeu_ps(_exp_avg + i, momntum_4); + _mm512_storeu_ps(_exp_avg_sq + i, varianc_4); + } + if (dev_params) + { + #pragma omp parallel for + for (size_t j = 0; j < copy_size; j += 4) + { + _doubled_buffer[buf_index][j] = (__half)_params[t + j]; + _doubled_buffer[buf_index][j + 1] = (__half)_params[t + j + 1]; + _doubled_buffer[buf_index][j + 2] = (__half)_params[t + j + 2]; + _doubled_buffer[buf_index][j + 3] = (__half)_params[t + j + 3]; } + + CUDA_CHECK(cudaMemcpyAsync(dev_params + t, + _doubled_buffer[buf_index], + copy_size * sizeof(__half), + cudaMemcpyHostToDevice, + Context::Instance().GetCurrentStream())); + buf_index = !buf_index; } } } @@ -134,113 +140,120 @@ void Adam_Optimizer::Step_4(float* _params, __m512 bias2_sqrt = _mm512_sqrt_ps(bias_correction2_4); + for (size_t t = 0; t < _param_size; t += TILE) + { + size_t copy_size = TILE; + if((t + TILE) > _param_size)copy_size = _param_size - t; + size_t offset = copy_size + t; #pragma omp parallel for - for (size_t i = 0; i < _param_size; i += (SIMD_WIDTH << 2)) { - __m512 grad_4[4]; - grad_4[0] = _mm512_loadu_ps(grads + i); - grad_4[1] = _mm512_loadu_ps(grads + i + SIMD_WIDTH); - grad_4[2] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); - grad_4[3] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); - - __m512 momntum_4[2]; - momntum_4[0] = _mm512_loadu_ps(_exp_avg + i); - momntum_4[1] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); - momntum_4[2] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); - momntum_4[3] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); - - __m512 varianc_4[2]; - varianc_4[0] = _mm512_loadu_ps(_exp_avg_sq + i); - varianc_4[1] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); - varianc_4[2] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); - varianc_4[3] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); - - __m512 param_4[2]; - param_4[0] = _mm512_loadu_ps(_params + i); - param_4[1] = _mm512_loadu_ps(_params + i + SIMD_WIDTH); - param_4[2] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); - param_4[3] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); - - if (_weight_decay > 0) { - __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); - grad_4[0] = _mm512_fmadd_ps(param_4[0], weight_decay4, grad_4[0]); - grad_4[1] = _mm512_fmadd_ps(param_4[1], weight_decay4, grad_4[1]); - grad_4[2] = _mm512_fmadd_ps(param_4[2], weight_decay4, grad_4[2]); - grad_4[3] = _mm512_fmadd_ps(param_4[3], weight_decay4, grad_4[3]); + for(size_t i = t; i < offset;i += (SIMD_WIDTH << 2)) + { + __m512 grad_4[4]; + grad_4[0] = _mm512_loadu_ps(grads + i); + grad_4[1] = _mm512_loadu_ps(grads + i + SIMD_WIDTH); + grad_4[2] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); + grad_4[3] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); + + __m512 momntum_4[2]; + momntum_4[0] = _mm512_loadu_ps(_exp_avg + i); + momntum_4[1] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); + momntum_4[2] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); + momntum_4[3] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); + + __m512 varianc_4[2]; + varianc_4[0] = _mm512_loadu_ps(_exp_avg_sq + i); + varianc_4[1] = 
_mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); + varianc_4[2] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); + varianc_4[3] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); + + __m512 param_4[2]; + param_4[0] = _mm512_loadu_ps(_params + i); + param_4[1] = _mm512_loadu_ps(_params + i + SIMD_WIDTH); + param_4[2] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); + param_4[3] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); + + if (_weight_decay > 0) { + __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); + grad_4[0] = _mm512_fmadd_ps(param_4[0], weight_decay4, grad_4[0]); + grad_4[1] = _mm512_fmadd_ps(param_4[1], weight_decay4, grad_4[1]); + grad_4[2] = _mm512_fmadd_ps(param_4[2], weight_decay4, grad_4[2]); + grad_4[3] = _mm512_fmadd_ps(param_4[3], weight_decay4, grad_4[3]); + } + + momntum_4[0] = _mm512_mul_ps(momntum_4[0], betta1_4); + momntum_4[0] = _mm512_fmadd_ps(grad_4[0], betta1_minus1_4, momntum_4[0]); + momntum_4[1] = _mm512_mul_ps(momntum_4[1], betta1_4); + momntum_4[1] = _mm512_fmadd_ps(grad_4[1], betta1_minus1_4, momntum_4[1]); + momntum_4[2] = _mm512_mul_ps(momntum_4[2], betta1_4); + momntum_4[2] = _mm512_fmadd_ps(grad_4[2], betta1_minus1_4, momntum_4[2]); + momntum_4[3] = _mm512_mul_ps(momntum_4[3], betta1_4); + momntum_4[3] = _mm512_fmadd_ps(grad_4[3], betta1_minus1_4, momntum_4[3]); + + varianc_4[0] = _mm512_mul_ps(varianc_4[0], betta2_4); + varianc_4[1] = _mm512_mul_ps(varianc_4[1], betta2_4); + varianc_4[2] = _mm512_mul_ps(varianc_4[2], betta2_4); + varianc_4[3] = _mm512_mul_ps(varianc_4[3], betta2_4); + grad_4[0] = _mm512_mul_ps(grad_4[0], grad_4[0]); + grad_4[1] = _mm512_mul_ps(grad_4[1], grad_4[1]); + grad_4[2] = _mm512_mul_ps(grad_4[2], grad_4[2]); + grad_4[3] = _mm512_mul_ps(grad_4[3], grad_4[3]); + varianc_4[0] = _mm512_fmadd_ps(grad_4[0], betta2_minus1_4, varianc_4[0]); + varianc_4[1] = _mm512_fmadd_ps(grad_4[1], betta2_minus1_4, varianc_4[1]); + varianc_4[2] = _mm512_fmadd_ps(grad_4[2], betta2_minus1_4, varianc_4[2]); + varianc_4[3] = _mm512_fmadd_ps(grad_4[3], betta2_minus1_4, varianc_4[3]); + + grad_4[0] = _mm512_sqrt_ps(varianc_4[0]) / bias2_sqrt; + grad_4[1] = _mm512_sqrt_ps(varianc_4[1]) / bias2_sqrt; + grad_4[2] = _mm512_sqrt_ps(varianc_4[2]) / bias2_sqrt; + grad_4[3] = _mm512_sqrt_ps(varianc_4[3]) / bias2_sqrt; + + grad_4[0] = _mm512_add_ps(grad_4[0], eps_4); + grad_4[1] = _mm512_add_ps(grad_4[1], eps_4); + grad_4[2] = _mm512_add_ps(grad_4[2], eps_4); + grad_4[3] = _mm512_add_ps(grad_4[3], eps_4); + grad_4[0] = _mm512_div_ps(momntum_4[0], grad_4[0]); + grad_4[1] = _mm512_div_ps(momntum_4[1], grad_4[1]); + grad_4[2] = _mm512_div_ps(momntum_4[2], grad_4[2]); + grad_4[3] = _mm512_div_ps(momntum_4[3], grad_4[3]); + + param_4[0] = _mm512_fmadd_ps(grad_4[0], step_size_4, param_4[0]); + param_4[1] = _mm512_fmadd_ps(grad_4[1], step_size_4, param_4[1]); + param_4[2] = _mm512_fmadd_ps(grad_4[2], step_size_4, param_4[2]); + param_4[3] = _mm512_fmadd_ps(grad_4[3], step_size_4, param_4[3]); + + _mm512_storeu_ps(_params + i, param_4[0]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1]); + _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), param_4[2]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3]); + + _mm512_storeu_ps(_exp_avg + i, momntum_4[0]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1]); + _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3]); + + _mm512_storeu_ps(_exp_avg_sq + i, varianc_4[0]); + _mm512_storeu_ps(_exp_avg_sq + i + 
SIMD_WIDTH, varianc_4[1]); + _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1), varianc_4[2]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3]); } - momntum_4[0] = _mm512_mul_ps(momntum_4[0], betta1_4); - momntum_4[0] = _mm512_fmadd_ps(grad_4[0], betta1_minus1_4, momntum_4[0]); - momntum_4[1] = _mm512_mul_ps(momntum_4[1], betta1_4); - momntum_4[1] = _mm512_fmadd_ps(grad_4[1], betta1_minus1_4, momntum_4[1]); - momntum_4[2] = _mm512_mul_ps(momntum_4[2], betta1_4); - momntum_4[2] = _mm512_fmadd_ps(grad_4[2], betta1_minus1_4, momntum_4[2]); - momntum_4[3] = _mm512_mul_ps(momntum_4[3], betta1_4); - momntum_4[3] = _mm512_fmadd_ps(grad_4[3], betta1_minus1_4, momntum_4[3]); - - varianc_4[0] = _mm512_mul_ps(varianc_4[0], betta2_4); - varianc_4[1] = _mm512_mul_ps(varianc_4[1], betta2_4); - varianc_4[2] = _mm512_mul_ps(varianc_4[2], betta2_4); - varianc_4[3] = _mm512_mul_ps(varianc_4[3], betta2_4); - grad_4[0] = _mm512_mul_ps(grad_4[0], grad_4[0]); - grad_4[1] = _mm512_mul_ps(grad_4[1], grad_4[1]); - grad_4[2] = _mm512_mul_ps(grad_4[2], grad_4[2]); - grad_4[3] = _mm512_mul_ps(grad_4[3], grad_4[3]); - varianc_4[0] = _mm512_fmadd_ps(grad_4[0], betta2_minus1_4, varianc_4[0]); - varianc_4[1] = _mm512_fmadd_ps(grad_4[1], betta2_minus1_4, varianc_4[1]); - varianc_4[2] = _mm512_fmadd_ps(grad_4[2], betta2_minus1_4, varianc_4[2]); - varianc_4[3] = _mm512_fmadd_ps(grad_4[3], betta2_minus1_4, varianc_4[3]); - - grad_4[0] = _mm512_sqrt_ps(varianc_4[0]) / bias2_sqrt; - grad_4[1] = _mm512_sqrt_ps(varianc_4[1]) / bias2_sqrt; - grad_4[2] = _mm512_sqrt_ps(varianc_4[2]) / bias2_sqrt; - grad_4[3] = _mm512_sqrt_ps(varianc_4[3]) / bias2_sqrt; - - grad_4[0] = _mm512_add_ps(grad_4[0], eps_4); - grad_4[1] = _mm512_add_ps(grad_4[1], eps_4); - grad_4[2] = _mm512_add_ps(grad_4[2], eps_4); - grad_4[3] = _mm512_add_ps(grad_4[3], eps_4); - grad_4[0] = _mm512_div_ps(momntum_4[0], grad_4[0]); - grad_4[1] = _mm512_div_ps(momntum_4[1], grad_4[1]); - grad_4[2] = _mm512_div_ps(momntum_4[2], grad_4[2]); - grad_4[3] = _mm512_div_ps(momntum_4[3], grad_4[3]); - - param_4[0] = _mm512_fmadd_ps(grad_4[0], step_size_4, param_4[0]); - param_4[1] = _mm512_fmadd_ps(grad_4[1], step_size_4, param_4[1]); - param_4[2] = _mm512_fmadd_ps(grad_4[2], step_size_4, param_4[2]); - param_4[3] = _mm512_fmadd_ps(grad_4[3], step_size_4, param_4[3]); - - _mm512_storeu_ps(_params + i, param_4[0]); - _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1]); - _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), param_4[2]); - _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3]); - - _mm512_storeu_ps(_exp_avg + i, momntum_4[0]); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1]); - _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2]); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3]); - - _mm512_storeu_ps(_exp_avg_sq + i, varianc_4[0]); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH, varianc_4[1]); - _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1), varianc_4[2]); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3]); - if (dev_params) { - if ((i + (SIMD_WIDTH << 2)) % TILE == 0) { - size_t offset = tile * TILE; -#pragma omp parallel for - for (size_t j = 0; j < TILE; j += 4) { - _doubled_buffer[buf_index][j] = (__half)_params[offset + j]; - _doubled_buffer[buf_index][j + 1] = (__half)_params[offset + j + 1]; - _doubled_buffer[buf_index][j + 2] = (__half)_params[offset + j + 2]; - _doubled_buffer[buf_index][j + 3] = (__half)_params[offset + j + 3]; - } - CUDA_CHECK(cudaMemcpyAsync(dev_params 
+ tile * TILE, - _doubled_buffer[buf_index], - TILE * sizeof(__half), - cudaMemcpyHostToDevice, - Context::Instance().GetCurrentStream())); - buf_index = !buf_index; - tile++; + if (dev_params) + { + #pragma omp parallel for + for (size_t j = 0; j < copy_size; j += 4) + { + _doubled_buffer[buf_index][j] = (__half)_params[t + j]; + _doubled_buffer[buf_index][j + 1] = (__half)_params[t + j + 1]; + _doubled_buffer[buf_index][j + 2] = (__half)_params[t + j + 2]; + _doubled_buffer[buf_index][j + 3] = (__half)_params[t + j + 3]; } + + CUDA_CHECK(cudaMemcpyAsync(dev_params + t, + _doubled_buffer[buf_index], + copy_size * sizeof(__half), + cudaMemcpyHostToDevice, + Context::Instance().GetCurrentStream())); + buf_index = !buf_index; } } } @@ -262,6 +275,225 @@ int create_adam_optimizer(int optimizer_id, return 0; } +void Adam_Optimizer::Step_8(float *_params, + float *grads, + float *_exp_avg, + float *_exp_avg_sq, + size_t _param_size, + __half* dev_params) +{ + + _betta1_t *= _betta1; + _betta2_t *= _betta2; + + __m512 betta1_4 = _mm512_set1_ps(_betta1); + __m512 betta2_4 = _mm512_set1_ps(_betta2); + + bool buf_index = 0; + + float betta1_minus1 = 1 - _betta1; + float betta2_minus1 = 1 - _betta2; + __m512 betta1_minus1_4 = _mm512_set1_ps(betta1_minus1); + __m512 betta2_minus1_4 = _mm512_set1_ps(betta2_minus1); + + float bias_correction1 = 1 - _betta1_t; + float bias_correction2 = 1 - _betta2_t; + //__m512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); + __m512 bias_correction2_4 = _mm512_set1_ps(bias_correction2); + + __m512 eps_4 = _mm512_set1_ps(_eps); + + float step_size = -1 * _alpha / bias_correction1; + __m512 step_size_4 = _mm512_set1_ps(step_size); + + __m512 bias2_sqrt = _mm512_sqrt_ps(bias_correction2_4); + + for (size_t t = 0; t < _param_size; t += TILE) + { + size_t copy_size = TILE; + if((t + TILE) > _param_size)copy_size = _param_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for(size_t i = t; i < offset;i += (SIMD_WIDTH << 3)) + { + __m512 grad_4[8]; + grad_4[0] = _mm512_loadu_ps(grads + i); + grad_4[1] = _mm512_loadu_ps(grads + i + SIMD_WIDTH); + grad_4[2] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH<<1)); + grad_4[3] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); + grad_4[4] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH<<2)); + grad_4[5] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 5); + grad_4[6] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 6); + grad_4[7] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 7); + + __m512 momntum_4[8]; + momntum_4[0] = _mm512_loadu_ps(_exp_avg + i); + momntum_4[1] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); + momntum_4[2] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH<<1)); + momntum_4[3] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); + momntum_4[4] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH<<2)); + momntum_4[5] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 5); + momntum_4[6] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 6); + momntum_4[7] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 7); + + __m512 varianc_4[8]; + varianc_4[0] = _mm512_loadu_ps(_exp_avg_sq + i); + varianc_4[1] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); + varianc_4[2] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH<<1)); + varianc_4[3] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); + varianc_4[5] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH<<2)); + varianc_4[6] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5); + varianc_4[7] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6); + varianc_4[8] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7); 
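
The Step_8 routine added here is the 8-way unrolled variant of the AVX-512 Adam kernel: each loop iteration handles 8 vectors of SIMD_WIDTH (16) floats, i.e. 128 parameters, and after each tile the updated fp32 parameters are down-converted to fp16 in a host staging buffer (_doubled_buffer, presumably pinned so the async copy can proceed) and copied to the GPU weights. Note that the variance loads just above use indices varianc_4[5]..varianc_4[8] (index 8 is out of bounds for the 8-element array); a later commit in this series, "fix variance indexes", corrects them to varianc_4[4]..varianc_4[7]. For reference, the vector code implements the standard Adam update with bias correction folded into the step size; a scalar NumPy sketch of the same math (illustrative only, not part of the patch):

import numpy as np

def adam_step_reference(params, grads, exp_avg, exp_avg_sq,
                        alpha, beta1, beta2, eps, step, weight_decay=0.0):
    # Weight decay is folded into the gradient, as in the C++ kernel.
    if weight_decay > 0:
        grads = grads + weight_decay * params
    # First and second moment estimates.
    exp_avg[:] = beta1 * exp_avg + (1 - beta1) * grads
    exp_avg_sq[:] = beta2 * exp_avg_sq + (1 - beta2) * grads * grads
    # The kernel precomputes step_size = -alpha / bias_correction1 and
    # bias2_sqrt = sqrt(bias_correction2) outside the loop.
    bias_correction1 = 1 - beta1 ** step
    bias_correction2 = 1 - beta2 ** step
    denom = np.sqrt(exp_avg_sq) / np.sqrt(bias_correction2) + eps
    params += (-alpha / bias_correction1) * (exp_avg / denom)

The two staging buffers are alternated between tiles so the cudaMemcpyAsync of one tile's fp16 parameters can overlap the Adam arithmetic of the next tile; the Python side exercises this path through DeepSpeedCPUAdam's step(fp16_param_groups=...), as in the updated tests/unit/adam_test.py later in this series.
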
+ + __m512 param_4[8]; + param_4[0] = _mm512_loadu_ps(_params + i); + param_4[1] = _mm512_loadu_ps(_params + i + SIMD_WIDTH); + param_4[2] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH<<1)); + param_4[3] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); + param_4[4] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH<<2)); + param_4[5] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 5); + param_4[6] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 6); + param_4[7] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 7); + + if(_weight_decay > 0) + { + __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); + grad_4[0] = _mm512_fmadd_ps(param_4[0], weight_decay4, grad_4[0]); + grad_4[1] = _mm512_fmadd_ps(param_4[1], weight_decay4, grad_4[1]); + grad_4[2] = _mm512_fmadd_ps(param_4[2], weight_decay4, grad_4[2]); + grad_4[3] = _mm512_fmadd_ps(param_4[3], weight_decay4, grad_4[3]); + grad_4[4] = _mm512_fmadd_ps(param_4[4], weight_decay4, grad_4[4]); + grad_4[5] = _mm512_fmadd_ps(param_4[5], weight_decay4, grad_4[5]); + grad_4[6] = _mm512_fmadd_ps(param_4[6], weight_decay4, grad_4[6]); + grad_4[7] = _mm512_fmadd_ps(param_4[7], weight_decay4, grad_4[7]); + } + + momntum_4[0] = _mm512_mul_ps(momntum_4[0], betta1_4); + momntum_4[0] = _mm512_fmadd_ps(grad_4[0], betta1_minus1_4, momntum_4[0]); + momntum_4[1] = _mm512_mul_ps(momntum_4[1], betta1_4); + momntum_4[1] = _mm512_fmadd_ps(grad_4[1], betta1_minus1_4, momntum_4[1]); + momntum_4[2] = _mm512_mul_ps(momntum_4[2], betta1_4); + momntum_4[2] = _mm512_fmadd_ps(grad_4[2], betta1_minus1_4, momntum_4[2]); + momntum_4[3] = _mm512_mul_ps(momntum_4[3], betta1_4); + momntum_4[3] = _mm512_fmadd_ps(grad_4[3], betta1_minus1_4, momntum_4[3]); + momntum_4[4] = _mm512_mul_ps(momntum_4[4], betta1_4); + momntum_4[4] = _mm512_fmadd_ps(grad_4[4], betta1_minus1_4, momntum_4[4]); + momntum_4[5] = _mm512_mul_ps(momntum_4[5], betta1_4); + momntum_4[5] = _mm512_fmadd_ps(grad_4[5], betta1_minus1_4, momntum_4[5]); + momntum_4[6] = _mm512_mul_ps(momntum_4[6], betta1_4); + momntum_4[6] = _mm512_fmadd_ps(grad_4[6], betta1_minus1_4, momntum_4[6]); + momntum_4[7] = _mm512_mul_ps(momntum_4[7], betta1_4); + momntum_4[7] = _mm512_fmadd_ps(grad_4[7], betta1_minus1_4, momntum_4[7]); + + varianc_4[0] = _mm512_mul_ps(varianc_4[0], betta2_4); + varianc_4[1] = _mm512_mul_ps(varianc_4[1], betta2_4); + varianc_4[2] = _mm512_mul_ps(varianc_4[2], betta2_4); + varianc_4[3] = _mm512_mul_ps(varianc_4[3], betta2_4); + varianc_4[4] = _mm512_mul_ps(varianc_4[4], betta2_4); + varianc_4[5] = _mm512_mul_ps(varianc_4[5], betta2_4); + varianc_4[6] = _mm512_mul_ps(varianc_4[6], betta2_4); + varianc_4[7] = _mm512_mul_ps(varianc_4[7], betta2_4); + grad_4[0] = _mm512_mul_ps(grad_4[0], grad_4[0]); + grad_4[1] = _mm512_mul_ps(grad_4[1], grad_4[1]); + grad_4[2] = _mm512_mul_ps(grad_4[2], grad_4[2]); + grad_4[3] = _mm512_mul_ps(grad_4[3], grad_4[3]); + grad_4[4] = _mm512_mul_ps(grad_4[4], grad_4[4]); + grad_4[5] = _mm512_mul_ps(grad_4[5], grad_4[5]); + grad_4[6] = _mm512_mul_ps(grad_4[6], grad_4[6]); + grad_4[7] = _mm512_mul_ps(grad_4[7], grad_4[7]); + varianc_4[0] = _mm512_fmadd_ps(grad_4[0], betta2_minus1_4, varianc_4[0]); + varianc_4[1] = _mm512_fmadd_ps(grad_4[1], betta2_minus1_4, varianc_4[1]); + varianc_4[2] = _mm512_fmadd_ps(grad_4[2], betta2_minus1_4, varianc_4[2]); + varianc_4[3] = _mm512_fmadd_ps(grad_4[3], betta2_minus1_4, varianc_4[3]); + varianc_4[4] = _mm512_fmadd_ps(grad_4[4], betta2_minus1_4, varianc_4[4]); + varianc_4[5] = _mm512_fmadd_ps(grad_4[5], betta2_minus1_4, varianc_4[5]); + varianc_4[6] = 
_mm512_fmadd_ps(grad_4[6], betta2_minus1_4, varianc_4[6]); + varianc_4[7] = _mm512_fmadd_ps(grad_4[7], betta2_minus1_4, varianc_4[7]); + + grad_4[0] = _mm512_sqrt_ps(varianc_4[0]) / bias2_sqrt; + grad_4[1] = _mm512_sqrt_ps(varianc_4[1]) / bias2_sqrt; + grad_4[2] = _mm512_sqrt_ps(varianc_4[2]) / bias2_sqrt; + grad_4[3] = _mm512_sqrt_ps(varianc_4[3]) / bias2_sqrt; + grad_4[4] = _mm512_sqrt_ps(varianc_4[4]) / bias2_sqrt; + grad_4[5] = _mm512_sqrt_ps(varianc_4[5]) / bias2_sqrt; + grad_4[6] = _mm512_sqrt_ps(varianc_4[6]) / bias2_sqrt; + grad_4[7] = _mm512_sqrt_ps(varianc_4[7]) / bias2_sqrt; + + grad_4[0] = _mm512_add_ps(grad_4[0], eps_4); + grad_4[1] = _mm512_add_ps(grad_4[1], eps_4); + grad_4[2] = _mm512_add_ps(grad_4[2], eps_4); + grad_4[3] = _mm512_add_ps(grad_4[3], eps_4); + grad_4[4] = _mm512_add_ps(grad_4[4], eps_4); + grad_4[5] = _mm512_add_ps(grad_4[5], eps_4); + grad_4[6] = _mm512_add_ps(grad_4[6], eps_4); + grad_4[7] = _mm512_add_ps(grad_4[7], eps_4); + grad_4[0] = _mm512_div_ps(momntum_4[0], grad_4[0]); + grad_4[1] = _mm512_div_ps(momntum_4[1], grad_4[1]); + grad_4[2] = _mm512_div_ps(momntum_4[2], grad_4[2]); + grad_4[3] = _mm512_div_ps(momntum_4[3], grad_4[3]); + grad_4[4] = _mm512_div_ps(momntum_4[4], grad_4[4]); + grad_4[5] = _mm512_div_ps(momntum_4[5], grad_4[5]); + grad_4[6] = _mm512_div_ps(momntum_4[6], grad_4[6]); + grad_4[7] = _mm512_div_ps(momntum_4[7], grad_4[7]); + + param_4[0] = _mm512_fmadd_ps(grad_4[0], step_size_4, param_4[0]); + param_4[1] = _mm512_fmadd_ps(grad_4[1], step_size_4, param_4[1]); + param_4[2] = _mm512_fmadd_ps(grad_4[2], step_size_4, param_4[2]); + param_4[3] = _mm512_fmadd_ps(grad_4[3], step_size_4, param_4[3]); + param_4[4] = _mm512_fmadd_ps(grad_4[4], step_size_4, param_4[4]); + param_4[5] = _mm512_fmadd_ps(grad_4[5], step_size_4, param_4[5]); + param_4[6] = _mm512_fmadd_ps(grad_4[6], step_size_4, param_4[6]); + param_4[7] = _mm512_fmadd_ps(grad_4[7], step_size_4, param_4[7]); + + _mm512_storeu_ps(_params + i, param_4[0]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1]); + _mm512_storeu_ps(_params + i + (SIMD_WIDTH<<1), param_4[2]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3]); + _mm512_storeu_ps(_params + i + (SIMD_WIDTH<<2), param_4[4]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 5, param_4[5]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 6, param_4[6]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 7, param_4[7]); + + _mm512_storeu_ps(_exp_avg + i, momntum_4[0]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1]); + _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH<<1), momntum_4[2]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3]); + _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH<<2), momntum_4[4]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 5, momntum_4[5]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 6, momntum_4[6]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 7, momntum_4[7]); + + _mm512_storeu_ps(_exp_avg_sq + i, varianc_4[0]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH, varianc_4[1]); + _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH<<1), varianc_4[2]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3]); + _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH<<2), varianc_4[4]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5, varianc_4[5]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6, varianc_4[6]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7, varianc_4[7]); + } + if (dev_params) + { + #pragma omp parallel for + for (size_t j = 0; j < copy_size; j += 4) 
+ { + _doubled_buffer[buf_index][j] = (__half)_params[t + j]; + _doubled_buffer[buf_index][j + 1] = (__half)_params[t + j + 1]; + _doubled_buffer[buf_index][j + 2] = (__half)_params[t + j + 2]; + _doubled_buffer[buf_index][j + 3] = (__half)_params[t + j + 3]; + } + + CUDA_CHECK(cudaMemcpyAsync(dev_params + t, + _doubled_buffer[buf_index], + copy_size * sizeof(__half), + cudaMemcpyHostToDevice, + Context::Instance().GetCurrentStream())); + buf_index = !buf_index; + } + } +} + int ds_adam_step(int optimizer_id, torch::Tensor& params, torch::Tensor& grads, @@ -281,7 +513,7 @@ int ds_adam_step(int optimizer_id, std::shared_ptr opt = std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->Step_4(params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0)); + opt->Step_8(params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0)); return 0; } @@ -308,7 +540,7 @@ int ds_adam_step_plus_copy(int optimizer_id, std::shared_ptr opt = std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->Step_4( + opt->Step_8( params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0), gpu_params_ptr); return 0; diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h old mode 100644 new mode 100755 index d1a2aa088731..c74c31c72f16 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -18,7 +18,8 @@ } #define SIMD_WIDTH 16 -#define TILE 1048576 + +#define TILE (1024 * 1024 * 1024) class Adam_Optimizer { public: @@ -55,6 +56,12 @@ class Adam_Optimizer { float* _exp_avg_sa, size_t param_size, __half* dev_param = nullptr); + void Step_8(float *_params, + float *grads, + float *_exp_avg, + float *_exp_avg_sq, + size_t _param_size, + __half* dev_params = nullptr); private: float _alpha; diff --git a/setup.py b/setup.py index ae5010a65b11..77b7241b582b 100755 --- a/setup.py +++ b/setup.py @@ -123,11 +123,15 @@ def fetch_requirements(path): sources=[ 'csrc/adam/cpu_adam.cpp', ], - include_dirs=['csrc/includes'], + include_dirs=['csrc/includes', + '/usr/local/cuda/include'], extra_compile_args={ 'cxx': ['-O3', '-std=c++14', + '-L/usr/local/cuda/lib64', + '-lcudart', + '-lcublas', '-g', '-Wno-reorder', '-march=native', diff --git a/tests/unit/adam_test.py b/tests/unit/adam_test.py index 0fedfeb18e42..51c588dda962 100755 --- a/tests/unit/adam_test.py +++ b/tests/unit/adam_test.py @@ -3,16 +3,18 @@ import time device = 'cpu' -model_size = 1 * 1024**3 +model_size = 10 * 1024**3 param = torch.nn.Parameter(torch.ones(model_size, device=device)) +param_fp16 = torch.nn.Parameter(torch.ones(model_size, dtype=torch.half, device='cuda:0')) + optimizer = DeepSpeedCPUAdam([param]) #torch.set_num_threads(128) param.grad = torch.ones(model_size, device=device) avg = 0 -for i in range(100): +for i in range(10): start = time.time() - optimizer.step() + optimizer.step(fp16_param_groups=[param_fp16]) stop = time.time() avg += (stop - start) param.grad = torch.ones(model_size, device=device) * 2 -print("Elapsed Time is ", avg / 100) +print("Elapsed Time is ", avg / 10) From 59ffc1a9092f4a47744676b727d522c2886260d0 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 5 Sep 2020 01:58:55 +0000 Subject: [PATCH 19/44] run precommit --- csrc/adam/cpu_adam.cpp | 102 +++++++++++++++++---------------------- csrc/includes/cpu_adam.h | 10 ++-- setup.py | 38 +++++++-------- tests/unit/adam_test.py | 4 +- 4 files changed, 71 insertions(+), 83 deletions(-) mode change 100755 => 100644 csrc/adam/cpu_adam.cpp mode change 100755 => 100644 csrc/includes/cpu_adam.h diff --git a/csrc/adam/cpu_adam.cpp 
b/csrc/adam/cpu_adam.cpp old mode 100755 new mode 100644 index ab5b6c0b7054..98ac32e04a2f --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -49,14 +49,12 @@ void Adam_Optimizer::Step(float* _params, size_t tile = 0; - for (size_t t = 0; t < _param_size; t += TILE) - { + for (size_t t = 0; t < _param_size; t += TILE) { size_t copy_size = TILE; - if((t + TILE) > _param_size)copy_size = _param_size - t; + if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; - #pragma omp parallel for - for(size_t i = t; i < offset;i += SIMD_WIDTH) - { +#pragma omp parallel for + for (size_t i = t; i < offset; i += SIMD_WIDTH) { __m512 grad_4 = _mm512_loadu_ps(grads + i); __m512 momntum_4 = _mm512_loadu_ps(_exp_avg + i); @@ -86,17 +84,15 @@ void Adam_Optimizer::Step(float* _params, _mm512_storeu_ps(_exp_avg + i, momntum_4); _mm512_storeu_ps(_exp_avg_sq + i, varianc_4); } - if (dev_params) - { - #pragma omp parallel for - for (size_t j = 0; j < copy_size; j += 4) - { + if (dev_params) { +#pragma omp parallel for + for (size_t j = 0; j < copy_size; j += 4) { _doubled_buffer[buf_index][j] = (__half)_params[t + j]; _doubled_buffer[buf_index][j + 1] = (__half)_params[t + j + 1]; _doubled_buffer[buf_index][j + 2] = (__half)_params[t + j + 2]; _doubled_buffer[buf_index][j + 3] = (__half)_params[t + j + 3]; } - + CUDA_CHECK(cudaMemcpyAsync(dev_params + t, _doubled_buffer[buf_index], copy_size * sizeof(__half), @@ -140,14 +136,12 @@ void Adam_Optimizer::Step_4(float* _params, __m512 bias2_sqrt = _mm512_sqrt_ps(bias_correction2_4); - for (size_t t = 0; t < _param_size; t += TILE) - { + for (size_t t = 0; t < _param_size; t += TILE) { size_t copy_size = TILE; - if((t + TILE) > _param_size)copy_size = _param_size - t; + if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; #pragma omp parallel for - for(size_t i = t; i < offset;i += (SIMD_WIDTH << 2)) - { + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) { __m512 grad_4[4]; grad_4[0] = _mm512_loadu_ps(grads + i); grad_4[1] = _mm512_loadu_ps(grads + i + SIMD_WIDTH); @@ -237,17 +231,15 @@ void Adam_Optimizer::Step_4(float* _params, _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3]); } - if (dev_params) - { - #pragma omp parallel for - for (size_t j = 0; j < copy_size; j += 4) - { + if (dev_params) { +#pragma omp parallel for + for (size_t j = 0; j < copy_size; j += 4) { _doubled_buffer[buf_index][j] = (__half)_params[t + j]; _doubled_buffer[buf_index][j + 1] = (__half)_params[t + j + 1]; _doubled_buffer[buf_index][j + 2] = (__half)_params[t + j + 2]; _doubled_buffer[buf_index][j + 3] = (__half)_params[t + j + 3]; } - + CUDA_CHECK(cudaMemcpyAsync(dev_params + t, _doubled_buffer[buf_index], copy_size * sizeof(__half), @@ -275,14 +267,13 @@ int create_adam_optimizer(int optimizer_id, return 0; } -void Adam_Optimizer::Step_8(float *_params, - float *grads, - float *_exp_avg, - float *_exp_avg_sq, - size_t _param_size, +void Adam_Optimizer::Step_8(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t _param_size, __half* dev_params) { - _betta1_t *= _betta1; _betta2_t *= _betta2; @@ -291,8 +282,8 @@ void Adam_Optimizer::Step_8(float *_params, bool buf_index = 0; - float betta1_minus1 = 1 - _betta1; - float betta2_minus1 = 1 - _betta2; + float betta1_minus1 = 1 - _betta1; + float betta2_minus1 = 1 - _betta2; __m512 betta1_minus1_4 = _mm512_set1_ps(betta1_minus1); __m512 betta2_minus1_4 = _mm512_set1_ps(betta2_minus1); @@ -308,20 +299,18 @@ void 
Adam_Optimizer::Step_8(float *_params, __m512 bias2_sqrt = _mm512_sqrt_ps(bias_correction2_4); - for (size_t t = 0; t < _param_size; t += TILE) - { + for (size_t t = 0; t < _param_size; t += TILE) { size_t copy_size = TILE; - if((t + TILE) > _param_size)copy_size = _param_size - t; + if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; #pragma omp parallel for - for(size_t i = t; i < offset;i += (SIMD_WIDTH << 3)) - { + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) { __m512 grad_4[8]; grad_4[0] = _mm512_loadu_ps(grads + i); grad_4[1] = _mm512_loadu_ps(grads + i + SIMD_WIDTH); - grad_4[2] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH<<1)); + grad_4[2] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); grad_4[3] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); - grad_4[4] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH<<2)); + grad_4[4] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 2)); grad_4[5] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 5); grad_4[6] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 6); grad_4[7] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 7); @@ -329,9 +318,9 @@ void Adam_Optimizer::Step_8(float *_params, __m512 momntum_4[8]; momntum_4[0] = _mm512_loadu_ps(_exp_avg + i); momntum_4[1] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); - momntum_4[2] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH<<1)); + momntum_4[2] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); momntum_4[3] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); - momntum_4[4] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH<<2)); + momntum_4[4] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 2)); momntum_4[5] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 5); momntum_4[6] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 6); momntum_4[7] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 7); @@ -339,9 +328,9 @@ void Adam_Optimizer::Step_8(float *_params, __m512 varianc_4[8]; varianc_4[0] = _mm512_loadu_ps(_exp_avg_sq + i); varianc_4[1] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); - varianc_4[2] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH<<1)); + varianc_4[2] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); varianc_4[3] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); - varianc_4[5] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH<<2)); + varianc_4[5] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 2)); varianc_4[6] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5); varianc_4[7] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6); varianc_4[8] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7); @@ -349,15 +338,14 @@ void Adam_Optimizer::Step_8(float *_params, __m512 param_4[8]; param_4[0] = _mm512_loadu_ps(_params + i); param_4[1] = _mm512_loadu_ps(_params + i + SIMD_WIDTH); - param_4[2] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH<<1)); + param_4[2] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); param_4[3] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); - param_4[4] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH<<2)); + param_4[4] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 2)); param_4[5] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 5); param_4[6] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 6); param_4[7] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 7); - if(_weight_decay > 0) - { + if (_weight_decay > 0) { __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); grad_4[0] = _mm512_fmadd_ps(param_4[0], weight_decay4, grad_4[0]); grad_4[1] = _mm512_fmadd_ps(param_4[1], weight_decay4, grad_4[1]); @@ -448,42 +436,40 @@ void Adam_Optimizer::Step_8(float *_params, 
_mm512_storeu_ps(_params + i, param_4[0]); _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1]); - _mm512_storeu_ps(_params + i + (SIMD_WIDTH<<1), param_4[2]); + _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), param_4[2]); _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3]); - _mm512_storeu_ps(_params + i + (SIMD_WIDTH<<2), param_4[4]); + _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 2), param_4[4]); _mm512_storeu_ps(_params + i + SIMD_WIDTH * 5, param_4[5]); _mm512_storeu_ps(_params + i + SIMD_WIDTH * 6, param_4[6]); _mm512_storeu_ps(_params + i + SIMD_WIDTH * 7, param_4[7]); _mm512_storeu_ps(_exp_avg + i, momntum_4[0]); _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1]); - _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH<<1), momntum_4[2]); + _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2]); _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3]); - _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH<<2), momntum_4[4]); + _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 2), momntum_4[4]); _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 5, momntum_4[5]); _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 6, momntum_4[6]); _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 7, momntum_4[7]); _mm512_storeu_ps(_exp_avg_sq + i, varianc_4[0]); _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH, varianc_4[1]); - _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH<<1), varianc_4[2]); + _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1), varianc_4[2]); _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3]); - _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH<<2), varianc_4[4]); + _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 2), varianc_4[4]); _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5, varianc_4[5]); _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6, varianc_4[6]); _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7, varianc_4[7]); } - if (dev_params) - { - #pragma omp parallel for - for (size_t j = 0; j < copy_size; j += 4) - { + if (dev_params) { +#pragma omp parallel for + for (size_t j = 0; j < copy_size; j += 4) { _doubled_buffer[buf_index][j] = (__half)_params[t + j]; _doubled_buffer[buf_index][j + 1] = (__half)_params[t + j + 1]; _doubled_buffer[buf_index][j + 2] = (__half)_params[t + j + 2]; _doubled_buffer[buf_index][j + 3] = (__half)_params[t + j + 3]; } - + CUDA_CHECK(cudaMemcpyAsync(dev_params + t, _doubled_buffer[buf_index], copy_size * sizeof(__half), diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h old mode 100755 new mode 100644 index c74c31c72f16..40f4cba692ea --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -56,11 +56,11 @@ class Adam_Optimizer { float* _exp_avg_sa, size_t param_size, __half* dev_param = nullptr); - void Step_8(float *_params, - float *grads, - float *_exp_avg, - float *_exp_avg_sq, - size_t _param_size, + void Step_8(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t _param_size, __half* dev_params = nullptr); private: diff --git a/setup.py b/setup.py index 77b7241b582b..10564e59a6e8 100755 --- a/setup.py +++ b/setup.py @@ -118,25 +118,25 @@ def fetch_requirements(path): ## Adam ## if BUILD_MASK & DS_BUILD_ADAM: ext_modules.append( - CUDAExtension( - name='deepspeed.ops.adam.cpu_adam_op', - sources=[ - 'csrc/adam/cpu_adam.cpp', - ], - include_dirs=['csrc/includes', - '/usr/local/cuda/include'], - extra_compile_args={ - 'cxx': - ['-O3', - '-std=c++14', - '-L/usr/local/cuda/lib64', - '-lcudart', - '-lcublas', - '-g', - '-Wno-reorder', - '-march=native', - '-fopenmp'] - })) + 
CUDAExtension(name='deepspeed.ops.adam.cpu_adam_op', + sources=[ + 'csrc/adam/cpu_adam.cpp', + ], + include_dirs=['csrc/includes', + '/usr/local/cuda/include'], + extra_compile_args={ + 'cxx': [ + '-O3', + '-std=c++14', + '-L/usr/local/cuda/lib64', + '-lcudart', + '-lcublas', + '-g', + '-Wno-reorder', + '-march=native', + '-fopenmp' + ] + })) ## Transformer ## if BUILD_MASK & DS_BUILD_TRANSFORMER: diff --git a/tests/unit/adam_test.py b/tests/unit/adam_test.py index 51c588dda962..be5caa5cbecc 100755 --- a/tests/unit/adam_test.py +++ b/tests/unit/adam_test.py @@ -5,7 +5,9 @@ device = 'cpu' model_size = 10 * 1024**3 param = torch.nn.Parameter(torch.ones(model_size, device=device)) -param_fp16 = torch.nn.Parameter(torch.ones(model_size, dtype=torch.half, device='cuda:0')) +param_fp16 = torch.nn.Parameter(torch.ones(model_size, + dtype=torch.half, + device='cuda:0')) optimizer = DeepSpeedCPUAdam([param]) #torch.set_num_threads(128) From ad8af38467a974dd54cf2e3da3ecd169f9270209 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 5 Sep 2020 02:04:45 +0000 Subject: [PATCH 20/44] fix variance indexes --- csrc/adam/cpu_adam.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) mode change 100644 => 100755 csrc/adam/cpu_adam.cpp diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp old mode 100644 new mode 100755 index 98ac32e04a2f..5644ab1d4235 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -330,10 +330,10 @@ void Adam_Optimizer::Step_8(float* _params, varianc_4[1] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); varianc_4[2] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); varianc_4[3] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); - varianc_4[5] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 2)); - varianc_4[6] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5); - varianc_4[7] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6); - varianc_4[8] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7); + varianc_4[4] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 2)); + varianc_4[5] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5); + varianc_4[6] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6); + varianc_4[7] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7); __m512 param_4[8]; param_4[0] = _mm512_loadu_ps(_params + i); From 2d17a6d3fe09a32f847bcdca5d615bb4a43653ac Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 5 Sep 2020 04:50:32 +0000 Subject: [PATCH 21/44] fix array-sizes --- csrc/adam/cpu_adam.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp index 5644ab1d4235..9399438b57f6 100755 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -148,19 +148,19 @@ void Adam_Optimizer::Step_4(float* _params, grad_4[2] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); grad_4[3] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); - __m512 momntum_4[2]; + __m512 momntum_4[4]; momntum_4[0] = _mm512_loadu_ps(_exp_avg + i); momntum_4[1] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); momntum_4[2] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); momntum_4[3] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); - __m512 varianc_4[2]; + __m512 varianc_4[4]; varianc_4[0] = _mm512_loadu_ps(_exp_avg_sq + i); varianc_4[1] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); varianc_4[2] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); varianc_4[3] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); - __m512 param_4[2]; + __m512 param_4[4]; param_4[0] = 
_mm512_loadu_ps(_params + i); param_4[1] = _mm512_loadu_ps(_params + i + SIMD_WIDTH); param_4[2] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); From aa3f2893b3051a0b7efc38a4db1403d9afcd9e1f Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 4 Sep 2020 22:05:58 -0700 Subject: [PATCH 22/44] ZeRO-Offload passing model functionality tests (#366) * cpu_offload enables overlap_comm and contiguous_gradients Remove non-portable tensor.mul_() * Model functionality tests now passing * Move to perf tests folder --- deepspeed/runtime/zero/stage2.py | 3662 ++++++++--------- .../ds_config_func_bs8_zero2.json | 1 - .../ds_config_func_bs8_zero2_offload.json | 1 - .../ds_config_func_scheduler.json | 1 - tests/model/Megatron_GPT2/ds_gpt2_test.sh | 9 +- tests/model/Megatron_GPT2/run_func_test.py | 212 +- tests/{unit => perf}/adam_test.py | 0 7 files changed, 2042 insertions(+), 1844 deletions(-) rename tests/{unit => perf}/adam_test.py (100%) diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index ae5722fc318d..9ed9c07df054 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -1,1833 +1,1829 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team -''' - -import torch -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from torch.distributed.distributed_c10d import _get_global_rank -import torch.distributed as dist -import math -from torch._six import inf -from torch.autograd import Variable - -import collections - -from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler -from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter -from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS -from deepspeed.utils import logger -#Toggle this to true to enable correctness test -#with gradient partitioning and without -pg_correctness_test = False - -try: - from apex_C import flatten - from apex_C import unflatten -except ImportError: - try: - _ = warned_flatten - except NameError: - logger.warning( - "apex was installed without --cpp_ext. Falling back to Python flatten and unflatten." 
- ) - warned_flatten = True - from torch._utils import _flatten_dense_tensors as flatten - from torch._utils import _unflatten_dense_tensors as unflatten - - -def input(msg): - return - - -def split_half_float_double(tensors): - dtypes = [ - "torch.cuda.HalfTensor", - "torch.cuda.FloatTensor", - "torch.cuda.DoubleTensor" - ] - buckets = [] - for i, dtype in enumerate(dtypes): - bucket = [t for t in tensors if t.type() == dtype] - if bucket: - buckets.append(bucket) - return buckets - - -def isclose(a, b, rtol=1e-09, atol=0.0): - return abs(a - b) <= max(rtol * max(abs(a), abs(b)), atol) - - -def lcm(x, y): - from fractions import gcd # or can import gcd from `math` in Python 3 - return x * y // gcd(x, y) - - -# create a flat tensor aligned at the alignment boundary -def flatten_dense_tensors_aligned(tensor_list, alignment): - num_elements = 0 - for tensor in tensor_list: - num_elements = num_elements + tensor.numel() - - remaining = num_elements % alignment - - if remaining: - elements_to_add = alignment - remaining - pad_tensor = torch.zeros(elements_to_add, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) - padded_tensor_list = tensor_list + [pad_tensor] - - num_elements = num_elements + elements_to_add - else: - padded_tensor_list = tensor_list - - return _flatten_dense_tensors(padded_tensor_list) - - -def get_alignment_padding(tensor_list, alignment): - num_elements = sum([tensor.numel() for tensor in tensor_list]) - remainder = num_elements % alignment - return (alignment - remainder) if remainder else remainder - - -def move_to_cpu(tensor_list): - for tensor in tensor_list: - tensor.data = tensor.data.cpu() - - -def print_rank_msg(msg): - print(f"rank {dist.get_rank()} - {msg}") - - -class FP16_DeepSpeedZeroOptimizer(object): - """ - DeepSpeedZeroOptimizer designed to reduce the memory footprint - required for training large deep learning models. - - For more details please see ZeRO: Memory Optimization Towards Training A Trillion Parameter Models - https://arxiv.org/abs/1910.02054 - - For usage examples, refer to TODO: DeepSpeed Tutorial - - """ - def __init__(self, - init_optimizer, - timers, - static_loss_scale=1.0, - dynamic_loss_scale=False, - dynamic_loss_args=None, - verbose=True, - contiguous_gradients=True, - reduce_bucket_size=500000000, - allgather_bucket_size=5000000000, - dp_process_group=None, - reduce_scatter=True, - overlap_comm=False, - cpu_offload=False, - mpu=None, - clip_grad=0.0, - allreduce_always_fp32=False, - postscale_gradients=True, - gradient_predivide_factor=1.0, - gradient_accumulation_steps=1): - - if dist.get_rank() == 0: - logger.info(f"Reduce bucket size {reduce_bucket_size}") - logger.info(f"Allgather bucket size {allgather_bucket_size}") - logger.info(f"CPU Offload: {cpu_offload}") - # The fused optimizer does all the work. We need this layer for two reason: - # 1. maintain same user API from apex.fp16_utils - # 2. keep common stuff here in case we need to add ne552w fused optimizer later - - # differences from apex.fp16_utils: - # - assume all model params in fp16 - # - assume all params requires grad - # - flat by groups, not keeping state. TODO: remove state explicitly? - # - master gard and unflat master weight never exist. TODO: a way to save out unflat master? 
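
The constructor below then builds the ZeRO stage-2 bookkeeping: each fp16 param group is temporarily moved to CPU, flattened into a single buffer padded to a multiple of the data-parallel world size, placed back on the GPU, split into world_size equal shards, and an fp32 master copy of only this rank's shard is kept (on the CPU when cpu_offload is enabled, so the Adam step runs there). A toy sketch of that sharding step, assuming the flat buffer is already aligned; helper and variable names are illustrative, not the class's API:

import torch

def shard_flat_buffer(flat_fp16, world_size, rank, master_device):
    # One equal-sized view per data-parallel rank; flatten_dense_tensors_aligned
    # pads the buffer so numel() divides evenly by world_size.
    shard_numel = flat_fp16.numel() // world_size
    shards = [flat_fp16.narrow(0, r * shard_numel, shard_numel)
              for r in range(world_size)]
    # fp32 master copy of only this rank's shard; with cpu_offload this lives
    # in host memory and is the tensor the wrapped optimizer actually updates.
    fp32_shard = shards[rank].to(master_device).clone().float().detach()
    fp32_shard.requires_grad = True
    return shards, fp32_shard

Replacing param_group['params'] with just this fp32 shard is what bounds the optimizer state on each rank to 1/world_size of the model, which is the core of ZeRO stage 2.
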
- if not torch.cuda.is_available: - raise SystemError("Cannot use fp16 without CUDA.") - self.optimizer = init_optimizer - - self.timers = timers - - self.reduce_scatter = reduce_scatter - - self.overlap_comm = overlap_comm - - self.cpu_offload = cpu_offload - self.device = torch.cuda.current_device() if not self.cpu_offload else 'cpu' - - self.dp_process_group = dp_process_group - - self.partition_count = dist.get_world_size(group=self.dp_process_group) - - if mpu is None: - self.model_parallel_group = None - self.model_parallel_rank = 0 - else: - self.model_parallel_group = mpu.get_model_parallel_group() - self.model_parallel_rank = mpu.get_model_parallel_rank() - - self.overflow = False - self.clip_grad = clip_grad - self.allreduce_always_fp32 = allreduce_always_fp32 - self.gradient_predivide_factor = gradient_predivide_factor - self.postscale_gradients = postscale_gradients - self.gradient_accumulation_steps = gradient_accumulation_steps - self.micro_step_id = 0 - - if self.reduce_scatter: - assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled" - assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-2 with reduce scatter enabled" - assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-2 with reduce scatter enabled" - - # param flattened by groups - self.fp16_groups = [] - self.fp16_groups_flat = [] - - #param partitioned by data parallel degree - #this will contain a list of equal sized tensors - #each of which will be updated by a different process - self.parallel_partitioned_fp16_groups = [] - - #a single 32-bit partition of the parallel partitioned parameters - #that this process will update - self.single_partition_of_fp32_groups = [] - - #param partition info - - #These are the parameters in each group that will not be updated by this process directly - self.params_not_in_partition = [] - - #These are the parameters that will be updated by this process directly - self.params_in_partition = [] - - #Offset from the first paramter in the the self.params_in_partition - #the parameter boundaries may not align with partition boundaries - #so we need to keep track of the offset - self.first_offset = [] - - #number of elements per partition in each group - self.partition_size = [] - - partition_id = dist.get_rank(group=self.dp_process_group) - - self.all_reduce_print = False - - # padding on each partition for alignment purposes - self.groups_padding = [] - # loop to deal with groups - for i, param_group in enumerate(self.optimizer.param_groups): - # push this group to list before modify - self.fp16_groups.append(param_group['params']) - # Record padding required to align group to world size - if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: - padding = get_alignment_padding(self.fp16_groups[i], - self.partition_count) - else: - padding = 0 - self.groups_padding.append(padding) - - #not sure why apex was cloning the weights before flattening - #removing cloning here - - see_memory_usage(f"Before moving param group {i} to CPU") - #move all the parameters to cpu to free up GPU space for creating flat buffer - move_to_cpu(self.fp16_groups[i]) - see_memory_usage(f"After moving param group {i} to CPU") - - #create flat buffer in CPU and move to GPU - self.fp16_groups_flat.append( - flatten_dense_tensors_aligned( - self.fp16_groups[i], - dist.get_world_size(group=self.dp_process_group)).cuda( - 
torch.cuda.current_device())) - see_memory_usage(f"After flattening and moving param group {i} to GPU") - - if dist.get_rank(group=self.dp_process_group) == 0: - see_memory_usage( - f"After Flattening and after emptying param group {i} cache") - - # set model fp16 weight to slices of flattened buffer - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) - for p, q in zip(self.fp16_groups[i], updated_params): - p.data = q.data - - #divide the flat weights into near equal paritition equal to the data parallel degree - #each process will compute on a different part of the partition - data_parallel_partitions = self.get_data_parallel_partitions( - self.fp16_groups_flat[i]) - self.parallel_partitioned_fp16_groups.append(data_parallel_partitions) - - # a partition of the fp32 master weights that will be updated by this process - self.single_partition_of_fp32_groups.append( - self.parallel_partitioned_fp16_groups[i][partition_id].to( - self.device).clone().float().detach()) - - # modify optimizer of have flat master weight - self.single_partition_of_fp32_groups[ - i].requires_grad = True # keep this in case internal optimizer uses it - param_group['params'] = [self.single_partition_of_fp32_groups[i]] - - partition_size = len(self.fp16_groups_flat[i]) / dist.get_world_size( - group=self.dp_process_group) - params_in_partition, params_not_in_partition, first_offset = self.get_partition_info(self.fp16_groups[i], partition_size, partition_id) - - self.partition_size.append(partition_size) - self.params_in_partition.append(params_in_partition) - self.params_not_in_partition.append(params_not_in_partition) - self.first_offset.append(first_offset) - - self.reduce_bucket_size = int(reduce_bucket_size) - self.allgather_bucket_size = int(allgather_bucket_size) - - self.reduction_event = torch.cuda.Event(enable_timing=False, blocking=False) - self.reduction_stream = torch.cuda.Stream() - self.cpu_computation_stream = torch.cuda.Stream() - self.migration_stream = torch.cuda.Stream() - self.callback_queued = False - - self.param_dict = {} - - #map between param_id and bool to specify if a param is in this partition - self.is_param_in_current_partition = {} - - # CPU-Offload requires contiguous gradients - self.contiguous_gradients = contiguous_gradients or cpu_offload - self.grads_in_ipg_bucket = [] - self.params_in_ipg_bucket = [] - self.elements_in_ipg_bucket = 0 - self.params_already_reduced = [] - self._release_ipg_buffers() - self.previous_reduced_grads = None - - #simplified param id - self.param_id = {} - - largest_param_numel = 0 - count = 0 - for i, params_group in enumerate(self.fp16_groups): - for param in params_group: - unique_id = id(param) - self.param_id[unique_id] = count - self.param_dict[count] = param - self.params_already_reduced.append(False) - if param.numel() > largest_param_numel: - largest_param_numel = param.numel() - count = count + 1 - - for param_group in self.params_in_partition: - for param in param_group: - self.is_param_in_current_partition[self.get_param_id(param)] = True - - for param_group in self.params_not_in_partition: - for param in param_group: - self.is_param_in_current_partition[self.get_param_id(param)] = False - - if self.cpu_offload: - self.accumulated_grads_in_cpu = {} - self.norm_for_param_grads = {} - self.local_overflow = False - self.grad_position = {} - self.temp_grad_buffer_for_cpu_offload = torch.zeros( - largest_param_numel, - device=self.device).half().pin_memory() - self.temp_grad_buffer_for_gpu_offload = 
torch.zeros( - largest_param_numel, - device=torch.cuda.current_device()).half() - - for i, params_group in enumerate(self.fp16_groups): - self.get_grad_position(i, - self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i]) - - #mapping from parameter to partition that it belongs to - self.param_to_partition_ids = {} - - #stores if a partition has been reduced in this step - self.is_partition_reduced = {} - - #number of grads in partition that still need to be computed - self.remaining_grads_in_partition = {} - - #total number of grads in partition - self.total_grads_in_partition = {} - - #stores if a grad in a partition has been computed or not - self.is_grad_computed = {} - - #stores the offset at which a parameter gradient needs to be inserted in a partition - self.grad_partition_insertion_offset = {} - - #the offset in the gradient at which it must be inserted at the beginning of the paritition - self.grad_start_offset = {} - - #will store the averaged gradients required by this parititon - self.averaged_gradients = {} - - # store index of first parameter in each partition - self.first_param_index_in_partition = {} - - #initializes all data structures for implementing gradient partitioning - self.initialize_gradient_partitioning_data_structures() - - #resets the data structure value for the next backward propagation - self.reset_partition_gradient_structures() - - #creates backward hooks for gradient partitioning - self.create_reduce_and_remove_grad_hooks() - - # we may have a way of fusing dynamic scale. Do not support for now - if dynamic_loss_scale: - if dynamic_loss_args is None: - self.loss_scaler = DynamicLossScaler() - else: - self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) - - self.dynamic_loss_scale = True - - else: - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(scale=static_loss_scale) - self.cur_iter = 0 - - see_memory_usage("Before initializing optimizer states") - self.initialize_optimizer_states() - see_memory_usage("After initializing optimizer states") - - if dist.get_rank() == 0: - logger.info(f"optimizer state initialized") - - if dist.get_rank(group=self.dp_process_group) == 0: - see_memory_usage(f"After initializing ZeRO optimizer") - - def _release_ipg_buffers(self): - if self.contiguous_gradients: - self.ipg_buffer = None - self.grads_in_partition = None - self.grads_in_partition_offset = 0 - - def initialize_optimizer_states(self): - - for i, group in enumerate(self.fp16_groups): - single_grad_partition = torch.zeros( - int(self.partition_size[i]), - dtype=self.single_partition_of_fp32_groups[i].dtype, - device=self.device) - self.single_partition_of_fp32_groups[ - i].grad = single_grad_partition.pin_memory( - ) if self.cpu_offload else single_grad_partition - - self.optimizer.step() - - if not self.cpu_offload: - for group in self.single_partition_of_fp32_groups: - group.grad = None - - return - - ######################################################################### - #########################ZeRO Partition Gradients######################## - ######################################################################### - - def get_first_param_index(self, group_id, param_group, partition_id): - for index, param in enumerate(param_group): - param_id = self.get_param_id(param) - if partition_id in self.param_to_partition_ids[group_id][param_id]: - return index - return None - - def initialize_gradient_partitioning_data_structures(self): - - total_partitions = dist.get_world_size(group=self.dp_process_group) - - for 
i, param_group in enumerate(self.fp16_groups): - - self.param_to_partition_ids[i] = {} - self.is_partition_reduced[i] = {} - self.total_grads_in_partition[i] = {} - self.remaining_grads_in_partition[i] = {} - self.is_grad_computed[i] = {} - self.grad_partition_insertion_offset[i] = {} - self.grad_start_offset[i] = {} - self.first_param_index_in_partition[i] = {} - - for partition_id in range(total_partitions): - self.is_grad_computed[i][partition_id] = {} - self.grad_partition_insertion_offset[i][partition_id] = {} - self.grad_start_offset[i][partition_id] = {} - self.total_grads_in_partition[i][partition_id] = 0 - self.initialize_gradient_partition(i, param_group, partition_id) - self.is_partition_reduced[i][partition_id] = False - self.first_param_index_in_partition[i][ - partition_id] = self.get_first_param_index( - i, - param_group, - partition_id) - - def independent_gradient_partition_epilogue(self): - self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0) - self.reduce_ipg_grads() - self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0) - - #if dist.get_rank() == 0: - # logger.info("Params already reduced %s", self.params_already_reduced) - for i in range(len(self.params_already_reduced)): - self.params_already_reduced[i] = False - - if self.overlap_comm: - torch.cuda.synchronize() - - if self.cpu_offload is False: - for i, _ in enumerate(self.fp16_groups): - - if not i in self.averaged_gradients or self.averaged_gradients[i] is None: - self.averaged_gradients[i] = self.get_flat_partition( - self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i], - dtype=torch.half, - device=torch.cuda.current_device(), - return_tensor_list=True) - else: - avg_new = self.get_flat_partition(self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i], - dtype=torch.half, - device=torch.cuda.current_device(), - return_tensor_list=True) - - for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i],avg_new): - accumulated_grad.add_(new_avg_grad) - - self._release_ipg_buffers() - - # No need to keep the gradients anymore. 
- # All gradients required by the step - # are in self.averaged_gradients - self.zero_grad() - see_memory_usage(f"End ipg_epilogue") - - # resets all partition to no reduced - # sets remianing grads to the total number of grads in each partition - # set is grad computed to false for all grads in partition - def reset_partition_gradient_structures(self): - total_partitions = dist.get_world_size(group=self.dp_process_group) - for i, _ in enumerate(self.fp16_groups): - for partition_id in range(total_partitions): - self.is_partition_reduced[i][partition_id] = False - self.remaining_grads_in_partition[i][ - partition_id] = self.total_grads_in_partition[i][partition_id] - - for param_id in self.is_grad_computed[i][partition_id]: - self.is_grad_computed[i][partition_id][param_id] = False - - def initialize_gradient_partition(self, i, param_group, partition_id): - def set_key_value_list(dictionary, key, value): - if key in dictionary: - dictionary[key].append(value) - else: - dictionary[key] = [value] - - def increment_value(dictionary, key): - if key in dictionary: - dictionary[key] += 1 - else: - dictionary[key] = 1 - - partition_size = self.partition_size[i] - - start_index = partition_size * partition_id - end_index = partition_size * (partition_id + 1) - - current_index = 0 - first_offset = 0 - - for param in param_group: - - param_size = param.numel() - param_id = self.get_param_id(param) - - if (current_index >= start_index and current_index < end_index): - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) - increment_value(self.total_grads_in_partition[i], partition_id) - - self.is_grad_computed[i][partition_id][param_id] = False - - self.grad_partition_insertion_offset[i][partition_id][ - param_id] = current_index - start_index - self.grad_start_offset[i][partition_id][param_id] = 0 - - elif start_index > current_index and start_index < (current_index + - param_size): - assert (first_offset==0), "This can happen either zero or only once as this must be the first tensor in the partition" - first_offset = start_index - current_index - - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) - increment_value(self.total_grads_in_partition[i], partition_id) - - self.is_grad_computed[i][partition_id][param_id] = False - - self.grad_partition_insertion_offset[i][partition_id][param_id] = 0 - self.grad_start_offset[i][partition_id][param_id] = first_offset - - current_index = current_index + param_size - - def overlapping_partition_gradients_reduce_epilogue(self): - self.independent_gradient_partition_epilogue() - - def create_reduce_and_remove_grad_hooks(self): - self.grad_accs = [] - for i, param_group in enumerate(self.fp16_groups): - for param in param_group: - if param.requires_grad: - - def wrapper(param, i): - param_tmp = param.expand_as(param) - grad_acc = param_tmp.grad_fn.next_functions[0][0] - - def reduce_partition_and_remove_grads(*notneeded): - self.reduce_ready_partitions_and_remove_grads(param, i) - - grad_acc.register_hook(reduce_partition_and_remove_grads) - self.grad_accs.append(grad_acc) - - wrapper(param, i) - - def get_param_id(self, param): - unique_id = id(param) - return self.param_id[unique_id] - - def report_ipg_memory_usage(self, tag, param_elems): - elem_count = self.elements_in_ipg_bucket + param_elems - percent_of_bucket_size = (100.0 * elem_count) // self.reduce_bucket_size - see_memory_usage( - f"{tag}: elems in_bucket {self.elements_in_ipg_bucket} param {param_elems} max_percent {percent_of_bucket_size}" - ) 
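
The methods that follow implement the independent-partition-gradient (IPG) bucket: as each gradient becomes ready it is copied into a fixed-size flat bucket, and once the next gradient would overflow reduce_bucket_size the bucket is flushed by average_tensor, which reduces each contiguous slice of the bucket directly to the data-parallel rank that owns the corresponding parameter partition instead of all-reducing the whole tensor everywhere. A minimal sketch of that flush, assuming the (owner_rank, offset, numel) slice list has already been built the way average_tensor builds rank_and_offsets, and that owner_rank is already the global rank id (names are illustrative):

import torch.distributed as dist

def flush_bucket_to_owners(flat_bucket, slices, group):
    # Pre-divide so the summed result on the owner is the averaged gradient.
    flat_bucket.div_(dist.get_world_size(group=group))
    handles = []
    for owner_rank, offset, numel in slices:
        grad_slice = flat_bucket.narrow(0, int(offset), int(numel))
        # Only owner_rank receives the reduced slice; other ranks can drop it.
        handles.append(dist.reduce(grad_slice, dst=owner_rank,
                                   group=group, async_op=True))
    for h in handles:
        h.wait()

This gives reduce-scatter-like behavior out of plain reduce calls, so each rank only needs to keep the averaged gradients for its own partition.
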
- - ###############Idependent Partition Gradient ######################## - def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): - if self.elements_in_ipg_bucket + param.numel() > self.reduce_bucket_size: - self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", - param.numel()) - self.reduce_ipg_grads() - if self.contiguous_gradients and self.overlap_comm: - # Swap ipg_index between 0 and 1 - self.ipg_index = 1 - self.ipg_index - self.report_ipg_memory_usage("In ipg_remove_grads after reduce_ipg_grads", - param.numel()) - - param_id = self.get_param_id(param) - - assert self.params_already_reduced[param_id] == False, \ - f"The parameter {param_id} has already been reduced. \ - Gradient computed twice for this partition. \ - Multiple gradient reduction is currently not supported" - - #keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening - if self.contiguous_gradients: - new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow( - 0, - self.elements_in_ipg_bucket, - param.numel()) - new_grad_tensor.copy_(param.grad.view(-1)) - param.grad.data = new_grad_tensor.data.view_as(param.grad) - - self.elements_in_ipg_bucket += param.numel() - self.grads_in_ipg_bucket.append(param.grad) - self.params_in_ipg_bucket.append((i, param, param_id)) - - self.report_ipg_memory_usage("End ipg_remove_grads", 0) - - def print_rank_0(self, message): - if dist.get_rank() == 0: - logger.info(message) - - def gradient_reduction_w_predivide(self, tensor): - dp_world_size = dist.get_world_size(group=self.dp_process_group) - - tensor_to_allreduce = tensor - - if self.allreduce_always_fp32: - tensor_to_allreduce = tensor.float() - - if self.postscale_gradients: - if self.gradient_predivide_factor != 1.0: - tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor) - - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - - if self.gradient_predivide_factor() != dp_world_size: - tensor_to_allreduce.mul_(self.gradient_predivide_factor() / - dp_world_size) - else: - tensor_to_allreduce.div_(dp_world_size) - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - - if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce: - tensor.copy_(tensor_to_allreduce) - - return tensor - - def average_tensor(self, tensor): - if self.overlap_comm: - torch.cuda.synchronize() - stream = self.reduction_stream - else: - stream = torch.cuda.current_stream() - - with torch.cuda.stream(stream): - if not self.reduce_scatter: - self.gradient_reduction_w_predivide(tensor) - return - - # Accumulate destination ranks and bucket offsets for each gradient slice. - # Note: potential future optimization, record access pattern of parameters - # in backward pass and partition gradients w.r.t. access pattern so that our - # bucket is guaranteed to be contiguous w.r.t. 
ranks - rank_and_offsets = [] - curr_size = 0 - prev_id = -1 - for i, param, param_id in self.params_in_ipg_bucket: - partition_ids = self.param_to_partition_ids[i][param_id] - partition_size = self.partition_size[i] - # Get all partition ids + their offsets - partition_ids_w_offsets = [] - for partition_id in partition_ids: - offset = self.grad_start_offset[i][partition_id][param_id] - partition_ids_w_offsets.append((partition_id, offset)) - partition_ids_w_offsets.sort(key=lambda t: t[1]) - - # Calculate rank and offsets for grad slices - for idx in range(len(partition_ids_w_offsets)): - partition_id, offset = partition_ids_w_offsets[idx] - - # Calculate numel for grad slice depending on partition location - if idx == len(partition_ids_w_offsets) - 1: - # Last partition_id uses its own offset - numel = param.numel() - offset - else: - # Set numel to next partition's offset - numel = partition_ids_w_offsets[idx + 1][1] - offset - - # Merge bucket ranges if they belong to the same rank - if partition_id == prev_id: - prev_pid, prev_size, prev_numel = rank_and_offsets[-1] - rank_and_offsets[-1] = (prev_pid, prev_size, prev_numel + numel) - else: - rank_and_offsets.append((partition_id, curr_size, numel)) - - curr_size += numel - prev_id = partition_id - tensor.div_(dist.get_world_size(group=self.dp_process_group)) - - async_handles = [] - for dst, bucket_offset, numel in rank_and_offsets: - grad_slice = tensor.narrow(0, int(bucket_offset), int(numel)) - dst_rank = _get_global_rank(self.dp_process_group, dst) - async_handle = dist.reduce(grad_slice, - dst=dst_rank, - group=self.dp_process_group, - async_op=True) - async_handles.append(async_handle) - - for handle in async_handles: - handle.wait() - - ############################################################################## - ############################# CPU Offload Methods############################# - ############################################################################## - def get_grad_position(self, group_id, tensor_list, first_offset, partition_size): - current_offset = 0 - - for i, tensor in enumerate(tensor_list): - param_id = self.get_param_id(tensor) - param_start_offset = 0 - - num_elements = tensor.numel() - tensor_offset = 0 - - #we need to offset to get to the right element - if i == 0 and first_offset > 0: - tensor_offset = first_offset - num_elements = num_elements - tensor_offset - param_start_offset = first_offset - - #we dont need all elements of the tensor - if num_elements > (partition_size - current_offset): - num_elements = partition_size - current_offset - - self.grad_position[param_id] = [ - int(group_id), - int(param_start_offset), - int(current_offset), - int(num_elements) - ] - current_offset += num_elements - - def update_overflow_tracker_for_param_grad(self, param): - if param.grad is not None and self._has_inf_or_nan(param.grad.data): - self.local_overflow = True - - def async_accumulate_grad_in_cpu(self, param): - param_id = self.get_param_id(param) - - #copy to a preexisiting buffer to avoid memory allocation penalty - dest_buffer = self.temp_grad_buffer_for_cpu_offload.view(-1).narrow( - 0, - 0, - param.numel()) - dest_buffer.copy_(param.grad.view(-1), non_blocking=True) - - if param_id not in self.accumulated_grads_in_cpu: - self.accumulated_grads_in_cpu[param_id] = torch.zeros( - param.numel(), - dtype=param.dtype, - device=self.device).pin_memory() - - self.accumulated_grads_in_cpu[param_id].add_(dest_buffer) - - def async_accumulate_grad_in_cpu_via_gpu(self, param): - param_id = 
self.get_param_id(param) - - #copy to a preexisiting buffer to avoid memory allocation penalty - dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow( - 0, - 0, - param.numel()) - - if param_id not in self.accumulated_grads_in_cpu: - self.accumulated_grads_in_cpu[param_id] = torch.zeros( - param.numel(), - dtype=param.dtype, - device=self.device).pin_memory() - - if self.micro_step_id > 0: - dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1), - non_blocking=True) - param.grad.data.view(-1).add_(dest_buffer) - - #at the boundary we will send 32bit directly - if not self.is_gradient_accumulation_boundary: - self.accumulated_grads_in_cpu[param_id].data.copy_(param.grad.data.view(-1), - non_blocking=True) - - def set_norm_for_param_grad(self, param): - param_id = self.get_param_id(param) - accumulated_grad = self.accumulated_grads_in_cpu[ - param_id] if self.gradient_accumulation_steps > 1 else param.grad - - [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - - start = source_offset - accumulated_grad = accumulated_grad.view(-1).narrow(0, start, num_elements) - - self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm(2) - - def set_norm_for_param_grad_in_gpu(self, param): - param_id = self.get_param_id(param) - accumulated_grad = param.grad - - [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - - start = source_offset - accumulated_grad = accumulated_grad.view(-1).narrow(0, start, num_elements) - - self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm(2) - - def async_inplace_copy_grad_to_fp32_buffer(self, param): - param_id = self.get_param_id(param) - - [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - - dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( - 0, - dest_offset, - num_elements) - if self.gradient_accumulation_steps > 1: - src_tensor = self.accumulated_grads_in_cpu[param_id].view(-1).narrow( - 0, - source_offset, - num_elements) - else: - src_tensor = param.grad.view(-1).narrow(0, - source_offset, - num_elements).float() - dest_tensor.copy_(src_tensor, non_blocking=True) - - def async_inplace_copy_grad_to_fp32_buffer_from_gpu(self, param): - param_id = self.get_param_id(param) - - [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - - dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( - 0, - dest_offset, - num_elements) - - src_tensor = param.grad.view(-1).narrow(0, source_offset, num_elements).float() - dest_tensor.copy_(src_tensor, non_blocking=True) - param.grad = None - - def complete_grad_norm_calculation_for_cpu_offload(self, params): - total_norm = 0.0 - norm_type = 2.0 - for p in params: - if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - param_id = self.get_param_id(p) - param_norm = self.norm_for_param_grads[param_id] - total_norm += param_norm.item()**2 - - # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=self.dp_process_group) - - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.SUM) - - total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) - - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 - - return total_norm - - ############################################################################################ - - def copy_grads_in_partition(self, param): - if self.cpu_offload: - - if self.gradient_accumulation_steps > 1: - self.async_accumulate_grad_in_cpu_via_gpu(param) - - if self.is_gradient_accumulation_boundary: - self.set_norm_for_param_grad_in_gpu(param) - - self.update_overflow_tracker_for_param_grad(param) - - self.async_inplace_copy_grad_to_fp32_buffer_from_gpu(param) - - return - #print(f"ID {self.get_param_id(param)} grad norm {param.grad.norm()}") - if self.grads_in_partition is None: - self.grads_in_partition_offset = 0 - total_size = 0 - for group in self.params_in_partition: - for param_in_partition in group: - total_size += param_in_partition.numel() - - see_memory_usage(f"before copying {total_size} gradients into partition") - self.grads_in_partition = torch.empty(int(total_size), - dtype=torch.half, - device=torch.cuda.current_device()) - see_memory_usage(f"after copying {total_size} gradients into partition") - - #The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer - new_grad_tensor = self.grads_in_partition.view(-1).narrow( - 0, - self.grads_in_partition_offset, - param.numel()) - new_grad_tensor.copy_(param.grad.view(-1)) - param.grad.data = new_grad_tensor.data.view_as(param.grad) - #print(f"Grad norm after copy to contiguous_buffer {param.grad.data.norm()}") - self.grads_in_partition_offset += param.numel() - - def reduce_ipg_grads(self): - if self.overlap_comm: - stream = self.reduction_stream - else: - stream = torch.cuda.current_stream() - - if self.contiguous_gradients: - self.average_tensor(self.ipg_buffer[self.ipg_index]) - else: - self.buffered_reduce_fallback( - None, - self.grads_in_ipg_bucket, - elements_per_buffer=self.elements_in_ipg_bucket) - - with torch.cuda.stream(stream): - for _, param, param_id in self.params_in_ipg_bucket: - self.params_already_reduced[param_id] = True - - if not self.is_param_in_current_partition[param_id]: - if self.overlap_comm and self.contiguous_gradients is False: - # Clear the previous grads during the next reduction - # to avoid clearing them before the reduction is complete. 
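# Illustration of the grad-norm bookkeeping in complete_grad_norm_calculation_for_cpu_offload
# above: each rank sums the squared 2-norms of the gradient slices it owns, the squared sums
# are all-reduced across the data-parallel (and model-parallel) groups, the root is taken,
# and any non-finite result collapses to the sentinel -1. A minimal single-process sketch of
# that arithmetic; the plain Python sum stands in for torch.distributed.all_reduce and the
# helper names are hypothetical, not part of the optimizer.
import math
import torch

def local_squared_grad_norm(grads):
    # squared 2-norms of the gradient slices owned by one rank
    return sum(float(g.double().norm(2))**2 for g in grads)

def global_grad_norm(per_rank_squared_sums, norm_type=2.0):
    total_norm = sum(per_rank_squared_sums)**(1.0 / norm_type)
    if math.isinf(total_norm) or math.isnan(total_norm):
        total_norm = -1          # same sentinel the optimizer returns
    return total_norm

rank0 = [torch.full((4,), 2.0), torch.full((2,), 1.0)]   # hypothetical per-rank grads
rank1 = [torch.full((3,), 3.0)]
print(global_grad_norm([local_squared_grad_norm(rank0),
                        local_squared_grad_norm(rank1)]))  # sqrt(16 + 2 + 27) ~ 6.7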
- if self.previous_reduced_grads is None: - self.previous_reduced_grads = [] - self.previous_reduced_grads.append(param) - else: - param.grad = None - elif self.contiguous_gradients: - self.copy_grads_in_partition(param) - - self.grads_in_ipg_bucket = [] - self.params_in_ipg_bucket = [] - self.elements_in_ipg_bucket = 0 - ##################################################################### - - def reduce_ready_partitions_and_remove_grads(self, param, i): - self.reduce_independent_p_g_buckets_and_remove_grads(param, i) - - def zero_reduced_gradients(self, partition_id, i): - def are_all_related_partitions_reduced(params_id): - for partition_id in self.param_to_partition_ids[i][params_id]: - if not self.is_partition_reduced[i][partition_id]: - return False - return True - - for params_id in self.is_grad_computed[i][partition_id]: - if are_all_related_partitions_reduced(params_id): - self.param_dict[params_id].grad = None - - def flatten_and_print(self, message, tensors, start=0, n=5): - flatten_tensor = _flatten_dense_tensors(tensors) - - def print_func(): - logger.info(flatten_tensor.contiguous().view(-1).narrow(0, start, n)) - - self.sequential_execution(print_func, message) - - def get_grads_to_reduce(self, i, partition_id): - def get_reducable_portion(key): - grad = self.param_dict[key].grad - total_elements = grad.numel() - start = self.grad_start_offset[i][partition_id][key] - num_elements = min( - total_elements - start, - self.partition_size[i] - - self.grad_partition_insertion_offset[i][partition_id][key]) - if not pg_correctness_test: - if num_elements == total_elements: - return grad - else: - return grad.contiguous().view(-1).narrow(0, - int(start), - int(num_elements)) - else: - if num_elements == total_elements: - return grad.clone() - else: - return grad.clone().contiguous().view(-1).narrow( - 0, - int(start), - int(num_elements)) - - grads_to_reduce = [] - for key in self.is_grad_computed[i][partition_id]: - grad = get_reducable_portion(key) - grads_to_reduce.append(grad) - return grads_to_reduce - - def sequential_execution(self, function, message, group=None): - if group is None: - group = self.dp_process_group - if dist.get_rank(group=group) == 0: - logger.info(message) - for id in range(dist.get_world_size(group=group)): - if id == dist.get_rank(group=group): - function() - dist.barrier(group=group) - - def set_none_gradients_to_zero(self, i, partition_id): - for param_id in self.is_grad_computed[i][partition_id]: - param = self.param_dict[param_id] - if param.grad is None: - param.grad = torch.zero_like(param) - - ######################Reduction Related Methods############################## - - def allreduce_bucket(self, bucket, allreduce_always_fp32=False, rank=None, log=None): - rank = None - tensor = flatten(bucket) - - tensor_to_allreduce = tensor - - if pg_correctness_test: - allreduce_always_fp32 = True - - if allreduce_always_fp32: - tensor_to_allreduce = tensor.float() - - tensor_to_allreduce.div_(dist.get_world_size(group=self.dp_process_group)) - - if rank is None: - # "All Reducing" - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - else: - global_rank = _get_global_rank(self.dp_process_group, rank) - dist.reduce(tensor_to_allreduce, global_rank, group=self.dp_process_group) - - if allreduce_always_fp32 and tensor is not tensor_to_allreduce: - if rank is None or rank == dist.get_rank(group=self.dp_process_group): - tensor.copy_(tensor_to_allreduce) - - return tensor - - #if rank is specified do a reduction instead of an allreduce - def 
allreduce_and_copy(self, small_bucket, rank=None, log=None): - if self.overlap_comm: - torch.cuda.synchronize() - if self.previous_reduced_grads is not None: - # previous_reduced_grads has the previous reduced grads, - # now it is safe to clear. - for param in self.previous_reduced_grads: - param.grad = None - self.previous_reduced_grads = None - stream = self.reduction_stream - else: - stream = torch.cuda.current_stream() - - with torch.cuda.stream(stream): - allreduced = self.allreduce_bucket(small_bucket, rank=rank, log=log) - if rank is None or rank == dist.get_rank(group=self.dp_process_group): - for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)): - buf.copy_(synced) - - def allreduce_no_retain(self, - bucket, - numel_per_bucket=500000000, - rank=None, - log=None): - small_bucket = [] - numel = 0 - for tensor in bucket: - small_bucket.append(tensor) - numel = numel + tensor.numel() - if numel > numel_per_bucket: - self.allreduce_and_copy(small_bucket, rank=rank, log=None) - small_bucket = [] - if len(small_bucket) > 0: - self.allreduce_and_copy(small_bucket, rank=rank, log=log) - - #allows using reduction of gradients instead of using all_reduce - def buffered_reduce_fallback(self, - rank, - grads, - elements_per_buffer=500000000, - log=None): - split_buckets = split_half_float_double(grads) - - for i, bucket in enumerate(split_buckets): - self.allreduce_no_retain(bucket, - numel_per_bucket=elements_per_buffer, - rank=rank, - log=log) - - ############################################################################# - ############################################################################# - ############################################################################# - - #views the tensor as multiple partitions and returns - #those partitions - def get_data_parallel_partitions(self, tensor): - partitions = [] - - dp = dist.get_world_size(group=self.dp_process_group) - dp_id = dist.get_rank(group=self.dp_process_group) - - total_num_elements = tensor.numel() - - base_size = total_num_elements // dp - remaining = total_num_elements % dp - - start = 0 - for id in range(dp): - partition_size = base_size - if id < remaining: - partition_size = partition_size + 1 - partitions.append(tensor.narrow(0, start, partition_size)) - start = start + partition_size - return partitions - - def get_partition_info(self, tensor_list, partition_size, partition_id): - params_in_partition = [] - params_not_in_partition = [] - - start_index = partition_size * partition_id - end_index = partition_size * (partition_id + 1) - - current_index = 0 - first_offset = 0 - - for tensor in tensor_list: - - tensor_size = tensor.numel() - - if (current_index >= start_index and current_index < end_index): - params_in_partition.append(tensor) - - elif start_index > current_index and start_index < (current_index + - tensor_size): - params_in_partition.append(tensor) - - assert (first_offset==0), "This can happen either zero or only once as this must be the first tensor in the partition" - first_offset = start_index - current_index - - else: - params_not_in_partition.append(tensor) - - current_index = current_index + tensor_size - - return params_in_partition, params_not_in_partition, first_offset - - def zero_grad(self, set_grads_to_None=True): - """ - Zero FP16 parameter grads. - """ - # FP32 grad should never exist. 
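# Illustration of the partitioning math used by get_data_parallel_partitions and
# get_partition_info above: the flat buffer is split into near-equal pieces (the first
# `total % dp` ranks get one extra element), and a parameter that straddles a partition
# boundary is recorded together with a first_offset into its elements. Helper names and
# sizes below are hypothetical, and alignment padding is ignored for brevity.
def partition_sizes(total_num_elements, dp):
    base, remainder = divmod(total_num_elements, dp)
    return [base + 1 if rank < remainder else base for rank in range(dp)]

def partition_info(param_numels, partition_size, partition_id):
    start, end = partition_size * partition_id, partition_size * (partition_id + 1)
    in_partition, first_offset, current = [], 0, 0
    for index, numel in enumerate(param_numels):
        if start <= current < end:                    # param starts inside this partition
            in_partition.append(index)
        elif current < start < current + numel:       # param straddles the boundary
            in_partition.append(index)
            first_offset = start - current
        current += numel
    return in_partition, first_offset

sizes = partition_sizes(4 + 4 + 2, dp=2)                    # -> [5, 5]
print(partition_info([4, 4, 2], sizes[1], partition_id=1))  # -> ([1, 2], 1)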
- # For speed, set model fp16 grad to None by default - for group in self.fp16_groups: - for p in group: - if set_grads_to_None: - p.grad = None - else: - if p.grad is not None: - p.grad.detach_() - p.grad.zero_() - - def _model_parallel_all_reduce(self, tensor, op): - """ Perform all reduce within model parallel group, if any. - """ - if self.model_parallel_group is None: - torch.distributed.all_reduce(tensor=tensor, op=op) - else: - torch.distributed.all_reduce(tensor=tensor, - op=op, - group=self.model_parallel_group) - - def get_grad_norm_direct(self, gradients, params, norm_type=2): - """Clips gradient norm of an iterable of parameters. - - This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and - added functionality to handle model parallel parameters. Note that - the gradients are modified in place. - - Arguments: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the parameters (viewed as a single vector). - """ - norm_type = float(norm_type) - if norm_type == inf: - total_norm = max(g.data.abs().max() for g in gradients) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) - - # Take max across all GPUs. - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.MAX) - total_norm = total_norm_cuda[0].item() - else: - total_norm = 0.0 - #if dist.get_rank() == 0: - # logger.info(f"Total Norm begining {total_norm}") - for g, p in zip(gradients, params): - if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - param_norm = g.data.double().norm(2) - total_norm += param_norm.item()**2 - # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=self.dp_process_group) - - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.SUM) - - total_norm = total_norm_cuda[0].item()**(1. / norm_type) - - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 - - return total_norm - - #creates a flat fused tensor from the tensor list starting at the first_offset - #in the first tensor of the list. 
If there are not enough elements in the tensor - #list then the flat tensor will be padded with zeros - def get_flat_partition(self, - tensor_list, - first_offset, - partition_size, - dtype, - device, - return_tensor_list=False): - flat_tensor_list = [] - current_size = 0 - for i, tensor in enumerate(tensor_list): - if tensor.grad is None: - continue - - tensor = tensor.grad - num_elements = tensor.numel() - tensor_offset = 0 - - #we need to offset to get to the right element - if i == 0 and first_offset > 0: - tensor_offset = first_offset - num_elements = num_elements - tensor_offset - - #we dont need all elements of the tensor - if num_elements > (partition_size - current_size): - num_elements = partition_size - current_size - - #we need a narrow view of the tensor based on the tensor offset and number of elements that - #we need from this tensor - if tensor_offset > 0 or num_elements < tensor.numel(): - flat_tensor_list.append(tensor.contiguous().view(-1).narrow( - 0, - int(tensor_offset), - int(num_elements))) - else: - flat_tensor_list.append(tensor) - - current_size = current_size + num_elements - - #this means its the last partition and does not align with the dp boundary. We need to pad before flattening - if current_size < partition_size: - flat_tensor_list.append( - torch.zeros(int(partition_size - current_size), - dtype=dtype, - device=device)) - - if return_tensor_list: - return flat_tensor_list - - return _flatten_dense_tensors(flat_tensor_list) - - def free_grad_in_param_list(self, param_list): - for p in param_list: - p.grad = None - - def reset_cpu_buffers(self): - self.norm_for_param_grads = {} - self.local_overflow = False - with torch.cuda.stream(self.migration_stream): - for key, value in self.accumulated_grads_in_cpu.items(): - value.mul_(0.0) - - def step(self, closure=None): - """ - Not supporting closure. - """ - self.micro_step_id = -1 - - if self.cpu_offload: - torch.cuda.current_stream().wait_stream(self.migration_stream) - - see_memory_usage(f"In step before checking overflow") - - # First compute norm for all group so we know if there is overflow - self.check_overflow() - - timers = self.timers - - prev_scale = self.loss_scale - self._update_scale(self.overflow) - if self.overflow: - see_memory_usage('After overflow before clearing gradients') - self.zero_grad() - if self.cpu_offload: - self.reset_cpu_buffers() - else: - self.averaged_gradients = {} - - see_memory_usage('After overflow after clearing gradients') - - logger.info( - "[deepscale] OVERFLOW! Rank {} Skipping step. 
Attempted loss scale: {}, " - "reducing to {}".format(dist.get_rank(), - prev_scale, - self.loss_scale)) - timers('optimizer_step').start() - timers('optimizer_step').stop() - timers('optimizer_allgather').start() - timers('optimizer_allgather').stop() - return - - norm_groups = [] - single_partition_grad_groups = [] - skip = False - partition_id = dist.get_rank(group=self.dp_process_group) - for i, group in enumerate(self.fp16_groups): - if self.cpu_offload: - norm_groups.append( - self.complete_grad_norm_calculation_for_cpu_offload( - self.params_in_partition[i])) - single_grad_partition = self.single_partition_of_fp32_groups[i].grad - else: - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.params_in_partition[i])) - - #free gradients for all the prameters that are not updated by this process - self.free_grad_in_param_list(self.params_not_in_partition[i]) - - #create a flat gradients for parameters updated by this process - # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors - if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: - single_grad_partition = flatten_dense_tensors_aligned( - self.averaged_gradients[i], - int(self.partition_size[i])).to( - self.single_partition_of_fp32_groups[i].dtype) - else: - single_grad_partition = _flatten_dense_tensors( - self.averaged_gradients[i]).to( - self.single_partition_of_fp32_groups[i].dtype) - assert single_grad_partition.numel() == self.partition_size[i], \ - "averaged gradients have different number of elements that partition size {} {} {} {}".format(single_grad_partition.numel(), self.partition_size[i], i, partition_id) - - self.single_partition_of_fp32_groups[i].grad = single_grad_partition - #release all the gradient since we have already created a necessary copy in dp_grad_partition - self.free_grad_in_param_list(self.params_in_partition[i]) - - self.averaged_gradients[i] = None - - single_partition_grad_groups.append(single_grad_partition) - - self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) - #torch.set_num_threads(12) - timers('optimizer_step').start() - - #self.optimizer.step(fp16_param_groups=self.parallel_partitioned_fp16_groups) - self.optimizer.step() - #get rid of the fp32 gradients. 
Not needed anymore - if not self.cpu_offload: - for group in self.single_partition_of_fp32_groups: - group.grad = None - - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - fp16_partitions[partition_id].data.copy_(fp32_partition.data) - - timers('optimizer_step').stop() - - if self.cpu_offload: - self.reset_cpu_buffers() - - timers('optimizer_allgather').start() - #gather the updated weights from everyone - for group_id, partitioned_params in enumerate(self.parallel_partitioned_fp16_groups): - - #Sequential AllGather Best of both worlds - dp_world_size = dist.get_world_size(group=self.dp_process_group) - num_shards = max( - 1, - partitioned_params[partition_id].numel() * dp_world_size // - self.allgather_bucket_size) - - shard_size = partitioned_params[partition_id].numel() // num_shards - num_elements = shard_size - - assert shard_size * num_shards <= partitioned_params[partition_id].numel() - - for shard_id in range(num_shards): - - if shard_id == (num_shards - 1): - num_elements = partitioned_params[partition_id].numel( - ) - shard_id * shard_size - - shard_list = [] - for dp_id in range(dp_world_size): - curr_shard = partitioned_params[dp_id].narrow( - 0, - shard_id * shard_size, - num_elements).detach() - shard_list.append(curr_shard) - - dist.all_gather(shard_list, - shard_list[partition_id], - group=self.dp_process_group) - timers('optimizer_allgather').stop() - - # TODO: we probably don't need this? just to be safe - for i in range(len(norm_groups)): - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) - for p, q in zip(self.fp16_groups[i], updated_params): - p.data = q.data - - see_memory_usage('After zero_optimizer step') - return - - def unscale_and_clip_grads(self, grad_groups_flat, norm_groups): - total_norm = 0.0 - for norm in norm_groups: - total_norm += norm**2.0 - total_norm = math.sqrt(total_norm) - - # compute combined scale factor for this group - combined_scale = self.loss_scale - if self.clip_grad > 0.: - # norm is in fact norm*scale - clip = ((total_norm / self.loss_scale) + 1e-6) / self.clip_grad - if clip > 1: - combined_scale = clip * self.loss_scale - - for grad in grad_groups_flat: - if isinstance(grad, list): - sub_partitions = grad - for g in sub_partitions: - g.data.mul_(1. / combined_scale) - else: - grad.data.mul_(1. 
/ combined_scale) - - def _check_overflow(self, partition_gradients=True): - self.overflow = self.has_overflow(partition_gradients) - - # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params, is_grad_list=False): - for p in params: - if p.grad is not None and self._has_inf_or_nan(p.grad.data): - return True - - return False - - def has_overflow_partitioned_grads_serial(self): - for i in range(len(self.fp16_groups)): - for j, grad in enumerate(self.averaged_gradients[i]): - if grad is not None and self._has_inf_or_nan(grad.data, j): - return True - return False - - def has_overflow(self, partition_gradients=True): - if partition_gradients: - overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial( - ) - overflow_gpu = torch.cuda.ByteTensor([overflow]) - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) - - else: - params = [] - for group in self.fp16_groups: - for param in group: - params.append(param) - - overflow = self.has_overflow_serial(params, is_grad_list=partition_gradients) - overflow_gpu = torch.cuda.ByteTensor([overflow]) - - # Since each model parallel GPU carries only part of the model, - # make sure overflow flag is synced across all the model parallel GPUs - self._model_parallel_all_reduce(tensor=overflow_gpu, - op=torch.distributed.ReduceOp.MAX) - - overflow = overflow_gpu[0].item() - return bool(overflow) - - # `x` is a torch.Tensor - @staticmethod - def _has_inf_or_nan(x, j=None): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. - if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - def backward(self, loss, retain_graph=False): - """ - :attr:`backward` performs the following steps: - - 1. fp32_loss = loss.float() - 2. scaled_loss = fp32_loss*loss_scale - 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves - """ - self.micro_step_id += 1 - if self.cpu_offload: - torch.cuda.current_stream().wait_stream(self.migration_stream) - - if self.contiguous_gradients: - self.ipg_buffer = [] - buf_0 = torch.empty(int(self.reduce_bucket_size * 4.5), - dtype=torch.half, - device=torch.cuda.current_device()) - self.ipg_buffer.append(buf_0) - - # Use double buffers to avoid data access conflict when overlap_comm is enabled. 
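# Illustration of the overflow test used by _has_inf_or_nan above: summing a tensor that
# contains an Inf or NaN yields a non-finite scalar, so a single float() conversion screens
# an entire gradient. The helper below is a simplified stand-in; it skips the RuntimeError
# handling that the original keeps for older fp16 builds.
import math
import torch

def has_inf_or_nan(x):
    cpu_sum = float(x.float().sum())   # .float() avoids low-precision accumulation
    return math.isinf(cpu_sum) or math.isnan(cpu_sum)

clean = torch.randn(8)
spiked = clean.clone()
spiked[3] = float('inf')
print(has_inf_or_nan(clean), has_inf_or_nan(spiked))   # False True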
- if self.overlap_comm: - buf_1 = torch.empty(int(self.reduce_bucket_size * 4.5), - dtype=torch.half, - device=torch.cuda.current_device()) - self.ipg_buffer.append(buf_1) - self.ipg_index = 0 - - self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) - - def check_overflow(self, partition_gradients=True): - self._check_overflow(partition_gradients) - - def _update_scale(self, has_overflow=False): - self.loss_scaler.update_scale(has_overflow) - - # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" - def _get_state(self): - return self.optimizer.state - - def _set_state(self, value): - self.optimizer.state = value - - state = property(_get_state, _set_state) - - # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" - # (for example, to adjust the learning rate) - def _get_param_groups(self): - return self.optimizer.param_groups - - def _set_param_groups(self, value): - self.optimizer.param_groups = value - - param_groups = property(_get_param_groups, _set_param_groups) - - # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" - def _get_loss_scale(self): - return self.loss_scaler.loss_scale - - def _set_loss_scale(self, value): - self.loss_scaler.cur_scale = value - - loss_scale = property(_get_loss_scale, _set_loss_scale) - cur_scale = property(_get_loss_scale, _set_loss_scale) - - # Return group tensor after removing paddings that are added for alignment to DP world size. - # This method works on the assumption that each group contains a single flattened tensor. - def _get_groups_without_padding(self, groups_with_padding): - groups_without_padding = [] - for i, group in enumerate(groups_with_padding): - lean_length = group.numel() - self.groups_padding[i] - groups_without_padding.append(group[:lean_length]) - - return groups_without_padding - - # Return optimizer state after removing paddings that are added for alignment. - def _get_state_without_padding(self, state_with_padding, padding): - lean_state = {} - for key, value in state_with_padding.items(): - if torch.is_tensor(value): - lean_length = value.numel() - padding - lean_state[key] = value[:lean_length] - else: - lean_state[key] = value - - return lean_state - - # Return base optimizer states. - # This method assumes that each param group contains a single flattened tensor. - def _get_base_optimizer_state(self): - optimizer_groups_state = [] - for i, group in enumerate(self.optimizer.param_groups): - p = group['params'][0] - lean_optimizer_state = self._get_state_without_padding( - self.optimizer.state[p], - self.groups_padding[i]) - optimizer_groups_state.append(lean_optimizer_state) - - return optimizer_groups_state - - def state_dict(self): - """ - Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. 
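# Illustration of the padding round trip behind groups_padding and
# _get_groups_without_padding / _get_state_without_padding above: each flattened group is
# padded so its length divides the DP world size, and the padding is stripped again before
# a partition is written into a checkpoint so it can be reloaded under a different
# alignment. Helper names below are hypothetical.
import torch

def pad_to_alignment(flat, alignment):
    remainder = flat.numel() % alignment
    if remainder == 0:
        return flat, 0
    padding = alignment - remainder
    return torch.cat([flat, torch.zeros(padding, dtype=flat.dtype)]), padding

def strip_padding(flat, padding):
    return flat[:flat.numel() - padding] if padding else flat

weights = torch.arange(10, dtype=torch.float32)
padded, padding = pad_to_alignment(weights, alignment=4)   # 10 -> 12 elements
assert torch.equal(strip_padding(padded, padding), weights)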
- Example:: - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - state_dict = {} - state_dict['loss_scaler'] = self.loss_scaler - state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale - state_dict['overflow'] = self.overflow - state_dict['base_optimizer_state'] = self._get_base_optimizer_state() - - state_dict['zero_stage'] = ZERO_OPTIMIZATION_GRADIENTS - state_dict['partition_count'] = self.partition_count - - # Remove paddings for DP alignment to enable loading for other alignment values - fp32_groups_without_padding = self._get_groups_without_padding( - self.single_partition_of_fp32_groups) - state_dict['single_partition_of_fp32_groups'] = fp32_groups_without_padding - - # if self.cpu_offload: - # state_dict_tmp = async_copy_to(state_dict, - # 'cpu', - # torch.cuda.current_stream()) - # state_dict = state_dict_tmp - - return state_dict - - # Restore base optimizer fp32 weights from checkpoint by: - # 1) Merging fp32 weights from checkpoints of all partitions - # 2) Extracting fp32 weights for current partition from merged weights - # 3) Using extracted weights to update base optimizer weights directly. - def _restore_from_fp32_weights(self, all_state_dict): - partition_id = dist.get_rank(group=self.dp_process_group) - merged_single_partition_of_fp32_groups = [] - for i in range(len(self.single_partition_of_fp32_groups)): - merged_partitions = [ - sd['single_partition_of_fp32_groups'][i] for sd in all_state_dict - ] - flat_merged_partitions = flatten_dense_tensors_aligned( - merged_partitions, - dist.get_world_size(group=self.dp_process_group)) - dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) - merged_single_partition_of_fp32_groups.append(dp_partitions[partition_id]) - - for current, saved in zip(self.single_partition_of_fp32_groups, merged_single_partition_of_fp32_groups): - current.data.copy_(saved.data) - - # Restore base optimizer fp32 weights from ZeRO fp16 weights - def _restore_from_fp16_weights(self): - partition_id = dist.get_rank(group=self.dp_process_group) - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - fp32_partition.data.copy_(fp16_partitions[partition_id].data) - - # Refresh the fp32 master params from the fp16 copies. - def refresh_fp32_params(self): - self._restore_from_fp16_weights() - - # Extract optimizer state for current partition from merged states of all partitions - def _partition_base_optimizer_state(self, state_key, all_partition_states): - partition_id = dist.get_rank(group=self.dp_process_group) - alignment = dist.get_world_size(group=self.dp_process_group) - if torch.is_tensor(all_partition_states[0]): - flat_merged_partitions = flatten_dense_tensors_aligned( - all_partition_states, - alignment) - dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) - return dp_partitions[partition_id] - else: - # Assume non-tensor states are not partitioned and equal across ranks, so return first one - return all_partition_states[0] - - # Restore base optimizer state from checkpoint by - # 1) Merging optimizer state from checkpoints of all partitions - # 2) Extracting optimizer state for current partition from the merged state - # 3) Using the extracted value to directly update the base optimizer. 
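# Illustration of the elastic reload performed by the restore helpers above
# (_restore_from_fp32_weights, _partition_base_optimizer_state): the fp32 partitions saved
# by all old ranks are concatenated and then re-split by the *current* DP world size, so
# the GPU count may change between save and load. Simplified to 1-D tensors; the real code
# also re-applies alignment padding via flatten_dense_tensors_aligned. Names are hypothetical.
import torch

def repartition(saved_partitions, new_world_size):
    merged = torch.cat(saved_partitions)                  # merge old per-rank partitions
    base, remainder = divmod(merged.numel(), new_world_size)
    parts, start = [], 0
    for rank in range(new_world_size):                    # near-equal re-split
        size = base + 1 if rank < remainder else base
        parts.append(merged.narrow(0, start, size))
        start += size
    return parts

saved = [torch.arange(0, 6, dtype=torch.float32),         # partition saved by old rank 0
         torch.arange(6, 12, dtype=torch.float32)]        # partition saved by old rank 1
print([p.tolist() for p in repartition(saved, new_world_size=4)])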
- def _restore_base_optimizer_state(self, all_state_dict): - base_optimizer_group_states = [] - for i in range(len(self.optimizer.param_groups)): - partition_states = {} - all_partition_group_states = [ - sd['base_optimizer_state'][i] for sd in all_state_dict - ] - for key in all_partition_group_states[0].keys(): - all_partition_states = [ - all_states[key] for all_states in all_partition_group_states - ] - partition_states[key] = self._partition_base_optimizer_state( - key, - all_partition_states) - base_optimizer_group_states.append(partition_states) - - for i, group in enumerate(self.optimizer.param_groups): - p = group['params'][0] - for key, saved in base_optimizer_group_states[i].items(): - if torch.is_tensor(self.optimizer.state[p][key]): - self.optimizer.state[p][key].data.copy_(saved.data) - else: - self.optimizer.state[p][key] = saved - - def load_state_dict(self, - state_dict_list, - load_optimizer_states=True, - load_from_fp32_weights=False): - r"""Loading ZeRO checkpoint - - Arguments: - state_dict_list: List of all saved ZeRO checkpoints, one for each saved partition. - Note that the number of saved partitions may differ from number of loading partitions to support - changing GPU count, specifically DP world size, between saving and loading checkpoints. - load_optimizer_states: Boolean indicating whether or not to load base optimizer states - load_from_fp32_weights: Boolean indicating whether to initialize fp32 master weights from fp32 - copies in checkpoints (no precision loss) or from model's fp16 copies (with precision loss). - """ - """ - Loads a state_dict created by an earlier call to state_dict(). - If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, - whose parameters in turn came from ``model``, it is expected that the user - will call ``model.load_state_dict()`` before - ``fp16_optimizer_instance.load_state_dict()`` is called. - Example:: - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - checkpoint = torch.load("saved.pth") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - """ - # I think it should actually be ok to reload the optimizer before the model. - self.loss_scaler = state_dict_list[0]['loss_scaler'] - self.dynamic_loss_scale = state_dict_list[0]['dynamic_loss_scale'] - self.overflow = state_dict_list[0]['overflow'] - - if load_optimizer_states: - self._restore_base_optimizer_state(state_dict_list) - - # At this point, the optimizer's references to the model's fp32 parameters are up to date. - # The optimizer's hyperparameters and internal buffers are also up to date. - # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still - # out of date. There are two options. - # 1: Refresh the master params from the model's fp16 params. - # This requires less storage but incurs precision loss. - # 2: Save and restore the fp32 master copies separately. - # We choose option 1 if changing DP degree and option 2 otherwise. - # - # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device - # of their associated parameters, because it's possible those buffers might not exist yet in - # the current optimizer instance. 
In our case, as long as the current FP16_Optimizer has been - # constructed in the same way as the one whose state_dict we are loading, the same master params - # are guaranteed to exist, so we can just copy_() from the saved master params. - - if load_from_fp32_weights: - self._restore_from_fp32_weights(state_dict_list) - else: - self._restore_from_fp16_weights() - - -def _handle_overflow(cpu_sum, x, i): - import math - rank = torch.distributed.get_rank() - if rank == 0: - t_i = -1 - for v_i, v in enumerate(x.data.contiguous().view(-1)): - if not math.isfinite(float(v)): - t_i = v_i - break - logger.info( - f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" - ) +''' +Copyright 2019 The Microsoft DeepSpeed Team +''' + +import torch +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.distributed.distributed_c10d import _get_global_rank +import torch.distributed as dist +import math +from torch._six import inf +from torch.autograd import Variable + +import collections + +from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler +from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter +from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS +from deepspeed.utils import logger +#Toggle this to true to enable correctness test +#with gradient partitioning and without +pg_correctness_test = False + +try: + from apex_C import flatten + from apex_C import unflatten +except ImportError: + try: + _ = warned_flatten + except NameError: + logger.warning( + "apex was installed without --cpp_ext. Falling back to Python flatten and unflatten." + ) + warned_flatten = True + from torch._utils import _flatten_dense_tensors as flatten + from torch._utils import _unflatten_dense_tensors as unflatten + + +def input(msg): + return + + +def split_half_float_double(tensors): + dtypes = [ + "torch.cuda.HalfTensor", + "torch.cuda.FloatTensor", + "torch.cuda.DoubleTensor" + ] + buckets = [] + for i, dtype in enumerate(dtypes): + bucket = [t for t in tensors if t.type() == dtype] + if bucket: + buckets.append(bucket) + return buckets + + +def isclose(a, b, rtol=1e-09, atol=0.0): + return abs(a - b) <= max(rtol * max(abs(a), abs(b)), atol) + + +def lcm(x, y): + from fractions import gcd # or can import gcd from `math` in Python 3 + return x * y // gcd(x, y) + + +# create a flat tensor aligned at the alignment boundary +def flatten_dense_tensors_aligned(tensor_list, alignment): + num_elements = 0 + for tensor in tensor_list: + num_elements = num_elements + tensor.numel() + + remaining = num_elements % alignment + + if remaining: + elements_to_add = alignment - remaining + pad_tensor = torch.zeros(elements_to_add, + device=tensor_list[0].device, + dtype=tensor_list[0].dtype) + padded_tensor_list = tensor_list + [pad_tensor] + + num_elements = num_elements + elements_to_add + else: + padded_tensor_list = tensor_list + + return _flatten_dense_tensors(padded_tensor_list) + + +def get_alignment_padding(tensor_list, alignment): + num_elements = sum([tensor.numel() for tensor in tensor_list]) + remainder = num_elements % alignment + return (alignment - remainder) if remainder else remainder + + +def move_to_cpu(tensor_list): + for tensor in tensor_list: + tensor.data = tensor.data.cpu() + + +def print_rank_msg(msg): + print(f"rank {dist.get_rank()} - {msg}") + + +class FP16_DeepSpeedZeroOptimizer(object): + """ + DeepSpeedZeroOptimizer designed to reduce the memory footprint + required for training large 
deep learning models. + + For more details, please see ZeRO: Memory Optimizations Toward Training Trillion Parameter Models + https://arxiv.org/abs/1910.02054 + + For usage examples, refer to TODO: DeepSpeed Tutorial + + """ + def __init__(self, + init_optimizer, + timers, + static_loss_scale=1.0, + dynamic_loss_scale=False, + dynamic_loss_args=None, + verbose=True, + contiguous_gradients=True, + reduce_bucket_size=500000000, + allgather_bucket_size=5000000000, + dp_process_group=None, + reduce_scatter=True, + overlap_comm=False, + cpu_offload=False, + mpu=None, + clip_grad=0.0, + allreduce_always_fp32=False, + postscale_gradients=True, + gradient_predivide_factor=1.0, + gradient_accumulation_steps=1): + + if dist.get_rank() == 0: + logger.info(f"Reduce bucket size {reduce_bucket_size}") + logger.info(f"Allgather bucket size {allgather_bucket_size}") + logger.info(f"CPU Offload: {cpu_offload}") + # The fused optimizer does all the work. We need this layer for two reasons: + # 1. maintain same user API from apex.fp16_utils + # 2. keep common stuff here in case we need to add new fused optimizer later + + # differences from apex.fp16_utils: + # - assume all model params in fp16 + # - assume all params require grad + # - flat by groups, not keeping state. TODO: remove state explicitly? + # - master grad and unflat master weight never exist. TODO: a way to save out unflat master? + if not torch.cuda.is_available(): + raise SystemError("Cannot use fp16 without CUDA.") + self.optimizer = init_optimizer + + self.timers = timers + + self.reduce_scatter = reduce_scatter + + self.overlap_comm = overlap_comm or cpu_offload + + self.cpu_offload = cpu_offload + self.device = torch.cuda.current_device() if not self.cpu_offload else 'cpu' + + self.dp_process_group = dp_process_group + + self.partition_count = dist.get_world_size(group=self.dp_process_group) + + if mpu is None: + self.model_parallel_group = None + self.model_parallel_rank = 0 + else: + self.model_parallel_group = mpu.get_model_parallel_group() + self.model_parallel_rank = mpu.get_model_parallel_rank() + + self.overflow = False + self.clip_grad = clip_grad + self.allreduce_always_fp32 = allreduce_always_fp32 + self.gradient_predivide_factor = gradient_predivide_factor + self.postscale_gradients = postscale_gradients + self.gradient_accumulation_steps = gradient_accumulation_steps + self.micro_step_id = 0 + + if self.reduce_scatter: + assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled" + assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-2 with reduce scatter enabled" + assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-2 with reduce scatter enabled" + + # param flattened by groups + self.fp16_groups = [] + self.fp16_groups_flat = [] + + #param partitioned by data parallel degree + #this will contain a list of equal sized tensors + #each of which will be updated by a different process + self.parallel_partitioned_fp16_groups = [] + + #a single 32-bit partition of the parallel partitioned parameters + #that this process will update + self.single_partition_of_fp32_groups = [] + + #param partition info + + #These are the parameters in each group that will not be updated by this process directly + self.params_not_in_partition = [] + + #These are the parameters that will be updated by this process directly + self.params_in_partition = [] + + #Offset from the first parameter in
the self.params_in_partition + #the parameter boundaries may not align with partition boundaries + #so we need to keep track of the offset + self.first_offset = [] + + #number of elements per partition in each group + self.partition_size = [] + + partition_id = dist.get_rank(group=self.dp_process_group) + + self.all_reduce_print = False + + # padding on each partition for alignment purposes + self.groups_padding = [] + # loop to deal with groups + for i, param_group in enumerate(self.optimizer.param_groups): + # push this group to list before modify + self.fp16_groups.append(param_group['params']) + # Record padding required to align group to world size + if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: + padding = get_alignment_padding(self.fp16_groups[i], + self.partition_count) + else: + padding = 0 + self.groups_padding.append(padding) + + #not sure why apex was cloning the weights before flattening + #removing cloning here + + see_memory_usage(f"Before moving param group {i} to CPU") + #move all the parameters to cpu to free up GPU space for creating flat buffer + move_to_cpu(self.fp16_groups[i]) + see_memory_usage(f"After moving param group {i} to CPU") + + #create flat buffer in CPU and move to GPU + self.fp16_groups_flat.append( + flatten_dense_tensors_aligned( + self.fp16_groups[i], + dist.get_world_size(group=self.dp_process_group)).cuda( + torch.cuda.current_device())) + see_memory_usage(f"After flattening and moving param group {i} to GPU") + + if dist.get_rank(group=self.dp_process_group) == 0: + see_memory_usage( + f"After Flattening and after emptying param group {i} cache") + + # set model fp16 weight to slices of flattened buffer + updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], + self.fp16_groups[i]) + for p, q in zip(self.fp16_groups[i], updated_params): + p.data = q.data + + #divide the flat weights into near equal paritition equal to the data parallel degree + #each process will compute on a different part of the partition + data_parallel_partitions = self.get_data_parallel_partitions( + self.fp16_groups_flat[i]) + self.parallel_partitioned_fp16_groups.append(data_parallel_partitions) + + # a partition of the fp32 master weights that will be updated by this process + self.single_partition_of_fp32_groups.append( + self.parallel_partitioned_fp16_groups[i][partition_id].to( + self.device).clone().float().detach()) + + # modify optimizer of have flat master weight + self.single_partition_of_fp32_groups[ + i].requires_grad = True # keep this in case internal optimizer uses it + param_group['params'] = [self.single_partition_of_fp32_groups[i]] + + partition_size = len(self.fp16_groups_flat[i]) / dist.get_world_size( + group=self.dp_process_group) + params_in_partition, params_not_in_partition, first_offset = self.get_partition_info(self.fp16_groups[i], partition_size, partition_id) + + self.partition_size.append(partition_size) + self.params_in_partition.append(params_in_partition) + self.params_not_in_partition.append(params_not_in_partition) + self.first_offset.append(first_offset) + + self.reduce_bucket_size = int(reduce_bucket_size) + self.allgather_bucket_size = int(allgather_bucket_size) + + self.reduction_event = torch.cuda.Event(enable_timing=False, blocking=False) + self.reduction_stream = torch.cuda.Stream() + self.cpu_computation_stream = torch.cuda.Stream() + self.migration_stream = torch.cuda.Stream() + self.callback_queued = False + + self.param_dict = {} + + #map between param_id and bool to specify if a param is in 
this partition + self.is_param_in_current_partition = {} + + # CPU-Offload requires contiguous gradients + self.contiguous_gradients = contiguous_gradients or cpu_offload + self.grads_in_ipg_bucket = [] + self.params_in_ipg_bucket = [] + self.elements_in_ipg_bucket = 0 + self.params_already_reduced = [] + self._release_ipg_buffers() + self.previous_reduced_grads = None + + #simplified param id + self.param_id = {} + + largest_param_numel = 0 + count = 0 + for i, params_group in enumerate(self.fp16_groups): + for param in params_group: + unique_id = id(param) + self.param_id[unique_id] = count + self.param_dict[count] = param + self.params_already_reduced.append(False) + if param.numel() > largest_param_numel: + largest_param_numel = param.numel() + count = count + 1 + + for param_group in self.params_in_partition: + for param in param_group: + self.is_param_in_current_partition[self.get_param_id(param)] = True + + for param_group in self.params_not_in_partition: + for param in param_group: + self.is_param_in_current_partition[self.get_param_id(param)] = False + + if self.cpu_offload: + self.accumulated_grads_in_cpu = {} + self.norm_for_param_grads = {} + self.local_overflow = False + self.grad_position = {} + self.temp_grad_buffer_for_cpu_offload = torch.zeros( + largest_param_numel, + device=self.device).half().pin_memory() + self.temp_grad_buffer_for_gpu_offload = torch.zeros( + largest_param_numel, + device=torch.cuda.current_device()).half() + + for i, params_group in enumerate(self.fp16_groups): + self.get_grad_position(i, + self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i]) + + #mapping from parameter to partition that it belongs to + self.param_to_partition_ids = {} + + #stores if a partition has been reduced in this step + self.is_partition_reduced = {} + + #number of grads in partition that still need to be computed + self.remaining_grads_in_partition = {} + + #total number of grads in partition + self.total_grads_in_partition = {} + + #stores if a grad in a partition has been computed or not + self.is_grad_computed = {} + + #stores the offset at which a parameter gradient needs to be inserted in a partition + self.grad_partition_insertion_offset = {} + + #the offset in the gradient at which it must be inserted at the beginning of the paritition + self.grad_start_offset = {} + + #will store the averaged gradients required by this parititon + self.averaged_gradients = {} + + # store index of first parameter in each partition + self.first_param_index_in_partition = {} + + #initializes all data structures for implementing gradient partitioning + self.initialize_gradient_partitioning_data_structures() + + #resets the data structure value for the next backward propagation + self.reset_partition_gradient_structures() + + #creates backward hooks for gradient partitioning + self.create_reduce_and_remove_grad_hooks() + + # we may have a way of fusing dynamic scale. 
Do not support for now + if dynamic_loss_scale: + if dynamic_loss_args is None: + self.loss_scaler = DynamicLossScaler() + else: + self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) + + self.dynamic_loss_scale = True + + else: + self.dynamic_loss_scale = False + self.loss_scaler = LossScaler(scale=static_loss_scale) + self.cur_iter = 0 + + see_memory_usage("Before initializing optimizer states") + self.initialize_optimizer_states() + see_memory_usage("After initializing optimizer states") + + if dist.get_rank() == 0: + logger.info(f"optimizer state initialized") + + if dist.get_rank(group=self.dp_process_group) == 0: + see_memory_usage(f"After initializing ZeRO optimizer") + + def _release_ipg_buffers(self): + if self.contiguous_gradients: + self.ipg_buffer = None + self.grads_in_partition = None + self.grads_in_partition_offset = 0 + + def initialize_optimizer_states(self): + + for i, group in enumerate(self.fp16_groups): + single_grad_partition = torch.zeros( + int(self.partition_size[i]), + dtype=self.single_partition_of_fp32_groups[i].dtype, + device=self.device) + self.single_partition_of_fp32_groups[ + i].grad = single_grad_partition.pin_memory( + ) if self.cpu_offload else single_grad_partition + + self.optimizer.step() + + if not self.cpu_offload: + for group in self.single_partition_of_fp32_groups: + group.grad = None + + return + + ######################################################################### + #########################ZeRO Partition Gradients######################## + ######################################################################### + + def get_first_param_index(self, group_id, param_group, partition_id): + for index, param in enumerate(param_group): + param_id = self.get_param_id(param) + if partition_id in self.param_to_partition_ids[group_id][param_id]: + return index + return None + + def initialize_gradient_partitioning_data_structures(self): + + total_partitions = dist.get_world_size(group=self.dp_process_group) + + for i, param_group in enumerate(self.fp16_groups): + + self.param_to_partition_ids[i] = {} + self.is_partition_reduced[i] = {} + self.total_grads_in_partition[i] = {} + self.remaining_grads_in_partition[i] = {} + self.is_grad_computed[i] = {} + self.grad_partition_insertion_offset[i] = {} + self.grad_start_offset[i] = {} + self.first_param_index_in_partition[i] = {} + + for partition_id in range(total_partitions): + self.is_grad_computed[i][partition_id] = {} + self.grad_partition_insertion_offset[i][partition_id] = {} + self.grad_start_offset[i][partition_id] = {} + self.total_grads_in_partition[i][partition_id] = 0 + self.initialize_gradient_partition(i, param_group, partition_id) + self.is_partition_reduced[i][partition_id] = False + self.first_param_index_in_partition[i][ + partition_id] = self.get_first_param_index( + i, + param_group, + partition_id) + + def independent_gradient_partition_epilogue(self): + self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0) + self.reduce_ipg_grads() + self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0) + + #if dist.get_rank() == 0: + # logger.info("Params already reduced %s", self.params_already_reduced) + for i in range(len(self.params_already_reduced)): + self.params_already_reduced[i] = False + + if self.overlap_comm: + torch.cuda.synchronize() + + if self.cpu_offload is False: + for i, _ in enumerate(self.fp16_groups): + + if not i in self.averaged_gradients or self.averaged_gradients[i] is None: + self.averaged_gradients[i] = 
self.get_flat_partition( + self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i], + dtype=torch.half, + device=torch.cuda.current_device(), + return_tensor_list=True) + else: + avg_new = self.get_flat_partition(self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i], + dtype=torch.half, + device=torch.cuda.current_device(), + return_tensor_list=True) + + for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i],avg_new): + accumulated_grad.add_(new_avg_grad) + + self._release_ipg_buffers() + + # No need to keep the gradients anymore. + # All gradients required by the step + # are in self.averaged_gradients + self.zero_grad() + see_memory_usage(f"End ipg_epilogue") + + # resets all partition to no reduced + # sets remianing grads to the total number of grads in each partition + # set is grad computed to false for all grads in partition + def reset_partition_gradient_structures(self): + total_partitions = dist.get_world_size(group=self.dp_process_group) + for i, _ in enumerate(self.fp16_groups): + for partition_id in range(total_partitions): + self.is_partition_reduced[i][partition_id] = False + self.remaining_grads_in_partition[i][ + partition_id] = self.total_grads_in_partition[i][partition_id] + + for param_id in self.is_grad_computed[i][partition_id]: + self.is_grad_computed[i][partition_id][param_id] = False + + def initialize_gradient_partition(self, i, param_group, partition_id): + def set_key_value_list(dictionary, key, value): + if key in dictionary: + dictionary[key].append(value) + else: + dictionary[key] = [value] + + def increment_value(dictionary, key): + if key in dictionary: + dictionary[key] += 1 + else: + dictionary[key] = 1 + + partition_size = self.partition_size[i] + + start_index = partition_size * partition_id + end_index = partition_size * (partition_id + 1) + + current_index = 0 + first_offset = 0 + + for param in param_group: + + param_size = param.numel() + param_id = self.get_param_id(param) + + if (current_index >= start_index and current_index < end_index): + set_key_value_list(self.param_to_partition_ids[i], + param_id, + partition_id) + increment_value(self.total_grads_in_partition[i], partition_id) + + self.is_grad_computed[i][partition_id][param_id] = False + + self.grad_partition_insertion_offset[i][partition_id][ + param_id] = current_index - start_index + self.grad_start_offset[i][partition_id][param_id] = 0 + + elif start_index > current_index and start_index < (current_index + + param_size): + assert (first_offset==0), "This can happen either zero or only once as this must be the first tensor in the partition" + first_offset = start_index - current_index + + set_key_value_list(self.param_to_partition_ids[i], + param_id, + partition_id) + increment_value(self.total_grads_in_partition[i], partition_id) + + self.is_grad_computed[i][partition_id][param_id] = False + + self.grad_partition_insertion_offset[i][partition_id][param_id] = 0 + self.grad_start_offset[i][partition_id][param_id] = first_offset + + current_index = current_index + param_size + + def overlapping_partition_gradients_reduce_epilogue(self): + self.independent_gradient_partition_epilogue() + + def create_reduce_and_remove_grad_hooks(self): + self.grad_accs = [] + for i, param_group in enumerate(self.fp16_groups): + for param in param_group: + if param.requires_grad: + + def wrapper(param, i): + param_tmp = param.expand_as(param) + grad_acc = param_tmp.grad_fn.next_functions[0][0] + + def reduce_partition_and_remove_grads(*notneeded): 
+ self.reduce_ready_partitions_and_remove_grads(param, i) + + grad_acc.register_hook(reduce_partition_and_remove_grads) + self.grad_accs.append(grad_acc) + + wrapper(param, i) + + def get_param_id(self, param): + unique_id = id(param) + return self.param_id[unique_id] + + def report_ipg_memory_usage(self, tag, param_elems): + elem_count = self.elements_in_ipg_bucket + param_elems + percent_of_bucket_size = (100.0 * elem_count) // self.reduce_bucket_size + see_memory_usage( + f"{tag}: elems in_bucket {self.elements_in_ipg_bucket} param {param_elems} max_percent {percent_of_bucket_size}" + ) + + ###############Independent Partition Gradient ######################## + def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): + if self.elements_in_ipg_bucket + param.numel() > self.reduce_bucket_size: + self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", + param.numel()) + self.reduce_ipg_grads() + if self.contiguous_gradients and self.overlap_comm: + # Swap ipg_index between 0 and 1 + self.ipg_index = 1 - self.ipg_index + self.report_ipg_memory_usage("In ipg_remove_grads after reduce_ipg_grads", + param.numel()) + + param_id = self.get_param_id(param) + + assert self.params_already_reduced[param_id] == False, \ + f"The parameter {param_id} has already been reduced. \ + Gradient computed twice for this partition. \ + Multiple gradient reduction is currently not supported" + + #keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening + if self.contiguous_gradients: + new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow( + 0, + self.elements_in_ipg_bucket, + param.numel()) + new_grad_tensor.copy_(param.grad.view(-1)) + param.grad.data = new_grad_tensor.data.view_as(param.grad) + + self.elements_in_ipg_bucket += param.numel() + self.grads_in_ipg_bucket.append(param.grad) + self.params_in_ipg_bucket.append((i, param, param_id)) + + self.report_ipg_memory_usage("End ipg_remove_grads", 0) + + def print_rank_0(self, message): + if dist.get_rank() == 0: + logger.info(message) + + def gradient_reduction_w_predivide(self, tensor): + dp_world_size = dist.get_world_size(group=self.dp_process_group) + + tensor_to_allreduce = tensor + + if self.allreduce_always_fp32: + tensor_to_allreduce = tensor.float() + + if self.postscale_gradients: + if self.gradient_predivide_factor != 1.0: + tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor) + + dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) + + if self.gradient_predivide_factor != dp_world_size: + tensor_to_allreduce.mul_(self.gradient_predivide_factor / + dp_world_size) + else: + tensor_to_allreduce.div_(dp_world_size) + dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) + + if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce: + tensor.copy_(tensor_to_allreduce) + + return tensor + + def average_tensor(self, tensor): + if self.overlap_comm: + torch.cuda.synchronize() + stream = self.reduction_stream + else: + stream = torch.cuda.current_stream() + + with torch.cuda.stream(stream): + if not self.reduce_scatter: + self.gradient_reduction_w_predivide(tensor) + return + + # Accumulate destination ranks and bucket offsets for each gradient slice. + # Note: potential future optimization, record access pattern of parameters + # in backward pass and partition gradients w.r.t. access pattern so that our + # bucket is guaranteed to be contiguous w.r.t.
ranks + rank_and_offsets = [] + curr_size = 0 + prev_id = -1 + for i, param, param_id in self.params_in_ipg_bucket: + partition_ids = self.param_to_partition_ids[i][param_id] + partition_size = self.partition_size[i] + # Get all partition ids + their offsets + partition_ids_w_offsets = [] + for partition_id in partition_ids: + offset = self.grad_start_offset[i][partition_id][param_id] + partition_ids_w_offsets.append((partition_id, offset)) + partition_ids_w_offsets.sort(key=lambda t: t[1]) + + # Calculate rank and offsets for grad slices + for idx in range(len(partition_ids_w_offsets)): + partition_id, offset = partition_ids_w_offsets[idx] + + # Calculate numel for grad slice depending on partition location + if idx == len(partition_ids_w_offsets) - 1: + # Last partition_id uses its own offset + numel = param.numel() - offset + else: + # Set numel to next partition's offset + numel = partition_ids_w_offsets[idx + 1][1] - offset + + # Merge bucket ranges if they belong to the same rank + if partition_id == prev_id: + prev_pid, prev_size, prev_numel = rank_and_offsets[-1] + rank_and_offsets[-1] = (prev_pid, prev_size, prev_numel + numel) + else: + rank_and_offsets.append((partition_id, curr_size, numel)) + + curr_size += numel + prev_id = partition_id + tensor.div_(dist.get_world_size(group=self.dp_process_group)) + + async_handles = [] + for dst, bucket_offset, numel in rank_and_offsets: + grad_slice = tensor.narrow(0, int(bucket_offset), int(numel)) + dst_rank = _get_global_rank(self.dp_process_group, dst) + async_handle = dist.reduce(grad_slice, + dst=dst_rank, + group=self.dp_process_group, + async_op=True) + async_handles.append(async_handle) + + for handle in async_handles: + handle.wait() + + ############################################################################## + ############################# CPU Offload Methods############################# + ############################################################################## + def get_grad_position(self, group_id, tensor_list, first_offset, partition_size): + current_offset = 0 + + for i, tensor in enumerate(tensor_list): + param_id = self.get_param_id(tensor) + param_start_offset = 0 + + num_elements = tensor.numel() + tensor_offset = 0 + + #we need to offset to get to the right element + if i == 0 and first_offset > 0: + tensor_offset = first_offset + num_elements = num_elements - tensor_offset + param_start_offset = first_offset + + #we dont need all elements of the tensor + if num_elements > (partition_size - current_offset): + num_elements = partition_size - current_offset + + self.grad_position[param_id] = [ + int(group_id), + int(param_start_offset), + int(current_offset), + int(num_elements) + ] + current_offset += num_elements + + def update_overflow_tracker_for_param_grad(self, param): + if param.grad is not None and self._has_inf_or_nan(param.grad.data): + self.local_overflow = True + + def async_accumulate_grad_in_cpu(self, param): + param_id = self.get_param_id(param) + + #copy to a preexisiting buffer to avoid memory allocation penalty + dest_buffer = self.temp_grad_buffer_for_cpu_offload.view(-1).narrow( + 0, + 0, + param.numel()) + dest_buffer.copy_(param.grad.view(-1), non_blocking=True) + + if param_id not in self.accumulated_grads_in_cpu: + self.accumulated_grads_in_cpu[param_id] = torch.zeros( + param.numel(), + dtype=param.dtype, + device=self.device).pin_memory() + + self.accumulated_grads_in_cpu[param_id].add_(dest_buffer) + + def async_accumulate_grad_in_cpu_via_gpu(self, param): + param_id = 
self.get_param_id(param) + + #copy to a preexisiting buffer to avoid memory allocation penalty + dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow( + 0, + 0, + param.numel()) + + if param_id not in self.accumulated_grads_in_cpu: + self.accumulated_grads_in_cpu[param_id] = torch.zeros( + param.numel(), + dtype=param.dtype, + device=self.device).pin_memory() + + if self.micro_step_id > 0: + dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1), + non_blocking=True) + param.grad.data.view(-1).add_(dest_buffer) + + #at the boundary we will send 32bit directly + if not self.is_gradient_accumulation_boundary: + self.accumulated_grads_in_cpu[param_id].data.copy_(param.grad.data.view(-1), + non_blocking=True) + + def set_norm_for_param_grad(self, param): + param_id = self.get_param_id(param) + accumulated_grad = self.accumulated_grads_in_cpu[ + param_id] if self.gradient_accumulation_steps > 1 else param.grad + + [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] + + start = source_offset + accumulated_grad = accumulated_grad.view(-1).narrow(0, start, num_elements) + + self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm(2) + + def set_norm_for_param_grad_in_gpu(self, param): + param_id = self.get_param_id(param) + accumulated_grad = param.grad + + [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] + + start = source_offset + accumulated_grad = accumulated_grad.view(-1).narrow(0, start, num_elements) + + self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm(2) + + def async_inplace_copy_grad_to_fp32_buffer(self, param): + param_id = self.get_param_id(param) + + [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] + + dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( + 0, + dest_offset, + num_elements) + if self.gradient_accumulation_steps > 1: + src_tensor = self.accumulated_grads_in_cpu[param_id].view(-1).narrow( + 0, + source_offset, + num_elements) + else: + src_tensor = param.grad.view(-1).narrow(0, + source_offset, + num_elements).float() + dest_tensor.copy_(src_tensor, non_blocking=True) + + def async_inplace_copy_grad_to_fp32_buffer_from_gpu(self, param): + param_id = self.get_param_id(param) + + [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] + + dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( + 0, + dest_offset, + num_elements) + + src_tensor = param.grad.view(-1).narrow(0, source_offset, num_elements).float() + dest_tensor.copy_(src_tensor, non_blocking=True) + param.grad = None + + def complete_grad_norm_calculation_for_cpu_offload(self, params): + total_norm = 0.0 + norm_type = 2.0 + for p in params: + if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): + param_id = self.get_param_id(p) + param_norm = self.norm_for_param_grads[param_id] + total_norm += param_norm.item()**2 + + # Sum across all model parallel GPUs. + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + + torch.distributed.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.SUM, + group=self.dp_process_group) + + self._model_parallel_all_reduce(tensor=total_norm_cuda, + op=torch.distributed.ReduceOp.SUM) + + total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) + + if total_norm == float( + 'inf') or total_norm == -float('inf') or total_norm != total_norm: + total_norm = -1 + + return total_norm + + ############################################################################################ + + def copy_grads_in_partition(self, param): + if self.cpu_offload: + + if self.gradient_accumulation_steps > 1: + self.async_accumulate_grad_in_cpu_via_gpu(param) + + if self.is_gradient_accumulation_boundary: + self.set_norm_for_param_grad_in_gpu(param) + + self.update_overflow_tracker_for_param_grad(param) + + self.async_inplace_copy_grad_to_fp32_buffer_from_gpu(param) + + return + #print(f"ID {self.get_param_id(param)} grad norm {param.grad.norm()}") + if self.grads_in_partition is None: + self.grads_in_partition_offset = 0 + total_size = 0 + for group in self.params_in_partition: + for param_in_partition in group: + total_size += param_in_partition.numel() + + see_memory_usage(f"before copying {total_size} gradients into partition") + self.grads_in_partition = torch.empty(int(total_size), + dtype=torch.half, + device=torch.cuda.current_device()) + see_memory_usage(f"after copying {total_size} gradients into partition") + + #The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer + new_grad_tensor = self.grads_in_partition.view(-1).narrow( + 0, + self.grads_in_partition_offset, + param.numel()) + new_grad_tensor.copy_(param.grad.view(-1)) + param.grad.data = new_grad_tensor.data.view_as(param.grad) + #print(f"Grad norm after copy to contiguous_buffer {param.grad.data.norm()}") + self.grads_in_partition_offset += param.numel() + + def reduce_ipg_grads(self): + if self.overlap_comm: + stream = self.reduction_stream + else: + stream = torch.cuda.current_stream() + + if self.contiguous_gradients: + self.average_tensor(self.ipg_buffer[self.ipg_index]) + else: + self.buffered_reduce_fallback( + None, + self.grads_in_ipg_bucket, + elements_per_buffer=self.elements_in_ipg_bucket) + + with torch.cuda.stream(stream): + for _, param, param_id in self.params_in_ipg_bucket: + self.params_already_reduced[param_id] = True + + if not self.is_param_in_current_partition[param_id]: + if self.overlap_comm and self.contiguous_gradients is False: + # Clear the previous grads during the next reduction + # to avoid clearing them before the reduction is complete. 
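A note on the deferral handled just below: with overlap_comm the reduction of this bucket runs on a side stream, so a gradient that an in-flight reduce may still be reading cannot be released immediately; it is parked in previous_reduced_grads and only freed once a later synchronize guarantees the reduce has finished. A minimal CPU-only model of that bookkeeping (the helper and names are illustrative, not part of the patch):

import torch

pending = None  # stand-in for self.previous_reduced_grads

def reduce_bucket_async(param):
    # Models only the lifetime bookkeeping around the asynchronous reduce:
    # free the previous bucket's grads (their reduce has completed by now)
    # and keep the current grad alive until the next call.
    global pending
    if pending is not None:
        for p in pending:
            p.grad = None
    pending = [param]

a = torch.nn.Parameter(torch.randn(4)); a.grad = torch.randn(4)
b = torch.nn.Parameter(torch.randn(4)); b.grad = torch.randn(4)
reduce_bucket_async(a)   # a.grad must stay alive while its reduce is in flight
reduce_bucket_async(b)   # a's reduce is done: a.grad is released, b.grad is now pending
assert a.grad is None and b.grad is not None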
+ if self.previous_reduced_grads is None: + self.previous_reduced_grads = [] + self.previous_reduced_grads.append(param) + else: + param.grad = None + elif self.contiguous_gradients: + self.copy_grads_in_partition(param) + + self.grads_in_ipg_bucket = [] + self.params_in_ipg_bucket = [] + self.elements_in_ipg_bucket = 0 + ##################################################################### + + def reduce_ready_partitions_and_remove_grads(self, param, i): + self.reduce_independent_p_g_buckets_and_remove_grads(param, i) + + def zero_reduced_gradients(self, partition_id, i): + def are_all_related_partitions_reduced(params_id): + for partition_id in self.param_to_partition_ids[i][params_id]: + if not self.is_partition_reduced[i][partition_id]: + return False + return True + + for params_id in self.is_grad_computed[i][partition_id]: + if are_all_related_partitions_reduced(params_id): + self.param_dict[params_id].grad = None + + def flatten_and_print(self, message, tensors, start=0, n=5): + flatten_tensor = _flatten_dense_tensors(tensors) + + def print_func(): + logger.info(flatten_tensor.contiguous().view(-1).narrow(0, start, n)) + + self.sequential_execution(print_func, message) + + def get_grads_to_reduce(self, i, partition_id): + def get_reducable_portion(key): + grad = self.param_dict[key].grad + total_elements = grad.numel() + start = self.grad_start_offset[i][partition_id][key] + num_elements = min( + total_elements - start, + self.partition_size[i] - + self.grad_partition_insertion_offset[i][partition_id][key]) + if not pg_correctness_test: + if num_elements == total_elements: + return grad + else: + return grad.contiguous().view(-1).narrow(0, + int(start), + int(num_elements)) + else: + if num_elements == total_elements: + return grad.clone() + else: + return grad.clone().contiguous().view(-1).narrow( + 0, + int(start), + int(num_elements)) + + grads_to_reduce = [] + for key in self.is_grad_computed[i][partition_id]: + grad = get_reducable_portion(key) + grads_to_reduce.append(grad) + return grads_to_reduce + + def sequential_execution(self, function, message, group=None): + if group is None: + group = self.dp_process_group + if dist.get_rank(group=group) == 0: + logger.info(message) + for id in range(dist.get_world_size(group=group)): + if id == dist.get_rank(group=group): + function() + dist.barrier(group=group) + + def set_none_gradients_to_zero(self, i, partition_id): + for param_id in self.is_grad_computed[i][partition_id]: + param = self.param_dict[param_id] + if param.grad is None: + param.grad = torch.zero_like(param) + + ######################Reduction Related Methods############################## + + def allreduce_bucket(self, bucket, allreduce_always_fp32=False, rank=None, log=None): + rank = None + tensor = flatten(bucket) + + tensor_to_allreduce = tensor + + if pg_correctness_test: + allreduce_always_fp32 = True + + if allreduce_always_fp32: + tensor_to_allreduce = tensor.float() + + tensor_to_allreduce.div_(dist.get_world_size(group=self.dp_process_group)) + + if rank is None: + # "All Reducing" + dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) + else: + global_rank = _get_global_rank(self.dp_process_group, rank) + dist.reduce(tensor_to_allreduce, global_rank, group=self.dp_process_group) + + if allreduce_always_fp32 and tensor is not tensor_to_allreduce: + if rank is None or rank == dist.get_rank(group=self.dp_process_group): + tensor.copy_(tensor_to_allreduce) + + return tensor + + #if rank is specified do a reduction instead of an allreduce + def 
allreduce_and_copy(self, small_bucket, rank=None, log=None): + if self.overlap_comm: + torch.cuda.synchronize() + if self.previous_reduced_grads is not None: + # previous_reduced_grads has the previous reduced grads, + # now it is safe to clear. + for param in self.previous_reduced_grads: + param.grad = None + self.previous_reduced_grads = None + stream = self.reduction_stream + else: + stream = torch.cuda.current_stream() + + with torch.cuda.stream(stream): + allreduced = self.allreduce_bucket(small_bucket, rank=rank, log=log) + if rank is None or rank == dist.get_rank(group=self.dp_process_group): + for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)): + buf.copy_(synced) + + def allreduce_no_retain(self, + bucket, + numel_per_bucket=500000000, + rank=None, + log=None): + small_bucket = [] + numel = 0 + for tensor in bucket: + small_bucket.append(tensor) + numel = numel + tensor.numel() + if numel > numel_per_bucket: + self.allreduce_and_copy(small_bucket, rank=rank, log=None) + small_bucket = [] + if len(small_bucket) > 0: + self.allreduce_and_copy(small_bucket, rank=rank, log=log) + + #allows using reduction of gradients instead of using all_reduce + def buffered_reduce_fallback(self, + rank, + grads, + elements_per_buffer=500000000, + log=None): + split_buckets = split_half_float_double(grads) + + for i, bucket in enumerate(split_buckets): + self.allreduce_no_retain(bucket, + numel_per_bucket=elements_per_buffer, + rank=rank, + log=log) + + ############################################################################# + ############################################################################# + ############################################################################# + + #views the tensor as multiple partitions and returns + #those partitions + def get_data_parallel_partitions(self, tensor): + partitions = [] + + dp = dist.get_world_size(group=self.dp_process_group) + dp_id = dist.get_rank(group=self.dp_process_group) + + total_num_elements = tensor.numel() + + base_size = total_num_elements // dp + remaining = total_num_elements % dp + + start = 0 + for id in range(dp): + partition_size = base_size + if id < remaining: + partition_size = partition_size + 1 + partitions.append(tensor.narrow(0, start, partition_size)) + start = start + partition_size + return partitions + + def get_partition_info(self, tensor_list, partition_size, partition_id): + params_in_partition = [] + params_not_in_partition = [] + + start_index = partition_size * partition_id + end_index = partition_size * (partition_id + 1) + + current_index = 0 + first_offset = 0 + + for tensor in tensor_list: + + tensor_size = tensor.numel() + + if (current_index >= start_index and current_index < end_index): + params_in_partition.append(tensor) + + elif start_index > current_index and start_index < (current_index + + tensor_size): + params_in_partition.append(tensor) + + assert (first_offset==0), "This can happen either zero or only once as this must be the first tensor in the partition" + first_offset = start_index - current_index + + else: + params_not_in_partition.append(tensor) + + current_index = current_index + tensor_size + + return params_in_partition, params_not_in_partition, first_offset + + def zero_grad(self, set_grads_to_None=True): + """ + Zero FP16 parameter grads. + """ + # FP32 grad should never exist. 
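Stepping back to get_partition_info just above: a parameter belongs to this rank's partition either because it starts inside [start_index, end_index), or because the partition boundary falls inside the parameter, in which case first_offset records how far into that first tensor the partition begins. A small element-count illustration of that rule (function name is illustrative):

def partition_info(sizes, partition_size, partition_id):
    # Simplified version of get_partition_info: returns the indices of tensors
    # owned by this partition and the offset into the first owned tensor.
    start, end = partition_size * partition_id, partition_size * (partition_id + 1)
    owned, first_offset, current = [], 0, 0
    for idx, size in enumerate(sizes):
        if start <= current < end:
            owned.append(idx)
        elif current < start < current + size:
            owned.append(idx)
            first_offset = start - current
        current += size
    return owned, first_offset

# Three tensors of 4, 10 and 6 elements, split across two partitions of 10 elements each:
assert partition_info([4, 10, 6], 10, 0) == ([0, 1], 0)   # rank 0 owns t0 and the head of t1
assert partition_info([4, 10, 6], 10, 1) == ([1, 2], 6)   # rank 1 starts 6 elements into t1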
+ # For speed, set model fp16 grad to None by default + for group in self.fp16_groups: + for p in group: + if set_grads_to_None: + p.grad = None + else: + if p.grad is not None: + p.grad.detach_() + p.grad.zero_() + + def _model_parallel_all_reduce(self, tensor, op): + """ Perform all reduce within model parallel group, if any. + """ + if self.model_parallel_group is None: + torch.distributed.all_reduce(tensor=tensor, op=op) + else: + torch.distributed.all_reduce(tensor=tensor, + op=op, + group=self.model_parallel_group) + + def get_grad_norm_direct(self, gradients, params, norm_type=2): + """Clips gradient norm of an iterable of parameters. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. Note that + the gradients are modified in place. + + Arguments: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + norm_type = float(norm_type) + if norm_type == inf: + total_norm = max(g.data.abs().max() for g in gradients) + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + torch.distributed.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.MAX, + group=self.dp_process_group) + + # Take max across all GPUs. + self._model_parallel_all_reduce(tensor=total_norm_cuda, + op=torch.distributed.ReduceOp.MAX) + total_norm = total_norm_cuda[0].item() + else: + total_norm = 0.0 + #if dist.get_rank() == 0: + # logger.info(f"Total Norm begining {total_norm}") + for g, p in zip(gradients, params): + if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): + param_norm = g.data.double().norm(2) + total_norm += param_norm.item()**2 + # Sum across all model parallel GPUs. + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + + torch.distributed.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.SUM, + group=self.dp_process_group) + + self._model_parallel_all_reduce(tensor=total_norm_cuda, + op=torch.distributed.ReduceOp.SUM) + + total_norm = total_norm_cuda[0].item()**(1. / norm_type) + + if total_norm == float( + 'inf') or total_norm == -float('inf') or total_norm != total_norm: + total_norm = -1 + + return total_norm + + #creates a flat fused tensor from the tensor list starting at the first_offset + #in the first tensor of the list. 
If there are not enough elements in the tensor + #list then the flat tensor will be padded with zeros + def get_flat_partition(self, + tensor_list, + first_offset, + partition_size, + dtype, + device, + return_tensor_list=False): + flat_tensor_list = [] + current_size = 0 + for i, tensor in enumerate(tensor_list): + if tensor.grad is None: + continue + + tensor = tensor.grad + num_elements = tensor.numel() + tensor_offset = 0 + + #we need to offset to get to the right element + if i == 0 and first_offset > 0: + tensor_offset = first_offset + num_elements = num_elements - tensor_offset + + #we dont need all elements of the tensor + if num_elements > (partition_size - current_size): + num_elements = partition_size - current_size + + #we need a narrow view of the tensor based on the tensor offset and number of elements that + #we need from this tensor + if tensor_offset > 0 or num_elements < tensor.numel(): + flat_tensor_list.append(tensor.contiguous().view(-1).narrow( + 0, + int(tensor_offset), + int(num_elements))) + else: + flat_tensor_list.append(tensor) + + current_size = current_size + num_elements + + #this means its the last partition and does not align with the dp boundary. We need to pad before flattening + if current_size < partition_size: + flat_tensor_list.append( + torch.zeros(int(partition_size - current_size), + dtype=dtype, + device=device)) + + if return_tensor_list: + return flat_tensor_list + + return _flatten_dense_tensors(flat_tensor_list) + + def free_grad_in_param_list(self, param_list): + for p in param_list: + p.grad = None + + def reset_cpu_buffers(self): + self.norm_for_param_grads = {} + self.local_overflow = False + + def step(self, closure=None): + """ + Not supporting closure. + """ + self.micro_step_id = -1 + + if self.cpu_offload: + torch.cuda.current_stream().wait_stream(self.migration_stream) + + see_memory_usage(f"In step before checking overflow") + + # First compute norm for all group so we know if there is overflow + self.check_overflow() + + timers = self.timers + + prev_scale = self.loss_scale + self._update_scale(self.overflow) + if self.overflow: + see_memory_usage('After overflow before clearing gradients') + self.zero_grad() + if self.cpu_offload: + self.reset_cpu_buffers() + else: + self.averaged_gradients = {} + + see_memory_usage('After overflow after clearing gradients') + + logger.info( + "[deepscale] OVERFLOW! Rank {} Skipping step. 
Attempted loss scale: {}, " + "reducing to {}".format(dist.get_rank(), + prev_scale, + self.loss_scale)) + timers('optimizer_step').start() + timers('optimizer_step').stop() + timers('optimizer_allgather').start() + timers('optimizer_allgather').stop() + return + + norm_groups = [] + single_partition_grad_groups = [] + skip = False + partition_id = dist.get_rank(group=self.dp_process_group) + for i, group in enumerate(self.fp16_groups): + if self.cpu_offload: + norm_groups.append( + self.complete_grad_norm_calculation_for_cpu_offload( + self.params_in_partition[i])) + single_grad_partition = self.single_partition_of_fp32_groups[i].grad + else: + norm_groups.append( + self.get_grad_norm_direct(self.averaged_gradients[i], + self.params_in_partition[i])) + + #free gradients for all the prameters that are not updated by this process + self.free_grad_in_param_list(self.params_not_in_partition[i]) + + #create a flat gradients for parameters updated by this process + # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors + if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: + single_grad_partition = flatten_dense_tensors_aligned( + self.averaged_gradients[i], + int(self.partition_size[i])).to( + self.single_partition_of_fp32_groups[i].dtype) + else: + single_grad_partition = _flatten_dense_tensors( + self.averaged_gradients[i]).to( + self.single_partition_of_fp32_groups[i].dtype) + assert single_grad_partition.numel() == self.partition_size[i], \ + "averaged gradients have different number of elements that partition size {} {} {} {}".format(single_grad_partition.numel(), self.partition_size[i], i, partition_id) + + self.single_partition_of_fp32_groups[i].grad = single_grad_partition + #release all the gradient since we have already created a necessary copy in dp_grad_partition + self.free_grad_in_param_list(self.params_in_partition[i]) + + self.averaged_gradients[i] = None + + single_partition_grad_groups.append(single_grad_partition) + + self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) + #torch.set_num_threads(12) + timers('optimizer_step').start() + #self.optimizer.step(fp16_param_groups=self.parallel_partitioned_fp16_groups) + self.optimizer.step() + #get rid of the fp32 gradients. 
Not needed anymore + if not self.cpu_offload: + for group in self.single_partition_of_fp32_groups: + group.grad = None + + for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): + fp16_partitions[partition_id].data.copy_(fp32_partition.data) + + timers('optimizer_step').stop() + + if self.cpu_offload: + self.reset_cpu_buffers() + + timers('optimizer_allgather').start() + #gather the updated weights from everyone + for group_id, partitioned_params in enumerate(self.parallel_partitioned_fp16_groups): + + #Sequential AllGather Best of both worlds + dp_world_size = dist.get_world_size(group=self.dp_process_group) + num_shards = max( + 1, + partitioned_params[partition_id].numel() * dp_world_size // + self.allgather_bucket_size) + + shard_size = partitioned_params[partition_id].numel() // num_shards + num_elements = shard_size + + assert shard_size * num_shards <= partitioned_params[partition_id].numel() + + for shard_id in range(num_shards): + + if shard_id == (num_shards - 1): + num_elements = partitioned_params[partition_id].numel( + ) - shard_id * shard_size + + shard_list = [] + for dp_id in range(dp_world_size): + curr_shard = partitioned_params[dp_id].narrow( + 0, + shard_id * shard_size, + num_elements).detach() + shard_list.append(curr_shard) + + dist.all_gather(shard_list, + shard_list[partition_id], + group=self.dp_process_group) + timers('optimizer_allgather').stop() + + # TODO: we probably don't need this? just to be safe + for i in range(len(norm_groups)): + updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], + self.fp16_groups[i]) + for p, q in zip(self.fp16_groups[i], updated_params): + p.data = q.data + + see_memory_usage('After zero_optimizer step') + return + + def unscale_and_clip_grads(self, grad_groups_flat, norm_groups): + total_norm = 0.0 + for norm in norm_groups: + total_norm += norm**2.0 + total_norm = math.sqrt(total_norm) + + # compute combined scale factor for this group + combined_scale = self.loss_scale + if self.clip_grad > 0.: + # norm is in fact norm*scale + clip = ((total_norm / self.loss_scale) + 1e-6) / self.clip_grad + if clip > 1: + combined_scale = clip * self.loss_scale + + for grad in grad_groups_flat: + if isinstance(grad, list): + sub_partitions = grad + for g in sub_partitions: + g.data.mul_(1. / combined_scale) + else: + grad.data.mul_(1. 
/ combined_scale) + + def _check_overflow(self, partition_gradients=True): + self.overflow = self.has_overflow(partition_gradients) + + # `params` is a list / generator of torch.Variable + def has_overflow_serial(self, params, is_grad_list=False): + for p in params: + if p.grad is not None and self._has_inf_or_nan(p.grad.data): + return True + + return False + + def has_overflow_partitioned_grads_serial(self): + for i in range(len(self.fp16_groups)): + for j, grad in enumerate(self.averaged_gradients[i]): + if grad is not None and self._has_inf_or_nan(grad.data, j): + return True + return False + + def has_overflow(self, partition_gradients=True): + if partition_gradients: + overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial( + ) + overflow_gpu = torch.cuda.ByteTensor([overflow]) + torch.distributed.all_reduce(overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=self.dp_process_group) + + else: + params = [] + for group in self.fp16_groups: + for param in group: + params.append(param) + + overflow = self.has_overflow_serial(params, is_grad_list=partition_gradients) + overflow_gpu = torch.cuda.ByteTensor([overflow]) + + # Since each model parallel GPU carries only part of the model, + # make sure overflow flag is synced across all the model parallel GPUs + self._model_parallel_all_reduce(tensor=overflow_gpu, + op=torch.distributed.ReduceOp.MAX) + + overflow = overflow_gpu[0].item() + return bool(overflow) + + # `x` is a torch.Tensor + @staticmethod + def _has_inf_or_nan(x, j=None): + try: + # if x is half, the .float() incurs an additional deep copy, but it's necessary if + # Pytorch's .sum() creates a one-element tensor of the same type as x + # (which is true for some recent version of pytorch). + cpu_sum = float(x.float().sum()) + # More efficient version that can be used if .sum() returns a Python scalar + # cpu_sum = float(x.sum()) + except RuntimeError as instance: + # We want to check if inst is actually an overflow exception. + # RuntimeError could come from a different error. + # If so, we still want the exception to propagate. + if "value cannot be converted" not in instance.args[0]: + raise + return True + else: + if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: + return True + return False + + def backward(self, loss, retain_graph=False): + """ + :attr:`backward` performs the following steps: + + 1. fp32_loss = loss.float() + 2. scaled_loss = fp32_loss*loss_scale + 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves + """ + self.micro_step_id += 1 + if self.cpu_offload: + torch.cuda.current_stream().wait_stream(self.migration_stream) + + if self.contiguous_gradients: + self.ipg_buffer = [] + buf_0 = torch.empty(int(self.reduce_bucket_size * 4.5), + dtype=torch.half, + device=torch.cuda.current_device()) + self.ipg_buffer.append(buf_0) + + # Use double buffers to avoid data access conflict when overlap_comm is enabled. 
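The double-buffer comment above is the heart of overlap_comm: while one ipg_buffer is being reduced on the reduction stream, new gradients are packed into the other, and ipg_index flips between 0 and 1 after each flush (the `self.ipg_index = 1 - self.ipg_index` swap earlier in this file). A CPU-only sketch of the ping-pong, with made-up sizes:

import torch

bucket_size = 16
buffers = [torch.empty(bucket_size), torch.empty(bucket_size)]   # ipg_buffer analogue
index, fill = 0, 0

def flush(buf, n):
    # Stand-in for average_tensor()/reduce launched on the side stream.
    return buf[:n].sum()

for grad in (torch.randn(6), torch.randn(7), torch.randn(5)):
    if fill + grad.numel() > bucket_size:
        flush(buffers[index], fill)       # previous bucket goes off to be reduced
        index, fill = 1 - index, 0        # new grads land in the other buffer
    buffers[index].narrow(0, fill, grad.numel()).copy_(grad)
    fill += grad.numel()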
+ if self.overlap_comm: + buf_1 = torch.empty(int(self.reduce_bucket_size * 4.5), + dtype=torch.half, + device=torch.cuda.current_device()) + self.ipg_buffer.append(buf_1) + self.ipg_index = 0 + + self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) + + def check_overflow(self, partition_gradients=True): + self._check_overflow(partition_gradients) + + def _update_scale(self, has_overflow=False): + self.loss_scaler.update_scale(has_overflow) + + # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" + def _get_state(self): + return self.optimizer.state + + def _set_state(self, value): + self.optimizer.state = value + + state = property(_get_state, _set_state) + + # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + return self.optimizer.param_groups + + def _set_param_groups(self, value): + self.optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) + + # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" + def _get_loss_scale(self): + return self.loss_scaler.loss_scale + + def _set_loss_scale(self, value): + self.loss_scaler.cur_scale = value + + loss_scale = property(_get_loss_scale, _set_loss_scale) + cur_scale = property(_get_loss_scale, _set_loss_scale) + + # Return group tensor after removing paddings that are added for alignment to DP world size. + # This method works on the assumption that each group contains a single flattened tensor. + def _get_groups_without_padding(self, groups_with_padding): + groups_without_padding = [] + for i, group in enumerate(groups_with_padding): + lean_length = group.numel() - self.groups_padding[i] + groups_without_padding.append(group[:lean_length]) + + return groups_without_padding + + # Return optimizer state after removing paddings that are added for alignment. + def _get_state_without_padding(self, state_with_padding, padding): + lean_state = {} + for key, value in state_with_padding.items(): + if torch.is_tensor(value): + lean_length = value.numel() - padding + lean_state[key] = value[:lean_length] + else: + lean_state[key] = value + + return lean_state + + # Return base optimizer states. + # This method assumes that each param group contains a single flattened tensor. + def _get_base_optimizer_state(self): + optimizer_groups_state = [] + for i, group in enumerate(self.optimizer.param_groups): + p = group['params'][0] + lean_optimizer_state = self._get_state_without_padding( + self.optimizer.state[p], + self.groups_padding[i]) + optimizer_groups_state.append(lean_optimizer_state) + + return optimizer_groups_state + + def state_dict(self): + """ + Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. + This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict + of the contained Pytorch optimizer. 
+ Example:: + checkpoint = {} + checkpoint['model'] = model.state_dict() + checkpoint['optimizer'] = optimizer.state_dict() + torch.save(checkpoint, "saved.pth") + """ + state_dict = {} + state_dict['loss_scaler'] = self.loss_scaler + state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale + state_dict['overflow'] = self.overflow + state_dict['base_optimizer_state'] = self._get_base_optimizer_state() + + state_dict['zero_stage'] = ZERO_OPTIMIZATION_GRADIENTS + state_dict['partition_count'] = self.partition_count + + # Remove paddings for DP alignment to enable loading for other alignment values + fp32_groups_without_padding = self._get_groups_without_padding( + self.single_partition_of_fp32_groups) + state_dict['single_partition_of_fp32_groups'] = fp32_groups_without_padding + + # if self.cpu_offload: + # state_dict_tmp = async_copy_to(state_dict, + # 'cpu', + # torch.cuda.current_stream()) + # state_dict = state_dict_tmp + + return state_dict + + # Restore base optimizer fp32 weights from checkpoint by: + # 1) Merging fp32 weights from checkpoints of all partitions + # 2) Extracting fp32 weights for current partition from merged weights + # 3) Using extracted weights to update base optimizer weights directly. + def _restore_from_fp32_weights(self, all_state_dict): + partition_id = dist.get_rank(group=self.dp_process_group) + merged_single_partition_of_fp32_groups = [] + for i in range(len(self.single_partition_of_fp32_groups)): + merged_partitions = [ + sd['single_partition_of_fp32_groups'][i] for sd in all_state_dict + ] + flat_merged_partitions = flatten_dense_tensors_aligned( + merged_partitions, + dist.get_world_size(group=self.dp_process_group)) + dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) + merged_single_partition_of_fp32_groups.append(dp_partitions[partition_id]) + + for current, saved in zip(self.single_partition_of_fp32_groups, merged_single_partition_of_fp32_groups): + current.data.copy_(saved.data) + + # Restore base optimizer fp32 weights from ZeRO fp16 weights + def _restore_from_fp16_weights(self): + partition_id = dist.get_rank(group=self.dp_process_group) + for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): + fp32_partition.data.copy_(fp16_partitions[partition_id].data) + + # Refresh the fp32 master params from the fp16 copies. + def refresh_fp32_params(self): + self._restore_from_fp16_weights() + + # Extract optimizer state for current partition from merged states of all partitions + def _partition_base_optimizer_state(self, state_key, all_partition_states): + partition_id = dist.get_rank(group=self.dp_process_group) + alignment = dist.get_world_size(group=self.dp_process_group) + if torch.is_tensor(all_partition_states[0]): + flat_merged_partitions = flatten_dense_tensors_aligned( + all_partition_states, + alignment) + dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) + return dp_partitions[partition_id] + else: + # Assume non-tensor states are not partitioned and equal across ranks, so return first one + return all_partition_states[0] + + # Restore base optimizer state from checkpoint by + # 1) Merging optimizer state from checkpoints of all partitions + # 2) Extracting optimizer state for current partition from the merged state + # 3) Using the extracted value to directly update the base optimizer. 
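The restore path above is what makes these ZeRO checkpoints elastic: the fp32 partitions written by every saved rank are concatenated back into one flat tensor and then re-cut for the current DP world size. A toy demonstration of going from 2 saved partitions to 4 loading ranks (helper name illustrative; alignment padding omitted for brevity):

import torch

def reshard(saved_partitions, new_dp):
    # Mirrors _restore_from_fp32_weights: merge the old partitions, then
    # re-partition the merged flat tensor for the new data-parallel degree.
    merged = torch.cat([p.view(-1) for p in saved_partitions])
    base, rem = merged.numel() // new_dp, merged.numel() % new_dp
    out, start = [], 0
    for rank in range(new_dp):
        size = base + (1 if rank < rem else 0)
        out.append(merged.narrow(0, start, size).clone())
        start += size
    return out

saved = [torch.arange(0., 6.), torch.arange(6., 12.)]   # checkpoint written with DP=2
reloaded = reshard(saved, new_dp=4)                     # resumed with DP=4
assert torch.equal(torch.cat(reloaded), torch.cat(saved))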
+ def _restore_base_optimizer_state(self, all_state_dict): + base_optimizer_group_states = [] + for i in range(len(self.optimizer.param_groups)): + partition_states = {} + all_partition_group_states = [ + sd['base_optimizer_state'][i] for sd in all_state_dict + ] + for key in all_partition_group_states[0].keys(): + all_partition_states = [ + all_states[key] for all_states in all_partition_group_states + ] + partition_states[key] = self._partition_base_optimizer_state( + key, + all_partition_states) + base_optimizer_group_states.append(partition_states) + + for i, group in enumerate(self.optimizer.param_groups): + p = group['params'][0] + for key, saved in base_optimizer_group_states[i].items(): + if torch.is_tensor(self.optimizer.state[p][key]): + self.optimizer.state[p][key].data.copy_(saved.data) + else: + self.optimizer.state[p][key] = saved + + def load_state_dict(self, + state_dict_list, + load_optimizer_states=True, + load_from_fp32_weights=False): + r"""Loading ZeRO checkpoint + + Arguments: + state_dict_list: List of all saved ZeRO checkpoints, one for each saved partition. + Note that the number of saved partitions may differ from number of loading partitions to support + changing GPU count, specifically DP world size, between saving and loading checkpoints. + load_optimizer_states: Boolean indicating whether or not to load base optimizer states + load_from_fp32_weights: Boolean indicating whether to initialize fp32 master weights from fp32 + copies in checkpoints (no precision loss) or from model's fp16 copies (with precision loss). + """ + """ + Loads a state_dict created by an earlier call to state_dict(). + If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, + whose parameters in turn came from ``model``, it is expected that the user + will call ``model.load_state_dict()`` before + ``fp16_optimizer_instance.load_state_dict()`` is called. + Example:: + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + checkpoint = torch.load("saved.pth") + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + """ + # I think it should actually be ok to reload the optimizer before the model. + self.loss_scaler = state_dict_list[0]['loss_scaler'] + self.dynamic_loss_scale = state_dict_list[0]['dynamic_loss_scale'] + self.overflow = state_dict_list[0]['overflow'] + + if load_optimizer_states: + self._restore_base_optimizer_state(state_dict_list) + + # At this point, the optimizer's references to the model's fp32 parameters are up to date. + # The optimizer's hyperparameters and internal buffers are also up to date. + # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still + # out of date. There are two options. + # 1: Refresh the master params from the model's fp16 params. + # This requires less storage but incurs precision loss. + # 2: Save and restore the fp32 master copies separately. + # We choose option 1 if changing DP degree and option 2 otherwise. + # + # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device + # of their associated parameters, because it's possible those buffers might not exist yet in + # the current optimizer instance. 
In our case, as long as the current FP16_Optimizer has been + # constructed in the same way as the one whose state_dict we are loading, the same master params + # are guaranteed to exist, so we can just copy_() from the saved master params. + + if load_from_fp32_weights: + self._restore_from_fp32_weights(state_dict_list) + else: + self._restore_from_fp16_weights() + + +def _handle_overflow(cpu_sum, x, i): + import math + rank = torch.distributed.get_rank() + if rank == 0: + t_i = -1 + for v_i, v in enumerate(x.data.contiguous().view(-1)): + if not math.isfinite(float(v)): + t_i = v_i + break + logger.info( + f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" + ) diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json index 0e2582fa102f..9dd4a0807ba4 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json @@ -8,7 +8,6 @@ "allgather_bucket_size": 7000000, "reduce_scatter": true }, - "zero_allow_untested_optimizer": true, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json index 5c66ed7cc585..48d620c79e33 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json @@ -9,7 +9,6 @@ "reduce_scatter": true, "cpu_offload": true }, - "zero_allow_untested_optimizer": true, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_func_scheduler.json b/tests/model/Megatron_GPT2/ds_config_func_scheduler.json index 2d2ab356e57c..6f8f320b08cb 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_scheduler.json +++ b/tests/model/Megatron_GPT2/ds_config_func_scheduler.json @@ -5,7 +5,6 @@ "zero_optimization": { "stage": 2 }, - "zero_allow_untested_optimizer": true, "gradient_clipping": 1.0, "scheduler": { "type": "WarmupLR", diff --git a/tests/model/Megatron_GPT2/ds_gpt2_test.sh b/tests/model/Megatron_GPT2/ds_gpt2_test.sh index a8af44df9c7e..ac5d7e379023 100755 --- a/tests/model/Megatron_GPT2/ds_gpt2_test.sh +++ b/tests/model/Megatron_GPT2/ds_gpt2_test.sh @@ -3,7 +3,7 @@ helpFunction() { echo "" - echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s stpes -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers [-d]" + echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s stpes -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers -p [-d]" echo -e "\t-m model parallelism" echo -e "\t-g gpus per node" echo -e "\t-n node count" @@ -17,6 +17,7 @@ helpFunction() echo -e "\t-o other args" echo -e "\t-d DeepSpeed config json file" echo -e "\t-z Enable Zero optimization" + echo -e "\t-p DeepSpeed master port" exit 1 } @@ -27,6 +28,7 @@ ckpt_num_layers=1 other_args="" ds_opt="" zero_opt="" +master_port=29600 script_path=$(realpath $0) script_dir=$(dirname $script_path) @@ -44,6 +46,7 @@ do q ) seq_length="$OPTARG" ;; e ) heads="$OPTARG" ;; c ) ckpt_num_layers="$OPTARG" ;; + p ) master_port="$OPTARG" ;; o ) other_args="$OPTARG" ;; d ) ds_opt="--deepspeed --deepspeed_config $script_dir/$OPTARG" ;; z ) zero_opt="--zero_optimization" ;; @@ -91,9 +94,9 @@ gpt_options=" \ ${ds_opt} \ ${zero_opt} \ " -DEEPSPEED_PORT=29600 + work_dir="../../../DeepSpeedExamples/Megatron-LM/" -run_cmd="(cd ${work_dir} && 
deepspeed --master_port ${DEEPSPEED_PORT} --num_nodes $nodes --num_gpus $gpus pretrain_gpt2.py ${gpt_options})" +run_cmd="(cd ${work_dir} && deepspeed --master_port ${master_port} --num_nodes $nodes --num_gpus $gpus pretrain_gpt2.py ${gpt_options})" echo ${run_cmd} eval ${run_cmd} diff --git a/tests/model/Megatron_GPT2/run_func_test.py b/tests/model/Megatron_GPT2/run_func_test.py index 68264ca8bc65..a61b0e9ba85b 100755 --- a/tests/model/Megatron_GPT2/run_func_test.py +++ b/tests/model/Megatron_GPT2/run_func_test.py @@ -14,6 +14,7 @@ HIDDEN_SIZE = 128 ATTN_HEADS = 8 SEQ_LEN = 64 +MASTER_PORT = 29700 def grep_loss_from_file(file_name): @@ -189,10 +190,12 @@ def test_mp2_gpu4_node1_zero2(self): "json": "ds_config_func_bs8_zero2.json", } - succ = self.run_test(test_config, 0.01) + basic_run_config = test_config + succ = self.run_test(basic_run_config, 0.01) self.assertTrue(succ) - succ = self.run_partition_activations_test(test_config, 0.01) + partition_activation_config = test_config + succ = self.run_partition_activations_test(partition_activation_config, 0.01) self.assertTrue(succ) def test_mp4_gpu4_node1_zero2(self): @@ -210,10 +213,47 @@ def test_mp4_gpu4_node1_zero2(self): "json": "ds_config_func_bs8_zero2.json", } + basic_run_config = test_config + succ = self.run_test(basic_run_config, 0.01) + self.assertTrue(succ) + partition_activation_config = test_config + succ = self.run_partition_activations_test(partition_activation_config, 0.01) + self.assertTrue(succ) + + def test_mp1_gpu1_node1_zero2_offload(self): + test_config = { + "mp": 1, + "gpus": 1, + "nodes": 1, + "bs": 4, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs4_zero2_offload.json", + "cpu_optimizer": True, + } succ = self.run_test(test_config, 0.01) self.assertTrue(succ) - succ = self.run_partition_activations_test(test_config, 0.01) + def test_mp1_gpu2_node1_zero2_offload(self): + test_config = { + "mp": 1, + "gpus": 2, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + succ = self.run_test(test_config, 0.01) self.assertTrue(succ) def test_mp2_gpu4_node1_zero2_gas(self): @@ -238,6 +278,139 @@ def test_mp2_gpu4_node1_zero2_gas(self): succ = self.run_partition_activations_test(test_config, 0.01) self.assertTrue(succ) + def test_mp2_gpu4_node1_zero2_offload(self): + test_config = { + "mp": 2, + "gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + + basic_run_config = test_config + succ = self.run_test(basic_run_config, 0.01) + self.assertTrue(succ) + + partition_activation_config = test_config + succ = self.run_partition_activations_test(partition_activation_config, 0.01) + self.assertTrue(succ) + + def test_mp4_gpu4_node1_zero2_offload(self): + test_config = { + "mp": 4, + "gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + + basic_run_config = test_config + succ = self.run_test(basic_run_config, 0.01) + 
self.assertTrue(succ) + + partition_activation_config = test_config + succ = self.run_partition_activations_test(partition_activation_config, 0.01) + self.assertTrue(succ) + + def test_mp1_gpu1_node1_zero2_cpu_optimizer(self): + test_config = { + "mp": 1, + "gpus": 1, + "nodes": 1, + "bs": 4, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs4_zero2.json", + "cpu_optimizer": True, + } + + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + + def test_mp1_gpu2_node1_zero2_cpu_optimizer(self): + test_config = { + "mp": 1, + "gpus": 2, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_zero2.json", + "cpu_optimizer": True, + } + + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + + def test_mp2_gpu4_node1_zero2_cpu_optimizer(self): + test_config = { + "mp": 2, + "gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_zero2.json", + "cpu_optimizer": True, + } + + basic_run_config = test_config + succ = self.run_test(basic_run_config, 0.01) + self.assertTrue(succ) + + partition_activation_config = test_config + succ = self.run_partition_activations_test(partition_activation_config, 0.01) + self.assertTrue(succ) + + def test_mp4_gpu4_node1_zero2_cpu_optimizer(self): + test_config = { + "mp": 4, + "gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_zero2.json", + "cpu_optimizer": True, + } + + basic_run_config = test_config + succ = self.run_test(basic_run_config, 0.01) + self.assertTrue(succ) + + partition_activation_config = test_config + succ = self.run_partition_activations_test(partition_activation_config, 0.01) + def test_optimizer_scheduler(self): test_config = { "mp": 1, @@ -267,6 +440,11 @@ def run_partition_activations_test(self, test_config, r_tol): deepspeed_config = test_config["json"] baseline_deepspeed_config = False + if "cpu_optimizer" in test_config and test_config["cpu_optimizer"]: + cpu_optimizer_flag = "--cpu-optimizer" + else: + cpu_optimizer_flag = "" + # baseline run... # turnoff deepspeed if baseline deepspeed config # is not provided @@ -277,6 +455,7 @@ def run_partition_activations_test(self, test_config, r_tol): baseline_prefix += test_config["json"][0:-5] baseline_deepspeed_config = True + test_config["other_args"] = cpu_optimizer_flag base_file = self.gen_output_name(test_config, baseline_prefix, baseline_config=baseline_deepspeed_config) @@ -290,8 +469,10 @@ def run_partition_activations_test(self, test_config, r_tol): # DeepSpeed run... 
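One note on the test plumbing above: a single "cpu_optimizer" boolean in test_config is turned into the --cpu-optimizer command-line flag and passed to both the baseline and the DeepSpeed run, so the two runs stay comparable. The derivation is equivalent to this standalone form (illustrative only):

def cpu_optimizer_flag(test_config):
    # Same effect as the explicit membership check in run_test() and
    # run_partition_activations_test() above.
    return "--cpu-optimizer" if test_config.get("cpu_optimizer", False) else ""

assert cpu_optimizer_flag({"cpu_optimizer": True}) == "--cpu-optimizer"
assert cpu_optimizer_flag({"json": "ds_config_func_bs8_zero2.json"}) == ""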
test_config["deepspeed"] = True - test_config["other_args"] = "--deepspeed-activation-checkpointing" + test_config[ + "other_args"] = f"\"--deepspeed-activation-checkpointing {cpu_optimizer_flag}\"" test_config["json"] = deepspeed_config + print("{0}: DeepSpeed run.".format(self.id())) test_file = self.gen_output_name(test_config, prefix) self.run_gpt2_test(test_config, test_file) @@ -308,6 +489,11 @@ def run_test(self, test_config, r_tol): deepspeed_config = test_config["json"] baseline_deepspeed_config = False + if "cpu_optimizer" in test_config and test_config["cpu_optimizer"]: + cpu_optimizer_flag = "--cpu-optimizer" + else: + cpu_optimizer_flag = "" + # baseline run... # turn off deepspeed if a baseline deepspeed config # is not provided @@ -318,6 +504,8 @@ def run_test(self, test_config, r_tol): baseline_prefix = prefix + test_config["json"][0:-5] baseline_deepspeed_config = True + test_config["other_args"] = f"\"{cpu_optimizer_flag}\"" + # baseline run... base_file = self.gen_output_name(test_config, baseline_prefix, @@ -332,7 +520,7 @@ def run_test(self, test_config, r_tol): # DeepSpeed run... test_config["deepspeed"] = True - test_config["json"] = deepspeed_config + test_config["other_args"] = f"\"{cpu_optimizer_flag}\"" print("{0}: DeepSpeed run.".format(self.id())) test_file = self.gen_output_name(test_config, prefix) @@ -369,6 +557,20 @@ def suite(): suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_fp16')) + # Baseline = Megatron + Torch.Optim.Adam + # Test = Megatron + Torch.Optim.Adam + ZeRO-Stage-2 + suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2_cpu_optimizer')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2_cpu_optimizer')) + suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_cpu_optimizer')) + suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2_cpu_optimizer')) + + # Baseline = Megatron + Torch.Optim.Adam + # Test = Megatron + Torch.Optim.Adam + ZeRO-Offload + suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2_offload')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2_offload')) + suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_offload')) + suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2_offload')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero1')) suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero1')) suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero1')) diff --git a/tests/unit/adam_test.py b/tests/perf/adam_test.py similarity index 100% rename from tests/unit/adam_test.py rename to tests/perf/adam_test.py From 37887241c27ce1ae52c428592105fb123a08b418 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 5 Sep 2020 05:17:26 +0000 Subject: [PATCH 23/44] move adam_test --- tests/{unit => perf}/adam_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename tests/{unit => perf}/adam_test.py (93%) diff --git a/tests/unit/adam_test.py b/tests/perf/adam_test.py similarity index 93% rename from tests/unit/adam_test.py rename to tests/perf/adam_test.py index be5caa5cbecc..800cb4f42eaa 100755 --- a/tests/unit/adam_test.py +++ b/tests/perf/adam_test.py @@ -3,7 +3,7 @@ import time device = 'cpu' -model_size = 10 * 1024**3 +model_size = 1 * 1024**3 param = torch.nn.Parameter(torch.ones(model_size, device=device)) param_fp16 = torch.nn.Parameter(torch.ones(model_size, dtype=torch.half, From 36d5fde79bf1002fa3c0031377e6b391faa850f3 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 5 Sep 2020 05:20:10 +0000 Subject: [PATCH 24/44] rename perf test --- 
tests/perf/{adam_test.py => adam_test1.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/perf/{adam_test.py => adam_test1.py} (100%) diff --git a/tests/perf/adam_test.py b/tests/perf/adam_test1.py similarity index 100% rename from tests/perf/adam_test.py rename to tests/perf/adam_test1.py From d8ff56c34f9313a48855e88df8857223dd2e22ea Mon Sep 17 00:00:00 2001 From: RezaYazdaniAminabadi <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Fri, 4 Sep 2020 22:20:45 -0700 Subject: [PATCH 25/44] fixing adam copy fp16-param and add more compile flags for cpu_adam (#367) * fixing adam copy fp16-param-add more compile flags for cpu_adam * run precommit * fix variance indexes * fix array-sizes * move adam_test * rename perf test --- csrc/adam/cpu_adam.cpp | 496 ++++++++++++++++++++++++++++----------- csrc/includes/cpu_adam.h | 9 +- setup.py | 34 +-- tests/perf/adam_test1.py | 22 ++ 4 files changed, 406 insertions(+), 155 deletions(-) mode change 100644 => 100755 csrc/adam/cpu_adam.cpp create mode 100755 tests/perf/adam_test1.py diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp old mode 100644 new mode 100755 index 2d3b521bea1c..9399438b57f6 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -49,54 +49,56 @@ void Adam_Optimizer::Step(float* _params, size_t tile = 0; + for (size_t t = 0; t < _param_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > _param_size) copy_size = _param_size - t; + size_t offset = copy_size + t; #pragma omp parallel for - for (size_t i = 0; i < _param_size; i += SIMD_WIDTH) { - __m512 grad_4 = _mm512_loadu_ps(grads + i); + for (size_t i = t; i < offset; i += SIMD_WIDTH) { + __m512 grad_4 = _mm512_loadu_ps(grads + i); - __m512 momntum_4 = _mm512_loadu_ps(_exp_avg + i); - __m512 varianc_4 = _mm512_loadu_ps(_exp_avg_sq + i); + __m512 momntum_4 = _mm512_loadu_ps(_exp_avg + i); + __m512 varianc_4 = _mm512_loadu_ps(_exp_avg_sq + i); - __m512 param_4 = _mm512_loadu_ps(_params + i); + __m512 param_4 = _mm512_loadu_ps(_params + i); - if (_weight_decay > 0) { - __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); - grad_4 = _mm512_fmadd_ps(param_4, weight_decay4, grad_4); - } + if (_weight_decay > 0) { + __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); + grad_4 = _mm512_fmadd_ps(param_4, weight_decay4, grad_4); + } - momntum_4 = _mm512_mul_ps(momntum_4, betta1_4); - momntum_4 = _mm512_fmadd_ps(grad_4, betta1_minus1_4, momntum_4); + momntum_4 = _mm512_mul_ps(momntum_4, betta1_4); + momntum_4 = _mm512_fmadd_ps(grad_4, betta1_minus1_4, momntum_4); - varianc_4 = _mm512_mul_ps(varianc_4, betta2_4); - grad_4 = _mm512_mul_ps(grad_4, grad_4); - varianc_4 = _mm512_fmadd_ps(grad_4, betta2_minus1_4, varianc_4); + varianc_4 = _mm512_mul_ps(varianc_4, betta2_4); + grad_4 = _mm512_mul_ps(grad_4, grad_4); + varianc_4 = _mm512_fmadd_ps(grad_4, betta2_minus1_4, varianc_4); - grad_4 = _mm512_sqrt_ps(varianc_4) / bias2_sqrt; - grad_4 = _mm512_add_ps(grad_4, eps_4); - grad_4 = _mm512_div_ps(momntum_4, grad_4); + grad_4 = _mm512_sqrt_ps(varianc_4) / bias2_sqrt; + grad_4 = _mm512_add_ps(grad_4, eps_4); + grad_4 = _mm512_div_ps(momntum_4, grad_4); - param_4 = _mm512_fmadd_ps(grad_4, step_size_4, param_4); + param_4 = _mm512_fmadd_ps(grad_4, step_size_4, param_4); - _mm512_storeu_ps(_params + i, param_4); - _mm512_storeu_ps(_exp_avg + i, momntum_4); - _mm512_storeu_ps(_exp_avg_sq + i, varianc_4); + _mm512_storeu_ps(_params + i, param_4); + _mm512_storeu_ps(_exp_avg + i, momntum_4); + _mm512_storeu_ps(_exp_avg_sq + i, varianc_4); + } if 
(dev_params) { - if ((i + SIMD_WIDTH) % TILE == 0) { - size_t offset = tile * TILE; #pragma omp parallel for - for (size_t j = 0; j < TILE; j += 4) { - _doubled_buffer[buf_index][j] = (__half)_params[offset + j]; - _doubled_buffer[buf_index][j + 1] = (__half)_params[offset + j + 1]; - _doubled_buffer[buf_index][j + 2] = (__half)_params[offset + j + 2]; - _doubled_buffer[buf_index][j + 3] = (__half)_params[offset + j + 3]; - } - CUDA_CHECK(cudaMemcpyAsync(dev_params + tile * TILE, - _doubled_buffer[buf_index], - TILE * sizeof(__half), - cudaMemcpyHostToDevice, - Context::Instance().GetCurrentStream())); - buf_index = !buf_index; - tile++; + for (size_t j = 0; j < copy_size; j += 4) { + _doubled_buffer[buf_index][j] = (__half)_params[t + j]; + _doubled_buffer[buf_index][j + 1] = (__half)_params[t + j + 1]; + _doubled_buffer[buf_index][j + 2] = (__half)_params[t + j + 2]; + _doubled_buffer[buf_index][j + 3] = (__half)_params[t + j + 3]; } + + CUDA_CHECK(cudaMemcpyAsync(dev_params + t, + _doubled_buffer[buf_index], + copy_size * sizeof(__half), + cudaMemcpyHostToDevice, + Context::Instance().GetCurrentStream())); + buf_index = !buf_index; } } } @@ -134,113 +136,116 @@ void Adam_Optimizer::Step_4(float* _params, __m512 bias2_sqrt = _mm512_sqrt_ps(bias_correction2_4); + for (size_t t = 0; t < _param_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > _param_size) copy_size = _param_size - t; + size_t offset = copy_size + t; #pragma omp parallel for - for (size_t i = 0; i < _param_size; i += (SIMD_WIDTH << 2)) { - __m512 grad_4[4]; - grad_4[0] = _mm512_loadu_ps(grads + i); - grad_4[1] = _mm512_loadu_ps(grads + i + SIMD_WIDTH); - grad_4[2] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); - grad_4[3] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); - - __m512 momntum_4[2]; - momntum_4[0] = _mm512_loadu_ps(_exp_avg + i); - momntum_4[1] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); - momntum_4[2] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); - momntum_4[3] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); - - __m512 varianc_4[2]; - varianc_4[0] = _mm512_loadu_ps(_exp_avg_sq + i); - varianc_4[1] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); - varianc_4[2] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); - varianc_4[3] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); - - __m512 param_4[2]; - param_4[0] = _mm512_loadu_ps(_params + i); - param_4[1] = _mm512_loadu_ps(_params + i + SIMD_WIDTH); - param_4[2] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); - param_4[3] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); - - if (_weight_decay > 0) { - __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); - grad_4[0] = _mm512_fmadd_ps(param_4[0], weight_decay4, grad_4[0]); - grad_4[1] = _mm512_fmadd_ps(param_4[1], weight_decay4, grad_4[1]); - grad_4[2] = _mm512_fmadd_ps(param_4[2], weight_decay4, grad_4[2]); - grad_4[3] = _mm512_fmadd_ps(param_4[3], weight_decay4, grad_4[3]); + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) { + __m512 grad_4[4]; + grad_4[0] = _mm512_loadu_ps(grads + i); + grad_4[1] = _mm512_loadu_ps(grads + i + SIMD_WIDTH); + grad_4[2] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); + grad_4[3] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); + + __m512 momntum_4[4]; + momntum_4[0] = _mm512_loadu_ps(_exp_avg + i); + momntum_4[1] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); + momntum_4[2] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); + momntum_4[3] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); + + __m512 varianc_4[4]; + varianc_4[0] = 
_mm512_loadu_ps(_exp_avg_sq + i); + varianc_4[1] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); + varianc_4[2] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); + varianc_4[3] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); + + __m512 param_4[4]; + param_4[0] = _mm512_loadu_ps(_params + i); + param_4[1] = _mm512_loadu_ps(_params + i + SIMD_WIDTH); + param_4[2] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); + param_4[3] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); + + if (_weight_decay > 0) { + __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); + grad_4[0] = _mm512_fmadd_ps(param_4[0], weight_decay4, grad_4[0]); + grad_4[1] = _mm512_fmadd_ps(param_4[1], weight_decay4, grad_4[1]); + grad_4[2] = _mm512_fmadd_ps(param_4[2], weight_decay4, grad_4[2]); + grad_4[3] = _mm512_fmadd_ps(param_4[3], weight_decay4, grad_4[3]); + } + + momntum_4[0] = _mm512_mul_ps(momntum_4[0], betta1_4); + momntum_4[0] = _mm512_fmadd_ps(grad_4[0], betta1_minus1_4, momntum_4[0]); + momntum_4[1] = _mm512_mul_ps(momntum_4[1], betta1_4); + momntum_4[1] = _mm512_fmadd_ps(grad_4[1], betta1_minus1_4, momntum_4[1]); + momntum_4[2] = _mm512_mul_ps(momntum_4[2], betta1_4); + momntum_4[2] = _mm512_fmadd_ps(grad_4[2], betta1_minus1_4, momntum_4[2]); + momntum_4[3] = _mm512_mul_ps(momntum_4[3], betta1_4); + momntum_4[3] = _mm512_fmadd_ps(grad_4[3], betta1_minus1_4, momntum_4[3]); + + varianc_4[0] = _mm512_mul_ps(varianc_4[0], betta2_4); + varianc_4[1] = _mm512_mul_ps(varianc_4[1], betta2_4); + varianc_4[2] = _mm512_mul_ps(varianc_4[2], betta2_4); + varianc_4[3] = _mm512_mul_ps(varianc_4[3], betta2_4); + grad_4[0] = _mm512_mul_ps(grad_4[0], grad_4[0]); + grad_4[1] = _mm512_mul_ps(grad_4[1], grad_4[1]); + grad_4[2] = _mm512_mul_ps(grad_4[2], grad_4[2]); + grad_4[3] = _mm512_mul_ps(grad_4[3], grad_4[3]); + varianc_4[0] = _mm512_fmadd_ps(grad_4[0], betta2_minus1_4, varianc_4[0]); + varianc_4[1] = _mm512_fmadd_ps(grad_4[1], betta2_minus1_4, varianc_4[1]); + varianc_4[2] = _mm512_fmadd_ps(grad_4[2], betta2_minus1_4, varianc_4[2]); + varianc_4[3] = _mm512_fmadd_ps(grad_4[3], betta2_minus1_4, varianc_4[3]); + + grad_4[0] = _mm512_sqrt_ps(varianc_4[0]) / bias2_sqrt; + grad_4[1] = _mm512_sqrt_ps(varianc_4[1]) / bias2_sqrt; + grad_4[2] = _mm512_sqrt_ps(varianc_4[2]) / bias2_sqrt; + grad_4[3] = _mm512_sqrt_ps(varianc_4[3]) / bias2_sqrt; + + grad_4[0] = _mm512_add_ps(grad_4[0], eps_4); + grad_4[1] = _mm512_add_ps(grad_4[1], eps_4); + grad_4[2] = _mm512_add_ps(grad_4[2], eps_4); + grad_4[3] = _mm512_add_ps(grad_4[3], eps_4); + grad_4[0] = _mm512_div_ps(momntum_4[0], grad_4[0]); + grad_4[1] = _mm512_div_ps(momntum_4[1], grad_4[1]); + grad_4[2] = _mm512_div_ps(momntum_4[2], grad_4[2]); + grad_4[3] = _mm512_div_ps(momntum_4[3], grad_4[3]); + + param_4[0] = _mm512_fmadd_ps(grad_4[0], step_size_4, param_4[0]); + param_4[1] = _mm512_fmadd_ps(grad_4[1], step_size_4, param_4[1]); + param_4[2] = _mm512_fmadd_ps(grad_4[2], step_size_4, param_4[2]); + param_4[3] = _mm512_fmadd_ps(grad_4[3], step_size_4, param_4[3]); + + _mm512_storeu_ps(_params + i, param_4[0]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1]); + _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), param_4[2]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3]); + + _mm512_storeu_ps(_exp_avg + i, momntum_4[0]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1]); + _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3]); + + _mm512_storeu_ps(_exp_avg_sq + i, 
varianc_4[0]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH, varianc_4[1]); + _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1), varianc_4[2]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3]); } - momntum_4[0] = _mm512_mul_ps(momntum_4[0], betta1_4); - momntum_4[0] = _mm512_fmadd_ps(grad_4[0], betta1_minus1_4, momntum_4[0]); - momntum_4[1] = _mm512_mul_ps(momntum_4[1], betta1_4); - momntum_4[1] = _mm512_fmadd_ps(grad_4[1], betta1_minus1_4, momntum_4[1]); - momntum_4[2] = _mm512_mul_ps(momntum_4[2], betta1_4); - momntum_4[2] = _mm512_fmadd_ps(grad_4[2], betta1_minus1_4, momntum_4[2]); - momntum_4[3] = _mm512_mul_ps(momntum_4[3], betta1_4); - momntum_4[3] = _mm512_fmadd_ps(grad_4[3], betta1_minus1_4, momntum_4[3]); - - varianc_4[0] = _mm512_mul_ps(varianc_4[0], betta2_4); - varianc_4[1] = _mm512_mul_ps(varianc_4[1], betta2_4); - varianc_4[2] = _mm512_mul_ps(varianc_4[2], betta2_4); - varianc_4[3] = _mm512_mul_ps(varianc_4[3], betta2_4); - grad_4[0] = _mm512_mul_ps(grad_4[0], grad_4[0]); - grad_4[1] = _mm512_mul_ps(grad_4[1], grad_4[1]); - grad_4[2] = _mm512_mul_ps(grad_4[2], grad_4[2]); - grad_4[3] = _mm512_mul_ps(grad_4[3], grad_4[3]); - varianc_4[0] = _mm512_fmadd_ps(grad_4[0], betta2_minus1_4, varianc_4[0]); - varianc_4[1] = _mm512_fmadd_ps(grad_4[1], betta2_minus1_4, varianc_4[1]); - varianc_4[2] = _mm512_fmadd_ps(grad_4[2], betta2_minus1_4, varianc_4[2]); - varianc_4[3] = _mm512_fmadd_ps(grad_4[3], betta2_minus1_4, varianc_4[3]); - - grad_4[0] = _mm512_sqrt_ps(varianc_4[0]) / bias2_sqrt; - grad_4[1] = _mm512_sqrt_ps(varianc_4[1]) / bias2_sqrt; - grad_4[2] = _mm512_sqrt_ps(varianc_4[2]) / bias2_sqrt; - grad_4[3] = _mm512_sqrt_ps(varianc_4[3]) / bias2_sqrt; - - grad_4[0] = _mm512_add_ps(grad_4[0], eps_4); - grad_4[1] = _mm512_add_ps(grad_4[1], eps_4); - grad_4[2] = _mm512_add_ps(grad_4[2], eps_4); - grad_4[3] = _mm512_add_ps(grad_4[3], eps_4); - grad_4[0] = _mm512_div_ps(momntum_4[0], grad_4[0]); - grad_4[1] = _mm512_div_ps(momntum_4[1], grad_4[1]); - grad_4[2] = _mm512_div_ps(momntum_4[2], grad_4[2]); - grad_4[3] = _mm512_div_ps(momntum_4[3], grad_4[3]); - - param_4[0] = _mm512_fmadd_ps(grad_4[0], step_size_4, param_4[0]); - param_4[1] = _mm512_fmadd_ps(grad_4[1], step_size_4, param_4[1]); - param_4[2] = _mm512_fmadd_ps(grad_4[2], step_size_4, param_4[2]); - param_4[3] = _mm512_fmadd_ps(grad_4[3], step_size_4, param_4[3]); - - _mm512_storeu_ps(_params + i, param_4[0]); - _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1]); - _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), param_4[2]); - _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3]); - - _mm512_storeu_ps(_exp_avg + i, momntum_4[0]); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1]); - _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2]); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3]); - - _mm512_storeu_ps(_exp_avg_sq + i, varianc_4[0]); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH, varianc_4[1]); - _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1), varianc_4[2]); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3]); if (dev_params) { - if ((i + (SIMD_WIDTH << 2)) % TILE == 0) { - size_t offset = tile * TILE; #pragma omp parallel for - for (size_t j = 0; j < TILE; j += 4) { - _doubled_buffer[buf_index][j] = (__half)_params[offset + j]; - _doubled_buffer[buf_index][j + 1] = (__half)_params[offset + j + 1]; - _doubled_buffer[buf_index][j + 2] = (__half)_params[offset + j + 2]; - _doubled_buffer[buf_index][j + 3] = (__half)_params[offset + j 
+ 3]; - } - CUDA_CHECK(cudaMemcpyAsync(dev_params + tile * TILE, - _doubled_buffer[buf_index], - TILE * sizeof(__half), - cudaMemcpyHostToDevice, - Context::Instance().GetCurrentStream())); - buf_index = !buf_index; - tile++; + for (size_t j = 0; j < copy_size; j += 4) { + _doubled_buffer[buf_index][j] = (__half)_params[t + j]; + _doubled_buffer[buf_index][j + 1] = (__half)_params[t + j + 1]; + _doubled_buffer[buf_index][j + 2] = (__half)_params[t + j + 2]; + _doubled_buffer[buf_index][j + 3] = (__half)_params[t + j + 3]; } + + CUDA_CHECK(cudaMemcpyAsync(dev_params + t, + _doubled_buffer[buf_index], + copy_size * sizeof(__half), + cudaMemcpyHostToDevice, + Context::Instance().GetCurrentStream())); + buf_index = !buf_index; } } } @@ -262,6 +267,219 @@ int create_adam_optimizer(int optimizer_id, return 0; } +void Adam_Optimizer::Step_8(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t _param_size, + __half* dev_params) +{ + _betta1_t *= _betta1; + _betta2_t *= _betta2; + + __m512 betta1_4 = _mm512_set1_ps(_betta1); + __m512 betta2_4 = _mm512_set1_ps(_betta2); + + bool buf_index = 0; + + float betta1_minus1 = 1 - _betta1; + float betta2_minus1 = 1 - _betta2; + __m512 betta1_minus1_4 = _mm512_set1_ps(betta1_minus1); + __m512 betta2_minus1_4 = _mm512_set1_ps(betta2_minus1); + + float bias_correction1 = 1 - _betta1_t; + float bias_correction2 = 1 - _betta2_t; + //__m512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); + __m512 bias_correction2_4 = _mm512_set1_ps(bias_correction2); + + __m512 eps_4 = _mm512_set1_ps(_eps); + + float step_size = -1 * _alpha / bias_correction1; + __m512 step_size_4 = _mm512_set1_ps(step_size); + + __m512 bias2_sqrt = _mm512_sqrt_ps(bias_correction2_4); + + for (size_t t = 0; t < _param_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > _param_size) copy_size = _param_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) { + __m512 grad_4[8]; + grad_4[0] = _mm512_loadu_ps(grads + i); + grad_4[1] = _mm512_loadu_ps(grads + i + SIMD_WIDTH); + grad_4[2] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); + grad_4[3] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); + grad_4[4] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 2)); + grad_4[5] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 5); + grad_4[6] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 6); + grad_4[7] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 7); + + __m512 momntum_4[8]; + momntum_4[0] = _mm512_loadu_ps(_exp_avg + i); + momntum_4[1] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); + momntum_4[2] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); + momntum_4[3] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); + momntum_4[4] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 2)); + momntum_4[5] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 5); + momntum_4[6] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 6); + momntum_4[7] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 7); + + __m512 varianc_4[8]; + varianc_4[0] = _mm512_loadu_ps(_exp_avg_sq + i); + varianc_4[1] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); + varianc_4[2] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); + varianc_4[3] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); + varianc_4[4] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 2)); + varianc_4[5] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5); + varianc_4[6] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6); + varianc_4[7] = _mm512_loadu_ps(_exp_avg_sq + i + 
SIMD_WIDTH * 7); + + __m512 param_4[8]; + param_4[0] = _mm512_loadu_ps(_params + i); + param_4[1] = _mm512_loadu_ps(_params + i + SIMD_WIDTH); + param_4[2] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); + param_4[3] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); + param_4[4] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 2)); + param_4[5] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 5); + param_4[6] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 6); + param_4[7] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 7); + + if (_weight_decay > 0) { + __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); + grad_4[0] = _mm512_fmadd_ps(param_4[0], weight_decay4, grad_4[0]); + grad_4[1] = _mm512_fmadd_ps(param_4[1], weight_decay4, grad_4[1]); + grad_4[2] = _mm512_fmadd_ps(param_4[2], weight_decay4, grad_4[2]); + grad_4[3] = _mm512_fmadd_ps(param_4[3], weight_decay4, grad_4[3]); + grad_4[4] = _mm512_fmadd_ps(param_4[4], weight_decay4, grad_4[4]); + grad_4[5] = _mm512_fmadd_ps(param_4[5], weight_decay4, grad_4[5]); + grad_4[6] = _mm512_fmadd_ps(param_4[6], weight_decay4, grad_4[6]); + grad_4[7] = _mm512_fmadd_ps(param_4[7], weight_decay4, grad_4[7]); + } + + momntum_4[0] = _mm512_mul_ps(momntum_4[0], betta1_4); + momntum_4[0] = _mm512_fmadd_ps(grad_4[0], betta1_minus1_4, momntum_4[0]); + momntum_4[1] = _mm512_mul_ps(momntum_4[1], betta1_4); + momntum_4[1] = _mm512_fmadd_ps(grad_4[1], betta1_minus1_4, momntum_4[1]); + momntum_4[2] = _mm512_mul_ps(momntum_4[2], betta1_4); + momntum_4[2] = _mm512_fmadd_ps(grad_4[2], betta1_minus1_4, momntum_4[2]); + momntum_4[3] = _mm512_mul_ps(momntum_4[3], betta1_4); + momntum_4[3] = _mm512_fmadd_ps(grad_4[3], betta1_minus1_4, momntum_4[3]); + momntum_4[4] = _mm512_mul_ps(momntum_4[4], betta1_4); + momntum_4[4] = _mm512_fmadd_ps(grad_4[4], betta1_minus1_4, momntum_4[4]); + momntum_4[5] = _mm512_mul_ps(momntum_4[5], betta1_4); + momntum_4[5] = _mm512_fmadd_ps(grad_4[5], betta1_minus1_4, momntum_4[5]); + momntum_4[6] = _mm512_mul_ps(momntum_4[6], betta1_4); + momntum_4[6] = _mm512_fmadd_ps(grad_4[6], betta1_minus1_4, momntum_4[6]); + momntum_4[7] = _mm512_mul_ps(momntum_4[7], betta1_4); + momntum_4[7] = _mm512_fmadd_ps(grad_4[7], betta1_minus1_4, momntum_4[7]); + + varianc_4[0] = _mm512_mul_ps(varianc_4[0], betta2_4); + varianc_4[1] = _mm512_mul_ps(varianc_4[1], betta2_4); + varianc_4[2] = _mm512_mul_ps(varianc_4[2], betta2_4); + varianc_4[3] = _mm512_mul_ps(varianc_4[3], betta2_4); + varianc_4[4] = _mm512_mul_ps(varianc_4[4], betta2_4); + varianc_4[5] = _mm512_mul_ps(varianc_4[5], betta2_4); + varianc_4[6] = _mm512_mul_ps(varianc_4[6], betta2_4); + varianc_4[7] = _mm512_mul_ps(varianc_4[7], betta2_4); + grad_4[0] = _mm512_mul_ps(grad_4[0], grad_4[0]); + grad_4[1] = _mm512_mul_ps(grad_4[1], grad_4[1]); + grad_4[2] = _mm512_mul_ps(grad_4[2], grad_4[2]); + grad_4[3] = _mm512_mul_ps(grad_4[3], grad_4[3]); + grad_4[4] = _mm512_mul_ps(grad_4[4], grad_4[4]); + grad_4[5] = _mm512_mul_ps(grad_4[5], grad_4[5]); + grad_4[6] = _mm512_mul_ps(grad_4[6], grad_4[6]); + grad_4[7] = _mm512_mul_ps(grad_4[7], grad_4[7]); + varianc_4[0] = _mm512_fmadd_ps(grad_4[0], betta2_minus1_4, varianc_4[0]); + varianc_4[1] = _mm512_fmadd_ps(grad_4[1], betta2_minus1_4, varianc_4[1]); + varianc_4[2] = _mm512_fmadd_ps(grad_4[2], betta2_minus1_4, varianc_4[2]); + varianc_4[3] = _mm512_fmadd_ps(grad_4[3], betta2_minus1_4, varianc_4[3]); + varianc_4[4] = _mm512_fmadd_ps(grad_4[4], betta2_minus1_4, varianc_4[4]); + varianc_4[5] = _mm512_fmadd_ps(grad_4[5], betta2_minus1_4, varianc_4[5]); + varianc_4[6] = 
_mm512_fmadd_ps(grad_4[6], betta2_minus1_4, varianc_4[6]); + varianc_4[7] = _mm512_fmadd_ps(grad_4[7], betta2_minus1_4, varianc_4[7]); + + grad_4[0] = _mm512_sqrt_ps(varianc_4[0]) / bias2_sqrt; + grad_4[1] = _mm512_sqrt_ps(varianc_4[1]) / bias2_sqrt; + grad_4[2] = _mm512_sqrt_ps(varianc_4[2]) / bias2_sqrt; + grad_4[3] = _mm512_sqrt_ps(varianc_4[3]) / bias2_sqrt; + grad_4[4] = _mm512_sqrt_ps(varianc_4[4]) / bias2_sqrt; + grad_4[5] = _mm512_sqrt_ps(varianc_4[5]) / bias2_sqrt; + grad_4[6] = _mm512_sqrt_ps(varianc_4[6]) / bias2_sqrt; + grad_4[7] = _mm512_sqrt_ps(varianc_4[7]) / bias2_sqrt; + + grad_4[0] = _mm512_add_ps(grad_4[0], eps_4); + grad_4[1] = _mm512_add_ps(grad_4[1], eps_4); + grad_4[2] = _mm512_add_ps(grad_4[2], eps_4); + grad_4[3] = _mm512_add_ps(grad_4[3], eps_4); + grad_4[4] = _mm512_add_ps(grad_4[4], eps_4); + grad_4[5] = _mm512_add_ps(grad_4[5], eps_4); + grad_4[6] = _mm512_add_ps(grad_4[6], eps_4); + grad_4[7] = _mm512_add_ps(grad_4[7], eps_4); + grad_4[0] = _mm512_div_ps(momntum_4[0], grad_4[0]); + grad_4[1] = _mm512_div_ps(momntum_4[1], grad_4[1]); + grad_4[2] = _mm512_div_ps(momntum_4[2], grad_4[2]); + grad_4[3] = _mm512_div_ps(momntum_4[3], grad_4[3]); + grad_4[4] = _mm512_div_ps(momntum_4[4], grad_4[4]); + grad_4[5] = _mm512_div_ps(momntum_4[5], grad_4[5]); + grad_4[6] = _mm512_div_ps(momntum_4[6], grad_4[6]); + grad_4[7] = _mm512_div_ps(momntum_4[7], grad_4[7]); + + param_4[0] = _mm512_fmadd_ps(grad_4[0], step_size_4, param_4[0]); + param_4[1] = _mm512_fmadd_ps(grad_4[1], step_size_4, param_4[1]); + param_4[2] = _mm512_fmadd_ps(grad_4[2], step_size_4, param_4[2]); + param_4[3] = _mm512_fmadd_ps(grad_4[3], step_size_4, param_4[3]); + param_4[4] = _mm512_fmadd_ps(grad_4[4], step_size_4, param_4[4]); + param_4[5] = _mm512_fmadd_ps(grad_4[5], step_size_4, param_4[5]); + param_4[6] = _mm512_fmadd_ps(grad_4[6], step_size_4, param_4[6]); + param_4[7] = _mm512_fmadd_ps(grad_4[7], step_size_4, param_4[7]); + + _mm512_storeu_ps(_params + i, param_4[0]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1]); + _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), param_4[2]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3]); + _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 2), param_4[4]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 5, param_4[5]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 6, param_4[6]); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 7, param_4[7]); + + _mm512_storeu_ps(_exp_avg + i, momntum_4[0]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1]); + _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3]); + _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 2), momntum_4[4]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 5, momntum_4[5]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 6, momntum_4[6]); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 7, momntum_4[7]); + + _mm512_storeu_ps(_exp_avg_sq + i, varianc_4[0]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH, varianc_4[1]); + _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1), varianc_4[2]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3]); + _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 2), varianc_4[4]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5, varianc_4[5]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6, varianc_4[6]); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7, varianc_4[7]); + } + if (dev_params) { +#pragma omp parallel for + for (size_t j = 0; j < 
copy_size; j += 4) { + _doubled_buffer[buf_index][j] = (__half)_params[t + j]; + _doubled_buffer[buf_index][j + 1] = (__half)_params[t + j + 1]; + _doubled_buffer[buf_index][j + 2] = (__half)_params[t + j + 2]; + _doubled_buffer[buf_index][j + 3] = (__half)_params[t + j + 3]; + } + + CUDA_CHECK(cudaMemcpyAsync(dev_params + t, + _doubled_buffer[buf_index], + copy_size * sizeof(__half), + cudaMemcpyHostToDevice, + Context::Instance().GetCurrentStream())); + buf_index = !buf_index; + } + } +} + int ds_adam_step(int optimizer_id, torch::Tensor& params, torch::Tensor& grads, @@ -281,7 +499,7 @@ int ds_adam_step(int optimizer_id, std::shared_ptr opt = std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->Step_4(params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0)); + opt->Step_8(params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0)); return 0; } @@ -308,7 +526,7 @@ int ds_adam_step_plus_copy(int optimizer_id, std::shared_ptr opt = std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->Step_4( + opt->Step_8( params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0), gpu_params_ptr); return 0; diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h index d1a2aa088731..40f4cba692ea 100644 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -18,7 +18,8 @@ } #define SIMD_WIDTH 16 -#define TILE 1048576 + +#define TILE (1024 * 1024 * 1024) class Adam_Optimizer { public: @@ -55,6 +56,12 @@ class Adam_Optimizer { float* _exp_avg_sa, size_t param_size, __half* dev_param = nullptr); + void Step_8(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t _param_size, + __half* dev_params = nullptr); private: float _alpha; diff --git a/setup.py b/setup.py index ae5010a65b11..10564e59a6e8 100755 --- a/setup.py +++ b/setup.py @@ -118,21 +118,25 @@ def fetch_requirements(path): ## Adam ## if BUILD_MASK & DS_BUILD_ADAM: ext_modules.append( - CUDAExtension( - name='deepspeed.ops.adam.cpu_adam_op', - sources=[ - 'csrc/adam/cpu_adam.cpp', - ], - include_dirs=['csrc/includes'], - extra_compile_args={ - 'cxx': - ['-O3', - '-std=c++14', - '-g', - '-Wno-reorder', - '-march=native', - '-fopenmp'] - })) + CUDAExtension(name='deepspeed.ops.adam.cpu_adam_op', + sources=[ + 'csrc/adam/cpu_adam.cpp', + ], + include_dirs=['csrc/includes', + '/usr/local/cuda/include'], + extra_compile_args={ + 'cxx': [ + '-O3', + '-std=c++14', + '-L/usr/local/cuda/lib64', + '-lcudart', + '-lcublas', + '-g', + '-Wno-reorder', + '-march=native', + '-fopenmp' + ] + })) ## Transformer ## if BUILD_MASK & DS_BUILD_TRANSFORMER: diff --git a/tests/perf/adam_test1.py b/tests/perf/adam_test1.py new file mode 100755 index 000000000000..800cb4f42eaa --- /dev/null +++ b/tests/perf/adam_test1.py @@ -0,0 +1,22 @@ +import torch +from deepspeed.ops.adam import DeepSpeedCPUAdam +import time + +device = 'cpu' +model_size = 1 * 1024**3 +param = torch.nn.Parameter(torch.ones(model_size, device=device)) +param_fp16 = torch.nn.Parameter(torch.ones(model_size, + dtype=torch.half, + device='cuda:0')) + +optimizer = DeepSpeedCPUAdam([param]) +#torch.set_num_threads(128) +param.grad = torch.ones(model_size, device=device) +avg = 0 +for i in range(10): + start = time.time() + optimizer.step(fp16_param_groups=[param_fp16]) + stop = time.time() + avg += (stop - start) + param.grad = torch.ones(model_size, device=device) * 2 +print("Elapsed Time is ", avg / 10) From f0c34d0c12fac869cf8cdf405c1f839be9be1f3d Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Sat, 5 Sep 2020 
07:03:37 +0000 Subject: [PATCH 26/44] Perf tests --- tests/perf/adam_test.py | 16 +++++++++++----- tests/perf/adam_test1.py | 4 ++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/perf/adam_test.py b/tests/perf/adam_test.py index 0fedfeb18e42..89b70e96e7ac 100755 --- a/tests/perf/adam_test.py +++ b/tests/perf/adam_test.py @@ -3,16 +3,22 @@ import time device = 'cpu' -model_size = 1 * 1024**3 -param = torch.nn.Parameter(torch.ones(model_size, device=device)) -optimizer = DeepSpeedCPUAdam([param]) +model_size = 1 * 1024 ** 3 +group_size = [model_size, 274432] + +param = [torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size] +optimizer = DeepSpeedCPUAdam(param) #torch.set_num_threads(128) -param.grad = torch.ones(model_size, device=device) +for i, p in enumerate(param): + p.grad = torch.ones(group_size[i], device=device) +#param.grad = torch.ones(model_size, device=device) avg = 0 for i in range(100): start = time.time() optimizer.step() stop = time.time() avg += (stop - start) - param.grad = torch.ones(model_size, device=device) * 2 + for i, p in enumerate(param): + p.grad = torch.ones(group_size[i], device=device) * 2 + #param.grad = torch.ones(model_size, device=device) * 2 print("Elapsed Time is ", avg / 100) diff --git a/tests/perf/adam_test1.py b/tests/perf/adam_test1.py index 800cb4f42eaa..b0aba0fcd6b9 100755 --- a/tests/perf/adam_test1.py +++ b/tests/perf/adam_test1.py @@ -13,10 +13,10 @@ #torch.set_num_threads(128) param.grad = torch.ones(model_size, device=device) avg = 0 -for i in range(10): +for i in range(100): start = time.time() optimizer.step(fp16_param_groups=[param_fp16]) stop = time.time() avg += (stop - start) param.grad = torch.ones(model_size, device=device) * 2 -print("Elapsed Time is ", avg / 10) +print("Elapsed Time is ", avg / 100) From 942ec9055f078095156cecc145d92533e3d0182d Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Sat, 5 Sep 2020 17:16:35 +0000 Subject: [PATCH 27/44] BumpDSE --- DeepSpeedExamples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DeepSpeedExamples b/DeepSpeedExamples index 8774a178169f..6236dc461f3c 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit 8774a178169f31f7139a3b1a1b937c2a91aac53f +Subproject commit 6236dc461f3c5009a4b858535e9e61008f67995a From a64b0abbcc8fb0be18b6572982e95641a6871e01 Mon Sep 17 00:00:00 2001 From: Arash Ashari Date: Sat, 5 Sep 2020 11:48:35 -0700 Subject: [PATCH 28/44] fixed a typo; this was fixed before but seems like it has been lost in the refactor (#364) --- deepspeed/runtime/config.py | 4 ++-- deepspeed/runtime/constants.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 78e9eaaf1b3f..26c10f3b0e35 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -205,7 +205,7 @@ def get_sparse_fixed_config(sparsity): sparsity, SPARSE_HORIZONTAL_GLOBAL_ATTENTION, SPARSE_HORIZONTAL_GLOBAL_ATTENTION_DEFAULT) - num_differnt_global_patterns = get_scalar_param( + num_different_global_patterns = get_scalar_param( sparsity, SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS, SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS_DEFAULT) @@ -218,7 +218,7 @@ def get_sparse_fixed_config(sparsity): SPARSE_NUM_GLOBAL_BLOCKS: num_global_blocks, SPARSE_ATTENTION_TYPE: attention, SPARSE_HORIZONTAL_GLOBAL_ATTENTION: horizontal_global_attention, - SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS: num_differnt_global_patterns + SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS: 
num_different_global_patterns } diff --git a/deepspeed/runtime/constants.py b/deepspeed/runtime/constants.py index 57bd5ecf81fc..ea767b93e68d 100755 --- a/deepspeed/runtime/constants.py +++ b/deepspeed/runtime/constants.py @@ -40,7 +40,7 @@ SPARSE_ATTENTION_TYPE_DEFAULT = "bidirectional" SPARSE_HORIZONTAL_GLOBAL_ATTENTION = "horizontal_global_attention" SPARSE_HORIZONTAL_GLOBAL_ATTENTION_DEFAULT = False -SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS = "num_differnt_global_patterns" +SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS = "num_different_global_patterns" SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS_DEFAULT = 1 SPARSE_NUM_RANDOM_BLOCKS = "num_random_blocks" SPARSE_NUM_RANDOM_BLOCKS_DEFAULT = 0 From 4d4eafbe8d3b6242166aeb7e002795c4d1ac7169 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Sat, 5 Sep 2020 15:06:12 -0700 Subject: [PATCH 29/44] Move code quality tests to Azure-hosted agents. (#368) --- azure-pipelines.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 207b00de543c..f98be617c938 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -91,11 +91,14 @@ jobs: - job: Code_Quality_Checks pool: - name: 'DS_testing' + vmImage: 'ubuntu-latest' variables: conda_env: 'ds_codetest' steps: + - bash: echo "##vso[task.prependpath]$CONDA/bin" + displayName: Add conda to PATH + - script: | conda create --force --yes -n $(conda_env) python=3.7 source activate $(conda_env) From bb75df777f784fd6208a4fc4b0ae2afbcbb6d947 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sun, 6 Sep 2020 05:12:26 +0000 Subject: [PATCH 30/44] add casting kernel --- csrc/adam/cpu_adam.cpp | 33 +++++++++++++----------- csrc/adam/custom_cuda_kernel.cu | 40 ++++++++++++++++++++++++++++++ csrc/includes/custom_cuda_layers.h | 3 +++ setup.py | 13 ++++++++++ 4 files changed, 74 insertions(+), 15 deletions(-) create mode 100755 csrc/adam/custom_cuda_kernel.cu diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp index 9399438b57f6..89c907e5b3d5 100755 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -10,6 +10,7 @@ #include "cublas_v2.h" #include "cuda.h" #include "curand.h" +#include "custom_cuda_layers.h" static std::unordered_map> s_optimizers; @@ -304,7 +305,8 @@ void Adam_Optimizer::Step_8(float* _params, if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; #pragma omp parallel for - for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) { + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) + { __m512 grad_4[8]; grad_4[0] = _mm512_loadu_ps(grads + i); grad_4[1] = _mm512_loadu_ps(grads + i + SIMD_WIDTH); @@ -443,6 +445,15 @@ void Adam_Optimizer::Step_8(float* _params, _mm512_storeu_ps(_params + i + SIMD_WIDTH * 6, param_4[6]); _mm512_storeu_ps(_params + i + SIMD_WIDTH * 7, param_4[7]); + _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t), param_4[0]); + _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + SIMD_WIDTH, param_4[1]); + _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + (SIMD_WIDTH << 1), param_4[2]); + _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + SIMD_WIDTH * 3, param_4[3]); + _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + (SIMD_WIDTH << 2), param_4[4]); + _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + SIMD_WIDTH * 5, param_4[5]); + _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + SIMD_WIDTH * 6, param_4[6]); + _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + SIMD_WIDTH * 7, param_4[7]); + _mm512_storeu_ps(_exp_avg + i, momntum_4[0]); 
_mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1]); _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2]); @@ -461,20 +472,12 @@ void Adam_Optimizer::Step_8(float* _params, _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6, varianc_4[6]); _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7, varianc_4[7]); } - if (dev_params) { -#pragma omp parallel for - for (size_t j = 0; j < copy_size; j += 4) { - _doubled_buffer[buf_index][j] = (__half)_params[t + j]; - _doubled_buffer[buf_index][j + 1] = (__half)_params[t + j + 1]; - _doubled_buffer[buf_index][j + 2] = (__half)_params[t + j + 2]; - _doubled_buffer[buf_index][j + 3] = (__half)_params[t + j + 3]; - } - - CUDA_CHECK(cudaMemcpyAsync(dev_params + t, - _doubled_buffer[buf_index], - copy_size * sizeof(__half), - cudaMemcpyHostToDevice, - Context::Instance().GetCurrentStream())); + if (dev_params) + { + launch_param_update(_doubled_buffer[buf_index], + dev_params + t, + copy_size, + Context::Instance().GetCurrentStream()); buf_index = !buf_index; } } diff --git a/csrc/adam/custom_cuda_kernel.cu b/csrc/adam/custom_cuda_kernel.cu new file mode 100755 index 000000000000..709eea5d9f46 --- /dev/null +++ b/csrc/adam/custom_cuda_kernel.cu @@ -0,0 +1,40 @@ + + +#include "custom_cuda_layers.h" + + +__global__ void param_update_kernel(const float* input, __half* output, int size) +{ + const float4 *input_cast = reinterpret_cast(input); + float2 *output_cast = reinterpret_cast(output); + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + + + if(id < size) + { + float4 data = input_cast[id]; + float2 cast_data; + __half *output_h = reinterpret_cast<__half*>(&cast_data); + + output_h[0] = (__half)data.x; + output_h[1] = (__half)data.y; + output_h[2] = (__half)data.z; + output_h[3] = (__half)data.w; + + output_cast[id] = cast_data; + } + +} + +void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream) +{ + int threads = 512; + + size /= 4; + dim3 grid_dim((size-1) / threads + 1); + dim3 block_dim(threads); + + param_update_kernel<<>>(input, output, size); +} \ No newline at end of file diff --git a/csrc/includes/custom_cuda_layers.h b/csrc/includes/custom_cuda_layers.h index d9d0a8a68b38..d23e7b02faad 100755 --- a/csrc/includes/custom_cuda_layers.h +++ b/csrc/includes/custom_cuda_layers.h @@ -264,3 +264,6 @@ void launch_fuse_transpose_bias_kernel(const T* inp, int rows, int cols, cudaStream_t stream); + + +void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream); diff --git a/setup.py b/setup.py index 10564e59a6e8..eb8d6ed07e51 100755 --- a/setup.py +++ b/setup.py @@ -121,6 +121,7 @@ def fetch_requirements(path): CUDAExtension(name='deepspeed.ops.adam.cpu_adam_op', sources=[ 'csrc/adam/cpu_adam.cpp', + 'csrc/adam/custom_cuda_kernel.cu', ], include_dirs=['csrc/includes', '/usr/local/cuda/include'], @@ -135,6 +136,18 @@ def fetch_requirements(path): '-Wno-reorder', '-march=native', '-fopenmp' + ], + 'nvcc': [ + '-O3', + '--use_fast_math', + '-gencode', + 'arch=compute_61,code=compute_61', + '-gencode', + 'arch=compute_70,code=compute_70', + '-std=c++14', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '-U__CUDA_NO_HALF2_OPERATORS__' ] })) From 7aaccf45f954cf7282a8ba9f58db9b12157c6872 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sun, 6 Sep 2020 05:13:12 +0000 Subject: [PATCH 31/44] run precommit --- csrc/adam/cpu_adam.cpp | 12 +++++------- csrc/adam/custom_cuda_kernel.cu | 21 ++++++++------------- csrc/includes/custom_cuda_layers.h | 1 - 3 files 
changed, 13 insertions(+), 21 deletions(-) mode change 100755 => 100644 csrc/adam/cpu_adam.cpp mode change 100755 => 100644 csrc/adam/custom_cuda_kernel.cu mode change 100755 => 100644 csrc/includes/custom_cuda_layers.h diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp old mode 100755 new mode 100644 index 89c907e5b3d5..d884451700e9 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -305,8 +305,7 @@ void Adam_Optimizer::Step_8(float* _params, if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; #pragma omp parallel for - for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) - { + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) { __m512 grad_4[8]; grad_4[0] = _mm512_loadu_ps(grads + i); grad_4[1] = _mm512_loadu_ps(grads + i + SIMD_WIDTH); @@ -472,11 +471,10 @@ void Adam_Optimizer::Step_8(float* _params, _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6, varianc_4[6]); _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7, varianc_4[7]); } - if (dev_params) - { - launch_param_update(_doubled_buffer[buf_index], - dev_params + t, - copy_size, + if (dev_params) { + launch_param_update(_doubled_buffer[buf_index], + dev_params + t, + copy_size, Context::Instance().GetCurrentStream()); buf_index = !buf_index; } diff --git a/csrc/adam/custom_cuda_kernel.cu b/csrc/adam/custom_cuda_kernel.cu old mode 100755 new mode 100644 index 709eea5d9f46..8f8d2c826771 --- a/csrc/adam/custom_cuda_kernel.cu +++ b/csrc/adam/custom_cuda_kernel.cu @@ -2,39 +2,34 @@ #include "custom_cuda_layers.h" - __global__ void param_update_kernel(const float* input, __half* output, int size) { - const float4 *input_cast = reinterpret_cast(input); - float2 *output_cast = reinterpret_cast(output); + const float4* input_cast = reinterpret_cast(input); + float2* output_cast = reinterpret_cast(output); int id = blockIdx.x * blockDim.x + threadIdx.x; - - - if(id < size) - { + if (id < size) { float4 data = input_cast[id]; float2 cast_data; - __half *output_h = reinterpret_cast<__half*>(&cast_data); + __half* output_h = reinterpret_cast<__half*>(&cast_data); output_h[0] = (__half)data.x; output_h[1] = (__half)data.y; output_h[2] = (__half)data.z; output_h[3] = (__half)data.w; - + output_cast[id] = cast_data; } - } void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream) { int threads = 512; - size /= 4; - dim3 grid_dim((size-1) / threads + 1); + size /= 4; + dim3 grid_dim((size - 1) / threads + 1); dim3 block_dim(threads); param_update_kernel<<>>(input, output, size); -} \ No newline at end of file +} diff --git a/csrc/includes/custom_cuda_layers.h b/csrc/includes/custom_cuda_layers.h old mode 100755 new mode 100644 index d23e7b02faad..2e72a35292c6 --- a/csrc/includes/custom_cuda_layers.h +++ b/csrc/includes/custom_cuda_layers.h @@ -265,5 +265,4 @@ void launch_fuse_transpose_bias_kernel(const T* inp, int cols, cudaStream_t stream); - void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream); From d4c3b0a9123ef02d97f5463371ddc60b4ae5b447 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sun, 6 Sep 2020 05:30:47 +0000 Subject: [PATCH 32/44] revert changes --- tests/perf/adam_test1.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/perf/adam_test1.py b/tests/perf/adam_test1.py index 800cb4f42eaa..0fedfeb18e42 100755 --- a/tests/perf/adam_test1.py +++ b/tests/perf/adam_test1.py @@ -5,18 +5,14 @@ device = 'cpu' model_size = 1 * 1024**3 param = 
torch.nn.Parameter(torch.ones(model_size, device=device)) -param_fp16 = torch.nn.Parameter(torch.ones(model_size, - dtype=torch.half, - device='cuda:0')) - optimizer = DeepSpeedCPUAdam([param]) #torch.set_num_threads(128) param.grad = torch.ones(model_size, device=device) avg = 0 -for i in range(10): +for i in range(100): start = time.time() - optimizer.step(fp16_param_groups=[param_fp16]) + optimizer.step() stop = time.time() avg += (stop - start) param.grad = torch.ones(model_size, device=device) * 2 -print("Elapsed Time is ", avg / 10) +print("Elapsed Time is ", avg / 100) From 4c257dd0d99de4cf92ff5459f63cea44630db8a2 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sun, 6 Sep 2020 05:33:18 +0000 Subject: [PATCH 33/44] revert changes --- tests/{perf/adam_test1.py => unit/adam_test.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{perf/adam_test1.py => unit/adam_test.py} (100%) diff --git a/tests/perf/adam_test1.py b/tests/unit/adam_test.py similarity index 100% rename from tests/perf/adam_test1.py rename to tests/unit/adam_test.py From 19aac8ad194ee3cf603e6ad2e1563d02ae1de427 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sat, 5 Sep 2020 22:46:24 -0700 Subject: [PATCH 34/44] ZeRO-Offload: Integration code fixes (#370) * Various correctness fixes * Format fixes --- deepspeed/ops/adam/cpu_adam.py | 9 +-- deepspeed/runtime/zero/stage2.py | 19 ++++-- deepspeed/runtime/zero/utils.py | 10 +++- tests/model/Megatron_GPT2/run_func_test.py | 6 +- tests/model/run_sanity_check.py | 2 +- tests/perf/adam_test.py | 2 +- tests/unit/test_checkpointing.py | 69 ++++++++++++++-------- 7 files changed, 75 insertions(+), 42 deletions(-) diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py index d51e2126cda2..76d8d323f6f5 100755 --- a/deepspeed/ops/adam/cpu_adam.py +++ b/deepspeed/ops/adam/cpu_adam.py @@ -12,14 +12,14 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer): def __init__(self, model_params, lr=1e-3, - bettas=(0.9, - 0.999), + betas=(0.9, + 0.999), eps=1e-8, weight_decay=0, amsgrad=False): default_args = dict(lr=lr, - betas=bettas, + betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad) @@ -30,7 +30,7 @@ def __init__(self, global ds_opt_adam ds_opt_adam = importlib.import_module('deepspeed.ops.adam.cpu_adam_op') - ds_opt_adam.create_adam(self.opt_id, lr, bettas[0], bettas[1], eps, weight_decay) + ds_opt_adam.create_adam(self.opt_id, lr, betas[0], betas[1], eps, weight_decay) def __setstate__(self, state): super(DeepSpeedCPUAdam, self).__setstate__(state) @@ -54,6 +54,7 @@ def step(self, closure=None, fp16_param_groups=None): state = self.state[p] # State initialization if len(state) == 0: + print(f'group {group_id} param {param_id} = {p.numel()}') state['step'] = 0 # gradient momentums state['exp_avg'] = torch.zeros_like(p.data, device='cpu') diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index 9ed9c07df054..d39c17f37161 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -15,6 +15,7 @@ from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS + from deepspeed.utils import logger #Toggle this to true to enable correctness test #with gradient partitioning and without @@ -155,6 +156,7 @@ def __init__(self, self.overlap_comm = overlap_comm or cpu_offload self.cpu_offload = cpu_offload + self.device = 
torch.cuda.current_device() if not self.cpu_offload else 'cpu' self.dp_process_group = dp_process_group @@ -1405,17 +1407,22 @@ def step(self, closure=None): self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) #torch.set_num_threads(12) timers('optimizer_step').start() - #self.optimizer.step(fp16_param_groups=self.parallel_partitioned_fp16_groups) - self.optimizer.step() - #get rid of the fp32 gradients. Not needed anymore - if not self.cpu_offload: + if self.cpu_offload: + # self.optimizer.step(fp16_param_groups=self.parallel_partitioned_fp16_groups) + self.optimizer.step() + for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): + fp16_partitions[partition_id].data.copy_(fp32_partition.data) + else: + self.optimizer.step() + #get rid of the fp32 gradients. Not needed anymore for group in self.single_partition_of_fp32_groups: group.grad = None - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - fp16_partitions[partition_id].data.copy_(fp32_partition.data) + for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): + fp16_partitions[partition_id].data.copy_(fp32_partition.data) timers('optimizer_step').stop() + timers.log(names=['optimizer_step']) if self.cpu_offload: self.reset_cpu_buffers() diff --git a/deepspeed/runtime/zero/utils.py b/deepspeed/runtime/zero/utils.py index ba7cf8e87ebe..e51ad7c1eca1 100755 --- a/deepspeed/runtime/zero/utils.py +++ b/deepspeed/runtime/zero/utils.py @@ -2,6 +2,7 @@ import torch.distributed as dist import apex from deepspeed.utils import logger +from deepspeed.ops.adam import DeepSpeedCPUAdam def _initialize_parameter_parallel_groups(parameter_parallel_size=None): @@ -22,8 +23,15 @@ def _initialize_parameter_parallel_groups(parameter_parallel_size=None): return my_group -ZERO_SUPPORTED_OPTIMIZERS = [torch.optim.Adam, apex.optimizers.FusedAdam] +ZERO_SUPPORTED_OPTIMIZERS = [ + torch.optim.Adam, + apex.optimizers.FusedAdam, + DeepSpeedCPUAdam +] def is_zero_supported_optimizer(optimizer): + print( + f'Checking ZeRO support for optimizer={optimizer.__class__.__name__} type={type(optimizer)}' + ) return type(optimizer) in ZERO_SUPPORTED_OPTIMIZERS diff --git a/tests/model/Megatron_GPT2/run_func_test.py b/tests/model/Megatron_GPT2/run_func_test.py index a61b0e9ba85b..552314a0d037 100755 --- a/tests/model/Megatron_GPT2/run_func_test.py +++ b/tests/model/Megatron_GPT2/run_func_test.py @@ -19,7 +19,7 @@ def grep_loss_from_file(file_name): loss = 0.0 - + print(f'grepping {file_name}') with open(file_name, 'r') as f: lines = f.readlines() line_filter = "validation loss at the end of training for test data | LM loss:" @@ -455,7 +455,7 @@ def run_partition_activations_test(self, test_config, r_tol): baseline_prefix += test_config["json"][0:-5] baseline_deepspeed_config = True - test_config["other_args"] = cpu_optimizer_flag + test_config["other_args"] = f"\"{cpu_optimizer_flag}\"" base_file = self.gen_output_name(test_config, baseline_prefix, baseline_config=baseline_deepspeed_config) @@ -565,7 +565,7 @@ def suite(): suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2_cpu_optimizer')) # Baseline = Megatron + Torch.Optim.Adam - # Test = Megatron + Torch.Optim.Adam + ZeRO-Offload + # Test = Megatron + DeepSpeedAdam + ZeRO-Offload suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2_offload')) 
suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2_offload')) suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_offload')) diff --git a/tests/model/run_sanity_check.py b/tests/model/run_sanity_check.py index b7d12ba18f07..21c3bc5d9d43 100755 --- a/tests/model/run_sanity_check.py +++ b/tests/model/run_sanity_check.py @@ -30,7 +30,7 @@ def pytest_hack(runner_result): def test_megatron(): - runner = unittest.TextTestRunner(failfast=True) + runner = unittest.TextTestRunner(failfast=False) pytest_hack(runner.run(Megatron_GPT2.suite())) diff --git a/tests/perf/adam_test.py b/tests/perf/adam_test.py index 89b70e96e7ac..0f29cab4662e 100755 --- a/tests/perf/adam_test.py +++ b/tests/perf/adam_test.py @@ -3,7 +3,7 @@ import time device = 'cpu' -model_size = 1 * 1024 ** 3 +model_size = 1 * 1024**3 group_size = [model_size, 274432] param = [torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size] diff --git a/tests/unit/test_checkpointing.py b/tests/unit/test_checkpointing.py index 4ce0d36284e0..07a8f643d134 100755 --- a/tests/unit/test_checkpointing.py +++ b/tests/unit/test_checkpointing.py @@ -235,21 +235,24 @@ def _test_checkpoint_fused_optimizer(args, model, hidden_dim, load_optimizer_sta load_optimizer_states=False) -@pytest.mark.parametrize('zero_stage, use_cpu_offload', +@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', [ (1, - False), + False, + 'Adam'), (2, - False), + False, + 'Adam'), (2, - True), + True, + 'deepspeed_adam'), ]) -def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload): +def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_optimizer): config_dict = { "train_batch_size": 2, "steps_per_print": 1, "optimizer": { - "type": "Adam", + "type": adam_optimizer, "params": { "lr": 0.00015, "betas": [0.8, @@ -285,21 +288,27 @@ def _test_checkpoint_zero_optimizer(args, model, hidden_dim, load_optimizer_stat load_optimizer_states=True) -@pytest.mark.parametrize('zero_stage, use_cpu_offload', +@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', [ (1, - False), + False, + "Adam"), (2, - False), + False, + "Adam"), (2, - True), + True, + 'deepspeed_adam'), ]) -def test_checkpoint_zero_no_optimizer(tmpdir, zero_stage, use_cpu_offload): +def test_checkpoint_zero_no_optimizer(tmpdir, + zero_stage, + use_cpu_offload, + adam_optimizer): config_dict = { "train_batch_size": 2, "steps_per_print": 1, "optimizer": { - "type": "Adam", + "type": adam_optimizer, "params": { "lr": 0.00015, "betas": [0.8, @@ -338,23 +347,27 @@ def _test_checkpoint_zero_no_optimizer(args, load_optimizer_states=False) -@pytest.mark.parametrize('zero_stage, use_cpu_offload', +@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', [ (0, - False), + False, + 'Adam'), (1, - False), + False, + 'Adam'), (2, - False), + False, + 'Adam'), (2, - True), + True, + 'deepspeed_adam'), ]) -def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload): +def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer): config_dict = { "train_batch_size": 2, "steps_per_print": 1, "optimizer": { - "type": "Adam", + "type": adam_optimizer, "params": { "lr": 0.00015, "betas": [0.8, @@ -405,23 +418,27 @@ def _test_checkpoint_lr_scheduler(args, load_lr_scheduler_states=True) -@pytest.mark.parametrize('zero_stage, use_cpu_offload', +@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', [ (0, - False), + False, + 'Adam'), (1, - False), + False, + 'Adam'), (2, - False), + 
False, + 'Adam'), (2, - True), + True, + 'deepspeed_adam'), ]) -def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage, use_cpu_offload): +def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer): config_dict = { "train_batch_size": 2, "steps_per_print": 1, "optimizer": { - "type": "Adam", + "type": adam_optimizer, "params": { "lr": 1e-5 } From 9e83ef21ea842a56ed6ba4c7b6952eaa6efd5cca Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sun, 6 Sep 2020 08:59:48 -0700 Subject: [PATCH 35/44] Update installation instructions (#362) --- docs/README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/README.md b/docs/README.md index be535b3ab8a2..0ac7783f3860 100644 --- a/docs/README.md +++ b/docs/README.md @@ -29,7 +29,15 @@ gem install jekyll bundler ``` ### Start a local webserver -We now need to install the required Ruby packages for the website. **NOTE**: this step frequently hangs when connected to a VPN (including MSVPN). Simply disconnect for the package installation. +We now need to install the required Ruby packages for the website. + +**NOTE**: you should change to this folder (i.e., docs) before running the installation command to avoid this [error](https://stackoverflow.com/questions/10012181/bundle-install-returns-could-not-locate-gemfile/35157872): + +> Could not locate Gemfile + +**NOTE**: this step frequently hangs when connected to a VPN (including MSVPN). Simply disconnect for the package installation. + + ``` bundle install ``` From 9dadf38dd637d283d0869837cd5324f4086cbe1c Mon Sep 17 00:00:00 2001 From: Arash Ashari Date: Sun, 6 Sep 2020 11:20:48 -0700 Subject: [PATCH 36/44] Update Sparse Attention Tutorial (#357) * adding BingSqaud e2e test * updating the draft test; bring final step under try section * finalizinf test for base deepspeed and deepspeed with ZeRO * applying the comment (thanks Jeff); fixed formatting * update Sparse Attention Tutorial * fixed few issues and applied comments for better organization and readability * updated sparse attention tutorial with making how to use section incremental; applying more comments Co-authored-by: arashashari --- docs/_data/navigation.yml | 2 +- docs/_posts/2020-09-09-sparse-attention.md | 23 +-- ...parse_attention.md => sparse-attention.md} | 143 +++++++++++++++--- 3 files changed, 138 insertions(+), 30 deletions(-) rename docs/_tutorials/{sparse_attention.md => sparse-attention.md} (57%) diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index 7a98032b863c..2928a5445c50 100644 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -65,6 +65,6 @@ lnav: - title: "Learning Rate Range Test" url: /tutorials/lrrt/ - title: "DeepSpeed Sparse Attention" - url: /tutorials/sparse_attention/ + url: /tutorials/sparse-attention/ - title: "Contributing" url: /contributing/ diff --git a/docs/_posts/2020-09-09-sparse-attention.md b/docs/_posts/2020-09-09-sparse-attention.md index b6221c85d5c4..deb37ea84485 100644 --- a/docs/_posts/2020-09-09-sparse-attention.md +++ b/docs/_posts/2020-09-09-sparse-attention.md @@ -7,7 +7,7 @@ new_post: true date: 2020-09-09 01:00:00 --- -Attention-based deep learning models such as the transformers are highly effective in capturing relationship between tokens in an input sequence, even across long distances. As a result, they are used with text, image, and sound-based inputs, where the sequence length can be in thousands of tokens. 
However, despite the effectiveness of attention modules to capture long term dependencies, in practice, their application to long sequence input is limited by compute and memory requirements of the attention computation that grow quadratically, `O(n^2)`, with the sequence length n. +Attention-based deep learning models such as the transformers are highly effective in capturing relationship between tokens in an input sequence, even across long distances. As a result, they are used with text, image, and sound-based inputs, where the sequence length can be in thousands of tokens. However, despite the effectiveness of attention modules to capture long term dependencies, in practice, their application to long sequence input is limited by compute and memory requirements of the attention computation that grow quadratically, `O(n^2)`, with the sequence length `n`. To address this limitation, DeepSpeed offers a suite of sparse attention kernels --an instrumental technology that can reduce the compute and memory requirement of attention computation by orders-of-magnitude via block-sparse computation. The suite not only alleviates the memory bottleneck of attention calculation, but also performs sparse computation efficiently. Its APIs allow convenient integration with any transformer-based models. Along with providing a wide spectrum of sparsity structures, it has the flexibility of handling any user-defined block-sparse structures. More specifically, sparse attention (SA) can be designed to compute local attention between nearby tokens, or global attention via summary tokens computed with local attention. Moreover, SA can also allow random attention, or any combination of local, global, and random attention as shown in the following figure with blue, orange, and green blocks, respectively. As a result, SA decreases the memory footprint to `O(wn)`, in which `1 < w < n` is a parameter, whose value depends on the attention structure. @@ -20,7 +20,7 @@ Block-sparse computations handled by DeepSpeed Sparse Attention kernels are illu ![Sparse attention backward pass](/assets/images/sa_backward_pass.png){: .align-center} -To learn more about Sparsity Config, and also how to use this library, please check our [tutorial](https://github.com/microsoft/DeepSpeed-internal/tree/master/docs/_tutorials/sparse_attention.md) that provides detailed information about it. +To learn more about Sparsity Config, and also how to use this library, please check our [tutorial](/tutorials/sparse_attention/) that provides detailed information about it. ## Performance Results @@ -44,17 +44,10 @@ Related works along the line of sparse attention ([Sparse Transformer](https://a ![Accuracy of long document comprehension application](/assets/images/sa_long_document_comprehension_result.png){: .align-center} -* **flexibility to handle any block-sparse structure** -DeepSpeed Sparse Attention suite does not target at any specific sparse structure but enables model scientists to explore any block sparse structure with efficient system support. Currently, we have added popular sparse structure like: - * [Fixed](https://arxiv.org/pdf/1904.10509.pdf) (from OpenAI Sparse Transformer) - * [BigBird](https://arxiv.org/pdf/2007.14062.pdf) (from Google) - * BSLongformer (Block-Sparse implementation of [Longformer](https://arxiv.org/pdf/2004.05150.pdf) from AI2) -We also define a template to have `variable` structure (top figure), which can be used to simply customize any block-sparse random/local/global attention pattern. 
In addition to this list, user can add any other sparsity structure as described in [tutorial](https://github.com/microsoft/DeepSpeed-internal/tree/master/docs/_tutorials/sparse_transformer.md) section. - * **comparison with state of the art, Longformer** We compared SA with Longformer, a state-of-the-art sparse structure and implementation. In our experiment, SA uses `Fixed` sparsity, and two implementations have comparable accuracy. On system performance, SA outperforms Longformer both in training and inference: - * 1.47x faster execution pre-training MLM on Wikitext103 + * **1.47x** faster execution pre-training MLM on Wikitext103 We ran an experiment following the [notebook](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) offered by Longformer. In this experiment, we pre-train an MLM model using RoBERTa-base checkpoint. This is done on 8 V100-SXM2 GPU. Following table shows the details of the result in which using DeepSpeed Sparse Attention shows 1.47x speed up. |Model |Local Window Size |BPC |Train Step |Time Per Iteration |Time Improvement |Accuracy improvement | @@ -70,7 +63,7 @@ We ran an experiment following the [notebook](https://github.com/allenai/longfor |Sparse Attention | |1.8693 | |1.1372 | | | - * 3.13x faster execution inference on BERT-Base + * **3.13x** faster execution inference on BERT-Base Through our Long Document Comprehension application we described above, we also checked the inference time for different window sizes testing BERT model on a `2,048` Sequence Length and batch size `1`. In this experiment, we noticed up to `3.13X` speed up replacing Bert Attention with DeepSpeed Sparse Attention instead of Longformer Attention. Following table shows the complete result. |Local Window Size |Time Improvement| @@ -81,3 +74,11 @@ Through our Long Document Comprehension application we described above, we also |64 |1.5 | |32 |1.24 | |16 |1.23 | + +* **flexibility to handle any block-sparse structure** +DeepSpeed Sparse Attention suite does not target at any specific sparse structure but enables model scientists to explore any block sparse structure with efficient system support. Currently, we have added popular sparse structure like: + * [Fixed](https://arxiv.org/pdf/1904.10509.pdf) (from OpenAI Sparse Transformer) + * [BigBird](https://arxiv.org/pdf/2007.14062.pdf) (from Google) + * BSLongformer (Block-Sparse implementation of [Longformer](https://arxiv.org/pdf/2004.05150.pdf) from AI2) + +We also define a template to have `variable` structure (top figure), which can be used to simply customize any block-sparse random/local/global attention pattern. In addition to this list, user can add any other sparsity structure as described in [tutorial](https://www.deepspeed.ai/tutorials/sparse_attention/) section. diff --git a/docs/_tutorials/sparse_attention.md b/docs/_tutorials/sparse-attention.md similarity index 57% rename from docs/_tutorials/sparse_attention.md rename to docs/_tutorials/sparse-attention.md index 7b8233543378..5e4027150ef0 100644 --- a/docs/_tutorials/sparse_attention.md +++ b/docs/_tutorials/sparse-attention.md @@ -2,24 +2,19 @@ title: "DeepSpeed Sparse Attention" --- -In this tutorial we describe how to use DeepSpeed Sparse Attention and its building-block kernels through DeepSpeed launcher or integrating individual kernels into your code. +In this tutorial we describe how to use DeepSpeed Sparse Attention (SA) and its building-block kernels. The easiest way to use SA is through DeepSpeed launcher. 
We will describe this through an example in [How to use sparse attention with DeepSpeed launcher](/tutorials/sparse-attention/#how-to-use-sparse-attention-with-deepspeed-launcher) section. But before that, we introduce modules provided by DeepSpeed SA in the [next](/tutorials/sparse-attention/#sparse-attention-modules) section. -**Note:** Currently DeepSpeed Sparse Attention can be used only on Nvidia V100 GPU using Cuda 10.1 or 10.2. +**Note:** Currently DeepSpeed Sparse Attention can be used only on Nvidia V100 GPU using Torch >= 1.5 and Cuda 10.1 or 10.2. {: .notice--warning} -## How to use -DeepSpeed Sparse Attention can be used as a feature through DeepSpeed, or simply integrated with any Transformer model as a self-attention module alone. Further, the building block kernels, matrix multiplication and softmax can be used separately. To use sparse attention alone, you can simply install DeepSpeed and import any of the following modules from it; example: -```python -from deepspeed.ops.sparse_attention import SparseSelfAttention -``` -Following we describe Sparse Attention modules: -* **MatMul**: This module handles block-sparse matrix-matrix multiplication. Currently it supports SDD, DSD, and DDS as described in [DeepSpeed Sparse Attention](https://github.com/microsoft/DeepSpeed-internal/tree/master/docs/_posts/2020-09-09-sparse-attention.md) section. +## Sparse attention modules +* **MatMul**: This module handles block-sparse matrix-matrix multiplication. Currently it supports SDD, DSD, and DDS as described in [DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention.html) section. * **Softmax**: This module applies block sparse softmax. It handles both forward and backward pass. * **SparseSelfAttention**: This module uses MatMul and Softmax kernels and generates Context Layer output given Query, Keys and Values. It is a simplified version of common operations in any self-attention layer. It can also apply: * `Relative position embedding` * `Attention mask` * `Key padding mask` -on the intermediate attention scores. For more details about SelfAttantion, please check [MultiHeadAttention](https://pytorch.org/docs/master/generated/torch.nn.MultiheadAttention.html#multiheadattention). +on the intermediate attention scores. For more details about self attention, please check [MultiHeadAttention](https://pytorch.org/docs/master/generated/torch.nn.MultiheadAttention.html#multiheadattention). * **BertSparseSelfAttention**: This module contains a simplified BertSelfAttention layer that can be used instead of original dense Bert Self-Attention layer. Our implementation is based on [DeepSpeedExample](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py#L373-#L434). * **SparseAttentionUtils**: This module provides few utility functions to handle adapting pre-trained model with sparse attention: * `replace_model_self_attention_with_sparse_self_attention`: If you have currently loaded a model and want to replace self-attention module with sparse self-attention, you can simply use this function to handle it for you. It currently handles BERT and RoBERTa based pre-trained models, but you can extend it base on your model type if it is different from these two. You also need to extend the position embedding to handle new sequence length; this can be done using `extend_position_embedding` function. @@ -27,20 +22,132 @@ on the intermediate attention scores. 
For more details about SelfAttantion, plea * `extend_position_embedding`: This function extends the position embedding based on the current values. For example, if you have a 128 max sequence length model and extending it to a 1k sequence length, it replicates current embeddings 8 times to initialize new embedding. Experimentally we have seen such initialization works much better than initializing from scratch; leads to faster convergence. * `pad_to_block_size`: This function pads input tokens and attention mask on sequence length dimension to be multiple of block size; this is a requirement for SA. * `unpad_sequence_output`: This function unpads sequence output if inputs of the model were padded. -* **SparsityConfig**: this is an abstract class for sparsity structure. Any sparsity structure extends this class and writes its own `make_layout` function. DeepSpeed currently provides the following structures that will be described in next section: +* **SparsityConfig**: this is an abstract class for sparsity structure. Any sparsity structure needs to extend this class and writes its own sparsity pattern construction; `make_layout` function. DeepSpeed currently provides the following structures that will be described in [How to config sparsity structures](/tutorials/sparse-attention/#how-to-config-sparsity-structures) section: * `FixedSparsityConfig` * `BSLongformerSparsityConfig` * `BigBirdSparsityConfig` * `VariableSparsityConfig` - -### BertSparseSelfAttention Example -We have currently integrated Sparse Attention with our [bing_bert](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py) code that can be used as an example for integration. In this example, we replace, BertSelfAttention module with BertSparseSelfAttention. Using DeepSpeed launcher, you can enable sparse attention using `deepspeed_sparse_attention` argument ([example](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/ds_sa_train_bert_bsz64k_seq128.sh)) and add your desired sparsity config into the [DeepSpeed config file](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_bsz64k_lamb_config_seq128.json). In this example, we have used `fixed` sparsity mode. Further, you need to pad sequence dimension of `input_ids` and `attention_mask` to be multiple of sparse block size. As mentioned above, DeepSpeed provides utility functions for padding and unpadding and you can check our [example](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py) to see where and how pad and unpad the inputs or outputs of the model. + * `DenseSparsityConfig` **Note:** Currently DeepSpeed Transformer Kernels do not support Sparse Attention. To use Sparse Attention, you need to disable Transformer Kernels! {: .notice--warning} -### Sparsity structures -Following we describe supported sparsity structures, their parameter set and the flexibility of adding arbitrary sparsity pattern on the self-attention layer. +## How to use sparse attention with DeepSpeed launcher +In this section we describe how to use DeepSpeed Sparse Attention through our [bing_bert](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py) code. + +* **Update attention module**: First, you need to update your attention module based on sparse computation. 
Here, we use [BertSparseSelfAttention](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py), which is the sparse version of `BertSelfAttention` from our [bing_bert](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py) code. It rewrites `BertSelfAttention` by replacing:
+
+```python
+attention_scores = torch.matmul(query_layer, key_layer)
+attention_scores = attention_scores / math.sqrt(
+    self.attention_head_size)
+
+# Apply the attention mask (precomputed for all layers in the BertModel forward() function)
+attention_scores = attention_scores + attention_mask
+
+pdtype = attention_scores.dtype
+# Normalize the attention scores to probabilities.
+attention_probs = self.softmax(attention_scores)
+
+# This is actually dropping out entire tokens to attend to, which might
+# seem a bit unusual, but is taken from the original Transformer paper.
+attention_probs = self.dropout(attention_probs)
+
+context_layer = torch.matmul(attention_probs, value_layer)
+```
+
+with:
+
+```python
+context_layer = self.sparse_self_attention(
+    query_layer,
+    key_layer,
+    value_layer,
+    key_padding_mask=attention_mask)
+```
+
+in which `sparse_self_attention` is an instance of [SparseSelfAttention](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/ops/sparse_attention/sparse_self_attention.py). This module computes the attention context through sparse attention, replacing the underlying matrix multiplications and softmax with their sparse equivalents. You can update any other attention module similarly.
+
+* **Set up the sparse attention config in the model**: You need to set up the sparse attention config. In our example, this is done in the `BertModel`.
+
+```python
+self.pad_token_id = config.pad_token_id if hasattr(
+    config, 'pad_token_id') and config.pad_token_id is not None else 0
+# set sparse_attention_config if it has been selected
+self.sparse_attention_config = get_sparse_attention_config(
+    args, config.num_attention_heads)
+self.encoder = BertEncoder(
+    config, args, sparse_attention_config=self.sparse_attention_config)
+```
+
+* **Update encoder model**: Further, you need to update your encoder model to use SA for the attention layer when SA is enabled. Please check our [bing_bert example](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py), in which we use `BertSparseSelfAttention` instead of `BertSelfAttention` when SA is enabled.
+
+```python
+if sparse_attention_config is not None:
+    from deepspeed.ops.sparse_attention import BertSparseSelfAttention
+
+    layer.attention.self = BertSparseSelfAttention(
+        config, sparsity_config=sparse_attention_config)
+```
+
+* **Pad and unpad input data**: You may also need to pad the sequence dimension of `input_ids` and `attention_mask` to a multiple of the sparse block size. As mentioned in the [modules](/tutorials/sparse-attention/#sparse-attention-modules) section above, DeepSpeed provides utility functions for padding and unpadding. Please check our [bing_bert example](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py) to see where and how to pad and unpad the inputs or outputs of the model; a short sketch of the block-size arithmetic follows, before the excerpt.
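The only hard requirement behind this step is that the sequence length seen by the sparse kernels is a multiple of the sparsity block size. The snippet below is a hypothetical illustration of that arithmetic (the helper name is ours, not a DeepSpeed API; in real code use `SparseAttentionUtils.pad_to_block_size` as in the excerpt that follows):

```python
import torch.nn.functional as F

def pad_to_multiple_of_block(input_ids, attention_mask, block_size, pad_token_id):
    # Hypothetical helper: shows the padding arithmetic that the DeepSpeed
    # utility performs for you; padded positions are masked out.
    seq_len = input_ids.size(1)
    pad_len = (block_size - seq_len % block_size) % block_size
    if pad_len > 0:
        input_ids = F.pad(input_ids, (0, pad_len), value=pad_token_id)
        attention_mask = F.pad(attention_mask, (0, pad_len), value=0)
    return pad_len, input_ids, attention_mask

# Example: with block=16, a batch of 250-token sequences is padded to 256,
# and pad_len=6 is kept so the sequence output can be unpadded afterwards.
```

In the bing_bert model itself, the padding and unpadding are done with the DeepSpeed utilities as shown below: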
+
+```python
+if self.sparse_attention_config is not None:
+    pad_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds = SparseAttentionUtils.pad_to_block_size(
+        block_size=self.sparse_attention_config.block,
+        input_ids=input_ids,
+        attention_mask=extended_attention_mask,
+        token_type_ids=token_type_ids,
+        position_ids=None,
+        inputs_embeds=None,
+        pad_token_id=self.pad_token_id,
+        model_mbeddings=self.embeddings)
+.
+.
+.
+# If BertEncoder uses sparse attention and input_ids were padded, the sequence output needs to be unpadded to the original length
+if self.sparse_attention_config is not None and pad_len > 0:
+    encoded_layers[-1] = SparseAttentionUtils.unpad_sequence_output(
+        pad_len, encoded_layers[-1])
+```
+
+* **Enable sparse attention**: To use DeepSpeed Sparse Attention, you need to enable it in the launcher script through the `deepspeed_sparse_attention` argument:
+
+```bash
+--deepspeed_sparse_attention
+```
+
+Please check [our bing_bert runner script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/ds_sa_train_bert_bsz64k_seq128.sh) as an example of how to enable SA with the DeepSpeed launcher.
+
+* **Add sparsity config**: The sparsity config can be set through the [DeepSpeed JSON config file](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_bsz64k_lamb_config_seq128.json). In this example, we have used the `fixed` sparsity mode, which is described in the [How to config sparsity structures](/tutorials/sparse-attention/#how-to-config-sparsity-structures) section.
+
+```json
+"sparse_attention": {
+    "mode": "fixed",
+    "block": 16,
+    "different_layout_per_head": true,
+    "num_local_blocks": 4,
+    "num_global_blocks": 1,
+    "attention": "bidirectional",
+    "horizontal_global_attention": false,
+    "num_different_global_patterns": 4
+}
+```
+
+## How to use individual kernels
+DeepSpeed Sparse Attention can be used as a feature through DeepSpeed, as described above, or simply integrated with any Transformer model as a self-attention module alone. Further, the building block kernels, matrix multiplication and softmax, can be used separately. To use sparse attention alone, you can simply install DeepSpeed and import any of the modules described in the [modules](/tutorials/sparse-attention/#sparse-attention-modules) section; for example:
+
+```python
+from deepspeed.ops.sparse_attention import SparseSelfAttention
+```
+
+Please refer to the Docstrings for details of how to use each module separately.
+
+## How to config sparsity structures
+In the following, we describe the supported sparsity structures, their parameter sets, and the flexibility of adding an arbitrary sparsity pattern on the self-attention layer. You can update the DeepSpeed config file using any of the supported sparsity structures and set the parameters accordingly.
+
 * **SparsityConfig**: This module is the parent class for all sparsity structures and contains their shared features. It takes the following parameters:
  * `num_heads`: an integer determining the number of attention heads of the layer.
@@ -84,5 +191,5 @@ Figure bellow illustrates an example of `variable` sparsity, in which blue, oran
 Further, we provide a `dense` pattern (`DenseSparsityConfig`) that can be used for the sake of testing; it represents full attention.
-### How to expand block-base sparsity patterns
-Our building block kernels, block-based `MatMul` & `Softmax`, can accept any block-based sparsity.
This provides the flexibility to apply any block-based sparsity pattern to attention score. To define and apply a new sparsity pattern, you can simply follow any of the above sparsity structures. You need to add a new class that expands `SparsityConfig` and define `make_layout` function based on how your sparsity is structured. You can add any extra parameters you may need or just use default parameters of the parent class. +## How to support new user defined sparsity structures +Our building block kernels, block-based `MatMul` and `Softmax`, can accept any block-based sparsity. This provides the flexibility to apply any block-based sparsity pattern to attention score. To define and apply a new sparsity pattern, you can simply follow any of the above sparsity structures. You need to add a new class that expands `SparsityConfig` and define `make_layout` function based on how your sparsity is structured. You can add any extra parameters you may need or just use default parameters of the parent class. From bae81311b3b3571f7e5b25c402d2e65a1475498d Mon Sep 17 00:00:00 2001 From: RezaYazdaniAminabadi <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Sun, 6 Sep 2020 12:50:23 -0700 Subject: [PATCH 37/44] fixing corner cases (#371) --- csrc/adam/cpu_adam.cpp | 771 ++++++++++++++++++-------------- csrc/includes/cpu_adam.h | 11 +- tests/unit/test_adam_acuracy.py | 8 +- 3 files changed, 457 insertions(+), 333 deletions(-) mode change 100644 => 100755 csrc/includes/cpu_adam.h diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp index d884451700e9..e6afa7afe3e8 100755 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include @@ -11,9 +10,13 @@ #include "cuda.h" #include "curand.h" #include "custom_cuda_layers.h" +#include static std::unordered_map> s_optimizers; +#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) + + // C++ interface void Adam_Optimizer::Step(float* _params, @@ -26,80 +29,136 @@ void Adam_Optimizer::Step(float* _params, _betta1_t *= _betta1; _betta2_t *= _betta2; - bool buf_index = 0; - - __m512 betta1_4 = _mm512_set1_ps(_betta1); - __m512 betta2_4 = _mm512_set1_ps(_betta2); + AVX_512 betta1_4; + betta1_4.data = _mm512_set1_ps(_betta1); + AVX_512 betta2_4; + betta2_4.data = _mm512_set1_ps(_betta2); float betta1_minus1 = 1 - _betta1; float betta2_minus1 = 1 - _betta2; - __m512 betta1_minus1_4 = _mm512_set1_ps(betta1_minus1); - __m512 betta2_minus1_4 = _mm512_set1_ps(betta2_minus1); + AVX_512 betta1_minus1_4; + betta1_minus1_4.data = _mm512_set1_ps(betta1_minus1); + AVX_512 betta2_minus1_4; + betta2_minus1_4.data = _mm512_set1_ps(betta2_minus1); float bias_correction1 = 1 - _betta1_t; - float bias_correction2 = 1 - _betta2_t; - //__m512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); - __m512 bias_correction2_4 = _mm512_set1_ps(bias_correction2); + float bias_correction2 = 1 / sqrt(1 - _betta2_t); + //AVX_512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); + AVX_512 bias2_sqrt ; + bias2_sqrt.data = _mm512_set1_ps(bias_correction2); - __m512 eps_4 = _mm512_set1_ps(_eps); + AVX_512 eps_4; + eps_4.data = _mm512_set1_ps(_eps); float step_size = -1 * _alpha / bias_correction1; - __m512 step_size_4 = _mm512_set1_ps(step_size); + AVX_512 step_size_4; + step_size_4.data = _mm512_set1_ps(step_size); - __m512 bias2_sqrt = _mm512_sqrt_ps(bias_correction2_4); + size_t rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH); - size_t tile = 0; - - for (size_t t = 0; t < _param_size; t += 
TILE) { + for (size_t t = 0; t < rounded_size; t += TILE) { size_t copy_size = TILE; - if ((t + TILE) > _param_size) copy_size = _param_size - t; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; size_t offset = copy_size + t; #pragma omp parallel for for (size_t i = t; i < offset; i += SIMD_WIDTH) { - __m512 grad_4 = _mm512_loadu_ps(grads + i); + AVX_512 grad_4; + grad_4.data = _mm512_loadu_ps(grads + i); - __m512 momntum_4 = _mm512_loadu_ps(_exp_avg + i); - __m512 varianc_4 = _mm512_loadu_ps(_exp_avg_sq + i); + AVX_512 momntum_4; + momntum_4.data = _mm512_loadu_ps(_exp_avg + i); + AVX_512 varianc_4; + varianc_4.data = _mm512_loadu_ps(_exp_avg_sq + i); - __m512 param_4 = _mm512_loadu_ps(_params + i); + AVX_512 param_4; + param_4.data = _mm512_loadu_ps(_params + i); if (_weight_decay > 0) { - __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); - grad_4 = _mm512_fmadd_ps(param_4, weight_decay4, grad_4); + AVX_512 weight_decay4; + weight_decay4.data = _mm512_set1_ps(_weight_decay); + grad_4.data = _mm512_fmadd_ps(param_4.data, weight_decay4.data, grad_4.data); } - momntum_4 = _mm512_mul_ps(momntum_4, betta1_4); - momntum_4 = _mm512_fmadd_ps(grad_4, betta1_minus1_4, momntum_4); - - varianc_4 = _mm512_mul_ps(varianc_4, betta2_4); - grad_4 = _mm512_mul_ps(grad_4, grad_4); - varianc_4 = _mm512_fmadd_ps(grad_4, betta2_minus1_4, varianc_4); - - grad_4 = _mm512_sqrt_ps(varianc_4) / bias2_sqrt; - grad_4 = _mm512_add_ps(grad_4, eps_4); - grad_4 = _mm512_div_ps(momntum_4, grad_4); - - param_4 = _mm512_fmadd_ps(grad_4, step_size_4, param_4); - - _mm512_storeu_ps(_params + i, param_4); - _mm512_storeu_ps(_exp_avg + i, momntum_4); - _mm512_storeu_ps(_exp_avg_sq + i, varianc_4); + momntum_4.data = _mm512_mul_ps(momntum_4.data, betta1_4.data); + momntum_4.data = _mm512_fmadd_ps(grad_4.data, betta1_minus1_4.data, momntum_4.data); + + varianc_4.data = _mm512_mul_ps(varianc_4.data, betta2_4.data); + grad_4.data = _mm512_mul_ps(grad_4.data, grad_4.data); + varianc_4.data = _mm512_fmadd_ps(grad_4.data, betta2_minus1_4.data, varianc_4.data); + + grad_4.data = _mm512_sqrt_ps(varianc_4.data); + grad_4.data = _mm512_fmadd_ps(grad_4.data, bias2_sqrt.data, eps_4.data); + grad_4.data = _mm512_div_ps(momntum_4.data, grad_4.data); + + param_4.data = _mm512_fmadd_ps(grad_4.data, step_size_4.data, param_4.data); + if (dev_params) { + for (size_t j = 0; j < SIMD_WIDTH; j += 4) { + _doubled_buffer[_buf_index][(i - t) + (j << 2)] = (__half)param_4.data_f[(j << 2)]; + _doubled_buffer[_buf_index][(i - t) + (j << 2) + 1] = (__half)param_4.data_f[(j << 2) + 1]; + _doubled_buffer[_buf_index][(i - t) + (j << 2) + 2] = (__half)param_4.data_f[(j << 2) + 2]; + _doubled_buffer[_buf_index][(i - t) + (j << 2) + 3] = (__half)param_4.data_f[(j << 2) + 3]; + } + } + _mm512_storeu_ps(_params + i, param_4.data); + _mm512_storeu_ps(_exp_avg + i, momntum_4.data); + _mm512_storeu_ps(_exp_avg_sq + i, varianc_4.data); } - if (dev_params) { + if (dev_params) {/* #pragma omp parallel for for (size_t j = 0; j < copy_size; j += 4) { - _doubled_buffer[buf_index][j] = (__half)_params[t + j]; - _doubled_buffer[buf_index][j + 1] = (__half)_params[t + j + 1]; - _doubled_buffer[buf_index][j + 2] = (__half)_params[t + j + 2]; - _doubled_buffer[buf_index][j + 3] = (__half)_params[t + j + 3]; - } + _doubled_buffer[_buf_index][j] = (__half)_params[t + j]; + _doubled_buffer[_buf_index][j + 1] = (__half)_params[t + j + 1]; + _doubled_buffer[_buf_index][j + 2] = (__half)_params[t + j + 2]; + _doubled_buffer[_buf_index][j + 3] = (__half)_params[t + 
j + 3]; + }*/ CUDA_CHECK(cudaMemcpyAsync(dev_params + t, - _doubled_buffer[buf_index], + _doubled_buffer[_buf_index], copy_size * sizeof(__half), cudaMemcpyHostToDevice, Context::Instance().GetCurrentStream())); - buf_index = !buf_index; + _buf_index = !_buf_index; + } + } + + if(_param_size > rounded_size) + { +#pragma omp parallel for + for (size_t k = rounded_size; k < _param_size; k++) + { + float grad = grads[k]; + float param = _params[k]; + float momntum = _exp_avg[k]; + float varianc = _exp_avg_sq[k]; + if (_weight_decay > 0) { + grad = param * _weight_decay + grad; + } + + momntum *= momntum * _betta1; + momntum = grad * betta1_minus1 + momntum; + + varianc = varianc * _betta2; + grad = grad * grad; + varianc = grad * betta2_minus1 + varianc; + + grad = sqrt(varianc); + grad = grad * bias_correction2 + _eps; + grad = momntum / grad; + + param = grad * step_size + param; + if (dev_params) + _doubled_buffer[_buf_index][k - rounded_size] = (__half)param; + + _params[k] = param; + _exp_avg[k] = momntum; + _exp_avg_sq[k] = varianc; + } + if (dev_params) { + CUDA_CHECK(cudaMemcpyAsync(dev_params + rounded_size, + _doubled_buffer[_buf_index], + (_param_size - rounded_size) * sizeof(__half), + cudaMemcpyHostToDevice, + Context::Instance().GetCurrentStream())); } } } @@ -114,141 +173,165 @@ void Adam_Optimizer::Step_4(float* _params, _betta1_t *= _betta1; _betta2_t *= _betta2; - __m512 betta1_4 = _mm512_set1_ps(_betta1); - __m512 betta2_4 = _mm512_set1_ps(_betta2); - - bool buf_index = 0; - size_t tile = 0; + AVX_512 betta1_4; + betta1_4.data = _mm512_set1_ps(_betta1); + AVX_512 betta2_4; + betta2_4.data = _mm512_set1_ps(_betta2); float betta1_minus1 = 1 - _betta1; float betta2_minus1 = 1 - _betta2; - __m512 betta1_minus1_4 = _mm512_set1_ps(betta1_minus1); - __m512 betta2_minus1_4 = _mm512_set1_ps(betta2_minus1); + AVX_512 betta1_minus1_4; + betta1_minus1_4.data = _mm512_set1_ps(betta1_minus1); + AVX_512 betta2_minus1_4; + betta2_minus1_4.data = _mm512_set1_ps(betta2_minus1); float bias_correction1 = 1 - _betta1_t; - float bias_correction2 = 1 - _betta2_t; - //__m512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); - __m512 bias_correction2_4 = _mm512_set1_ps(bias_correction2); + float bias_correction2 = 1 / sqrt(1 - _betta2_t); + //AVX_512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); + AVX_512 bias2_sqrt ; + bias2_sqrt.data = _mm512_set1_ps(bias_correction2); - __m512 eps_4 = _mm512_set1_ps(_eps); + AVX_512 eps_4; + eps_4.data = _mm512_set1_ps(_eps); float step_size = -1 * _alpha / bias_correction1; - __m512 step_size_4 = _mm512_set1_ps(step_size); + AVX_512 step_size_4; + step_size_4.data = _mm512_set1_ps(step_size); - __m512 bias2_sqrt = _mm512_sqrt_ps(bias_correction2_4); + size_t rounded_size = ROUND_DOWN(_param_size, (SIMD_WIDTH << 2)); - for (size_t t = 0; t < _param_size; t += TILE) { + for (size_t t = 0; t < rounded_size; t += TILE) { size_t copy_size = TILE; - if ((t + TILE) > _param_size) copy_size = _param_size - t; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; size_t offset = copy_size + t; #pragma omp parallel for for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) { - __m512 grad_4[4]; - grad_4[0] = _mm512_loadu_ps(grads + i); - grad_4[1] = _mm512_loadu_ps(grads + i + SIMD_WIDTH); - grad_4[2] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); - grad_4[3] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); - - __m512 momntum_4[4]; - momntum_4[0] = _mm512_loadu_ps(_exp_avg + i); - momntum_4[1] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); - 
momntum_4[2] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); - momntum_4[3] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); - - __m512 varianc_4[4]; - varianc_4[0] = _mm512_loadu_ps(_exp_avg_sq + i); - varianc_4[1] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); - varianc_4[2] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); - varianc_4[3] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); - - __m512 param_4[4]; - param_4[0] = _mm512_loadu_ps(_params + i); - param_4[1] = _mm512_loadu_ps(_params + i + SIMD_WIDTH); - param_4[2] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); - param_4[3] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); + AVX_512 grad_4[4]; + grad_4[0].data = _mm512_loadu_ps(grads + i); + grad_4[1].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH); + grad_4[2].data = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); + grad_4[3].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); + + AVX_512 momntum_4[4]; + momntum_4[0].data = _mm512_loadu_ps(_exp_avg + i); + momntum_4[1].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); + momntum_4[2].data = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); + momntum_4[3].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); + + AVX_512 varianc_4[4]; + varianc_4[0].data = _mm512_loadu_ps(_exp_avg_sq + i); + varianc_4[1].data = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); + varianc_4[2].data = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); + varianc_4[3].data = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); + + AVX_512 param_4[4]; + param_4[0].data = _mm512_loadu_ps(_params + i); + param_4[1].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH); + param_4[2].data = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); + param_4[3].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); if (_weight_decay > 0) { - __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); - grad_4[0] = _mm512_fmadd_ps(param_4[0], weight_decay4, grad_4[0]); - grad_4[1] = _mm512_fmadd_ps(param_4[1], weight_decay4, grad_4[1]); - grad_4[2] = _mm512_fmadd_ps(param_4[2], weight_decay4, grad_4[2]); - grad_4[3] = _mm512_fmadd_ps(param_4[3], weight_decay4, grad_4[3]); + AVX_512 weight_decay4; + weight_decay4.data = _mm512_set1_ps(_weight_decay); + grad_4[0].data = _mm512_fmadd_ps(param_4[0].data, weight_decay4.data, grad_4[0].data); + grad_4[1].data = _mm512_fmadd_ps(param_4[1].data, weight_decay4.data, grad_4[1].data); + grad_4[2].data = _mm512_fmadd_ps(param_4[2].data, weight_decay4.data, grad_4[2].data); + grad_4[3].data = _mm512_fmadd_ps(param_4[3].data, weight_decay4.data, grad_4[3].data); + } + + momntum_4[0].data = _mm512_mul_ps(momntum_4[0].data, betta1_4.data); + momntum_4[0].data = _mm512_fmadd_ps(grad_4[0].data, betta1_minus1_4.data, momntum_4[0].data); + momntum_4[1].data = _mm512_mul_ps(momntum_4[1].data, betta1_4.data); + momntum_4[1].data = _mm512_fmadd_ps(grad_4[1].data, betta1_minus1_4.data, momntum_4[1].data); + momntum_4[2].data = _mm512_mul_ps(momntum_4[2].data, betta1_4.data); + momntum_4[2].data = _mm512_fmadd_ps(grad_4[2].data, betta1_minus1_4.data, momntum_4[2].data); + momntum_4[3].data = _mm512_mul_ps(momntum_4[3].data, betta1_4.data); + momntum_4[3].data = _mm512_fmadd_ps(grad_4[3].data, betta1_minus1_4.data, momntum_4[3].data); + + varianc_4[0].data = _mm512_mul_ps(varianc_4[0].data, betta2_4.data); + varianc_4[1].data = _mm512_mul_ps(varianc_4[1].data, betta2_4.data); + varianc_4[2].data = _mm512_mul_ps(varianc_4[2].data, betta2_4.data); + varianc_4[3].data = _mm512_mul_ps(varianc_4[3].data, betta2_4.data); + grad_4[0].data = 
_mm512_mul_ps(grad_4[0].data, grad_4[0].data); + grad_4[1].data = _mm512_mul_ps(grad_4[1].data, grad_4[1].data); + grad_4[2].data = _mm512_mul_ps(grad_4[2].data, grad_4[2].data); + grad_4[3].data = _mm512_mul_ps(grad_4[3].data, grad_4[3].data); + varianc_4[0].data = _mm512_fmadd_ps(grad_4[0].data, betta2_minus1_4.data, varianc_4[0].data); + varianc_4[1].data = _mm512_fmadd_ps(grad_4[1].data, betta2_minus1_4.data, varianc_4[1].data); + varianc_4[2].data = _mm512_fmadd_ps(grad_4[2].data, betta2_minus1_4.data, varianc_4[2].data); + varianc_4[3].data = _mm512_fmadd_ps(grad_4[3].data, betta2_minus1_4.data, varianc_4[3].data); + + grad_4[0].data = _mm512_sqrt_ps(varianc_4[0].data); + grad_4[1].data = _mm512_sqrt_ps(varianc_4[1].data); + grad_4[2].data = _mm512_sqrt_ps(varianc_4[2].data); + grad_4[3].data = _mm512_sqrt_ps(varianc_4[3].data); + + grad_4[0].data = _mm512_fmadd_ps(grad_4[0].data, bias2_sqrt.data, eps_4.data); + grad_4[1].data = _mm512_fmadd_ps(grad_4[1].data, bias2_sqrt.data, eps_4.data); + grad_4[2].data = _mm512_fmadd_ps(grad_4[2].data, bias2_sqrt.data, eps_4.data); + grad_4[3].data = _mm512_fmadd_ps(grad_4[3].data, bias2_sqrt.data, eps_4.data); + grad_4[0].data = _mm512_div_ps(momntum_4[0].data, grad_4[0].data); + grad_4[1].data = _mm512_div_ps(momntum_4[1].data, grad_4[1].data); + grad_4[2].data = _mm512_div_ps(momntum_4[2].data, grad_4[2].data); + grad_4[3].data = _mm512_div_ps(momntum_4[3].data, grad_4[3].data); + + param_4[0].data = _mm512_fmadd_ps(grad_4[0].data, step_size_4.data, param_4[0].data); + param_4[1].data = _mm512_fmadd_ps(grad_4[1].data, step_size_4.data, param_4[1].data); + param_4[2].data = _mm512_fmadd_ps(grad_4[2].data, step_size_4.data, param_4[2].data); + param_4[3].data = _mm512_fmadd_ps(grad_4[3].data, step_size_4.data, param_4[3].data); + + if (dev_params) { + for(int u = 0;u < 4;u++) + { + for (size_t j = 0; j < SIMD_WIDTH; j += 4) { + _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2)] = (__half)param_4[u].data_f[(j << 2)]; + _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2) + 1] = (__half)param_4[u].data_f[(j << 2) + 1]; + _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2) + 2] = (__half)param_4[u].data_f[(j << 2) + 2]; + _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2) + 3] = (__half)param_4[u].data_f[(j << 2) + 3]; + } + } } - momntum_4[0] = _mm512_mul_ps(momntum_4[0], betta1_4); - momntum_4[0] = _mm512_fmadd_ps(grad_4[0], betta1_minus1_4, momntum_4[0]); - momntum_4[1] = _mm512_mul_ps(momntum_4[1], betta1_4); - momntum_4[1] = _mm512_fmadd_ps(grad_4[1], betta1_minus1_4, momntum_4[1]); - momntum_4[2] = _mm512_mul_ps(momntum_4[2], betta1_4); - momntum_4[2] = _mm512_fmadd_ps(grad_4[2], betta1_minus1_4, momntum_4[2]); - momntum_4[3] = _mm512_mul_ps(momntum_4[3], betta1_4); - momntum_4[3] = _mm512_fmadd_ps(grad_4[3], betta1_minus1_4, momntum_4[3]); - - varianc_4[0] = _mm512_mul_ps(varianc_4[0], betta2_4); - varianc_4[1] = _mm512_mul_ps(varianc_4[1], betta2_4); - varianc_4[2] = _mm512_mul_ps(varianc_4[2], betta2_4); - varianc_4[3] = _mm512_mul_ps(varianc_4[3], betta2_4); - grad_4[0] = _mm512_mul_ps(grad_4[0], grad_4[0]); - grad_4[1] = _mm512_mul_ps(grad_4[1], grad_4[1]); - grad_4[2] = _mm512_mul_ps(grad_4[2], grad_4[2]); - grad_4[3] = _mm512_mul_ps(grad_4[3], grad_4[3]); - varianc_4[0] = _mm512_fmadd_ps(grad_4[0], betta2_minus1_4, varianc_4[0]); - varianc_4[1] = _mm512_fmadd_ps(grad_4[1], betta2_minus1_4, varianc_4[1]); - varianc_4[2] = _mm512_fmadd_ps(grad_4[2], betta2_minus1_4, varianc_4[2]); - varianc_4[3] = 
_mm512_fmadd_ps(grad_4[3], betta2_minus1_4, varianc_4[3]); - - grad_4[0] = _mm512_sqrt_ps(varianc_4[0]) / bias2_sqrt; - grad_4[1] = _mm512_sqrt_ps(varianc_4[1]) / bias2_sqrt; - grad_4[2] = _mm512_sqrt_ps(varianc_4[2]) / bias2_sqrt; - grad_4[3] = _mm512_sqrt_ps(varianc_4[3]) / bias2_sqrt; - - grad_4[0] = _mm512_add_ps(grad_4[0], eps_4); - grad_4[1] = _mm512_add_ps(grad_4[1], eps_4); - grad_4[2] = _mm512_add_ps(grad_4[2], eps_4); - grad_4[3] = _mm512_add_ps(grad_4[3], eps_4); - grad_4[0] = _mm512_div_ps(momntum_4[0], grad_4[0]); - grad_4[1] = _mm512_div_ps(momntum_4[1], grad_4[1]); - grad_4[2] = _mm512_div_ps(momntum_4[2], grad_4[2]); - grad_4[3] = _mm512_div_ps(momntum_4[3], grad_4[3]); - - param_4[0] = _mm512_fmadd_ps(grad_4[0], step_size_4, param_4[0]); - param_4[1] = _mm512_fmadd_ps(grad_4[1], step_size_4, param_4[1]); - param_4[2] = _mm512_fmadd_ps(grad_4[2], step_size_4, param_4[2]); - param_4[3] = _mm512_fmadd_ps(grad_4[3], step_size_4, param_4[3]); - - _mm512_storeu_ps(_params + i, param_4[0]); - _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1]); - _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), param_4[2]); - _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3]); - - _mm512_storeu_ps(_exp_avg + i, momntum_4[0]); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1]); - _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2]); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3]); - - _mm512_storeu_ps(_exp_avg_sq + i, varianc_4[0]); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH, varianc_4[1]); - _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1), varianc_4[2]); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3]); + _mm512_storeu_ps(_params + i, param_4[0].data); + _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1].data); + _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), param_4[2].data); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3].data); + + _mm512_storeu_ps(_exp_avg + i, momntum_4[0].data); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1].data); + _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2].data); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3].data); + + _mm512_storeu_ps(_exp_avg_sq + i, varianc_4[0].data); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH, varianc_4[1].data); + _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1), varianc_4[2].data); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3].data); } - if (dev_params) { + if (dev_params) {/* #pragma omp parallel for for (size_t j = 0; j < copy_size; j += 4) { - _doubled_buffer[buf_index][j] = (__half)_params[t + j]; - _doubled_buffer[buf_index][j + 1] = (__half)_params[t + j + 1]; - _doubled_buffer[buf_index][j + 2] = (__half)_params[t + j + 2]; - _doubled_buffer[buf_index][j + 3] = (__half)_params[t + j + 3]; - } + _doubled_buffer[_buf_index][j] = (__half)_params[t + j]; + _doubled_buffer[_buf_index][j + 1] = (__half)_params[t + j + 1]; + _doubled_buffer[_buf_index][j + 2] = (__half)_params[t + j + 2]; + _doubled_buffer[_buf_index][j + 3] = (__half)_params[t + j + 3]; + }*/ CUDA_CHECK(cudaMemcpyAsync(dev_params + t, - _doubled_buffer[buf_index], + _doubled_buffer[_buf_index], copy_size * sizeof(__half), cudaMemcpyHostToDevice, Context::Instance().GetCurrentStream())); - buf_index = !buf_index; + _buf_index = !_buf_index; } } + if(_param_size > rounded_size) + Step((_params + rounded_size), + (grads + rounded_size), + (_exp_avg + rounded_size), + (_exp_avg_sq + rounded_size), + (_param_size 
- rounded_size), + (dev_params != nullptr ? (dev_params + rounded_size) : dev_params)); } int create_adam_optimizer(int optimizer_id, @@ -278,207 +361,237 @@ void Adam_Optimizer::Step_8(float* _params, _betta1_t *= _betta1; _betta2_t *= _betta2; - __m512 betta1_4 = _mm512_set1_ps(_betta1); - __m512 betta2_4 = _mm512_set1_ps(_betta2); - - bool buf_index = 0; + AVX_512 betta1_4; + betta1_4.data = _mm512_set1_ps(_betta1); + AVX_512 betta2_4; + betta2_4.data = _mm512_set1_ps(_betta2); float betta1_minus1 = 1 - _betta1; float betta2_minus1 = 1 - _betta2; - __m512 betta1_minus1_4 = _mm512_set1_ps(betta1_minus1); - __m512 betta2_minus1_4 = _mm512_set1_ps(betta2_minus1); + AVX_512 betta1_minus1_4; + betta1_minus1_4.data = _mm512_set1_ps(betta1_minus1); + AVX_512 betta2_minus1_4; + betta2_minus1_4.data = _mm512_set1_ps(betta2_minus1); float bias_correction1 = 1 - _betta1_t; - float bias_correction2 = 1 - _betta2_t; - //__m512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); - __m512 bias_correction2_4 = _mm512_set1_ps(bias_correction2); + float bias_correction2 = 1 / sqrt(1 - _betta2_t); + //AVX_512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); + AVX_512 bias2_sqrt ; + bias2_sqrt.data = _mm512_set1_ps(bias_correction2); - __m512 eps_4 = _mm512_set1_ps(_eps); + AVX_512 eps_4; + eps_4.data = _mm512_set1_ps(_eps); float step_size = -1 * _alpha / bias_correction1; - __m512 step_size_4 = _mm512_set1_ps(step_size); + AVX_512 step_size_4; + step_size_4.data = _mm512_set1_ps(step_size); - __m512 bias2_sqrt = _mm512_sqrt_ps(bias_correction2_4); + size_t rounded_size = ROUND_DOWN(_param_size, (SIMD_WIDTH << 3)); - for (size_t t = 0; t < _param_size; t += TILE) { + for (size_t t = 0; t < rounded_size; t += TILE) { size_t copy_size = TILE; - if ((t + TILE) > _param_size) copy_size = _param_size - t; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; size_t offset = copy_size + t; #pragma omp parallel for for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) { - __m512 grad_4[8]; - grad_4[0] = _mm512_loadu_ps(grads + i); - grad_4[1] = _mm512_loadu_ps(grads + i + SIMD_WIDTH); - grad_4[2] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); - grad_4[3] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); - grad_4[4] = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 2)); - grad_4[5] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 5); - grad_4[6] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 6); - grad_4[7] = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 7); - - __m512 momntum_4[8]; - momntum_4[0] = _mm512_loadu_ps(_exp_avg + i); - momntum_4[1] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); - momntum_4[2] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); - momntum_4[3] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); - momntum_4[4] = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 2)); - momntum_4[5] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 5); - momntum_4[6] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 6); - momntum_4[7] = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 7); - - __m512 varianc_4[8]; - varianc_4[0] = _mm512_loadu_ps(_exp_avg_sq + i); - varianc_4[1] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); - varianc_4[2] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); - varianc_4[3] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); - varianc_4[4] = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 2)); - varianc_4[5] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5); - varianc_4[6] = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6); - varianc_4[7] = _mm512_loadu_ps(_exp_avg_sq + i + 
SIMD_WIDTH * 7); - - __m512 param_4[8]; - param_4[0] = _mm512_loadu_ps(_params + i); - param_4[1] = _mm512_loadu_ps(_params + i + SIMD_WIDTH); - param_4[2] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); - param_4[3] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); - param_4[4] = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 2)); - param_4[5] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 5); - param_4[6] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 6); - param_4[7] = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 7); + AVX_512 grad_4[8]; + grad_4[0].data = _mm512_loadu_ps(grads + i); + grad_4[1].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH); + grad_4[2].data = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); + grad_4[3].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); + grad_4[4].data = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 2)); + grad_4[5].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 5); + grad_4[6].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 6); + grad_4[7].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 7); + + AVX_512 momntum_4[8]; + momntum_4[0].data = _mm512_loadu_ps(_exp_avg + i); + momntum_4[1].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); + momntum_4[2].data = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); + momntum_4[3].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); + momntum_4[4].data = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 2)); + momntum_4[5].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 5); + momntum_4[6].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 6); + momntum_4[7].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 7); + + AVX_512 varianc_4[8]; + varianc_4[0].data = _mm512_loadu_ps(_exp_avg_sq + i); + varianc_4[1].data = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); + varianc_4[2].data = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); + varianc_4[3].data = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); + varianc_4[4].data = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 2)); + varianc_4[5].data = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5); + varianc_4[6].data = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6); + varianc_4[7].data = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7); + + AVX_512 param_4[8]; + param_4[0].data = _mm512_loadu_ps(_params + i); + param_4[1].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH); + param_4[2].data = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); + param_4[3].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); + param_4[4].data = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 2)); + param_4[5].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 5); + param_4[6].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 6); + param_4[7].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 7); if (_weight_decay > 0) { - __m512 weight_decay4 = _mm512_set1_ps(_weight_decay); - grad_4[0] = _mm512_fmadd_ps(param_4[0], weight_decay4, grad_4[0]); - grad_4[1] = _mm512_fmadd_ps(param_4[1], weight_decay4, grad_4[1]); - grad_4[2] = _mm512_fmadd_ps(param_4[2], weight_decay4, grad_4[2]); - grad_4[3] = _mm512_fmadd_ps(param_4[3], weight_decay4, grad_4[3]); - grad_4[4] = _mm512_fmadd_ps(param_4[4], weight_decay4, grad_4[4]); - grad_4[5] = _mm512_fmadd_ps(param_4[5], weight_decay4, grad_4[5]); - grad_4[6] = _mm512_fmadd_ps(param_4[6], weight_decay4, grad_4[6]); - grad_4[7] = _mm512_fmadd_ps(param_4[7], weight_decay4, grad_4[7]); + AVX_512 weight_decay4; + weight_decay4.data = _mm512_set1_ps(_weight_decay); + grad_4[0].data = _mm512_fmadd_ps(param_4[0].data, weight_decay4.data, grad_4[0].data); + 
grad_4[1].data = _mm512_fmadd_ps(param_4[1].data, weight_decay4.data, grad_4[1].data); + grad_4[2].data = _mm512_fmadd_ps(param_4[2].data, weight_decay4.data, grad_4[2].data); + grad_4[3].data = _mm512_fmadd_ps(param_4[3].data, weight_decay4.data, grad_4[3].data); + grad_4[4].data = _mm512_fmadd_ps(param_4[4].data, weight_decay4.data, grad_4[4].data); + grad_4[5].data = _mm512_fmadd_ps(param_4[5].data, weight_decay4.data, grad_4[5].data); + grad_4[6].data = _mm512_fmadd_ps(param_4[6].data, weight_decay4.data, grad_4[6].data); + grad_4[7].data = _mm512_fmadd_ps(param_4[7].data, weight_decay4.data, grad_4[7].data); + } + + momntum_4[0].data = _mm512_mul_ps(momntum_4[0].data, betta1_4.data); + momntum_4[0].data = _mm512_fmadd_ps(grad_4[0].data, betta1_minus1_4.data, momntum_4[0].data); + momntum_4[1].data = _mm512_mul_ps(momntum_4[1].data, betta1_4.data); + momntum_4[1].data = _mm512_fmadd_ps(grad_4[1].data, betta1_minus1_4.data, momntum_4[1].data); + momntum_4[2].data = _mm512_mul_ps(momntum_4[2].data, betta1_4.data); + momntum_4[2].data = _mm512_fmadd_ps(grad_4[2].data, betta1_minus1_4.data, momntum_4[2].data); + momntum_4[3].data = _mm512_mul_ps(momntum_4[3].data, betta1_4.data); + momntum_4[3].data = _mm512_fmadd_ps(grad_4[3].data, betta1_minus1_4.data, momntum_4[3].data); + momntum_4[4].data = _mm512_mul_ps(momntum_4[4].data, betta1_4.data); + momntum_4[4].data = _mm512_fmadd_ps(grad_4[4].data, betta1_minus1_4.data, momntum_4[4].data); + momntum_4[5].data = _mm512_mul_ps(momntum_4[5].data, betta1_4.data); + momntum_4[5].data = _mm512_fmadd_ps(grad_4[5].data, betta1_minus1_4.data, momntum_4[5].data); + momntum_4[6].data = _mm512_mul_ps(momntum_4[6].data, betta1_4.data); + momntum_4[6].data = _mm512_fmadd_ps(grad_4[6].data, betta1_minus1_4.data, momntum_4[6].data); + momntum_4[7].data = _mm512_mul_ps(momntum_4[7].data, betta1_4.data); + momntum_4[7].data = _mm512_fmadd_ps(grad_4[7].data, betta1_minus1_4.data, momntum_4[7].data); + + varianc_4[0].data = _mm512_mul_ps(varianc_4[0].data, betta2_4.data); + varianc_4[1].data = _mm512_mul_ps(varianc_4[1].data, betta2_4.data); + varianc_4[2].data = _mm512_mul_ps(varianc_4[2].data, betta2_4.data); + varianc_4[3].data = _mm512_mul_ps(varianc_4[3].data, betta2_4.data); + varianc_4[4].data = _mm512_mul_ps(varianc_4[4].data, betta2_4.data); + varianc_4[5].data = _mm512_mul_ps(varianc_4[5].data, betta2_4.data); + varianc_4[6].data = _mm512_mul_ps(varianc_4[6].data, betta2_4.data); + varianc_4[7].data = _mm512_mul_ps(varianc_4[7].data, betta2_4.data); + grad_4[0].data = _mm512_mul_ps(grad_4[0].data, grad_4[0].data); + grad_4[1].data = _mm512_mul_ps(grad_4[1].data, grad_4[1].data); + grad_4[2].data = _mm512_mul_ps(grad_4[2].data, grad_4[2].data); + grad_4[3].data = _mm512_mul_ps(grad_4[3].data, grad_4[3].data); + grad_4[4].data = _mm512_mul_ps(grad_4[4].data, grad_4[4].data); + grad_4[5].data = _mm512_mul_ps(grad_4[5].data, grad_4[5].data); + grad_4[6].data = _mm512_mul_ps(grad_4[6].data, grad_4[6].data); + grad_4[7].data = _mm512_mul_ps(grad_4[7].data, grad_4[7].data); + varianc_4[0].data = _mm512_fmadd_ps(grad_4[0].data, betta2_minus1_4.data, varianc_4[0].data); + varianc_4[1].data = _mm512_fmadd_ps(grad_4[1].data, betta2_minus1_4.data, varianc_4[1].data); + varianc_4[2].data = _mm512_fmadd_ps(grad_4[2].data, betta2_minus1_4.data, varianc_4[2].data); + varianc_4[3].data = _mm512_fmadd_ps(grad_4[3].data, betta2_minus1_4.data, varianc_4[3].data); + varianc_4[4].data = _mm512_fmadd_ps(grad_4[4].data, betta2_minus1_4.data, varianc_4[4].data); + 
varianc_4[5].data = _mm512_fmadd_ps(grad_4[5].data, betta2_minus1_4.data, varianc_4[5].data); + varianc_4[6].data = _mm512_fmadd_ps(grad_4[6].data, betta2_minus1_4.data, varianc_4[6].data); + varianc_4[7].data = _mm512_fmadd_ps(grad_4[7].data, betta2_minus1_4.data, varianc_4[7].data); + + grad_4[0].data = _mm512_sqrt_ps(varianc_4[0].data); + grad_4[1].data = _mm512_sqrt_ps(varianc_4[1].data); + grad_4[2].data = _mm512_sqrt_ps(varianc_4[2].data); + grad_4[3].data = _mm512_sqrt_ps(varianc_4[3].data); + grad_4[4].data = _mm512_sqrt_ps(varianc_4[4].data); + grad_4[5].data = _mm512_sqrt_ps(varianc_4[5].data); + grad_4[6].data = _mm512_sqrt_ps(varianc_4[6].data); + grad_4[7].data = _mm512_sqrt_ps(varianc_4[7].data); + + grad_4[0].data = _mm512_fmadd_ps(grad_4[0].data, bias2_sqrt.data, eps_4.data); + grad_4[1].data = _mm512_fmadd_ps(grad_4[1].data, bias2_sqrt.data, eps_4.data); + grad_4[2].data = _mm512_fmadd_ps(grad_4[2].data, bias2_sqrt.data, eps_4.data); + grad_4[3].data = _mm512_fmadd_ps(grad_4[3].data, bias2_sqrt.data, eps_4.data); + grad_4[4].data = _mm512_fmadd_ps(grad_4[4].data, bias2_sqrt.data, eps_4.data); + grad_4[5].data = _mm512_fmadd_ps(grad_4[5].data, bias2_sqrt.data, eps_4.data); + grad_4[6].data = _mm512_fmadd_ps(grad_4[6].data, bias2_sqrt.data, eps_4.data); + grad_4[7].data = _mm512_fmadd_ps(grad_4[7].data, bias2_sqrt.data, eps_4.data); + grad_4[0].data = _mm512_div_ps(momntum_4[0].data, grad_4[0].data); + grad_4[1].data = _mm512_div_ps(momntum_4[1].data, grad_4[1].data); + grad_4[2].data = _mm512_div_ps(momntum_4[2].data, grad_4[2].data); + grad_4[3].data = _mm512_div_ps(momntum_4[3].data, grad_4[3].data); + grad_4[4].data = _mm512_div_ps(momntum_4[4].data, grad_4[4].data); + grad_4[5].data = _mm512_div_ps(momntum_4[5].data, grad_4[5].data); + grad_4[6].data = _mm512_div_ps(momntum_4[6].data, grad_4[6].data); + grad_4[7].data = _mm512_div_ps(momntum_4[7].data, grad_4[7].data); + + param_4[0].data = _mm512_fmadd_ps(grad_4[0].data, step_size_4.data, param_4[0].data); + param_4[1].data = _mm512_fmadd_ps(grad_4[1].data, step_size_4.data, param_4[1].data); + param_4[2].data = _mm512_fmadd_ps(grad_4[2].data, step_size_4.data, param_4[2].data); + param_4[3].data = _mm512_fmadd_ps(grad_4[3].data, step_size_4.data, param_4[3].data); + param_4[4].data = _mm512_fmadd_ps(grad_4[4].data, step_size_4.data, param_4[4].data); + param_4[5].data = _mm512_fmadd_ps(grad_4[5].data, step_size_4.data, param_4[5].data); + param_4[6].data = _mm512_fmadd_ps(grad_4[6].data, step_size_4.data, param_4[6].data); + param_4[7].data = _mm512_fmadd_ps(grad_4[7].data, step_size_4.data, param_4[7].data); + + _mm512_storeu_ps(_params + i, param_4[0].data); + _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1].data); + _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), param_4[2].data); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3].data); + _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 2), param_4[4].data); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 5, param_4[5].data); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 6, param_4[6].data); + _mm512_storeu_ps(_params + i + SIMD_WIDTH * 7, param_4[7].data); + + //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t), param_4[0]); + //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH, param_4[1]); + //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 1), param_4[2]); + //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 3, param_4[3]); + //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) 
+ (SIMD_WIDTH << 2), param_4[4]); + //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 5, param_4[5]); + //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 6, param_4[6]); + //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 7, param_4[7]); + if (dev_params) { + for(int u = 0;u < 8;u++) + { + for (size_t j = 0; j < SIMD_WIDTH; j += 4) { + _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2)] = (__half)param_4[u].data_f[(j << 2)]; + _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2) + 1] = (__half)param_4[u].data_f[(j << 2) + 1]; + _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2) + 2] = (__half)param_4[u].data_f[(j << 2) + 2]; + _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2) + 3] = (__half)param_4[u].data_f[(j << 2) + 3]; + } + } } - momntum_4[0] = _mm512_mul_ps(momntum_4[0], betta1_4); - momntum_4[0] = _mm512_fmadd_ps(grad_4[0], betta1_minus1_4, momntum_4[0]); - momntum_4[1] = _mm512_mul_ps(momntum_4[1], betta1_4); - momntum_4[1] = _mm512_fmadd_ps(grad_4[1], betta1_minus1_4, momntum_4[1]); - momntum_4[2] = _mm512_mul_ps(momntum_4[2], betta1_4); - momntum_4[2] = _mm512_fmadd_ps(grad_4[2], betta1_minus1_4, momntum_4[2]); - momntum_4[3] = _mm512_mul_ps(momntum_4[3], betta1_4); - momntum_4[3] = _mm512_fmadd_ps(grad_4[3], betta1_minus1_4, momntum_4[3]); - momntum_4[4] = _mm512_mul_ps(momntum_4[4], betta1_4); - momntum_4[4] = _mm512_fmadd_ps(grad_4[4], betta1_minus1_4, momntum_4[4]); - momntum_4[5] = _mm512_mul_ps(momntum_4[5], betta1_4); - momntum_4[5] = _mm512_fmadd_ps(grad_4[5], betta1_minus1_4, momntum_4[5]); - momntum_4[6] = _mm512_mul_ps(momntum_4[6], betta1_4); - momntum_4[6] = _mm512_fmadd_ps(grad_4[6], betta1_minus1_4, momntum_4[6]); - momntum_4[7] = _mm512_mul_ps(momntum_4[7], betta1_4); - momntum_4[7] = _mm512_fmadd_ps(grad_4[7], betta1_minus1_4, momntum_4[7]); - - varianc_4[0] = _mm512_mul_ps(varianc_4[0], betta2_4); - varianc_4[1] = _mm512_mul_ps(varianc_4[1], betta2_4); - varianc_4[2] = _mm512_mul_ps(varianc_4[2], betta2_4); - varianc_4[3] = _mm512_mul_ps(varianc_4[3], betta2_4); - varianc_4[4] = _mm512_mul_ps(varianc_4[4], betta2_4); - varianc_4[5] = _mm512_mul_ps(varianc_4[5], betta2_4); - varianc_4[6] = _mm512_mul_ps(varianc_4[6], betta2_4); - varianc_4[7] = _mm512_mul_ps(varianc_4[7], betta2_4); - grad_4[0] = _mm512_mul_ps(grad_4[0], grad_4[0]); - grad_4[1] = _mm512_mul_ps(grad_4[1], grad_4[1]); - grad_4[2] = _mm512_mul_ps(grad_4[2], grad_4[2]); - grad_4[3] = _mm512_mul_ps(grad_4[3], grad_4[3]); - grad_4[4] = _mm512_mul_ps(grad_4[4], grad_4[4]); - grad_4[5] = _mm512_mul_ps(grad_4[5], grad_4[5]); - grad_4[6] = _mm512_mul_ps(grad_4[6], grad_4[6]); - grad_4[7] = _mm512_mul_ps(grad_4[7], grad_4[7]); - varianc_4[0] = _mm512_fmadd_ps(grad_4[0], betta2_minus1_4, varianc_4[0]); - varianc_4[1] = _mm512_fmadd_ps(grad_4[1], betta2_minus1_4, varianc_4[1]); - varianc_4[2] = _mm512_fmadd_ps(grad_4[2], betta2_minus1_4, varianc_4[2]); - varianc_4[3] = _mm512_fmadd_ps(grad_4[3], betta2_minus1_4, varianc_4[3]); - varianc_4[4] = _mm512_fmadd_ps(grad_4[4], betta2_minus1_4, varianc_4[4]); - varianc_4[5] = _mm512_fmadd_ps(grad_4[5], betta2_minus1_4, varianc_4[5]); - varianc_4[6] = _mm512_fmadd_ps(grad_4[6], betta2_minus1_4, varianc_4[6]); - varianc_4[7] = _mm512_fmadd_ps(grad_4[7], betta2_minus1_4, varianc_4[7]); - - grad_4[0] = _mm512_sqrt_ps(varianc_4[0]) / bias2_sqrt; - grad_4[1] = _mm512_sqrt_ps(varianc_4[1]) / bias2_sqrt; - grad_4[2] = _mm512_sqrt_ps(varianc_4[2]) / bias2_sqrt; - grad_4[3] = 
_mm512_sqrt_ps(varianc_4[3]) / bias2_sqrt; - grad_4[4] = _mm512_sqrt_ps(varianc_4[4]) / bias2_sqrt; - grad_4[5] = _mm512_sqrt_ps(varianc_4[5]) / bias2_sqrt; - grad_4[6] = _mm512_sqrt_ps(varianc_4[6]) / bias2_sqrt; - grad_4[7] = _mm512_sqrt_ps(varianc_4[7]) / bias2_sqrt; - - grad_4[0] = _mm512_add_ps(grad_4[0], eps_4); - grad_4[1] = _mm512_add_ps(grad_4[1], eps_4); - grad_4[2] = _mm512_add_ps(grad_4[2], eps_4); - grad_4[3] = _mm512_add_ps(grad_4[3], eps_4); - grad_4[4] = _mm512_add_ps(grad_4[4], eps_4); - grad_4[5] = _mm512_add_ps(grad_4[5], eps_4); - grad_4[6] = _mm512_add_ps(grad_4[6], eps_4); - grad_4[7] = _mm512_add_ps(grad_4[7], eps_4); - grad_4[0] = _mm512_div_ps(momntum_4[0], grad_4[0]); - grad_4[1] = _mm512_div_ps(momntum_4[1], grad_4[1]); - grad_4[2] = _mm512_div_ps(momntum_4[2], grad_4[2]); - grad_4[3] = _mm512_div_ps(momntum_4[3], grad_4[3]); - grad_4[4] = _mm512_div_ps(momntum_4[4], grad_4[4]); - grad_4[5] = _mm512_div_ps(momntum_4[5], grad_4[5]); - grad_4[6] = _mm512_div_ps(momntum_4[6], grad_4[6]); - grad_4[7] = _mm512_div_ps(momntum_4[7], grad_4[7]); - - param_4[0] = _mm512_fmadd_ps(grad_4[0], step_size_4, param_4[0]); - param_4[1] = _mm512_fmadd_ps(grad_4[1], step_size_4, param_4[1]); - param_4[2] = _mm512_fmadd_ps(grad_4[2], step_size_4, param_4[2]); - param_4[3] = _mm512_fmadd_ps(grad_4[3], step_size_4, param_4[3]); - param_4[4] = _mm512_fmadd_ps(grad_4[4], step_size_4, param_4[4]); - param_4[5] = _mm512_fmadd_ps(grad_4[5], step_size_4, param_4[5]); - param_4[6] = _mm512_fmadd_ps(grad_4[6], step_size_4, param_4[6]); - param_4[7] = _mm512_fmadd_ps(grad_4[7], step_size_4, param_4[7]); - - _mm512_storeu_ps(_params + i, param_4[0]); - _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1]); - _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), param_4[2]); - _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3]); - _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 2), param_4[4]); - _mm512_storeu_ps(_params + i + SIMD_WIDTH * 5, param_4[5]); - _mm512_storeu_ps(_params + i + SIMD_WIDTH * 6, param_4[6]); - _mm512_storeu_ps(_params + i + SIMD_WIDTH * 7, param_4[7]); - - _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t), param_4[0]); - _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + SIMD_WIDTH, param_4[1]); - _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + (SIMD_WIDTH << 1), param_4[2]); - _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + SIMD_WIDTH * 3, param_4[3]); - _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + (SIMD_WIDTH << 2), param_4[4]); - _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + SIMD_WIDTH * 5, param_4[5]); - _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + SIMD_WIDTH * 6, param_4[6]); - _mm512_storeu_ps(_doubled_buffer[buf_index] + (i - t) + SIMD_WIDTH * 7, param_4[7]); - - _mm512_storeu_ps(_exp_avg + i, momntum_4[0]); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1]); - _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2]); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3]); - _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 2), momntum_4[4]); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 5, momntum_4[5]); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 6, momntum_4[6]); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 7, momntum_4[7]); - - _mm512_storeu_ps(_exp_avg_sq + i, varianc_4[0]); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH, varianc_4[1]); - _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1), varianc_4[2]); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, 
varianc_4[3]); - _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 2), varianc_4[4]); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5, varianc_4[5]); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6, varianc_4[6]); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7, varianc_4[7]); + _mm512_storeu_ps(_exp_avg + i, momntum_4[0].data); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1].data); + _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2].data); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3].data); + _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 2), momntum_4[4].data); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 5, momntum_4[5].data); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 6, momntum_4[6].data); + _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 7, momntum_4[7].data); + + _mm512_storeu_ps(_exp_avg_sq + i, varianc_4[0].data); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH, varianc_4[1].data); + _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1), varianc_4[2].data); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3].data); + _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 2), varianc_4[4].data); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5, varianc_4[5].data); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6, varianc_4[6].data); + _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7, varianc_4[7].data); } if (dev_params) { - launch_param_update(_doubled_buffer[buf_index], + /*launch_param_update(_doubled_buffer[_buf_index], dev_params + t, copy_size, Context::Instance().GetCurrentStream()); - buf_index = !buf_index; + _buf_index = !_buf_index;*/ + CUDA_CHECK(cudaMemcpyAsync(dev_params + t, + _doubled_buffer[_buf_index], + copy_size * sizeof(__half), + cudaMemcpyHostToDevice, + Context::Instance().GetCurrentStream())); + _buf_index = !_buf_index; } } + if(_param_size > rounded_size) + Step_4((_params + rounded_size), + (grads + rounded_size), + (_exp_avg + rounded_size), + (_exp_avg_sq + rounded_size), + (_param_size - rounded_size), + (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params)); } int ds_adam_step(int optimizer_id, diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h old mode 100644 new mode 100755 index 40f4cba692ea..f56eb1501aa4 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -7,6 +7,7 @@ #include "cublas_v2.h" #include "cuda.h" #include "curand.h" +#include #define CUDA_CHECK(callstr) \ { \ @@ -34,7 +35,8 @@ class Adam_Optimizer { _eps(eps), _weight_decay(weight_decay), _betta1_t(1.0), - _betta2_t(1.0) + _betta2_t(1.0), + _buf_index(false) { cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(__half)); cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(__half)); @@ -64,6 +66,12 @@ class Adam_Optimizer { __half* dev_params = nullptr); private: + + union AVX_512{ + __m512 data; + float data_f[16]; + }; + float _alpha; float _betta1; float _betta2; @@ -74,4 +82,5 @@ class Adam_Optimizer { float _betta2_t; __half* _doubled_buffer[2]; + bool _buf_index; }; diff --git a/tests/unit/test_adam_acuracy.py b/tests/unit/test_adam_acuracy.py index e616e140bdde..f61b6ecba58b 100755 --- a/tests/unit/test_adam_acuracy.py +++ b/tests/unit/test_adam_acuracy.py @@ -10,9 +10,6 @@ def check_equal(first, second, atol=1e-2, verbose=False): - if verbose: - print(first) - print(second) x = first.detach().numpy() y = second.detach().numpy() if verbose: @@ -23,6 +20,11 @@ def check_equal(first, second, atol=1e-2, verbose=False): @pytest.mark.parametrize('model_size', [ + (64), + (22), + (55), + (127), + (1024), (1048576), ]) # yapf: disable def test_adam_opt(model_size): From 75e9e320a8c8ea539299d28e7a0973b54f4f9af7 Mon Sep 17 00:00:00 2001 From: RezaYazdaniAminabadi <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Sun, 6 Sep 2020 13:43:38 -0700 Subject: [PATCH 38/44] fix adam perormance (#372) * fixing corner cases * revert to the previous perf for adam * adam high performance * run precommit --- csrc/adam/cpu_adam.cpp | 301 +++++++++++++++++++++------------------ csrc/includes/cpu_adam.h | 13 +- 2 files changed, 168 insertions(+), 146 deletions(-) mode change 100755 => 100644 csrc/adam/cpu_adam.cpp mode change 100755 => 100644 csrc/includes/cpu_adam.h diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp old mode 100755 new mode 100644 index e6afa7afe3e8..2baab6177c2e --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -1,5 +1,6 @@ #include "cpu_adam.h" #include +#include #include #include #include @@ -10,13 +11,11 @@ #include "cuda.h" #include "curand.h" #include "custom_cuda_layers.h" -#include static std::unordered_map> s_optimizers; #define ROUND_DOWN(size, step) ((size) & ~((step)-1)) - // C++ interface void Adam_Optimizer::Step(float* _params, @@ -43,8 +42,8 @@ void Adam_Optimizer::Step(float* _params, float bias_correction1 = 1 - _betta1_t; float bias_correction2 = 1 / sqrt(1 - _betta2_t); - //AVX_512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); - AVX_512 bias2_sqrt ; + // AVX_512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); + AVX_512 bias2_sqrt; bias2_sqrt.data = _mm512_set1_ps(bias_correction2); AVX_512 eps_4; @@ -91,48 +90,44 @@ void Adam_Optimizer::Step(float* _params, grad_4.data = _mm512_div_ps(momntum_4.data, grad_4.data); param_4.data = _mm512_fmadd_ps(grad_4.data, step_size_4.data, param_4.data); - if (dev_params) { - for (size_t j = 0; j < SIMD_WIDTH; j += 4) { - _doubled_buffer[_buf_index][(i - t) + (j << 2)] = (__half)param_4.data_f[(j << 2)]; - _doubled_buffer[_buf_index][(i - t) + (j << 2) + 1] = (__half)param_4.data_f[(j << 2) + 1]; 
- _doubled_buffer[_buf_index][(i - t) + (j << 2) + 2] = (__half)param_4.data_f[(j << 2) + 2]; - _doubled_buffer[_buf_index][(i - t) + (j << 2) + 3] = (__half)param_4.data_f[(j << 2) + 3]; - } - } + _mm512_storeu_ps(_params + i, param_4.data); + + if (dev_params) _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t), param_4.data); + _mm512_storeu_ps(_exp_avg + i, momntum_4.data); _mm512_storeu_ps(_exp_avg_sq + i, varianc_4.data); } - if (dev_params) {/* -#pragma omp parallel for - for (size_t j = 0; j < copy_size; j += 4) { - _doubled_buffer[_buf_index][j] = (__half)_params[t + j]; - _doubled_buffer[_buf_index][j + 1] = (__half)_params[t + j + 1]; - _doubled_buffer[_buf_index][j + 2] = (__half)_params[t + j + 2]; - _doubled_buffer[_buf_index][j + 3] = (__half)_params[t + j + 3]; - }*/ - - CUDA_CHECK(cudaMemcpyAsync(dev_params + t, - _doubled_buffer[_buf_index], - copy_size * sizeof(__half), - cudaMemcpyHostToDevice, - Context::Instance().GetCurrentStream())); + if (dev_params) { /* + #pragma omp parallel for + for (size_t j = 0; j < copy_size; j += 4) { + _doubled_buffer[_buf_index][j] = (__half)_params[t + j]; + _doubled_buffer[_buf_index][j + 1] = (__half)_params[t + j + 1]; + _doubled_buffer[_buf_index][j + 2] = (__half)_params[t + j + 2]; + _doubled_buffer[_buf_index][j + 3] = (__half)_params[t + j + 3]; + } + + CUDA_CHECK(cudaMemcpyAsync(dev_params + t, + _doubled_buffer[_buf_index], + copy_size * sizeof(__half), + cudaMemcpyHostToDevice, + Context::Instance().GetCurrentStream()));*/ + launch_param_update(_doubled_buffer[_buf_index], + dev_params + t, + copy_size, + Context::Instance().GetCurrentStream()); _buf_index = !_buf_index; } } - if(_param_size > rounded_size) - { + if (_param_size > rounded_size) { #pragma omp parallel for - for (size_t k = rounded_size; k < _param_size; k++) - { + for (size_t k = rounded_size; k < _param_size; k++) { float grad = grads[k]; float param = _params[k]; float momntum = _exp_avg[k]; float varianc = _exp_avg_sq[k]; - if (_weight_decay > 0) { - grad = param * _weight_decay + grad; - } + if (_weight_decay > 0) { grad = param * _weight_decay + grad; } momntum *= momntum * _betta1; momntum = grad * betta1_minus1 + momntum; @@ -146,19 +141,17 @@ void Adam_Optimizer::Step(float* _params, grad = momntum / grad; param = grad * step_size + param; - if (dev_params) - _doubled_buffer[_buf_index][k - rounded_size] = (__half)param; + if (dev_params) _doubled_buffer[_buf_index][k - rounded_size] = (__half)param; _params[k] = param; _exp_avg[k] = momntum; _exp_avg_sq[k] = varianc; } if (dev_params) { - CUDA_CHECK(cudaMemcpyAsync(dev_params + rounded_size, - _doubled_buffer[_buf_index], - (_param_size - rounded_size) * sizeof(__half), - cudaMemcpyHostToDevice, - Context::Instance().GetCurrentStream())); + launch_param_update(_doubled_buffer[_buf_index], + dev_params + rounded_size, + (_param_size - rounded_size), + Context::Instance().GetCurrentStream()); } } } @@ -187,8 +180,8 @@ void Adam_Optimizer::Step_4(float* _params, float bias_correction1 = 1 - _betta1_t; float bias_correction2 = 1 / sqrt(1 - _betta2_t); - //AVX_512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); - AVX_512 bias2_sqrt ; + // AVX_512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); + AVX_512 bias2_sqrt; bias2_sqrt.data = _mm512_set1_ps(bias_correction2); AVX_512 eps_4; @@ -233,20 +226,28 @@ void Adam_Optimizer::Step_4(float* _params, if (_weight_decay > 0) { AVX_512 weight_decay4; weight_decay4.data = _mm512_set1_ps(_weight_decay); - grad_4[0].data = 
_mm512_fmadd_ps(param_4[0].data, weight_decay4.data, grad_4[0].data); - grad_4[1].data = _mm512_fmadd_ps(param_4[1].data, weight_decay4.data, grad_4[1].data); - grad_4[2].data = _mm512_fmadd_ps(param_4[2].data, weight_decay4.data, grad_4[2].data); - grad_4[3].data = _mm512_fmadd_ps(param_4[3].data, weight_decay4.data, grad_4[3].data); + grad_4[0].data = + _mm512_fmadd_ps(param_4[0].data, weight_decay4.data, grad_4[0].data); + grad_4[1].data = + _mm512_fmadd_ps(param_4[1].data, weight_decay4.data, grad_4[1].data); + grad_4[2].data = + _mm512_fmadd_ps(param_4[2].data, weight_decay4.data, grad_4[2].data); + grad_4[3].data = + _mm512_fmadd_ps(param_4[3].data, weight_decay4.data, grad_4[3].data); } momntum_4[0].data = _mm512_mul_ps(momntum_4[0].data, betta1_4.data); - momntum_4[0].data = _mm512_fmadd_ps(grad_4[0].data, betta1_minus1_4.data, momntum_4[0].data); + momntum_4[0].data = + _mm512_fmadd_ps(grad_4[0].data, betta1_minus1_4.data, momntum_4[0].data); momntum_4[1].data = _mm512_mul_ps(momntum_4[1].data, betta1_4.data); - momntum_4[1].data = _mm512_fmadd_ps(grad_4[1].data, betta1_minus1_4.data, momntum_4[1].data); + momntum_4[1].data = + _mm512_fmadd_ps(grad_4[1].data, betta1_minus1_4.data, momntum_4[1].data); momntum_4[2].data = _mm512_mul_ps(momntum_4[2].data, betta1_4.data); - momntum_4[2].data = _mm512_fmadd_ps(grad_4[2].data, betta1_minus1_4.data, momntum_4[2].data); + momntum_4[2].data = + _mm512_fmadd_ps(grad_4[2].data, betta1_minus1_4.data, momntum_4[2].data); momntum_4[3].data = _mm512_mul_ps(momntum_4[3].data, betta1_4.data); - momntum_4[3].data = _mm512_fmadd_ps(grad_4[3].data, betta1_minus1_4.data, momntum_4[3].data); + momntum_4[3].data = + _mm512_fmadd_ps(grad_4[3].data, betta1_minus1_4.data, momntum_4[3].data); varianc_4[0].data = _mm512_mul_ps(varianc_4[0].data, betta2_4.data); varianc_4[1].data = _mm512_mul_ps(varianc_4[1].data, betta2_4.data); @@ -256,10 +257,14 @@ void Adam_Optimizer::Step_4(float* _params, grad_4[1].data = _mm512_mul_ps(grad_4[1].data, grad_4[1].data); grad_4[2].data = _mm512_mul_ps(grad_4[2].data, grad_4[2].data); grad_4[3].data = _mm512_mul_ps(grad_4[3].data, grad_4[3].data); - varianc_4[0].data = _mm512_fmadd_ps(grad_4[0].data, betta2_minus1_4.data, varianc_4[0].data); - varianc_4[1].data = _mm512_fmadd_ps(grad_4[1].data, betta2_minus1_4.data, varianc_4[1].data); - varianc_4[2].data = _mm512_fmadd_ps(grad_4[2].data, betta2_minus1_4.data, varianc_4[2].data); - varianc_4[3].data = _mm512_fmadd_ps(grad_4[3].data, betta2_minus1_4.data, varianc_4[3].data); + varianc_4[0].data = + _mm512_fmadd_ps(grad_4[0].data, betta2_minus1_4.data, varianc_4[0].data); + varianc_4[1].data = + _mm512_fmadd_ps(grad_4[1].data, betta2_minus1_4.data, varianc_4[1].data); + varianc_4[2].data = + _mm512_fmadd_ps(grad_4[2].data, betta2_minus1_4.data, varianc_4[2].data); + varianc_4[3].data = + _mm512_fmadd_ps(grad_4[3].data, betta2_minus1_4.data, varianc_4[3].data); grad_4[0].data = _mm512_sqrt_ps(varianc_4[0].data); grad_4[1].data = _mm512_sqrt_ps(varianc_4[1].data); @@ -280,23 +285,21 @@ void Adam_Optimizer::Step_4(float* _params, param_4[2].data = _mm512_fmadd_ps(grad_4[2].data, step_size_4.data, param_4[2].data); param_4[3].data = _mm512_fmadd_ps(grad_4[3].data, step_size_4.data, param_4[3].data); - if (dev_params) { - for(int u = 0;u < 4;u++) - { - for (size_t j = 0; j < SIMD_WIDTH; j += 4) { - _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2)] = (__half)param_4[u].data_f[(j << 2)]; - _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2) + 1] = 
(__half)param_4[u].data_f[(j << 2) + 1]; - _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2) + 2] = (__half)param_4[u].data_f[(j << 2) + 2]; - _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2) + 3] = (__half)param_4[u].data_f[(j << 2) + 3]; - } - } - } - _mm512_storeu_ps(_params + i, param_4[0].data); _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1].data); _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), param_4[2].data); _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3].data); + if (dev_params) { + _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t), param_4[0].data); + _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH, + param_4[1].data); + _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 1), + param_4[2].data); + _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 3, + param_4[3].data); + } + _mm512_storeu_ps(_exp_avg + i, momntum_4[0].data); _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1].data); _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2].data); @@ -308,24 +311,29 @@ void Adam_Optimizer::Step_4(float* _params, _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3].data); } - if (dev_params) {/* -#pragma omp parallel for - for (size_t j = 0; j < copy_size; j += 4) { - _doubled_buffer[_buf_index][j] = (__half)_params[t + j]; - _doubled_buffer[_buf_index][j + 1] = (__half)_params[t + j + 1]; - _doubled_buffer[_buf_index][j + 2] = (__half)_params[t + j + 2]; - _doubled_buffer[_buf_index][j + 3] = (__half)_params[t + j + 3]; - }*/ - - CUDA_CHECK(cudaMemcpyAsync(dev_params + t, - _doubled_buffer[_buf_index], - copy_size * sizeof(__half), - cudaMemcpyHostToDevice, - Context::Instance().GetCurrentStream())); + if (dev_params) { /* + #pragma omp parallel for + for (size_t j = 0; j < copy_size; j += 4) { + _doubled_buffer[_buf_index][j] = (__half)_params[t + j]; + _doubled_buffer[_buf_index][j + 1] = (__half)_params[t + j + 1]; + _doubled_buffer[_buf_index][j + 2] = (__half)_params[t + j + 2]; + _doubled_buffer[_buf_index][j + 3] = (__half)_params[t + j + 3]; + } + + CUDA_CHECK(cudaMemcpyAsync(dev_params + t, + _doubled_buffer[_buf_index], + copy_size * sizeof(__half), + cudaMemcpyHostToDevice, + Context::Instance().GetCurrentStream())); + */ + launch_param_update(_doubled_buffer[_buf_index], + dev_params + t, + copy_size, + Context::Instance().GetCurrentStream()); _buf_index = !_buf_index; } } - if(_param_size > rounded_size) + if (_param_size > rounded_size) Step((_params + rounded_size), (grads + rounded_size), (_exp_avg + rounded_size), @@ -340,7 +348,6 @@ int create_adam_optimizer(int optimizer_id, float betta2 = 0.999, float eps = 1e-8, float weight_decay = 0) - { auto opt = std::make_shared(alpha, betta1, betta2, eps, weight_decay); @@ -375,8 +382,8 @@ void Adam_Optimizer::Step_8(float* _params, float bias_correction1 = 1 - _betta1_t; float bias_correction2 = 1 / sqrt(1 - _betta2_t); - //AVX_512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); - AVX_512 bias2_sqrt ; + // AVX_512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); + AVX_512 bias2_sqrt; bias2_sqrt.data = _mm512_set1_ps(bias_correction2); AVX_512 eps_4; @@ -437,32 +444,48 @@ void Adam_Optimizer::Step_8(float* _params, if (_weight_decay > 0) { AVX_512 weight_decay4; weight_decay4.data = _mm512_set1_ps(_weight_decay); - grad_4[0].data = _mm512_fmadd_ps(param_4[0].data, weight_decay4.data, grad_4[0].data); - grad_4[1].data = _mm512_fmadd_ps(param_4[1].data, weight_decay4.data, 
grad_4[1].data); - grad_4[2].data = _mm512_fmadd_ps(param_4[2].data, weight_decay4.data, grad_4[2].data); - grad_4[3].data = _mm512_fmadd_ps(param_4[3].data, weight_decay4.data, grad_4[3].data); - grad_4[4].data = _mm512_fmadd_ps(param_4[4].data, weight_decay4.data, grad_4[4].data); - grad_4[5].data = _mm512_fmadd_ps(param_4[5].data, weight_decay4.data, grad_4[5].data); - grad_4[6].data = _mm512_fmadd_ps(param_4[6].data, weight_decay4.data, grad_4[6].data); - grad_4[7].data = _mm512_fmadd_ps(param_4[7].data, weight_decay4.data, grad_4[7].data); + grad_4[0].data = + _mm512_fmadd_ps(param_4[0].data, weight_decay4.data, grad_4[0].data); + grad_4[1].data = + _mm512_fmadd_ps(param_4[1].data, weight_decay4.data, grad_4[1].data); + grad_4[2].data = + _mm512_fmadd_ps(param_4[2].data, weight_decay4.data, grad_4[2].data); + grad_4[3].data = + _mm512_fmadd_ps(param_4[3].data, weight_decay4.data, grad_4[3].data); + grad_4[4].data = + _mm512_fmadd_ps(param_4[4].data, weight_decay4.data, grad_4[4].data); + grad_4[5].data = + _mm512_fmadd_ps(param_4[5].data, weight_decay4.data, grad_4[5].data); + grad_4[6].data = + _mm512_fmadd_ps(param_4[6].data, weight_decay4.data, grad_4[6].data); + grad_4[7].data = + _mm512_fmadd_ps(param_4[7].data, weight_decay4.data, grad_4[7].data); } momntum_4[0].data = _mm512_mul_ps(momntum_4[0].data, betta1_4.data); - momntum_4[0].data = _mm512_fmadd_ps(grad_4[0].data, betta1_minus1_4.data, momntum_4[0].data); + momntum_4[0].data = + _mm512_fmadd_ps(grad_4[0].data, betta1_minus1_4.data, momntum_4[0].data); momntum_4[1].data = _mm512_mul_ps(momntum_4[1].data, betta1_4.data); - momntum_4[1].data = _mm512_fmadd_ps(grad_4[1].data, betta1_minus1_4.data, momntum_4[1].data); + momntum_4[1].data = + _mm512_fmadd_ps(grad_4[1].data, betta1_minus1_4.data, momntum_4[1].data); momntum_4[2].data = _mm512_mul_ps(momntum_4[2].data, betta1_4.data); - momntum_4[2].data = _mm512_fmadd_ps(grad_4[2].data, betta1_minus1_4.data, momntum_4[2].data); + momntum_4[2].data = + _mm512_fmadd_ps(grad_4[2].data, betta1_minus1_4.data, momntum_4[2].data); momntum_4[3].data = _mm512_mul_ps(momntum_4[3].data, betta1_4.data); - momntum_4[3].data = _mm512_fmadd_ps(grad_4[3].data, betta1_minus1_4.data, momntum_4[3].data); + momntum_4[3].data = + _mm512_fmadd_ps(grad_4[3].data, betta1_minus1_4.data, momntum_4[3].data); momntum_4[4].data = _mm512_mul_ps(momntum_4[4].data, betta1_4.data); - momntum_4[4].data = _mm512_fmadd_ps(grad_4[4].data, betta1_minus1_4.data, momntum_4[4].data); + momntum_4[4].data = + _mm512_fmadd_ps(grad_4[4].data, betta1_minus1_4.data, momntum_4[4].data); momntum_4[5].data = _mm512_mul_ps(momntum_4[5].data, betta1_4.data); - momntum_4[5].data = _mm512_fmadd_ps(grad_4[5].data, betta1_minus1_4.data, momntum_4[5].data); + momntum_4[5].data = + _mm512_fmadd_ps(grad_4[5].data, betta1_minus1_4.data, momntum_4[5].data); momntum_4[6].data = _mm512_mul_ps(momntum_4[6].data, betta1_4.data); - momntum_4[6].data = _mm512_fmadd_ps(grad_4[6].data, betta1_minus1_4.data, momntum_4[6].data); + momntum_4[6].data = + _mm512_fmadd_ps(grad_4[6].data, betta1_minus1_4.data, momntum_4[6].data); momntum_4[7].data = _mm512_mul_ps(momntum_4[7].data, betta1_4.data); - momntum_4[7].data = _mm512_fmadd_ps(grad_4[7].data, betta1_minus1_4.data, momntum_4[7].data); + momntum_4[7].data = + _mm512_fmadd_ps(grad_4[7].data, betta1_minus1_4.data, momntum_4[7].data); varianc_4[0].data = _mm512_mul_ps(varianc_4[0].data, betta2_4.data); varianc_4[1].data = _mm512_mul_ps(varianc_4[1].data, betta2_4.data); @@ -480,14 +503,22 @@ void 
Adam_Optimizer::Step_8(float* _params, grad_4[5].data = _mm512_mul_ps(grad_4[5].data, grad_4[5].data); grad_4[6].data = _mm512_mul_ps(grad_4[6].data, grad_4[6].data); grad_4[7].data = _mm512_mul_ps(grad_4[7].data, grad_4[7].data); - varianc_4[0].data = _mm512_fmadd_ps(grad_4[0].data, betta2_minus1_4.data, varianc_4[0].data); - varianc_4[1].data = _mm512_fmadd_ps(grad_4[1].data, betta2_minus1_4.data, varianc_4[1].data); - varianc_4[2].data = _mm512_fmadd_ps(grad_4[2].data, betta2_minus1_4.data, varianc_4[2].data); - varianc_4[3].data = _mm512_fmadd_ps(grad_4[3].data, betta2_minus1_4.data, varianc_4[3].data); - varianc_4[4].data = _mm512_fmadd_ps(grad_4[4].data, betta2_minus1_4.data, varianc_4[4].data); - varianc_4[5].data = _mm512_fmadd_ps(grad_4[5].data, betta2_minus1_4.data, varianc_4[5].data); - varianc_4[6].data = _mm512_fmadd_ps(grad_4[6].data, betta2_minus1_4.data, varianc_4[6].data); - varianc_4[7].data = _mm512_fmadd_ps(grad_4[7].data, betta2_minus1_4.data, varianc_4[7].data); + varianc_4[0].data = + _mm512_fmadd_ps(grad_4[0].data, betta2_minus1_4.data, varianc_4[0].data); + varianc_4[1].data = + _mm512_fmadd_ps(grad_4[1].data, betta2_minus1_4.data, varianc_4[1].data); + varianc_4[2].data = + _mm512_fmadd_ps(grad_4[2].data, betta2_minus1_4.data, varianc_4[2].data); + varianc_4[3].data = + _mm512_fmadd_ps(grad_4[3].data, betta2_minus1_4.data, varianc_4[3].data); + varianc_4[4].data = + _mm512_fmadd_ps(grad_4[4].data, betta2_minus1_4.data, varianc_4[4].data); + varianc_4[5].data = + _mm512_fmadd_ps(grad_4[5].data, betta2_minus1_4.data, varianc_4[5].data); + varianc_4[6].data = + _mm512_fmadd_ps(grad_4[6].data, betta2_minus1_4.data, varianc_4[6].data); + varianc_4[7].data = + _mm512_fmadd_ps(grad_4[7].data, betta2_minus1_4.data, varianc_4[7].data); grad_4[0].data = _mm512_sqrt_ps(varianc_4[0].data); grad_4[1].data = _mm512_sqrt_ps(varianc_4[1].data); @@ -533,24 +564,22 @@ void Adam_Optimizer::Step_8(float* _params, _mm512_storeu_ps(_params + i + SIMD_WIDTH * 6, param_4[6].data); _mm512_storeu_ps(_params + i + SIMD_WIDTH * 7, param_4[7].data); - //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t), param_4[0]); - //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH, param_4[1]); - //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 1), param_4[2]); - //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 3, param_4[3]); - //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 2), param_4[4]); - //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 5, param_4[5]); - //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 6, param_4[6]); - //_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 7, param_4[7]); if (dev_params) { - for(int u = 0;u < 8;u++) - { - for (size_t j = 0; j < SIMD_WIDTH; j += 4) { - _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2)] = (__half)param_4[u].data_f[(j << 2)]; - _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2) + 1] = (__half)param_4[u].data_f[(j << 2) + 1]; - _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2) + 2] = (__half)param_4[u].data_f[(j << 2) + 2]; - _doubled_buffer[_buf_index][(i - t) + (u << 4) + (j << 2) + 3] = (__half)param_4[u].data_f[(j << 2) + 3]; - } - } + _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t), param_4[0].data); + _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH, + param_4[1].data); + _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH 
<< 1), + param_4[2].data); + _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 3, + param_4[3].data); + _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 2), + param_4[4].data); + _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 5, + param_4[5].data); + _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 6, + param_4[6].data); + _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 7, + param_4[7].data); } _mm512_storeu_ps(_exp_avg + i, momntum_4[0].data); @@ -572,26 +601,20 @@ void Adam_Optimizer::Step_8(float* _params, _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7, varianc_4[7].data); } if (dev_params) { - /*launch_param_update(_doubled_buffer[_buf_index], + launch_param_update(_doubled_buffer[_buf_index], dev_params + t, copy_size, Context::Instance().GetCurrentStream()); - _buf_index = !_buf_index;*/ - CUDA_CHECK(cudaMemcpyAsync(dev_params + t, - _doubled_buffer[_buf_index], - copy_size * sizeof(__half), - cudaMemcpyHostToDevice, - Context::Instance().GetCurrentStream())); _buf_index = !_buf_index; } } - if(_param_size > rounded_size) - Step_4((_params + rounded_size), - (grads + rounded_size), - (_exp_avg + rounded_size), - (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? (dev_params + rounded_size) : dev_params)); + if (_param_size > rounded_size) + Step_4((_params + rounded_size), + (grads + rounded_size), + (_exp_avg + rounded_size), + (_exp_avg_sq + rounded_size), + (_param_size - rounded_size), + (dev_params != nullptr ? (dev_params + rounded_size) : dev_params)); } int ds_adam_step(int optimizer_id, diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h old mode 100755 new mode 100644 index f56eb1501aa4..b373162612f2 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -2,12 +2,12 @@ #include #include +#include #include #include "context.h" #include "cublas_v2.h" #include "cuda.h" #include "curand.h" -#include #define CUDA_CHECK(callstr) \ { \ @@ -38,8 +38,8 @@ class Adam_Optimizer { _betta2_t(1.0), _buf_index(false) { - cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(__half)); - cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(__half)); + cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); + cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); } ~Adam_Optimizer() { @@ -66,10 +66,9 @@ class Adam_Optimizer { __half* dev_params = nullptr); private: - - union AVX_512{ + union AVX_512 { __m512 data; - float data_f[16]; + // float data_f[16]; }; float _alpha; @@ -81,6 +80,6 @@ class Adam_Optimizer { float _betta1_t; float _betta2_t; - __half* _doubled_buffer[2]; + float* _doubled_buffer[2]; bool _buf_index; }; From 485a365ffecbd5ce74264957833a562788348e25 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 8 Sep 2020 12:03:21 -0700 Subject: [PATCH 39/44] ZeRO-Offload passing model tests (#374) * Add ZeRO-Offload model tests Restrict optimizer update+copy to DeepSpeedCPUAdam * Format fixes * Increate bucket size scaler --- deepspeed/runtime/zero/stage2.py | 32 +++++--- tests/model/Megatron_GPT2/run_func_test.py | 87 +++++++++++++--------- tests/model/run_sanity_check.py | 2 +- 3 files changed, 74 insertions(+), 47 deletions(-) diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index d39c17f37161..8ce325e6b1b8 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -15,6 +15,7 @@ from 
deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS +from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.utils import logger #Toggle this to true to enable correctness test @@ -153,10 +154,13 @@ def __init__(self, self.reduce_scatter = reduce_scatter - self.overlap_comm = overlap_comm or cpu_offload + self.overlap_comm = overlap_comm self.cpu_offload = cpu_offload + self.deepspeed_adam_offload = (cpu_offload + and type(init_optimizer) == DeepSpeedCPUAdam) + self.device = torch.cuda.current_device() if not self.cpu_offload else 'cpu' self.dp_process_group = dp_process_group @@ -1358,12 +1362,15 @@ def step(self, closure=None): "reducing to {}".format(dist.get_rank(), prev_scale, self.loss_scale)) + timers('optimizer_gradients').start() + timers('optimizer_gradients').stop() timers('optimizer_step').start() timers('optimizer_step').stop() timers('optimizer_allgather').start() timers('optimizer_allgather').stop() return + timers('optimizer_gradients').start() norm_groups = [] single_partition_grad_groups = [] skip = False @@ -1405,24 +1412,27 @@ def step(self, closure=None): single_partition_grad_groups.append(single_grad_partition) self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) + timers('optimizer_gradients').stop() + #torch.set_num_threads(12) timers('optimizer_step').start() - if self.cpu_offload: - # self.optimizer.step(fp16_param_groups=self.parallel_partitioned_fp16_groups) - self.optimizer.step() - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - fp16_partitions[partition_id].data.copy_(fp32_partition.data) + if self.deepspeed_adam_offload: + self.optimizer.step(fp16_param_groups=self.parallel_partitioned_fp16_groups) + #self.optimizer.step() + #for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): + # fp16_partitions[partition_id].data.copy_(fp32_partition.data) else: self.optimizer.step() + #get rid of the fp32 gradients. 
Not needed anymore - for group in self.single_partition_of_fp32_groups: - group.grad = None + if not self.cpu_offload: + for group in self.single_partition_of_fp32_groups: + group.grad = None for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): fp16_partitions[partition_id].data.copy_(fp32_partition.data) timers('optimizer_step').stop() - timers.log(names=['optimizer_step']) if self.cpu_offload: self.reset_cpu_buffers() @@ -1469,6 +1479,10 @@ def step(self, closure=None): for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data + timers.log( + names=['optimizer_gradients', + 'optimizer_step', + 'optimizer_allgather']) see_memory_usage('After zero_optimizer step') return diff --git a/tests/model/Megatron_GPT2/run_func_test.py b/tests/model/Megatron_GPT2/run_func_test.py index 552314a0d037..f8ab5bcb3333 100755 --- a/tests/model/Megatron_GPT2/run_func_test.py +++ b/tests/model/Megatron_GPT2/run_func_test.py @@ -216,11 +216,12 @@ def test_mp4_gpu4_node1_zero2(self): basic_run_config = test_config succ = self.run_test(basic_run_config, 0.01) self.assertTrue(succ) + partition_activation_config = test_config succ = self.run_partition_activations_test(partition_activation_config, 0.01) self.assertTrue(succ) - def test_mp1_gpu1_node1_zero2_offload(self): + def test_mp1_gpu1_node1_zero2_ds_offload(self): test_config = { "mp": 1, "gpus": 1, @@ -235,10 +236,10 @@ def test_mp1_gpu1_node1_zero2_offload(self): "json": "ds_config_func_bs4_zero2_offload.json", "cpu_optimizer": True, } - succ = self.run_test(test_config, 0.01) + succ = self.run_test(test_config, 0.02) self.assertTrue(succ) - def test_mp1_gpu2_node1_zero2_offload(self): + def test_mp1_gpu2_node1_zero2_ds_offload(self): test_config = { "mp": 1, "gpus": 2, @@ -253,7 +254,7 @@ def test_mp1_gpu2_node1_zero2_offload(self): "json": "ds_config_func_bs8_zero2_offload.json", "cpu_optimizer": True, } - succ = self.run_test(test_config, 0.01) + succ = self.run_test(test_config, 0.02) self.assertTrue(succ) def test_mp2_gpu4_node1_zero2_gas(self): @@ -278,7 +279,7 @@ def test_mp2_gpu4_node1_zero2_gas(self): succ = self.run_partition_activations_test(test_config, 0.01) self.assertTrue(succ) - def test_mp2_gpu4_node1_zero2_offload(self): + def test_mp2_gpu4_node1_zero2_ds_offload(self): test_config = { "mp": 2, "gpus": 4, @@ -295,14 +296,14 @@ def test_mp2_gpu4_node1_zero2_offload(self): } basic_run_config = test_config - succ = self.run_test(basic_run_config, 0.01) + succ = self.run_test(basic_run_config, 0.02) self.assertTrue(succ) partition_activation_config = test_config - succ = self.run_partition_activations_test(partition_activation_config, 0.01) + succ = self.run_partition_activations_test(partition_activation_config, 0.02) self.assertTrue(succ) - def test_mp4_gpu4_node1_zero2_offload(self): + def test_mp4_gpu4_node1_zero2_ds_offload(self): test_config = { "mp": 4, "gpus": 4, @@ -319,14 +320,14 @@ def test_mp4_gpu4_node1_zero2_offload(self): } basic_run_config = test_config - succ = self.run_test(basic_run_config, 0.01) + succ = self.run_test(basic_run_config, 0.02) self.assertTrue(succ) partition_activation_config = test_config - succ = self.run_partition_activations_test(partition_activation_config, 0.01) + succ = self.run_partition_activations_test(partition_activation_config, 0.02) self.assertTrue(succ) - def test_mp1_gpu1_node1_zero2_cpu_optimizer(self): + def test_mp1_gpu1_node1_zero2_torch_offload(self): test_config = { "mp": 1, "gpus": 1, @@ -338,14 +339,15 @@ 
def test_mp1_gpu1_node1_zero2_cpu_optimizer(self): "seq_length": SEQ_LEN, "heads": ATTN_HEADS, "deepspeed": False, - "json": "ds_config_func_bs4_zero2.json", + "json": "ds_config_func_bs4_zero2_offload.json", "cpu_optimizer": True, + "test_torch_offload": True, } succ = self.run_test(test_config, 0.01) self.assertTrue(succ) - def test_mp1_gpu2_node1_zero2_cpu_optimizer(self): + def test_mp1_gpu2_node1_zero2_torch_offload(self): test_config = { "mp": 1, "gpus": 2, @@ -357,14 +359,15 @@ def test_mp1_gpu2_node1_zero2_cpu_optimizer(self): "seq_length": SEQ_LEN, "heads": ATTN_HEADS, "deepspeed": False, - "json": "ds_config_func_bs8_zero2.json", + "json": "ds_config_func_bs8_zero2_offload.json", "cpu_optimizer": True, + "test_torch_offload": True, } succ = self.run_test(test_config, 0.01) self.assertTrue(succ) - def test_mp2_gpu4_node1_zero2_cpu_optimizer(self): + def test_mp2_gpu4_node1_zero2_torch_offload(self): test_config = { "mp": 2, "gpus": 4, @@ -376,8 +379,9 @@ def test_mp2_gpu4_node1_zero2_cpu_optimizer(self): "seq_length": SEQ_LEN, "heads": ATTN_HEADS, "deepspeed": False, - "json": "ds_config_func_bs8_zero2.json", + "json": "ds_config_func_bs8_zero2_offload.json", "cpu_optimizer": True, + "test_torch_offload": True, } basic_run_config = test_config @@ -388,7 +392,7 @@ def test_mp2_gpu4_node1_zero2_cpu_optimizer(self): succ = self.run_partition_activations_test(partition_activation_config, 0.01) self.assertTrue(succ) - def test_mp4_gpu4_node1_zero2_cpu_optimizer(self): + def test_mp4_gpu4_node1_zero2_torch_offload(self): test_config = { "mp": 4, "gpus": 4, @@ -400,8 +404,9 @@ def test_mp4_gpu4_node1_zero2_cpu_optimizer(self): "seq_length": SEQ_LEN, "heads": ATTN_HEADS, "deepspeed": False, - "json": "ds_config_func_bs8_zero2.json", + "json": "ds_config_func_bs8_zero2_offload.json", "cpu_optimizer": True, + "test_torch_offload": True, } basic_run_config = test_config @@ -439,11 +444,7 @@ def run_partition_activations_test(self, test_config, r_tol): deepspeed_config = test_config["json"] baseline_deepspeed_config = False - - if "cpu_optimizer" in test_config and test_config["cpu_optimizer"]: - cpu_optimizer_flag = "--cpu-optimizer" - else: - cpu_optimizer_flag = "" + cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, True) # baseline run... # turnoff deepspeed if baseline deepspeed config @@ -469,6 +470,7 @@ def run_partition_activations_test(self, test_config, r_tol): # DeepSpeed run... test_config["deepspeed"] = True + cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, False) test_config[ "other_args"] = f"\"--deepspeed-activation-checkpointing {cpu_optimizer_flag}\"" test_config["json"] = deepspeed_config @@ -488,11 +490,7 @@ def run_test(self, test_config, r_tol): deepspeed_config = test_config["json"] baseline_deepspeed_config = False - - if "cpu_optimizer" in test_config and test_config["cpu_optimizer"]: - cpu_optimizer_flag = "--cpu-optimizer" - else: - cpu_optimizer_flag = "" + cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, True) # baseline run... # turn off deepspeed if a baseline deepspeed config @@ -520,6 +518,7 @@ def run_test(self, test_config, r_tol): # DeepSpeed run... 
test_config["deepspeed"] = True + cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, False) test_config["other_args"] = f"\"{cpu_optimizer_flag}\"" print("{0}: DeepSpeed run.".format(self.id())) @@ -551,6 +550,20 @@ def check_parity(self, base_file, test_file, r_tol): return True + def gen_cpu_optimizer_flag(self, test_config, is_baseline): + if 'cpu_optimizer' in test_config and test_config['cpu_optimizer']: + cpu_optimizer_flag = "--cpu-optimizer" + if is_baseline: + cpu_optimizer_flag += " --cpu_torch_adam" + return cpu_optimizer_flag + if 'test_torch_offload' in test_config and test_config['test_torch_offload']: + cpu_optimizer_flag += " --cpu_torch_adam" + return cpu_optimizer_flag + else: + cpu_optimizer_flag = "" + + return cpu_optimizer_flag + def suite(): suite = unittest.TestSuite() @@ -558,18 +571,18 @@ def suite(): suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_fp16')) # Baseline = Megatron + Torch.Optim.Adam - # Test = Megatron + Torch.Optim.Adam + ZeRO-Stage-2 - suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2_cpu_optimizer')) - suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2_cpu_optimizer')) - suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_cpu_optimizer')) - suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2_cpu_optimizer')) + # Test = Megatron + Torch.Optim.Adam + ZeRO-Offload + suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2_torch_offload')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2_torch_offload')) + suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_torch_offload')) + suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2_torch_offload')) # Baseline = Megatron + Torch.Optim.Adam # Test = Megatron + DeepSpeedAdam + ZeRO-Offload - suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2_offload')) - suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2_offload')) - suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_offload')) - suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2_offload')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2_ds_offload')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2_ds_offload')) + suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_ds_offload')) + suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2_ds_offload')) suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero1')) suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero1')) diff --git a/tests/model/run_sanity_check.py b/tests/model/run_sanity_check.py index 21c3bc5d9d43..b7d12ba18f07 100755 --- a/tests/model/run_sanity_check.py +++ b/tests/model/run_sanity_check.py @@ -30,7 +30,7 @@ def pytest_hack(runner_result): def test_megatron(): - runner = unittest.TextTestRunner(failfast=False) + runner = unittest.TextTestRunner(failfast=True) pytest_hack(runner.run(Megatron_GPT2.suite())) From 5b214a12513ffe8a61dab1809608a74964a6afd0 Mon Sep 17 00:00:00 2001 From: RezaYazdaniAminabadi <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Tue, 8 Sep 2020 16:20:48 -0700 Subject: [PATCH 40/44] fix cpu adam compilation for AVX2 (#378) * fixing the compilation error for AVX2 architecture * running precommit * adding cpufeature to requirements * Update install.sh * Update install.sh * include cpu-adam in the features * update features * update features Co-authored-by: Jeff Rasley --- csrc/adam/cpu_adam.cpp | 786 ++++++++++++++++------------------ csrc/includes/cpu_adam.h | 39 +- docs/_pages/features.md | 6 + docs/index.md | 1 + install.sh | 16 +- 
requirements/requirements.txt | 1 + setup.py | 13 +- 7 files changed, 431 insertions(+), 431 deletions(-) diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp index 2baab6177c2e..380bc4ea0ab0 100644 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -25,35 +25,41 @@ void Adam_Optimizer::Step(float* _params, size_t _param_size, __half* dev_params) { - _betta1_t *= _betta1; - _betta2_t *= _betta2; - - AVX_512 betta1_4; - betta1_4.data = _mm512_set1_ps(_betta1); - AVX_512 betta2_4; - betta2_4.data = _mm512_set1_ps(_betta2); - float betta1_minus1 = 1 - _betta1; float betta2_minus1 = 1 - _betta2; - AVX_512 betta1_minus1_4; - betta1_minus1_4.data = _mm512_set1_ps(betta1_minus1); - AVX_512 betta2_minus1_4; - betta2_minus1_4.data = _mm512_set1_ps(betta2_minus1); float bias_correction1 = 1 - _betta1_t; float bias_correction2 = 1 / sqrt(1 - _betta2_t); - // AVX_512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); - AVX_512 bias2_sqrt; - bias2_sqrt.data = _mm512_set1_ps(bias_correction2); - - AVX_512 eps_4; - eps_4.data = _mm512_set1_ps(_eps); float step_size = -1 * _alpha / bias_correction1; - AVX_512 step_size_4; - step_size_4.data = _mm512_set1_ps(step_size); - size_t rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH); + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) + + AVX_Data betta1_4; + betta1_4.data = SIMD_SET(_betta1); + AVX_Data betta2_4; + betta2_4.data = SIMD_SET(_betta2); + + AVX_Data betta1_minus1_4; + betta1_minus1_4.data = SIMD_SET(betta1_minus1); + AVX_Data betta2_minus1_4; + betta2_minus1_4.data = SIMD_SET(betta2_minus1); + + AVX_Data bias2_sqrt; + bias2_sqrt.data = SIMD_SET(bias_correction2); + + AVX_Data eps_4; + eps_4.data = SIMD_SET(_eps); + + AVX_Data step_size_4; + step_size_4.data = SIMD_SET(step_size); + + AVX_Data weight_decay4; + if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay); + + rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH); for (size_t t = 0; t < rounded_size; t += TILE) { size_t copy_size = TILE; @@ -61,57 +67,41 @@ void Adam_Optimizer::Step(float* _params, size_t offset = copy_size + t; #pragma omp parallel for for (size_t i = t; i < offset; i += SIMD_WIDTH) { - AVX_512 grad_4; - grad_4.data = _mm512_loadu_ps(grads + i); + AVX_Data grad_4; + grad_4.data = SIMD_LOAD(grads + i); - AVX_512 momntum_4; - momntum_4.data = _mm512_loadu_ps(_exp_avg + i); - AVX_512 varianc_4; - varianc_4.data = _mm512_loadu_ps(_exp_avg_sq + i); + AVX_Data momentum_4; + momentum_4.data = SIMD_LOAD(_exp_avg + i); + AVX_Data variance_4; + variance_4.data = SIMD_LOAD(_exp_avg_sq + i); - AVX_512 param_4; - param_4.data = _mm512_loadu_ps(_params + i); + AVX_Data param_4; + param_4.data = SIMD_LOAD(_params + i); - if (_weight_decay > 0) { - AVX_512 weight_decay4; - weight_decay4.data = _mm512_set1_ps(_weight_decay); - grad_4.data = _mm512_fmadd_ps(param_4.data, weight_decay4.data, grad_4.data); - } + if (_weight_decay > 0) + grad_4.data = SIMD_FMA(param_4.data, weight_decay4.data, grad_4.data); - momntum_4.data = _mm512_mul_ps(momntum_4.data, betta1_4.data); - momntum_4.data = _mm512_fmadd_ps(grad_4.data, betta1_minus1_4.data, momntum_4.data); + momentum_4.data = SIMD_MUL(momentum_4.data, betta1_4.data); + momentum_4.data = SIMD_FMA(grad_4.data, betta1_minus1_4.data, momentum_4.data); - varianc_4.data = _mm512_mul_ps(varianc_4.data, betta2_4.data); - grad_4.data = _mm512_mul_ps(grad_4.data, grad_4.data); - varianc_4.data = _mm512_fmadd_ps(grad_4.data, betta2_minus1_4.data, varianc_4.data); + variance_4.data = 
SIMD_MUL(variance_4.data, betta2_4.data); + grad_4.data = SIMD_MUL(grad_4.data, grad_4.data); + variance_4.data = SIMD_FMA(grad_4.data, betta2_minus1_4.data, variance_4.data); - grad_4.data = _mm512_sqrt_ps(varianc_4.data); - grad_4.data = _mm512_fmadd_ps(grad_4.data, bias2_sqrt.data, eps_4.data); - grad_4.data = _mm512_div_ps(momntum_4.data, grad_4.data); + grad_4.data = SIMD_SQRT(variance_4.data); + grad_4.data = SIMD_FMA(grad_4.data, bias2_sqrt.data, eps_4.data); + grad_4.data = SIMD_DIV(momentum_4.data, grad_4.data); - param_4.data = _mm512_fmadd_ps(grad_4.data, step_size_4.data, param_4.data); + param_4.data = SIMD_FMA(grad_4.data, step_size_4.data, param_4.data); - _mm512_storeu_ps(_params + i, param_4.data); + SIMD_STORE(_params + i, param_4.data); - if (dev_params) _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t), param_4.data); + if (dev_params) SIMD_STORE(_doubled_buffer[_buf_index] + (i - t), param_4.data); - _mm512_storeu_ps(_exp_avg + i, momntum_4.data); - _mm512_storeu_ps(_exp_avg_sq + i, varianc_4.data); + SIMD_STORE(_exp_avg + i, momentum_4.data); + SIMD_STORE(_exp_avg_sq + i, variance_4.data); } - if (dev_params) { /* - #pragma omp parallel for - for (size_t j = 0; j < copy_size; j += 4) { - _doubled_buffer[_buf_index][j] = (__half)_params[t + j]; - _doubled_buffer[_buf_index][j + 1] = (__half)_params[t + j + 1]; - _doubled_buffer[_buf_index][j + 2] = (__half)_params[t + j + 2]; - _doubled_buffer[_buf_index][j + 3] = (__half)_params[t + j + 3]; - } - - CUDA_CHECK(cudaMemcpyAsync(dev_params + t, - _doubled_buffer[_buf_index], - copy_size * sizeof(__half), - cudaMemcpyHostToDevice, - Context::Instance().GetCurrentStream()));*/ + if (dev_params) { launch_param_update(_doubled_buffer[_buf_index], dev_params + t, copy_size, @@ -120,32 +110,34 @@ void Adam_Optimizer::Step(float* _params, } } +#endif + if (_param_size > rounded_size) { #pragma omp parallel for for (size_t k = rounded_size; k < _param_size; k++) { float grad = grads[k]; float param = _params[k]; - float momntum = _exp_avg[k]; - float varianc = _exp_avg_sq[k]; - if (_weight_decay > 0) { grad = param * _weight_decay + grad; } + float momentum = _exp_avg[k]; + float variance = _exp_avg_sq[k]; + if (_weight_decay > 0) grad = param * _weight_decay + grad; - momntum *= momntum * _betta1; - momntum = grad * betta1_minus1 + momntum; + momentum *= momentum * _betta1; + momentum = grad * betta1_minus1 + momentum; - varianc = varianc * _betta2; + variance = variance * _betta2; grad = grad * grad; - varianc = grad * betta2_minus1 + varianc; + variance = grad * betta2_minus1 + variance; - grad = sqrt(varianc); + grad = sqrt(variance); grad = grad * bias_correction2 + _eps; - grad = momntum / grad; + grad = momentum / grad; param = grad * step_size + param; if (dev_params) _doubled_buffer[_buf_index][k - rounded_size] = (__half)param; _params[k] = param; - _exp_avg[k] = momntum; - _exp_avg_sq[k] = varianc; + _exp_avg[k] = momentum; + _exp_avg_sq[k] = variance; } if (dev_params) { launch_param_update(_doubled_buffer[_buf_index], @@ -163,35 +155,36 @@ void Adam_Optimizer::Step_4(float* _params, size_t _param_size, __half* dev_params) { - _betta1_t *= _betta1; - _betta2_t *= _betta2; + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) - AVX_512 betta1_4; - betta1_4.data = _mm512_set1_ps(_betta1); - AVX_512 betta2_4; - betta2_4.data = _mm512_set1_ps(_betta2); + AVX_Data betta1_4; + betta1_4.data = SIMD_SET(_betta1); + AVX_Data betta2_4; + betta2_4.data = SIMD_SET(_betta2); float betta1_minus1 = 1 - 
_betta1; float betta2_minus1 = 1 - _betta2; - AVX_512 betta1_minus1_4; - betta1_minus1_4.data = _mm512_set1_ps(betta1_minus1); - AVX_512 betta2_minus1_4; - betta2_minus1_4.data = _mm512_set1_ps(betta2_minus1); + AVX_Data betta1_minus1_4; + betta1_minus1_4.data = SIMD_SET(betta1_minus1); + AVX_Data betta2_minus1_4; + betta2_minus1_4.data = SIMD_SET(betta2_minus1); float bias_correction1 = 1 - _betta1_t; float bias_correction2 = 1 / sqrt(1 - _betta2_t); - // AVX_512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); - AVX_512 bias2_sqrt; - bias2_sqrt.data = _mm512_set1_ps(bias_correction2); + // AVX_Data bias_correction1_4 = SIMD_SET(bias_correction1); + AVX_Data bias2_sqrt; + bias2_sqrt.data = SIMD_SET(bias_correction2); - AVX_512 eps_4; - eps_4.data = _mm512_set1_ps(_eps); + AVX_Data eps_4; + eps_4.data = SIMD_SET(_eps); float step_size = -1 * _alpha / bias_correction1; - AVX_512 step_size_4; - step_size_4.data = _mm512_set1_ps(step_size); + AVX_Data step_size_4; + step_size_4.data = SIMD_SET(step_size); - size_t rounded_size = ROUND_DOWN(_param_size, (SIMD_WIDTH << 2)); + rounded_size = ROUND_DOWN(_param_size, (SIMD_WIDTH << 2)); for (size_t t = 0; t < rounded_size; t += TILE) { size_t copy_size = TILE; @@ -199,133 +192,105 @@ void Adam_Optimizer::Step_4(float* _params, size_t offset = copy_size + t; #pragma omp parallel for for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) { - AVX_512 grad_4[4]; - grad_4[0].data = _mm512_loadu_ps(grads + i); - grad_4[1].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH); - grad_4[2].data = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); - grad_4[3].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); - - AVX_512 momntum_4[4]; - momntum_4[0].data = _mm512_loadu_ps(_exp_avg + i); - momntum_4[1].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); - momntum_4[2].data = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); - momntum_4[3].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); - - AVX_512 varianc_4[4]; - varianc_4[0].data = _mm512_loadu_ps(_exp_avg_sq + i); - varianc_4[1].data = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); - varianc_4[2].data = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); - varianc_4[3].data = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); - - AVX_512 param_4[4]; - param_4[0].data = _mm512_loadu_ps(_params + i); - param_4[1].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH); - param_4[2].data = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); - param_4[3].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); + AVX_Data grad_4[4]; + grad_4[0].data = SIMD_LOAD(grads + i); + grad_4[1].data = SIMD_LOAD(grads + i + SIMD_WIDTH); + grad_4[2].data = SIMD_LOAD(grads + i + (SIMD_WIDTH << 1)); + grad_4[3].data = SIMD_LOAD(grads + i + SIMD_WIDTH * 3); + + AVX_Data momentum_4[4]; + momentum_4[0].data = SIMD_LOAD(_exp_avg + i); + momentum_4[1].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH); + momentum_4[2].data = SIMD_LOAD(_exp_avg + i + (SIMD_WIDTH << 1)); + momentum_4[3].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH * 3); + + AVX_Data variance_4[4]; + variance_4[0].data = SIMD_LOAD(_exp_avg_sq + i); + variance_4[1].data = SIMD_LOAD(_exp_avg_sq + i + SIMD_WIDTH); + variance_4[2].data = SIMD_LOAD(_exp_avg_sq + i + (SIMD_WIDTH << 1)); + variance_4[3].data = SIMD_LOAD(_exp_avg_sq + i + SIMD_WIDTH * 3); + + AVX_Data param_4[4]; + param_4[0].data = SIMD_LOAD(_params + i); + param_4[1].data = SIMD_LOAD(_params + i + SIMD_WIDTH); + param_4[2].data = SIMD_LOAD(_params + i + (SIMD_WIDTH << 1)); + param_4[3].data = SIMD_LOAD(_params + i + 
SIMD_WIDTH * 3); if (_weight_decay > 0) { - AVX_512 weight_decay4; - weight_decay4.data = _mm512_set1_ps(_weight_decay); - grad_4[0].data = - _mm512_fmadd_ps(param_4[0].data, weight_decay4.data, grad_4[0].data); - grad_4[1].data = - _mm512_fmadd_ps(param_4[1].data, weight_decay4.data, grad_4[1].data); - grad_4[2].data = - _mm512_fmadd_ps(param_4[2].data, weight_decay4.data, grad_4[2].data); - grad_4[3].data = - _mm512_fmadd_ps(param_4[3].data, weight_decay4.data, grad_4[3].data); + AVX_Data weight_decay4; + weight_decay4.data = SIMD_SET(_weight_decay); + grad_4[0].data = SIMD_FMA(param_4[0].data, weight_decay4.data, grad_4[0].data); + grad_4[1].data = SIMD_FMA(param_4[1].data, weight_decay4.data, grad_4[1].data); + grad_4[2].data = SIMD_FMA(param_4[2].data, weight_decay4.data, grad_4[2].data); + grad_4[3].data = SIMD_FMA(param_4[3].data, weight_decay4.data, grad_4[3].data); } - momntum_4[0].data = _mm512_mul_ps(momntum_4[0].data, betta1_4.data); - momntum_4[0].data = - _mm512_fmadd_ps(grad_4[0].data, betta1_minus1_4.data, momntum_4[0].data); - momntum_4[1].data = _mm512_mul_ps(momntum_4[1].data, betta1_4.data); - momntum_4[1].data = - _mm512_fmadd_ps(grad_4[1].data, betta1_minus1_4.data, momntum_4[1].data); - momntum_4[2].data = _mm512_mul_ps(momntum_4[2].data, betta1_4.data); - momntum_4[2].data = - _mm512_fmadd_ps(grad_4[2].data, betta1_minus1_4.data, momntum_4[2].data); - momntum_4[3].data = _mm512_mul_ps(momntum_4[3].data, betta1_4.data); - momntum_4[3].data = - _mm512_fmadd_ps(grad_4[3].data, betta1_minus1_4.data, momntum_4[3].data); - - varianc_4[0].data = _mm512_mul_ps(varianc_4[0].data, betta2_4.data); - varianc_4[1].data = _mm512_mul_ps(varianc_4[1].data, betta2_4.data); - varianc_4[2].data = _mm512_mul_ps(varianc_4[2].data, betta2_4.data); - varianc_4[3].data = _mm512_mul_ps(varianc_4[3].data, betta2_4.data); - grad_4[0].data = _mm512_mul_ps(grad_4[0].data, grad_4[0].data); - grad_4[1].data = _mm512_mul_ps(grad_4[1].data, grad_4[1].data); - grad_4[2].data = _mm512_mul_ps(grad_4[2].data, grad_4[2].data); - grad_4[3].data = _mm512_mul_ps(grad_4[3].data, grad_4[3].data); - varianc_4[0].data = - _mm512_fmadd_ps(grad_4[0].data, betta2_minus1_4.data, varianc_4[0].data); - varianc_4[1].data = - _mm512_fmadd_ps(grad_4[1].data, betta2_minus1_4.data, varianc_4[1].data); - varianc_4[2].data = - _mm512_fmadd_ps(grad_4[2].data, betta2_minus1_4.data, varianc_4[2].data); - varianc_4[3].data = - _mm512_fmadd_ps(grad_4[3].data, betta2_minus1_4.data, varianc_4[3].data); - - grad_4[0].data = _mm512_sqrt_ps(varianc_4[0].data); - grad_4[1].data = _mm512_sqrt_ps(varianc_4[1].data); - grad_4[2].data = _mm512_sqrt_ps(varianc_4[2].data); - grad_4[3].data = _mm512_sqrt_ps(varianc_4[3].data); - - grad_4[0].data = _mm512_fmadd_ps(grad_4[0].data, bias2_sqrt.data, eps_4.data); - grad_4[1].data = _mm512_fmadd_ps(grad_4[1].data, bias2_sqrt.data, eps_4.data); - grad_4[2].data = _mm512_fmadd_ps(grad_4[2].data, bias2_sqrt.data, eps_4.data); - grad_4[3].data = _mm512_fmadd_ps(grad_4[3].data, bias2_sqrt.data, eps_4.data); - grad_4[0].data = _mm512_div_ps(momntum_4[0].data, grad_4[0].data); - grad_4[1].data = _mm512_div_ps(momntum_4[1].data, grad_4[1].data); - grad_4[2].data = _mm512_div_ps(momntum_4[2].data, grad_4[2].data); - grad_4[3].data = _mm512_div_ps(momntum_4[3].data, grad_4[3].data); - - param_4[0].data = _mm512_fmadd_ps(grad_4[0].data, step_size_4.data, param_4[0].data); - param_4[1].data = _mm512_fmadd_ps(grad_4[1].data, step_size_4.data, param_4[1].data); - param_4[2].data = 
_mm512_fmadd_ps(grad_4[2].data, step_size_4.data, param_4[2].data); - param_4[3].data = _mm512_fmadd_ps(grad_4[3].data, step_size_4.data, param_4[3].data); - - _mm512_storeu_ps(_params + i, param_4[0].data); - _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1].data); - _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), param_4[2].data); - _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3].data); + momentum_4[0].data = SIMD_MUL(momentum_4[0].data, betta1_4.data); + momentum_4[0].data = SIMD_FMA(grad_4[0].data, betta1_minus1_4.data, momentum_4[0].data); + momentum_4[1].data = SIMD_MUL(momentum_4[1].data, betta1_4.data); + momentum_4[1].data = SIMD_FMA(grad_4[1].data, betta1_minus1_4.data, momentum_4[1].data); + momentum_4[2].data = SIMD_MUL(momentum_4[2].data, betta1_4.data); + momentum_4[2].data = SIMD_FMA(grad_4[2].data, betta1_minus1_4.data, momentum_4[2].data); + momentum_4[3].data = SIMD_MUL(momentum_4[3].data, betta1_4.data); + momentum_4[3].data = SIMD_FMA(grad_4[3].data, betta1_minus1_4.data, momentum_4[3].data); + + variance_4[0].data = SIMD_MUL(variance_4[0].data, betta2_4.data); + variance_4[1].data = SIMD_MUL(variance_4[1].data, betta2_4.data); + variance_4[2].data = SIMD_MUL(variance_4[2].data, betta2_4.data); + variance_4[3].data = SIMD_MUL(variance_4[3].data, betta2_4.data); + grad_4[0].data = SIMD_MUL(grad_4[0].data, grad_4[0].data); + grad_4[1].data = SIMD_MUL(grad_4[1].data, grad_4[1].data); + grad_4[2].data = SIMD_MUL(grad_4[2].data, grad_4[2].data); + grad_4[3].data = SIMD_MUL(grad_4[3].data, grad_4[3].data); + variance_4[0].data = SIMD_FMA(grad_4[0].data, betta2_minus1_4.data, variance_4[0].data); + variance_4[1].data = SIMD_FMA(grad_4[1].data, betta2_minus1_4.data, variance_4[1].data); + variance_4[2].data = SIMD_FMA(grad_4[2].data, betta2_minus1_4.data, variance_4[2].data); + variance_4[3].data = SIMD_FMA(grad_4[3].data, betta2_minus1_4.data, variance_4[3].data); + + grad_4[0].data = SIMD_SQRT(variance_4[0].data); + grad_4[1].data = SIMD_SQRT(variance_4[1].data); + grad_4[2].data = SIMD_SQRT(variance_4[2].data); + grad_4[3].data = SIMD_SQRT(variance_4[3].data); + + grad_4[0].data = SIMD_FMA(grad_4[0].data, bias2_sqrt.data, eps_4.data); + grad_4[1].data = SIMD_FMA(grad_4[1].data, bias2_sqrt.data, eps_4.data); + grad_4[2].data = SIMD_FMA(grad_4[2].data, bias2_sqrt.data, eps_4.data); + grad_4[3].data = SIMD_FMA(grad_4[3].data, bias2_sqrt.data, eps_4.data); + grad_4[0].data = SIMD_DIV(momentum_4[0].data, grad_4[0].data); + grad_4[1].data = SIMD_DIV(momentum_4[1].data, grad_4[1].data); + grad_4[2].data = SIMD_DIV(momentum_4[2].data, grad_4[2].data); + grad_4[3].data = SIMD_DIV(momentum_4[3].data, grad_4[3].data); + + param_4[0].data = SIMD_FMA(grad_4[0].data, step_size_4.data, param_4[0].data); + param_4[1].data = SIMD_FMA(grad_4[1].data, step_size_4.data, param_4[1].data); + param_4[2].data = SIMD_FMA(grad_4[2].data, step_size_4.data, param_4[2].data); + param_4[3].data = SIMD_FMA(grad_4[3].data, step_size_4.data, param_4[3].data); + + SIMD_STORE(_params + i, param_4[0].data); + SIMD_STORE(_params + i + SIMD_WIDTH, param_4[1].data); + SIMD_STORE(_params + i + (SIMD_WIDTH << 1), param_4[2].data); + SIMD_STORE(_params + i + SIMD_WIDTH * 3, param_4[3].data); if (dev_params) { - _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t), param_4[0].data); - _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH, - param_4[1].data); - _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 1), - param_4[2].data); - 
_mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 3, - param_4[3].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t), param_4[0].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH, param_4[1].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 1), + param_4[2].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 3, param_4[3].data); } - _mm512_storeu_ps(_exp_avg + i, momntum_4[0].data); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1].data); - _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2].data); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3].data); + SIMD_STORE(_exp_avg + i, momentum_4[0].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH, momentum_4[1].data); + SIMD_STORE(_exp_avg + i + (SIMD_WIDTH << 1), momentum_4[2].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH * 3, momentum_4[3].data); - _mm512_storeu_ps(_exp_avg_sq + i, varianc_4[0].data); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH, varianc_4[1].data); - _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1), varianc_4[2].data); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3].data); + SIMD_STORE(_exp_avg_sq + i, variance_4[0].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH, variance_4[1].data); + SIMD_STORE(_exp_avg_sq + i + (SIMD_WIDTH << 1), variance_4[2].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH * 3, variance_4[3].data); } - if (dev_params) { /* - #pragma omp parallel for - for (size_t j = 0; j < copy_size; j += 4) { - _doubled_buffer[_buf_index][j] = (__half)_params[t + j]; - _doubled_buffer[_buf_index][j + 1] = (__half)_params[t + j + 1]; - _doubled_buffer[_buf_index][j + 2] = (__half)_params[t + j + 2]; - _doubled_buffer[_buf_index][j + 3] = (__half)_params[t + j + 3]; - } - - CUDA_CHECK(cudaMemcpyAsync(dev_params + t, - _doubled_buffer[_buf_index], - copy_size * sizeof(__half), - cudaMemcpyHostToDevice, - Context::Instance().GetCurrentStream())); - */ + if (dev_params) { launch_param_update(_doubled_buffer[_buf_index], dev_params + t, copy_size, @@ -333,6 +298,7 @@ void Adam_Optimizer::Step_4(float* _params, _buf_index = !_buf_index; } } +#endif if (_param_size > rounded_size) Step((_params + rounded_size), (grads + rounded_size), @@ -352,9 +318,18 @@ int create_adam_optimizer(int optimizer_id, auto opt = std::make_shared(alpha, betta1, betta2, eps, weight_decay); s_optimizers[optimizer_id] = opt; - - std::cout << "Adam Optimizer #" << optimizer_id << " is created." << std::endl; - +#if defined(__AVX512__) + std::cout << "Adam Optimizer #" << optimizer_id + << " is created with AVX512 arithmetic capability." << std::endl; +#else +#if defined(__AVX256__) + std::cout << "Adam Optimizer #" << optimizer_id + << " is created with AVX2 arithmetic capability." << std::endl; +#else + std::cout << "Adam Optimizer #" << optimizer_id + << " is created with scalar arithmetic capability." 
<< std::endl; +#endif +#endif return 0; } @@ -365,35 +340,36 @@ void Adam_Optimizer::Step_8(float* _params, size_t _param_size, __half* dev_params) { - _betta1_t *= _betta1; - _betta2_t *= _betta2; + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) - AVX_512 betta1_4; - betta1_4.data = _mm512_set1_ps(_betta1); - AVX_512 betta2_4; - betta2_4.data = _mm512_set1_ps(_betta2); + AVX_Data betta1_4; + betta1_4.data = SIMD_SET(_betta1); + AVX_Data betta2_4; + betta2_4.data = SIMD_SET(_betta2); float betta1_minus1 = 1 - _betta1; float betta2_minus1 = 1 - _betta2; - AVX_512 betta1_minus1_4; - betta1_minus1_4.data = _mm512_set1_ps(betta1_minus1); - AVX_512 betta2_minus1_4; - betta2_minus1_4.data = _mm512_set1_ps(betta2_minus1); + AVX_Data betta1_minus1_4; + betta1_minus1_4.data = SIMD_SET(betta1_minus1); + AVX_Data betta2_minus1_4; + betta2_minus1_4.data = SIMD_SET(betta2_minus1); float bias_correction1 = 1 - _betta1_t; float bias_correction2 = 1 / sqrt(1 - _betta2_t); - // AVX_512 bias_correction1_4 = _mm512_set1_ps(bias_correction1); - AVX_512 bias2_sqrt; - bias2_sqrt.data = _mm512_set1_ps(bias_correction2); + // AVX_Data bias_correction1_4 = SIMD_SET(bias_correction1); + AVX_Data bias2_sqrt; + bias2_sqrt.data = SIMD_SET(bias_correction2); - AVX_512 eps_4; - eps_4.data = _mm512_set1_ps(_eps); + AVX_Data eps_4; + eps_4.data = SIMD_SET(_eps); float step_size = -1 * _alpha / bias_correction1; - AVX_512 step_size_4; - step_size_4.data = _mm512_set1_ps(step_size); + AVX_Data step_size_4; + step_size_4.data = SIMD_SET(step_size); - size_t rounded_size = ROUND_DOWN(_param_size, (SIMD_WIDTH << 3)); + rounded_size = ROUND_DOWN(_param_size, (SIMD_WIDTH << 3)); for (size_t t = 0; t < rounded_size; t += TILE) { size_t copy_size = TILE; @@ -401,204 +377,175 @@ void Adam_Optimizer::Step_8(float* _params, size_t offset = copy_size + t; #pragma omp parallel for for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) { - AVX_512 grad_4[8]; - grad_4[0].data = _mm512_loadu_ps(grads + i); - grad_4[1].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH); - grad_4[2].data = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 1)); - grad_4[3].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 3); - grad_4[4].data = _mm512_loadu_ps(grads + i + (SIMD_WIDTH << 2)); - grad_4[5].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 5); - grad_4[6].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 6); - grad_4[7].data = _mm512_loadu_ps(grads + i + SIMD_WIDTH * 7); - - AVX_512 momntum_4[8]; - momntum_4[0].data = _mm512_loadu_ps(_exp_avg + i); - momntum_4[1].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH); - momntum_4[2].data = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 1)); - momntum_4[3].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 3); - momntum_4[4].data = _mm512_loadu_ps(_exp_avg + i + (SIMD_WIDTH << 2)); - momntum_4[5].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 5); - momntum_4[6].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 6); - momntum_4[7].data = _mm512_loadu_ps(_exp_avg + i + SIMD_WIDTH * 7); - - AVX_512 varianc_4[8]; - varianc_4[0].data = _mm512_loadu_ps(_exp_avg_sq + i); - varianc_4[1].data = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH); - varianc_4[2].data = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1)); - varianc_4[3].data = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3); - varianc_4[4].data = _mm512_loadu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 2)); - varianc_4[5].data = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5); - varianc_4[6].data = _mm512_loadu_ps(_exp_avg_sq + i + 
SIMD_WIDTH * 6); - varianc_4[7].data = _mm512_loadu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7); - - AVX_512 param_4[8]; - param_4[0].data = _mm512_loadu_ps(_params + i); - param_4[1].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH); - param_4[2].data = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 1)); - param_4[3].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 3); - param_4[4].data = _mm512_loadu_ps(_params + i + (SIMD_WIDTH << 2)); - param_4[5].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 5); - param_4[6].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 6); - param_4[7].data = _mm512_loadu_ps(_params + i + SIMD_WIDTH * 7); + AVX_Data grad_4[8]; + grad_4[0].data = SIMD_LOAD(grads + i); + grad_4[1].data = SIMD_LOAD(grads + i + SIMD_WIDTH); + grad_4[2].data = SIMD_LOAD(grads + i + (SIMD_WIDTH << 1)); + grad_4[3].data = SIMD_LOAD(grads + i + SIMD_WIDTH * 3); + grad_4[4].data = SIMD_LOAD(grads + i + (SIMD_WIDTH << 2)); + grad_4[5].data = SIMD_LOAD(grads + i + SIMD_WIDTH * 5); + grad_4[6].data = SIMD_LOAD(grads + i + SIMD_WIDTH * 6); + grad_4[7].data = SIMD_LOAD(grads + i + SIMD_WIDTH * 7); + + AVX_Data momentum_4[8]; + momentum_4[0].data = SIMD_LOAD(_exp_avg + i); + momentum_4[1].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH); + momentum_4[2].data = SIMD_LOAD(_exp_avg + i + (SIMD_WIDTH << 1)); + momentum_4[3].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH * 3); + momentum_4[4].data = SIMD_LOAD(_exp_avg + i + (SIMD_WIDTH << 2)); + momentum_4[5].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH * 5); + momentum_4[6].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH * 6); + momentum_4[7].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH * 7); + + AVX_Data variance_4[8]; + variance_4[0].data = SIMD_LOAD(_exp_avg_sq + i); + variance_4[1].data = SIMD_LOAD(_exp_avg_sq + i + SIMD_WIDTH); + variance_4[2].data = SIMD_LOAD(_exp_avg_sq + i + (SIMD_WIDTH << 1)); + variance_4[3].data = SIMD_LOAD(_exp_avg_sq + i + SIMD_WIDTH * 3); + variance_4[4].data = SIMD_LOAD(_exp_avg_sq + i + (SIMD_WIDTH << 2)); + variance_4[5].data = SIMD_LOAD(_exp_avg_sq + i + SIMD_WIDTH * 5); + variance_4[6].data = SIMD_LOAD(_exp_avg_sq + i + SIMD_WIDTH * 6); + variance_4[7].data = SIMD_LOAD(_exp_avg_sq + i + SIMD_WIDTH * 7); + + AVX_Data param_4[8]; + param_4[0].data = SIMD_LOAD(_params + i); + param_4[1].data = SIMD_LOAD(_params + i + SIMD_WIDTH); + param_4[2].data = SIMD_LOAD(_params + i + (SIMD_WIDTH << 1)); + param_4[3].data = SIMD_LOAD(_params + i + SIMD_WIDTH * 3); + param_4[4].data = SIMD_LOAD(_params + i + (SIMD_WIDTH << 2)); + param_4[5].data = SIMD_LOAD(_params + i + SIMD_WIDTH * 5); + param_4[6].data = SIMD_LOAD(_params + i + SIMD_WIDTH * 6); + param_4[7].data = SIMD_LOAD(_params + i + SIMD_WIDTH * 7); if (_weight_decay > 0) { - AVX_512 weight_decay4; - weight_decay4.data = _mm512_set1_ps(_weight_decay); - grad_4[0].data = - _mm512_fmadd_ps(param_4[0].data, weight_decay4.data, grad_4[0].data); - grad_4[1].data = - _mm512_fmadd_ps(param_4[1].data, weight_decay4.data, grad_4[1].data); - grad_4[2].data = - _mm512_fmadd_ps(param_4[2].data, weight_decay4.data, grad_4[2].data); - grad_4[3].data = - _mm512_fmadd_ps(param_4[3].data, weight_decay4.data, grad_4[3].data); - grad_4[4].data = - _mm512_fmadd_ps(param_4[4].data, weight_decay4.data, grad_4[4].data); - grad_4[5].data = - _mm512_fmadd_ps(param_4[5].data, weight_decay4.data, grad_4[5].data); - grad_4[6].data = - _mm512_fmadd_ps(param_4[6].data, weight_decay4.data, grad_4[6].data); - grad_4[7].data = - _mm512_fmadd_ps(param_4[7].data, weight_decay4.data, grad_4[7].data); + AVX_Data weight_decay4; + 
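/* Editorial sketch: the SIMD_SET / SIMD_LOAD / SIMD_STORE / SIMD_FMA helpers used
 * in this hunk let the same Step, Step_4 and Step_8 bodies compile for both
 * AVX-512 and AVX2. Their real definitions live in csrc/includes/cpu_adam.h (only
 * partially shown in this patch); the block below is one plausible way to define
 * them under the same __AVX512__ / __AVX256__ guards the code tests, and is
 * illustrative rather than a copy of the committed header. The guard macros
 * themselves are set by the build scripts this patch also touches
 * (install.sh / setup.py in the diffstat). */
#include <immintrin.h>

#if defined(__AVX512__)
#define SIMD_WIDTH 16
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_LOAD(p) _mm512_loadu_ps(p)
#define SIMD_STORE(p, v) _mm512_storeu_ps(p, v)
#define SIMD_MUL(a, b) _mm512_mul_ps(a, b)
#define SIMD_DIV(a, b) _mm512_div_ps(a, b)
#define SIMD_SQRT(a) _mm512_sqrt_ps(a)
#define SIMD_FMA(a, b, c) _mm512_fmadd_ps(a, b, c)  // a * b + c
union AVX_Data {
    __m512 data;  // 16 packed floats per register
};
#elif defined(__AVX256__)
#define SIMD_WIDTH 8
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_LOAD(p) _mm256_loadu_ps(p)
#define SIMD_STORE(p, v) _mm256_storeu_ps(p, v)
#define SIMD_MUL(a, b) _mm256_mul_ps(a, b)
#define SIMD_DIV(a, b) _mm256_div_ps(a, b)
#define SIMD_SQRT(a) _mm256_sqrt_ps(a)
#define SIMD_FMA(a, b, c) _mm256_fmadd_ps(a, b, c)  // needs FMA3 alongside AVX2
union AVX_Data {
    __m256 data;  // 8 packed floats per register
};
#endif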
weight_decay4.data = SIMD_SET(_weight_decay); + grad_4[0].data = SIMD_FMA(param_4[0].data, weight_decay4.data, grad_4[0].data); + grad_4[1].data = SIMD_FMA(param_4[1].data, weight_decay4.data, grad_4[1].data); + grad_4[2].data = SIMD_FMA(param_4[2].data, weight_decay4.data, grad_4[2].data); + grad_4[3].data = SIMD_FMA(param_4[3].data, weight_decay4.data, grad_4[3].data); + grad_4[4].data = SIMD_FMA(param_4[4].data, weight_decay4.data, grad_4[4].data); + grad_4[5].data = SIMD_FMA(param_4[5].data, weight_decay4.data, grad_4[5].data); + grad_4[6].data = SIMD_FMA(param_4[6].data, weight_decay4.data, grad_4[6].data); + grad_4[7].data = SIMD_FMA(param_4[7].data, weight_decay4.data, grad_4[7].data); } - momntum_4[0].data = _mm512_mul_ps(momntum_4[0].data, betta1_4.data); - momntum_4[0].data = - _mm512_fmadd_ps(grad_4[0].data, betta1_minus1_4.data, momntum_4[0].data); - momntum_4[1].data = _mm512_mul_ps(momntum_4[1].data, betta1_4.data); - momntum_4[1].data = - _mm512_fmadd_ps(grad_4[1].data, betta1_minus1_4.data, momntum_4[1].data); - momntum_4[2].data = _mm512_mul_ps(momntum_4[2].data, betta1_4.data); - momntum_4[2].data = - _mm512_fmadd_ps(grad_4[2].data, betta1_minus1_4.data, momntum_4[2].data); - momntum_4[3].data = _mm512_mul_ps(momntum_4[3].data, betta1_4.data); - momntum_4[3].data = - _mm512_fmadd_ps(grad_4[3].data, betta1_minus1_4.data, momntum_4[3].data); - momntum_4[4].data = _mm512_mul_ps(momntum_4[4].data, betta1_4.data); - momntum_4[4].data = - _mm512_fmadd_ps(grad_4[4].data, betta1_minus1_4.data, momntum_4[4].data); - momntum_4[5].data = _mm512_mul_ps(momntum_4[5].data, betta1_4.data); - momntum_4[5].data = - _mm512_fmadd_ps(grad_4[5].data, betta1_minus1_4.data, momntum_4[5].data); - momntum_4[6].data = _mm512_mul_ps(momntum_4[6].data, betta1_4.data); - momntum_4[6].data = - _mm512_fmadd_ps(grad_4[6].data, betta1_minus1_4.data, momntum_4[6].data); - momntum_4[7].data = _mm512_mul_ps(momntum_4[7].data, betta1_4.data); - momntum_4[7].data = - _mm512_fmadd_ps(grad_4[7].data, betta1_minus1_4.data, momntum_4[7].data); - - varianc_4[0].data = _mm512_mul_ps(varianc_4[0].data, betta2_4.data); - varianc_4[1].data = _mm512_mul_ps(varianc_4[1].data, betta2_4.data); - varianc_4[2].data = _mm512_mul_ps(varianc_4[2].data, betta2_4.data); - varianc_4[3].data = _mm512_mul_ps(varianc_4[3].data, betta2_4.data); - varianc_4[4].data = _mm512_mul_ps(varianc_4[4].data, betta2_4.data); - varianc_4[5].data = _mm512_mul_ps(varianc_4[5].data, betta2_4.data); - varianc_4[6].data = _mm512_mul_ps(varianc_4[6].data, betta2_4.data); - varianc_4[7].data = _mm512_mul_ps(varianc_4[7].data, betta2_4.data); - grad_4[0].data = _mm512_mul_ps(grad_4[0].data, grad_4[0].data); - grad_4[1].data = _mm512_mul_ps(grad_4[1].data, grad_4[1].data); - grad_4[2].data = _mm512_mul_ps(grad_4[2].data, grad_4[2].data); - grad_4[3].data = _mm512_mul_ps(grad_4[3].data, grad_4[3].data); - grad_4[4].data = _mm512_mul_ps(grad_4[4].data, grad_4[4].data); - grad_4[5].data = _mm512_mul_ps(grad_4[5].data, grad_4[5].data); - grad_4[6].data = _mm512_mul_ps(grad_4[6].data, grad_4[6].data); - grad_4[7].data = _mm512_mul_ps(grad_4[7].data, grad_4[7].data); - varianc_4[0].data = - _mm512_fmadd_ps(grad_4[0].data, betta2_minus1_4.data, varianc_4[0].data); - varianc_4[1].data = - _mm512_fmadd_ps(grad_4[1].data, betta2_minus1_4.data, varianc_4[1].data); - varianc_4[2].data = - _mm512_fmadd_ps(grad_4[2].data, betta2_minus1_4.data, varianc_4[2].data); - varianc_4[3].data = - _mm512_fmadd_ps(grad_4[3].data, betta2_minus1_4.data, varianc_4[3].data); - 
varianc_4[4].data = - _mm512_fmadd_ps(grad_4[4].data, betta2_minus1_4.data, varianc_4[4].data); - varianc_4[5].data = - _mm512_fmadd_ps(grad_4[5].data, betta2_minus1_4.data, varianc_4[5].data); - varianc_4[6].data = - _mm512_fmadd_ps(grad_4[6].data, betta2_minus1_4.data, varianc_4[6].data); - varianc_4[7].data = - _mm512_fmadd_ps(grad_4[7].data, betta2_minus1_4.data, varianc_4[7].data); - - grad_4[0].data = _mm512_sqrt_ps(varianc_4[0].data); - grad_4[1].data = _mm512_sqrt_ps(varianc_4[1].data); - grad_4[2].data = _mm512_sqrt_ps(varianc_4[2].data); - grad_4[3].data = _mm512_sqrt_ps(varianc_4[3].data); - grad_4[4].data = _mm512_sqrt_ps(varianc_4[4].data); - grad_4[5].data = _mm512_sqrt_ps(varianc_4[5].data); - grad_4[6].data = _mm512_sqrt_ps(varianc_4[6].data); - grad_4[7].data = _mm512_sqrt_ps(varianc_4[7].data); - - grad_4[0].data = _mm512_fmadd_ps(grad_4[0].data, bias2_sqrt.data, eps_4.data); - grad_4[1].data = _mm512_fmadd_ps(grad_4[1].data, bias2_sqrt.data, eps_4.data); - grad_4[2].data = _mm512_fmadd_ps(grad_4[2].data, bias2_sqrt.data, eps_4.data); - grad_4[3].data = _mm512_fmadd_ps(grad_4[3].data, bias2_sqrt.data, eps_4.data); - grad_4[4].data = _mm512_fmadd_ps(grad_4[4].data, bias2_sqrt.data, eps_4.data); - grad_4[5].data = _mm512_fmadd_ps(grad_4[5].data, bias2_sqrt.data, eps_4.data); - grad_4[6].data = _mm512_fmadd_ps(grad_4[6].data, bias2_sqrt.data, eps_4.data); - grad_4[7].data = _mm512_fmadd_ps(grad_4[7].data, bias2_sqrt.data, eps_4.data); - grad_4[0].data = _mm512_div_ps(momntum_4[0].data, grad_4[0].data); - grad_4[1].data = _mm512_div_ps(momntum_4[1].data, grad_4[1].data); - grad_4[2].data = _mm512_div_ps(momntum_4[2].data, grad_4[2].data); - grad_4[3].data = _mm512_div_ps(momntum_4[3].data, grad_4[3].data); - grad_4[4].data = _mm512_div_ps(momntum_4[4].data, grad_4[4].data); - grad_4[5].data = _mm512_div_ps(momntum_4[5].data, grad_4[5].data); - grad_4[6].data = _mm512_div_ps(momntum_4[6].data, grad_4[6].data); - grad_4[7].data = _mm512_div_ps(momntum_4[7].data, grad_4[7].data); - - param_4[0].data = _mm512_fmadd_ps(grad_4[0].data, step_size_4.data, param_4[0].data); - param_4[1].data = _mm512_fmadd_ps(grad_4[1].data, step_size_4.data, param_4[1].data); - param_4[2].data = _mm512_fmadd_ps(grad_4[2].data, step_size_4.data, param_4[2].data); - param_4[3].data = _mm512_fmadd_ps(grad_4[3].data, step_size_4.data, param_4[3].data); - param_4[4].data = _mm512_fmadd_ps(grad_4[4].data, step_size_4.data, param_4[4].data); - param_4[5].data = _mm512_fmadd_ps(grad_4[5].data, step_size_4.data, param_4[5].data); - param_4[6].data = _mm512_fmadd_ps(grad_4[6].data, step_size_4.data, param_4[6].data); - param_4[7].data = _mm512_fmadd_ps(grad_4[7].data, step_size_4.data, param_4[7].data); - - _mm512_storeu_ps(_params + i, param_4[0].data); - _mm512_storeu_ps(_params + i + SIMD_WIDTH, param_4[1].data); - _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 1), param_4[2].data); - _mm512_storeu_ps(_params + i + SIMD_WIDTH * 3, param_4[3].data); - _mm512_storeu_ps(_params + i + (SIMD_WIDTH << 2), param_4[4].data); - _mm512_storeu_ps(_params + i + SIMD_WIDTH * 5, param_4[5].data); - _mm512_storeu_ps(_params + i + SIMD_WIDTH * 6, param_4[6].data); - _mm512_storeu_ps(_params + i + SIMD_WIDTH * 7, param_4[7].data); + momentum_4[0].data = SIMD_MUL(momentum_4[0].data, betta1_4.data); + momentum_4[0].data = SIMD_FMA(grad_4[0].data, betta1_minus1_4.data, momentum_4[0].data); + momentum_4[1].data = SIMD_MUL(momentum_4[1].data, betta1_4.data); + momentum_4[1].data = SIMD_FMA(grad_4[1].data, betta1_minus1_4.data, 
momentum_4[1].data); + momentum_4[2].data = SIMD_MUL(momentum_4[2].data, betta1_4.data); + momentum_4[2].data = SIMD_FMA(grad_4[2].data, betta1_minus1_4.data, momentum_4[2].data); + momentum_4[3].data = SIMD_MUL(momentum_4[3].data, betta1_4.data); + momentum_4[3].data = SIMD_FMA(grad_4[3].data, betta1_minus1_4.data, momentum_4[3].data); + momentum_4[4].data = SIMD_MUL(momentum_4[4].data, betta1_4.data); + momentum_4[4].data = SIMD_FMA(grad_4[4].data, betta1_minus1_4.data, momentum_4[4].data); + momentum_4[5].data = SIMD_MUL(momentum_4[5].data, betta1_4.data); + momentum_4[5].data = SIMD_FMA(grad_4[5].data, betta1_minus1_4.data, momentum_4[5].data); + momentum_4[6].data = SIMD_MUL(momentum_4[6].data, betta1_4.data); + momentum_4[6].data = SIMD_FMA(grad_4[6].data, betta1_minus1_4.data, momentum_4[6].data); + momentum_4[7].data = SIMD_MUL(momentum_4[7].data, betta1_4.data); + momentum_4[7].data = SIMD_FMA(grad_4[7].data, betta1_minus1_4.data, momentum_4[7].data); + + variance_4[0].data = SIMD_MUL(variance_4[0].data, betta2_4.data); + variance_4[1].data = SIMD_MUL(variance_4[1].data, betta2_4.data); + variance_4[2].data = SIMD_MUL(variance_4[2].data, betta2_4.data); + variance_4[3].data = SIMD_MUL(variance_4[3].data, betta2_4.data); + variance_4[4].data = SIMD_MUL(variance_4[4].data, betta2_4.data); + variance_4[5].data = SIMD_MUL(variance_4[5].data, betta2_4.data); + variance_4[6].data = SIMD_MUL(variance_4[6].data, betta2_4.data); + variance_4[7].data = SIMD_MUL(variance_4[7].data, betta2_4.data); + grad_4[0].data = SIMD_MUL(grad_4[0].data, grad_4[0].data); + grad_4[1].data = SIMD_MUL(grad_4[1].data, grad_4[1].data); + grad_4[2].data = SIMD_MUL(grad_4[2].data, grad_4[2].data); + grad_4[3].data = SIMD_MUL(grad_4[3].data, grad_4[3].data); + grad_4[4].data = SIMD_MUL(grad_4[4].data, grad_4[4].data); + grad_4[5].data = SIMD_MUL(grad_4[5].data, grad_4[5].data); + grad_4[6].data = SIMD_MUL(grad_4[6].data, grad_4[6].data); + grad_4[7].data = SIMD_MUL(grad_4[7].data, grad_4[7].data); + variance_4[0].data = SIMD_FMA(grad_4[0].data, betta2_minus1_4.data, variance_4[0].data); + variance_4[1].data = SIMD_FMA(grad_4[1].data, betta2_minus1_4.data, variance_4[1].data); + variance_4[2].data = SIMD_FMA(grad_4[2].data, betta2_minus1_4.data, variance_4[2].data); + variance_4[3].data = SIMD_FMA(grad_4[3].data, betta2_minus1_4.data, variance_4[3].data); + variance_4[4].data = SIMD_FMA(grad_4[4].data, betta2_minus1_4.data, variance_4[4].data); + variance_4[5].data = SIMD_FMA(grad_4[5].data, betta2_minus1_4.data, variance_4[5].data); + variance_4[6].data = SIMD_FMA(grad_4[6].data, betta2_minus1_4.data, variance_4[6].data); + variance_4[7].data = SIMD_FMA(grad_4[7].data, betta2_minus1_4.data, variance_4[7].data); + + grad_4[0].data = SIMD_SQRT(variance_4[0].data); + grad_4[1].data = SIMD_SQRT(variance_4[1].data); + grad_4[2].data = SIMD_SQRT(variance_4[2].data); + grad_4[3].data = SIMD_SQRT(variance_4[3].data); + grad_4[4].data = SIMD_SQRT(variance_4[4].data); + grad_4[5].data = SIMD_SQRT(variance_4[5].data); + grad_4[6].data = SIMD_SQRT(variance_4[6].data); + grad_4[7].data = SIMD_SQRT(variance_4[7].data); + + grad_4[0].data = SIMD_FMA(grad_4[0].data, bias2_sqrt.data, eps_4.data); + grad_4[1].data = SIMD_FMA(grad_4[1].data, bias2_sqrt.data, eps_4.data); + grad_4[2].data = SIMD_FMA(grad_4[2].data, bias2_sqrt.data, eps_4.data); + grad_4[3].data = SIMD_FMA(grad_4[3].data, bias2_sqrt.data, eps_4.data); + grad_4[4].data = SIMD_FMA(grad_4[4].data, bias2_sqrt.data, eps_4.data); + grad_4[5].data = SIMD_FMA(grad_4[5].data, 
bias2_sqrt.data, eps_4.data); + grad_4[6].data = SIMD_FMA(grad_4[6].data, bias2_sqrt.data, eps_4.data); + grad_4[7].data = SIMD_FMA(grad_4[7].data, bias2_sqrt.data, eps_4.data); + grad_4[0].data = SIMD_DIV(momentum_4[0].data, grad_4[0].data); + grad_4[1].data = SIMD_DIV(momentum_4[1].data, grad_4[1].data); + grad_4[2].data = SIMD_DIV(momentum_4[2].data, grad_4[2].data); + grad_4[3].data = SIMD_DIV(momentum_4[3].data, grad_4[3].data); + grad_4[4].data = SIMD_DIV(momentum_4[4].data, grad_4[4].data); + grad_4[5].data = SIMD_DIV(momentum_4[5].data, grad_4[5].data); + grad_4[6].data = SIMD_DIV(momentum_4[6].data, grad_4[6].data); + grad_4[7].data = SIMD_DIV(momentum_4[7].data, grad_4[7].data); + + param_4[0].data = SIMD_FMA(grad_4[0].data, step_size_4.data, param_4[0].data); + param_4[1].data = SIMD_FMA(grad_4[1].data, step_size_4.data, param_4[1].data); + param_4[2].data = SIMD_FMA(grad_4[2].data, step_size_4.data, param_4[2].data); + param_4[3].data = SIMD_FMA(grad_4[3].data, step_size_4.data, param_4[3].data); + param_4[4].data = SIMD_FMA(grad_4[4].data, step_size_4.data, param_4[4].data); + param_4[5].data = SIMD_FMA(grad_4[5].data, step_size_4.data, param_4[5].data); + param_4[6].data = SIMD_FMA(grad_4[6].data, step_size_4.data, param_4[6].data); + param_4[7].data = SIMD_FMA(grad_4[7].data, step_size_4.data, param_4[7].data); + + SIMD_STORE(_params + i, param_4[0].data); + SIMD_STORE(_params + i + SIMD_WIDTH, param_4[1].data); + SIMD_STORE(_params + i + (SIMD_WIDTH << 1), param_4[2].data); + SIMD_STORE(_params + i + SIMD_WIDTH * 3, param_4[3].data); + SIMD_STORE(_params + i + (SIMD_WIDTH << 2), param_4[4].data); + SIMD_STORE(_params + i + SIMD_WIDTH * 5, param_4[5].data); + SIMD_STORE(_params + i + SIMD_WIDTH * 6, param_4[6].data); + SIMD_STORE(_params + i + SIMD_WIDTH * 7, param_4[7].data); if (dev_params) { - _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t), param_4[0].data); - _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH, - param_4[1].data); - _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 1), - param_4[2].data); - _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 3, - param_4[3].data); - _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 2), - param_4[4].data); - _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 5, - param_4[5].data); - _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 6, - param_4[6].data); - _mm512_storeu_ps(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 7, - param_4[7].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t), param_4[0].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH, param_4[1].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 1), + param_4[2].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 3, param_4[3].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 2), + param_4[4].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 5, param_4[5].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 6, param_4[6].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 7, param_4[7].data); } - _mm512_storeu_ps(_exp_avg + i, momntum_4[0].data); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH, momntum_4[1].data); - _mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 1), momntum_4[2].data); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 3, momntum_4[3].data); - 
_mm512_storeu_ps(_exp_avg + i + (SIMD_WIDTH << 2), momntum_4[4].data); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 5, momntum_4[5].data); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 6, momntum_4[6].data); - _mm512_storeu_ps(_exp_avg + i + SIMD_WIDTH * 7, momntum_4[7].data); - - _mm512_storeu_ps(_exp_avg_sq + i, varianc_4[0].data); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH, varianc_4[1].data); - _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 1), varianc_4[2].data); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 3, varianc_4[3].data); - _mm512_storeu_ps(_exp_avg_sq + i + (SIMD_WIDTH << 2), varianc_4[4].data); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 5, varianc_4[5].data); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 6, varianc_4[6].data); - _mm512_storeu_ps(_exp_avg_sq + i + SIMD_WIDTH * 7, varianc_4[7].data); + SIMD_STORE(_exp_avg + i, momentum_4[0].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH, momentum_4[1].data); + SIMD_STORE(_exp_avg + i + (SIMD_WIDTH << 1), momentum_4[2].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH * 3, momentum_4[3].data); + SIMD_STORE(_exp_avg + i + (SIMD_WIDTH << 2), momentum_4[4].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH * 5, momentum_4[5].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH * 6, momentum_4[6].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH * 7, momentum_4[7].data); + + SIMD_STORE(_exp_avg_sq + i, variance_4[0].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH, variance_4[1].data); + SIMD_STORE(_exp_avg_sq + i + (SIMD_WIDTH << 1), variance_4[2].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH * 3, variance_4[3].data); + SIMD_STORE(_exp_avg_sq + i + (SIMD_WIDTH << 2), variance_4[4].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH * 5, variance_4[5].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH * 6, variance_4[6].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH * 7, variance_4[7].data); } if (dev_params) { launch_param_update(_doubled_buffer[_buf_index], @@ -608,6 +555,7 @@ void Adam_Optimizer::Step_8(float* _params, _buf_index = !_buf_index; } } +#endif if (_param_size > rounded_size) Step_4((_params + rounded_size), (grads + rounded_size), @@ -635,7 +583,7 @@ int ds_adam_step(int optimizer_id, std::shared_ptr opt = std::static_pointer_cast(s_optimizers[optimizer_id]); - + opt->IncrementStep(); opt->Step_8(params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0)); return 0; @@ -662,7 +610,7 @@ int ds_adam_step_plus_copy(int optimizer_id, std::shared_ptr opt = std::static_pointer_cast(s_optimizers[optimizer_id]); - + opt->IncrementStep(); opt->Step_8( params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0), gpu_params_ptr); diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h index b373162612f2..996450c56ae9 100644 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -1,7 +1,9 @@ #pragma once +#include #include #include +#include #include #include #include "context.h" @@ -18,10 +20,30 @@ } \ } -#define SIMD_WIDTH 16 - #define TILE (1024 * 1024 * 1024) +#if defined(__AVX512__) +#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) +#define SIMD_LOAD(x) _mm512_loadu_ps(x) +#define SIMD_SET(x) _mm512_set1_ps(x) +#define SIMD_MUL(x, y) _mm512_mul_ps(x, y) +#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c) +#define SIMD_SQRT(x) _mm512_sqrt_ps(x) +#define SIMD_DIV(x, y) _mm512_div_ps(x, y) +#define SIMD_WIDTH 16 +#else +#if defined(__AVX256__) +#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d) +#define SIMD_LOAD(x) _mm256_loadu_ps(x) +#define SIMD_SET(x) _mm256_set1_ps(x) +#define 
SIMD_MUL(x, y) _mm256_mul_ps(x, y) +#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c) +#define SIMD_SQRT(x) _mm256_sqrt_ps(x) +#define SIMD_DIV(x, y) _mm256_div_ps(x, y) +#define SIMD_WIDTH 8 +#endif +#endif + class Adam_Optimizer { public: Adam_Optimizer(float alpha = 1e-3, @@ -64,12 +86,23 @@ class Adam_Optimizer { float* _exp_avg_sq, size_t _param_size, __half* dev_params = nullptr); + inline void IncrementStep() + { + _betta1_t *= _betta1; + _betta2_t *= _betta2; + } private: - union AVX_512 { +#if defined(__AVX512__) or defined(__AVX256__) + union AVX_Data { +#if defined(__AVX512__) __m512 data; +#else + __m256 data; +#endif // float data_f[16]; }; +#endif float _alpha; float _betta1; diff --git a/docs/_pages/features.md b/docs/_pages/features.md index 451e3b2af534..f93f9465882b 100755 --- a/docs/_pages/features.md +++ b/docs/_pages/features.md @@ -162,6 +162,12 @@ Please see the [core API doc](https://deepspeed.readthedocs.io/) for more detail With DeepSpeed, the user can choose to use a high performance implementation of ADAM from NVIDIA, or any training optimizer that extends torch's `torch.optim.Optimizer` class. +### CPU-Adam: High-Performance vectorized implementation of Adam +We introduce an efficient implementation of Adam optimizer on CPU that improves the parameter-update +performance by nearly an order of magnitude. Comparing to torch-adam, we observe 5.1x to 6.5x +speedups considering the model-size between 1 to 10 billion parameters. For the CPU-Adam implementation, +we use the AVX SIMD instructions on Intel-x86 architecture. We support both AVX-512 and AVX-2 instruction sets. + ### Memory bandwidth optimized FP16 Optimizer Mixed precision training is handled by the DeepSpeed FP16 Optimizer. This optimizer not only handles FP16 training but is also highly efficient. The performance of weight update diff --git a/docs/index.md b/docs/index.md index 6dea83db268f..4aa29673ecc6 100755 --- a/docs/index.md +++ b/docs/index.md @@ -167,6 +167,7 @@ overview](/features/) for descriptions and usage. * Automatic loss scaling with mixed precision * [Training Optimizers](/features/#training-optimizers) * Fused Adam optimizer and arbitrary `torch.optim.Optimizer` + * CPU-Adam: High-Performance vectorized Adam * Memory bandwidth optimized FP16 Optimizer * Large Batch Training with LAMB Optimizer * Memory efficient Training with ZeRO Optimizer diff --git a/install.sh b/install.sh index 3dae80b4033b..a4ee6bbf3b72 100755 --- a/install.sh +++ b/install.sh @@ -164,10 +164,10 @@ if [ ! 
-f $hostfile ]; then local_only=1 fi -#if [ "$skip_requirements" == "0" ]; then -# # Ensure dependencies are installed locally -# $PIP_SUDO $PIP_INSTALL -r requirements.txt -#fi +if [ "$skip_requirements" == "0" ]; then + # Ensure dependencies are installed locally + $PIP_SUDO $PIP_INSTALL -r requirements/requirements.txt +fi # Build wheels if [ "$third_party_install" == "1" ]; then @@ -220,10 +220,10 @@ else tmp_wheel_path="/tmp/deepspeed_wheels" pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; else mkdir -pv $tmp_wheel_path; fi" - #pdcp -w $hosts requirements/*.txt ${tmp_wheel_path}/ - #if [ "$skip_requirements" == "0" ]; then - # pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL -r ${tmp_wheel_path}/requirements.txt" - #fi + pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/ + if [ "$skip_requirements" == "0" ]; then + pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL -r ${tmp_wheel_path}/requirements.txt" + fi if [ "$third_party_install" == "1" ]; then pdsh -w $hosts "$PIP_SUDO pip uninstall -y apex" pdcp -w $hosts third_party/apex/dist/apex*.whl $tmp_wheel_path/ diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 63f0022b314d..d9881f4bc580 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -2,4 +2,5 @@ torch>=1.2 torchvision>=0.4.0 tqdm psutil +cpufeature tensorboardX==1.8 diff --git a/setup.py b/setup.py index eb8d6ed07e51..f2dc2ba3b37a 100755 --- a/setup.py +++ b/setup.py @@ -96,6 +96,16 @@ def fetch_requirements(path): version_ge_1_5 = ['-DVERSION_GE_1_5'] version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 +import cpufeature +d = cpufeature.CPUFeature + +SIMD_WIDTH = '' +if d['AVX512f']: + SIMD_WIDTH = '-D__AVX512__' +elif d['AVX2']: + SIMD_WIDTH = '-D__AVX256__' +print("SIMD_WIDTH = ", SIMD_WIDTH) + ext_modules = [] ## Lamb ## @@ -135,7 +145,8 @@ def fetch_requirements(path): '-g', '-Wno-reorder', '-march=native', - '-fopenmp' + '-fopenmp', + SIMD_WIDTH ], 'nvcc': [ '-O3', From 135dd080400c23682eddd6a9a327375f3b9b2ace Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Sat, 5 Sep 2020 15:06:12 -0700 Subject: [PATCH 41/44] Move code quality tests to Azure-hosted agents. 
(#368) --- azure-pipelines.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 207b00de543c..f98be617c938 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -91,11 +91,14 @@ jobs: - job: Code_Quality_Checks pool: - name: 'DS_testing' + vmImage: 'ubuntu-latest' variables: conda_env: 'ds_codetest' steps: + - bash: echo "##vso[task.prependpath]$CONDA/bin" + displayName: Add conda to PATH + - script: | conda create --force --yes -n $(conda_env) python=3.7 source activate $(conda_env) From 2df751cc93be70f40a362dc5696623d99ebeeae5 Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Wed, 9 Sep 2020 00:21:38 +0000 Subject: [PATCH 42/44] Bump DSE --- DeepSpeedExamples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DeepSpeedExamples b/DeepSpeedExamples index 6236dc461f3c..9518eab7dcf5 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit 6236dc461f3c5009a4b858535e9e61008f67995a +Subproject commit 9518eab7dcf5751c3c392a72df21b4b92ed5b529 From b73894dedc433076991e57ca3460e2477c04714c Mon Sep 17 00:00:00 2001 From: Arash Ashari Date: Tue, 8 Sep 2020 19:15:44 -0700 Subject: [PATCH 43/44] adding sparse attention to feature index page (#377) --- docs/_pages/features.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/_pages/features.md b/docs/_pages/features.md index 451e3b2af534..74c3bc6eaf48 100755 --- a/docs/_pages/features.md +++ b/docs/_pages/features.md @@ -221,3 +221,22 @@ Please see the [core API doc](https://deepspeed.readthedocs.io/) for more detail } } ``` +## Sparse Attention +DeepSpeed offers sparse attention to support long sequences. Please refer to the [Sparse Attention](/tutorials/sparse_attention/) tutorial. + +```python +--deepspeed_sparse_attention +``` + +```python +"sparse_attention": { + "mode": "fixed", + "block": 16, + "different_layout_per_head": true, + "num_local_blocks": 4, + "num_global_blocks": 1, + "attention": "bidirectional", + "horizontal_global_attention": false, + "num_different_global_patterns": 4 +} +``` From 9a80f4a10a97064ccdea7ba026dd1b81b4ee556f Mon Sep 17 00:00:00 2001 From: RezaYazdaniAminabadi <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Wed, 9 Sep 2020 11:55:25 -0700 Subject: [PATCH 44/44] support avx2 by default (#383) * add DS_BUILD_AVX512 flag and update the feature part accordingly * run precommit --- docs/_pages/features.md | 8 +++++--- setup.py | 10 ++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/_pages/features.md b/docs/_pages/features.md index 0bf848e52ac7..d449bab96ca4 100755 --- a/docs/_pages/features.md +++ b/docs/_pages/features.md @@ -164,9 +164,11 @@ NVIDIA, or any training optimizer that extends torch's `torch.optim.Optimizer` c ### CPU-Adam: High-Performance vectorized implementation of Adam We introduce an efficient implementation of Adam optimizer on CPU that improves the parameter-update -performance by nearly an order of magnitude. Comparing to torch-adam, we observe 5.1x to 6.5x -speedups considering the model-size between 1 to 10 billion parameters. For the CPU-Adam implementation, -we use the AVX SIMD instructions on Intel-x86 architecture. We support both AVX-512 and AVX-2 instruction sets. +performance by nearly an order of magnitude. We use the AVX SIMD instructions on Intel-x86 architecture +for the CPU-Adam implementation. We support both AVX-512 and AVX-2 instruction sets. 
DeepSpeed uses +AVX-2 by default, which can be switched to AVX-512 by setting the build flag `DS_BUILD_AVX512` to 1 when +installing DeepSpeed. Using AVX-512, we observe 5.1x to 6.5x speedups over torch-adam for model sizes between +1 and 10 billion parameters. ### Memory bandwidth optimized FP16 Optimizer Mixed precision training is handled by the DeepSpeed FP16 Optimizer. This optimizer not diff --git a/setup.py b/setup.py index f2dc2ba3b37a..46a6b4bf0a4e 100755 --- a/setup.py +++ b/setup.py @@ -38,9 +38,10 @@ def fetch_requirements(path): DS_BUILD_TRANSFORMER_MASK = 10 DS_BUILD_SPARSE_ATTN_MASK = 100 DS_BUILD_ADAM_MASK = 1000 +DS_BUILD_AVX512_MASK = 10000 # Allow for build_cuda to turn on or off all ops -DS_BUILD_ALL_OPS = DS_BUILD_LAMB_MASK | DS_BUILD_TRANSFORMER_MASK | DS_BUILD_SPARSE_ATTN_MASK | DS_BUILD_ADAM_MASK +DS_BUILD_ALL_OPS = DS_BUILD_LAMB_MASK | DS_BUILD_TRANSFORMER_MASK | DS_BUILD_SPARSE_ATTN_MASK | DS_BUILD_ADAM_MASK | DS_BUILD_AVX512_MASK DS_BUILD_CUDA = int(os.environ.get('DS_BUILD_CUDA', 1)) * DS_BUILD_ALL_OPS # Set default of each op based on if build_cuda is set @@ -51,6 +52,7 @@ def fetch_requirements(path): OP_DEFAULT)) * DS_BUILD_TRANSFORMER_MASK DS_BUILD_SPARSE_ATTN = int(os.environ.get('DS_BUILD_SPARSE_ATTN', 0)) * DS_BUILD_SPARSE_ATTN_MASK +DS_BUILD_AVX512 = int(os.environ.get('DS_BUILD_AVX512', 0)) * DS_BUILD_AVX512_MASK # Final effective mask is the bitwise OR of each op BUILD_MASK = (DS_BUILD_LAMB | DS_BUILD_TRANSFORMER | DS_BUILD_SPARSE_ATTN @@ -97,12 +99,12 @@ def fetch_requirements(path): version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 import cpufeature -d = cpufeature.CPUFeature +cpu_info = cpufeature.CPUFeature SIMD_WIDTH = '' -if d['AVX512f']: +if cpu_info['AVX512f'] and DS_BUILD_AVX512: SIMD_WIDTH = '-D__AVX512__' -elif d['AVX2']: +elif cpu_info['AVX2']: SIMD_WIDTH = '-D__AVX256__' print("SIMD_WIDTH = ", SIMD_WIDTH)
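The `Step_8` hunks above replace raw `_mm512_*` intrinsics with the `SIMD_*` macro layer, but the arithmetic is unchanged: per element it is the usual Adam update with weight decay folded into the gradient before the moment updates. The sketch below restates that math in plain Python. It is a minimal sketch under a few assumptions: `betta1_minus1_4`/`betta2_minus1_4` are taken to be broadcasts of `1 - betta1` and `1 - betta2`, and `step_size`/`bias2_sqrt` are taken to be the bias-corrected step size and square-root correction computed earlier in `Step_8`, outside the hunks shown (with `step_size` presumably negative, since it is added to the parameters via an FMA).

```python
import numpy as np

def adam_step_sketch(params, grads, exp_avg, exp_avg_sq,
                     betta1, betta2, eps, weight_decay,
                     step_size, bias2_sqrt):
    """Scalar restatement of the per-element work in the SIMD loop.

    Assumption: step_size and bias2_sqrt are precomputed as in the C++
    optimizer, with step_size negative so the final FMA is a descent step.
    All array arguments are float32 NumPy arrays of equal length.
    """
    if weight_decay > 0:
        grads = grads + weight_decay * params                   # SIMD_FMA(param, weight_decay4, grad)
    exp_avg[:] = betta1 * exp_avg + (1 - betta1) * grads        # first moment (momentum_4)
    exp_avg_sq[:] = betta2 * exp_avg_sq + (1 - betta2) * grads * grads  # second moment (variance_4)
    denom = np.sqrt(exp_avg_sq) * bias2_sqrt + eps              # SIMD_SQRT, then SIMD_FMA with eps
    params += step_size * (exp_avg / denom)                     # SIMD_DIV, then final SIMD_FMA
    return params
```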
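Structurally, each iteration of the vectorized loop updates eight `SIMD_WIDTH`-wide registers, i.e. `SIMD_WIDTH * 8` floats at a time (128 for AVX-512, 64 for AVX-2); when `dev_params` is given, the updated values are also staged into `_doubled_buffer[_buf_index]` and pushed to the GPU through `launch_param_update`, with `_buf_index` flipping between the two staging buffers. Elements past `rounded_size` fall through to the narrower `Step_4` path. Below is a small sketch of that index arithmetic, assuming `rounded_size` is simply `_param_size` rounded down to a multiple of the unrolled span (the actual computation is outside the hunks shown).

```python
SIMD_WIDTH = 8                 # assumption: AVX-2 build (16 for an AVX-512 build)
span = SIMD_WIDTH * 8          # eight unrolled registers per loop iteration

param_size = 1000              # illustrative size, not taken from the patch
rounded_size = (param_size // span) * span   # assumed rounding rule

for i in range(0, rounded_size, span):
    # vectorized update of params[i : i + span] (the Step_8 body above)
    pass
# the remaining params[rounded_size : param_size] are handled by Step_4
```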
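On the build side, the selection of the SIMD flavor is split across two setup.py hunks: the earlier hunk probes the CPU with the `cpufeature` package (added to `requirements/requirements.txt`), and the `support avx2 by default` change (#383) additionally gates AVX-512 behind the `DS_BUILD_AVX512` environment variable, falling back to the AVX-2 macro set otherwise. A condensed sketch of the combined logic, reusing the key names from the hunks and assuming `cpufeature` is installed:

```python
import os
import cpufeature  # CPU feature probe used by setup.py

cpu_info = cpufeature.CPUFeature
want_avx512 = int(os.environ.get('DS_BUILD_AVX512', 0))   # opt-in flag, default off

simd_flag = ''
if cpu_info['AVX512f'] and want_avx512:
    simd_flag = '-D__AVX512__'   # selects the 16-wide _mm512 macro set in cpu_adam.h
elif cpu_info['AVX2']:
    simd_flag = '-D__AVX256__'   # selects the 8-wide _mm256 macro set
print("SIMD flag:", simd_flag)
```

In practice this suggests an install along the lines of `DS_BUILD_AVX512=1 ./install.sh` (or exporting the variable before the pip build) to opt into AVX-512 on capable hardware; that exact invocation is an assumption based on the environment lookup above rather than a command taken from the patches.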