diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py
index 87cc64950006..a7b3827c01b7 100644
--- a/deepspeed/runtime/pipe/engine.py
+++ b/deepspeed/runtime/pipe/engine.py
@@ -206,6 +206,16 @@ def _build_data_iter(self, dataset):
         self.set_dataloader(pipe_dataloader)
 
     def _exec_reduce_tied_grads(self):
+        # We need to run this first to write to self.averaged_gradients; since this
+        # class turns `enable_backward_allreduce` off, the
+        # `overlapping_partition_gradients_reduce_epilogue()` defined in DeepSpeedEngine
+        # never actually runs. I suspect the flag is turned off for efficiency reasons;
+        # get_flat_partition in stage2.py might do something expensive, so someone will
+        # have to look into that later. In the meantime, this fixes ZeRO2 + pipelining
+        # enough to run a demo; further profiling is needed to decide whether it breaks anything.
+        # (see https://github.com/EleutherAI/gpt-neox/issues/62#issuecomment-761471944)
+        if self.zero_optimization_partition_gradients():
+            self.optimizer.overlapping_partition_gradients_reduce_epilogue()
         self.module.allreduce_tied_weight_gradients()
 
     def _exec_reduce_grads(self):
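
For context, here is a minimal sketch of the setup this patch is meant to unblock: pipeline parallelism (`PipelineModule`) combined with ZeRO stage 2, which is what makes `zero_optimization_partition_gradients()` return true so that the new epilogue call runs. The layer sizes, batch size, and optimizer settings below are illustrative placeholders, and the script assumes it is launched with the `deepspeed` launcher across at least two ranks; on older DeepSpeed versions the keyword for `deepspeed.initialize` may be `config_params` rather than `config`.

```python
import torch.nn as nn
import deepspeed
from deepspeed.pipe import PipelineModule

ds_config = {
    "train_batch_size": 8,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    # Stage 2 partitions gradients, so zero_optimization_partition_gradients()
    # returns True and the epilogue added in the patch runs before the
    # tied-weight allreduce in _exec_reduce_tied_grads().
    "zero_optimization": {"stage": 2},
}

# Placeholder layers; a real model would list e.g. transformer blocks here.
layers = [nn.Linear(32, 32) for _ in range(4)]
net = PipelineModule(layers=layers, num_stages=2)

engine, _, _, _ = deepspeed.initialize(model=net,
                                       model_parameters=net.parameters(),
                                       config=ds_config)

# engine.train_batch(data_iter=...) drives the pipeline schedule; the
# ReduceTiedGrads instruction at the end of each step dispatches to
# _exec_reduce_tied_grads(), the method patched above.
```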