diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000000000..8920ed8b2
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,5 @@
+---
+BasedOnStyle: Google
+---
+Language: Cpp
+ColumnLimit: 80
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 000000000..6bb305c9e
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,10 @@
+# Contribution guidelines
+
+First of all, thanks for taking the time to contribute!
+
+Please refer to the following guidelines to contribute new functionality or bug fixes:
+
+1. Use [autopep8](https://github.com/hhatto/autopep8) to format the Python code.
+2. Use [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to format C++ code. Changes to BytePS C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
+3. Add unit tests for any new code you write.
+4. Run unit tests in both CI and GPU environments.
diff --git a/byteps/_keras/__init__.py b/byteps/_keras/__init__.py
index bf547885f..254769a17 100644
--- a/byteps/_keras/__init__.py
+++ b/byteps/_keras/__init__.py
@@ -99,4 +99,4 @@ def load_model(keras, wrap_optimizer, filepath, custom_optimizers, custom_object
     if custom_objects is not None:
         byteps_objects.update(custom_objects)
 
-    return keras.models.load_model(filepath, custom_objects=byteps_objects)
\ No newline at end of file
+    return keras.models.load_model(filepath, custom_objects=byteps_objects)
diff --git a/byteps/_keras/callbacks.py b/byteps/_keras/callbacks.py
index 4db0077ec..47840d89b 100644
--- a/byteps/_keras/callbacks.py
+++ b/byteps/_keras/callbacks.py
@@ -112,7 +112,7 @@ def _adjust_learning_rate(self, epoch):
         # See the paper cited above for more information about momentum correction.
         self.restore_momentum = self.backend.get_value(self.model.optimizer.momentum)
         self.backend.set_value(self.model.optimizer.momentum,
-                               self.restore_momentum * new_lr / old_lr)
+                               self.restore_momentum * new_lr / old_lr)
 
     def _restore_momentum_if_needed(self):
         if self.restore_momentum:
@@ -168,4 +168,4 @@ def on_epoch_end(self, epoch, logs=None):
         if epoch == self.end_epoch - 1 and self.verbose > 0:
             new_lr = self.backend.get_value(self.model.optimizer.lr)
             print('\nEpoch %d: finished gradual learning rate warmup to %g.' %
-                  (epoch + 1, new_lr))
\ No newline at end of file
+                  (epoch + 1, new_lr))
diff --git a/byteps/common/common.cc b/byteps/common/common.cc
index 7c0e1c9e4..0ea59e769 100644
--- a/byteps/common/common.cc
+++ b/byteps/common/common.cc
@@ -14,8 +14,8 @@
 // limitations under the License.
// ============================================================================= -#include #include +#include #include "common.h" #include "logging.h" @@ -30,9 +30,7 @@ Status::Status(StatusType type, std::string reason) { reason_ = reason; } -Status Status::OK() { - return Status(); -} +Status Status::OK() { return Status(); } Status Status::UnknownError(std::string message) { return Status(StatusType::UNKNOWN_ERROR, message); @@ -50,29 +48,17 @@ Status Status::InvalidArgument(std::string message) { return Status(StatusType::INVALID_ARGUMENT, message); } -Status Status::InProgress() { - return Status(StatusType::IN_PROGRESS, ""); -} +Status Status::InProgress() { return Status(StatusType::IN_PROGRESS, ""); } -bool Status::ok() const { - return type_ == StatusType::OK; -} +bool Status::ok() const { return type_ == StatusType::OK; } -bool Status::in_progress() const { - return type_ == StatusType::IN_PROGRESS; -} +bool Status::in_progress() const { return type_ == StatusType::IN_PROGRESS; } -StatusType Status::type() const { - return type_; -} +StatusType Status::type() const { return type_; } -const std::string& Status::reason() const { - return reason_; -} +const std::string& Status::reason() const { return reason_; } -void TensorShape::AddDim(int64_t dim) { - shape_.push_back(dim); -} +void TensorShape::AddDim(int64_t dim) { shape_.push_back(dim); } void TensorShape::AppendShape(TensorShape& other) { for (auto dim : other.shape_) { @@ -93,9 +79,7 @@ const std::string TensorShape::DebugString() const { return args.str(); } -int TensorShape::dims() const { - return (int)shape_.size(); -} +int TensorShape::dims() const { return (int)shape_.size(); } int64_t TensorShape::dim_size(int idx) const { assert(idx >= 0); @@ -157,5 +141,5 @@ int getDataTypeLength(int dtype) { return 4; } -} // namespace common -} // namespace byteps +} // namespace common +} // namespace byteps diff --git a/byteps/common/common.h b/byteps/common/common.h index 16889a316..6a39a4663 100644 --- a/byteps/common/common.h +++ b/byteps/common/common.h @@ -17,15 +17,15 @@ #ifndef BYTEPS_COMMON_H #define BYTEPS_COMMON_H +#include +#include +#include #include #include +#include #include #include -#include #include -#include -#include -#include namespace byteps { namespace common { @@ -54,25 +54,41 @@ enum DataType { // List of supported frameworks. 
enum Framework { TENSORFLOW, PYTORCH, MXNET }; -enum StatusType { OK, UNKNOWN_ERROR, PRECONDITION_ERROR, ABORTED, INVALID_ARGUMENT, IN_PROGRESS }; +enum StatusType { + OK, + UNKNOWN_ERROR, + PRECONDITION_ERROR, + ABORTED, + INVALID_ARGUMENT, + IN_PROGRESS +}; enum DeviceType { CPU, GPU }; -enum QueueType { COORDINATE_REDUCE, REDUCE, COPYD2H, - PCIE_REDUCE, COORDINATE_PUSH, PUSH, PULL, - COPYH2D, COORDINATE_BROADCAST, BROADCAST, - QUEUE_NUM_AND_NOT_A_REAL_QUEUE_TYPE_AND_MUST_BE_THE_LAST }; +enum QueueType { + COORDINATE_REDUCE, + REDUCE, + COPYD2H, + PCIE_REDUCE, + COORDINATE_PUSH, + PUSH, + PULL, + COPYH2D, + COORDINATE_BROADCAST, + BROADCAST, + QUEUE_NUM_AND_NOT_A_REAL_QUEUE_TYPE_AND_MUST_BE_THE_LAST +}; -const int QueueNum = (int)QUEUE_NUM_AND_NOT_A_REAL_QUEUE_TYPE_AND_MUST_BE_THE_LAST; +const int QueueNum = + (int)QUEUE_NUM_AND_NOT_A_REAL_QUEUE_TYPE_AND_MUST_BE_THE_LAST; const std::vector LogStrings = { - "COORDINATE_REDUCE", "REDUCE", "COPYD2H", - "PCIE_REDUCE", "COORDINATE_PUSH", "PUSH", "PULL", - "COPYH2D", "COORDINATE_BROADCAST", "BROADCAST" -}; + "COORDINATE_REDUCE", "REDUCE", "COPYD2H", "PCIE_REDUCE", + "COORDINATE_PUSH", "PUSH", "PULL", "COPYH2D", + "COORDINATE_BROADCAST", "BROADCAST"}; class Status { -public: + public: Status(); static Status OK(); static Status UnknownError(std::string message); @@ -85,14 +101,14 @@ class Status { StatusType type() const; const std::string& reason() const; -private: + private: StatusType type_ = StatusType::OK; std::string reason_ = ""; Status(StatusType type, std::string reason); }; class TensorShape { -public: + public: void AddDim(int64_t dim); void AppendShape(TensorShape& other); @@ -109,36 +125,36 @@ class TensorShape { return shape_ != rhs.shape_; } -private: + private: std::vector shape_; }; class ReadyEvent { -public: + public: virtual bool Ready() const = 0; virtual ~ReadyEvent() = default; }; typedef struct BytePSContext { - bool initialized; - std::mutex init_mutex; - // tensor name - std::string tensor_name; - // using ps::Key = uint64_t - uint64_t declared_key; - // the actual keys being used - std::vector key_list; - // a copy on CPU - void* cpubuff; - // GPU ptr if the tensor is on CPU - void* gpu_ptr; - // CPU buffer for cross-PCIe-switch merging - std::vector pcie_cpubuff; - size_t buff_len; + bool initialized; + std::mutex init_mutex; + // tensor name + std::string tensor_name; + // using ps::Key = uint64_t + uint64_t declared_key; + // the actual keys being used + std::vector key_list; + // a copy on CPU + void* cpubuff; + // GPU ptr if the tensor is on CPU + void* gpu_ptr; + // CPU buffer for cross-PCIe-switch merging + std::vector pcie_cpubuff; + size_t buff_len; } BPSContext; class Tensor { -public: + public: virtual const DataType dtype() const = 0; virtual const TensorShape shape() const = 0; virtual const void* data() const = 0; @@ -194,7 +210,9 @@ struct TensorTableEntry { using TensorTable = std::unordered_map; enum class RequestType { - kDefaultPushPull, kRowSparsePushPull, kCompressedPushPull + kDefaultPushPull, + kRowSparsePushPull, + kCompressedPushPull }; int GetCommandType(RequestType requestType, int d); @@ -203,7 +221,7 @@ ncclDataType_t getNcclDataType(DataType dtype); int getDataTypeLength(int dtype); -} // namespace common -} // namespace byteps +} // namespace common +} // namespace byteps -#endif // BYTEPS_COMMON_H +#endif // BYTEPS_COMMON_H diff --git a/byteps/common/communicator.cc b/byteps/common/communicator.cc index f1ae6aa2c..b2a13ac4c 100644 --- a/byteps/common/communicator.cc +++ 
b/byteps/common/communicator.cc @@ -13,229 +13,238 @@ // limitations under the License. // ============================================================================= -#include "logging.h" #include "communicator.h" -#include "global.h" #include #include +#include "global.h" +#include "logging.h" + namespace byteps { namespace common { - // Copy constructor that provides the option to reconfigure members. -// The ranks in members always use local_rank, regardless that the members +// The ranks in members always use local_rank, regardless that the members // may be a subset of all local ranks. BytePSCommSocket::BytePSCommSocket(std::shared_ptr comm, - const std::string &path_suffix, - const std::vector &members) { - std::shared_ptr sock_comm = std::static_pointer_cast(comm); - // TODO: use private members directly - _rank = sock_comm->getRank(); - _size = sock_comm->getSize(); - _local_rank = sock_comm->getLocalRank(); - _local_size = sock_comm->getLocalSize(); - _worker_id = sock_comm->getWorkerID(); - _send_path = sock_comm->getSendPath() + path_suffix; - _recv_path = sock_comm->getRecvPath() + path_suffix; - _send_fd = initSocket(_local_rank, _send_path); - _recv_fd = initSocket(_local_rank, _recv_path); - - _members = (members.size() > 0) ? members : sock_comm->getMembers(); - _root = _members.back(); - - auto my_role = (_local_rank == _root) ? LOCAL_ROOT : LOCAL_WORKER; - bool is_root = (my_role == LOCAL_ROOT) ? true : false; - // init socket comm - if (is_root) { // root - _listen_thread = new std::thread(&BytePSCommSocket::startListenThread, this); - } - - BPS_LOG(DEBUG) << "This is " << path_suffix << (is_root ? " ROOT" : " WORKER") - << " device, rank=" << _local_rank - << ", all sockets create successfully"; + const std::string& path_suffix, + const std::vector& members) { + std::shared_ptr sock_comm = + std::static_pointer_cast(comm); + // TODO: use private members directly + _rank = sock_comm->getRank(); + _size = sock_comm->getSize(); + _local_rank = sock_comm->getLocalRank(); + _local_size = sock_comm->getLocalSize(); + _worker_id = sock_comm->getWorkerID(); + _send_path = sock_comm->getSendPath() + path_suffix; + _recv_path = sock_comm->getRecvPath() + path_suffix; + _send_fd = initSocket(_local_rank, _send_path); + _recv_fd = initSocket(_local_rank, _recv_path); + + _members = (members.size() > 0) ? members : sock_comm->getMembers(); + _root = _members.back(); + + auto my_role = (_local_rank == _root) ? LOCAL_ROOT : LOCAL_WORKER; + bool is_root = (my_role == LOCAL_ROOT) ? true : false; + // init socket comm + if (is_root) { // root + _listen_thread = + new std::thread(&BytePSCommSocket::startListenThread, this); + } + + BPS_LOG(DEBUG) << "This is " << path_suffix << (is_root ? " ROOT" : " WORKER") + << " device, rank=" << _local_rank + << ", all sockets create successfully"; } -void BytePSCommSocket::init(int* rank, int* size, int* local_rank, int* local_size, - int* worker_id, BytePSRole* my_role) { - - BPS_LOG(DEBUG) << "Using Communicator=Socket"; - - // We should init rank, size, etc. 
using getenv - // do env check - BPS_CHECK(getenv("BYTEPS_LOCAL_RANK")) << "error: env BYTEPS_LOCAL_RANK not set"; - BPS_CHECK(getenv("BYTEPS_LOCAL_SIZE")) << "error: env BYTEPS_LOCAL_SIZE not set"; - BPS_CHECK(getenv("DMLC_WORKER_ID")) << "error: env DMLC_WORKER_ID not set"; - BPS_CHECK(getenv("DMLC_NUM_WORKER")) << "error: env DMLC_NUM_WORKER not set"; - - *local_rank = atoi(getenv("BYTEPS_LOCAL_RANK")); - *local_size = atoi(getenv("BYTEPS_LOCAL_SIZE")); - *worker_id = atoi(getenv("DMLC_WORKER_ID")); - auto num_worker = atoi(getenv("DMLC_NUM_WORKER")); - - // we assume _local_size (i.e., # GPU) is consistent on all workers - *rank = (*local_rank) + (*worker_id) * (*local_size); - *size = num_worker * (*local_size); - - _rank = *rank; - _size = *size; - _local_rank = *local_rank; - _local_size = *local_size; - _worker_id = *worker_id; - - for (int i = 0; i < _local_size; i++) { - _members.push_back(i); - } - _root = _members.back(); - - *my_role = (_local_rank == _root) ? LOCAL_ROOT : LOCAL_WORKER; - bool is_root = (*my_role == LOCAL_ROOT) ? true : false; - - if (getenv("BYTEPS_SOCKET_PATH")) { - _send_path = std::string(getenv("BYTEPS_SOCKET_PATH")) + std::string("/socket_send_"); - _recv_path = std::string(getenv("BYTEPS_SOCKET_PATH")) + std::string("/socket_recv_"); - } else { - _send_path = std::string(DEFAULT_BASE_SOCKET_PATH_SEND); - _recv_path = std::string(DEFAULT_BASE_SOCKET_PATH_RECV); - } - - _send_fd = initSocket(_local_rank, _send_path); - _recv_fd = initSocket(_local_rank, _recv_path); - - // init socket comm - if (is_root) { // root - _listen_thread = new std::thread(&BytePSCommSocket::startListenThread, this); - - // Just in case launching root earlier than non-root - // TODO: use retry instead of sleep - // if (_local_size > 1) std::this_thread::sleep_for(std::chrono::microseconds(1000000)); - } - - BPS_LOG(DEBUG) << "This is " << (is_root ? "ROOT" : "WORKER") - << " device, rank=" << _local_rank - << ", all sockets create successfully"; +void BytePSCommSocket::init(int* rank, int* size, int* local_rank, + int* local_size, int* worker_id, + BytePSRole* my_role) { + BPS_LOG(DEBUG) << "Using Communicator=Socket"; + + // We should init rank, size, etc. using getenv + // do env check + BPS_CHECK(getenv("BYTEPS_LOCAL_RANK")) + << "error: env BYTEPS_LOCAL_RANK not set"; + BPS_CHECK(getenv("BYTEPS_LOCAL_SIZE")) + << "error: env BYTEPS_LOCAL_SIZE not set"; + BPS_CHECK(getenv("DMLC_WORKER_ID")) << "error: env DMLC_WORKER_ID not set"; + BPS_CHECK(getenv("DMLC_NUM_WORKER")) << "error: env DMLC_NUM_WORKER not set"; + + *local_rank = atoi(getenv("BYTEPS_LOCAL_RANK")); + *local_size = atoi(getenv("BYTEPS_LOCAL_SIZE")); + *worker_id = atoi(getenv("DMLC_WORKER_ID")); + auto num_worker = atoi(getenv("DMLC_NUM_WORKER")); + + // we assume _local_size (i.e., # GPU) is consistent on all workers + *rank = (*local_rank) + (*worker_id) * (*local_size); + *size = num_worker * (*local_size); + + _rank = *rank; + _size = *size; + _local_rank = *local_rank; + _local_size = *local_size; + _worker_id = *worker_id; + + for (int i = 0; i < _local_size; i++) { + _members.push_back(i); + } + _root = _members.back(); + + *my_role = (_local_rank == _root) ? LOCAL_ROOT : LOCAL_WORKER; + bool is_root = (*my_role == LOCAL_ROOT) ? 
true : false; + + if (getenv("BYTEPS_SOCKET_PATH")) { + _send_path = std::string(getenv("BYTEPS_SOCKET_PATH")) + + std::string("/socket_send_"); + _recv_path = std::string(getenv("BYTEPS_SOCKET_PATH")) + + std::string("/socket_recv_"); + } else { + _send_path = std::string(DEFAULT_BASE_SOCKET_PATH_SEND); + _recv_path = std::string(DEFAULT_BASE_SOCKET_PATH_RECV); + } + + _send_fd = initSocket(_local_rank, _send_path); + _recv_fd = initSocket(_local_rank, _recv_path); + + // init socket comm + if (is_root) { // root + _listen_thread = + new std::thread(&BytePSCommSocket::startListenThread, this); + + // Just in case launching root earlier than non-root + // TODO: use retry instead of sleep + // if (_local_size > 1) + // std::this_thread::sleep_for(std::chrono::microseconds(1000000)); + } + + BPS_LOG(DEBUG) << "This is " << (is_root ? "ROOT" : "WORKER") + << " device, rank=" << _local_rank + << ", all sockets create successfully"; } -int BytePSCommSocket::initSocket(int rank, const std::string &path) { - int fd; - // init the socket - fd = socket(AF_UNIX, SOCK_DGRAM, 0); - BPS_CHECK_GE(fd, 0) << "recv socket create failed"; +int BytePSCommSocket::initSocket(int rank, const std::string& path) { + int fd; + // init the socket + fd = socket(AF_UNIX, SOCK_DGRAM, 0); + BPS_CHECK_GE(fd, 0) << "recv socket create failed"; - struct sockaddr_un addr; - memset(&addr, 0, sizeof(addr)); + struct sockaddr_un addr; + memset(&addr, 0, sizeof(addr)); - // TODO: use absolute unique socket path name (consider multi-tenancy) - std::string fd_path(path); - fd_path += std::to_string(rank); // should use the rank id to guarantee no conflict + // TODO: use absolute unique socket path name (consider multi-tenancy) + std::string fd_path(path); + fd_path += + std::to_string(rank); // should use the rank id to guarantee no conflict - // filling addr information - addr.sun_family = AF_UNIX; - strncpy(addr.sun_path, fd_path.c_str(), sizeof(addr.sun_path)-1); + // filling addr information + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, fd_path.c_str(), sizeof(addr.sun_path) - 1); - // before bind, clear the path first - unlink(fd_path.c_str()); + // before bind, clear the path first + unlink(fd_path.c_str()); - // bind the socket to addr - int ret = bind(fd, (struct sockaddr *)&addr, sizeof(addr)); - BPS_CHECK_GE(ret, 0) << fd_path << " bind failed: " << strerror(errno); + // bind the socket to addr + int ret = bind(fd, (struct sockaddr*)&addr, sizeof(addr)); + BPS_CHECK_GE(ret, 0) << fd_path << " bind failed: " << strerror(errno); - BPS_LOG(DEBUG) << "Init socket at " << fd_path; + BPS_LOG(DEBUG) << "Init socket at " << fd_path; - return fd; + return fd; } +void BytePSCommSocket::startListenThread() { // only root starts this in + // background thread + BPS_LOG(DEBUG) << "Listening on socket " << _local_rank; + char buffer[MAX_LINE]; + while (true) { + int rc; + rc = recv(_recv_fd, buffer, sizeof(buffer), MSG_WAITALL); + BPS_CHECK_GE(rc, 0) << std::strerror(errno) << ", rank=" << _local_rank; -void BytePSCommSocket::startListenThread() { // only root starts this in background thread - BPS_LOG(DEBUG) << "Listening on socket " << _local_rank; - char buffer[MAX_LINE]; - while (true) { - int rc; - rc = recv(_recv_fd, buffer, sizeof(buffer), MSG_WAITALL); - BPS_CHECK_GE(rc, 0) << std::strerror(errno) << ", rank=" << _local_rank; - - auto message = *(BytePSCommMsg*) buffer; - - switch (message.signal) { - case REDUCE_READY: - BytePSGlobal::GetReduceTable()->AddReadyCount(message.key); - break; - case PCIE_REDUCE_READY: - 
BytePSGlobal::GetPcieReduceTable()->AddReadyCount(message.key); - break; - case BCAST_READY: - BytePSGlobal::GetBroadcastTable()->AddReadyCount(message.key); - break; - case PUSH_READY: - BytePSGlobal::GetPushTable()->AddReadyCount(message.key); - break; - default: - BPS_CHECK(0) << "unsupported signal: " << message.signal; - } - - BPS_LOG(TRACE) << "root socket recved: src=" << message.src - << ", signal=" << message.signal - << ", key=" << message.key - << ", myrank=" << _local_rank; + auto message = *(BytePSCommMsg*)buffer; + + switch (message.signal) { + case REDUCE_READY: + BytePSGlobal::GetReduceTable()->AddReadyCount(message.key); + break; + case PCIE_REDUCE_READY: + BytePSGlobal::GetPcieReduceTable()->AddReadyCount(message.key); + break; + case BCAST_READY: + BytePSGlobal::GetBroadcastTable()->AddReadyCount(message.key); + break; + case PUSH_READY: + BytePSGlobal::GetPushTable()->AddReadyCount(message.key); + break; + default: + BPS_CHECK(0) << "unsupported signal: " << message.signal; } + + BPS_LOG(TRACE) << "root socket recved: src=" << message.src + << ", signal=" << message.signal << ", key=" << message.key + << ", myrank=" << _local_rank; + } } int BytePSCommSocket::sendSignal(int destination, void* data, int len) { - std::lock_guard lock(_socket_mu); - struct sockaddr_un destaddr; - memset(&destaddr, 0, sizeof(destaddr)); - destaddr.sun_family = AF_UNIX; - - std::string fd_path(_recv_path); - fd_path += std::to_string(destination); - strncpy(destaddr.sun_path, fd_path.c_str(), sizeof(destaddr.sun_path)-1); - - int ret = -1; - while (ret < 0) { - ret = sendto(_send_fd, data, len, 0, - (struct sockaddr *)&destaddr, sizeof(struct sockaddr_un)); - if (ret < 0) { - BPS_LOG(DEBUG) << "Socket send error " << std::strerror(errno) << ", rank=" << _local_rank; - std::this_thread::sleep_for(std::chrono::microseconds(1000000)); - } + std::lock_guard lock(_socket_mu); + struct sockaddr_un destaddr; + memset(&destaddr, 0, sizeof(destaddr)); + destaddr.sun_family = AF_UNIX; + + std::string fd_path(_recv_path); + fd_path += std::to_string(destination); + strncpy(destaddr.sun_path, fd_path.c_str(), sizeof(destaddr.sun_path) - 1); + + int ret = -1; + while (ret < 0) { + ret = sendto(_send_fd, data, len, 0, (struct sockaddr*)&destaddr, + sizeof(struct sockaddr_un)); + if (ret < 0) { + BPS_LOG(DEBUG) << "Socket send error " << std::strerror(errno) + << ", rank=" << _local_rank; + std::this_thread::sleep_for(std::chrono::microseconds(1000000)); } - - return ret; + } + + return ret; } int BytePSCommSocket::sendSignalToRoot(void* data, int len) { - return sendSignal(_root, data, len); + return sendSignal(_root, data, len); } int BytePSCommSocket::recvSignal(int* source, void* data, int max_len) { - int rc = recv(_recv_fd, data, MAX_LINE, MSG_WAITALL); - BPS_CHECK_GE(rc, 0) << std::strerror(errno) << ", rank=" << _local_rank; - BPS_CHECK_LE(rc, max_len) << "recv_len=" << rc << ", but given max_len=" << max_len; + int rc = recv(_recv_fd, data, MAX_LINE, MSG_WAITALL); + BPS_CHECK_GE(rc, 0) << std::strerror(errno) << ", rank=" << _local_rank; + BPS_CHECK_LE(rc, max_len) + << "recv_len=" << rc << ", but given max_len=" << max_len; - auto message = *(BytePSCommMsg*) data; - *source = message.src; + auto message = *(BytePSCommMsg*)data; + *source = message.src; - BPS_LOG(TRACE) << "non-root socket recved: src=" << message.src - << ", signal=" << message.signal - << ", key=" << message.key - << ", myrank=" << _local_rank; + BPS_LOG(TRACE) << "non-root socket recved: src=" << message.src + << ", signal=" << 
message.signal << ", key=" << message.key + << ", myrank=" << _local_rank; - return rc; + return rc; } int BytePSCommSocket::recvSignalFromRoot(void* data, int max_len) { - int src; - int rc = recvSignal(&src, data, max_len); - BPS_CHECK_EQ(src, _root) << "Non-root received signal from another non-root"; - return rc; + int src; + int rc = recvSignal(&src, data, max_len); + BPS_CHECK_EQ(src, _root) << "Non-root received signal from another non-root"; + return rc; } int BytePSCommSocket::broadcastSignal(void* data, int len) { - for (int i : _members) { - if (i == _local_rank) continue; - sendSignal(i, (void *)data, len); - } - return 0; + for (int i : _members) { + if (i == _local_rank) continue; + sendSignal(i, (void*)data, len); + } + return 0; } -} -} +} // namespace common +} // namespace byteps diff --git a/byteps/common/communicator.h b/byteps/common/communicator.h index 252c9f0d0..3b0997b63 100644 --- a/byteps/common/communicator.h +++ b/byteps/common/communicator.h @@ -16,123 +16,126 @@ #ifndef BYTEPS_COMMUNICATOR_H #define BYTEPS_COMMUNICATOR_H -#include -#include -#include -#include #include -#include -#include +#include #include -#include +#include #include -#include -#include +#include +#include +#include +#include +#include #include +#include +#include #include "logging.h" -#define DEFAULT_BASE_SOCKET_PATH_RECV "/tmp/socket_recv_" -#define DEFAULT_BASE_SOCKET_PATH_SEND "/tmp/socket_send_" +#define DEFAULT_BASE_SOCKET_PATH_RECV "/tmp/socket_recv_" +#define DEFAULT_BASE_SOCKET_PATH_SEND "/tmp/socket_send_" #define MAX_LINE 8000 namespace byteps { namespace common { enum BytePSRole { LOCAL_ROOT, LOCAL_WORKER }; -enum BytePSCommSignal { REDUCE_READY, PCIE_REDUCE_READY, BCAST_READY, PUSH_READY, DO_REDUCE, DO_BROADCAST, DO_GROUP, DO_COPYH2D }; +enum BytePSCommSignal { + REDUCE_READY, + PCIE_REDUCE_READY, + BCAST_READY, + PUSH_READY, + DO_REDUCE, + DO_BROADCAST, + DO_GROUP, + DO_COPYH2D +}; struct BytePSCommMsg { - int src; - BytePSCommSignal signal; - uint64_t key; + int src; + BytePSCommSignal signal; + uint64_t key; }; class BytePSComm { - -public: - BytePSComm() { _comm = nullptr; } - - virtual void init(int* rank, int* size, int* local_rank, int* local_size, - int* worker_id, BytePSRole* my_role) = 0; - virtual int sendSignal(int destination, void* data, int len) = 0; - virtual int sendSignalToRoot(void* data, int len) = 0; - virtual int recvSignal(int* source, void* data, int max_len) = 0; - virtual int recvSignalFromRoot(void* data, int max_len) = 0; - virtual int broadcastSignal(void* data, int len) = 0; - - virtual int getRank() { return _rank; } - virtual int getSize() { return _size; } - virtual int getLocalRank() { return _local_rank; } - virtual int getLocalSize() { return _local_size; } - virtual int getWorkerID() { return _worker_id; } - - virtual std::vector getMembers() { return _members; } - virtual int getRoot() { return _root; } - -protected: - - int _rank; - int _size; - int _local_rank; - int _local_size; - int _worker_id; - - std::vector _members; - int _root; - - void* _comm; + public: + BytePSComm() { _comm = nullptr; } + + virtual void init(int* rank, int* size, int* local_rank, int* local_size, + int* worker_id, BytePSRole* my_role) = 0; + virtual int sendSignal(int destination, void* data, int len) = 0; + virtual int sendSignalToRoot(void* data, int len) = 0; + virtual int recvSignal(int* source, void* data, int max_len) = 0; + virtual int recvSignalFromRoot(void* data, int max_len) = 0; + virtual int broadcastSignal(void* data, int len) = 0; + + virtual int 
getRank() { return _rank; } + virtual int getSize() { return _size; } + virtual int getLocalRank() { return _local_rank; } + virtual int getLocalSize() { return _local_size; } + virtual int getWorkerID() { return _worker_id; } + + virtual std::vector getMembers() { return _members; } + virtual int getRoot() { return _root; } + + protected: + int _rank; + int _size; + int _local_rank; + int _local_size; + int _worker_id; + + std::vector _members; + int _root; + + void* _comm; }; class BytePSCommSocket : public BytePSComm { - -public: - - BytePSCommSocket() {} - BytePSCommSocket(std::shared_ptr comm, - const std::string &path_suffix, - const std::vector &members); - - ~BytePSCommSocket() { - if (_listen_thread->joinable()) { - _listen_thread->join(); - } - close(_recv_fd); - close(_send_fd); - - BPS_LOG(DEBUG) << "Clear BytePSCommSocket"; + public: + BytePSCommSocket() {} + BytePSCommSocket(std::shared_ptr comm, + const std::string& path_suffix, + const std::vector& members); + + ~BytePSCommSocket() { + if (_listen_thread->joinable()) { + _listen_thread->join(); } + close(_recv_fd); + close(_send_fd); - void init(int* rank, int* size, int* local_rank, int* local_size, - int* worker_id, BytePSRole* my_role); - int sendSignal(int destination, void* data, int len); - int sendSignalToRoot(void* data, int len); - int recvSignal(int* source, void* data, int max_len); - int recvSignalFromRoot(void* data, int max_len); - int broadcastSignal(void* data, int len); - - int getSendFd() { return _send_fd; } - int getRecvFd() { return _recv_fd; } + BPS_LOG(DEBUG) << "Clear BytePSCommSocket"; + } - std::string getSendPath() { return _send_path; } - std::string getRecvPath() { return _recv_path; } + void init(int* rank, int* size, int* local_rank, int* local_size, + int* worker_id, BytePSRole* my_role); + int sendSignal(int destination, void* data, int len); + int sendSignalToRoot(void* data, int len); + int recvSignal(int* source, void* data, int max_len); + int recvSignalFromRoot(void* data, int max_len); + int broadcastSignal(void* data, int len); -protected: + int getSendFd() { return _send_fd; } + int getRecvFd() { return _recv_fd; } - void startListenThread(); - int initSocket(int rank, const std::string &path); + std::string getSendPath() { return _send_path; } + std::string getRecvPath() { return _recv_path; } - std::thread* _listen_thread; + protected: + void startListenThread(); + int initSocket(int rank, const std::string& path); - std::string _send_path; - std::string _recv_path; - int _recv_fd; - int _send_fd; + std::thread* _listen_thread; - std::mutex _socket_mu; + std::string _send_path; + std::string _recv_path; + int _recv_fd; + int _send_fd; + std::mutex _socket_mu; }; -} // namespace common -} // namespace byteps +} // namespace common +} // namespace byteps -#endif // BYTEPS_COMMUNICATOR_H +#endif // BYTEPS_COMMUNICATOR_H diff --git a/byteps/common/core_loops.cc b/byteps/common/core_loops.cc index f6383d983..0e8f9216d 100644 --- a/byteps/common/core_loops.cc +++ b/byteps/common/core_loops.cc @@ -13,681 +13,669 @@ // limitations under the License. 
// ============================================================================= -#include -#include -#include - -#include "logging.h" #include "core_loops.h" +#include +#include +#include #include "common.h" #include "global.h" +#include "logging.h" namespace byteps { namespace common { void FinishOrProceed(std::shared_ptr task) { - auto &queue_list = task->queue_list; - BPS_CHECK_GE(queue_list.size(), 1); - auto this_op = queue_list[0]; - auto q = BytePSGlobal::GetScheduledQueue(this_op); - q->reportFinish(task->len); - if (BytePSGlobal::IsTensorSampled(task->key)) { - // We only support sampling - BPS_CHECK(task->tensor->dtype() == common::BYTEPS_FLOAT32); - size_t i = task->offset / 4; - size_t j = (task->offset + task->len) / 4 - 1; - if (task->device == CPU_DEVICE_ID) { - BPS_LOG(DEBUG) << "Sampled key=" << task->key - << " rank=" << BytePSGlobal::GetLocalRank() - << " input[0]=" << *((float*)(task->tensor->data()) + i) - << "\tinput[-1]=" << *((float*)(task->tensor->data()) + j) - << "\toutput[0]=" << *((float*)(task->output->data()) + i) - << "\toutput[-1]=" << *((float*)(task->output->data()) + j) - << "\t after stage: " << LogStrings[this_op]; - } - else { - float i0, i1, o0, o1; - cudaMemcpy(&i0, (float*)(task->tensor->data()) + i, 4, cudaMemcpyDeviceToHost); - cudaMemcpy(&i1, (float*)(task->tensor->data()) + j, 4, cudaMemcpyDeviceToHost); - cudaMemcpy(&o0, (float*)(task->output->data()) + i, 4, cudaMemcpyDeviceToHost); - cudaMemcpy(&o1, (float*)(task->output->data()) + j, 4, cudaMemcpyDeviceToHost); - BPS_LOG(DEBUG) << "Sampled key=" << task->key - << " rank=" << BytePSGlobal::GetLocalRank() - << " input[0]=" << i0 - << "\tinput[-1]=" << i1 - << "\toutput[0]=" << o0 - << "\toutput[-1]=" << o1 - << "\t after stage: " << LogStrings[this_op]; - } - } - queue_list.erase(queue_list.begin()); - if (queue_list.size() > 0) { - BPS_CHECK(task->tensor_name != ""); - BPS_LOG(TRACE) << "Rank=" << BytePSGlobal::GetRank() - << " finishes " << LogStrings[this_op] - << ", tensor: " << task->tensor_name - << ", key=" << task->key - << "; Passing to the next queue."; - BytePSGlobal::GetScheduledQueue(queue_list[0])->addTask(task); + auto &queue_list = task->queue_list; + BPS_CHECK_GE(queue_list.size(), 1); + auto this_op = queue_list[0]; + auto q = BytePSGlobal::GetScheduledQueue(this_op); + q->reportFinish(task->len); + if (BytePSGlobal::IsTensorSampled(task->key)) { + // We only support sampling + BPS_CHECK(task->tensor->dtype() == common::BYTEPS_FLOAT32); + size_t i = task->offset / 4; + size_t j = (task->offset + task->len) / 4 - 1; + if (task->device == CPU_DEVICE_ID) { + BPS_LOG(DEBUG) << "Sampled key=" << task->key + << " rank=" << BytePSGlobal::GetLocalRank() + << " input[0]=" << *((float *)(task->tensor->data()) + i) + << "\tinput[-1]=" << *((float *)(task->tensor->data()) + j) + << "\toutput[0]=" << *((float *)(task->output->data()) + i) + << "\toutput[-1]=" + << *((float *)(task->output->data()) + j) + << "\t after stage: " << LogStrings[this_op]; } else { - BPS_CHECK(task->counter_ptr) << task->tensor_name << " counter_ptr is null"; - int v = task->counter_ptr.get()->fetch_add(1); - if (v == (int)(task->total_partnum-1)) { - BPS_CHECK(task->tensor_name != ""); - BPS_LOG(TRACE) << "Rank=" << BytePSGlobal::GetRank() - << " finish processing tensor: " - << task->tensor_name; - task->callback(Status::OK()); - } + float i0, i1, o0, o1; + cudaMemcpy(&i0, (float *)(task->tensor->data()) + i, 4, + cudaMemcpyDeviceToHost); + cudaMemcpy(&i1, (float *)(task->tensor->data()) + j, 4, + 
cudaMemcpyDeviceToHost); + cudaMemcpy(&o0, (float *)(task->output->data()) + i, 4, + cudaMemcpyDeviceToHost); + cudaMemcpy(&o1, (float *)(task->output->data()) + j, 4, + cudaMemcpyDeviceToHost); + BPS_LOG(DEBUG) << "Sampled key=" << task->key + << " rank=" << BytePSGlobal::GetLocalRank() + << " input[0]=" << i0 << "\tinput[-1]=" << i1 + << "\toutput[0]=" << o0 << "\toutput[-1]=" << o1 + << "\t after stage: " << LogStrings[this_op]; } - return; -} - -bool RunCoordinateLoopOnce(QueueType this_op) { - auto q = BytePSGlobal::GetScheduledQueue(this_op); - auto task = q->getTask(); - if (task){ - int rank = BytePSGlobal::GetLocalRank(); - auto key = task->key; - - // first send to next queue and then broadcast signal - // to guarantee the entry is available when getTask(key) at Reduce/Broadcast thread - FinishOrProceed(task); - - BytePSCommSignal sig; - std::shared_ptr comm; - - switch (this_op) { - case COORDINATE_REDUCE: { - sig = REDUCE_READY; - comm = BytePSGlobal::GetNccl()->GetSignalComm(); - break; - } - case COORDINATE_BROADCAST: { - sig = BCAST_READY; - comm = BytePSGlobal::GetNccl()->GetSignalComm(); - break; - } - case COORDINATE_PUSH: { - sig = PUSH_READY; - comm = BytePSGlobal::GetBasicComm(); - break; - } - default: - BPS_CHECK(0) << "unsupported op: " << this_op; - } - - BPS_CHECK_NE(rank, comm->getRoot()) << "only non-root device should enter COORDINATE loop"; - - struct BytePSCommMsg msg = { rank, sig, key }; - comm->sendSignalToRoot(&msg, sizeof(BytePSCommMsg)); - - BPS_CHECK(task->tensor_name != ""); - BPS_LOG(TRACE) << task->tensor_name - << " send coordinate info: " - << "Signal=" << sig - << ", rank=" << rank - << ", key=" << key; - - } else { - std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + } + queue_list.erase(queue_list.begin()); + if (queue_list.size() > 0) { + BPS_CHECK(task->tensor_name != ""); + BPS_LOG(TRACE) << "Rank=" << BytePSGlobal::GetRank() << " finishes " + << LogStrings[this_op] << ", tensor: " << task->tensor_name + << ", key=" << task->key << "; Passing to the next queue."; + BytePSGlobal::GetScheduledQueue(queue_list[0])->addTask(task); + } else { + BPS_CHECK(task->counter_ptr) << task->tensor_name << " counter_ptr is null"; + int v = task->counter_ptr.get()->fetch_add(1); + if (v == (int)(task->total_partnum - 1)) { + BPS_CHECK(task->tensor_name != ""); + BPS_LOG(TRACE) << "Rank=" << BytePSGlobal::GetRank() + << " finish processing tensor: " << task->tensor_name; + task->callback(Status::OK()); } - return true; + } + return; } -inline void PostNcclCalls(std::shared_ptr task, QueueType this_op) { - - BPS_CHECK(this_op == REDUCE || this_op == BROADCAST) << "Only REDUCE and BROADCAST use NCCL."; - auto tensor = (this_op == REDUCE) ? 
task->tensor : task->output; - BPS_CHECK(tensor); - BPS_CHECK_EQ(0, tensor->size() % tensor->shape().num_elements()); - - auto key = task->key; - auto len = task->len; - auto offset = task->offset; - auto unit_len = tensor->size() / tensor->shape().num_elements(); - auto p = (char*)(tensor->data()) + offset; - if (task->device == CPU_DEVICE_ID) { - p = (char*)(task->gpu_ptr) + offset; +bool RunCoordinateLoopOnce(QueueType this_op) { + auto q = BytePSGlobal::GetScheduledQueue(this_op); + auto task = q->getTask(); + if (task) { + int rank = BytePSGlobal::GetLocalRank(); + auto key = task->key; + + // first send to next queue and then broadcast signal + // to guarantee the entry is available when getTask(key) at Reduce/Broadcast + // thread + FinishOrProceed(task); + + BytePSCommSignal sig; + std::shared_ptr comm; + + switch (this_op) { + case COORDINATE_REDUCE: { + sig = REDUCE_READY; + comm = BytePSGlobal::GetNccl()->GetSignalComm(); + break; + } + case COORDINATE_BROADCAST: { + sig = BCAST_READY; + comm = BytePSGlobal::GetNccl()->GetSignalComm(); + break; + } + case COORDINATE_PUSH: { + sig = PUSH_READY; + comm = BytePSGlobal::GetBasicComm(); + break; + } + default: + BPS_CHECK(0) << "unsupported op: " << this_op; } - auto nccl_dtype = getNcclDataType(tensor->dtype()); + BPS_CHECK_NE(rank, comm->getRoot()) + << "only non-root device should enter COORDINATE loop"; - auto nccl = BytePSGlobal::GetNccl(); - auto nccl_stream = nccl->GetStream(key, this_op); - auto nccl_comm = nccl->GetComm(key, this_op); - auto nccl_root = nccl->GetRoot(key, this_op); - auto nccl_size = nccl->GetSize(); - auto nccl_rank = nccl->GetRank(key, this_op); - - auto num_elem_per_gpu = len / nccl_size / unit_len; - auto left_elem = (len / unit_len) - (num_elem_per_gpu * nccl_size); + struct BytePSCommMsg msg = {rank, sig, key}; + comm->sendSignalToRoot(&msg, sizeof(BytePSCommMsg)); BPS_CHECK(task->tensor_name != ""); - BPS_LOG(TRACE) << task->tensor_name - << " calling NCCL " - << LogStrings[this_op] - << " (rank=" << nccl_rank - << ") key=" << key - << ", elements=" << len/unit_len - << ", device=" << task->device; - - if (this_op == REDUCE) { - // We reduce to task->output except that it is a CPU tensor - auto out_p = (char*)(task->output->data()) + offset; - if (task->device == CPU_DEVICE_ID && task->tensor == task->output) { - out_p = p; - } - - if (num_elem_per_gpu) { - NCCLCHECK(ncclReduceScatter((const void*) p, - (void*) (out_p + nccl_rank * num_elem_per_gpu * unit_len), - (size_t) num_elem_per_gpu, - (ncclDataType_t) nccl_dtype, - (ncclRedOp_t) ncclSum, - (ncclComm_t) nccl_comm, - (cudaStream_t) nccl_stream)); - } - if (left_elem) { - NCCLCHECK(ncclReduce((const void*) (p + len - left_elem * unit_len), - (void*) (out_p + len - left_elem * unit_len), - (size_t) left_elem, - (ncclDataType_t) nccl_dtype, - (ncclRedOp_t) ncclSum, - (int) nccl_root, - (ncclComm_t) nccl_comm, - (cudaStream_t) nccl_stream)); - } - } - else { - if (num_elem_per_gpu) { - NCCLCHECK(ncclAllGather((const void*) (p + nccl_rank * num_elem_per_gpu * unit_len), - (void*) p, - (size_t) num_elem_per_gpu, - (ncclDataType_t) nccl_dtype, - (ncclComm_t) nccl_comm, - (cudaStream_t) nccl_stream)); - } - if (left_elem) { - NCCLCHECK(ncclBroadcast((const void*) (p + len - left_elem * unit_len), - (void*) (p + len - left_elem * unit_len), - (size_t) left_elem, - (ncclDataType_t) nccl_dtype, - (int) nccl_root, - (ncclComm_t) nccl_comm, - (cudaStream_t) nccl_stream)); - } + BPS_LOG(TRACE) << task->tensor_name << " send coordinate info: " + << "Signal=" << 
sig << ", rank=" << rank << ", key=" << key; - } + } else { + std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + } + return true; } +inline void PostNcclCalls( + std::shared_ptr task, QueueType this_op) { + BPS_CHECK(this_op == REDUCE || this_op == BROADCAST) + << "Only REDUCE and BROADCAST use NCCL."; + auto tensor = (this_op == REDUCE) ? task->tensor : task->output; + BPS_CHECK(tensor); + BPS_CHECK_EQ(0, tensor->size() % tensor->shape().num_elements()); + + auto key = task->key; + auto len = task->len; + auto offset = task->offset; + auto unit_len = tensor->size() / tensor->shape().num_elements(); + auto p = (char *)(tensor->data()) + offset; + if (task->device == CPU_DEVICE_ID) { + p = (char *)(task->gpu_ptr) + offset; + } + + auto nccl_dtype = getNcclDataType(tensor->dtype()); + + auto nccl = BytePSGlobal::GetNccl(); + auto nccl_stream = nccl->GetStream(key, this_op); + auto nccl_comm = nccl->GetComm(key, this_op); + auto nccl_root = nccl->GetRoot(key, this_op); + auto nccl_size = nccl->GetSize(); + auto nccl_rank = nccl->GetRank(key, this_op); + + auto num_elem_per_gpu = len / nccl_size / unit_len; + auto left_elem = (len / unit_len) - (num_elem_per_gpu * nccl_size); + + BPS_CHECK(task->tensor_name != ""); + BPS_LOG(TRACE) << task->tensor_name << " calling NCCL " << LogStrings[this_op] + << " (rank=" << nccl_rank << ") key=" << key + << ", elements=" << len / unit_len + << ", device=" << task->device; + + if (this_op == REDUCE) { + // We reduce to task->output except that it is a CPU tensor + auto out_p = (char *)(task->output->data()) + offset; + if (task->device == CPU_DEVICE_ID && task->tensor == task->output) { + out_p = p; + } -bool RunRootNcclLoopOnce() { - auto signal_comm = BytePSGlobal::GetNccl()->GetSignalComm(); - int root = signal_comm->getRoot(); - int rank = BytePSGlobal::GetLocalRank(); - BPS_CHECK_EQ(rank, root); - - int nccl_size = BytePSGlobal::GetNccl()->GetSize(); - QueueType nccl_ops[] = { REDUCE, BROADCAST }; - - auto nccl_entry = std::make_shared(); - auto &tasks = nccl_entry->tasks; - auto &queues = nccl_entry->queues; - - NCCLCHECK(ncclGroupStart()); - for (auto this_op : nccl_ops) { - auto q = BytePSGlobal::GetScheduledQueue(this_op); - for (int i = 0; i < BytePSGlobal::GetNccl()->GetGroupSize(); i++) { - auto task = q->getTask(); - if (!task) { break; } - tasks.push_back(task); - queues.push_back(q); - - if (nccl_size > 1) { - // notify non-root devices - struct BytePSCommMsg msg = { rank, - (this_op == REDUCE) ? 
DO_REDUCE : DO_BROADCAST, - task->key }; - signal_comm->broadcastSignal(&msg, - sizeof(BytePSCommMsg)); - PostNcclCalls(task, this_op); - } - } + if (num_elem_per_gpu) { + NCCLCHECK(ncclReduceScatter( + (const void *)p, + (void *)(out_p + nccl_rank * num_elem_per_gpu * unit_len), + (size_t)num_elem_per_gpu, (ncclDataType_t)nccl_dtype, + (ncclRedOp_t)ncclSum, (ncclComm_t)nccl_comm, + (cudaStream_t)nccl_stream)); } - if (tasks.size()) { - struct BytePSCommMsg msg = { rank, DO_GROUP, 0 }; - signal_comm->broadcastSignal(&msg, sizeof(BytePSCommMsg)); - NCCLCHECK(ncclGroupEnd()); - nccl_entry->RecordEvents(); - BPS_LOG(TRACE) << "NCCL Group size=" << tasks.size() << " rank=" << rank; - BytePSGlobal::GetNccl()->EnqueueGroup(nccl_entry); + if (left_elem) { + NCCLCHECK(ncclReduce((const void *)(p + len - left_elem * unit_len), + (void *)(out_p + len - left_elem * unit_len), + (size_t)left_elem, (ncclDataType_t)nccl_dtype, + (ncclRedOp_t)ncclSum, (int)nccl_root, + (ncclComm_t)nccl_comm, (cudaStream_t)nccl_stream)); } - else { - NCCLCHECK(ncclGroupEnd()); - std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + } else { + if (num_elem_per_gpu) { + NCCLCHECK(ncclAllGather( + (const void *)(p + nccl_rank * num_elem_per_gpu * unit_len), + (void *)p, (size_t)num_elem_per_gpu, (ncclDataType_t)nccl_dtype, + (ncclComm_t)nccl_comm, (cudaStream_t)nccl_stream)); } - - return true; + if (left_elem) { + NCCLCHECK(ncclBroadcast((const void *)(p + len - left_elem * unit_len), + (void *)(p + len - left_elem * unit_len), + (size_t)left_elem, (ncclDataType_t)nccl_dtype, + (int)nccl_root, (ncclComm_t)nccl_comm, + (cudaStream_t)nccl_stream)); + } + } } -bool RunNonRootNcclLoopOnce() { - auto signal_comm = BytePSGlobal::GetNccl()->GetSignalComm(); - int root = signal_comm->getRoot(); - int rank = BytePSGlobal::GetLocalRank(); - BPS_CHECK_NE(rank, root); - - auto nccl_entry = std::make_shared(); - auto &tasks = nccl_entry->tasks; - auto &queues = nccl_entry->queues; - struct BytePSCommMsg msg = {}; - - NCCLCHECK(ncclGroupStart()); - while (1) { - signal_comm->recvSignalFromRoot(&msg, sizeof(BytePSCommMsg)); - if (msg.signal == DO_GROUP) { break; } - QueueType this_op = REDUCE; - if (msg.signal == DO_BROADCAST) { - this_op = BROADCAST; - } - else { - BPS_CHECK_EQ(msg.signal, DO_REDUCE) << msg.signal << ", " << DO_REDUCE; - } - - auto key = msg.key; - - auto q = BytePSGlobal::GetScheduledQueue(this_op); - auto task = q->getTask(key); - BPS_CHECK(task); - - tasks.push_back(task); - queues.push_back(q); +bool RunRootNcclLoopOnce() { + auto signal_comm = BytePSGlobal::GetNccl()->GetSignalComm(); + int root = signal_comm->getRoot(); + int rank = BytePSGlobal::GetLocalRank(); + BPS_CHECK_EQ(rank, root); - PostNcclCalls(task, this_op); + int nccl_size = BytePSGlobal::GetNccl()->GetSize(); + QueueType nccl_ops[] = {REDUCE, BROADCAST}; + + auto nccl_entry = std::make_shared(); + auto &tasks = nccl_entry->tasks; + auto &queues = nccl_entry->queues; + NCCLCHECK(ncclGroupStart()); + for (auto this_op : nccl_ops) { + auto q = BytePSGlobal::GetScheduledQueue(this_op); + for (int i = 0; i < BytePSGlobal::GetNccl()->GetGroupSize(); i++) { + auto task = q->getTask(); + if (!task) { + break; + } + tasks.push_back(task); + queues.push_back(q); + + if (nccl_size > 1) { + // notify non-root devices + struct BytePSCommMsg msg = { + rank, (this_op == REDUCE) ? 
DO_REDUCE : DO_BROADCAST, task->key}; + signal_comm->broadcastSignal(&msg, sizeof(BytePSCommMsg)); + PostNcclCalls(task, this_op); + } } + } + if (tasks.size()) { + struct BytePSCommMsg msg = {rank, DO_GROUP, 0}; + signal_comm->broadcastSignal(&msg, sizeof(BytePSCommMsg)); NCCLCHECK(ncclGroupEnd()); - nccl_entry->RecordEvents(); + BPS_LOG(TRACE) << "NCCL Group size=" << tasks.size() << " rank=" << rank; BytePSGlobal::GetNccl()->EnqueueGroup(nccl_entry); - return true; -} + } else { + NCCLCHECK(ncclGroupEnd()); + std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + } -bool RunSyncNcclOnce() { - auto nccl_entry = BytePSGlobal::GetNccl()->DequeueGroup(); - if (nccl_entry) { - nccl_entry->SynchronizeEvents(); - for (size_t i = 0; i < nccl_entry->tasks.size(); i++) { - FinishOrProceed(nccl_entry->tasks[i]); - } - nccl_entry->DestroyEvents(); - BPS_LOG(TRACE) << "Finished NCCL Group size=" << nccl_entry->tasks.size() - << " rank=" << BytePSGlobal::GetLocalRank(); - } - else { - std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); - } - return true; + return true; } -bool RunCopyDevice2HostLoopOnce() { - QueueType this_op = COPYD2H; - auto q = BytePSGlobal::GetScheduledQueue(this_op); - auto task = q->getTask(); - - if (task) { - auto copy_d2h_Stream = BytePSGlobal::GetCopyDevice2HostStream(); - // If we ran NCCL reduce, we should copy from task->output - auto tensor = (BytePSGlobal::GetNccl()->GetSize() > 1) ? - task->output : task->tensor; - BPS_CHECK(tensor); - auto key = task->key; - - auto nccl = BytePSGlobal::GetNccl(); - auto nccl_root = nccl->GetRoot(key, REDUCE); - auto nccl_size = nccl->GetSize(); - auto nccl_rank = nccl->GetRank(key, REDUCE); - - auto len = task->len; - auto offset = task->offset; - auto p = (char*)(tensor->data()) + offset; - if (task->device == CPU_DEVICE_ID) { - p = (char*)(task->gpu_ptr) + offset; - } - auto unit_len = tensor->size() / tensor->shape().num_elements(); - char* cpubuff; - if (BytePSGlobal::IsCrossPcieSwitch()) { - BPS_CHECK(task->pcie_cpubuff.size()); - cpubuff = (char*)(task->pcie_cpubuff[BytePSGlobal::GetPcieSwitchIndex()]) + offset; - } - else { - cpubuff = (char*)(task->cpubuff) + offset; - } - - BPS_CHECK(cpubuff) << task->tensor_name - << ": CPU buffer not initialized, size=" << len; - - auto num_elem_per_gpu = len / nccl_size / unit_len; - auto left_elem = (len / unit_len) - (num_elem_per_gpu * nccl_size); - - auto copy_len = num_elem_per_gpu * unit_len; - if (left_elem && (nccl_root == nccl_rank)) { - copy_len += left_elem * unit_len; - } - - if (copy_len) { - CUDA_CALL(cudaMemcpyAsync((void *) (cpubuff + nccl_rank * num_elem_per_gpu * unit_len), - (const void *) (p + nccl_rank * num_elem_per_gpu * unit_len), - (size_t) copy_len, - (cudaMemcpyKind) cudaMemcpyDeviceToHost, - (cudaStream_t) *copy_d2h_Stream)); - CUDA_CALL(cudaStreamSynchronize(*copy_d2h_Stream)); - } - - FinishOrProceed(task); +bool RunNonRootNcclLoopOnce() { + auto signal_comm = BytePSGlobal::GetNccl()->GetSignalComm(); + int root = signal_comm->getRoot(); + int rank = BytePSGlobal::GetLocalRank(); + BPS_CHECK_NE(rank, root); + + auto nccl_entry = std::make_shared(); + auto &tasks = nccl_entry->tasks; + auto &queues = nccl_entry->queues; + struct BytePSCommMsg msg = {}; + + NCCLCHECK(ncclGroupStart()); + while (1) { + signal_comm->recvSignalFromRoot(&msg, sizeof(BytePSCommMsg)); + if (msg.signal == DO_GROUP) { + break; } - else { - std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + QueueType this_op = REDUCE; + if (msg.signal == DO_BROADCAST) { + 
this_op = BROADCAST; + } else { + BPS_CHECK_EQ(msg.signal, DO_REDUCE) << msg.signal << ", " << DO_REDUCE; } - return true; -} -bool RunPcieReduceLoopOnce() { - BPS_CHECK(BytePSGlobal::IsCrossPcieSwitch()); - QueueType this_op = PCIE_REDUCE; - auto q = BytePSGlobal::GetScheduledQueue(this_op); - auto task = q->getTask(); - if (task) { - auto reducer = BytePSGlobal::GetCpuReducer(); - if (!reducer->isRoot()) { - // send signal to root - int rank = BytePSGlobal::GetLocalRank(); - auto key = task->key; - BytePSCommSignal sig = PCIE_REDUCE_READY; - struct BytePSCommMsg msg = { rank, sig, key }; - reducer->getComm()->sendSignalToRoot(&msg, sizeof(BytePSCommMsg)); - } - else { - auto tensor = task->tensor; - - auto key = task->key; - auto len = task->len; - auto offset = task->offset; - auto unit_len = tensor->size() / tensor->shape().num_elements(); - - auto nccl = BytePSGlobal::GetNccl(); - auto nccl_root = nccl->GetRoot(key, REDUCE); - auto nccl_size = nccl->GetSize(); - auto nccl_rank = nccl->GetRank(key, REDUCE); - - auto num_elem_per_gpu = len / nccl_size / unit_len; - auto left_elem = (len / unit_len) - (num_elem_per_gpu * nccl_size); - - auto copy_len = num_elem_per_gpu * unit_len; - if (left_elem && (nccl_root == nccl_rank)) { - copy_len += left_elem * unit_len; - } - - if (copy_len) { - auto total_offset = offset + nccl_rank * num_elem_per_gpu * unit_len; - - // Below we assume there are only two PCIe switch - // and we run reducer in the context of the second switch - reducer->sum((void*)((char*)(task->cpubuff) + total_offset), - (void*)((char*)(task->pcie_cpubuff[0]) + total_offset), - copy_len, tensor->dtype()); - } - } - - FinishOrProceed(task); - } - else { - std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); - } - return true; -} + auto key = msg.key; -bool RunPushLoopOnce() { - QueueType this_op = PUSH; auto q = BytePSGlobal::GetScheduledQueue(this_op); - auto task = q->getTask(); - if (task) { - BPS_CHECK(BytePSGlobal::IsRootDevice()) << "only root device should enter PUSH loop"; - - if (BytePSGlobal::IsDistributed()) { - auto offset = task->offset; - auto len = task->len; - - char* data; - BPS_CHECK(task->cpubuff); - data = const_cast(static_cast(task->cpubuff) + offset); - - // get metadata - const int dtype = task->tensor->dtype(); - - // false means not to delete data when SArray is deleted - ps::SArray vals(data, len, false); - - int cmd = GetCommandType(RequestType::kDefaultPushPull, dtype); - auto& pskv = BytePSGlobal::EncodeDefaultKey(task->key, len); - BytePSGlobal::GetPS()->ZPush( - pskv.keys, vals, pskv.lens, cmd, - [task, q]() { - FinishOrProceed(task); - } - ); - } - else { - // This is a dummy barrier for IsCrossPcieSwitch() - BPS_CHECK(BytePSGlobal::IsCrossPcieSwitch()); - FinishOrProceed(task); - } - } - else { - std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); - } - return true; + auto task = q->getTask(key); + BPS_CHECK(task); + + tasks.push_back(task); + queues.push_back(q); + + PostNcclCalls(task, this_op); + } + NCCLCHECK(ncclGroupEnd()); + + nccl_entry->RecordEvents(); + BytePSGlobal::GetNccl()->EnqueueGroup(nccl_entry); + return true; } -bool RunPullLoopOnce() { - QueueType this_op = PULL; - auto q = BytePSGlobal::GetScheduledQueue(this_op); - auto task = q->getTask(); - if (task) { - BPS_CHECK(BytePSGlobal::IsRootDevice()) << "only root device should enter PULL loop"; - // TODO: allow merging - auto offset = task->offset; - auto len = task->len; - - char* data; - BPS_CHECK(task->cpubuff); - data = 
const_cast(static_cast(task->cpubuff) + offset); - - - // get metadata - const int dtype = task->output->dtype(); - - // false means not to delete data when SArray is deleted - auto vals = new ps::SArray(data, len, false); - - int cmd = GetCommandType(RequestType::kDefaultPushPull, dtype); - auto& pskv = BytePSGlobal::EncodeDefaultKey(task->key, len); - // issue pull - BytePSGlobal::GetPS()->ZPull( - pskv.keys, vals, &pskv.lens, cmd, - [vals, task, q]() { - delete vals; - FinishOrProceed(task); - }); - } - else { - std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); +bool RunSyncNcclOnce() { + auto nccl_entry = BytePSGlobal::GetNccl()->DequeueGroup(); + if (nccl_entry) { + nccl_entry->SynchronizeEvents(); + for (size_t i = 0; i < nccl_entry->tasks.size(); i++) { + FinishOrProceed(nccl_entry->tasks[i]); } - return true; + nccl_entry->DestroyEvents(); + BPS_LOG(TRACE) << "Finished NCCL Group size=" << nccl_entry->tasks.size() + << " rank=" << BytePSGlobal::GetLocalRank(); + } else { + std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + } + return true; } -void CopyHost2Device(std::shared_ptr task) { - auto copy_h2d_stream = BytePSGlobal::GetCopyHost2DeviceStream(); - auto tensor = task->output; +bool RunCopyDevice2HostLoopOnce() { + QueueType this_op = COPYD2H; + auto q = BytePSGlobal::GetScheduledQueue(this_op); + auto task = q->getTask(); + + if (task) { + auto copy_d2h_Stream = BytePSGlobal::GetCopyDevice2HostStream(); + // If we ran NCCL reduce, we should copy from task->output + auto tensor = + (BytePSGlobal::GetNccl()->GetSize() > 1) ? task->output : task->tensor; BPS_CHECK(tensor); - auto key = task->key; + auto key = task->key; + auto nccl = BytePSGlobal::GetNccl(); - auto nccl_root = nccl->GetRoot(key, BROADCAST); + auto nccl_root = nccl->GetRoot(key, REDUCE); auto nccl_size = nccl->GetSize(); - auto nccl_rank = nccl->GetRank(key, BROADCAST); + auto nccl_rank = nccl->GetRank(key, REDUCE); + auto len = task->len; auto offset = task->offset; - auto cpubuff = (char*)(task->cpubuff) + offset; - BPS_CHECK(cpubuff) << task->tensor_name << ": CPU buffer not initialized, size=" << len; - - auto gpu_addr = (char*)(tensor->data()) + offset; + auto p = (char *)(tensor->data()) + offset; if (task->device == CPU_DEVICE_ID) { - gpu_addr = (char*)(task->gpu_ptr) + offset; + p = (char *)(task->gpu_ptr) + offset; } - auto unit_len = tensor->size() / tensor->shape().num_elements(); + char *cpubuff; + if (BytePSGlobal::IsCrossPcieSwitch()) { + BPS_CHECK(task->pcie_cpubuff.size()); + cpubuff = + (char *)(task->pcie_cpubuff[BytePSGlobal::GetPcieSwitchIndex()]) + + offset; + } else { + cpubuff = (char *)(task->cpubuff) + offset; + } + + BPS_CHECK(cpubuff) << task->tensor_name + << ": CPU buffer not initialized, size=" << len; + auto num_elem_per_gpu = len / nccl_size / unit_len; auto left_elem = (len / unit_len) - (num_elem_per_gpu * nccl_size); - + auto copy_len = num_elem_per_gpu * unit_len; if (left_elem && (nccl_root == nccl_rank)) { - copy_len += left_elem * unit_len; + copy_len += left_elem * unit_len; } if (copy_len) { - CUDA_CALL(cudaMemcpyAsync((void *) (gpu_addr + nccl_rank * num_elem_per_gpu * unit_len), - (const void *) (cpubuff + nccl_rank * num_elem_per_gpu * unit_len), - (size_t) copy_len, - (cudaMemcpyKind) cudaMemcpyHostToDevice, - (cudaStream_t) *copy_h2d_stream)); - CUDA_CALL(cudaStreamSynchronize(*copy_h2d_stream)); + CUDA_CALL(cudaMemcpyAsync( + (void *)(cpubuff + nccl_rank * num_elem_per_gpu * unit_len), + (const void *)(p + nccl_rank * num_elem_per_gpu * 
unit_len), + (size_t)copy_len, (cudaMemcpyKind)cudaMemcpyDeviceToHost, + (cudaStream_t)*copy_d2h_Stream)); + CUDA_CALL(cudaStreamSynchronize(*copy_d2h_Stream)); } - return; + FinishOrProceed(task); + } else { + std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + } + return true; } -bool RunRootCopyHost2DeviceLoopOnce() { - QueueType this_op = COPYH2D; - auto q = BytePSGlobal::GetScheduledQueue(this_op); - auto task = q->getTask(); - - if (task) { - auto key = task->key; - int local_rank = BytePSGlobal::GetLocalRank(); - int local_size = BytePSGlobal::GetLocalSize(); - - if (local_size > 1) { - // notify non-root devices - struct BytePSCommMsg msg = { local_rank, - DO_COPYH2D, - key }; - BytePSGlobal::GetBasicComm()->broadcastSignal(&msg, - sizeof(BytePSCommMsg)); - } - CopyHost2Device(task); - - FinishOrProceed(task); +bool RunPcieReduceLoopOnce() { + BPS_CHECK(BytePSGlobal::IsCrossPcieSwitch()); + QueueType this_op = PCIE_REDUCE; + auto q = BytePSGlobal::GetScheduledQueue(this_op); + auto task = q->getTask(); + if (task) { + auto reducer = BytePSGlobal::GetCpuReducer(); + if (!reducer->isRoot()) { + // send signal to root + int rank = BytePSGlobal::GetLocalRank(); + auto key = task->key; + BytePSCommSignal sig = PCIE_REDUCE_READY; + struct BytePSCommMsg msg = {rank, sig, key}; + reducer->getComm()->sendSignalToRoot(&msg, sizeof(BytePSCommMsg)); + } else { + auto tensor = task->tensor; + + auto key = task->key; + auto len = task->len; + auto offset = task->offset; + auto unit_len = tensor->size() / tensor->shape().num_elements(); + + auto nccl = BytePSGlobal::GetNccl(); + auto nccl_root = nccl->GetRoot(key, REDUCE); + auto nccl_size = nccl->GetSize(); + auto nccl_rank = nccl->GetRank(key, REDUCE); + + auto num_elem_per_gpu = len / nccl_size / unit_len; + auto left_elem = (len / unit_len) - (num_elem_per_gpu * nccl_size); + + auto copy_len = num_elem_per_gpu * unit_len; + if (left_elem && (nccl_root == nccl_rank)) { + copy_len += left_elem * unit_len; + } + + if (copy_len) { + auto total_offset = offset + nccl_rank * num_elem_per_gpu * unit_len; + + // Below we assume there are only two PCIe switch + // and we run reducer in the context of the second switch + reducer->sum((void *)((char *)(task->cpubuff) + total_offset), + (void *)((char *)(task->pcie_cpubuff[0]) + total_offset), + copy_len, tensor->dtype()); + } } - else { - std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + + FinishOrProceed(task); + } else { + std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + } + return true; +} + +bool RunPushLoopOnce() { + QueueType this_op = PUSH; + auto q = BytePSGlobal::GetScheduledQueue(this_op); + auto task = q->getTask(); + if (task) { + BPS_CHECK(BytePSGlobal::IsRootDevice()) + << "only root device should enter PUSH loop"; + + if (BytePSGlobal::IsDistributed()) { + auto offset = task->offset; + auto len = task->len; + + char *data; + BPS_CHECK(task->cpubuff); + data = + const_cast(static_cast(task->cpubuff) + offset); + + // get metadata + const int dtype = task->tensor->dtype(); + + // false means not to delete data when SArray is deleted + ps::SArray vals(data, len, false); + + int cmd = GetCommandType(RequestType::kDefaultPushPull, dtype); + auto &pskv = BytePSGlobal::EncodeDefaultKey(task->key, len); + BytePSGlobal::GetPS()->ZPush(pskv.keys, vals, pskv.lens, cmd, + [task, q]() { FinishOrProceed(task); }); + } else { + // This is a dummy barrier for IsCrossPcieSwitch() + BPS_CHECK(BytePSGlobal::IsCrossPcieSwitch()); + FinishOrProceed(task); } - return true; 
+ } else { + std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + } + return true; +} + +bool RunPullLoopOnce() { + QueueType this_op = PULL; + auto q = BytePSGlobal::GetScheduledQueue(this_op); + auto task = q->getTask(); + if (task) { + BPS_CHECK(BytePSGlobal::IsRootDevice()) + << "only root device should enter PULL loop"; + // TODO: allow merging + auto offset = task->offset; + auto len = task->len; + + char *data; + BPS_CHECK(task->cpubuff); + data = + const_cast(static_cast(task->cpubuff) + offset); + + // get metadata + const int dtype = task->output->dtype(); + + // false means not to delete data when SArray is deleted + auto vals = new ps::SArray(data, len, false); + + int cmd = GetCommandType(RequestType::kDefaultPushPull, dtype); + auto &pskv = BytePSGlobal::EncodeDefaultKey(task->key, len); + // issue pull + BytePSGlobal::GetPS()->ZPull(pskv.keys, vals, &pskv.lens, cmd, + [vals, task, q]() { + delete vals; + FinishOrProceed(task); + }); + } else { + std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + } + return true; +} + +void CopyHost2Device(std::shared_ptr task) { + auto copy_h2d_stream = BytePSGlobal::GetCopyHost2DeviceStream(); + auto tensor = task->output; + BPS_CHECK(tensor); + auto key = task->key; + auto nccl = BytePSGlobal::GetNccl(); + auto nccl_root = nccl->GetRoot(key, BROADCAST); + auto nccl_size = nccl->GetSize(); + auto nccl_rank = nccl->GetRank(key, BROADCAST); + auto len = task->len; + auto offset = task->offset; + auto cpubuff = (char *)(task->cpubuff) + offset; + BPS_CHECK(cpubuff) << task->tensor_name + << ": CPU buffer not initialized, size=" << len; + + auto gpu_addr = (char *)(tensor->data()) + offset; + if (task->device == CPU_DEVICE_ID) { + gpu_addr = (char *)(task->gpu_ptr) + offset; + } + + auto unit_len = tensor->size() / tensor->shape().num_elements(); + auto num_elem_per_gpu = len / nccl_size / unit_len; + auto left_elem = (len / unit_len) - (num_elem_per_gpu * nccl_size); + + auto copy_len = num_elem_per_gpu * unit_len; + if (left_elem && (nccl_root == nccl_rank)) { + copy_len += left_elem * unit_len; + } + + if (copy_len) { + CUDA_CALL(cudaMemcpyAsync( + (void *)(gpu_addr + nccl_rank * num_elem_per_gpu * unit_len), + (const void *)(cpubuff + nccl_rank * num_elem_per_gpu * unit_len), + (size_t)copy_len, (cudaMemcpyKind)cudaMemcpyHostToDevice, + (cudaStream_t)*copy_h2d_stream)); + CUDA_CALL(cudaStreamSynchronize(*copy_h2d_stream)); + } + + return; +} + +bool RunRootCopyHost2DeviceLoopOnce() { + QueueType this_op = COPYH2D; + auto q = BytePSGlobal::GetScheduledQueue(this_op); + auto task = q->getTask(); + + if (task) { + auto key = task->key; + int local_rank = BytePSGlobal::GetLocalRank(); + int local_size = BytePSGlobal::GetLocalSize(); + + if (local_size > 1) { + // notify non-root devices + struct BytePSCommMsg msg = {local_rank, DO_COPYH2D, key}; + BytePSGlobal::GetBasicComm()->broadcastSignal(&msg, + sizeof(BytePSCommMsg)); + } + CopyHost2Device(task); + + FinishOrProceed(task); + } else { + std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + } + return true; } bool RunNonRootCopyListenLoopOnce() { - auto signal_comm = BytePSGlobal::GetBasicComm(); - int root = signal_comm->getRoot(); - int rank = BytePSGlobal::GetLocalRank(); - BPS_CHECK_NE(root, rank); + auto signal_comm = BytePSGlobal::GetBasicComm(); + int root = signal_comm->getRoot(); + int rank = BytePSGlobal::GetLocalRank(); + BPS_CHECK_NE(root, rank); - struct BytePSCommMsg msg = {}; + struct BytePSCommMsg msg = {}; - 
signal_comm->recvSignalFromRoot(&msg, sizeof(BytePSCommMsg)); - BPS_CHECK_EQ(msg.signal, DO_COPYH2D) << msg.signal; + signal_comm->recvSignalFromRoot(&msg, sizeof(BytePSCommMsg)); + BPS_CHECK_EQ(msg.signal, DO_COPYH2D) << msg.signal; - BytePSGlobal::GetCopyTable()->AddReadyCount(msg.key); + BytePSGlobal::GetCopyTable()->AddReadyCount(msg.key); - BPS_LOG(TRACE) << "NonRootCopyListenLoop recved from root" - << ", signal=" << msg.signal - << ", key=" << msg.key - << ", myrank=" << rank; - return true; + BPS_LOG(TRACE) << "NonRootCopyListenLoop recved from root" + << ", signal=" << msg.signal << ", key=" << msg.key + << ", myrank=" << rank; + return true; } bool RunNonRootCopyHost2DeviceLoopOnce() { - QueueType this_op = COPYH2D; - auto q = BytePSGlobal::GetScheduledQueue(this_op); - auto task = q->getTask(); - - if (task) { - CopyHost2Device(task); - FinishOrProceed(task); - } else { - std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); - } - return true; + QueueType this_op = COPYH2D; + auto q = BytePSGlobal::GetScheduledQueue(this_op); + auto task = q->getTask(); + + if (task) { + CopyHost2Device(task); + FinishOrProceed(task); + } else { + std::this_thread::sleep_for(std::chrono::nanoseconds(1000)); + } + return true; } void CoordinateReduceLoop() { - while (RunCoordinateLoopOnce(COORDINATE_REDUCE) && !BytePSGlobal::ShouldShutdown()) {} + while (RunCoordinateLoopOnce(COORDINATE_REDUCE) && + !BytePSGlobal::ShouldShutdown()) { + } } void CoordinateBroadcastLoop() { - while (RunCoordinateLoopOnce(COORDINATE_BROADCAST) && !BytePSGlobal::ShouldShutdown()) {} + while (RunCoordinateLoopOnce(COORDINATE_BROADCAST) && + !BytePSGlobal::ShouldShutdown()) { + } } void CoordinatePushLoop() { - while (RunCoordinateLoopOnce(COORDINATE_PUSH) && !BytePSGlobal::ShouldShutdown()) {} + while (RunCoordinateLoopOnce(COORDINATE_PUSH) && + !BytePSGlobal::ShouldShutdown()) { + } } void PcieReduceLoop() { - CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); - while (RunPcieReduceLoopOnce() && !BytePSGlobal::ShouldShutdown()) {} + CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); + while (RunPcieReduceLoopOnce() && !BytePSGlobal::ShouldShutdown()) { + } } void RootNcclLoop() { - CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); - while (RunRootNcclLoopOnce() && !BytePSGlobal::ShouldShutdown()) {} + CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); + while (RunRootNcclLoopOnce() && !BytePSGlobal::ShouldShutdown()) { + } } void NonRootNcclLoop() { - CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); - while (RunNonRootNcclLoopOnce() && !BytePSGlobal::ShouldShutdown()) {} + CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); + while (RunNonRootNcclLoopOnce() && !BytePSGlobal::ShouldShutdown()) { + } } void SyncNcclLoop() { - CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); - while (RunSyncNcclOnce() && !BytePSGlobal::ShouldShutdown()) {} + CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); + while (RunSyncNcclOnce() && !BytePSGlobal::ShouldShutdown()) { + } } void CopyDevice2HostLoop() { - CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); - while (RunCopyDevice2HostLoopOnce() && !BytePSGlobal::ShouldShutdown()) {} + CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); + while (RunCopyDevice2HostLoopOnce() && !BytePSGlobal::ShouldShutdown()) { + } } void PushLoop() { - while (RunPushLoopOnce() && !BytePSGlobal::ShouldShutdown()) {} + while (RunPushLoopOnce() && !BytePSGlobal::ShouldShutdown()) { + } } void PullLoop() { - while (RunPullLoopOnce() && 
!BytePSGlobal::ShouldShutdown()) {} + while (RunPullLoopOnce() && !BytePSGlobal::ShouldShutdown()) { + } } void RootCopyHost2DeviceLoop() { - CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); - while (RunRootCopyHost2DeviceLoopOnce() && !BytePSGlobal::ShouldShutdown()) {} + CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); + while (RunRootCopyHost2DeviceLoopOnce() && !BytePSGlobal::ShouldShutdown()) { + } } void NonRootCopyListenLoop() { - CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); - while (RunNonRootCopyListenLoopOnce() && !BytePSGlobal::ShouldShutdown()) {} + CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); + while (RunNonRootCopyListenLoopOnce() && !BytePSGlobal::ShouldShutdown()) { + } } void NonRootCopyHost2DeviceLoop() { - CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); - while (RunNonRootCopyHost2DeviceLoopOnce() && !BytePSGlobal::ShouldShutdown()) {} + CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); + while (RunNonRootCopyHost2DeviceLoopOnce() && + !BytePSGlobal::ShouldShutdown()) { + } } - -} // namespace common -} // namespace byteps +} // namespace common +} // namespace byteps diff --git a/byteps/common/core_loops.h b/byteps/common/core_loops.h index b668e8b9d..561eea17c 100644 --- a/byteps/common/core_loops.h +++ b/byteps/common/core_loops.h @@ -45,7 +45,7 @@ void NonRootCopyListenLoop(); void NonRootCopyHost2DeviceLoop(); -} // namespace common -} // namespace byteps +} // namespace common +} // namespace byteps -#endif // BYTEPS_CORE_LOOPS_H \ No newline at end of file +#endif // BYTEPS_CORE_LOOPS_H diff --git a/byteps/common/cpu_reducer.cc b/byteps/common/cpu_reducer.cc index f566f0803..4cd8790f0 100644 --- a/byteps/common/cpu_reducer.cc +++ b/byteps/common/cpu_reducer.cc @@ -19,270 +19,267 @@ namespace byteps { namespace common { CpuReducer::CpuReducer(std::shared_ptr comm) { - std::vector peers; - auto pcie_size = BytePSGlobal::GetPcieSwitchSize(); - for (int i = BytePSGlobal::GetLocalRank() % pcie_size; - i < BytePSGlobal::GetLocalSize(); - i += pcie_size) { - peers.push_back(i); - } - _comm = std::make_shared(comm, std::string("cpu"), peers); - if (getenv("BYTEPS_OMP_THREAD_PER_GPU")) { - _num_threads = atoi(getenv("BYTEPS_OMP_THREAD_PER_GPU")); - } - else { - _num_threads = 4; - } - return; + std::vector peers; + auto pcie_size = BytePSGlobal::GetPcieSwitchSize(); + for (int i = BytePSGlobal::GetLocalRank() % pcie_size; + i < BytePSGlobal::GetLocalSize(); i += pcie_size) { + peers.push_back(i); + } + _comm = std::make_shared(comm, std::string("cpu"), peers); + if (getenv("BYTEPS_OMP_THREAD_PER_GPU")) { + _num_threads = atoi(getenv("BYTEPS_OMP_THREAD_PER_GPU")); + } else { + _num_threads = 4; + } + return; } bool CpuReducer::isRoot() { - return (_comm->getRoot() == BytePSGlobal::GetLocalRank()); + return (_comm->getRoot() == BytePSGlobal::GetLocalRank()); } int CpuReducer::sum(void* dst, void* src, size_t len, DataType dtype) { - switch (dtype) { - case BYTEPS_FLOAT32: - return _sum_float32(dst, src, len); - case BYTEPS_FLOAT64: - return _sum_float64(dst, src, len); - case BYTEPS_FLOAT16: - return _sum_float16(dst, src, len); - case BYTEPS_UINT8: - return _sum_unit8(dst, src, len); - case BYTEPS_INT32: - return _sum_int32(dst, src, len); - case BYTEPS_INT8: - return _sum_int8(dst, src, len); - case BYTEPS_INT64: - return _sum_int64(dst, src, len); - default: - BPS_CHECK(0) << "Unsupported data type: " << dtype; - } - return 0; + switch (dtype) { + case BYTEPS_FLOAT32: + return _sum_float32(dst, src, len); + case 
BYTEPS_FLOAT64: + return _sum_float64(dst, src, len); + case BYTEPS_FLOAT16: + return _sum_float16(dst, src, len); + case BYTEPS_UINT8: + return _sum_unit8(dst, src, len); + case BYTEPS_INT32: + return _sum_int32(dst, src, len); + case BYTEPS_INT8: + return _sum_int8(dst, src, len); + case BYTEPS_INT64: + return _sum_int64(dst, src, len); + default: + BPS_CHECK(0) << "Unsupported data type: " << dtype; + } + return 0; } int CpuReducer::_sum_float32(void* dst, void* src, size_t len) { - auto d = (float*)dst; - auto s = (float*)src; + auto d = (float*)dst; + auto s = (float*)src; #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < len / (size_t) 4; ++i) { - d[i] = d[i] + s[i]; - } - return 0; + for (size_t i = 0; i < len / (size_t)4; ++i) { + d[i] = d[i] + s[i]; + } + return 0; } int CpuReducer::_sum_float64(void* dst, void* src, size_t len) { - auto d = (double*)dst; - auto s = (double*)src; + auto d = (double*)dst; + auto s = (double*)src; #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < len / (size_t) 8; ++i) { - d[i] = d[i] + s[i]; - } - return 0; + for (size_t i = 0; i < len / (size_t)8; ++i) { + d[i] = d[i] + s[i]; + } + return 0; } int CpuReducer::_sum_float16(void* dst, void* src, size_t len) { - // cast src and dst to your float16 type - auto in = (unsigned short*)src; - auto inout = (unsigned short*)dst; - len = len / (size_t) 2; + // cast src and dst to your float16 type + auto in = (unsigned short*)src; + auto inout = (unsigned short*)dst; + len = len / (size_t)2; #if __AVX__ && __F16C__ - if (is_avx_and_f16c()) { + if (is_avx_and_f16c()) { #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < (size_t) (len / 8) * 8; i += 8) { - // convert in & inout to m256 - __m256 in_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in + i))); - __m256 inout_m256 = - _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(inout + i))); - - // add them together to new_inout_m256 - __m256 new_inout_m256 = _mm256_add_ps(in_m256, inout_m256); + for (size_t i = 0; i < (size_t)(len / 8) * 8; i += 8) { + // convert in & inout to m256 + __m256 in_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in + i))); + __m256 inout_m256 = + _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(inout + i))); - // convert back and store in inout - __m128i new_inout_m128i = _mm256_cvtps_ph(new_inout_m256, 0); - _mm_storeu_si128((__m128i*)(inout + i), new_inout_m128i); + // add them together to new_inout_m256 + __m256 new_inout_m256 = _mm256_add_ps(in_m256, inout_m256); - } + // convert back and store in inout + __m128i new_inout_m128i = _mm256_cvtps_ph(new_inout_m256, 0); + _mm_storeu_si128((__m128i*)(inout + i), new_inout_m128i); } + } #endif - for (size_t i = (len / 8) * 8; i < (size_t) len; ++i) { - float in_float; - float inout_float; - HalfBits2Float(in + i, &in_float); - HalfBits2Float(inout + i, &inout_float); - inout_float += in_float; - Float2HalfBits(&inout_float, inout + i); - } + for (size_t i = (len / 8) * 8; i < (size_t)len; ++i) { + float in_float; + float inout_float; + HalfBits2Float(in + i, &in_float); + HalfBits2Float(inout + i, &inout_float); + inout_float += in_float; + Float2HalfBits(&inout_float, inout + i); + } - return 0; + return 0; } int CpuReducer::_sum_unit8(void* dst, void* src, size_t len) { - auto d = (unsigned char*)dst; - auto s = (unsigned char*)src; + auto d = (unsigned char*)dst; + auto s = (unsigned char*)src; #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < len; ++i) { - d[i] = d[i] 
+ s[i]; - } - return 0; + for (size_t i = 0; i < len; ++i) { + d[i] = d[i] + s[i]; + } + return 0; } int CpuReducer::_sum_int32(void* dst, void* src, size_t len) { - auto d = (int*)dst; - auto s = (int*)src; + auto d = (int*)dst; + auto s = (int*)src; #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < len / (size_t) 4; ++i) { - d[i] = d[i] + s[i]; - } - return 0; + for (size_t i = 0; i < len / (size_t)4; ++i) { + d[i] = d[i] + s[i]; + } + return 0; } int CpuReducer::_sum_int8(void* dst, void* src, size_t len) { - auto d = (signed char*)dst; - auto s = (signed char*)src; + auto d = (signed char*)dst; + auto s = (signed char*)src; #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < len; ++i) { - d[i] = d[i] + s[i]; - } - return 0; + for (size_t i = 0; i < len; ++i) { + d[i] = d[i] + s[i]; + } + return 0; } int CpuReducer::_sum_int64(void* dst, void* src, size_t len) { - auto d = (long long*)dst; - auto s = (long long*)src; + auto d = (long long*)dst; + auto s = (long long*)src; #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < len / (size_t) 8; ++i) { - d[i] = d[i] + s[i]; - } - return 0; + for (size_t i = 0; i < len / (size_t)8; ++i) { + d[i] = d[i] + s[i]; + } + return 0; } -int CpuReducer::sum(void* dst, void* src1, void* src2, size_t len, DataType dtype) { - switch (dtype) { - case BYTEPS_FLOAT32: - return _sum_float32(dst, src1, src2, len); - case BYTEPS_FLOAT64: - return _sum_float64(dst, src1, src2, len); - case BYTEPS_FLOAT16: - return _sum_float16(dst, src1, src2, len); - case BYTEPS_UINT8: - return _sum_unit8(dst, src1, src2, len); - case BYTEPS_INT32: - return _sum_int32(dst, src1, src2, len); - case BYTEPS_INT8: - return _sum_int8(dst, src1, src2, len); - case BYTEPS_INT64: - return _sum_int64(dst, src1, src2, len); - default: - BPS_CHECK(0) << "Unsupported data type: " << dtype; - } - return 0; +int CpuReducer::sum(void* dst, void* src1, void* src2, size_t len, + DataType dtype) { + switch (dtype) { + case BYTEPS_FLOAT32: + return _sum_float32(dst, src1, src2, len); + case BYTEPS_FLOAT64: + return _sum_float64(dst, src1, src2, len); + case BYTEPS_FLOAT16: + return _sum_float16(dst, src1, src2, len); + case BYTEPS_UINT8: + return _sum_unit8(dst, src1, src2, len); + case BYTEPS_INT32: + return _sum_int32(dst, src1, src2, len); + case BYTEPS_INT8: + return _sum_int8(dst, src1, src2, len); + case BYTEPS_INT64: + return _sum_int64(dst, src1, src2, len); + default: + BPS_CHECK(0) << "Unsupported data type: " << dtype; + } + return 0; } int CpuReducer::_sum_float32(void* dst, void* src1, void* src2, size_t len) { - auto d = (float*)dst; - auto s1 = (float*)src1; - auto s2 = (float*)src2; + auto d = (float*)dst; + auto s1 = (float*)src1; + auto s2 = (float*)src2; #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < len / (size_t) 4; ++i) { - d[i] = s1[i] + s2[i]; - } - return 0; + for (size_t i = 0; i < len / (size_t)4; ++i) { + d[i] = s1[i] + s2[i]; + } + return 0; } int CpuReducer::_sum_float64(void* dst, void* src1, void* src2, size_t len) { - auto d = (double*)dst; - auto s1 = (double*)src1; - auto s2 = (double*)src2; + auto d = (double*)dst; + auto s1 = (double*)src1; + auto s2 = (double*)src2; #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < len / (size_t) 8; ++i) { - d[i] = s1[i] + s2[i]; - } - return 0; + for (size_t i = 0; i < len / (size_t)8; ++i) { + d[i] = s1[i] + s2[i]; + } + return 0; } int CpuReducer::_sum_float16(void* dst, 
void* src1, void* src2, size_t len) { - // cast src and dst to your float16 type - auto in1 = (unsigned short*)src1; - auto in2 = (unsigned short*)src2; - auto out = (unsigned short*)dst; - len = len / (size_t) 2; + // cast src and dst to your float16 type + auto in1 = (unsigned short*)src1; + auto in2 = (unsigned short*)src2; + auto out = (unsigned short*)dst; + len = len / (size_t)2; #if __AVX__ && __F16C__ - if (is_avx_and_f16c()) { + if (is_avx_and_f16c()) { #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < (size_t) (len / 8) * 8; i += 8) { - // convert in1 & in2 to m256 - __m256 in_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in1 + i))); - __m256 inout_m256 = - _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in2 + i))); + for (size_t i = 0; i < (size_t)(len / 8) * 8; i += 8) { + // convert in1 & in2 to m256 + __m256 in_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in1 + i))); + __m256 inout_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in2 + i))); - // add them together to new_inout_m256 - __m256 new_inout_m256 = _mm256_add_ps(in_m256, inout_m256); + // add them together to new_inout_m256 + __m256 new_inout_m256 = _mm256_add_ps(in_m256, inout_m256); - // convert back and store in out - __m128i new_inout_m128i = _mm256_cvtps_ph(new_inout_m256, 0); - _mm_storeu_si128((__m128i*)(out + i), new_inout_m128i); - } + // convert back and store in out + __m128i new_inout_m128i = _mm256_cvtps_ph(new_inout_m256, 0); + _mm_storeu_si128((__m128i*)(out + i), new_inout_m128i); } + } #endif - for (size_t i = (size_t) (len / 8) * 8; i < (size_t) len; ++i) { - float in1_float; - float in2_float; - float out_float; - HalfBits2Float(in1 + i, &in1_float); - HalfBits2Float(in2 + i, &in2_float); - out_float = in1_float + in2_float; - Float2HalfBits(&out_float, out + i); - } - return 0; + for (size_t i = (size_t)(len / 8) * 8; i < (size_t)len; ++i) { + float in1_float; + float in2_float; + float out_float; + HalfBits2Float(in1 + i, &in1_float); + HalfBits2Float(in2 + i, &in2_float); + out_float = in1_float + in2_float; + Float2HalfBits(&out_float, out + i); + } + return 0; } int CpuReducer::_sum_unit8(void* dst, void* src1, void* src2, size_t len) { - auto d = (unsigned char*)dst; - auto s1 = (unsigned char*)src1; - auto s2 = (unsigned char*)src2; + auto d = (unsigned char*)dst; + auto s1 = (unsigned char*)src1; + auto s2 = (unsigned char*)src2; #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < len; ++i) { - d[i] = s1[i] + s2[i]; - } - return 0; + for (size_t i = 0; i < len; ++i) { + d[i] = s1[i] + s2[i]; + } + return 0; } int CpuReducer::_sum_int32(void* dst, void* src1, void* src2, size_t len) { - auto d = (int*)dst; - auto s1 = (int*)src1; - auto s2 = (int*)src2; + auto d = (int*)dst; + auto s1 = (int*)src1; + auto s2 = (int*)src2; #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < len / (size_t) 4; ++i) { - d[i] = s1[i] + s2[i]; - } - return 0; + for (size_t i = 0; i < len / (size_t)4; ++i) { + d[i] = s1[i] + s2[i]; + } + return 0; } int CpuReducer::_sum_int8(void* dst, void* src1, void* src2, size_t len) { - auto d = (signed char*)dst; - auto s1 = (signed char*)src1; - auto s2 = (signed char*)src2; + auto d = (signed char*)dst; + auto s1 = (signed char*)src1; + auto s2 = (signed char*)src2; #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < len; ++i) { - d[i] = s1[i] + s2[i]; - } - return 0; + for (size_t i = 0; i < len; ++i) { + d[i] = s1[i] + s2[i]; + } + return 0; } int 
CpuReducer::_sum_int64(void* dst, void* src1, void* src2, size_t len) { - auto d = (long long*)dst; - auto s1 = (long long*)src1; - auto s2 = (long long*)src2; + auto d = (long long*)dst; + auto s1 = (long long*)src1; + auto s2 = (long long*)src2; #pragma omp parallel for simd num_threads(_num_threads) - for (size_t i = 0; i < len / (size_t) 8; ++i) { - d[i] = s1[i] + s2[i]; - } - return 0; + for (size_t i = 0; i < len / (size_t)8; ++i) { + d[i] = s1[i] + s2[i]; + } + return 0; } -} // namespace common -} // namespace byteps +} // namespace common +} // namespace byteps diff --git a/byteps/common/cpu_reducer.h b/byteps/common/cpu_reducer.h index 2a2afe42a..44d0476eb 100644 --- a/byteps/common/cpu_reducer.h +++ b/byteps/common/cpu_reducer.h @@ -31,162 +31,158 @@ namespace byteps { namespace common { - class CpuReducer { - -public: - CpuReducer(std::shared_ptr comm); - ~CpuReducer() { - if (_comm) _comm.reset(); - BPS_LOG(DEBUG) << "Clear CpuReducer"; - } - - int sum(void* dst, void* src, size_t len, DataType dtype); - int sum(void* dst, void* src1, void* src2, size_t len, DataType dtype); - bool isRoot(); - std::shared_ptr getComm() { return _comm; } - -private: - + public: + CpuReducer(std::shared_ptr comm); + ~CpuReducer() { + if (_comm) _comm.reset(); + BPS_LOG(DEBUG) << "Clear CpuReducer"; + } + + int sum(void* dst, void* src, size_t len, DataType dtype); + int sum(void* dst, void* src1, void* src2, size_t len, DataType dtype); + bool isRoot(); + std::shared_ptr getComm() { return _comm; } + + private: #if __AVX__ && __F16C__ - // Query CPUID to determine AVX and F16C runtime support. - bool is_avx_and_f16c() { - static bool initialized = false; - static bool result = false; - if (!initialized) { - unsigned int eax, ebx, ecx, edx; - if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { - result = (ecx & bit_AVX) && (ecx & bit_F16C); - } - initialized = true; + // Query CPUID to determine AVX and F16C runtime support. 
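// Illustrative aside -- a standalone sketch of the runtime feature probe that
// is_avx_and_f16c() (defined just below) performs: CPUID leaf 1 reports the
// AVX and F16C feature bits in ECX. This sketch relies on GCC/Clang's
// <cpuid.h> (__get_cpuid, bit_AVX, bit_F16C) and is x86-specific; other
// toolchains would need their own intrinsic.

#include <cpuid.h>
#include <cstdio>

static bool HasAvxAndF16c() {
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  // Leaf 1: basic CPU features; __get_cpuid returns 0 if the leaf is
  // unsupported on this CPU.
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return false;
  return (ecx & bit_AVX) && (ecx & bit_F16C);
}

int main() {
  std::printf("AVX + F16C available: %s\n", HasAvxAndF16c() ? "yes" : "no");
  return 0;
}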
+ bool is_avx_and_f16c() { + static bool initialized = false; + static bool result = false; + if (!initialized) { + unsigned int eax, ebx, ecx, edx; + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { + result = (ecx & bit_AVX) && (ecx & bit_F16C); } - return result; + initialized = true; } + return result; + } #endif - inline void HalfBits2Float(unsigned short* src, float* res) { - unsigned h = *src; - int sign = ((h >> 15) & 1); - int exp = ((h >> 10) & 0x1f); - int mantissa = (h & 0x3ff); - unsigned f = 0; - - if (exp > 0 && exp < 31) { - // normal - exp += 112; - f = (sign << 31) | (exp << 23) | (mantissa << 13); - } else if (exp == 0) { - if (mantissa) { - // subnormal - exp += 113; - while ((mantissa & (1 << 10)) == 0) { - mantissa <<= 1; - exp--; - } - mantissa &= 0x3ff; - f = (sign << 31) | (exp << 23) | (mantissa << 13); - } else { - // sign-preserving zero - f = (sign << 31); - } - } else if (exp == 31) { - if (mantissa) { - f = 0x7fffffff; // not a number - } else { - f = (0xff << 23) | (sign << 31); // inf + inline void HalfBits2Float(unsigned short* src, float* res) { + unsigned h = *src; + int sign = ((h >> 15) & 1); + int exp = ((h >> 10) & 0x1f); + int mantissa = (h & 0x3ff); + unsigned f = 0; + + if (exp > 0 && exp < 31) { + // normal + exp += 112; + f = (sign << 31) | (exp << 23) | (mantissa << 13); + } else if (exp == 0) { + if (mantissa) { + // subnormal + exp += 113; + while ((mantissa & (1 << 10)) == 0) { + mantissa <<= 1; + exp--; } + mantissa &= 0x3ff; + f = (sign << 31) | (exp << 23) | (mantissa << 13); + } else { + // sign-preserving zero + f = (sign << 31); + } + } else if (exp == 31) { + if (mantissa) { + f = 0x7fffffff; // not a number + } else { + f = (0xff << 23) | (sign << 31); // inf } - - *res = *reinterpret_cast(&f); } - inline void Float2HalfBits(float* src, unsigned short* dest) { - // software implementation rounds toward nearest even - unsigned const& s = *reinterpret_cast(src); - uint16_t sign = uint16_t((s >> 16) & 0x8000); - int16_t exp = uint16_t(((s >> 23) & 0xff) - 127); - int mantissa = s & 0x7fffff; - uint16_t u = 0; + *res = *reinterpret_cast(&f); + } + + inline void Float2HalfBits(float* src, unsigned short* dest) { + // software implementation rounds toward nearest even + unsigned const& s = *reinterpret_cast(src); + uint16_t sign = uint16_t((s >> 16) & 0x8000); + int16_t exp = uint16_t(((s >> 23) & 0xff) - 127); + int mantissa = s & 0x7fffff; + uint16_t u = 0; + + if ((s & 0x7fffffff) == 0) { + // sign-preserving zero + *dest = sign; + return; + } - if ((s & 0x7fffffff) == 0) { - // sign-preserving zero - *dest = sign; - return; + if (exp > 15) { + if (exp == 128 && mantissa) { + // not a number + u = 0x7fff; + } else { + // overflow to infinity + u = sign | 0x7c00; } + *dest = u; + return; + } - if (exp > 15) { - if (exp == 128 && mantissa) { - // not a number - u = 0x7fff; - } else { - // overflow to infinity - u = sign | 0x7c00; - } - *dest = u; - return; - } + int sticky_bit = 0; + + if (exp >= -14) { + // normal fp32 to normal fp16 + exp = uint16_t(exp + uint16_t(15)); + u = uint16_t(((exp & 0x1f) << 10)); + u = uint16_t(u | (mantissa >> 13)); + } else { + // normal single-precision to subnormal half_t-precision representation + int rshift = (-14 - exp); + if (rshift < 32) { + mantissa |= (1 << 23); - int sticky_bit = 0; + sticky_bit = ((mantissa & ((1 << rshift) - 1)) != 0); - if (exp >= -14) { - // normal fp32 to normal fp16 - exp = uint16_t(exp + uint16_t(15)); - u = uint16_t(((exp & 0x1f) << 10)); - u = uint16_t(u | (mantissa >> 13)); 
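// Illustrative aside -- the half-precision helpers in this header re-encode
// IEEE binary16 by hand: sign (1 bit) | exponent (5 bits, bias 15) |
// mantissa (10 bits); the exponent re-bias from 15 to 127 is the "+ 112"
// seen in HalfBits2Float. A minimal sketch of the decode direction for
// normal half values only (the common case); DecodeNormalHalf is a
// hypothetical name used just for this example.

#include <cassert>
#include <cstdint>
#include <cstring>

static float DecodeNormalHalf(uint16_t h) {
  uint32_t sign = (h >> 15) & 0x1;
  uint32_t exp = (h >> 10) & 0x1f;  // assumed 1..30, i.e. a normal number
  uint32_t mantissa = h & 0x3ff;
  uint32_t f = (sign << 31) | ((exp + 112) << 23) | (mantissa << 13);
  float out;
  std::memcpy(&out, &f, sizeof(out));  // well-defined bit cast
  return out;
}

int main() {
  assert(DecodeNormalHalf(0x3C00) == 1.0f);       // 1.0 in binary16
  assert(DecodeNormalHalf(0x4248) == 3.140625f);  // nearest half to pi
  assert(DecodeNormalHalf(0xC000) == -2.0f);      // sign bit preserved
  return 0;
}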
+ mantissa = (mantissa >> rshift); + u = (uint16_t(mantissa >> 13) & 0x3ff); } else { - // normal single-precision to subnormal half_t-precision representation - int rshift = (-14 - exp); - if (rshift < 32) { - mantissa |= (1 << 23); - - sticky_bit = ((mantissa & ((1 << rshift) - 1)) != 0); - - mantissa = (mantissa >> rshift); - u = (uint16_t(mantissa >> 13) & 0x3ff); - } else { - mantissa = 0; - u = 0; - } + mantissa = 0; + u = 0; } + } - // round to nearest even - int round_bit = ((mantissa >> 12) & 1); - sticky_bit |= ((mantissa & ((1 << 12) - 1)) != 0); + // round to nearest even + int round_bit = ((mantissa >> 12) & 1); + sticky_bit |= ((mantissa & ((1 << 12) - 1)) != 0); - if ((round_bit && sticky_bit) || (round_bit && (u & 1))) { - u = uint16_t(u + 1); - } + if ((round_bit && sticky_bit) || (round_bit && (u & 1))) { + u = uint16_t(u + 1); + } - u |= sign; + u |= sign; - *dest = u; - } + *dest = u; + } - int _sum_float32(void* dst, void* src, size_t len); - int _sum_float64(void* dst, void* src, size_t len); - int _sum_float16(void* dst, void* src, size_t len); - int _sum_unit8(void* dst, void* src, size_t len); - int _sum_int32(void* dst, void* src, size_t len); - int _sum_int8(void* dst, void* src, size_t len); - int _sum_int64(void* dst, void* src, size_t len); - - int _sum_float32(void* dst, void* src1, void* src2, size_t len); - int _sum_float64(void* dst, void* src1, void* src2, size_t len); - int _sum_float16(void* dst, void* src1, void* src2, size_t len); - int _sum_unit8(void* dst, void* src1, void* src2, size_t len); - int _sum_int32(void* dst, void* src1, void* src2, size_t len); - int _sum_int8(void* dst, void* src1, void* src2, size_t len); - int _sum_int64(void* dst, void* src1, void* src2, size_t len); - - float _convert_half_to_full_precision(uint16_t h); - uint16_t _convert_full_to_half_precision(float f); - - std::shared_ptr _comm; - int _num_threads; -}; + int _sum_float32(void* dst, void* src, size_t len); + int _sum_float64(void* dst, void* src, size_t len); + int _sum_float16(void* dst, void* src, size_t len); + int _sum_unit8(void* dst, void* src, size_t len); + int _sum_int32(void* dst, void* src, size_t len); + int _sum_int8(void* dst, void* src, size_t len); + int _sum_int64(void* dst, void* src, size_t len); + + int _sum_float32(void* dst, void* src1, void* src2, size_t len); + int _sum_float64(void* dst, void* src1, void* src2, size_t len); + int _sum_float16(void* dst, void* src1, void* src2, size_t len); + int _sum_unit8(void* dst, void* src1, void* src2, size_t len); + int _sum_int32(void* dst, void* src1, void* src2, size_t len); + int _sum_int8(void* dst, void* src1, void* src2, size_t len); + int _sum_int64(void* dst, void* src1, void* src2, size_t len); + float _convert_half_to_full_precision(uint16_t h); + uint16_t _convert_full_to_half_precision(float f); + + std::shared_ptr _comm; + int _num_threads; +}; -} // namespace common -} // namespace byteps +} // namespace common +} // namespace byteps -#endif // BYTEPS_CPU_REDUCER_H \ No newline at end of file +#endif // BYTEPS_CPU_REDUCER_H diff --git a/byteps/common/global.cc b/byteps/common/global.cc index a3d39f248..7e7d35892 100644 --- a/byteps/common/global.cc +++ b/byteps/common/global.cc @@ -15,8 +15,8 @@ #include "global.h" #include -#include #include +#include namespace byteps { namespace common { @@ -65,261 +65,267 @@ std::shared_ptr BytePSGlobal::_cpu_reducer; uint64_t BytePSGlobal::_sample_key = std::numeric_limits::max(); BytePSScheduledQueue* BytePSGlobal::GetScheduledQueue(QueueType 
queueType) { - return (BytePSScheduledQueue*)_queues[queueType]; + return (BytePSScheduledQueue*)_queues[queueType]; } void BytePSGlobal::CreateScheduledQueue(QueueType queueType) { - std::lock_guard lock(_queues_mutex[queueType]); - if (!_queues[queueType]) { - _queues[queueType] = new BytePSScheduledQueue(queueType); - } - return; + std::lock_guard lock(_queues_mutex[queueType]); + if (!_queues[queueType]) { + _queues[queueType] = new BytePSScheduledQueue(queueType); + } + return; } void BytePSGlobal::Init() { - std::lock_guard lock(_init_mutex); - - // We only init once - if (_initialized) { - return; - } - - _basic_comm = std::make_shared(); - - _basic_comm->init(&_rank, &_size, &_local_rank, &_local_size, &_worker_id, &_my_role); - - _is_root_device = (_my_role == LOCAL_ROOT) ? true : false; - if (getenv("BYTEPS_PARTITION_BYTES")) { - _partition_bytes = atoi(getenv("BYTEPS_PARTITION_BYTES")); - } - BPS_LOG(DEBUG) << "Partition bound set to " << _partition_bytes << " bytes" - << ", aligned to " << AlignTo(_partition_bytes, (8 * _local_size)) << " bytes"; - // alignment for Reduce-Scatter/All-Gather - _partition_bytes = AlignTo(_partition_bytes, (8 * _local_size)); - - BPS_CHECK(getenv("DMLC_NUM_WORKER")) << "error: env DMLC_NUM_WORKER not set"; - - _num_worker = atoi(getenv("DMLC_NUM_WORKER")); - - if (getenv("BYTEPS_FORCE_DISTRIBUTED")) { - _is_distributed_job = atoi(getenv("BYTEPS_FORCE_DISTRIBUTED")); - } - _is_distributed_job = (_num_worker>1) ? true : _is_distributed_job; - - if (_is_distributed_job) { - BPS_CHECK(getenv("DMLC_NUM_SERVER")) << "error: launch distributed job, but env DMLC_NUM_SERVER not set"; - } - - BPS_LOG(DEBUG) << "Number of worker=" << _num_worker << ", launching " - << (IsDistributed() ? "" : "non-") << "distributed job"; - - _shm_obj = std::make_shared(); // share memory obj - - if (IsDistributed() && _my_role == BytePSRole::LOCAL_ROOT) { // only the root need to do networking - // init low-level ps implementation - _ps = new ps::KVWorker(0, 0); - ps::StartAsync(0, "byteps\0"); - if (!ps::Postoffice::Get()->is_recovery()) { - ps::Postoffice::Get()->Barrier( - 0, ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); - } - } - - // Set to associated GPU - CUDA_CALL(cudaSetDevice(_local_rank)); - - // Init NCCL - _nccl_manager = std::make_shared(_basic_comm); - _is_cross_pcie_switch = (_local_size > _nccl_manager->GetSize()); - - // Bind to NUMA node - if (_is_cross_pcie_switch) { - auto numa_index = (GetPcieSwitchIndex() > numa_max_node()) ? 
- numa_max_node() : GetPcieSwitchIndex(); - numa_bind(numa_parse_nodestring(std::to_string(numa_index).c_str())); - } - - // Init CPU Reducer - if (_is_cross_pcie_switch) { - _cpu_reducer = std::make_shared(_basic_comm); - } - - // ReadyTable for Push & Pull - if (_is_root_device) { - _push_table = new ReadyTable(_local_size-1, "PUSH"); - } - else { - _copy_table = new ReadyTable(1, "COPY"); - } + std::lock_guard lock(_init_mutex); - // ReadyTable for cross-PCIe-switch reduce - if (_is_cross_pcie_switch) { - if (_cpu_reducer->isRoot()) { - _pcie_reduce_table = new ReadyTable(GetPcieSwitchNum()-1, "PCIE_REDUCE"); - } - } - - // ReadyTable for per-PCIe-switch NCCL calls - if (_nccl_manager->IsSignalRoot()) { - _reduce_table = new ReadyTable(GetPcieSwitchSize()-1, "NCCL_REDUCE"); - _broadcast_table = new ReadyTable(GetPcieSwitchSize()-1, "NCCL_BROADCAST"); - } - - // Create CUDA streams for GPU-CPU copies - _copy_host2device_stream = (cudaStream_t*) malloc(sizeof(cudaStream_t) * 1); - _copy_device2host_stream = (cudaStream_t*) malloc(sizeof(cudaStream_t) * 1); - CUDA_CALL(cudaStreamCreateWithFlags(_copy_host2device_stream, cudaStreamNonBlocking)); - CUDA_CALL(cudaStreamCreateWithFlags(_copy_device2host_stream, cudaStreamNonBlocking)); - CUDA_CALL(cudaStreamSynchronize(*_copy_host2device_stream)); - CUDA_CALL(cudaStreamSynchronize(*_copy_device2host_stream)); - - // Create queues - for (int i = 0; i < QueueNum; i++) { - BPS_LOG(DEBUG) << "Create schedule queue " << i; - auto type = static_cast(i); - BytePSGlobal::CreateScheduledQueue(type); + // We only init once + if (_initialized) { + return; + } + + _basic_comm = std::make_shared(); + + _basic_comm->init(&_rank, &_size, &_local_rank, &_local_size, &_worker_id, + &_my_role); + + _is_root_device = (_my_role == LOCAL_ROOT) ? true : false; + if (getenv("BYTEPS_PARTITION_BYTES")) { + _partition_bytes = atoi(getenv("BYTEPS_PARTITION_BYTES")); + } + BPS_LOG(DEBUG) << "Partition bound set to " << _partition_bytes << " bytes" + << ", aligned to " + << AlignTo(_partition_bytes, (8 * _local_size)) << " bytes"; + // alignment for Reduce-Scatter/All-Gather + _partition_bytes = AlignTo(_partition_bytes, (8 * _local_size)); + + BPS_CHECK(getenv("DMLC_NUM_WORKER")) << "error: env DMLC_NUM_WORKER not set"; + + _num_worker = atoi(getenv("DMLC_NUM_WORKER")); + + if (getenv("BYTEPS_FORCE_DISTRIBUTED")) { + _is_distributed_job = atoi(getenv("BYTEPS_FORCE_DISTRIBUTED")); + } + _is_distributed_job = (_num_worker > 1) ? true : _is_distributed_job; + + if (_is_distributed_job) { + BPS_CHECK(getenv("DMLC_NUM_SERVER")) + << "error: launch distributed job, but env DMLC_NUM_SERVER not set"; + } + + BPS_LOG(DEBUG) << "Number of worker=" << _num_worker << ", launching " + << (IsDistributed() ? 
"" : "non-") << "distributed job"; + + _shm_obj = std::make_shared(); // share memory obj + + if (IsDistributed() && + _my_role == + BytePSRole::LOCAL_ROOT) { // only the root need to do networking + // init low-level ps implementation + _ps = new ps::KVWorker(0, 0); + ps::StartAsync(0, "byteps\0"); + if (!ps::Postoffice::Get()->is_recovery()) { + ps::Postoffice::Get()->Barrier( + 0, ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); } - - _initialized = true; - BPS_LOG(DEBUG) << "Inited rank=" << _rank - << " local_rank=" << _local_rank - << " size=" << _size - << " local_size=" << _local_size - << " worker_id=" << _worker_id; - - if (getenv("BYTEPS_DEBUG_SAMPLE_TENSOR")) { - _sample_key = strtoull(getenv("BYTEPS_DEBUG_SAMPLE_TENSOR"), nullptr, 0); + } + + // Set to associated GPU + CUDA_CALL(cudaSetDevice(_local_rank)); + + // Init NCCL + _nccl_manager = std::make_shared(_basic_comm); + _is_cross_pcie_switch = (_local_size > _nccl_manager->GetSize()); + + // Bind to NUMA node + if (_is_cross_pcie_switch) { + auto numa_index = (GetPcieSwitchIndex() > numa_max_node()) + ? numa_max_node() + : GetPcieSwitchIndex(); + numa_bind(numa_parse_nodestring(std::to_string(numa_index).c_str())); + } + + // Init CPU Reducer + if (_is_cross_pcie_switch) { + _cpu_reducer = std::make_shared(_basic_comm); + } + + // ReadyTable for Push & Pull + if (_is_root_device) { + _push_table = new ReadyTable(_local_size - 1, "PUSH"); + } else { + _copy_table = new ReadyTable(1, "COPY"); + } + + // ReadyTable for cross-PCIe-switch reduce + if (_is_cross_pcie_switch) { + if (_cpu_reducer->isRoot()) { + _pcie_reduce_table = + new ReadyTable(GetPcieSwitchNum() - 1, "PCIE_REDUCE"); } - return; + } + + // ReadyTable for per-PCIe-switch NCCL calls + if (_nccl_manager->IsSignalRoot()) { + _reduce_table = new ReadyTable(GetPcieSwitchSize() - 1, "NCCL_REDUCE"); + _broadcast_table = + new ReadyTable(GetPcieSwitchSize() - 1, "NCCL_BROADCAST"); + } + + // Create CUDA streams for GPU-CPU copies + _copy_host2device_stream = (cudaStream_t*)malloc(sizeof(cudaStream_t) * 1); + _copy_device2host_stream = (cudaStream_t*)malloc(sizeof(cudaStream_t) * 1); + CUDA_CALL(cudaStreamCreateWithFlags(_copy_host2device_stream, + cudaStreamNonBlocking)); + CUDA_CALL(cudaStreamCreateWithFlags(_copy_device2host_stream, + cudaStreamNonBlocking)); + CUDA_CALL(cudaStreamSynchronize(*_copy_host2device_stream)); + CUDA_CALL(cudaStreamSynchronize(*_copy_device2host_stream)); + + // Create queues + for (int i = 0; i < QueueNum; i++) { + BPS_LOG(DEBUG) << "Create schedule queue " << i; + auto type = static_cast(i); + BytePSGlobal::CreateScheduledQueue(type); + } + + _initialized = true; + BPS_LOG(DEBUG) << "Inited rank=" << _rank << " local_rank=" << _local_rank + << " size=" << _size << " local_size=" << _local_size + << " worker_id=" << _worker_id; + + if (getenv("BYTEPS_DEBUG_SAMPLE_TENSOR")) { + _sample_key = strtoull(getenv("BYTEPS_DEBUG_SAMPLE_TENSOR"), nullptr, 0); + } + return; } -void BytePSGlobal::Start(const std::vector &func) { - // Start background threads - for (size_t i = 0; i < func.size(); i++) { - _threads.push_back(new std::thread(func[i])); - } - BPS_LOG(DEBUG) << "Started " << func.size() << " background threads. rank=" << _local_rank; +void BytePSGlobal::Start(const std::vector& func) { + // Start background threads + for (size_t i = 0; i < func.size(); i++) { + _threads.push_back(new std::thread(func[i])); + } + BPS_LOG(DEBUG) << "Started " << func.size() + << " background threads. 
rank=" << _local_rank; } - const Status NOT_INITIALIZED_ERROR = Status::PreconditionError( "BytePS has not been initialized; use bps.init()."); Status BytePSGlobal::CheckInit() { - if (_initialized) { - return Status::OK(); - } - else { - return NOT_INITIALIZED_ERROR; - } + if (_initialized) { + return Status::OK(); + } else { + return NOT_INITIALIZED_ERROR; + } } void BytePSGlobal::Shutdown() { - _should_shutdown = true; - for (size_t i = 0; i < _threads.size(); i++) { - if (_threads[i]->joinable()) { - _threads[i]->join(); - delete _threads[i]; - } - } - - for (size_t i = 0; i < QueueNum; i++) { - if (_queues[i]) { - delete _queues[i]; - } + _should_shutdown = true; + for (size_t i = 0; i < _threads.size(); i++) { + if (_threads[i]->joinable()) { + _threads[i]->join(); + delete _threads[i]; } + } - if (_ps) { - ps::Finalize(0, false); - delete _ps; + for (size_t i = 0; i < QueueNum; i++) { + if (_queues[i]) { + delete _queues[i]; } - - CUDA_CALL(cudaStreamDestroy(*_copy_device2host_stream)); - CUDA_CALL(cudaStreamDestroy(*_copy_host2device_stream)); - - if (_reduce_table) { - delete _reduce_table; - } - if (_pcie_reduce_table) { - delete _pcie_reduce_table; - } - if (_broadcast_table) { - delete _broadcast_table; - } - if (_push_table) { - delete _push_table; - } - - if (_copy_table) { - delete _copy_table; - } - - _basic_comm.reset(); - _shm_obj.reset(); - _cpu_reducer.reset(); - _nccl_manager.reset(); - - BPS_LOG(DEBUG) << "Clear all BytePS resources"; - return; + } + + if (_ps) { + ps::Finalize(0, false); + delete _ps; + } + + CUDA_CALL(cudaStreamDestroy(*_copy_device2host_stream)); + CUDA_CALL(cudaStreamDestroy(*_copy_host2device_stream)); + + if (_reduce_table) { + delete _reduce_table; + } + if (_pcie_reduce_table) { + delete _pcie_reduce_table; + } + if (_broadcast_table) { + delete _broadcast_table; + } + if (_push_table) { + delete _push_table; + } + + if (_copy_table) { + delete _copy_table; + } + + _basic_comm.reset(); + _shm_obj.reset(); + _cpu_reducer.reset(); + _nccl_manager.reset(); + + BPS_LOG(DEBUG) << "Clear all BytePS resources"; + return; } -BPSContext& BytePSGlobal::GetContextFromName(const std::string &name) { - std::lock_guard lock(_context_mutex); - BPS_CHECK(_name_to_cxt.find(name) != _name_to_cxt.end()) << name << " is not initialized"; - return _name_to_cxt[name]; +BPSContext& BytePSGlobal::GetContextFromName(const std::string& name) { + std::lock_guard lock(_context_mutex); + BPS_CHECK(_name_to_cxt.find(name) != _name_to_cxt.end()) + << name << " is not initialized"; + return _name_to_cxt[name]; } -bool BytePSGlobal::IsTensorDeclared(const std::string &name) { - std::lock_guard lock(_context_mutex); - if (_name_to_cxt.find(name) == _name_to_cxt.end()) { - _name_to_cxt[name].initialized = false; - _name_to_cxt[name].tensor_name = name.c_str(); // disable copy-on-write - _name_to_cxt[name].declared_key = (ps::Key) next_key_++; - BPS_LOG(DEBUG) << "Declared tensor " << name - << ", declared key (not PS key): " << _name_to_cxt[name].declared_key - << " rank=" << BytePSGlobal::GetLocalRank(); - return false; - } - return true; +bool BytePSGlobal::IsTensorDeclared(const std::string& name) { + std::lock_guard lock(_context_mutex); + if (_name_to_cxt.find(name) == _name_to_cxt.end()) { + _name_to_cxt[name].initialized = false; + _name_to_cxt[name].tensor_name = name.c_str(); // disable copy-on-write + _name_to_cxt[name].declared_key = (ps::Key)next_key_++; + BPS_LOG(DEBUG) << "Declared tensor " << name + << ", declared key (not PS key): " + << 
_name_to_cxt[name].declared_key + << " rank=" << BytePSGlobal::GetLocalRank(); + return false; + } + return true; } PSKV& BytePSGlobal::EncodeDefaultKey(uint64_t key, size_t len) { - std::lock_guard lock(_encode_mutex); - PSKV& pskv = ps_kv_[key]; - if (!pskv.keys.empty()) { - BPS_CHECK_EQ(static_cast(pskv.size), len) - << "The value size cannot be changed " << len - << ". Key is " << key; - } else { - auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); - const int num_servers = krs.size(); - BPS_CHECK_GT(num_servers, 0); - // send it to a single random picked server - int server = (((key >> 16) + key) * 9973) % num_servers; - BPS_LOG(DEBUG) << "key " << key << " assigned to server " << server; - ps::Key ps_key = krs[server].begin() + key; - BPS_CHECK_LT(ps_key, krs[server].end()); - pskv.keys.push_back(ps_key); - pskv.lens.push_back(len); - pskv.size = len; - } - BPS_LOG(TRACE) << "key " << key << " is encoded to " << pskv.keys[0]; - return pskv; + std::lock_guard lock(_encode_mutex); + PSKV& pskv = ps_kv_[key]; + if (!pskv.keys.empty()) { + BPS_CHECK_EQ(static_cast(pskv.size), len) + << "The value size cannot be changed " << len << ". Key is " << key; + } else { + auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); + const int num_servers = krs.size(); + BPS_CHECK_GT(num_servers, 0); + // send it to a single random picked server + int server = (((key >> 16) + key) * 9973) % num_servers; + BPS_LOG(DEBUG) << "key " << key << " assigned to server " << server; + ps::Key ps_key = krs[server].begin() + key; + BPS_CHECK_LT(ps_key, krs[server].end()); + pskv.keys.push_back(ps_key); + pskv.lens.push_back(len); + pskv.size = len; + } + BPS_LOG(TRACE) << "key " << key << " is encoded to " << pskv.keys[0]; + return pskv; } uint32_t BytePSGlobal::GetTensorCount() { - std::lock_guard lock(_context_mutex); - return BytePSGlobal::_name_to_cxt.size(); + std::lock_guard lock(_context_mutex); + return BytePSGlobal::_name_to_cxt.size(); } cudaStream_t* BytePSGlobal::GetCopyDevice2HostStream() { - return BytePSGlobal::_copy_device2host_stream; + return BytePSGlobal::_copy_device2host_stream; } cudaStream_t* BytePSGlobal::GetCopyHost2DeviceStream() { - return BytePSGlobal::_copy_host2device_stream; + return BytePSGlobal::_copy_host2device_stream; } - -} // namespace common -} // namespace byteps +} // namespace common +} // namespace byteps diff --git a/byteps/common/global.h b/byteps/common/global.h index 6264f6e0c..91f957fef 100644 --- a/byteps/common/global.h +++ b/byteps/common/global.h @@ -16,148 +16,149 @@ #ifndef BYTEPS_GLOBAL_H #define BYTEPS_GLOBAL_H +#include #include #include +#include +#include #include #include -#include -#include -#include - #include "common.h" -#include "logging.h" #include "communicator.h" -#include "scheduled_queue.h" -#include "ready_table.h" -#include "shared_memory.h" -#include "nccl_manager.h" #include "cpu_reducer.h" +#include "logging.h" +#include "nccl_manager.h" #include "ps/ps.h" +#include "ready_table.h" +#include "scheduled_queue.h" +#include "shared_memory.h" namespace byteps { namespace common { struct PSKV { - ps::SArray keys; // n keys - ps::SArray lens; // the length of the i-th value - int size; + ps::SArray keys; // n keys + ps::SArray lens; // the length of the i-th value + int size; }; typedef void (*LoopFunction)(); - class BytePSGlobal { - -public: - - static void Init(); - static void Start(const std::vector &func); - static Status CheckInit(); - static bool ShouldShutdown() { return _should_shutdown; } - static void Shutdown(); - - static int 
GetRank() { return _rank; } - static int GetLocalRank() { return _local_rank; } - static int GetSize() { return _size; } - static int GetLocalSize() { return _local_size; } - static int GetWorkerID() { return _worker_id; } - static int GetNumWorker() { return _num_worker; } - static int GetPcieSwitchSize() { return _nccl_manager->GetSize(); } - static int GetPcieSwitchIndex() { return _local_rank / _nccl_manager->GetSize(); } - static int GetPcieSwitchNum() { return _local_size / _nccl_manager->GetSize(); } - static bool IsRootDevice() { return _is_root_device; } - static bool IsDistributed() { return _is_distributed_job; } - static bool IsCrossPcieSwitch() { return _is_cross_pcie_switch; } - static BytePSRole GetMyRole() { return _my_role; } - static std::shared_ptr GetBasicComm() { return _basic_comm; } - static std::shared_ptr GetSharedMemoryObj() { return _shm_obj; } - - static BytePSScheduledQueue* GetScheduledQueue(QueueType queueType); - static void CreateScheduledQueue(QueueType queueType); - static ps::KVWorker* GetPS() { return _ps; } - - static bool IsTensorDeclared(const std::string &name); - static ps::Key GetKeyFromName(const std::string &name); - static BPSContext& GetContextFromName(const std::string &name); - static uint32_t GetTensorCount(); - - static std::unordered_map ps_kv_; - static PSKV& EncodeDefaultKey(uint64_t key, size_t len); - - static uint32_t GetPartitionBound() { return _partition_bytes; } - - static cudaStream_t* GetCopyDevice2HostStream(); - static cudaStream_t* GetCopyHost2DeviceStream(); - - // methods to access or modify the _ready_table - static ReadyTable* GetReduceTable() { return _reduce_table; } - static ReadyTable* GetPcieReduceTable() { return _pcie_reduce_table; } - static ReadyTable* GetBroadcastTable() { return _broadcast_table; } - static ReadyTable* GetPushTable() { return _push_table; } - - // for non-root - static ReadyTable* GetCopyTable() { return _copy_table; } - - static std::shared_ptr GetNccl() { return _nccl_manager; } - static std::shared_ptr GetCpuReducer() { return _cpu_reducer; } - - static bool IsTensorSampled(uint64_t key) { return (key == _sample_key); } - -private: - - static std::mutex _init_mutex; - static volatile bool _initialized; - static volatile bool _should_shutdown; - - static int _rank; - static int _local_rank; - static int _size; - static int _local_size; - static int _worker_id; - static int _num_worker; - static bool _is_root_device; - static bool _is_distributed_job; - static bool _is_cross_pcie_switch; - static BytePSRole _my_role; - static std::shared_ptr _basic_comm; - static std::shared_ptr _shm_obj; - - static volatile BytePSScheduledQueue* _queues[QueueNum]; - static std::mutex _queues_mutex[QueueNum]; - static std::vector _threads; - - static std::mutex _context_mutex; - - static ps::KVWorker* _ps; - static std::mutex _encode_mutex; - static std::unordered_map _name_to_cxt; - - static cudaStream_t* _copy_device2host_stream; - static cudaStream_t* _copy_host2device_stream; - - static uint32_t _partition_bytes; - - // (key, ready_signal_count) pair, only valid for root device - static ReadyTable* _reduce_table; - static ReadyTable* _pcie_reduce_table; - static ReadyTable* _broadcast_table; - static ReadyTable* _push_table; - - // (key, ready_signal_count) pair, only valid for non-root device - static ReadyTable* _copy_table; - - static std::shared_ptr _nccl_manager; - static std::shared_ptr _cpu_reducer; - - // for debug sampling - static uint64_t _sample_key; - - static int AlignTo(int input, int 
alignment) { return input / alignment * alignment; } - + public: + static void Init(); + static void Start(const std::vector& func); + static Status CheckInit(); + static bool ShouldShutdown() { return _should_shutdown; } + static void Shutdown(); + + static int GetRank() { return _rank; } + static int GetLocalRank() { return _local_rank; } + static int GetSize() { return _size; } + static int GetLocalSize() { return _local_size; } + static int GetWorkerID() { return _worker_id; } + static int GetNumWorker() { return _num_worker; } + static int GetPcieSwitchSize() { return _nccl_manager->GetSize(); } + static int GetPcieSwitchIndex() { + return _local_rank / _nccl_manager->GetSize(); + } + static int GetPcieSwitchNum() { + return _local_size / _nccl_manager->GetSize(); + } + static bool IsRootDevice() { return _is_root_device; } + static bool IsDistributed() { return _is_distributed_job; } + static bool IsCrossPcieSwitch() { return _is_cross_pcie_switch; } + static BytePSRole GetMyRole() { return _my_role; } + static std::shared_ptr GetBasicComm() { return _basic_comm; } + static std::shared_ptr GetSharedMemoryObj() { + return _shm_obj; + } + + static BytePSScheduledQueue* GetScheduledQueue(QueueType queueType); + static void CreateScheduledQueue(QueueType queueType); + static ps::KVWorker* GetPS() { return _ps; } + + static bool IsTensorDeclared(const std::string& name); + static ps::Key GetKeyFromName(const std::string& name); + static BPSContext& GetContextFromName(const std::string& name); + static uint32_t GetTensorCount(); + + static std::unordered_map ps_kv_; + static PSKV& EncodeDefaultKey(uint64_t key, size_t len); + + static uint32_t GetPartitionBound() { return _partition_bytes; } + + static cudaStream_t* GetCopyDevice2HostStream(); + static cudaStream_t* GetCopyHost2DeviceStream(); + + // methods to access or modify the _ready_table + static ReadyTable* GetReduceTable() { return _reduce_table; } + static ReadyTable* GetPcieReduceTable() { return _pcie_reduce_table; } + static ReadyTable* GetBroadcastTable() { return _broadcast_table; } + static ReadyTable* GetPushTable() { return _push_table; } + + // for non-root + static ReadyTable* GetCopyTable() { return _copy_table; } + + static std::shared_ptr GetNccl() { return _nccl_manager; } + static std::shared_ptr GetCpuReducer() { return _cpu_reducer; } + + static bool IsTensorSampled(uint64_t key) { return (key == _sample_key); } + + private: + static std::mutex _init_mutex; + static volatile bool _initialized; + static volatile bool _should_shutdown; + + static int _rank; + static int _local_rank; + static int _size; + static int _local_size; + static int _worker_id; + static int _num_worker; + static bool _is_root_device; + static bool _is_distributed_job; + static bool _is_cross_pcie_switch; + static BytePSRole _my_role; + static std::shared_ptr _basic_comm; + static std::shared_ptr _shm_obj; + + static volatile BytePSScheduledQueue* _queues[QueueNum]; + static std::mutex _queues_mutex[QueueNum]; + static std::vector _threads; + + static std::mutex _context_mutex; + + static ps::KVWorker* _ps; + static std::mutex _encode_mutex; + static std::unordered_map _name_to_cxt; + + static cudaStream_t* _copy_device2host_stream; + static cudaStream_t* _copy_host2device_stream; + + static uint32_t _partition_bytes; + + // (key, ready_signal_count) pair, only valid for root device + static ReadyTable* _reduce_table; + static ReadyTable* _pcie_reduce_table; + static ReadyTable* _broadcast_table; + static ReadyTable* _push_table; + + // 
(key, ready_signal_count) pair, only valid for non-root device + static ReadyTable* _copy_table; + + static std::shared_ptr _nccl_manager; + static std::shared_ptr _cpu_reducer; + + // for debug sampling + static uint64_t _sample_key; + + static int AlignTo(int input, int alignment) { + return input / alignment * alignment; + } }; +} // namespace common +} // namespace byteps -} // namespace common -} // namespace byteps - -#endif // BYTEPS_GLOBAL_H +#endif // BYTEPS_GLOBAL_H diff --git a/byteps/common/logging.cc b/byteps/common/logging.cc index 1af3d6adb..263d95d9c 100644 --- a/byteps/common/logging.cc +++ b/byteps/common/logging.cc @@ -14,12 +14,11 @@ // limitations under the License. // ============================================================================= -#include +#include "logging.h" #include -#include +#include #include - -#include "logging.h" +#include namespace byteps { namespace common { @@ -28,7 +27,8 @@ LogMessage::LogMessage(const char* fname, int line, LogLevel severity) : fname_(fname), line_(line), severity_(severity) {} void LogMessage::GenerateLogMessage(bool log_time) { - bool use_cout = static_cast(severity_) <= static_cast(LogLevel::INFO); + bool use_cout = + static_cast(severity_) <= static_cast(LogLevel::INFO); std::ostream& os = use_cout ? std::cout : std::cerr; if (log_time) { auto now = std::chrono::system_clock::now(); @@ -36,18 +36,20 @@ void LogMessage::GenerateLogMessage(bool log_time) { auto duration = now.time_since_epoch(); auto seconds = std::chrono::duration_cast(duration); - auto micros_remainder = std::chrono::duration_cast(duration - seconds); + auto micros_remainder = + std::chrono::duration_cast(duration - + seconds); const size_t time_buffer_size = 30; char time_buffer[time_buffer_size]; strftime(time_buffer, time_buffer_size, "%Y-%m-%d %H:%M:%S", localtime(&as_time_t)); - os << "[" << time_buffer << "." << std::setw(6) << micros_remainder.count() - << ": " << LOG_LEVELS[static_cast(severity_)] << " " - << fname_ << ":" << line_ << "] " << str() << std::endl; + os << "[" << time_buffer << "." 
<< std::setw(6) << micros_remainder.count() + << ": " << LOG_LEVELS[static_cast(severity_)] << " " << fname_ + << ":" << line_ << "] " << str() << std::endl; } else { - os << "[" << LOG_LEVELS[static_cast(severity_)] << " " - << fname_ << ":" << line_ << "] " << str() << std::endl; + os << "[" << LOG_LEVELS[static_cast(severity_)] << " " << fname_ << ":" + << line_ << "] " << str() << std::endl; } } @@ -70,7 +72,8 @@ LogMessageFatal::~LogMessageFatal() { LogLevel ParseLogLevelStr(const char* env_var_val) { std::string min_log_level(env_var_val); - std::transform(min_log_level.begin(), min_log_level.end(), min_log_level.begin(), ::tolower); + std::transform(min_log_level.begin(), min_log_level.end(), + min_log_level.begin(), ::tolower); if (min_log_level == "trace") { return LogLevel::TRACE; } else if (min_log_level == "debug") { @@ -99,13 +102,12 @@ LogLevel MinLogLevelFromEnv() { bool LogTimeFromEnv() { const char* env_var_val = getenv("BYTEPS_LOG_HIDE_TIME"); - if (env_var_val != nullptr && - std::strtol(env_var_val, nullptr, 10) > 0) { + if (env_var_val != nullptr && std::strtol(env_var_val, nullptr, 10) > 0) { return false; } else { return true; } } -} -} \ No newline at end of file +} // namespace common +} // namespace byteps diff --git a/byteps/common/logging.h b/byteps/common/logging.h index 2b5b96edd..391249541 100644 --- a/byteps/common/logging.h +++ b/byteps/common/logging.h @@ -23,15 +23,13 @@ namespace byteps { namespace common { -enum class LogLevel { - TRACE, DEBUG, INFO, WARNING, ERROR, FATAL -}; +enum class LogLevel { TRACE, DEBUG, INFO, WARNING, ERROR, FATAL }; #define LOG_LEVELS "TDIWEF" // Always-on checking -#define BPS_CHECK(x) \ - if (!(x)) \ +#define BPS_CHECK(x) \ + if (!(x)) \ common::LogMessageFatal(__FILE__, __LINE__) << "Check failed: " #x << ' ' #define BPS_CHECK_LT(x, y) BPS_CHECK((x) < (y)) @@ -40,33 +38,32 @@ enum class LogLevel { #define BPS_CHECK_GE(x, y) BPS_CHECK((x) >= (y)) #define BPS_CHECK_EQ(x, y) BPS_CHECK((x) == (y)) #define BPS_CHECK_NE(x, y) BPS_CHECK((x) != (y)) -#define BPS_CHECK_NOTNULL(x) \ - ((x) == NULL \ - ? common::LogMessageFatal(__FILE__, __LINE__) << "Check notnull: " #x << ' ', \ +#define BPS_CHECK_NOTNULL(x) \ + ((x) == NULL ? common::LogMessageFatal(__FILE__, __LINE__) \ + << "Check notnull: " #x << ' ', \ (x) : (x)) // NOLINT(*) - + /*! * \brief Protected CUDA call. * \param func Expression to call. * * It checks for CUDA errors after invocation of the expression. */ -#define CUDA_CALL(func) \ - { \ - cudaError_t e = (func); \ - BPS_CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ - << "CUDA: " << cudaGetErrorString(e); \ +#define CUDA_CALL(func) \ + { \ + cudaError_t e = (func); \ + BPS_CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ + << "CUDA: " << cudaGetErrorString(e); \ } /* * \brief Protected NCCL call. 
*/ -#define NCCLCHECK(cmd) \ -{ \ - ncclResult_t r = (cmd); \ - BPS_CHECK(r == ncclSuccess) \ - << "NCCL error: " << ncclGetErrorString(r); \ -} +#define NCCLCHECK(cmd) \ + { \ + ncclResult_t r = (cmd); \ + BPS_CHECK(r == ncclSuccess) << "NCCL error: " << ncclGetErrorString(r); \ + } class LogMessage : public std::basic_ostringstream { public: @@ -90,18 +87,12 @@ class LogMessageFatal : public LogMessage { ~LogMessageFatal(); }; -#define _BPS_LOG_TRACE \ - LogMessage(__FILE__, __LINE__, LogLevel::TRACE) -#define _BPS_LOG_DEBUG \ - LogMessage(__FILE__, __LINE__, LogLevel::DEBUG) -#define _BPS_LOG_INFO \ - LogMessage(__FILE__, __LINE__, LogLevel::INFO) -#define _BPS_LOG_WARNING \ - LogMessage(__FILE__, __LINE__, LogLevel::WARNING) -#define _BPS_LOG_ERROR \ - LogMessage(__FILE__, __LINE__, LogLevel::ERROR) -#define _BPS_LOG_FATAL \ - LogMessageFatal(__FILE__, __LINE__) +#define _BPS_LOG_TRACE LogMessage(__FILE__, __LINE__, LogLevel::TRACE) +#define _BPS_LOG_DEBUG LogMessage(__FILE__, __LINE__, LogLevel::DEBUG) +#define _BPS_LOG_INFO LogMessage(__FILE__, __LINE__, LogLevel::INFO) +#define _BPS_LOG_WARNING LogMessage(__FILE__, __LINE__, LogLevel::WARNING) +#define _BPS_LOG_ERROR LogMessage(__FILE__, __LINE__, LogLevel::ERROR) +#define _BPS_LOG_FATAL LogMessageFatal(__FILE__, __LINE__) #define _LOG(severity) _BPS_LOG_##severity @@ -113,7 +104,7 @@ class LogMessageFatal : public LogMessage { LogLevel MinLogLevelFromEnv(); bool LogTimeFromEnv(); -} -} +} // namespace common +} // namespace byteps -#endif // BYTEPS_LOGGING_H +#endif // BYTEPS_LOGGING_H diff --git a/byteps/common/nccl_manager.cc b/byteps/common/nccl_manager.cc index 53f8ff4db..b481f4d9f 100644 --- a/byteps/common/nccl_manager.cc +++ b/byteps/common/nccl_manager.cc @@ -14,165 +14,171 @@ // ============================================================================= #include "nccl_manager.h" -#include "logging.h" #include "global.h" +#include "logging.h" namespace byteps { namespace common { void NcclGroupEntry::RecordEvents() { - for (size_t i = 0; i < tasks.size(); i++) { - cudaEvent_t event; - CUDA_CALL(cudaEventCreateWithFlags(&event, cudaEventBlockingSync | cudaEventDisableTiming)); - CUDA_CALL(cudaEventRecord(event, - BytePSGlobal::GetNccl()->GetStream(tasks[i]->key, queues[i]->getQueueType()))); - _events.push_back(event); - } + for (size_t i = 0; i < tasks.size(); i++) { + cudaEvent_t event; + CUDA_CALL(cudaEventCreateWithFlags( + &event, cudaEventBlockingSync | cudaEventDisableTiming)); + CUDA_CALL( + cudaEventRecord(event, BytePSGlobal::GetNccl()->GetStream( + tasks[i]->key, queues[i]->getQueueType()))); + _events.push_back(event); + } } void NcclGroupEntry::SynchronizeEvents() { - for (size_t i = 0; i < tasks.size(); i++) { - CUDA_CALL(cudaEventSynchronize(_events[i])); - } + for (size_t i = 0; i < tasks.size(); i++) { + CUDA_CALL(cudaEventSynchronize(_events[i])); + } } void NcclGroupEntry::DestroyEvents() { - for (size_t i = 0; i < tasks.size(); i++) { - CUDA_CALL(cudaEventDestroy(_events[i])); - } + for (size_t i = 0; i < tasks.size(); i++) { + CUDA_CALL(cudaEventDestroy(_events[i])); + } } NcclManager::NcclManager(std::shared_ptr comm) { - _global_comm = comm; - InitGlobalEnv(); - ConstructRings(); - return; + _global_comm = comm; + InitGlobalEnv(); + ConstructRings(); + return; } ncclComm_t NcclManager::GetComm(uint64_t key, QueueType op) { - return _nccl_comm[key % _nccl_num_rings]; + return _nccl_comm[key % _nccl_num_rings]; } cudaStream_t NcclManager::GetStream(uint64_t key, QueueType op) { - return 
_nccl_stream[key % _nccl_num_rings]; + return _nccl_stream[key % _nccl_num_rings]; } int NcclManager::GetRoot(uint64_t key, QueueType op) { - return _nccl_pcie_size - 1; + return _nccl_pcie_size - 1; } int NcclManager::GetRank(uint64_t key, QueueType op) { - return BytePSGlobal::GetLocalRank() % _nccl_pcie_size; + return BytePSGlobal::GetLocalRank() % _nccl_pcie_size; } bool NcclManager::IsSignalRoot() { - return _signal_comm->getRoot() == BytePSGlobal::GetLocalRank(); + return _signal_comm->getRoot() == BytePSGlobal::GetLocalRank(); } void NcclManager::ConstructRings() { - std::string log_string("Constructing NCCL communicators."); - auto local_rank = BytePSGlobal::GetLocalRank(); - std::vector peers; - int first_peer = local_rank / _nccl_pcie_size * _nccl_pcie_size; - for (int i = first_peer; i < first_peer + (int)_nccl_pcie_size; i++) { - peers.push_back(i); - log_string = log_string + " " + std::to_string(i); - } - _signal_comm = std::make_shared(_global_comm, std::string("nccl"), peers); - BPS_LOG(DEBUG) << log_string; - - // init and sycn NCCL-reduce-id using out-of-band socket - _nccl_id = (ncclUniqueId*) malloc(sizeof(ncclUniqueId) * _nccl_num_rings); - _nccl_comm = (ncclComm_t*) malloc(sizeof(ncclComm_t) * _nccl_num_rings); - _nccl_stream = (cudaStream_t*) malloc(sizeof(cudaStream_t) *_nccl_num_rings); - _nccl_size = _nccl_pcie_size; - int greatest_priority; - CUDA_CALL(cudaDeviceGetStreamPriorityRange(NULL, &greatest_priority)); - - for (size_t i = 0; i < _nccl_num_rings; i++) { - auto nccl_id = _nccl_id + i; - auto nccl_comm = _nccl_comm + i; - auto nccl_stream = _nccl_stream + i; - - // synchronize NCCL IDs - if (local_rank == _signal_comm->getRoot()) { // only root create nccl id - NCCLCHECK(ncclGetUniqueId(nccl_id)); - // the log is just for debug, the actual length of nccl id is 128 - BPS_LOG(DEBUG) << "root nccl_id is " << (*(long long int*)nccl_id); - // TODO: change to BytePSCommSignal format - _signal_comm->broadcastSignal(nccl_id, sizeof(ncclUniqueId)); - - } - else { - int src; - // TODO: change to recvSignalFromRoot after using BytePSCommSignal format - int rc = _signal_comm->recvSignal(&src, nccl_id, sizeof(ncclUniqueId)); - BPS_CHECK_EQ(rc, sizeof(ncclUniqueId)) << rc << ", " << sizeof(ncclUniqueId); - BPS_LOG(DEBUG) << "recv nccl_id is " << (*(long long int*)nccl_id) - << ", local_rank=" << local_rank; - } - - // initialize NCCL rank - auto rank = local_rank % _nccl_pcie_size; - NCCLCHECK(ncclCommInitRank(nccl_comm, _nccl_pcie_size, *nccl_id, rank)); - - // initialize CUDA streams for NCCL - CUDA_CALL(cudaStreamCreateWithPriority(nccl_stream, - cudaStreamNonBlocking, - greatest_priority)); - CUDA_CALL(cudaStreamSynchronize(*nccl_stream)); + std::string log_string("Constructing NCCL communicators."); + auto local_rank = BytePSGlobal::GetLocalRank(); + std::vector peers; + int first_peer = local_rank / _nccl_pcie_size * _nccl_pcie_size; + for (int i = first_peer; i < first_peer + (int)_nccl_pcie_size; i++) { + peers.push_back(i); + log_string = log_string + " " + std::to_string(i); + } + _signal_comm = std::make_shared(_global_comm, + std::string("nccl"), peers); + BPS_LOG(DEBUG) << log_string; + + // init and sycn NCCL-reduce-id using out-of-band socket + _nccl_id = (ncclUniqueId*)malloc(sizeof(ncclUniqueId) * _nccl_num_rings); + _nccl_comm = (ncclComm_t*)malloc(sizeof(ncclComm_t) * _nccl_num_rings); + _nccl_stream = (cudaStream_t*)malloc(sizeof(cudaStream_t) * _nccl_num_rings); + _nccl_size = _nccl_pcie_size; + int greatest_priority; + 
CUDA_CALL(cudaDeviceGetStreamPriorityRange(NULL, &greatest_priority)); + + for (size_t i = 0; i < _nccl_num_rings; i++) { + auto nccl_id = _nccl_id + i; + auto nccl_comm = _nccl_comm + i; + auto nccl_stream = _nccl_stream + i; + + // synchronize NCCL IDs + if (local_rank == _signal_comm->getRoot()) { // only root create nccl id + NCCLCHECK(ncclGetUniqueId(nccl_id)); + // the log is just for debug, the actual length of nccl id is 128 + BPS_LOG(DEBUG) << "root nccl_id is " << (*(long long int*)nccl_id); + // TODO: change to BytePSCommSignal format + _signal_comm->broadcastSignal(nccl_id, sizeof(ncclUniqueId)); + + } else { + int src; + // TODO: change to recvSignalFromRoot after using BytePSCommSignal format + int rc = _signal_comm->recvSignal(&src, nccl_id, sizeof(ncclUniqueId)); + BPS_CHECK_EQ(rc, sizeof(ncclUniqueId)) + << rc << ", " << sizeof(ncclUniqueId); + BPS_LOG(DEBUG) << "recv nccl_id is " << (*(long long int*)nccl_id) + << ", local_rank=" << local_rank; } + // initialize NCCL rank + auto rank = local_rank % _nccl_pcie_size; + NCCLCHECK(ncclCommInitRank(nccl_comm, _nccl_pcie_size, *nccl_id, rank)); + + // initialize CUDA streams for NCCL + CUDA_CALL(cudaStreamCreateWithPriority(nccl_stream, cudaStreamNonBlocking, + greatest_priority)); + CUDA_CALL(cudaStreamSynchronize(*nccl_stream)); + } } -void NcclManager::InitGlobalEnv() { // init all global env/param here - _nccl_group_size = (getenv("BYTEPS_NCCL_GROUP_SIZE") ? - atoi(getenv("BYTEPS_NCCL_GROUP_SIZE")) : 4); - BPS_LOG(DEBUG) << "nccl_group_size" << " set to " << _nccl_group_size; - - _nccl_pcie_size = (getenv("BYTEPS_PCIE_SWITCH_SIZE") ? - atoi(getenv("BYTEPS_PCIE_SWITCH_SIZE")) : 8); - auto local_size = BytePSGlobal::GetLocalSize(); - _nccl_pcie_num = local_size / _nccl_pcie_size; - if (!_nccl_pcie_num) { - _nccl_pcie_size = local_size; - _nccl_pcie_num = 1; - } - else { - if (local_size % _nccl_pcie_size) { - BPS_LOG(WARNING) << "BytePS does not support unbalanced PCIe switches."; - _nccl_pcie_size = local_size; - _nccl_pcie_num = 1; - } +void NcclManager::InitGlobalEnv() { // init all global env/param here + _nccl_group_size = + (getenv("BYTEPS_NCCL_GROUP_SIZE") ? atoi(getenv("BYTEPS_NCCL_GROUP_SIZE")) + : 4); + BPS_LOG(DEBUG) << "nccl_group_size" + << " set to " << _nccl_group_size; + + _nccl_pcie_size = (getenv("BYTEPS_PCIE_SWITCH_SIZE") + ? atoi(getenv("BYTEPS_PCIE_SWITCH_SIZE")) + : 8); + auto local_size = BytePSGlobal::GetLocalSize(); + _nccl_pcie_num = local_size / _nccl_pcie_size; + if (!_nccl_pcie_num) { + _nccl_pcie_size = local_size; + _nccl_pcie_num = 1; + } else { + if (local_size % _nccl_pcie_size) { + BPS_LOG(WARNING) << "BytePS does not support unbalanced PCIe switches."; + _nccl_pcie_size = local_size; + _nccl_pcie_num = 1; } + } - BPS_LOG(DEBUG) << "nccl_pcie_size" << " set to " << _nccl_pcie_size; - BPS_LOG(DEBUG) << "nccl_pcie_num" << " set to " << _nccl_pcie_num; + BPS_LOG(DEBUG) << "nccl_pcie_size" + << " set to " << _nccl_pcie_size; + BPS_LOG(DEBUG) << "nccl_pcie_num" + << " set to " << _nccl_pcie_num; - _nccl_num_rings = (getenv("BYTEPS_NCCL_NUM_RINGS") ? - atoi(getenv("BYTEPS_NCCL_NUM_RINGS")) : 1); - BPS_LOG(DEBUG) << "nccl_num_rings" << " set to " << _nccl_num_rings; + _nccl_num_rings = + (getenv("BYTEPS_NCCL_NUM_RINGS") ? 
atoi(getenv("BYTEPS_NCCL_NUM_RINGS")) + : 1); + BPS_LOG(DEBUG) << "nccl_num_rings" + << " set to " << _nccl_num_rings; - return; + return; } void NcclManager::EnqueueGroup(std::shared_ptr e) { - std::lock_guard lock(_nccl_mutex); - _nccl_pipeline.push(e); - return; + std::lock_guard lock(_nccl_mutex); + _nccl_pipeline.push(e); + return; } std::shared_ptr NcclManager::DequeueGroup() { - std::lock_guard lock(_nccl_mutex); - if (!_nccl_pipeline.size()) { - return nullptr; - } - auto r = _nccl_pipeline.front(); - _nccl_pipeline.pop(); - return r; + std::lock_guard lock(_nccl_mutex); + if (!_nccl_pipeline.size()) { + return nullptr; + } + auto r = _nccl_pipeline.front(); + _nccl_pipeline.pop(); + return r; } - // Example: // 4 reduce rings: // 0 1 2 3 | 4 5 6 7 @@ -181,10 +187,14 @@ std::shared_ptr NcclManager::DequeueGroup() { // 3 0 1 2 | 7 4 5 6 // // reduce -// 1st ring, 0->1->2->3->cpubuff->4->5->6->7->cpubuff, 4->5->6->7->cpubuff->0->1->2->3->cpubuff -// 2nd ring, 1->2->3->0->cpubuff->5->6->7->4->cpubuff, 5->6->7->4->cpubuff->1->2->3->0->cpubuff -// 3rd ring, 2->3->0->1->cpubuff->6->7->4->5->cpubuff, 6->7->4->5->cpubuff->2->3->0->1->cpubuff -// 4th ring, 3->0->1->2->cpubuff->7->4->5->6->cpubuff, 7->4->5->6->cpubuff->3->0->1->2->cpubuff +// 1st ring, 0->1->2->3->cpubuff->4->5->6->7->cpubuff, +// 4->5->6->7->cpubuff->0->1->2->3->cpubuff 2nd ring, +// 1->2->3->0->cpubuff->5->6->7->4->cpubuff, +// 5->6->7->4->cpubuff->1->2->3->0->cpubuff 3rd ring, +// 2->3->0->1->cpubuff->6->7->4->5->cpubuff, +// 6->7->4->5->cpubuff->2->3->0->1->cpubuff 4th ring, +// 3->0->1->2->cpubuff->7->4->5->6->cpubuff, +// 7->4->5->6->cpubuff->3->0->1->2->cpubuff // // 4 broadcast rings (reverse of reduce rings) // 7 6 5 4 | 3 2 1 0 @@ -193,114 +203,118 @@ std::shared_ptr NcclManager::DequeueGroup() { // 6 5 4 7 | 2 1 0 3 // // broadcast -// 1st ring, cpubuff->7->6->5->4->cpubuff->3->2->1->0, cpubuff->3->2->1->0->cpubuff->7->6->5->4 -// 2nd ring, cpubuff->4->7->6->5->cpubuff->0->3->2->1, cpubuff->0->3->2->1->cpubuff->4->7->6->5 -// 3rd ring, cpubuff->5->4->7->6->cpubuff->1->0->3->2, cpubuff->1->0->3->2->cpubuff->5->4->7->6 -// 4th ring, cpubuff->6->5->4->7->cpubuff->2->1->0->3, cpubuff->2->1->0->3->cpubuff->6->5->4->7 +// 1st ring, cpubuff->7->6->5->4->cpubuff->3->2->1->0, +// cpubuff->3->2->1->0->cpubuff->7->6->5->4 2nd ring, +// cpubuff->4->7->6->5->cpubuff->0->3->2->1, +// cpubuff->0->3->2->1->cpubuff->4->7->6->5 3rd ring, +// cpubuff->5->4->7->6->cpubuff->1->0->3->2, +// cpubuff->1->0->3->2->cpubuff->5->4->7->6 4th ring, +// cpubuff->6->5->4->7->cpubuff->2->1->0->3, +// cpubuff->2->1->0->3->cpubuff->6->5->4->7 // void NcclManagerExpr::ConstructRings() { - _signal_comm = _global_comm; - BPS_LOG(DEBUG) << "Constructing NCCL Reduce communicators."; - for (size_t i = 0; i < _nccl_pcie_size; i++) { - _rings.push_back(std::vector()); - std::string log(""); - for (size_t j = 0; j < _nccl_pcie_num; j++) { - for (size_t k = 0; k < _nccl_pcie_size; k++) { - int rank = (k + i) % _nccl_pcie_size + j * _nccl_pcie_size; - _rings[i].push_back(rank); - log = log + std::to_string(rank) + ' '; - } - } - BPS_LOG(DEBUG) << log; + _signal_comm = _global_comm; + BPS_LOG(DEBUG) << "Constructing NCCL Reduce communicators."; + for (size_t i = 0; i < _nccl_pcie_size; i++) { + _rings.push_back(std::vector()); + std::string log(""); + for (size_t j = 0; j < _nccl_pcie_num; j++) { + for (size_t k = 0; k < _nccl_pcie_size; k++) { + int rank = (k + i) % _nccl_pcie_size + j * _nccl_pcie_size; + _rings[i].push_back(rank); + log = log + 
std::to_string(rank) + ' '; + } } - BPS_LOG(DEBUG) << "Constructing NCCL Broadcast communicators."; - for (size_t i = 0; i < _nccl_pcie_size; i++) { - _rings.push_back(std::vector()); - std::string log(""); - for (int j = 0; j < BytePSGlobal::GetLocalSize(); j++) { - int rank = _rings[i][BytePSGlobal::GetLocalSize()-j-1]; - _rings[i + _nccl_pcie_size].push_back(rank); - log = log + std::to_string(rank) + ' '; - } - BPS_LOG(DEBUG) << log; + BPS_LOG(DEBUG) << log; + } + BPS_LOG(DEBUG) << "Constructing NCCL Broadcast communicators."; + for (size_t i = 0; i < _nccl_pcie_size; i++) { + _rings.push_back(std::vector()); + std::string log(""); + for (int j = 0; j < BytePSGlobal::GetLocalSize(); j++) { + int rank = _rings[i][BytePSGlobal::GetLocalSize() - j - 1]; + _rings[i + _nccl_pcie_size].push_back(rank); + log = log + std::to_string(rank) + ' '; } - auto local_size = BytePSGlobal::GetLocalSize(); - auto local_rank = BytePSGlobal::GetLocalRank(); - // init and sycn NCCL-reduce-id using out-of-band socket - _nccl_id = (ncclUniqueId*) malloc(sizeof(ncclUniqueId) * _nccl_pcie_size * 2); - _nccl_comm = (ncclComm_t*) malloc(sizeof(ncclComm_t) * _nccl_pcie_size * 2); - _nccl_stream = (cudaStream_t*) malloc(sizeof(cudaStream_t) * _nccl_pcie_size * 2); - int greatest_priority; - CUDA_CALL(cudaDeviceGetStreamPriorityRange(NULL, &greatest_priority)); - - for (size_t i = 0; i < _nccl_pcie_size * 2; i++) { - auto nccl_id = _nccl_id + i; - auto nccl_comm = _nccl_comm + i; - auto nccl_stream = _nccl_stream + i; - - // synchronize NCCL IDs - if (BytePSGlobal::IsRootDevice()) { // only root create nccl id - NCCLCHECK(ncclGetUniqueId(nccl_id)); - // the log is just for debug, the actual length of nccl id is 128 - BPS_LOG(DEBUG) << "root nccl_id is " << (*(long long int*)nccl_id); - _signal_comm->broadcastSignal(nccl_id, sizeof(ncclUniqueId)); - } - else { - int src; - int rc = _signal_comm->recvSignal(&src, nccl_id, sizeof(ncclUniqueId)); - BPS_CHECK_EQ(rc, sizeof(ncclUniqueId)) << rc << ", " << sizeof(ncclUniqueId); - BPS_LOG(DEBUG) << "recv nccl_id is " << (*(long long int*)nccl_id) - << ", local_rank=" << local_rank; - } - - // initialize NCCL rank - auto it = std::find(_rings[i].begin(), _rings[i].end(), local_rank); - auto rank = std::distance(_rings[i].begin(), it); - NCCLCHECK(ncclCommInitRank(nccl_comm, local_size, *nccl_id, rank)); - - // initialize CUDA streams for NCCL - CUDA_CALL(cudaStreamCreateWithPriority(nccl_stream, - cudaStreamNonBlocking, - greatest_priority)); - CUDA_CALL(cudaStreamSynchronize(*nccl_stream)); + BPS_LOG(DEBUG) << log; + } + auto local_size = BytePSGlobal::GetLocalSize(); + auto local_rank = BytePSGlobal::GetLocalRank(); + // init and sycn NCCL-reduce-id using out-of-band socket + _nccl_id = (ncclUniqueId*)malloc(sizeof(ncclUniqueId) * _nccl_pcie_size * 2); + _nccl_comm = (ncclComm_t*)malloc(sizeof(ncclComm_t) * _nccl_pcie_size * 2); + _nccl_stream = + (cudaStream_t*)malloc(sizeof(cudaStream_t) * _nccl_pcie_size * 2); + int greatest_priority; + CUDA_CALL(cudaDeviceGetStreamPriorityRange(NULL, &greatest_priority)); + + for (size_t i = 0; i < _nccl_pcie_size * 2; i++) { + auto nccl_id = _nccl_id + i; + auto nccl_comm = _nccl_comm + i; + auto nccl_stream = _nccl_stream + i; + + // synchronize NCCL IDs + if (BytePSGlobal::IsRootDevice()) { // only root create nccl id + NCCLCHECK(ncclGetUniqueId(nccl_id)); + // the log is just for debug, the actual length of nccl id is 128 + BPS_LOG(DEBUG) << "root nccl_id is " << (*(long long int*)nccl_id); + _signal_comm->broadcastSignal(nccl_id, 
sizeof(ncclUniqueId)); + } else { + int src; + int rc = _signal_comm->recvSignal(&src, nccl_id, sizeof(ncclUniqueId)); + BPS_CHECK_EQ(rc, sizeof(ncclUniqueId)) + << rc << ", " << sizeof(ncclUniqueId); + BPS_LOG(DEBUG) << "recv nccl_id is " << (*(long long int*)nccl_id) + << ", local_rank=" << local_rank; } - return; + + // initialize NCCL rank + auto it = std::find(_rings[i].begin(), _rings[i].end(), local_rank); + auto rank = std::distance(_rings[i].begin(), it); + NCCLCHECK(ncclCommInitRank(nccl_comm, local_size, *nccl_id, rank)); + + // initialize CUDA streams for NCCL + CUDA_CALL(cudaStreamCreateWithPriority(nccl_stream, cudaStreamNonBlocking, + greatest_priority)); + CUDA_CALL(cudaStreamSynchronize(*nccl_stream)); + } + return; } ncclComm_t NcclManagerExpr::GetComm(uint64_t key, QueueType op) { - auto offset = (op == REDUCE) ? 0 : _nccl_pcie_size; - return _nccl_comm[key % _nccl_pcie_size + offset]; + auto offset = (op == REDUCE) ? 0 : _nccl_pcie_size; + return _nccl_comm[key % _nccl_pcie_size + offset]; } cudaStream_t NcclManagerExpr::GetStream(uint64_t key, QueueType op) { - auto offset = (op == REDUCE) ? 0 : _nccl_pcie_size; - return _nccl_stream[key % _nccl_pcie_size + offset]; + auto offset = (op == REDUCE) ? 0 : _nccl_pcie_size; + return _nccl_stream[key % _nccl_pcie_size + offset]; } int NcclManagerExpr::GetRoot(uint64_t key, QueueType op) { - int comm_index = key % _nccl_pcie_size; - int pcie_index = key % (_nccl_pcie_size * _nccl_pcie_num) / _nccl_pcie_size; - int root = -1; - if (op == REDUCE) { - int root_index = (_nccl_pcie_num - pcie_index) * _nccl_pcie_size - 1; - root = _rings[comm_index][root_index]; - } - else { - BPS_CHECK_EQ(op, BROADCAST) << "Unknown OP for NcclManager."; - int root_index = pcie_index * _nccl_pcie_size; - root = _rings[comm_index + _nccl_pcie_size][root_index]; - } - BPS_CHECK_GT(root, -1); - return root; + int comm_index = key % _nccl_pcie_size; + int pcie_index = key % (_nccl_pcie_size * _nccl_pcie_num) / _nccl_pcie_size; + int root = -1; + if (op == REDUCE) { + int root_index = (_nccl_pcie_num - pcie_index) * _nccl_pcie_size - 1; + root = _rings[comm_index][root_index]; + } else { + BPS_CHECK_EQ(op, BROADCAST) << "Unknown OP for NcclManager."; + int root_index = pcie_index * _nccl_pcie_size; + root = _rings[comm_index + _nccl_pcie_size][root_index]; + } + BPS_CHECK_GT(root, -1); + return root; } int NcclManagerExpr::GetRank(uint64_t key, QueueType op) { - auto offset = (op == REDUCE) ? 0 : _nccl_pcie_size; - auto i = key % _nccl_pcie_size + offset; - auto it = std::find(_rings[i].begin(), _rings[i].end(), BytePSGlobal::GetLocalRank()); - auto rank = std::distance(_rings[i].begin(), it); - return rank; + auto offset = (op == REDUCE) ? 
0 : _nccl_pcie_size; + auto i = key % _nccl_pcie_size + offset; + auto it = std::find(_rings[i].begin(), _rings[i].end(), + BytePSGlobal::GetLocalRank()); + auto rank = std::distance(_rings[i].begin(), it); + return rank; } -} // namespace common -} // namespace byteps +} // namespace common +} // namespace byteps diff --git a/byteps/common/nccl_manager.h b/byteps/common/nccl_manager.h index 3da2d76fe..40da5ed6c 100644 --- a/byteps/common/nccl_manager.h +++ b/byteps/common/nccl_manager.h @@ -16,114 +16,105 @@ #ifndef BYTEPS_NCCL_MANAGER_H #define BYTEPS_NCCL_MANAGER_H -#include #include #include +#include #include "common.h" -#include "scheduled_queue.h" #include "communicator.h" +#include "scheduled_queue.h" namespace byteps { namespace common { - class NcclGroupEntry { + public: + void RecordEvents(); + void SynchronizeEvents(); + void DestroyEvents(); -public: - void RecordEvents(); - void SynchronizeEvents(); - void DestroyEvents(); - - std::vector> tasks; - std::vector queues; + std::vector> tasks; + std::vector queues; -private: - std::vector _events; + private: + std::vector _events; }; - class NcclManager { - -public: - NcclManager(std::shared_ptr comm); - ~NcclManager() { - if (_nccl_stream) { - CUDA_CALL(cudaStreamDestroy(*_nccl_stream)); - } - if (_nccl_id) { - free(_nccl_id); - } - if (_nccl_comm) { - free(_nccl_comm); - } - if (_signal_comm) { - _signal_comm.reset(); - } - if (_global_comm) { - _global_comm.reset(); - } - while(!_nccl_pipeline.empty()) _nccl_pipeline.pop(); - - BPS_LOG(DEBUG) << "Clear NcclManager"; + public: + NcclManager(std::shared_ptr comm); + ~NcclManager() { + if (_nccl_stream) { + CUDA_CALL(cudaStreamDestroy(*_nccl_stream)); } + if (_nccl_id) { + free(_nccl_id); + } + if (_nccl_comm) { + free(_nccl_comm); + } + if (_signal_comm) { + _signal_comm.reset(); + } + if (_global_comm) { + _global_comm.reset(); + } + while (!_nccl_pipeline.empty()) _nccl_pipeline.pop(); - int GetGroupSize() { return _nccl_group_size; } - void EnqueueGroup(std::shared_ptr e); - std::shared_ptr DequeueGroup(); + BPS_LOG(DEBUG) << "Clear NcclManager"; + } - virtual cudaStream_t GetStream(uint64_t key, QueueType op); - virtual ncclComm_t GetComm(uint64_t key, QueueType op); - virtual int GetRoot(uint64_t key, QueueType op); - virtual int GetRank(uint64_t key, QueueType op); + int GetGroupSize() { return _nccl_group_size; } + void EnqueueGroup(std::shared_ptr e); + std::shared_ptr DequeueGroup(); - int GetSize() { return _nccl_size; } - std::shared_ptr GetSignalComm() { return _signal_comm; } - bool IsSignalRoot(); - + virtual cudaStream_t GetStream(uint64_t key, QueueType op); + virtual ncclComm_t GetComm(uint64_t key, QueueType op); + virtual int GetRoot(uint64_t key, QueueType op); + virtual int GetRank(uint64_t key, QueueType op); -protected: - void InitGlobalEnv(); - virtual void ConstructRings(); + int GetSize() { return _nccl_size; } + std::shared_ptr GetSignalComm() { return _signal_comm; } + bool IsSignalRoot(); - cudaStream_t* _nccl_stream; - ncclUniqueId* _nccl_id; - ncclComm_t* _nccl_comm; - - // global user-defined env - size_t _nccl_group_size; - size_t _nccl_pcie_size; - size_t _nccl_pcie_num; - size_t _nccl_num_rings; + protected: + void InitGlobalEnv(); + virtual void ConstructRings(); - int _nccl_size; + cudaStream_t* _nccl_stream; + ncclUniqueId* _nccl_id; + ncclComm_t* _nccl_comm; - // for pipelining nccl - std::mutex _nccl_mutex; - std::queue> _nccl_pipeline; + // global user-defined env + size_t _nccl_group_size; + size_t _nccl_pcie_size; + size_t 
_nccl_pcie_num; + size_t _nccl_num_rings; - std::shared_ptr _signal_comm; - std::shared_ptr _global_comm; + int _nccl_size; + // for pipelining nccl + std::mutex _nccl_mutex; + std::queue> _nccl_pipeline; + + std::shared_ptr _signal_comm; + std::shared_ptr _global_comm; }; class NcclManagerExpr : public NcclManager { + public: + cudaStream_t GetStream(uint64_t key, QueueType op); + ncclComm_t GetComm(uint64_t key, QueueType op); + int GetRoot(uint64_t key, QueueType op); + int GetRank(uint64_t key, QueueType op); -public: - cudaStream_t GetStream(uint64_t key, QueueType op); - ncclComm_t GetComm(uint64_t key, QueueType op); - int GetRoot(uint64_t key, QueueType op); - int GetRank(uint64_t key, QueueType op); - -protected: - void ConstructRings(); - - // for multi-ring - std::vector> _rings; + protected: + void ConstructRings(); + // for multi-ring + std::vector> _rings; }; +} // namespace common +} // namespace byteps -} // namespace common -} // namespace byteps - -#endif // BYTEPS_NCCL_MANAGER_H \ No newline at end of file +#endif // BYTEPS_NCCL_MANAGER_H diff --git a/byteps/common/operations.cc b/byteps/common/operations.cc index 98dcd351c..e740649fc 100644 --- a/byteps/common/operations.cc +++ b/byteps/common/operations.cc @@ -13,15 +13,14 @@ // limitations under the License. // ============================================================================= +#include "operations.h" +#include #include #include #include -#include - -#include "logging.h" -#include "operations.h" #include "core_loops.h" #include "global.h" +#include "logging.h" namespace byteps { namespace common { @@ -29,356 +28,335 @@ namespace common { extern "C" { void byteps_init() { - BytePSGlobal::Init(); + BytePSGlobal::Init(); - // The order of func does not matter - std::vector func; - - // Push & Pull in distributed mode - if (BytePSGlobal::IsDistributed()) { - if (BytePSGlobal::IsRootDevice()) { - func.push_back(PullLoop); - } - } - - // Cross-PCIe-switch reduce - if (BytePSGlobal::IsCrossPcieSwitch()) { - func.push_back(PcieReduceLoop); - } - - // Copy between GPU and CPU - if (BytePSGlobal::IsCrossPcieSwitch() || BytePSGlobal::IsDistributed()) { - func.push_back(CopyDevice2HostLoop); - if (BytePSGlobal::IsRootDevice()) { - // PUSH can be a real push in distributed mode - // Or a dummy barrier in cross-pcie-switch mode - func.push_back(PushLoop); - func.push_back(RootCopyHost2DeviceLoop); - } - else { - func.push_back(CoordinatePushLoop); - func.push_back(NonRootCopyHost2DeviceLoop); - func.push_back(NonRootCopyListenLoop); - } - } + // The order of func does not matter + std::vector func; - // Per-PCIe-switch NCCL calls - func.push_back(SyncNcclLoop); - if (BytePSGlobal::GetNccl()->IsSignalRoot()) { - func.push_back(RootNcclLoop); + // Push & Pull in distributed mode + if (BytePSGlobal::IsDistributed()) { + if (BytePSGlobal::IsRootDevice()) { + func.push_back(PullLoop); } - else { - func.push_back(CoordinateReduceLoop); - func.push_back(CoordinateBroadcastLoop); - func.push_back(NonRootNcclLoop); + } + + // Cross-PCIe-switch reduce + if (BytePSGlobal::IsCrossPcieSwitch()) { + func.push_back(PcieReduceLoop); + } + + // Copy between GPU and CPU + if (BytePSGlobal::IsCrossPcieSwitch() || BytePSGlobal::IsDistributed()) { + func.push_back(CopyDevice2HostLoop); + if (BytePSGlobal::IsRootDevice()) { + // PUSH can be a real push in distributed mode + // Or a dummy barrier in cross-pcie-switch mode + func.push_back(PushLoop); + func.push_back(RootCopyHost2DeviceLoop); + } else { + func.push_back(CoordinatePushLoop); + 
func.push_back(NonRootCopyHost2DeviceLoop); + func.push_back(NonRootCopyListenLoop); } - - BytePSGlobal::Start(func); - return; + } + + // Per-PCIe-switch NCCL calls + func.push_back(SyncNcclLoop); + if (BytePSGlobal::GetNccl()->IsSignalRoot()) { + func.push_back(RootNcclLoop); + } else { + func.push_back(CoordinateReduceLoop); + func.push_back(CoordinateBroadcastLoop); + func.push_back(NonRootNcclLoop); + } + + BytePSGlobal::Start(func); + return; } void byteps_shutdown() { - BytePSGlobal::Shutdown(); - BPS_LOG(DEBUG) << "BytePS is shutdown."; - return; + BytePSGlobal::Shutdown(); + BPS_LOG(DEBUG) << "BytePS is shutdown."; + return; } -int byteps_rank() { - return BytePSGlobal::GetRank(); -} +int byteps_rank() { return BytePSGlobal::GetRank(); } -int byteps_local_rank() { - return BytePSGlobal::GetLocalRank(); -} +int byteps_local_rank() { return BytePSGlobal::GetLocalRank(); } -int byteps_size() { - return BytePSGlobal::GetSize(); -} +int byteps_size() { return BytePSGlobal::GetSize(); } -int byteps_local_size() { - return BytePSGlobal::GetLocalSize(); -} +int byteps_local_size() { return BytePSGlobal::GetLocalSize(); } -} // extern "C" +} // extern "C" -Status CheckInitialized() { - return BytePSGlobal::CheckInit(); -} +Status CheckInitialized() { return BytePSGlobal::CheckInit(); } -void PartitionTensor(std::shared_ptr entry, - std::vector > &partitions) { - BPS_CHECK(entry->counter_ptr) << entry->tensor_name << " counter pointer is null"; - auto size = entry->tensor ? entry->tensor->size() : entry->output->size(); - auto bound = BytePSGlobal::GetPartitionBound(); - auto accumulated = 0; - int i = 0; - - while (accumulated < size) { - std::shared_ptr e(new TensorTableEntry); - // will assign the key later, so don't do it now - // e->key = entry->key; - e->tensor_name = entry->tensor_name + std::string("_") + std::to_string(i); - e->context = entry->context; - e->ready_event = entry->ready_event; - e->device = entry->device; - e->priority = entry->priority; - e->version = entry->version; - e->callback = entry->callback; - e->cpubuff = entry->cpubuff; - e->gpu_ptr = entry->gpu_ptr; - e->pcie_cpubuff = entry->pcie_cpubuff; - e->queue_list = entry->queue_list; - e->tensor = entry->tensor; - e->output = entry->output; - e->offset = accumulated; - e->len = ((size - accumulated) > bound) ? bound : (size - accumulated); - e->counter_ptr = entry->counter_ptr; - e->total_partnum = entry->total_partnum; - - accumulated += e->len; - ++i; - - partitions.push_back(e); - } +void PartitionTensor( + std::shared_ptr entry, + std::vector> &partitions) { + BPS_CHECK(entry->counter_ptr) + << entry->tensor_name << " counter pointer is null"; + auto size = entry->tensor ? entry->tensor->size() : entry->output->size(); + auto bound = BytePSGlobal::GetPartitionBound(); + auto accumulated = 0; + int i = 0; + + while (accumulated < size) { + std::shared_ptr e(new TensorTableEntry); + // will assign the key later, so don't do it now + // e->key = entry->key; + e->tensor_name = entry->tensor_name + std::string("_") + std::to_string(i); + e->context = entry->context; + e->ready_event = entry->ready_event; + e->device = entry->device; + e->priority = entry->priority; + e->version = entry->version; + e->callback = entry->callback; + e->cpubuff = entry->cpubuff; + e->gpu_ptr = entry->gpu_ptr; + e->pcie_cpubuff = entry->pcie_cpubuff; + e->queue_list = entry->queue_list; + e->tensor = entry->tensor; + e->output = entry->output; + e->offset = accumulated; + e->len = ((size - accumulated) > bound) ? 
bound : (size - accumulated); + e->counter_ptr = entry->counter_ptr; + e->total_partnum = entry->total_partnum; + + accumulated += e->len; + ++i; + + partitions.push_back(e); + } } -Status EnqueueTensor(BPSContext &context, - std::shared_ptr input, +Status EnqueueTensor(BPSContext &context, std::shared_ptr input, std::shared_ptr output, - std::shared_ptr ready_event, - const int device, const int priority, const int version, + std::shared_ptr ready_event, const int device, + const int priority, const int version, StatusCallback callback, std::shared_ptr> queue_list) { - - auto& name = context.tensor_name; - if (input && output) { - BPS_CHECK_EQ(input->size(), output->size()) << name << " output tensor size does not match"; - } - - std::shared_ptr e(new TensorTableEntry); - e->tensor_name = name; - e->context = &context; - e->tensor = input; - e->output = output; - e->ready_event = ready_event; - e->device = device; - e->priority = priority; - e->version = version; - e->callback = callback; - e->cpubuff = context.cpubuff; - e->gpu_ptr = context.gpu_ptr; - e->pcie_cpubuff = context.pcie_cpubuff; - e->queue_list = *queue_list; - e->counter_ptr = std::make_shared(0); - e->total_partnum = context.key_list.size(); - - std::vector > partitions; - PartitionTensor(e, partitions); - BPS_CHECK_EQ(context.key_list.size(), partitions.size()) << name - << ": " << context.key_list.size() - << ", " << partitions.size(); - - if (e->queue_list.size() == 0) { - BPS_CHECK(e->tensor_name != ""); - BPS_LOG(TRACE) << e->tensor_name - << ", device=" << e->device - << " has no queue_list assigned, skipped"; - e->callback(Status::OK()); - return Status::OK(); - } - - unsigned int accumulated = 0; - for (size_t i = 0; i < partitions.size(); ++i) { - auto task = partitions[i]; - task->key = context.key_list[i]; // assign the key now - BPS_CHECK(task->tensor_name != ""); - BPS_LOG(TRACE) << "EnqueueTensor: " << (task->tensor_name) - << ", key=" << (task->key) - << ", offset=" << (task->offset) - << ", len=" << (task->len) - << ", device=" << (task->device) - << " rank=" << BytePSGlobal::GetLocalRank(); - - BytePSGlobal::GetScheduledQueue(e->queue_list[0])->addTask(task); - accumulated += task->len; - } - - auto tensor = (e->tensor ? 
e->tensor : e->output); - BPS_CHECK(tensor); - BPS_CHECK_EQ(accumulated, tensor->size()) - << "accumulated partition size not equal to original tensor size"; - - BPS_LOG(TRACE) << "EnqueueTensor finished: " << name - << ", rank=" << BytePSGlobal::GetLocalRank(); + auto &name = context.tensor_name; + if (input && output) { + BPS_CHECK_EQ(input->size(), output->size()) + << name << " output tensor size does not match"; + } + + std::shared_ptr e(new TensorTableEntry); + e->tensor_name = name; + e->context = &context; + e->tensor = input; + e->output = output; + e->ready_event = ready_event; + e->device = device; + e->priority = priority; + e->version = version; + e->callback = callback; + e->cpubuff = context.cpubuff; + e->gpu_ptr = context.gpu_ptr; + e->pcie_cpubuff = context.pcie_cpubuff; + e->queue_list = *queue_list; + e->counter_ptr = std::make_shared(0); + e->total_partnum = context.key_list.size(); + + std::vector> partitions; + PartitionTensor(e, partitions); + BPS_CHECK_EQ(context.key_list.size(), partitions.size()) + << name << ": " << context.key_list.size() << ", " << partitions.size(); + + if (e->queue_list.size() == 0) { + BPS_CHECK(e->tensor_name != ""); + BPS_LOG(TRACE) << e->tensor_name << ", device=" << e->device + << " has no queue_list assigned, skipped"; + e->callback(Status::OK()); return Status::OK(); + } + + unsigned int accumulated = 0; + for (size_t i = 0; i < partitions.size(); ++i) { + auto task = partitions[i]; + task->key = context.key_list[i]; // assign the key now + BPS_CHECK(task->tensor_name != ""); + BPS_LOG(TRACE) << "EnqueueTensor: " << (task->tensor_name) + << ", key=" << (task->key) << ", offset=" << (task->offset) + << ", len=" << (task->len) << ", device=" << (task->device) + << " rank=" << BytePSGlobal::GetLocalRank(); + + BytePSGlobal::GetScheduledQueue(e->queue_list[0])->addTask(task); + accumulated += task->len; + } + + auto tensor = (e->tensor ? e->tensor : e->output); + BPS_CHECK(tensor); + BPS_CHECK_EQ(accumulated, tensor->size()) + << "accumulated partition size not equal to original tensor size"; + + BPS_LOG(TRACE) << "EnqueueTensor finished: " << name + << ", rank=" << BytePSGlobal::GetLocalRank(); + return Status::OK(); } void InitTensor(BPSContext &context, size_t size, int dtype, void *cpubuff) { - std::lock_guard lock(context.init_mutex); - if (context.initialized) { return; } - CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); - - BPS_CHECK_GT(size, 0) << "init tensor size not larger than 0"; - // Get metadata - auto bound = BytePSGlobal::GetPartitionBound(); - auto& name = context.tensor_name; - context.buff_len = size; - size_t accumulated = 0; - - // Total key space is 0 to 2^64 - 1 - // It will be divided to N PS servers, for now we assume N <= 2^16 - // Then we have 2^48 key space left (top 16 bits for different servers) - // MXNet server has a bug dealing with keys larger than 2^32 - // Below we support up to 2^16 tensors, and up to 2^16 partitions per tensor - ps::Key start_key = context.declared_key << 16; - while (accumulated < size) { - context.key_list.push_back(start_key++); - accumulated += ((size - accumulated) > bound) ? 
bound : (size - accumulated); - } - BPS_LOG(DEBUG) << name << " partitioned to " - << context.key_list.size() << " part(s)" - << ", total_len=" << size - << ", key_range=[" - << context.key_list.front() - << ", " - << context.key_list.back() - << "]" - << " rank=" << BytePSGlobal::GetLocalRank(); - - auto key_list = context.key_list; - - BPS_CHECK_GT(key_list.size(), 0) << name; - BPS_CHECK_EQ(key_list.size(), (unsigned int) (size+bound-1)/bound) // round up - << key_list.size() - << ", size=" << size - << ", bound=" << bound; - - BPS_LOG(TRACE) << "Begin init " << name - << ", size=" << size - << ", parts=" << key_list.size(); - - // If cpubuff is not nullprt, the tensor itself is on CPU - // We need to register with CUDA so that NCCL can work on it - if (cpubuff) { - BPS_LOG(DEBUG) << name << " is already on cpu, len=" << size; - CUDA_CALL(cudaHostRegister(cpubuff, size, cudaHostRegisterMapped)); - CUDA_CALL(cudaHostGetDevicePointer(&(context.gpu_ptr), cpubuff, 0)); + std::lock_guard lock(context.init_mutex); + if (context.initialized) { + return; + } + CUDA_CALL(cudaSetDevice(BytePSGlobal::GetLocalRank())); + + BPS_CHECK_GT(size, 0) << "init tensor size not larger than 0"; + // Get metadata + auto bound = BytePSGlobal::GetPartitionBound(); + auto &name = context.tensor_name; + context.buff_len = size; + size_t accumulated = 0; + + // Total key space is 0 to 2^64 - 1 + // It will be divided to N PS servers, for now we assume N <= 2^16 + // Then we have 2^48 key space left (top 16 bits for different servers) + // MXNet server has a bug dealing with keys larger than 2^32 + // Below we support up to 2^16 tensors, and up to 2^16 partitions per tensor + ps::Key start_key = context.declared_key << 16; + while (accumulated < size) { + context.key_list.push_back(start_key++); + accumulated += + ((size - accumulated) > bound) ? 
bound : (size - accumulated); + } + BPS_LOG(DEBUG) << name << " partitioned to " << context.key_list.size() + << " part(s)" + << ", total_len=" << size << ", key_range=[" + << context.key_list.front() << ", " << context.key_list.back() + << "]" + << " rank=" << BytePSGlobal::GetLocalRank(); + + auto key_list = context.key_list; + + BPS_CHECK_GT(key_list.size(), 0) << name; + BPS_CHECK_EQ(key_list.size(), + (unsigned int)(size + bound - 1) / bound) // round up + << key_list.size() << ", size=" << size << ", bound=" << bound; + + BPS_LOG(TRACE) << "Begin init " << name << ", size=" << size + << ", parts=" << key_list.size(); + + // If cpubuff is not nullprt, the tensor itself is on CPU + // We need to register with CUDA so that NCCL can work on it + if (cpubuff) { + BPS_LOG(DEBUG) << name << " is already on cpu, len=" << size; + CUDA_CALL(cudaHostRegister(cpubuff, size, cudaHostRegisterMapped)); + CUDA_CALL(cudaHostGetDevicePointer(&(context.gpu_ptr), cpubuff, 0)); + } + + // We always allocate our own cpu buffer + // use the first key in key_list as the index + auto shm_obj = BytePSGlobal::GetSharedMemoryObj(); + if (BytePSGlobal::IsCrossPcieSwitch()) { + context.pcie_cpubuff = shm_obj->openPcieSharedMemory(key_list[0], size); + context.cpubuff = context.pcie_cpubuff.back(); + } else { + context.cpubuff = shm_obj->openSharedMemory(std::string("BytePS_ShM_"), + key_list[0], size); + } + BPS_LOG(TRACE) << name << ": open shared memory size " << size; + + // Init tensors with BytePS server + char *data = const_cast(static_cast(context.cpubuff)); + accumulated = 0; + size_t i = 0; + while (accumulated < size) { + auto key = key_list[i]; + int len = ((size - accumulated) > bound) ? bound : (size - accumulated); + + if (BytePSGlobal::IsDistributed() && BytePSGlobal::IsRootDevice()) { + // encode the key for pskv scattering + auto &pskv = BytePSGlobal::EncodeDefaultKey(key, len); + // false means not to delete data when SArray is deleted + ps::SArray vals(data + accumulated, len, false); + // cmd type + int cmd = GetCommandType(RequestType::kDefaultPushPull, dtype); + // blocking push, also as a global barrirer + BytePSGlobal::GetPS()->Wait( + BytePSGlobal::GetPS()->ZPush(pskv.keys, vals, pskv.lens, cmd)); } - // We always allocate our own cpu buffer - // use the first key in key_list as the index - auto shm_obj = BytePSGlobal::GetSharedMemoryObj(); - if (BytePSGlobal::IsCrossPcieSwitch()) { - context.pcie_cpubuff = shm_obj->openPcieSharedMemory(key_list[0], size); - context.cpubuff = context.pcie_cpubuff.back(); - } - else { - context.cpubuff = shm_obj->openSharedMemory(std::string("BytePS_ShM_"), key_list[0], size); - } - BPS_LOG(TRACE) << name << ": open shared memory size " << size; - - // Init tensors with BytePS server - char* data = const_cast (static_cast (context.cpubuff)); - accumulated = 0; - size_t i = 0; - while (accumulated < size) { - auto key = key_list[i]; - int len = ((size - accumulated) > bound) ? 
bound : (size - accumulated); - - if (BytePSGlobal::IsDistributed() && BytePSGlobal::IsRootDevice()) { - // encode the key for pskv scattering - auto& pskv = BytePSGlobal::EncodeDefaultKey(key, len); - // false means not to delete data when SArray is deleted - ps::SArray vals(data + accumulated, len, false); - // cmd type - int cmd = GetCommandType(RequestType::kDefaultPushPull, dtype); - // blocking push, also as a global barrirer - BytePSGlobal::GetPS()->Wait(BytePSGlobal::GetPS()->ZPush( - pskv.keys, vals, pskv.lens, cmd)); - } - - accumulated += len; - ++i; - } + accumulated += len; + ++i; + } - BPS_CHECK_EQ(accumulated, size); - BPS_CHECK_EQ(i, key_list.size()); + BPS_CHECK_EQ(accumulated, size); + BPS_CHECK_EQ(i, key_list.size()); - context.initialized = true; + context.initialized = true; - BPS_LOG(TRACE) << "Finish Init " << name - << ", size=" << size - << ", parts=" << key_list.size(); + BPS_LOG(TRACE) << "Finish Init " << name << ", size=" << size + << ", parts=" << key_list.size(); } -BPSContext& GetContextFromName(const std::string &name) { - return BytePSGlobal::GetContextFromName(name); +BPSContext &GetContextFromName(const std::string &name) { + return BytePSGlobal::GetContextFromName(name); } bool IsTensorDeclared(const std::string &name) { - return BytePSGlobal::IsTensorDeclared(name); + return BytePSGlobal::IsTensorDeclared(name); } std::shared_ptr> GetPushQueueList(int device) { - auto queue_list = std::make_shared>(); - - // Per-PCIe-switch NCCL reduce - if (BytePSGlobal::GetNccl()->IsSignalRoot()) { - queue_list->push_back(REDUCE); + auto queue_list = std::make_shared>(); + + // Per-PCIe-switch NCCL reduce + if (BytePSGlobal::GetNccl()->IsSignalRoot()) { + queue_list->push_back(REDUCE); + } else { + queue_list->push_back(COORDINATE_REDUCE); + queue_list->push_back(REDUCE); + } + + // Copy from GPU to CPU + if (BytePSGlobal::IsDistributed() || BytePSGlobal::IsCrossPcieSwitch()) { + queue_list->push_back(COPYD2H); + } + + // Cross-PCIe-switch reduce + if (BytePSGlobal::IsCrossPcieSwitch()) { + queue_list->push_back(PCIE_REDUCE); + } + + // Push in distributed mode + // In case IsCrossPcieSwitch(), PUSH runs as a dummy barrier + if (BytePSGlobal::IsDistributed() || BytePSGlobal::IsCrossPcieSwitch()) { + if (BytePSGlobal::IsRootDevice()) { + queue_list->push_back(PUSH); + } else { + queue_list->push_back(COORDINATE_PUSH); } - else { - queue_list->push_back(COORDINATE_REDUCE); - queue_list->push_back(REDUCE); - } - - // Copy from GPU to CPU - if (BytePSGlobal::IsDistributed() || BytePSGlobal::IsCrossPcieSwitch()) { - queue_list->push_back(COPYD2H); - } - - // Cross-PCIe-switch reduce - if (BytePSGlobal::IsCrossPcieSwitch()) { - queue_list->push_back(PCIE_REDUCE); - } - - // Push in distributed mode - // In case IsCrossPcieSwitch(), PUSH runs as a dummy barrier - if (BytePSGlobal::IsDistributed() || BytePSGlobal::IsCrossPcieSwitch()) { - if (BytePSGlobal::IsRootDevice()) { - queue_list->push_back(PUSH); - } - else { - queue_list->push_back(COORDINATE_PUSH); - } - } - return queue_list; + } + return queue_list; } std::shared_ptr> GetPullQueueList(int device) { - auto queue_list = std::make_shared>(); - - // Pull in distributed mode - if (BytePSGlobal::IsDistributed()) { - if (BytePSGlobal::IsRootDevice()) { - queue_list->push_back(PULL); - } - } + auto queue_list = std::make_shared>(); - // Copy from CPU to GPU - if (BytePSGlobal::IsDistributed() || BytePSGlobal::IsCrossPcieSwitch()) { - queue_list->push_back(COPYH2D); - } - - // Per-PCIe-switch NCCL broadcast - if 
(BytePSGlobal::GetNccl()->IsSignalRoot()) { - queue_list->push_back(BROADCAST); - } - else { - queue_list->push_back(COORDINATE_BROADCAST); - queue_list->push_back(BROADCAST); + // Pull in distributed mode + if (BytePSGlobal::IsDistributed()) { + if (BytePSGlobal::IsRootDevice()) { + queue_list->push_back(PULL); } - return queue_list; + } + + // Copy from CPU to GPU + if (BytePSGlobal::IsDistributed() || BytePSGlobal::IsCrossPcieSwitch()) { + queue_list->push_back(COPYH2D); + } + + // Per-PCIe-switch NCCL broadcast + if (BytePSGlobal::GetNccl()->IsSignalRoot()) { + queue_list->push_back(BROADCAST); + } else { + queue_list->push_back(COORDINATE_BROADCAST); + queue_list->push_back(BROADCAST); + } + return queue_list; } -} // namespace common -} // namespace byteps +} // namespace common +} // namespace byteps diff --git a/byteps/common/operations.h b/byteps/common/operations.h index 1acff645b..fcd3951f5 100644 --- a/byteps/common/operations.h +++ b/byteps/common/operations.h @@ -22,7 +22,6 @@ namespace byteps { namespace common { - // Check that byteps is initialized. Status CheckInitialized(); @@ -49,30 +48,28 @@ int byteps_size(); // C interface to return number of byteps processes in the node it is on. // Returns -1 if byteps is not initialized. int byteps_local_size(); - } // Below are all for Framework plugins -Status EnqueueTensor(BPSContext &context, - std::shared_ptr input, +Status EnqueueTensor(BPSContext &context, std::shared_ptr input, std::shared_ptr output, - std::shared_ptr ready_event, - const int device, const int priority, const int version, + std::shared_ptr ready_event, const int device, + const int priority, const int version, StatusCallback callback, std::shared_ptr> queue_list); -void InitTensor(BPSContext &context, size_t size, int dtype, void* cpubuff); +void InitTensor(BPSContext &context, size_t size, int dtype, void *cpubuff); // Only call these in Framework plugins for the best performance bool IsTensorDeclared(const std::string &name); -BPSContext& GetContextFromName(const std::string &name); +BPSContext &GetContextFromName(const std::string &name); std::shared_ptr> GetPushQueueList(int device); std::shared_ptr> GetPullQueueList(int device); -} // namespace common -} // namespace byteps +} // namespace common +} // namespace byteps -#endif // BYTEPS_OPERATIONS_H \ No newline at end of file +#endif // BYTEPS_OPERATIONS_H diff --git a/byteps/common/ready_table.cc b/byteps/common/ready_table.cc index 1d307928f..07fcd0ef1 100644 --- a/byteps/common/ready_table.cc +++ b/byteps/common/ready_table.cc @@ -13,32 +13,29 @@ // limitations under the License. 
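The framework-facing API declared in operations.h above is what the TensorFlow/MXNet/Keras plugins in this repository build on: a plugin declares a tensor, fetches its context, initializes it once, and then enqueues it with the push and pull queue lists concatenated. The sketch below only illustrates that call order; the surrounding function, the tensor wrapper, the dtype/device values, and the include path are plugin-side placeholders and are not APIs introduced by this patch.

// Sketch: driving the byteps::common API from a (hypothetical) framework plugin.
#include "byteps/common/operations.h"  // path as seen from the repository root

namespace plugin {
using namespace byteps::common;

void EnqueuePushPull(const std::string& name, std::shared_ptr<Tensor> tensor,
                     int dtype, int device, int priority, int version,
                     StatusCallback callback) {
  // The first call declares the tensor so that a context and key range exist.
  IsTensorDeclared(name);
  auto& context = GetContextFromName(name);

  // One-time init: partitions the tensor and registers its keys with the
  // PS servers (see InitTensor in operations.cc above). Pass the host
  // pointer instead of nullptr for a CPU tensor.
  InitTensor(context, tensor->size(), dtype, /*cpubuff=*/nullptr);

  // The full push_pull pipeline is the push stages (reduce/copy/push)
  // followed by the pull stages (pull/copy/broadcast), so the two queue
  // lists are concatenated into one.
  auto queue_list = GetPushQueueList(device);
  auto pull_list = GetPullQueueList(device);
  queue_list->insert(queue_list->end(), pull_list->begin(), pull_list->end());

  EnqueueTensor(context, tensor, /*output=*/tensor, /*ready_event=*/nullptr,
                device, priority, version, callback, queue_list);
}
}  // namespace plugin

Keeping GetPushQueueList and GetPullQueueList separate concentrates the stage selection in one place: both functions branch on IsRootDevice, IsDistributed and IsCrossPcieSwitch, so the same plugin-side call sequence works for root and non-root devices and for the cross-PCIe-switch mode.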
// ============================================================================= -#include "logging.h" #include "ready_table.h" - +#include "logging.h" namespace byteps { namespace common { - // below are methods for accessing/modifying the _ready_table bool ReadyTable::IsKeyReady(uint64_t key) { - std::lock_guard lock(_table_mutex); - return _ready_table[key] == (_ready_count); + std::lock_guard lock(_table_mutex); + return _ready_table[key] == (_ready_count); } int ReadyTable::AddReadyCount(uint64_t key) { - std::lock_guard lock(_table_mutex); - BPS_CHECK_LT(_ready_table[key], _ready_count) - << _table_name << ": " - << _ready_table[key] << ", " << (_ready_count); - return ++_ready_table[key]; + std::lock_guard lock(_table_mutex); + BPS_CHECK_LT(_ready_table[key], _ready_count) + << _table_name << ": " << _ready_table[key] << ", " << (_ready_count); + return ++_ready_table[key]; } void ReadyTable::ClearReadyCount(uint64_t key) { - std::lock_guard lock(_table_mutex); - _ready_table[key] = 0; + std::lock_guard lock(_table_mutex); + _ready_table[key] = 0; } -} -} \ No newline at end of file +} // namespace common +} // namespace byteps diff --git a/byteps/common/ready_table.h b/byteps/common/ready_table.h index 2bd0fc83d..68d6ef1d9 100644 --- a/byteps/common/ready_table.h +++ b/byteps/common/ready_table.h @@ -24,28 +24,26 @@ namespace byteps { namespace common { class ReadyTable { - -public: - ReadyTable(int ready_count, const char* name) { - _ready_count = ready_count; - _table_name = std::string(name); - } - // methods to access or modify the _ready_table - bool IsKeyReady(uint64_t key); - int AddReadyCount(uint64_t key); - void ClearReadyCount(uint64_t key); - -private: - // (key, ready_signal_count) pair, only valid for root device - std::unordered_map _ready_table; - // use this mutex to access/modify the _ready_table - std::mutex _table_mutex; - int _ready_count; - std::string _table_name; + public: + ReadyTable(int ready_count, const char* name) { + _ready_count = ready_count; + _table_name = std::string(name); + } + // methods to access or modify the _ready_table + bool IsKeyReady(uint64_t key); + int AddReadyCount(uint64_t key); + void ClearReadyCount(uint64_t key); + + private: + // (key, ready_signal_count) pair, only valid for root device + std::unordered_map _ready_table; + // use this mutex to access/modify the _ready_table + std::mutex _table_mutex; + int _ready_count; + std::string _table_name; }; +} // namespace common +} // namespace byteps -} // namespace common -} // namespace byteps - -#endif // BYTEPS_READY_TABLE_H \ No newline at end of file +#endif // BYTEPS_READY_TABLE_H diff --git a/byteps/common/scheduled_queue.cc b/byteps/common/scheduled_queue.cc index 8c8fe35f8..79034226a 100644 --- a/byteps/common/scheduled_queue.cc +++ b/byteps/common/scheduled_queue.cc @@ -13,156 +13,156 @@ // limitations under the License. // ============================================================================= -#include - -#include "logging.h" #include "scheduled_queue.h" +#include #include "global.h" +#include "logging.h" namespace byteps { namespace common { BytePSScheduledQueue::BytePSScheduledQueue(QueueType type) { - if (type == REDUCE && BytePSGlobal::GetNccl()->IsSignalRoot()) { - _is_scheduled = true; - } - else { - _is_scheduled = false; - } + if (type == REDUCE && BytePSGlobal::GetNccl()->IsSignalRoot()) { + _is_scheduled = true; + } else { + _is_scheduled = false; + } - _qt = type; - _credits = _is_scheduled ? 
- BytePSGlobal::GetPartitionBound() * (BytePSGlobal::GetNccl()->GetGroupSize() + 1) - : 34359738368; // 32GB, basically disabling credit control - _rt = nullptr; + _qt = type; + _credits = _is_scheduled + ? BytePSGlobal::GetPartitionBound() * + (BytePSGlobal::GetNccl()->GetGroupSize() + 1) + : 34359738368; // 32GB, basically disabling credit control + _rt = nullptr; - switch (_qt) { - case REDUCE: - if (BytePSGlobal::GetNccl()->IsSignalRoot()) { - _rt = BytePSGlobal::GetReduceTable(); - } - break; - case PCIE_REDUCE: - if (BytePSGlobal::IsCrossPcieSwitch()) { - if (BytePSGlobal::GetCpuReducer()->isRoot()) { - _rt = BytePSGlobal::GetPcieReduceTable(); - } - } - break; - case PUSH: - if (BytePSGlobal::IsRootDevice()) { - _rt = BytePSGlobal::GetPushTable(); - } - break; - case COPYH2D: - if (!BytePSGlobal::IsRootDevice()) { - _rt = BytePSGlobal::GetCopyTable(); - } - break; - case BROADCAST: - if (BytePSGlobal::GetNccl()->IsSignalRoot()) { - _rt = BytePSGlobal::GetBroadcastTable(); - } - break; - default: - break; - } + switch (_qt) { + case REDUCE: + if (BytePSGlobal::GetNccl()->IsSignalRoot()) { + _rt = BytePSGlobal::GetReduceTable(); + } + break; + case PCIE_REDUCE: + if (BytePSGlobal::IsCrossPcieSwitch()) { + if (BytePSGlobal::GetCpuReducer()->isRoot()) { + _rt = BytePSGlobal::GetPcieReduceTable(); + } + } + break; + case PUSH: + if (BytePSGlobal::IsRootDevice()) { + _rt = BytePSGlobal::GetPushTable(); + } + break; + case COPYH2D: + if (!BytePSGlobal::IsRootDevice()) { + _rt = BytePSGlobal::GetCopyTable(); + } + break; + case BROADCAST: + if (BytePSGlobal::GetNccl()->IsSignalRoot()) { + _rt = BytePSGlobal::GetBroadcastTable(); + } + break; + default: + break; + } } void BytePSScheduledQueue::addTask(std::shared_ptr entry) { - std::lock_guard lock(_mutex); - _sq.push_back(entry); - if (_is_scheduled) { - // TODO: below can be optimized to O(n) using insertion sort - std::sort(_sq.begin(), _sq.end(), - [](std::shared_ptr a, std::shared_ptr b) { - if (a->priority == b->priority) { - return (a->key < b->key); // from the first partition to the last - } - return (a->priority > b->priority); // from higher priority to lower + std::lock_guard lock(_mutex); + _sq.push_back(entry); + if (_is_scheduled) { + // TODO: below can be optimized to O(n) using insertion sort + std::sort( + _sq.begin(), _sq.end(), + [](std::shared_ptr a, + std::shared_ptr b) { + if (a->priority == b->priority) { + return (a->key < b->key); // from the first partition to the last + } + return (a->priority > b->priority); // from higher priority to lower }); - } - BPS_CHECK(entry->tensor_name != ""); - BPS_LOG(TRACE) << "Queue " << LogStrings[_qt] - << " addTask: " << entry->tensor_name - << " key: " << entry->key - << " rank: " << BytePSGlobal::GetLocalRank(); - return; + } + BPS_CHECK(entry->tensor_name != ""); + BPS_LOG(TRACE) << "Queue " << LogStrings[_qt] + << " addTask: " << entry->tensor_name << " key: " << entry->key + << " rank: " << BytePSGlobal::GetLocalRank(); + return; } std::shared_ptr BytePSScheduledQueue::getTask() { - std::lock_guard lock(_mutex); - std::shared_ptr task; - // TODO: below can be optimized -- if we take task from the tail, erase() can be faster - for (auto it = _sq.begin(); it!=_sq.end(); ++it) { - if ((*it)->ready_event) { - if (!(*it)->ready_event->Ready()) { - continue; - } - } - if (_is_scheduled) { - if ((*it)->len > _credits) { - continue; - } - } - if (_rt) { - if (!_rt->IsKeyReady((*it)->key)) { - continue; - } - _rt->ClearReadyCount((*it)->key); - } - task = *it; - _sq.erase(it); - if 
(_is_scheduled) { - _credits -= task->len; - } - - BPS_CHECK(task->tensor_name != ""); - BPS_LOG(TRACE) << "Queue " << LogStrings[_qt] - << " getTask: " << task->tensor_name - << " key: " << task->key - << " rank: " << BytePSGlobal::GetLocalRank(); - return task; + std::lock_guard lock(_mutex); + std::shared_ptr task; + // TODO: below can be optimized -- if we take task from the tail, erase() can + // be faster + for (auto it = _sq.begin(); it != _sq.end(); ++it) { + if ((*it)->ready_event) { + if (!(*it)->ready_event->Ready()) { + continue; + } + } + if (_is_scheduled) { + if ((*it)->len > _credits) { + continue; + } + } + if (_rt) { + if (!_rt->IsKeyReady((*it)->key)) { + continue; + } + _rt->ClearReadyCount((*it)->key); + } + task = *it; + _sq.erase(it); + if (_is_scheduled) { + _credits -= task->len; } - return nullptr; -} -std::shared_ptr BytePSScheduledQueue::getTask(uint64_t key){ - BPS_CHECK(!_is_scheduled); - std::lock_guard lock(_mutex); - std::shared_ptr task; - for (auto it = _sq.begin(); it!=_sq.end(); ++it) { - if ((*it)->ready_event) { - BPS_CHECK((*it)->ready_event->Ready()); - } - if ((*it)->key != (uint64_t)key) { - continue; - } - task = *it; - _sq.erase(it); + BPS_CHECK(task->tensor_name != ""); + BPS_LOG(TRACE) << "Queue " << LogStrings[_qt] + << " getTask: " << task->tensor_name << " key: " << task->key + << " rank: " << BytePSGlobal::GetLocalRank(); + return task; + } + return nullptr; +} - BPS_CHECK(task->tensor_name != ""); - BPS_LOG(TRACE) << "Queue " << LogStrings[_qt] - << " getTask(key): " << task->tensor_name - << " key: " << task->key - << " rank: " << BytePSGlobal::GetLocalRank(); - return task; +std::shared_ptr BytePSScheduledQueue::getTask(uint64_t key) { + BPS_CHECK(!_is_scheduled); + std::lock_guard lock(_mutex); + std::shared_ptr task; + for (auto it = _sq.begin(); it != _sq.end(); ++it) { + if ((*it)->ready_event) { + BPS_CHECK((*it)->ready_event->Ready()); + } + if ((*it)->key != (uint64_t)key) { + continue; } - return nullptr; + task = *it; + _sq.erase(it); + + BPS_CHECK(task->tensor_name != ""); + BPS_LOG(TRACE) << "Queue " << LogStrings[_qt] + << " getTask(key): " << task->tensor_name + << " key: " << task->key + << " rank: " << BytePSGlobal::GetLocalRank(); + return task; + } + return nullptr; } uint32_t BytePSScheduledQueue::pendingSize() { - std::lock_guard lock(_mutex); - return _sq.size(); + std::lock_guard lock(_mutex); + return _sq.size(); } void BytePSScheduledQueue::reportFinish(int size) { - if (_is_scheduled) { - std::lock_guard lock(_mutex); - _credits += size; - } - return; + if (_is_scheduled) { + std::lock_guard lock(_mutex); + _credits += size; + } + return; } -} // namespace common -} // namespace byteps +} // namespace common +} // namespace byteps diff --git a/byteps/common/scheduled_queue.h b/byteps/common/scheduled_queue.h index cc6097fd4..59ed3800b 100644 --- a/byteps/common/scheduled_queue.h +++ b/byteps/common/scheduled_queue.h @@ -17,9 +17,9 @@ #define BYTEPS_SCHEDULED_QUEUE_H #include -#include #include #include +#include #include "common.h" #include "ready_table.h" @@ -27,28 +27,26 @@ namespace byteps { namespace common { class BytePSScheduledQueue { - -public: - BytePSScheduledQueue(QueueType type); - QueueType getQueueType() { return _qt; } - void addTask(std::shared_ptr); - std::shared_ptr getTask(); - std::shared_ptr getTask(uint64_t key); - uint32_t pendingSize(); - void reportFinish(int size); - -private: - // TODO: use priority queue or heap - std::vector> _sq; - std::mutex _mutex; - uint64_t _credits; - bool 
_is_scheduled; - QueueType _qt; - ReadyTable *_rt; + public: + BytePSScheduledQueue(QueueType type); + QueueType getQueueType() { return _qt; } + void addTask(std::shared_ptr); + std::shared_ptr getTask(); + std::shared_ptr getTask(uint64_t key); + uint32_t pendingSize(); + void reportFinish(int size); + + private: + // TODO: use priority queue or heap + std::vector> _sq; + std::mutex _mutex; + uint64_t _credits; + bool _is_scheduled; + QueueType _qt; + ReadyTable *_rt; }; +} // namespace common +} // namespace byteps -} // namespace common -} // namespace byteps - -#endif // BYTEPS_SCHEDULED_QUEUE_H \ No newline at end of file +#endif // BYTEPS_SCHEDULED_QUEUE_H diff --git a/byteps/common/shared_memory.cc b/byteps/common/shared_memory.cc index e0ec8961f..ec86d166f 100644 --- a/byteps/common/shared_memory.cc +++ b/byteps/common/shared_memory.cc @@ -13,72 +13,69 @@ // limitations under the License. // ============================================================================= -#include +#include "shared_memory.h" #include +#include #include #include -#include #include -#include - -#include "shared_memory.h" +#include #include "global.h" namespace byteps { namespace common { -void* BytePSSharedMemory::openSharedMemory(const std::string &prefix, uint64_t key, size_t size) { - std::string shm_name(prefix); - shm_name += std::to_string(key); - int shm_fd = shm_open(shm_name.c_str(), O_CREAT | O_RDWR, 0666); - BPS_CHECK_GE(shm_fd, 0) << "shm_open failed for " << shm_name; +void* BytePSSharedMemory::openSharedMemory(const std::string& prefix, + uint64_t key, size_t size) { + std::string shm_name(prefix); + shm_name += std::to_string(key); + int shm_fd = shm_open(shm_name.c_str(), O_CREAT | O_RDWR, 0666); + BPS_CHECK_GE(shm_fd, 0) << "shm_open failed for " << shm_name; - BPS_CHECK_GE(ftruncate(shm_fd, size), 0) << strerror(errno); + BPS_CHECK_GE(ftruncate(shm_fd, size), 0) << strerror(errno); - void* ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0); - CUDA_CALL(cudaHostRegister(ptr, size, cudaHostRegisterDefault)); - // mlock(ptr, size); + void* ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0); + CUDA_CALL(cudaHostRegister(ptr, size, cudaHostRegisterDefault)); + // mlock(ptr, size); - BPS_CHECK_NE(ptr, (void *)-1) << strerror(errno); + BPS_CHECK_NE(ptr, (void*)-1) << strerror(errno); - BPS_LOG(TRACE) << "initialized share memory size " << size; + BPS_LOG(TRACE) << "initialized share memory size " << size; - std::lock_guard lock(_shm_mu); - _key_shm_addr[shm_name] = ptr; - _key_shm_size[shm_name] = size; - return ptr; + std::lock_guard lock(_shm_mu); + _key_shm_addr[shm_name] = ptr; + _key_shm_size[shm_name] = size; + return ptr; } -std::vector BytePSSharedMemory::openPcieSharedMemory(uint64_t key, size_t size) { - std::vector r; - for (int i = 0; i < BytePSGlobal::GetPcieSwitchNum(); i++) { - auto prefix = std::string("BytePS_Pcie") + std::to_string(i) + "_Shm_"; - if (BytePSGlobal::IsDistributed()) { - if (i <= numa_max_node()) { - numa_set_preferred(i); - } - else { - numa_set_preferred(numa_max_node()); - } - r.push_back(openSharedMemory(prefix, key, size)); - numa_set_preferred(-1); - } - else { - if (BytePSGlobal::IsCrossPcieSwitch()) { - numa_set_interleave_mask(numa_all_nodes_ptr); - r.push_back(openSharedMemory(prefix, key, size)); - numa_set_interleave_mask(numa_no_nodes_ptr); - } - else { - numa_set_preferred(0); - r.push_back(openSharedMemory(prefix, key, size)); - numa_set_preferred(-1); - } - } +std::vector 
BytePSSharedMemory::openPcieSharedMemory(uint64_t key, + size_t size) { + std::vector r; + for (int i = 0; i < BytePSGlobal::GetPcieSwitchNum(); i++) { + auto prefix = std::string("BytePS_Pcie") + std::to_string(i) + "_Shm_"; + if (BytePSGlobal::IsDistributed()) { + if (i <= numa_max_node()) { + numa_set_preferred(i); + } else { + numa_set_preferred(numa_max_node()); + } + r.push_back(openSharedMemory(prefix, key, size)); + numa_set_preferred(-1); + } else { + if (BytePSGlobal::IsCrossPcieSwitch()) { + numa_set_interleave_mask(numa_all_nodes_ptr); + r.push_back(openSharedMemory(prefix, key, size)); + numa_set_interleave_mask(numa_no_nodes_ptr); + } else { + numa_set_preferred(0); + r.push_back(openSharedMemory(prefix, key, size)); + numa_set_preferred(-1); + } } - return r; + } + return r; } -} // namespace common +} // namespace common -} // namespace byteps \ No newline at end of file +} // namespace byteps diff --git a/byteps/common/shared_memory.h b/byteps/common/shared_memory.h index daf566ee2..f402a7cf9 100644 --- a/byteps/common/shared_memory.h +++ b/byteps/common/shared_memory.h @@ -16,50 +16,47 @@ #ifndef BYTEPS_SHARED_MEMORY_H #define BYTEPS_SHARED_MEMORY_H +#include +#include +#include #include #include -#include -#include +#include #include #include -#include -#include -#include +#include +#include #include "logging.h" namespace byteps { namespace common { class BytePSSharedMemory { - -public: - - BytePSSharedMemory() {} - - ~BytePSSharedMemory() { - for (auto &it : _key_shm_addr) { - CUDA_CALL(cudaHostUnregister(it.second)); - munmap(it.second, _key_shm_size[it.first]); - shm_unlink(it.first.c_str()); - } - - BPS_LOG(DEBUG) << "Clear BytePSSharedMemory: All BytePS shared memory released/unregistered."; + public: + BytePSSharedMemory() {} + + ~BytePSSharedMemory() { + for (auto &it : _key_shm_addr) { + CUDA_CALL(cudaHostUnregister(it.second)); + munmap(it.second, _key_shm_size[it.first]); + shm_unlink(it.first.c_str()); } - void* openSharedMemory(const std::string &prefix, uint64_t key, size_t size); - std::vector openPcieSharedMemory(uint64_t key, size_t size); + BPS_LOG(DEBUG) << "Clear BytePSSharedMemory: All BytePS shared memory " + "released/unregistered."; + } -private: + void *openSharedMemory(const std::string &prefix, uint64_t key, size_t size); + std::vector openPcieSharedMemory(uint64_t key, size_t size); - std::unordered_map _key_shm_addr; - std::unordered_map _key_shm_size; - - std::mutex _shm_mu; + private: + std::unordered_map _key_shm_addr; + std::unordered_map _key_shm_size; + std::mutex _shm_mu; }; +} // namespace common +} // namespace byteps -} // namespace common -} // namespace byteps - -#endif // BYTEPS_SHARED_MEMORY_H \ No newline at end of file +#endif // BYTEPS_SHARED_MEMORY_H diff --git a/byteps/keras/__init__.py b/byteps/keras/__init__.py index 37e28d4ef..2c5da1c3f 100644 --- a/byteps/keras/__init__.py +++ b/byteps/keras/__init__.py @@ -78,6 +78,7 @@ def push_pull(value, name=None, average=True): """ return _impl.push_pull(K, value, name, average) + def broadcast(value, root_rank, name=None): """ Perform a broadcast on a tensor-compatible value. 
@@ -119,4 +120,4 @@ def load_model(filepath, custom_optimizers=None, custom_objects=None, compressio """ def wrap_optimizer(cls): return lambda **kwargs: DistributedOptimizer(cls(**kwargs), compression=compression) - return _impl.load_model(keras, wrap_optimizer, filepath, custom_optimizers, custom_objects) \ No newline at end of file + return _impl.load_model(keras, wrap_optimizer, filepath, custom_optimizers, custom_objects) diff --git a/byteps/keras/callbacks.py b/byteps/keras/callbacks.py index 9adb02489..997e80446 100644 --- a/byteps/keras/callbacks.py +++ b/byteps/keras/callbacks.py @@ -131,4 +131,4 @@ def __init__(self, warmup_epochs=5, momentum_correction=True, steps_per_epoch=No verbose: verbosity mode, 0 or 1. """ super(LearningRateWarmupCallback, self).__init__(K, warmup_epochs, momentum_correction, - steps_per_epoch, verbose) \ No newline at end of file + steps_per_epoch, verbose) diff --git a/byteps/mxnet/adapter.cc b/byteps/mxnet/adapter.cc index 8c6fe04bc..98b99ad8f 100644 --- a/byteps/mxnet/adapter.cc +++ b/byteps/mxnet/adapter.cc @@ -25,13 +25,16 @@ namespace byteps { namespace mxnet { -template MXTensor::MXTensor(T* tensor) : tensor_(tensor) {} +template +MXTensor::MXTensor(T* tensor) : tensor_(tensor) {} -template const DataType MXTensor::dtype() const { +template +const DataType MXTensor::dtype() const { return TensorUtil::GetDType(tensor_); } -template const TensorShape MXTensor::shape() const { +template +const TensorShape MXTensor::shape() const { auto shape = TensorUtil::GetShape(tensor_); if (shape.dims() == 0) { // Tensor with empty shape is a Tensor with no values in MXNet, unlike a @@ -42,15 +45,17 @@ template const TensorShape MXTensor::shape() const { return shape; } -template const void* MXTensor::data() const { +template +const void* MXTensor::data() const { return TensorUtil::GetData(tensor_); } -template int64_t MXTensor::size() const { +template +int64_t MXTensor::size() const { return TensorUtil::GetSize(tensor_); } template class MXTensor; -} // namespace mxnet -} // namespace byteps +} // namespace mxnet +} // namespace byteps diff --git a/byteps/mxnet/adapter.h b/byteps/mxnet/adapter.h index d2b2c2a24..51cf3c82b 100644 --- a/byteps/mxnet/adapter.h +++ b/byteps/mxnet/adapter.h @@ -18,7 +18,6 @@ #define BYTEPS_MXNET_ADAPTER_H #include - #include "../common/common.h" namespace byteps { @@ -26,15 +25,16 @@ namespace mxnet { using namespace byteps::common; -template class MXTensor : public Tensor { -public: +template +class MXTensor : public Tensor { + public: MXTensor(T* tensor); virtual const DataType dtype() const override; virtual const TensorShape shape() const override; virtual const void* data() const override; virtual int64_t size() const override; -protected: + protected: T* tensor_; }; @@ -44,7 +44,7 @@ inline void ThrowIfError(const Status& status) { } } -} // namespace mxnet -} // namespace byteps +} // namespace mxnet +} // namespace byteps -#endif // BYTEPS_MXNET_ADAPTER_H +#endif // BYTEPS_MXNET_ADAPTER_H diff --git a/byteps/mxnet/cuda_util.cc b/byteps/mxnet/cuda_util.cc index e511fc52c..9d14a2974 100644 --- a/byteps/mxnet/cuda_util.cc +++ b/byteps/mxnet/cuda_util.cc @@ -15,8 +15,8 @@ // ============================================================================= #if HAVE_CUDA -#include "cuda_runtime.h" #include +#include "cuda_runtime.h" #endif #include "../common/common.h" @@ -34,8 +34,9 @@ with_device::with_device(int device) { CUDA_CALL(cudaGetDevice(&restore_device_)); CUDA_CALL(cudaSetDevice(device)); #else - throw 
std::logic_error("Internal error. Requested device context manager " - "with GPU device but not compiled with CUDA."); + throw std::logic_error( + "Internal error. Requested device context manager " + "with GPU device but not compiled with CUDA."); #endif } } @@ -48,5 +49,5 @@ with_device::~with_device() { #endif } -} // namespace mxnet -} // namespace byteps +} // namespace mxnet +} // namespace byteps diff --git a/byteps/mxnet/cuda_util.h b/byteps/mxnet/cuda_util.h index 4c12551d3..352ef8807 100644 --- a/byteps/mxnet/cuda_util.h +++ b/byteps/mxnet/cuda_util.h @@ -21,15 +21,15 @@ namespace byteps { namespace mxnet { class with_device { -public: + public: with_device(int device); ~with_device(); -private: + private: int restore_device_; }; -} // namespace mxnet -} // namespace byteps +} // namespace mxnet +} // namespace byteps -#endif // BYTEPS_MXNET_CUDA_UTIL_H +#endif // BYTEPS_MXNET_CUDA_UTIL_H diff --git a/byteps/mxnet/ops.cc b/byteps/mxnet/ops.cc index 1ca7d545c..0fe774fc0 100644 --- a/byteps/mxnet/ops.cc +++ b/byteps/mxnet/ops.cc @@ -14,13 +14,11 @@ // limitations under the License. // ============================================================================= +#include "ops.h" #include - #include "../common/operations.h" - #include "adapter.h" #include "cuda_util.h" -#include "ops.h" #include "ready_event.h" #include "tensor_util.h" @@ -32,82 +30,84 @@ namespace { std::atomic_int op_count; std::string GetOpName(std::string prefix, char* name) { - if (name != nullptr) { - return prefix + "." + std::string(name); - } + if (name != nullptr) { + return prefix + "." + std::string(name); + } - op_count.fetch_add(1); - return prefix + ".noname." + std::to_string(op_count); + op_count.fetch_add(1); + return prefix + ".noname." + std::to_string(op_count); } -} // namespace +} // namespace inline void InvokeCompleteCallback(Callback on_complete, const Status& status) { - if (status.ok()) { - on_complete(); - } else { - auto error = dmlc::Error(status.reason()); - on_complete(&error); - } + if (status.ok()) { + on_complete(); + } else { + auto error = dmlc::Error(status.reason()); + on_complete(&error); + } } -void DoPushPull(BPSContext &context, NDArray* input, int version, int priority, - Callback on_complete) { - ThrowIfError(common::CheckInitialized()); - - auto device = TensorUtil::GetDevice(input); - auto byteps_input = std::make_shared>(input); - auto queue_list = common::GetPushQueueList(device); - auto queue_list_pull = common::GetPullQueueList(device); - queue_list->insert(queue_list->end(), - queue_list_pull->begin(), queue_list_pull->end()); - - auto enqueue_result = - common::EnqueueTensor(context, byteps_input, byteps_input, nullptr, - device, priority, version, - [on_complete](const Status& status) { - InvokeCompleteCallback(on_complete, status); - }, queue_list); - ThrowIfError(enqueue_result); +void DoPushPull(BPSContext& context, NDArray* input, int version, int priority, + Callback on_complete) { + ThrowIfError(common::CheckInitialized()); + + auto device = TensorUtil::GetDevice(input); + auto byteps_input = std::make_shared>(input); + auto queue_list = common::GetPushQueueList(device); + auto queue_list_pull = common::GetPullQueueList(device); + queue_list->insert(queue_list->end(), queue_list_pull->begin(), + queue_list_pull->end()); + + auto enqueue_result = common::EnqueueTensor( + context, byteps_input, byteps_input, nullptr, device, priority, version, + [on_complete](const Status& status) { + InvokeCompleteCallback(on_complete, status); + }, + queue_list); + 
ThrowIfError(enqueue_result); } -extern "C" int byteps_mxnet_push_pull_async(NDArray* tensor, - char* name, int version, int priority, bool is_average) { - MX_API_BEGIN(); - - std::string tensor_name = GetOpName("byteps", name); - - auto& context = common::GetContextFromName(tensor_name); - auto dtype = TensorUtil::GetDType(tensor); - auto size = TensorUtil::GetSize(tensor); - auto device = TensorUtil::GetDevice(tensor); - void* cpubuff = (device == CPU_DEVICE_ID) ? - const_cast(std::make_shared>(tensor)->data()) : nullptr; - common::InitTensor(context, size, dtype, cpubuff); - - auto push_pull_async_fn = [&context, tensor, version, priority](RunContext rctx, - Callback on_complete) mutable { - DoPushPull(context, tensor, version, priority, on_complete); - }; - - Engine::Get()->PushAsync(push_pull_async_fn, Context::CPU(), - {}, {tensor->var()}, - FnProperty::kCPUPrioritized, 0, "BytePSPushPull"); - - if (is_average) { - // average the aggregated gradient - auto num_worker = byteps_size(); - *tensor /= num_worker; - } - - MX_API_END(); +extern "C" int byteps_mxnet_push_pull_async(NDArray* tensor, char* name, + int version, int priority, + bool is_average) { + MX_API_BEGIN(); + + std::string tensor_name = GetOpName("byteps", name); + + auto& context = common::GetContextFromName(tensor_name); + auto dtype = TensorUtil::GetDType(tensor); + auto size = TensorUtil::GetSize(tensor); + auto device = TensorUtil::GetDevice(tensor); + void* cpubuff = (device == CPU_DEVICE_ID) + ? const_cast( + std::make_shared>(tensor)->data()) + : nullptr; + common::InitTensor(context, size, dtype, cpubuff); + + auto push_pull_async_fn = [&context, tensor, version, priority]( + RunContext rctx, Callback on_complete) mutable { + DoPushPull(context, tensor, version, priority, on_complete); + }; + + Engine::Get()->PushAsync(push_pull_async_fn, Context::CPU(), {}, + {tensor->var()}, FnProperty::kCPUPrioritized, 0, + "BytePSPushPull"); + + if (is_average) { + // average the aggregated gradient + auto num_worker = byteps_size(); + *tensor /= num_worker; + } + + MX_API_END(); } extern "C" void byteps_mxnet_declare_tensor(NDArray* tensor, char* name) { - std::string tensor_name = GetOpName("byteps", name); - common::IsTensorDeclared(tensor_name); - return; + std::string tensor_name = GetOpName("byteps", name); + common::IsTensorDeclared(tensor_name); + return; } - -} // namespace mxnet -} // namespace byteps +} // namespace mxnet +} // namespace byteps diff --git a/byteps/mxnet/ops.h b/byteps/mxnet/ops.h index 8193d21c7..beb6eb0e4 100644 --- a/byteps/mxnet/ops.h +++ b/byteps/mxnet/ops.h @@ -22,6 +22,7 @@ #include #include #include +#include "../common/common.h" namespace byteps { namespace mxnet { @@ -32,12 +33,13 @@ typedef ::mxnet::Engine Engine; typedef ::mxnet::NDArray NDArray; typedef ::mxnet::Engine::CallbackOnComplete Callback; -extern "C" int byteps_mxnet_push_pull_async(NDArray* input, - char* name, int version, int priority, bool is_average); +extern "C" int byteps_mxnet_push_pull_async(NDArray* input, char* name, + int version, int priority, + bool is_average); extern "C" void byteps_mxnet_declare_tensor(NDArray* tensor, char* name); -} // namespace mxnet -} // namespace byteps +} // namespace mxnet +} // namespace byteps -#endif // BYTEPS_MXNET_OPS_H +#endif // BYTEPS_MXNET_OPS_H diff --git a/byteps/mxnet/ops.py b/byteps/mxnet/ops.py index 69d72ac24..9a89d250e 100644 --- a/byteps/mxnet/ops.py +++ b/byteps/mxnet/ops.py @@ -68,12 +68,13 @@ def byteps_push_pull(tensor, version=0, priority=0, name=None, 
is_average=True): c_in = tensor.handle if isinstance(name, string_types): check_call(MXNET_LIB_CTYPES.byteps_mxnet_push_pull_async(c_in, - c_str(name), ctypes.c_int(version), ctypes.c_int(priority), ctypes.c_bool(is_average))) + c_str(name), ctypes.c_int(version), ctypes.c_int(priority), ctypes.c_bool(is_average))) else: check_call(MXNET_LIB_CTYPES.byteps_mxnet_push_pull_async(c_in, - name, ctypes.c_int(version), ctypes.c_int(priority), ctypes.c_bool(is_average))) + name, ctypes.c_int(version), ctypes.c_int(priority), ctypes.c_bool(is_average))) return + def byteps_declare_tensor(tensor, name): check_call(MXNET_LIB_CTYPES.byteps_mxnet_declare_tensor(tensor.handle, c_str(name))) diff --git a/byteps/mxnet/ready_event.cc b/byteps/mxnet/ready_event.cc index ba0374016..79b3d478b 100644 --- a/byteps/mxnet/ready_event.cc +++ b/byteps/mxnet/ready_event.cc @@ -29,12 +29,16 @@ MXReadyEvent::MXReadyEvent(NDArray* tensor) : tensor_(tensor) { assert(tensor->ctx().real_dev_id() != CPU_DEVICE_ID); } -template MXReadyEvent::~MXReadyEvent() {} +template +MXReadyEvent::~MXReadyEvent() {} -template bool MXReadyEvent::Ready() const { return true; } +template +bool MXReadyEvent::Ready() const { + return true; +} template class MXReadyEvent; -} // namespace mxnet -} // namespace byteps +} // namespace mxnet +} // namespace byteps #endif diff --git a/byteps/mxnet/ready_event.h b/byteps/mxnet/ready_event.h index 0786d773f..2ce3bbe4d 100644 --- a/byteps/mxnet/ready_event.h +++ b/byteps/mxnet/ready_event.h @@ -20,10 +20,10 @@ #include #if HAVE_CUDA -#include "cuda_runtime.h" #include #include #include +#include "cuda_runtime.h" #include "../common/common.h" @@ -33,18 +33,19 @@ namespace mxnet { using namespace byteps::common; typedef ::mxnet::NDArray NDArray; -template class MXReadyEvent : public ReadyEvent { -public: +template +class MXReadyEvent : public ReadyEvent { + public: MXReadyEvent(NDArray* tensor); ~MXReadyEvent(); virtual bool Ready() const override; -private: + private: NDArray* tensor_; }; -} // namespace mxnet -} // namespace byteps +} // namespace mxnet +} // namespace byteps #endif -#endif // BYTEPS_MXNET_READY_EVENT_H +#endif // BYTEPS_MXNET_READY_EVENT_H diff --git a/byteps/mxnet/tensor_util.cc b/byteps/mxnet/tensor_util.cc index 5d760d537..4cc48d30f 100644 --- a/byteps/mxnet/tensor_util.cc +++ b/byteps/mxnet/tensor_util.cc @@ -22,23 +22,24 @@ namespace mxnet { // Define all types for TensorUtil. 
const DataType TensorUtil::GetDType(NDArray* tensor) { switch (tensor->dtype()) { - case mshadow::kFloat32: - return DataType::BYTEPS_FLOAT32; - case mshadow::kFloat64: - return DataType::BYTEPS_FLOAT64; - case mshadow::kFloat16: - return DataType::BYTEPS_FLOAT16; - case mshadow::kUint8: - return DataType::BYTEPS_UINT8; - case mshadow::kInt32: - return DataType::BYTEPS_INT32; - case mshadow::kInt8: - return DataType::BYTEPS_INT8; - case mshadow::kInt64: - return DataType::BYTEPS_INT64; - default: - throw std::logic_error("GetDType: Type " + std::to_string(tensor->dtype()) + - " is not supported."); + case mshadow::kFloat32: + return DataType::BYTEPS_FLOAT32; + case mshadow::kFloat64: + return DataType::BYTEPS_FLOAT64; + case mshadow::kFloat16: + return DataType::BYTEPS_FLOAT16; + case mshadow::kUint8: + return DataType::BYTEPS_UINT8; + case mshadow::kInt32: + return DataType::BYTEPS_INT32; + case mshadow::kInt8: + return DataType::BYTEPS_INT8; + case mshadow::kInt64: + return DataType::BYTEPS_INT64; + default: + throw std::logic_error("GetDType: Type " + + std::to_string(tensor->dtype()) + + " is not supported."); } } @@ -57,23 +58,23 @@ const void* TensorUtil::GetData(NDArray* tensor) { // The following returns an error: // return tensor->data().dptr(); switch (tensor->dtype()) { - case mshadow::kFloat32: - return static_cast(tensor->data().dptr()); - case mshadow::kFloat64: - return static_cast(tensor->data().dptr()); - case mshadow::kFloat16: - return static_cast(tensor->data().dptr()); - case mshadow::kUint8: - return static_cast(tensor->data().dptr()); - case mshadow::kInt32: - return static_cast(tensor->data().dptr()); - case mshadow::kInt8: - return static_cast(tensor->data().dptr()); - case mshadow::kInt64: - return static_cast(tensor->data().dptr()); - default: - throw std::logic_error("Type " + std::to_string(tensor->dtype()) + - " is not supported in BytePS."); + case mshadow::kFloat32: + return static_cast(tensor->data().dptr()); + case mshadow::kFloat64: + return static_cast(tensor->data().dptr()); + case mshadow::kFloat16: + return static_cast(tensor->data().dptr()); + case mshadow::kUint8: + return static_cast(tensor->data().dptr()); + case mshadow::kInt32: + return static_cast(tensor->data().dptr()); + case mshadow::kInt8: + return static_cast(tensor->data().dptr()); + case mshadow::kInt64: + return static_cast(tensor->data().dptr()); + default: + throw std::logic_error("Type " + std::to_string(tensor->dtype()) + + " is not supported in BytePS."); } } @@ -81,30 +82,30 @@ const void* TensorUtil::GetData(NDArray* tensor) { int64_t TensorUtil::GetSize(NDArray* tensor) { int64_t element_size = 0; switch (tensor->dtype()) { - case mshadow::kFloat32: - element_size = kFloat32Size; - break; - case mshadow::kFloat64: - element_size = kFloat64Size; - break; - case mshadow::kFloat16: - element_size = kFloat16Size; - break; - case mshadow::kUint8: - element_size = kUInt8Size; - break; - case mshadow::kInt32: - element_size = kInt32Size; - break; - case mshadow::kInt8: - element_size = kInt8Size; - break; - case mshadow::kInt64: - element_size = kInt64Size; - break; - default: - throw std::logic_error("Type " + std::to_string(tensor->dtype()) + - " is not supported in BytePS."); + case mshadow::kFloat32: + element_size = kFloat32Size; + break; + case mshadow::kFloat64: + element_size = kFloat64Size; + break; + case mshadow::kFloat16: + element_size = kFloat16Size; + break; + case mshadow::kUint8: + element_size = kUInt8Size; + break; + case mshadow::kInt32: + element_size = kInt32Size; 
+ break; + case mshadow::kInt8: + element_size = kInt8Size; + break; + case mshadow::kInt64: + element_size = kInt64Size; + break; + default: + throw std::logic_error("Type " + std::to_string(tensor->dtype()) + + " is not supported in BytePS."); } return (int64_t)(tensor->shape().Size()) * element_size; } @@ -113,8 +114,7 @@ int64_t TensorUtil::GetSize(NDArray* tensor) { // Otherwise return CPU_DEVICE_ID (-1) int TensorUtil::GetDevice(NDArray* tensor) { int dev_mask = tensor->ctx().dev_mask(); - if (dev_mask == gpu::kDevMask) - return tensor->ctx().real_dev_id(); + if (dev_mask == gpu::kDevMask) return tensor->ctx().real_dev_id(); return CPU_DEVICE_ID; } @@ -156,5 +156,5 @@ void TensorUtil::AsyncCopyCudaToCPU(NDArray* cuda, NDArray* cpu) { } #endif -} // namespace mxnet -} // namespace byteps +} // namespace mxnet +} // namespace byteps diff --git a/byteps/mxnet/tensor_util.h b/byteps/mxnet/tensor_util.h index 03401d32a..c1474c86b 100644 --- a/byteps/mxnet/tensor_util.h +++ b/byteps/mxnet/tensor_util.h @@ -17,10 +17,9 @@ #ifndef BYTEPS_MXNET_TENSOR_UTIL_H #define BYTEPS_MXNET_TENSOR_UTIL_H -#include #include #include - +#include #include "../common/common.h" #include "cuda_util.h" #include "util.h" @@ -32,7 +31,7 @@ using namespace byteps::common; using namespace ::mxnet; class TensorUtil { -public: + public: static const DataType GetDType(NDArray* tensor); static const TensorShape GetShape(NDArray* tensor); static const void* GetData(NDArray* tensor); @@ -49,7 +48,7 @@ class TensorUtil { static void AsyncCopyCudaToCPU(NDArray* cuda, NDArray* cpu); #endif -private: + private: static const size_t kFloat32Size = 4; static const size_t kFloat64Size = 8; static const size_t kFloat16Size = 2; @@ -59,7 +58,7 @@ class TensorUtil { static const size_t kInt64Size = 8; }; -} // namespace mxnet -} // namespace byteps +} // namespace mxnet +} // namespace byteps -#endif // BYTEPS_MXNET_TENSOR_UTIL_H +#endif // BYTEPS_MXNET_TENSOR_UTIL_H diff --git a/byteps/mxnet/util.h b/byteps/mxnet/util.h index 28466c319..543ecf6f4 100644 --- a/byteps/mxnet/util.h +++ b/byteps/mxnet/util.h @@ -27,13 +27,13 @@ * * It checks for CUDA errors after invocation of the expression. */ -#define CUDA_CALL(func) \ - { \ - cudaError_t e = (func); \ - CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ - << "CUDA: " << cudaGetErrorString(e); \ +#define CUDA_CALL(func) \ + { \ + cudaError_t e = (func); \ + CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ + << "CUDA: " << cudaGetErrorString(e); \ } -#endif // HAVE_CUDA +#endif // HAVE_CUDA -#endif // BYTEPS_MXNET_UTIL_H +#endif // BYTEPS_MXNET_UTIL_H diff --git a/byteps/tensorflow/__init__.py b/byteps/tensorflow/__init__.py index 456bbacb3..2a2bd0f6f 100644 --- a/byteps/tensorflow/__init__.py +++ b/byteps/tensorflow/__init__.py @@ -28,6 +28,7 @@ import tensorflow as tf + def push_pull(tensor, scope='', average=True, device_dense='', device_sparse='', compression=Compression.none): """Perform an push_pull on a tf.Tensor or tf.IndexedSlices. 
@@ -52,7 +53,7 @@ def push_pull(tensor, scope='', average=True, device_dense='', device_sparse='', summed_tensor_compressed = _push_pull(tensor_compressed, scope) summed_tensor = compression.decompress(summed_tensor_compressed, ctx) new_tensor = (tf.div(summed_tensor, byteps_size) - if average else summed_tensor) + if average else summed_tensor) return new_tensor @@ -277,4 +278,4 @@ def DistributedGradientTape(gradtape, device_dense='', device_sparse='', else: return cls(gradtape._tape, device_dense, device_sparse, compression, sparse_as_dense, - gradtape._persistent) \ No newline at end of file + gradtape._persistent) diff --git a/byteps/tensorflow/compression.py b/byteps/tensorflow/compression.py index 0e7f35ce8..1fe2b868f 100644 --- a/byteps/tensorflow/compression.py +++ b/byteps/tensorflow/compression.py @@ -72,4 +72,4 @@ class Compression(object): none = NoneCompressor """Compress all floating point gradients to 16-bit.""" - fp16 = FP16Compressor \ No newline at end of file + fp16 = FP16Compressor diff --git a/byteps/tensorflow/ops.cc b/byteps/tensorflow/ops.cc index 85fe5cc5a..196a72a63 100644 --- a/byteps/tensorflow/ops.cc +++ b/byteps/tensorflow/ops.cc @@ -30,18 +30,18 @@ namespace { ::tensorflow::Status ConvertStatus(const common::Status& status) { switch (status.type()) { - case common::OK: - return ::tensorflow::Status::OK(); - case common::UNKNOWN_ERROR: - return ::tensorflow::errors::Unknown(status.reason()); - case common::PRECONDITION_ERROR: - return ::tensorflow::errors::FailedPrecondition(status.reason()); - case common::ABORTED: - return ::tensorflow::errors::Aborted(status.reason()); - case common::INVALID_ARGUMENT: - return ::tensorflow::errors::InvalidArgument(status.reason()); - default: - return ::tensorflow::errors::Unknown("Unknown error."); + case common::OK: + return ::tensorflow::Status::OK(); + case common::UNKNOWN_ERROR: + return ::tensorflow::errors::Unknown(status.reason()); + case common::PRECONDITION_ERROR: + return ::tensorflow::errors::FailedPrecondition(status.reason()); + case common::ABORTED: + return ::tensorflow::errors::Aborted(status.reason()); + case common::INVALID_ARGUMENT: + return ::tensorflow::errors::InvalidArgument(status.reason()); + default: + return ::tensorflow::errors::Unknown("Unknown error."); } } @@ -57,32 +57,32 @@ int GetDeviceID(::tensorflow::OpKernelContext* context) { // Define all types for TensorUtil. 
const common::DataType ConvertDType(int dtype) { switch (dtype) { - case ::tensorflow::DT_UINT8: - return common::BYTEPS_UINT8; - case ::tensorflow::DT_INT8: - return common::BYTEPS_INT8; - // case ::tensorflow::DT_UINT16: - // return common::BYTEPS_UINT16; - // case ::tensorflow::DT_INT16: - // return common::BYTEPS_INT16; - case ::tensorflow::DT_INT32: - return common::BYTEPS_INT32; - case ::tensorflow::DT_INT64: - return common::BYTEPS_INT64; - case ::tensorflow::DT_HALF: - return common::BYTEPS_FLOAT16; - case ::tensorflow::DT_FLOAT: - return common::BYTEPS_FLOAT32; - case ::tensorflow::DT_DOUBLE: - return common::BYTEPS_FLOAT64; - // case ::tensorflow::DT_BOOL: - // return common::BYTEPS_BOOL; - default: - throw std::logic_error("Invalid tensor type."); + case ::tensorflow::DT_UINT8: + return common::BYTEPS_UINT8; + case ::tensorflow::DT_INT8: + return common::BYTEPS_INT8; + // case ::tensorflow::DT_UINT16: + // return common::BYTEPS_UINT16; + // case ::tensorflow::DT_INT16: + // return common::BYTEPS_INT16; + case ::tensorflow::DT_INT32: + return common::BYTEPS_INT32; + case ::tensorflow::DT_INT64: + return common::BYTEPS_INT64; + case ::tensorflow::DT_HALF: + return common::BYTEPS_FLOAT16; + case ::tensorflow::DT_FLOAT: + return common::BYTEPS_FLOAT32; + case ::tensorflow::DT_DOUBLE: + return common::BYTEPS_FLOAT64; + // case ::tensorflow::DT_BOOL: + // return common::BYTEPS_BOOL; + default: + throw std::logic_error("Invalid tensor type."); } } -} // namespace +} // namespace TFReadyEvent::TFReadyEvent(::tensorflow::DeviceContext* device_context) { auto executor = device_context->stream()->parent(); @@ -111,7 +111,9 @@ const common::TensorShape TFTensor::shape() const { return shape; } -const void* TFTensor::data() const { return (const void*)tensor_.tensor_data().data(); } +const void* TFTensor::data() const { + return (const void*)tensor_.tensor_data().data(); +} int64_t TFTensor::size() const { return (int64_t)tensor_.tensor_data().size(); } @@ -126,48 +128,49 @@ common::ReadyEvent* RecordReadyEvent(::tensorflow::OpKernelContext* context) { } extern "C" void byteps_tensorflow_declare_tensor(char* name) { - std::string tensor_name(name); - common::IsTensorDeclared(tensor_name); - return; + std::string tensor_name(name); + common::IsTensorDeclared(tensor_name); + return; } void StartTask(::tensorflow::OpKernelContext* context, ::tensorflow::AsyncOpKernel::DoneCallback done, - std::string node_name, - std::shared_ptr byteps_input, + std::string node_name, std::shared_ptr byteps_input, std::shared_ptr byteps_output, std::shared_ptr ready_event) { auto& byteps_context = common::GetContextFromName(node_name); auto device = GetDeviceID(context); auto size = byteps_input->size(); auto dtype = byteps_input->dtype(); - void* cpubuff = (device == CPU_DEVICE_ID) ? - const_cast(byteps_input->data()) : nullptr; + void* cpubuff = (device == CPU_DEVICE_ID) + ? 
const_cast(byteps_input->data()) + : nullptr; common::InitTensor(byteps_context, size, dtype, cpubuff); auto queue_list = common::GetPushQueueList(device); auto queue_list_pull = common::GetPullQueueList(device); - queue_list->insert(queue_list->end(), - queue_list_pull->begin(), queue_list_pull->end()); + queue_list->insert(queue_list->end(), queue_list_pull->begin(), + queue_list_pull->end()); // TODO: assign priority based on topological sort - auto enqueue_result = EnqueueTensor( - byteps_context, byteps_input, byteps_output, ready_event, - device, -byteps_context.declared_key, 0, - [context, done](const common::Status& status) { - context->SetStatus(ConvertStatus(status)); - done(); - }, queue_list); + auto enqueue_result = + EnqueueTensor(byteps_context, byteps_input, byteps_output, ready_event, + device, -byteps_context.declared_key, 0, + [context, done](const common::Status& status) { + context->SetStatus(ConvertStatus(status)); + done(); + }, + queue_list); OP_REQUIRES_OK_ASYNC(context, ConvertStatus(enqueue_result), done); - } class BytePSPushPullOp : public ::tensorflow::AsyncOpKernel { -public: + public: explicit BytePSPushPullOp(::tensorflow::OpKernelConstruction* context) : AsyncOpKernel(context) {} - void ComputeAsync(::tensorflow::OpKernelContext* context, DoneCallback done) override { + void ComputeAsync(::tensorflow::OpKernelContext* context, + DoneCallback done) override { OP_REQUIRES_OK_ASYNC(context, ConvertStatus(common::CheckInitialized()), done); @@ -176,16 +179,17 @@ class BytePSPushPullOp : public ::tensorflow::AsyncOpKernel { OP_REQUIRES_OK_ASYNC( context, context->allocate_output(0, tensor.shape(), &output), done); // ReadyEvent makes sure input tensor is ready, and output is allocated. - auto ready_event = std::shared_ptr(RecordReadyEvent(context)); + auto ready_event = + std::shared_ptr(RecordReadyEvent(context)); auto bps_input = std::make_shared(tensor); auto bps_output = std::make_shared(*output); auto node_name = name(); auto& bps_context = common::GetContextFromName(node_name); if (bps_context.initialized) { StartTask(context, done, node_name, bps_input, bps_output, ready_event); - } - else { - std::thread t(StartTask, context, done, node_name, bps_input, bps_output, ready_event); + } else { + std::thread t(StartTask, context, done, node_name, bps_input, bps_output, + ready_event); t.detach(); } } @@ -215,5 +219,5 @@ Output sum: A tensor with the same shape as `tensor`, summed across all processes. 
)doc"); -} // namespace tensorflow -} // namespace byteps \ No newline at end of file +} // namespace tensorflow +} // namespace byteps diff --git a/byteps/tensorflow/ops.h b/byteps/tensorflow/ops.h index 842209e8b..50cc64f3e 100644 --- a/byteps/tensorflow/ops.h +++ b/byteps/tensorflow/ops.h @@ -31,30 +31,29 @@ namespace byteps { namespace tensorflow { class TFReadyEvent : public common::ReadyEvent { -public: + public: TFReadyEvent(::tensorflow::DeviceContext* device_context); bool Ready() const override; -private: + private: std::shared_ptr event_; }; class TFTensor : public common::Tensor { -public: + public: TFTensor(::tensorflow::Tensor& tensor); virtual const common::DataType dtype() const override; virtual const common::TensorShape shape() const override; virtual const void* data() const override; virtual int64_t size() const override; -protected: + protected: ::tensorflow::Tensor tensor_; }; extern "C" void byteps_tensorflow_declare_tensor(char* name); +} // namespace tensorflow +} // namespace byteps -} // namespace tensorflow -} // namespace byteps - -#endif // BYTEPS_TENSORFLOW_OPS_H +#endif // BYTEPS_TENSORFLOW_OPS_H diff --git a/byteps/tensorflow/ops.py b/byteps/tensorflow/ops.py index 1032f054b..ac8ebe20b 100644 --- a/byteps/tensorflow/ops.py +++ b/byteps/tensorflow/ops.py @@ -138,6 +138,7 @@ def broadcast(tensor, root_rank, scope='', name=None, is_variable=True): else: return C_LIB.byteps_push_pull(tensor, name=name) + @ops.RegisterGradient('BytePSBroadcast') def _broadcast_grad(op, grad): """Gradient for broadcast op. diff --git a/byteps/torch/__init__.py b/byteps/torch/__init__.py index 1b9ddde99..c42c7122a 100644 --- a/byteps/torch/__init__.py +++ b/byteps/torch/__init__.py @@ -214,7 +214,6 @@ def broadcast_parameters(params, root_rank): synchronize(handle) - def broadcast_optimizer_state(optimizer, root_rank): """ Broadcasts an optimizer state from root rank to all other processes. 
@@ -286,7 +285,8 @@ def _from_tensor(): def _create_option_callback(index, option_key, option_tensor, dtypes): def _from_tensor(): - optimizer.param_groups[index][option_key] = _recursive_cast(option_tensor.numpy()[0], dtypes) + optimizer.param_groups[index][option_key] = _recursive_cast( + option_tensor.numpy()[0], dtypes) return _from_tensor # Param groups are an ordered list, normally there is only one per model, @@ -330,4 +330,4 @@ def _from_tensor(): # Post-broadcast clenaup for non-tensor parameters for key, p in params: if key in callbacks: - callbacks[key]() \ No newline at end of file + callbacks[key]() diff --git a/byteps/torch/adapter.cc b/byteps/torch/adapter.cc index 8c49784c7..30ba74c3f 100644 --- a/byteps/torch/adapter.cc +++ b/byteps/torch/adapter.cc @@ -24,24 +24,24 @@ TorchTensor::TorchTensor(::torch::Tensor tensor) : tensor_(tensor) {} const DataType TorchTensor::dtype() const { switch (tensor_.scalar_type()) { - case ::torch::kByte: - return DataType::BYTEPS_UINT8; - case ::torch::kChar: - return DataType::BYTEPS_INT8; - // case ::torch::kShort: - // return DataType::BYTEPS_INT16; - case ::torch::kInt: - return DataType::BYTEPS_INT32; - case ::torch::kLong: - return DataType::BYTEPS_INT64; - case ::torch::kHalf: - return DataType::BYTEPS_FLOAT16; - case ::torch::kFloat: - return DataType::BYTEPS_FLOAT32; - case ::torch::kDouble: - return DataType::BYTEPS_FLOAT64; - default: - throw std::logic_error("Invalid or unsupported tensor type."); + case ::torch::kByte: + return DataType::BYTEPS_UINT8; + case ::torch::kChar: + return DataType::BYTEPS_INT8; + // case ::torch::kShort: + // return DataType::BYTEPS_INT16; + case ::torch::kInt: + return DataType::BYTEPS_INT32; + case ::torch::kLong: + return DataType::BYTEPS_INT64; + case ::torch::kHalf: + return DataType::BYTEPS_FLOAT16; + case ::torch::kFloat: + return DataType::BYTEPS_FLOAT32; + case ::torch::kDouble: + return DataType::BYTEPS_FLOAT64; + default: + throw std::logic_error("Invalid or unsupported tensor type."); } } @@ -57,7 +57,7 @@ const void* TorchTensor::data() const { return tensor_.data_ptr(); } int64_t TorchTensor::size() const { #if TORCH_VERSION >= 1001000000 - return tensor_.element_size() * tensor_.numel(); + return tensor_.element_size() * tensor_.numel(); #else return tensor_.type().elementSizeInBytes() * tensor_.numel(); #endif @@ -65,18 +65,18 @@ int64_t TorchTensor::size() const { void ThrowIfError(Status status) { switch (status.type()) { - case StatusType::OK: - return; - case StatusType::PRECONDITION_ERROR: - throw std::logic_error(status.reason()); - case StatusType::ABORTED: - throw std::runtime_error(status.reason()); - case StatusType::INVALID_ARGUMENT: - throw std::invalid_argument(status.reason()); - default: // Includes UNKNOWN_ERROR - throw std::runtime_error(status.reason()); + case StatusType::OK: + return; + case StatusType::PRECONDITION_ERROR: + throw std::logic_error(status.reason()); + case StatusType::ABORTED: + throw std::runtime_error(status.reason()); + case StatusType::INVALID_ARGUMENT: + throw std::invalid_argument(status.reason()); + default: // Includes UNKNOWN_ERROR + throw std::runtime_error(status.reason()); } } -} // namespace torch -} // namespace byteps \ No newline at end of file +} // namespace torch +} // namespace byteps diff --git a/byteps/torch/adapter.h b/byteps/torch/adapter.h index 7afb3c261..f94e14a87 100644 --- a/byteps/torch/adapter.h +++ b/byteps/torch/adapter.h @@ -28,20 +28,20 @@ namespace torch { using namespace byteps::common; class TorchTensor : public 
Tensor { -public: + public: TorchTensor(::torch::Tensor tensor); virtual const DataType dtype() const override; virtual const TensorShape shape() const override; virtual const void* data() const override; virtual int64_t size() const override; -protected: + protected: ::torch::Tensor tensor_; }; void ThrowIfError(Status status); -} // namespace torch -} // namespace byteps +} // namespace torch +} // namespace byteps -#endif // BYTEPS_TORCH_ADAPTER_H \ No newline at end of file +#endif // BYTEPS_TORCH_ADAPTER_H diff --git a/byteps/torch/compression.py b/byteps/torch/compression.py index c3de83c61..a7d368980 100644 --- a/byteps/torch/compression.py +++ b/byteps/torch/compression.py @@ -72,4 +72,4 @@ class Compression(object): none = NoneCompressor """Compress all floating point gradients to 16-bit.""" - fp16 = FP16Compressor \ No newline at end of file + fp16 = FP16Compressor diff --git a/byteps/torch/cuda_util.cc b/byteps/torch/cuda_util.cc index 5ba1541d8..5e2116cf2 100644 --- a/byteps/torch/cuda_util.cc +++ b/byteps/torch/cuda_util.cc @@ -14,8 +14,8 @@ // ============================================================================= #if HAVE_CUDA -#include "cuda_runtime.h" #include +#include "cuda_runtime.h" #endif #include "../common/common.h" @@ -32,8 +32,9 @@ with_device::with_device(int device) { THCudaCheck(cudaGetDevice(&restore_device_)); THCudaCheck(cudaSetDevice(device)); #else - throw std::logic_error("Internal error. Requested device context manager " - "with GPU device but not compiled with CUDA."); + throw std::logic_error( + "Internal error. Requested device context manager " + "with GPU device but not compiled with CUDA."); #endif } } @@ -46,5 +47,5 @@ with_device::~with_device() { #endif } -} // namespace torch -} // namespace byteps \ No newline at end of file +} // namespace torch +} // namespace byteps diff --git a/byteps/torch/cuda_util.h b/byteps/torch/cuda_util.h index c2f7780ad..da727dd14 100644 --- a/byteps/torch/cuda_util.h +++ b/byteps/torch/cuda_util.h @@ -17,19 +17,21 @@ #ifndef BYTEPS_TORCH_CUDA_UTIL_H #define BYTEPS_TORCH_CUDA_UTIL_H +#include "../common/common.h" + namespace byteps { namespace torch { class with_device { -public: + public: with_device(int device); ~with_device(); -private: + private: int restore_device_ = CPU_DEVICE_ID; }; -} -} +} // namespace torch +} // namespace byteps -#endif // BYTEPS_TORCH_CUDA_UTIL_H \ No newline at end of file +#endif // BYTEPS_TORCH_CUDA_UTIL_H diff --git a/byteps/torch/handle_manager.cc b/byteps/torch/handle_manager.cc index 7b2f9b7a5..d4ac097b9 100644 --- a/byteps/torch/handle_manager.cc +++ b/byteps/torch/handle_manager.cc @@ -44,12 +44,12 @@ std::shared_ptr HandleManager::ReleaseHandle(int handle) { std::lock_guard guard(mutex_); if (results_.find(handle) == results_.end()) { throw std::invalid_argument("Handle " + std::to_string(handle) + - " was not created or has been cleared."); + " was not created or has been cleared."); } auto status = results_[handle]; results_.erase(handle); return status; } -} // namespace torch -} // namespace byteps \ No newline at end of file +} // namespace torch +} // namespace byteps diff --git a/byteps/torch/handle_manager.h b/byteps/torch/handle_manager.h index 711234c29..9a7f51233 100644 --- a/byteps/torch/handle_manager.h +++ b/byteps/torch/handle_manager.h @@ -30,19 +30,19 @@ namespace torch { using namespace byteps::common; class HandleManager { -public: + public: int AllocateHandle(); void MarkDone(int handle, const Status& status); bool PollHandle(int handle); 
std::shared_ptr ReleaseHandle(int handle); -private: + private: std::atomic_int last_handle_; std::unordered_map> results_; std::mutex mutex_; }; -} // namespace torch -} // namespace byteps +} // namespace torch +} // namespace byteps -#endif // BYTEPS_TORCH_HANDLE_MANAGER_H \ No newline at end of file +#endif // BYTEPS_TORCH_HANDLE_MANAGER_H diff --git a/byteps/torch/ops.cc b/byteps/torch/ops.cc index 7cae528f5..50b75988d 100644 --- a/byteps/torch/ops.cc +++ b/byteps/torch/ops.cc @@ -14,11 +14,11 @@ // limitations under the License. // ============================================================================= +#include +#include #include #include #include -#include -#include #include "../common/operations.h" #include "adapter.h" @@ -48,52 +48,55 @@ int GetDeviceID(const ::torch::Tensor& tensor) { return CPU_DEVICE_ID; } -} // namespace +} // namespace int DoPushPull(::torch::Tensor tensor, ::torch::Tensor output, int average, - const std::string& name, int version, int priority) { - ThrowIfError(common::CheckInitialized()); - - auto handle = handle_manager.AllocateHandle(); - auto device = GetDeviceID(tensor); - auto ready_event = RecordReadyEvent(device); - auto byteps_input = std::make_shared(tensor); - auto byteps_output = std::make_shared(output); - - std::string tensor_name = GetOpName("byteps", name.c_str(), 0); - size_t size = byteps_input->size(); - auto dtype = byteps_input->dtype(); - - // check if we need to init the tensor - if (!common::IsTensorDeclared(tensor_name)) { - // we need to init this tensor with PS - auto& context = common::GetContextFromName(tensor_name); - // the following init is blocking, in order to guarantee the order - common::InitTensor(context, size, dtype, - (device == CPU_DEVICE_ID) ? const_cast(byteps_input->data()) : nullptr); - } - + const std::string& name, int version, int priority) { + ThrowIfError(common::CheckInitialized()); + + auto handle = handle_manager.AllocateHandle(); + auto device = GetDeviceID(tensor); + auto ready_event = RecordReadyEvent(device); + auto byteps_input = std::make_shared(tensor); + auto byteps_output = std::make_shared(output); + + std::string tensor_name = GetOpName("byteps", name.c_str(), 0); + size_t size = byteps_input->size(); + auto dtype = byteps_input->dtype(); + + // check if we need to init the tensor + if (!common::IsTensorDeclared(tensor_name)) { + // we need to init this tensor with PS auto& context = common::GetContextFromName(tensor_name); + // the following init is blocking, in order to guarantee the order + common::InitTensor(context, size, dtype, + (device == CPU_DEVICE_ID) + ? const_cast(byteps_input->data()) + : nullptr); + } + + auto& context = common::GetContextFromName(tensor_name); + + auto queue_list = common::GetPushQueueList(device); + auto queue_list_pull = common::GetPullQueueList(device); + queue_list->insert(queue_list->end(), queue_list_pull->begin(), + queue_list_pull->end()); + + auto enqueue_result = common::EnqueueTensor( + context, byteps_input, byteps_output, ready_event, device, priority, + version, + [handle, average, tensor, output](const Status& status) mutable { + // Will execute in the `device` context. 
+ if (average) { + output.div_(byteps_size()); + } + handle_manager.MarkDone(handle, status); + }, + queue_list); + + ThrowIfError(enqueue_result); - auto queue_list = common::GetPushQueueList(device); - auto queue_list_pull = common::GetPullQueueList(device); - queue_list->insert(queue_list->end(), - queue_list_pull->begin(), queue_list_pull->end()); - - auto enqueue_result = common::EnqueueTensor( - context, byteps_input, byteps_output, ready_event, - device, priority, version, - [handle, average, tensor, output](const Status& status) mutable { - // Will execute in the `device` context. - if (average) { - output.div_(byteps_size()); - } - handle_manager.MarkDone(handle, status); - }, queue_list); - - ThrowIfError(enqueue_result); - - return handle; + return handle; } int PollHandle(int handle) { return handle_manager.PollHandle(handle) ? 1 : 0; } @@ -127,5 +130,5 @@ PYBIND11_MODULE(c_lib, m) { m.def("byteps_torch_wait_and_clear", &WaitAndClear); } -} // namespace torch -} // namespace byteps \ No newline at end of file +} // namespace torch +} // namespace byteps diff --git a/byteps/torch/ops.h b/byteps/torch/ops.h index 763153bf7..ab5d3118e 100644 --- a/byteps/torch/ops.h +++ b/byteps/torch/ops.h @@ -30,9 +30,10 @@ namespace torch { using namespace byteps::common; -#define PUSHPULL_H(torch_Tensor, THTensor) \ - extern "C" int byteps_torch_push_pull_async_##torch_Tensor( \ - THTensor* tensor, THTensor* output, int average, char* name, int version, int priority); +#define PUSHPULL_H(torch_Tensor, THTensor) \ + extern "C" int byteps_torch_push_pull_async_##torch_Tensor( \ + THTensor* tensor, THTensor* output, int average, char* name, \ + int version, int priority); PUSHPULL_H(torch_IntTensor, THIntTensor) PUSHPULL_H(torch_LongTensor, THLongTensor) @@ -49,7 +50,7 @@ PUSHPULL_H(torch_cuda_DoubleTensor, THCudaDoubleTensor) extern "C" int byteps_torch_poll(int handle); extern "C" void byteps_torch_wait_and_clear(int handle); -} // namespace torch -} // namespace byteps +} // namespace torch +} // namespace byteps -#endif // BYTEPS_TORCH_OPS_H \ No newline at end of file +#endif // BYTEPS_TORCH_OPS_H diff --git a/byteps/torch/ops.py b/byteps/torch/ops.py index f8ea0c976..12d5de60d 100644 --- a/byteps/torch/ops.py +++ b/byteps/torch/ops.py @@ -65,8 +65,8 @@ def _push_pull_function_factory(tensor): def _do_push_pull_async(tensor, output, average, name, version=0, priority=0): function = _check_function(_push_pull_function_factory, tensor) handle = getattr(c_lib, function)(tensor, output, average, - name.encode() if name is not None else _NULL, - version, priority) + name.encode() if name is not None else _NULL, + version, priority) _handle_map[handle] = (tensor, output) return handle @@ -107,13 +107,13 @@ def forward(ctx, tensor, average, name, version, priority): @staticmethod def backward(ctx, grad_output): return push_pull(grad_output, - ctx.average, ctx.name, ctx.version, ctx.priority), None, None + ctx.average, ctx.name, ctx.version, ctx.priority), None, None def push_pull(tensor, average=True, name=None, version=0, priority=0, compression=Compression.none): """ A function that performs averaging or summation of the input tensor over all the - BytePS processes. The input tensor is not modified. The reduction operation is keyed + BytePS processes. The input tensor is not modified. The reduction operation is keyed by the name. The name must be provided. The tensor type and shape must be the same on all BytePS processes for a given name. 
The reduction will not start until all processes are ready to send and receive the tensor. @@ -135,9 +135,11 @@ def push_pull(tensor, average=True, name=None, version=0, priority=0, compressio if name == None: raise AssertionError("To manually call push_pull, you must specify a name by name=...") tensor_compressed, ctx = compression.compress(tensor) - summed_tensor_compressed = BytePSPushPull.apply(tensor_compressed, average, name, version, priority) + summed_tensor_compressed = BytePSPushPull.apply( + tensor_compressed, average, name, version, priority) return compression.decompress(summed_tensor_compressed, ctx) + def push_pull_async_inplace(tensor, average=True, name=None, version=0, priority=0): """ A function that performs asynchronous in-place averaging or summation of the input @@ -178,6 +180,7 @@ def push_pull_inplace(tensor, average=True, name=None, version=0, priority=0): handle = push_pull_async_inplace(tensor, average, name, version, priority) return synchronize(handle) + def poll(handle): """ Polls an push_pull handle to determine whether underlying @@ -206,4 +209,4 @@ def synchronize(handle): return c_lib.byteps_torch_wait_and_clear(handle) _, output = _handle_map.pop(handle) - return output \ No newline at end of file + return output diff --git a/byteps/torch/ready_event.cc b/byteps/torch/ready_event.cc index 274ad64c2..57cf606d7 100644 --- a/byteps/torch/ready_event.cc +++ b/byteps/torch/ready_event.cc @@ -22,8 +22,8 @@ #include #endif -#include "ready_event.h" #include "cuda_util.h" +#include "ready_event.h" #if HAVE_CUDA extern THCState* state; @@ -86,11 +86,12 @@ std::shared_ptr RecordReadyEvent(int device) { #if HAVE_CUDA return std::make_shared(device); #else - throw std::logic_error("Internal error. Requested ReadyEvent " - "with GPU device but not compiled with CUDA."); + throw std::logic_error( + "Internal error. 
Requested ReadyEvent " + "with GPU device but not compiled with CUDA."); #endif } } -} // namespace torch -} // namespace byteps \ No newline at end of file +} // namespace torch +} // namespace byteps diff --git a/byteps/torch/ready_event.h b/byteps/torch/ready_event.h index 214675e7f..ea06a1256 100644 --- a/byteps/torch/ready_event.h +++ b/byteps/torch/ready_event.h @@ -32,12 +32,12 @@ using namespace byteps::common; #if HAVE_CUDA class TorchReadyEvent : public ReadyEvent { -public: + public: TorchReadyEvent(int device); ~TorchReadyEvent(); virtual bool Ready() const override; -private: + private: int device_ = CPU_DEVICE_ID; cudaEvent_t cuda_event_ = nullptr; }; @@ -45,7 +45,7 @@ class TorchReadyEvent : public ReadyEvent { std::shared_ptr RecordReadyEvent(int device); -} // namespace torch -} // namespace byteps +} // namespace torch +} // namespace byteps -#endif // BYTEPS_TORCH_READY_EVENT_H \ No newline at end of file +#endif // BYTEPS_TORCH_READY_EVENT_H diff --git a/launcher/launch.py b/launcher/launch.py index 8c909c7c5..9f5a7a551 100644 --- a/launcher/launch.py +++ b/launcher/launch.py @@ -35,7 +35,7 @@ def worker(local_rank, local_size, command): t[i].start() for i in range(local_size): - t[i].join() + t[i].join() else: if "BYTEPS_SERVER_MXNET_PATH" not in os.environ: @@ -43,4 +43,3 @@ def worker(local_rank, local_size, command): os._exit(0) sys.path.insert(0, os.getenv("BYTEPS_SERVER_MXNET_PATH")+"/python") import mxnet - diff --git a/tests/test_mxnet.py b/tests/test_mxnet.py index 0c7aac810..99151314d 100644 --- a/tests/test_mxnet.py +++ b/tests/test_mxnet.py @@ -124,7 +124,7 @@ def test_byteps_broadcast(self): return dtypes = ['int32', 'int64', - 'float32', 'float64'] + 'float32', 'float64'] dims = [1, 2, 3] ctx = self._current_context() count = 0 diff --git a/tests/test_tensorflow_keras.py b/tests/test_tensorflow_keras.py index 8425a77e6..a2ac57fa5 100644 --- a/tests/test_tensorflow_keras.py +++ b/tests/test_tensorflow_keras.py @@ -100,4 +100,4 @@ def test_sparse_as_dense(self): if __name__ == '__main__': keras_test = TfKerasTests() - keras_test.test_train_model() \ No newline at end of file + keras_test.test_train_model()
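
The byteps/torch/ops.py hunks above reformat push_pull, push_pull_async_inplace, poll, and synchronize without changing their signatures. As a quick reference, here is a minimal usage sketch of that Python API; it is not part of the diff and assumes the worker is started through launcher/launch.py with the usual BytePS environment, that init() and rank() are re-exported from the byteps.torch package, and that "example.grad" is an illustrative tensor name.

import torch
import byteps.torch as bps

bps.init()  # assumed package-level initializer; not shown in this diff

# rank() assumed available at package level, as elsewhere in the tree
grad = torch.ones(1024) * bps.rank()

# Synchronous push_pull: sums (and averages, since average=True) the tensor
# across all BytePS workers; the operation is keyed by the mandatory name.
avg = bps.push_pull(grad, average=True, name="example.grad")

# Asynchronous in-place variant: returns a handle that can be polled or waited on.
handle = bps.push_pull_async_inplace(grad, average=True, name="example.grad.async")
while not bps.poll(handle):
    pass  # other work could overlap with communication here
grad = bps.synchronize(handle)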