From 0ed8991c232b73b914f1ed15480f54702d8e4924 Mon Sep 17 00:00:00 2001
From: Carl Yang <ctcyang@ucdavis.edu>
Date: Tue, 7 Aug 2018 23:38:04 +0000
Subject: [PATCH 01/10] add fallback for gpu topology detection using CUDA 9.2

---
 src/kvstore/comm_tree.h    | 16 +++++-----
 src/kvstore/gpu_topology.h | 62 ++++++++++++++++++++++++++++++++++----
 2 files changed, 64 insertions(+), 14 deletions(-)
diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h
index 1ebfcdc8010d..050958e9a63f 100644
--- a/src/kvstore/comm_tree.h
+++ b/src/kvstore/comm_tree.h
@@ -77,9 +77,6 @@ class CommDeviceTree : public CommDevice {
       //  BroadcastRowSparse
       InitMergeBuffer(devs_);
       InitMergeBufferTree();
-      if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) {
-        EnableP2P();
-      }
     }
   }
 
@@ -328,7 +325,7 @@ class CommDeviceTree : public CommDevice {
   }
 
  private:
-  void EnableP2P() {
+  void EnableP2P(std::vector<int>* p2p) {
 #if MXNET_USE_CUDA
     std::vector<int> gpus;
     for (const auto& d : devs_) {
@@ -338,7 +335,8 @@ class CommDeviceTree : public CommDevice {
     }
     int n = static_cast<int>(gpus.size());
     int enabled = 0;
-    std::vector<int> p2p(n*n);
+    p2p->clear();
+    p2p->resize(n*n, 0);
     for (int i = 0; i < n; ++i) {
       cudaSetDevice(gpus[i]);
       for (int j = 0; j < n; j++) {
@@ -348,7 +346,7 @@ class CommDeviceTree : public CommDevice {
           cudaError_t e = cudaDeviceEnablePeerAccess(gpus[j], 0);
           if (e == cudaSuccess || e == cudaErrorPeerAccessAlreadyEnabled) {
             ++enabled;
-            p2p[i*n+j] = 1;
+            (*p2p)[i*n+j] = 1;
           }
         }
       }
@@ -362,7 +360,7 @@ class CommDeviceTree : public CommDevice {
       std::string access(n, '.');
       for (int i = 0; i < n; ++i) {
         for (int j = 0; j < n; ++j) {
-          access[j] = p2p[i*n+j] ? 'v' : '.';
+          access[j] = (*p2p)[i*n+j] ? 'v' : '.';
         }
         LOG(WARNING) << access;
       }
@@ -373,7 +371,9 @@ class CommDeviceTree : public CommDevice {
   void QueryTopology() {
 #if MXNET_USE_CUDA
     std::vector<float> link_matrix(devs_.size()*devs_.size());
-    GetP2PWeight(devs_, &link_matrix);
+    std::vector<int> p2p_matrix(devs_.size()*devs_.size());
+    EnableP2P(&p2p_matrix);
+    GetP2PWeight(devs_, p2p_matrix, &link_matrix);
     if (backtrack_)
       LOG(INFO) << "Using Backtracking to generate trees";
     else
diff --git a/src/kvstore/gpu_topology.h b/src/kvstore/gpu_topology.h
index a8801499c3be..d73d94d647fe 100644
--- a/src/kvstore/gpu_topology.h
+++ b/src/kvstore/gpu_topology.h
@@ -123,6 +123,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
 /**
  * \brief Generate adjacency matrix with row/col numbering from 0, 1, ..., n_gpu
  * \param devs is a vector of GPU contexts
+ * \param p2p_matrix is adjacency matrix of P2P connections where
+ *          0: no P2P connection
+ *          1: P2P connection
  * \param matrix is adjacency matrix of link topology graph
  *        where edge weight represents relative performance of NVIDIA GPUs
  *          0: Self-connection
@@ -131,7 +134,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
  *          3: 2 NVLink connections
  */
 template <typename T>
-inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matrix) {
+inline void GetP2PWeight(const std::vector<Context>& devs,
+                         const std::vector<int>& p2p_matrix,
+                         std::vector<T>* matrix) {
   int num_gpus = devs.size();
   int count    = 0;
   std::vector<int> zero_dev_id(num_gpus, -1);
@@ -161,11 +166,54 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
     }
   }
 
-  // Check that all GPUs have at least 1 NVLink connection
-  int max_value = 0;
-  for (unsigned int i = 0; i < max.size(); ++i) {
-    if (max[i] > max_value)
-      max_value = max[i];
+  // Check that all P2P connections are detected by GetP2PAttribute
+  // If yes, then continue as before
+  // If not, then fall back to using p2p_matrix (from EnableP2P)
+  //
+  // We have observed that with CUDA 9.0 p3.16xlarge:
+  //
+  //   0 2 2 3 3 1 1 1    . v v v v . . .
+  //   2 0 3 2 1 3 1 1    v . v v . v . .
+  //   2 3 0 3 1 1 2 1    v v . v . . v .
+  //   3 2 3 0 1 1 1 2    v v v . . . . v
+  //   3 1 1 1 0 2 2 3    v . . . . v v v
+  //   1 3 1 1 2 0 3 2    . v . . v . v v
+  //   1 1 2 1 2 3 0 3    . . v . v v . v
+  //   1 1 1 2 3 2 3 0    . . . v v v v .
+  //
+  //        matrix           p2p_matrix
+  //
+  // Here, they are correctly detected, because the 2s and 3s correspond to
+  // links that have P2P connections between them. However for CUDA 9.2 p3.16xlarge:
+  //
+  //   0 2 2 1 1 1 1 1    . v v v v . . .
+  //   2 0 1 2 1 1 1 1    v . v v . v . .
+  //   2 1 0 1 1 1 2 1    v v . v . . v .
+  //   1 2 1 0 1 1 1 2    v v v . . . . v
+  //   1 1 1 1 0 2 2 1    v . . . . v v v
+  //   1 1 1 1 2 0 1 2    . v . . v . v v
+  //   1 1 2 1 2 1 0 1    . . v . v v . v
+  //   1 1 1 2 1 2 1 0    . . . v v v v .
+  //
+  //        matrix          p2p_matrix
+  //
+  // The fastest connections (3 - double NVLink) are not recognized as being any
+  // different from (1 - non-P2P PCI-E). This is why we fallback to p2p_matrix.
+  bool matrix_correct = true;
+  for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
+    if (p2p_matrix[i] > 0 && (*matrix)[i] == 1) {
+      matrix_correct = false;
+      break;
+    }
+  }
+
+  if (!matrix_correct) {
+    for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
+      if (p2p_matrix[i] > 0)
+        (*matrix)[i] = 2;
+      else
+        (*matrix)[i] = 1;
+    }
   }
 
   // If all GPUs are connected by NVLink, then we can use NVLink only
@@ -188,6 +236,8 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
       matrix_value = (matrix_value == 1) ? 1./num_gpus : matrix_value;
     }
   }
+  if (kLogTree)
+    PrintMatrix("Weight", *matrix, num_gpus, num_gpus);
 
 #else
   LOG(WARNING) << "GPU required for link topology";

From dad855f74174e5b71705500dbe691c6aa61856cc Mon Sep 17 00:00:00 2001
From: Carl Yang <ctcyang@ucdavis.edu>
Date: Tue, 7 Aug 2018 23:38:04 +0000
Subject: [PATCH 02/10] add fallback for gpu topology detection using CUDA 9.2

---
 src/kvstore/comm_tree.h    | 16 +++++-----
 src/kvstore/gpu_topology.h | 62 ++++++++++++++++++++++++++++++++++----
 2 files changed, 64 insertions(+), 14 deletions(-)

diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h
index b62228cd2885..11d99c021917 100644
--- a/src/kvstore/comm_tree.h
+++ b/src/kvstore/comm_tree.h
@@ -77,9 +77,6 @@ class CommDeviceTree : public CommDevice {
       //  BroadcastRowSparse
       InitMergeBuffer(devs_);
       InitMergeBufferTree();
-      if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) {
-        EnableP2P();
-      }
     }
   }
 
@@ -328,7 +325,7 @@ class CommDeviceTree : public CommDevice {
   }
 
  private:
-  void EnableP2P() {
+  void EnableP2P(std::vector<int>* p2p) {
 #if MXNET_USE_CUDA
     std::vector<int> gpus;
     for (const auto& d : devs_) {
@@ -338,7 +335,8 @@ class CommDeviceTree : public CommDevice {
     }
     int n = static_cast<int>(gpus.size());
     int enabled = 0;
-    std::vector<int> p2p(n*n);
+    p2p->clear();
+    p2p->resize(n*n, 0);
     for (int i = 0; i < n; ++i) {
       mxnet::common::cuda::DeviceStore device_store(gpus[i]);
       for (int j = 0; j < n; j++) {
@@ -348,7 +346,7 @@ class CommDeviceTree : public CommDevice {
           cudaError_t e = cudaDeviceEnablePeerAccess(gpus[j], 0);
           if (e == cudaSuccess || e == cudaErrorPeerAccessAlreadyEnabled) {
             ++enabled;
-            p2p[i*n+j] = 1;
+            (*p2p)[i*n+j] = 1;
           }
         }
       }
@@ -362,7 +360,7 @@ class CommDeviceTree : public CommDevice {
       std::string access(n, '.');
       for (int i = 0; i < n; ++i) {
         for (int j = 0; j < n; ++j) {
-          access[j] = p2p[i*n+j] ? 'v' : '.';
+          access[j] = (*p2p)[i*n+j] ? 'v' : '.';
         }
         LOG(WARNING) << access;
       }
@@ -373,7 +371,9 @@ class CommDeviceTree : public CommDevice {
   void QueryTopology() {
 #if MXNET_USE_CUDA
     std::vector<float> link_matrix(devs_.size()*devs_.size());
-    GetP2PWeight(devs_, &link_matrix);
+    std::vector<int> p2p_matrix(devs_.size()*devs_.size());
+    EnableP2P(&p2p_matrix);
+    GetP2PWeight(devs_, p2p_matrix, &link_matrix);
     if (backtrack_)
       LOG(INFO) << "Using Backtracking to generate trees";
     else
diff --git a/src/kvstore/gpu_topology.h b/src/kvstore/gpu_topology.h
index 92bcf1a5d263..00a2525d0e9c 100644
--- a/src/kvstore/gpu_topology.h
+++ b/src/kvstore/gpu_topology.h
@@ -123,6 +123,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
 /**
  * \brief Generate adjacency matrix with row/col numbering from 0, 1, ..., n_gpu
  * \param devs is a vector of GPU contexts
+ * \param p2p_matrix is adjacency matrix of P2P connections where
+ *          0: no P2P connection
+ *          1: P2P connection
  * \param matrix is adjacency matrix of link topology graph
  *        where edge weight represents relative performance of NVIDIA GPUs
  *          0: Self-connection
@@ -131,7 +134,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
  *          3: 2 NVLink connections
  */
 template <typename T>
-inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matrix) {
+inline void GetP2PWeight(const std::vector<Context>& devs,
+                         const std::vector<int>& p2p_matrix,
+                         std::vector<T>* matrix) {
   int num_gpus = devs.size();
   int count    = 0;
   std::vector<int> zero_dev_id(num_gpus, -1);
@@ -161,11 +166,54 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
     }
   }
 
-  // Check that all GPUs have at least 1 NVLink connection
-  int max_value = 0;
-  for (unsigned int i = 0; i < max.size(); ++i) {
-    if (max[i] > max_value)
-      max_value = max[i];
+  // Check that all P2P connections are detected by GetP2PAttribute
+  // If yes, then continue as before
+  // If not, then fall back to using p2p_matrix (from EnableP2P)
+  //
+  // We have observed that with CUDA 9.0 p3.16xlarge:
+  //
+  //   0 2 2 3 3 1 1 1    . v v v v . . .
+  //   2 0 3 2 1 3 1 1    v . v v . v . .
+  //   2 3 0 3 1 1 2 1    v v . v . . v .
+  //   3 2 3 0 1 1 1 2    v v v . . . . v
+  //   3 1 1 1 0 2 2 3    v . . . . v v v
+  //   1 3 1 1 2 0 3 2    . v . . v . v v
+  //   1 1 2 1 2 3 0 3    . . v . v v . v
+  //   1 1 1 2 3 2 3 0    . . . v v v v .
+  //
+  //        matrix           p2p_matrix
+  //
+  // Here, they are correctly detected, because the 2s and 3s correspond to
+  // links that have P2P connections between them. However for CUDA 9.2 p3.16xlarge:
+  //
+  //   0 2 2 1 1 1 1 1    . v v v v . . .
+  //   2 0 1 2 1 1 1 1    v . v v . v . .
+  //   2 1 0 1 1 1 2 1    v v . v . . v .
+  //   1 2 1 0 1 1 1 2    v v v . . . . v
+  //   1 1 1 1 0 2 2 1    v . . . . v v v
+  //   1 1 1 1 2 0 1 2    . v . . v . v v
+  //   1 1 2 1 2 1 0 1    . . v . v v . v
+  //   1 1 1 2 1 2 1 0    . . . v v v v .
+  //
+  //        matrix          p2p_matrix
+  //
+  // The fastest connections (3 - double NVLink) are not recognized as being any
+  // different from (1 - non-P2P PCI-E). This is why we fallback to p2p_matrix.
+  bool matrix_correct = true;
+  for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
+    if (p2p_matrix[i] > 0 && (*matrix)[i] == 1) {
+      matrix_correct = false;
+      break;
+    }
+  }
+
+  if (!matrix_correct) {
+    for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
+      if (p2p_matrix[i] > 0)
+        (*matrix)[i] = 2;
+      else
+        (*matrix)[i] = 1;
+    }
   }
 
   // If all GPUs are connected by NVLink, then we can use NVLink only
@@ -188,6 +236,8 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
       matrix_value = (matrix_value == 1) ? 1./num_gpus : matrix_value;
     }
   }
+  if (kLogTree)
+    PrintMatrix("Weight", *matrix, num_gpus, num_gpus);
 
 #else
   LOG(WARNING) << "GPU required for link topology";

From ad2baee92acd0bb94413c37127e7c8b14644a623 Mon Sep 17 00:00:00 2001
From: Carl Yang <ctcyang@ucdavis.edu>
Date: Mon, 7 Jan 2019 10:06:16 -0800
Subject: [PATCH 03/10] add log

---
 src/kvstore/gpu_topology.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/kvstore/gpu_topology.h b/src/kvstore/gpu_topology.h
index 00a2525d0e9c..699e30f7ca99 100644
--- a/src/kvstore/gpu_topology.h
+++ b/src/kvstore/gpu_topology.h
@@ -198,6 +198,11 @@ inline void GetP2PWeight(const std::vector<Context>& devs,
   //        matrix          p2p_matrix
   //
   // The fastest connections (3 - double NVLink) are not recognized as being any
+  if (kLogTree) {
+    PrintMatrix("matrix", *matrix, num_gpus, num_gpus);
+    PrintMatrix("p2p_matrix", p2p_matrix, num_gpus, num_gpus);
+  }
+
   // different from (1 - non-P2P PCI-E). This is why we fallback to p2p_matrix.
   bool matrix_correct = true;
   for (unsigned i = 0; i < p2p_matrix.size(); ++i) {

From 497efac539aee94a41b4277389d130c047faae9a Mon Sep 17 00:00:00 2001
From: Carl Yang <ctcyang@ucdavis.edu>
Date: Mon, 7 Jan 2019 10:07:51 -0800
Subject: [PATCH 04/10] update 3rdparty to master

---
 3rdparty/dmlc-core | 2 +-
 3rdparty/mkldnn    | 2 +-
 3rdparty/mshadow   | 2 +-
 3rdparty/tvm       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index 0a0e8addf92e..649be18a8c55 160000
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit 0a0e8addf92e1287fd7a25c6314016b8c0138dee
+Subproject commit 649be18a8c55c48517861d67158a45dec54992ee
diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn
index a7c5f53832ac..0e7ca738866d 160000
--- a/3rdparty/mkldnn
+++ b/3rdparty/mkldnn
@@ -1 +1 @@
-Subproject commit a7c5f53832acabade6e5086e72c960adedb3c38a
+Subproject commit 0e7ca738866d22cc700aa33b8de120b938f910d0
diff --git a/3rdparty/mshadow b/3rdparty/mshadow
index 6dc04f7c729c..463c0dffe3ea 160000
--- a/3rdparty/mshadow
+++ b/3rdparty/mshadow
@@ -1 +1 @@
-Subproject commit 6dc04f7c729cd5c6c6210d5d4d2026a26ce0bfbf
+Subproject commit 463c0dffe3eae8c39caf7989c85b7244823df27e
diff --git a/3rdparty/tvm b/3rdparty/tvm
index 0f053c82a747..426e3bb0a8e8 160000
--- a/3rdparty/tvm
+++ b/3rdparty/tvm
@@ -1 +1 @@
-Subproject commit 0f053c82a747b4dcdf49570ec87c17e0067b7439
+Subproject commit 426e3bb0a8e86bb48a25b950fd8ef965ca5d370b

From df2262e05cc19f1ec7d4bca88bf65c5113f183b2 Mon Sep 17 00:00:00 2001
From: Carl Yang <ctcyang@ucdavis.edu>
Date: Tue, 7 Aug 2018 23:38:04 +0000
Subject: [PATCH 05/10] add fallback for gpu topology detection using CUDA 9.2

---
 src/kvstore/comm_tree.h    | 16 +++++-----
 src/kvstore/gpu_topology.h | 62 ++++++++++++++++++++++++++++++++++----
 2 files changed, 64 insertions(+), 14 deletions(-)

diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h
index b62228cd2885..11d99c021917 100644
--- a/src/kvstore/comm_tree.h
+++ b/src/kvstore/comm_tree.h
@@ -77,9 +77,6 @@ class CommDeviceTree : public CommDevice {
       //  BroadcastRowSparse
       InitMergeBuffer(devs_);
       InitMergeBufferTree();
-      if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) {
-        EnableP2P();
-      }
     }
   }
 
@@ -328,7 +325,7 @@ class CommDeviceTree : public CommDevice {
   }
 
  private:
-  void EnableP2P() {
+  void EnableP2P(std::vector<int>* p2p) {
 #if MXNET_USE_CUDA
     std::vector<int> gpus;
     for (const auto& d : devs_) {
@@ -338,7 +335,8 @@ class CommDeviceTree : public CommDevice {
     }
     int n = static_cast<int>(gpus.size());
     int enabled = 0;
-    std::vector<int> p2p(n*n);
+    p2p->clear();
+    p2p->resize(n*n, 0);
     for (int i = 0; i < n; ++i) {
       mxnet::common::cuda::DeviceStore device_store(gpus[i]);
       for (int j = 0; j < n; j++) {
@@ -348,7 +346,7 @@ class CommDeviceTree : public CommDevice {
           cudaError_t e = cudaDeviceEnablePeerAccess(gpus[j], 0);
           if (e == cudaSuccess || e == cudaErrorPeerAccessAlreadyEnabled) {
             ++enabled;
-            p2p[i*n+j] = 1;
+            (*p2p)[i*n+j] = 1;
           }
         }
       }
@@ -362,7 +360,7 @@ class CommDeviceTree : public CommDevice {
       std::string access(n, '.');
       for (int i = 0; i < n; ++i) {
         for (int j = 0; j < n; ++j) {
-          access[j] = p2p[i*n+j] ? 'v' : '.';
+          access[j] = (*p2p)[i*n+j] ? 'v' : '.';
         }
         LOG(WARNING) << access;
       }
@@ -373,7 +371,9 @@ class CommDeviceTree : public CommDevice {
   void QueryTopology() {
 #if MXNET_USE_CUDA
     std::vector<float> link_matrix(devs_.size()*devs_.size());
-    GetP2PWeight(devs_, &link_matrix);
+    std::vector<int> p2p_matrix(devs_.size()*devs_.size());
+    EnableP2P(&p2p_matrix);
+    GetP2PWeight(devs_, p2p_matrix, &link_matrix);
     if (backtrack_)
       LOG(INFO) << "Using Backtracking to generate trees";
     else
diff --git a/src/kvstore/gpu_topology.h b/src/kvstore/gpu_topology.h
index 92bcf1a5d263..00a2525d0e9c 100644
--- a/src/kvstore/gpu_topology.h
+++ b/src/kvstore/gpu_topology.h
@@ -123,6 +123,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
 /**
  * \brief Generate adjacency matrix with row/col numbering from 0, 1, ..., n_gpu
  * \param devs is a vector of GPU contexts
+ * \param p2p_matrix is adjacency matrix of P2P connections where
+ *          0: no P2P connection
+ *          1: P2P connection
  * \param matrix is adjacency matrix of link topology graph
  *        where edge weight represents relative performance of NVIDIA GPUs
  *          0: Self-connection
@@ -131,7 +134,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
  *          3: 2 NVLink connections
  */
 template <typename T>
-inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matrix) {
+inline void GetP2PWeight(const std::vector<Context>& devs,
+                         const std::vector<int>& p2p_matrix,
+                         std::vector<T>* matrix) {
   int num_gpus = devs.size();
   int count    = 0;
   std::vector<int> zero_dev_id(num_gpus, -1);
@@ -161,11 +166,54 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
     }
   }
 
-  // Check that all GPUs have at least 1 NVLink connection
-  int max_value = 0;
-  for (unsigned int i = 0; i < max.size(); ++i) {
-    if (max[i] > max_value)
-      max_value = max[i];
+  // Check that all P2P connections are detected by GetP2PAttribute
+  // If yes, then continue as before
+  // If not, then fall back to using p2p_matrix (from EnableP2P)
+  //
+  // We have observed that with CUDA 9.0 p3.16xlarge:
+  //
+  //   0 2 2 3 3 1 1 1    . v v v v . . .
+  //   2 0 3 2 1 3 1 1    v . v v . v . .
+  //   2 3 0 3 1 1 2 1    v v . v . . v .
+  //   3 2 3 0 1 1 1 2    v v v . . . . v
+  //   3 1 1 1 0 2 2 3    v . . . . v v v
+  //   1 3 1 1 2 0 3 2    . v . . v . v v
+  //   1 1 2 1 2 3 0 3    . . v . v v . v
+  //   1 1 1 2 3 2 3 0    . . . v v v v .
+  //
+  //        matrix           p2p_matrix
+  //
+  // Here, they are correctly detected, because the 2s and 3s correspond to
+  // links that have P2P connections between them. However for CUDA 9.2 p3.16xlarge:
+  //
+  //   0 2 2 1 1 1 1 1    . v v v v . . .
+  //   2 0 1 2 1 1 1 1    v . v v . v . .
+  //   2 1 0 1 1 1 2 1    v v . v . . v .
+  //   1 2 1 0 1 1 1 2    v v v . . . . v
+  //   1 1 1 1 0 2 2 1    v . . . . v v v
+  //   1 1 1 1 2 0 1 2    . v . . v . v v
+  //   1 1 2 1 2 1 0 1    . . v . v v . v
+  //   1 1 1 2 1 2 1 0    . . . v v v v .
+  //
+  //        matrix          p2p_matrix
+  //
+  // The fastest connections (3 - double NVLink) are not recognized as being any
+  // different from (1 - non-P2P PCI-E). This is why we fallback to p2p_matrix.
+  bool matrix_correct = true;
+  for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
+    if (p2p_matrix[i] > 0 && (*matrix)[i] == 1) {
+      matrix_correct = false;
+      break;
+    }
+  }
+
+  if (!matrix_correct) {
+    for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
+      if (p2p_matrix[i] > 0)
+        (*matrix)[i] = 2;
+      else
+        (*matrix)[i] = 1;
+    }
   }
 
   // If all GPUs are connected by NVLink, then we can use NVLink only
@@ -188,6 +236,8 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
       matrix_value = (matrix_value == 1) ? 1./num_gpus : matrix_value;
     }
   }
+  if (kLogTree)
+    PrintMatrix("Weight", *matrix, num_gpus, num_gpus);
 
 #else
   LOG(WARNING) << "GPU required for link topology";

From 4ba1a83bff6c3cd70ccf43ca6aaba0e4b662af17 Mon Sep 17 00:00:00 2001
From: Carl Yang <ctcyang@ucdavis.edu>
Date: Mon, 7 Jan 2019 10:06:16 -0800
Subject: [PATCH 06/10] add log

---
 src/kvstore/gpu_topology.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/kvstore/gpu_topology.h b/src/kvstore/gpu_topology.h
index 00a2525d0e9c..699e30f7ca99 100644
--- a/src/kvstore/gpu_topology.h
+++ b/src/kvstore/gpu_topology.h
@@ -198,6 +198,11 @@ inline void GetP2PWeight(const std::vector<Context>& devs,
   //        matrix          p2p_matrix
   //
   // The fastest connections (3 - double NVLink) are not recognized as being any
+  if (kLogTree) {
+    PrintMatrix("matrix", *matrix, num_gpus, num_gpus);
+    PrintMatrix("p2p_matrix", p2p_matrix, num_gpus, num_gpus);
+  }
+
   // different from (1 - non-P2P PCI-E). This is why we fallback to p2p_matrix.
   bool matrix_correct = true;
   for (unsigned i = 0; i < p2p_matrix.size(); ++i) {

From 5135309b111e0445e6c22f3adaa07c3306d264da Mon Sep 17 00:00:00 2001
From: Carl Yang <ctcyang@ucdavis.edu>
Date: Mon, 7 Jan 2019 10:07:51 -0800
Subject: [PATCH 07/10] update 3rdparty to master

---
 3rdparty/dmlc-core | 2 +-
 3rdparty/mkldnn    | 2 +-
 3rdparty/mshadow   | 2 +-
 3rdparty/tvm       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index 0a0e8addf92e..649be18a8c55 160000
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit 0a0e8addf92e1287fd7a25c6314016b8c0138dee
+Subproject commit 649be18a8c55c48517861d67158a45dec54992ee
diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn
index a7c5f53832ac..0e7ca738866d 160000
--- a/3rdparty/mkldnn
+++ b/3rdparty/mkldnn
@@ -1 +1 @@
-Subproject commit a7c5f53832acabade6e5086e72c960adedb3c38a
+Subproject commit 0e7ca738866d22cc700aa33b8de120b938f910d0
diff --git a/3rdparty/mshadow b/3rdparty/mshadow
index 6dc04f7c729c..463c0dffe3ea 160000
--- a/3rdparty/mshadow
+++ b/3rdparty/mshadow
@@ -1 +1 @@
-Subproject commit 6dc04f7c729cd5c6c6210d5d4d2026a26ce0bfbf
+Subproject commit 463c0dffe3eae8c39caf7989c85b7244823df27e
diff --git a/3rdparty/tvm b/3rdparty/tvm
index 0f053c82a747..426e3bb0a8e8 160000
--- a/3rdparty/tvm
+++ b/3rdparty/tvm
@@ -1 +1 @@
-Subproject commit 0f053c82a747b4dcdf49570ec87c17e0067b7439
+Subproject commit 426e3bb0a8e86bb48a25b950fd8ef965ca5d370b

From 9ae5723bbe7b29eee9739466211866ec6a576e17 Mon Sep 17 00:00:00 2001
From: Carl Yang <ctcyang@ucdavis.edu>
Date: Fri, 11 Jan 2019 19:53:14 +0000
Subject: [PATCH 08/10] bring 3rdparty packages to upstream/master

---
 3rdparty/dmlc-core | 2 +-
 3rdparty/mkldnn    | 2 +-
 3rdparty/mshadow   | 2 +-
 3rdparty/tvm       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index 649be18a8c55..0a0e8addf92e 160000
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit 649be18a8c55c48517861d67158a45dec54992ee
+Subproject commit 0a0e8addf92e1287fd7a25c6314016b8c0138dee
diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn
index 0e7ca738866d..a7c5f53832ac 160000
--- a/3rdparty/mkldnn
+++ b/3rdparty/mkldnn
@@ -1 +1 @@
-Subproject commit 0e7ca738866d22cc700aa33b8de120b938f910d0
+Subproject commit a7c5f53832acabade6e5086e72c960adedb3c38a
diff --git a/3rdparty/mshadow b/3rdparty/mshadow
index 463c0dffe3ea..6dc04f7c729c 160000
--- a/3rdparty/mshadow
+++ b/3rdparty/mshadow
@@ -1 +1 @@
-Subproject commit 463c0dffe3eae8c39caf7989c85b7244823df27e
+Subproject commit 6dc04f7c729cd5c6c6210d5d4d2026a26ce0bfbf
diff --git a/3rdparty/tvm b/3rdparty/tvm
index 426e3bb0a8e8..0f053c82a747 160000
--- a/3rdparty/tvm
+++ b/3rdparty/tvm
@@ -1 +1 @@
-Subproject commit 426e3bb0a8e86bb48a25b950fd8ef965ca5d370b
+Subproject commit 0f053c82a747b4dcdf49570ec87c17e0067b7439

From 50d6630acddd34ad8181c8ee28b7d94eaef61406 Mon Sep 17 00:00:00 2001
From: Carl Yang <ctcyang@ucdavis.edu>
Date: Fri, 11 Jan 2019 20:17:06 +0000
Subject: [PATCH 09/10] rebase to master

---
 3rdparty/dmlc-core | 2 +-
 3rdparty/mkldnn    | 2 +-
 3rdparty/mshadow   | 2 +-
 3rdparty/tvm       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index 649be18a8c55..0a0e8addf92e 160000
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit 649be18a8c55c48517861d67158a45dec54992ee
+Subproject commit 0a0e8addf92e1287fd7a25c6314016b8c0138dee
diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn
index 0e7ca738866d..a7c5f53832ac 160000
--- a/3rdparty/mkldnn
+++ b/3rdparty/mkldnn
@@ -1 +1 @@
-Subproject commit 0e7ca738866d22cc700aa33b8de120b938f910d0
+Subproject commit a7c5f53832acabade6e5086e72c960adedb3c38a
diff --git a/3rdparty/mshadow b/3rdparty/mshadow
index 463c0dffe3ea..6dc04f7c729c 160000
--- a/3rdparty/mshadow
+++ b/3rdparty/mshadow
@@ -1 +1 @@
-Subproject commit 463c0dffe3eae8c39caf7989c85b7244823df27e
+Subproject commit 6dc04f7c729cd5c6c6210d5d4d2026a26ce0bfbf
diff --git a/3rdparty/tvm b/3rdparty/tvm
index 426e3bb0a8e8..0f053c82a747 160000
--- a/3rdparty/tvm
+++ b/3rdparty/tvm
@@ -1 +1 @@
-Subproject commit 426e3bb0a8e86bb48a25b950fd8ef965ca5d370b
+Subproject commit 0f053c82a747b4dcdf49570ec87c17e0067b7439

From 1ca7aab71dd528e054b63f35e572a9e4fd5e4839 Mon Sep 17 00:00:00 2001
From: Carl Yang <ctcyang@ucdavis.edu>
Date: Fri, 11 Jan 2019 13:21:18 -0800
Subject: [PATCH 10/10] Update gpu_topology.h

---
 src/kvstore/gpu_topology.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/kvstore/gpu_topology.h b/src/kvstore/gpu_topology.h
index 699e30f7ca99..777fb47f9945 100644
--- a/src/kvstore/gpu_topology.h
+++ b/src/kvstore/gpu_topology.h
@@ -213,6 +213,8 @@ inline void GetP2PWeight(const std::vector<Context>& devs,
   }
 
   if (!matrix_correct) {
+    LOG(WARNING) << "cudaDeviceGetP2PAttribute incorrect. "
+                 << "Falling back to cudaDeviceEnablePeerAccess for topology detection";
     for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
       if (p2p_matrix[i] > 0)
         (*matrix)[i] = 2;