From d0e559729f87c8e3abf4b3c24183d0523ba13baa Mon Sep 17 00:00:00 2001
From: Yaoyi Chen <29800192+yaoyic@users.noreply.github.com>
Date: Wed, 30 Sep 2020 10:16:49 +0200
Subject: [PATCH 1/3] Synchronize CUDA context before force copying.

---
 platforms/cuda/src/CudaTorchKernels.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/platforms/cuda/src/CudaTorchKernels.cpp b/platforms/cuda/src/CudaTorchKernels.cpp
index c0a891b5..aedd1fe3 100644
--- a/platforms/cuda/src/CudaTorchKernels.cpp
+++ b/platforms/cuda/src/CudaTorchKernels.cpp
@@ -33,6 +33,7 @@
 #include "CudaTorchKernelSources.h"
 #include "openmm/internal/ContextImpl.h"
 #include <map>
+#include <cuda_runtime_api.h>
 using namespace TorchPlugin;
 using namespace OpenMM;
 using namespace std;
@@ -85,6 +86,8 @@ double CudaCalcTorchForceKernel::execute(ContextImpl& context, bool includeForce
         energyTensor.backward();
         // Note: "forceTensor" needs to be cloned due to a shared context (https://github.com/openmm/openmm-torch/issues/13)
         torch::Tensor forceTensor = posTensor.grad().clone();
+        // make sure that all calculations on the PyTorch side are properly finished before changing the CUDA context or starting the `addForcesKernel` of this plugin
+        cudaDeviceSynchronize();
         cu.setAsCurrent();
         void* data;
         if (cu.getUseDoublePrecision()) {

From 7211b1b08965c95527a0803052e9d8168a2a1360 Mon Sep 17 00:00:00 2001
From: Yaoyi Chen <29800192+yaoyic@users.noreply.github.com>
Date: Thu, 1 Oct 2020 12:29:28 +0200
Subject: [PATCH 2/3] Changing synchronizing strategy before retrieving force:
 whole device -> current context.

---
 platforms/cuda/src/CudaTorchKernels.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/platforms/cuda/src/CudaTorchKernels.cpp b/platforms/cuda/src/CudaTorchKernels.cpp
index aedd1fe3..bf680322 100644
--- a/platforms/cuda/src/CudaTorchKernels.cpp
+++ b/platforms/cuda/src/CudaTorchKernels.cpp
@@ -39,6 +39,15 @@
 using namespace TorchPlugin;
 using namespace OpenMM;
 using namespace std;
+// macro for checking the result of a synchronization operation on CUDA
+// copied from `openmm/platforms/cuda/src/CudaParallelKernels.cpp`
+#define CHECK_RESULT(result, prefix) \
+    if (result != CUDA_SUCCESS) { \
+        std::stringstream m; \
+        m<<prefix<<": "<<cu.getErrorString(result); \
+        throw OpenMMException(m.str()); \
+    }
[...]

From [...] Mon Sep 17 00:00:00 2001
From: Yaoyi Chen <29800192+yaoyic@users.noreply.github.com>
Date: Tue, 6 Oct 2020 09:15:45 +0200
Subject: [PATCH 3/3] Adding a synchronization before starting PyTorch
 operations.

---
 platforms/cuda/src/CudaTorchKernels.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/platforms/cuda/src/CudaTorchKernels.cpp b/platforms/cuda/src/CudaTorchKernels.cpp
index bf680322..3b5ed150 100644
--- a/platforms/cuda/src/CudaTorchKernels.cpp
+++ b/platforms/cuda/src/CudaTorchKernels.cpp
@@ -90,6 +90,8 @@ double CudaCalcTorchForceKernel::execute(ContextImpl& context, bool includeForce
     vector<torch::jit::IValue> inputs = {posTensor};
     if (usePeriodic)
         inputs.push_back(boxTensor);
+    // synchronizing the current context before switching to PyTorch
+    CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
     torch::Tensor energyTensor = module.forward(inputs).toTensor();
     if (includeForces) {
         energyTensor.backward();
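
Read together, the three patches bracket the PyTorch work with two waits on the shared CUDA context. The tail of PATCH 2/3 is truncated above, so the final form of the post-backward wait is inferred from that patch's subject line ("whole device -> current context"); assuming the cudaDeviceSynchronize() from PATCH 1/3 became the same CHECK_RESULT(cuCtxSynchronize(), ...) call, execute() ends up ordered like this (an outline, not the actual file):

    CHECK_RESULT(cuCtxSynchronize(), ...);  // fence 1 (PATCH 3/3): OpenMM's kernels are finished
    module.forward(inputs);                 // PyTorch queues its kernels asynchronously
    energyTensor.backward();
    posTensor.grad().clone();
    CHECK_RESULT(cuCtxSynchronize(), ...);  // fence 2 (PATCHes 1/3 + 2/3): PyTorch's kernels are finished
    cu.setAsCurrent();                      // now safe to switch contexts and launch addForcesKernel

Without the fences, kernels queued by OpenMM and by PyTorch into the shared context are not ordered with respect to one another (they may run on different streams), so the force copy could read gradients that are not ready yet.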
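
For reference outside the OpenMM tree, the same error-checked, context-scoped wait can be reproduced with nothing but the CUDA driver API. Below is a minimal sketch, not the plugin's code: checkCu and syncCurrentContext are hypothetical names, and it substitutes the driver API's cuGetErrorString() and std::runtime_error for OpenMM's cu.getErrorString() helper and OpenMMException.

    #include <cuda.h>      // CUDA driver API: cuInit, cuCtxSynchronize, cuGetErrorString
    #include <sstream>
    #include <stdexcept>
    #include <iostream>

    // Stand-in for the CHECK_RESULT macro above. A function evaluates its
    // argument exactly once, so passing a call like cuCtxSynchronize()
    // directly is safe even on the error path.
    static void checkCu(CUresult result, const char* prefix) {
        if (result != CUDA_SUCCESS) {
            const char* msg = nullptr;
            cuGetErrorString(result, &msg);
            std::stringstream m;
            m << prefix << ": " << (msg != nullptr ? msg : "unknown CUDA error");
            throw std::runtime_error(m.str());
        }
    }

    // Waits only for work queued in the *current* CUDA context, the narrower
    // wait the series moves to, whereas cudaDeviceSynchronize() from
    // PATCH 1/3 blocks until the whole device is idle.
    static void syncCurrentContext() {
        checkCu(cuCtxSynchronize(), "Error synchronizing CUDA context");
    }

    int main() {
        checkCu(cuInit(0), "Error initializing CUDA");
        CUdevice device;
        checkCu(cuDeviceGet(&device, 0), "Error getting CUDA device");
        CUcontext context;
        checkCu(cuCtxCreate(&context, 0, device), "Error creating CUDA context");
        // ... queue asynchronous work in this context here ...
        syncCurrentContext();   // returns once everything queued above is done
        checkCu(cuCtxDestroy(context), "Error destroying CUDA context");
        std::cout << "context synchronized cleanly" << std::endl;
        return 0;
    }

It builds against the driver library alone (e.g. g++ sync_demo.cpp -lcuda, include paths permitting; the file name is arbitrary).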