apache · arcadiaphy · Mar 31, 2019 · Mar 31, 2019 · Apr 4, 2019 · Apr 4, 2019
diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc
@@ -31,6 +31,7 @@
 #include <utility>
 #include "./threaded_engine.h"
 #include "../common/cuda_utils.h"
+#include "../operator/custom/custom-inl.h"
 
 namespace mxnet {
 namespace engine {
@@ -373,10 +374,12 @@ void ThreadedEngine::DeleteVariable(SyncFn delete_fn,
 }
 
 void ThreadedEngine::WaitForVar(VarHandle var) {
+  using mxnet::op::custom::CustomOperator;
   BulkFlush();
   ThreadedVar* threaded_var = ThreadedVar::CastFromBase(var);
   if (threaded_var->ready_to_read()) {
     ThrowException(threaded_var);
+    CustomOperator::Get()->ThrowException();
     return;
   }
   if (engine_info_) {
@@ -407,6 +410,7 @@ void ThreadedEngine::WaitForVar(VarHandle var) {
   }
 
   ThrowException(threaded_var);
+  CustomOperator::Get()->ThrowException();
 }
 
 void ThreadedEngine::WaitForAll() {

diff --git a/src/operator/custom/custom-inl.h b/src/operator/custom/custom-inl.h
@@ -96,7 +96,14 @@ class CustomOperator {
       bool prev_recording = Imperative::Get()->set_is_recording(recording);
       bool prev_training = Imperative::Get()->set_is_training(training);
 
-      func();
+      try {
+        func();
+      } catch (dmlc::Error& e) {
+        exception_ =
+            std::make_shared<std::exception_ptr>(std::current_exception());
+        ctx.async_on_complete();
+        return;
+      }
 
       Imperative::Get()->set_is_training(prev_training);
       Imperative::Get()->set_is_recording(prev_recording);
@@ -145,6 +152,7 @@ class CustomOperator {
     num_free_threads = 0;
     destructing_ = false;
     naive_engine_ = true;
+    exception_ = nullptr;
     if (std::string("NaiveEngine") != dmlc::GetEnv("MXNET_ENGINE_TYPE", std::string())) {
       naive_engine_ = false;
     }
@@ -162,6 +170,14 @@ class CustomOperator {
     workers_.clear();
   }
 
+  inline void ThrowException() {  // rethrow the stored exception, if one is pending
+    if (exception_ && *exception_) {  // holder is set and wraps a captured exception
+      std::exception_ptr tmp = *exception_;  // copy out before the holder is released
+      exception_ = nullptr;  // clear first so the same error is not rethrown twice
+      std::rethrow_exception(tmp);  // NOTE(review): no lock guards exception_ here; confirm
+    }
+  }
+
  private:
   CustomOperator() {
     this->Start();
@@ -198,6 +214,7 @@ class CustomOperator {
   std::vector<std::thread> workers_;
   std::atomic<uint32_t> num_free_threads;
   std::queue<std::function<void(void)> > q_;
+  std::shared_ptr<std::exception_ptr> exception_;
   bool naive_engine_;
   bool destructing_;
 };

diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
@@ -29,6 +29,7 @@
 from mxnet.test_utils import *
 from mxnet.base import py_str, MXNetError, _as_list
 from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied, assertRaises
+from nose.tools import assert_raises
 import unittest
 import os
 
@@ -5200,29 +5201,29 @@ def create_operator(self, ctx, shapes, dtypes):
 
     # test custom operator fork
     # see https://github.com/apache/incubator-mxnet/issues/14396
-    if not sys.platform.startswith('win'):  # no fork in windows
-        class AdditionOP(mx.operator.CustomOp):
-            def __init__(self):
-                super(AdditionOP, self).__init__()
-            def forward(self, is_train, req, in_data, out_data, aux):
-                out_data[0][:] = in_data[0] + in_data[1]
-            def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
-                in_grad[0][:] = out_grad[0]
-                in_grad[1][:] = out_grad[0]
-
-        @mx.operator.register("AdditionOP")
-        class AdditionOPProp(mx.operator.CustomOpProp):
-            def __init__(self):
-                super(AdditionOPProp, self).__init__()
-            def list_arguments(self):
-                return ['a', 'b']
-            def list_outputs(self):
-                return ['output']
-            def infer_shape(self, in_shape):
-                return in_shape, [in_shape[0]]
-            def create_operator(self, ctx, shapes, dtypes):
-                return AdditionOP()
+    class AdditionOP(mx.operator.CustomOp):  # custom op whose output is the sum of two inputs
+        def __init__(self):
+            super(AdditionOP, self).__init__()
+        def forward(self, is_train, req, in_data, out_data, aux):
+            out_data[0][:] = in_data[0] + in_data[1]  # write a + b into the single output
+        def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
+            in_grad[0][:] = out_grad[0]  # gradient of a sum passes through unchanged
+            in_grad[1][:] = out_grad[0]  # same upstream gradient for the second input
 
+    @mx.operator.register("AdditionOP")  # name later passed as op_type to mx.nd.Custom
+    class AdditionOPProp(mx.operator.CustomOpProp):  # properties for AdditionOP
+        def __init__(self):
+            super(AdditionOPProp, self).__init__()
+        def list_arguments(self):
+            return ['a', 'b']  # two named inputs
+        def list_outputs(self):
+            return ['output']  # one named output
+        def infer_shape(self, in_shape):
+            return in_shape, [in_shape[0]]  # output shape is taken from the first input
+        def create_operator(self, ctx, shapes, dtypes):
+            return AdditionOP()  # ctx, shapes and dtypes are not used
+
+    if not sys.platform.startswith('win'):  # no fork in windows
         def custom_add():
             a = mx.nd.array([1, 2, 3])
             b = mx.nd.array([4, 5, 6])
@@ -5237,6 +5238,18 @@ def custom_add():
         p.join(5)
         assert not p.is_alive(), "deadlock may exist in custom operator"
 
+    # test exception handling
+    # see https://github.com/apache/incubator-mxnet/pull/14575
+    def custom_add_exc():  # helper expected to raise; called through assert_raises
+        a = mx.nd.array([1, 2, 3])
+        b = mx.nd.array([4, 5])  # two elements versus three in `a`
+        # trigger exception by providing unmatched operand shapes
+        c = mx.nd.Custom(a, b, op_type='AdditionOP')
+        c.wait_to_read()  # presumably where the deferred error is raised — confirm
+
+    assert_raises(MXNetError, custom_add_exc)
+
+
 @with_seed()
 def test_psroipooling():
     for num_rois in [1, 2]: