From 764a5df25654ba16bb519652552fc48d8385d2d4 Mon Sep 17 00:00:00 2001
From: Wang Jiajun <wangjiajun.phy@gmail.com>
Date: Sun, 10 Mar 2019 13:33:47 +0800
Subject: [PATCH 1/3] fix engine crash in shutdown phase

---
 src/engine/threaded_engine.h | 9 +++++++++
 1 file changed, 9 insertions(+)
diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h
index ab06ca1b9b47..5fa36ade63bc 100644
--- a/src/engine/threaded_engine.h
+++ b/src/engine/threaded_engine.h
@@ -38,6 +38,7 @@
 #include <mutex>
 #include <string>
 #include <thread>
+#include <mxnet/storage.h>
 #include "./engine_impl.h"
 #include "../profiler/profiler.h"
 #include "./openmp.h"
@@ -306,6 +307,8 @@ class ThreadedEngine : public Engine {
     objpool_varblk_ref_ = common::ObjectPool<VersionedVarBlock>::_GetSharedRef();
     objpool_var_ref_    = common::ObjectPool<ThreadedVar>::_GetSharedRef();
 
+    storage_ref_ = Storage::_GetSharedRef();
+
     // Get a ref to the profiler so that it doesn't get killed before us
     profiler::Profiler::Get(&profiler_);
   }
@@ -549,6 +552,12 @@ class ThreadedEngine : public Engine {
   std::shared_ptr<common::ObjectPool<VersionedVarBlock> > objpool_varblk_ref_;
   std::shared_ptr<common::ObjectPool<ThreadedVar> >       objpool_var_ref_;
 
+  /*!
+   * \brief Asynchronous destruction of some objects relies on Storage;
+   *  hold a shared reference so that it is not destroyed too early.
+   */
+  std::shared_ptr<Storage> storage_ref_;
+
 #if MXNET_USE_CUDA
   /*! \brief Number of GPU devices available */
   std::atomic<int> device_count_{-1};

From 5a42190815b4debe7302461e7713c0b4e0b0ea8c Mon Sep 17 00:00:00 2001
From: Wang Jiajun <wangjiajun.phy@gmail.com>
Date: Sun, 10 Mar 2019 14:27:07 +0800
Subject: [PATCH 2/3] fix lint

---
 src/engine/threaded_engine.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h
index 5fa36ade63bc..640eac4de086 100644
--- a/src/engine/threaded_engine.h
+++ b/src/engine/threaded_engine.h
@@ -30,6 +30,7 @@
 #include <dmlc/base.h>
 #include <dmlc/logging.h>
 #include <dmlc/omp.h>
+#include <mxnet/storage.h>
 #include <vector>
 #include <functional>
 #include <condition_variable>
@@ -38,7 +39,6 @@
 #include <mutex>
 #include <string>
 #include <thread>
-#include <mxnet/storage.h>
 #include "./engine_impl.h"
 #include "../profiler/profiler.h"
 #include "./openmp.h"

From 6cb625558af6528a68028bf229ae2532c651880c Mon Sep 17 00:00:00 2001
From: Wang Jiajun <wangjiajun.phy@gmail.com>
Date: Sun, 10 Mar 2019 14:37:50 +0800
Subject: [PATCH 3/3] Revert "Bypass ThreadedEngine in
 test_operator_gpu.py:test_convolution_multiple_streams. (#14338)"

This reverts commit d6eafca2555b58746f51052fdce96a264d02a84a.
---
 tests/python/gpu/test_operator_gpu.py | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 7d7c2ed71216..c12c94b41f5c 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -547,18 +547,8 @@ def _conv_with_num_streams(seed):
 
 @with_seed()
 def test_convolution_multiple_streams():
-    engines = ['NaiveEngine', 'ThreadedEngine', 'ThreadedEnginePerDevice']
-
-    if os.getenv('MXNET_ENGINE_TYPE') is not None:
-        engines = [os.getenv('MXNET_ENGINE_TYPE'),]
-        print("Only running against '%s'" % engines[0], file=sys.stderr, end='')
-    # Remove this else clause when the ThreadedEngine can handle this test
-    else:
-        engines.remove('ThreadedEngine')
-        print("SKIP: 'ThreadedEngine', only running against %s" % engines, file=sys.stderr, end='')
-
     for num_streams in [1, 2]:
-        for engine in engines:
+        for engine in ['NaiveEngine', 'ThreadedEngine', 'ThreadedEnginePerDevice']:
             print("Starting engine %s with %d streams." % (engine, num_streams), file=sys.stderr)
             run_in_spawned_process(_conv_with_num_streams,
                 {'MXNET_GPU_WORKER_NSTREAMS' : num_streams, 'MXNET_ENGINE_TYPE' : engine})