From 764a5df25654ba16bb519652552fc48d8385d2d4 Mon Sep 17 00:00:00 2001 From: Wang Jiajun Date: Sun, 10 Mar 2019 13:33:47 +0800 Subject: [PATCH 1/3] fix engine crash in shutdown phase --- src/engine/threaded_engine.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h index ab06ca1b9b47..5fa36ade63bc 100644 --- a/src/engine/threaded_engine.h +++ b/src/engine/threaded_engine.h @@ -38,6 +38,7 @@ #include #include #include +#include #include "./engine_impl.h" #include "../profiler/profiler.h" #include "./openmp.h" @@ -306,6 +307,8 @@ class ThreadedEngine : public Engine { objpool_varblk_ref_ = common::ObjectPool::_GetSharedRef(); objpool_var_ref_ = common::ObjectPool::_GetSharedRef(); + storage_ref_ = Storage::_GetSharedRef(); + // Get a ref to the profiler so that it doesn't get killed before us profiler::Profiler::Get(&profiler_); } @@ -549,6 +552,12 @@ class ThreadedEngine : public Engine { std::shared_ptr > objpool_varblk_ref_; std::shared_ptr > objpool_var_ref_; + /*! + * \brief Async destruction of some objects relies on storage, + * so hold a reference to prevent it from being destroyed too early + */ + std::shared_ptr storage_ref_; + #if MXNET_USE_CUDA /*!
\brief Number of GPU devices available */ std::atomic device_count_{-1}; From 5a42190815b4debe7302461e7713c0b4e0b0ea8c Mon Sep 17 00:00:00 2001 From: Wang Jiajun Date: Sun, 10 Mar 2019 14:27:07 +0800 Subject: [PATCH 2/3] fix lint --- src/engine/threaded_engine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h index 5fa36ade63bc..640eac4de086 100644 --- a/src/engine/threaded_engine.h +++ b/src/engine/threaded_engine.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +39,6 @@ #include #include #include -#include #include "./engine_impl.h" #include "../profiler/profiler.h" #include "./openmp.h" From 6cb625558af6528a68028bf229ae2532c651880c Mon Sep 17 00:00:00 2001 From: Wang Jiajun Date: Sun, 10 Mar 2019 14:37:50 +0800 Subject: [PATCH 3/3] Revert "Bypass ThreadedEngine in test_operator_gpu.py:test_convolution_multiple_streams. (#14338)" This reverts commit d6eafca2555b58746f51052fdce96a264d02a84a. 
--- tests/python/gpu/test_operator_gpu.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 7d7c2ed71216..c12c94b41f5c 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -547,18 +547,8 @@ def _conv_with_num_streams(seed): @with_seed() def test_convolution_multiple_streams(): - engines = ['NaiveEngine', 'ThreadedEngine', 'ThreadedEnginePerDevice'] - - if os.getenv('MXNET_ENGINE_TYPE') is not None: - engines = [os.getenv('MXNET_ENGINE_TYPE'),] - print("Only running against '%s'" % engines[0], file=sys.stderr, end='') - # Remove this else clause when the ThreadedEngine can handle this test - else: - engines.remove('ThreadedEngine') - print("SKIP: 'ThreadedEngine', only running against %s" % engines, file=sys.stderr, end='') - for num_streams in [1, 2]: - for engine in engines: + for engine in ['NaiveEngine', 'ThreadedEngine', 'ThreadedEnginePerDevice']: print("Starting engine %s with %d streams." % (engine, num_streams), file=sys.stderr) run_in_spawned_process(_conv_with_num_streams, {'MXNET_GPU_WORKER_NSTREAMS' : num_streams, 'MXNET_ENGINE_TYPE' : engine})