From 1b63eb4942878dba232331094a916e13bd4869b0 Mon Sep 17 00:00:00 2001 From: Kellen Sunderland Date: Tue, 16 Apr 2019 20:45:52 -0700 Subject: [PATCH] Reference engine from chunk via weak pointer (#14591) --- include/mxnet/ndarray.h | 22 +++++++++---- src/ndarray/ndarray.cc | 24 +++++++------- tests/cpp/engine/engine_shutdown_test.cc | 41 ++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 18 deletions(-) create mode 100644 tests/cpp/engine/engine_shutdown_test.cc diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 13fb42ce521e..05d3fa45683e 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -850,15 +850,20 @@ class NDArray { mxnet::ShapeVector aux_shapes; /*! \brief Reference to the storage to ensure proper destruct order */ std::shared_ptr storage_ref_; + /*! \brief Reference to the engine to ensure we cleanup without calling a destructed engine */ + std::weak_ptr engine_ref_; - /*! \brief default cosntructor */ + + /*! \brief default constructor */ Chunk() : static_data(true), delay_alloc(false), - storage_ref_(Storage::_GetSharedRef()) {} + storage_ref_(Storage::_GetSharedRef()), + engine_ref_(Engine::_GetSharedRef()) {} /*! \brief construct a new chunk */ Chunk(mxnet::TShape shape, Context ctx_, bool delay_alloc_, int dtype) : static_data(false), delay_alloc(true), ctx(ctx_), - storage_ref_(Storage::_GetSharedRef()) { + storage_ref_(Storage::_GetSharedRef()), + engine_ref_(Engine::_GetSharedRef()) { storage_shape = shape; if (shape_is_known(storage_shape)) { shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype); @@ -872,7 +877,8 @@ class NDArray { Chunk(const TBlob &data, int dev_id) : static_data(true), delay_alloc(false), - storage_ref_(Storage::_GetSharedRef()) { + storage_ref_(Storage::_GetSharedRef()), + engine_ref_(Engine::_GetSharedRef()) { CHECK(storage_type == kDefaultStorage); var = Engine::Get()->NewVariable(); if (data.dev_mask() == cpu::kDevMask) { @@ -890,7 +896,8 @@ class NDArray { Chunk(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype) : static_data(false), delay_alloc(false), - storage_ref_(Storage::_GetSharedRef()) { + storage_ref_(Storage::_GetSharedRef()), + engine_ref_(Engine::_GetSharedRef()) { var = Engine::Get()->NewVariable(); ctx = Context::CPUShared(0); shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype); @@ -906,7 +913,8 @@ class NDArray { const mxnet::ShapeVector &aux_shapes_) : static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_), aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_), - aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()) { + aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()), + engine_ref_(Engine::_GetSharedRef()) { shandle.ctx = ctx; var = Engine::Get()->NewVariable(); // aux_handles always reflect the correct number of aux data @@ -924,7 +932,7 @@ class NDArray { Chunk(const NDArrayStorageType storage_type_, const TBlob &data, const std::vector &aux_data, int dev_id) : static_data(true), delay_alloc(false), storage_type(storage_type_), - storage_ref_(Storage::_GetSharedRef()) { + storage_ref_(Storage::_GetSharedRef()), engine_ref_(Engine::_GetSharedRef()) { using namespace mshadow; CHECK_NE(storage_type, kDefaultStorage); // init var diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index f5aac36a48eb..eddfbcff9ce8 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -113,20 +113,22 @@ NDArray::Chunk::~Chunk() { // We want to delete mkldnn memory after deleting the variable. mem.mem = this->mkl_mem_; #endif - Engine::Get()->DeleteVariable([mem, skip_free](RunContext s) { - if (skip_free == false) { + if (auto engine = engine_ref_.lock()) { + engine->DeleteVariable([mem, skip_free](RunContext s) { + if (skip_free == false) { #if MXNET_USE_MKLDNN == 1 - if (mem.mem) { - CHECK_LE(mem.mem->GetSize(), mem.h.size); - CHECK_EQ(mem.mem->GetDataHandle(), mem.h.dptr); - } + if (mem.mem) { + CHECK_LE(mem.mem->GetSize(), mem.h.size); + CHECK_EQ(mem.mem->GetDataHandle(), mem.h.dptr); + } #endif - Storage::Get()->Free(mem.h); - for (const auto& aux : mem.aux_h) { - Storage::Get()->Free(aux); + Storage::Get()->Free(mem.h); + for (const auto &aux : mem.aux_h) { + Storage::Get()->Free(aux); + } } - } - }, shandle.ctx, var); + }, shandle.ctx, var); + } } void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape &shape, int dtype) { diff --git a/tests/cpp/engine/engine_shutdown_test.cc b/tests/cpp/engine/engine_shutdown_test.cc new file mode 100644 index 000000000000..893d08502c3a --- /dev/null +++ b/tests/cpp/engine/engine_shutdown_test.cc @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file engine_shutdown_test.cc + * \brief Tests engine shutdown for possible crashes +*/ +#include + +#include "../src/engine/engine_impl.h" +#include "../include/test_util.h" + +/** + * This test will help ensure we don't crash during engine shutdown. + * The crash happens during a static destructor call, so this test may pass and then cause a test-run process crash. + */ +TEST(EngineShutdown, stop_without_crashing) { + static std::unique_ptr ndArray; + { + auto engine = mxnet::Engine::_GetSharedRef(); + ndArray = std::make_unique(mxnet::Context::CPU()); + engine->Stop(); + } +}