Skip to content

Commit

Permalink
Reference engine from chunk via weak pointer (apache#14591)
Browse files Browse the repository at this point in the history
  • Loading branch information
KellenSunderland authored and haohuw committed Jun 23, 2019
1 parent ba93a2d commit 1b63eb4
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 18 deletions.
22 changes: 15 additions & 7 deletions include/mxnet/ndarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -850,15 +850,20 @@ class NDArray {
mxnet::ShapeVector aux_shapes;
/*! \brief Reference to the storage to ensure proper destruct order */
std::shared_ptr<Storage> storage_ref_;
/*! \brief Reference to the engine to ensure we cleanup without calling a destructed engine */
std::weak_ptr<Engine> engine_ref_;

/*! \brief default cosntructor */

/*! \brief default constructor */
Chunk() : static_data(true), delay_alloc(false),
storage_ref_(Storage::_GetSharedRef()) {}
storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {}

/*! \brief construct a new chunk */
Chunk(mxnet::TShape shape, Context ctx_, bool delay_alloc_, int dtype)
: static_data(false), delay_alloc(true), ctx(ctx_),
storage_ref_(Storage::_GetSharedRef()) {
storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
storage_shape = shape;
if (shape_is_known(storage_shape)) {
shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
Expand All @@ -872,7 +877,8 @@ class NDArray {

Chunk(const TBlob &data, int dev_id)
: static_data(true), delay_alloc(false),
storage_ref_(Storage::_GetSharedRef()) {
storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
CHECK(storage_type == kDefaultStorage);
var = Engine::Get()->NewVariable();
if (data.dev_mask() == cpu::kDevMask) {
Expand All @@ -890,7 +896,8 @@ class NDArray {

Chunk(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype)
: static_data(false), delay_alloc(false),
storage_ref_(Storage::_GetSharedRef()) {
storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
var = Engine::Get()->NewVariable();
ctx = Context::CPUShared(0);
shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
Expand All @@ -906,7 +913,8 @@ class NDArray {
const mxnet::ShapeVector &aux_shapes_)
: static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_),
aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_),
aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()) {
aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
shandle.ctx = ctx;
var = Engine::Get()->NewVariable();
// aux_handles always reflect the correct number of aux data
Expand All @@ -924,7 +932,7 @@ class NDArray {
Chunk(const NDArrayStorageType storage_type_, const TBlob &data,
const std::vector<TBlob> &aux_data, int dev_id)
: static_data(true), delay_alloc(false), storage_type(storage_type_),
storage_ref_(Storage::_GetSharedRef()) {
storage_ref_(Storage::_GetSharedRef()), engine_ref_(Engine::_GetSharedRef()) {
using namespace mshadow;
CHECK_NE(storage_type, kDefaultStorage);
// init var
Expand Down
24 changes: 13 additions & 11 deletions src/ndarray/ndarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -113,20 +113,22 @@ NDArray::Chunk::~Chunk() {
// We want to delete mkldnn memory after deleting the variable.
mem.mem = this->mkl_mem_;
#endif
Engine::Get()->DeleteVariable([mem, skip_free](RunContext s) {
if (skip_free == false) {
if (auto engine = engine_ref_.lock()) {
engine->DeleteVariable([mem, skip_free](RunContext s) {
if (skip_free == false) {
#if MXNET_USE_MKLDNN == 1
if (mem.mem) {
CHECK_LE(mem.mem->GetSize(), mem.h.size);
CHECK_EQ(mem.mem->GetDataHandle(), mem.h.dptr);
}
if (mem.mem) {
CHECK_LE(mem.mem->GetSize(), mem.h.size);
CHECK_EQ(mem.mem->GetDataHandle(), mem.h.dptr);
}
#endif
Storage::Get()->Free(mem.h);
for (const auto& aux : mem.aux_h) {
Storage::Get()->Free(aux);
Storage::Get()->Free(mem.h);
for (const auto &aux : mem.aux_h) {
Storage::Get()->Free(aux);
}
}
}
}, shandle.ctx, var);
}, shandle.ctx, var);
}
}

void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape &shape, int dtype) {
Expand Down
41 changes: 41 additions & 0 deletions tests/cpp/engine/engine_shutdown_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2019 by Contributors
* \file engine_shutdown_test.cc
* \brief Tests engine shutdown for possible crashes
*/
#include <gtest/gtest.h>

#include "../src/engine/engine_impl.h"
#include "../include/test_util.h"

/**
* This test will help ensure we don't crash during engine shutdown.
* The crash happens during a static destructor call, so this test may pass and then cause a test-run process crash.
*/
TEST(EngineShutdown, stop_without_crashing) {
static std::unique_ptr<mxnet::NDArray> ndArray;
{
auto engine = mxnet::Engine::_GetSharedRef();
ndArray = std::make_unique<mxnet::NDArray>(mxnet::Context::CPU());
engine->Stop();
}
}

0 comments on commit 1b63eb4

Please sign in to comment.