From 053ffc7f8f7eb085bfefb66749b5938f944844df Mon Sep 17 00:00:00 2001 From: Wang Jiajun Date: Sun, 3 Mar 2019 12:37:11 +0800 Subject: [PATCH] fix memory-related issues to enable ASAN tests (#14223) * fix heap overflow * fix memory leak of optimizer and executer * uncomment memory pool free * run cleanup in engine shutdown phase * make asan tests blocking * fix abort in mxnet shutdown, use forked submodules temporally for tests * trigger CI * change submodule mshadow * change submodule dmlc-core --- 3rdparty/dmlc-core | 2 +- 3rdparty/mshadow | 2 +- ci/docker/runtime_functions.sh | 2 -- cpp-package/example/alexnet.cpp | 1 + cpp-package/example/charRNN.cpp | 10 ++++++++++ cpp-package/example/googlenet.cpp | 1 + cpp-package/example/inception_bn.cpp | 1 + cpp-package/example/lenet.cpp | 1 + cpp-package/example/lenet_with_mxdataiter.cpp | 1 + cpp-package/example/mlp_cpu.cpp | 1 + cpp-package/example/mlp_csv.cpp | 1 + cpp-package/example/mlp_gpu.cpp | 1 + cpp-package/example/resnet.cpp | 1 + cpp-package/example/test_optimizer.cpp | 1 + cpp-package/example/test_score.cpp | 1 + .../image-classification-predict.cc | 4 +++- include/mxnet/ndarray.h | 19 +++++++++++++------ src/common/object_pool.h | 7 +++---- src/engine/threaded_engine.cc | 2 +- src/engine/threaded_engine.h | 3 ++- 20 files changed, 45 insertions(+), 17 deletions(-) diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 0a0e8addf92e..55f3c7bc1d87 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 0a0e8addf92e1287fd7a25c6314016b8c0138dee +Subproject commit 55f3c7bc1d875fbc7d34fc26651bb8c6818c8355 diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 3dc80815d965..95ebe0f109ae 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 3dc80815d965b56b9a975dc27229361955bf66fe +Subproject commit 95ebe0f109ae021d0d66e2a627ccfc55c3253b55 diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 1b4caa073570..de1b7795ce69 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -1033,8 +1033,6 @@ integrationtest_ubuntu_cpu_asan() { set -ex export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.5 - # We do not want to fail the build on ASAN errors until memory leaks have been addressed. - export ASAN_OPTIONS=exitcode=0 cd /work/mxnet/build/cpp-package/example/ /work/mxnet/cpp-package/example/get_data.sh ./mlp_cpu diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp index 1c0f7130d974..e2083a0dfa0a 100644 --- a/cpp-package/example/alexnet.cpp +++ b/cpp-package/example/alexnet.cpp @@ -319,6 +319,7 @@ int main(int argc, char const *argv[]) { } /*don't foget to release the executor*/ delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp index 54b8eea7af39..8951580067a8 100644 --- a/cpp-package/example/charRNN.cpp +++ b/cpp-package/example/charRNN.cpp @@ -500,6 +500,9 @@ void train(const std::string file, int batch_size, int max_epoch, int start_epoc std::string filepath = prefix + "-" + std::to_string(epoch) + ".params"; SaveCheckpoint(filepath, RNN, exe); } + + delete exe; + delete opt; } /*The original example, rnn_cell_demo.py, uses default Xavier as initalizer, which relies on @@ -577,6 +580,9 @@ void trainWithBuiltInRNNOp(const std::string file, int batch_size, int max_epoch std::string filepath = prefix + "-" + std::to_string(epoch) + ".params"; SaveCheckpoint(filepath, RNN, exe); } + + delete exe; + delete opt; } void predict(std::wstring* ptext, int sequence_length, const std::string param_file, @@ -639,6 +645,8 @@ void predict(std::wstring* ptext, int sequence_length, const std::string param_f next = charIndices[n]; ptext->push_back(next); } + + delete exe; } void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std::string param_file, @@ -693,6 +701,8 @@ void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std next = charIndices[n]; ptext->push_back(next); } + + delete exe; } int main(int argc, char** argv) { diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp index c8078afd0af4..26ba51027db6 100644 --- a/cpp-package/example/googlenet.cpp +++ b/cpp-package/example/googlenet.cpp @@ -190,6 +190,7 @@ int main(int argc, char const *argv[]) { } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp index 456e0d913475..2073ebe47fbc 100644 --- a/cpp-package/example/inception_bn.cpp +++ b/cpp-package/example/inception_bn.cpp @@ -232,6 +232,7 @@ int main(int argc, char const *argv[]) { LG << "Validation Accuracy: " << val_acc.Get(); } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp index 83c659c7082b..42594548130a 100644 --- a/cpp-package/example/lenet.cpp +++ b/cpp-package/example/lenet.cpp @@ -173,6 +173,7 @@ class Lenet { << ", accuracy: " << ValAccuracy(batch_size * 10, lenet); } delete exe; + delete opt; } private: diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp index 39550a3e9e65..33110fee3a88 100644 --- a/cpp-package/example/lenet_with_mxdataiter.cpp +++ b/cpp-package/example/lenet_with_mxdataiter.cpp @@ -177,6 +177,7 @@ int main(int argc, char const *argv[]) { } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/mlp_cpu.cpp b/cpp-package/example/mlp_cpu.cpp index 93eaf0538269..5d46d40e421f 100644 --- a/cpp-package/example/mlp_cpu.cpp +++ b/cpp-package/example/mlp_cpu.cpp @@ -139,6 +139,7 @@ int main(int argc, char** argv) { } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/mlp_csv.cpp b/cpp-package/example/mlp_csv.cpp index 43a14c84e6d8..f12b7c17133d 100644 --- a/cpp-package/example/mlp_csv.cpp +++ b/cpp-package/example/mlp_csv.cpp @@ -267,6 +267,7 @@ int main(int argc, char** argv) { } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/mlp_gpu.cpp b/cpp-package/example/mlp_gpu.cpp index 0befde8ae4bc..f6060209a51e 100644 --- a/cpp-package/example/mlp_gpu.cpp +++ b/cpp-package/example/mlp_gpu.cpp @@ -155,6 +155,7 @@ int main(int argc, char** argv) { } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp index 7c9dd4daa55a..7200bd42d2de 100644 --- a/cpp-package/example/resnet.cpp +++ b/cpp-package/example/resnet.cpp @@ -242,6 +242,7 @@ int main(int argc, char const *argv[]) { LG << "Validation Accuracy: " << val_acc.Get(); } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/test_optimizer.cpp b/cpp-package/example/test_optimizer.cpp index 42547ea6c7c9..70190eff5dc6 100644 --- a/cpp-package/example/test_optimizer.cpp +++ b/cpp-package/example/test_optimizer.cpp @@ -30,6 +30,7 @@ int main(int argc, char** argv) { opt = OptimizerRegistry::Find("adam"); int ret = (opt == 0) ? 1 : 0; + delete opt; MXNotifyShutdown(); return ret; } diff --git a/cpp-package/example/test_score.cpp b/cpp-package/example/test_score.cpp index 7e5096abb816..687683f487f8 100644 --- a/cpp-package/example/test_score.cpp +++ b/cpp-package/example/test_score.cpp @@ -156,6 +156,7 @@ int main(int argc, char** argv) { } delete exec; + delete opt; MXNotifyShutdown(); return score >= MIN_SCORE ? 0 : 1; } diff --git a/example/image-classification/predict-cpp/image-classification-predict.cc b/example/image-classification/predict-cpp/image-classification-predict.cc index 2a605b8b2674..3c72589399ba 100644 --- a/example/image-classification/predict-cpp/image-classification-predict.cc +++ b/example/image-classification/predict-cpp/image-classification-predict.cc @@ -76,7 +76,9 @@ class BufferFile { ifs.seekg(0, std::ios::beg); std::cout << file_path.c_str() << " ... " << length_ << " bytes\n"; - buffer_.reset(new char[length_]); + // Buffer as null terminated to be converted to string + buffer_.reset(new char[length_ + 1]); + buffer_[length_] = 0; ifs.read(buffer_.get(), length_); ifs.close(); } diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index feb562aa76fa..c55cb01b4688 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -848,13 +848,17 @@ class NDArray { // The shape of aux data. The default value for the shape depends on the type of storage. // If aux_shapes[i].Size() is zero, aux data i is empty. mxnet::ShapeVector aux_shapes; + /*! \brief Reference to the storage to ensure proper destruct order */ + std::shared_ptr storage_ref_; /*! \brief default cosntructor */ - Chunk() : static_data(true), delay_alloc(false) {} + Chunk() : static_data(true), delay_alloc(false), + storage_ref_(Storage::_GetSharedRef()) {} /*! \brief construct a new chunk */ Chunk(mxnet::TShape shape, Context ctx_, bool delay_alloc_, int dtype) - : static_data(false), delay_alloc(true), ctx(ctx_) { + : static_data(false), delay_alloc(true), ctx(ctx_), + storage_ref_(Storage::_GetSharedRef()) { auto size = shape.Size(); storage_shape = shape; var = Engine::Get()->NewVariable(); @@ -864,7 +868,8 @@ class NDArray { } Chunk(const TBlob &data, int dev_id) - : static_data(true), delay_alloc(false) { + : static_data(true), delay_alloc(false), + storage_ref_(Storage::_GetSharedRef()) { CHECK(storage_type == kDefaultStorage); var = Engine::Get()->NewVariable(); if (data.dev_mask() == cpu::kDevMask) { @@ -881,7 +886,8 @@ class NDArray { } Chunk(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype) - : static_data(false), delay_alloc(false) { + : static_data(false), delay_alloc(false), + storage_ref_(Storage::_GetSharedRef()) { var = Engine::Get()->NewVariable(); ctx = Context::CPUShared(0); shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype); @@ -897,7 +903,7 @@ class NDArray { const mxnet::ShapeVector &aux_shapes_) : static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_), aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_), - aux_shapes(aux_shapes_) { + aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()) { shandle.ctx = ctx; var = Engine::Get()->NewVariable(); // aux_handles always reflect the correct number of aux data @@ -914,7 +920,8 @@ class NDArray { Chunk(const NDArrayStorageType storage_type_, const TBlob &data, const std::vector &aux_data, int dev_id) - : static_data(true), delay_alloc(false), storage_type(storage_type_) { + : static_data(true), delay_alloc(false), storage_type(storage_type_), + storage_ref_(Storage::_GetSharedRef()) { using namespace mshadow; CHECK_NE(storage_type, kDefaultStorage); // init var diff --git a/src/common/object_pool.h b/src/common/object_pool.h index 576ff9aea1cc..362553313b85 100644 --- a/src/common/object_pool.h +++ b/src/common/object_pool.h @@ -132,10 +132,9 @@ struct ObjectPoolAllocatable { template ObjectPool::~ObjectPool() { - // TODO(hotpxl): mind destruction order - // for (auto i : allocated_) { - // free(i); - // } + for (auto i : allocated_) { + free(i); + } } template diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc index 6a6004011db1..b5897a1ca9cd 100644 --- a/src/engine/threaded_engine.cc +++ b/src/engine/threaded_engine.cc @@ -281,7 +281,7 @@ void ThreadedEngine::DeleteOperator(OprHandle op) { this->PushAsync([threaded_opr](RunContext, CallbackOnComplete on_complete) { ThreadedOpr::Delete(threaded_opr); on_complete(); - }, Context::CPU(), {}, deps, FnProperty::kAsync, 0, + }, Context::CPU(), {}, deps, FnProperty::kDeleteVar, 0, "DeleteOperator"); } diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h index 4a2d4196dcd3..ab06ca1b9b47 100644 --- a/src/engine/threaded_engine.h +++ b/src/engine/threaded_engine.h @@ -352,7 +352,8 @@ class ThreadedEngine : public Engine { LOG(INFO) << "ExecuteOprBlock " << opr_block << "shutdown_phase=" << shutdown_phase_; } - if (!shutdown_phase_) { + // still run cleanup in shutdown_phase + if (!shutdown_phase_ || threaded_opr->prop == FnProperty::kDeleteVar) { try { OnStart(threaded_opr); if (debug_info) {