From 4b35f36ca613a4803ab361e00e0385ed3cc826af Mon Sep 17 00:00:00 2001
From: Nikoli Dryden
Date: Tue, 22 Jan 2019 09:55:34 -0800
Subject: [PATCH] Add a means to check for hangs in MPIAlState, enabled with
 AL_DEBUG_HANG_CHECK.

---
 src/Al.hpp         |  1 +
 src/CMakeLists.txt |  1 +
 src/mpi_impl.hpp   | 25 +++++++++++++++++++++++++
 src/utils.hpp      | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 73 insertions(+)
 create mode 100644 src/utils.hpp

diff --git a/src/Al.hpp b/src/Al.hpp
index 62610778..cddb935a 100644
--- a/src/Al.hpp
+++ b/src/Al.hpp
@@ -37,6 +37,7 @@
 #include "Al_config.hpp"
 #include "base.hpp"
 #include "tuning_params.hpp"
+#include "utils.hpp"
 
 namespace Al {
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 880105a5..11a717eb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -6,6 +6,7 @@ set_full_path(THIS_DIR_HEADERS
   mempool.hpp
   mpi_impl.hpp
   tuning_params.hpp
+  utils.hpp
   )
 set_full_path(THIS_DIR_CXX_SOURCES
   Al.cpp
diff --git a/src/mpi_impl.hpp b/src/mpi_impl.hpp
index 27cd2a23..5980e27d 100644
--- a/src/mpi_impl.hpp
+++ b/src/mpi_impl.hpp
@@ -497,6 +497,15 @@ class MPIAlState : public AlState {
   int tag;
   /** Requests for send_recv. */
   MPI_Request send_recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
+#ifdef AL_DEBUG_HANG_CHECK
+  /** Whether a possible hang has already been reported (report only once). */
+  bool hang_reported = false;
+  /**
+   * Time (from get_time()) at which the current send/recv was started.
+   * Starts at the maximum double so no hang is reported before a start.
+   */
+  double send_recv_start = std::numeric_limits<double>::max();
+#endif
 
   /** Start a send/recv. */
   void start_send_recv(
@@ -504,11 +513,27 @@ class MPIAlState : public AlState {
     void* recv, int recv_count, int source) {
     MPI_Irecv(recv, recv_count, type, source, tag, comm, &(send_recv_reqs[0]));
     MPI_Isend(send, send_count, type, dest, tag, comm, &(send_recv_reqs[1]));
+#ifdef AL_DEBUG_HANG_CHECK
+    // Record the start time so test_send_recv can detect a possible hang.
+    send_recv_start = get_time();
+#endif
   }
   /** Return true if the outstanding send/recv has completed. */
   bool test_send_recv() {
     int flag;
     MPI_Testall(2, send_recv_reqs, &flag, MPI_STATUSES_IGNORE);
+#ifdef AL_DEBUG_HANG_CHECK
+    // Warn (once) if the send/recv has been outstanding for too long.
+    if (!flag && !hang_reported) {
+      double t = get_time();
+      // Choice of 10 + rank is arbitrary, but seems reasonable.
+      // The + rank part helps ensure printing from ranks isn't interleaved.
+      if (t - send_recv_start > 10.0 + rank) {
+        std::cout << rank << ": Possible send/recv hang detected, tag=" << tag << std::endl;
+        hang_reported = true;
+      }
+    }
+#endif
     return flag;
   }
 };
diff --git a/src/utils.hpp b/src/utils.hpp
new file mode 100644
index 00000000..103befd4
--- /dev/null
+++ b/src/utils.hpp
@@ -0,0 +1,46 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the
+// Lawrence Livermore National Laboratory in collaboration with University of
+// Illinois Urbana-Champaign.
+//
+// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-756777.
+// All rights reserved.
+//
+// This file is part of Aluminum GPU-aware Communication Library. For details, see
+// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <chrono>
+
+namespace Al {
+
+/**
+ * Return time, in seconds (with decimal), since a fixed epoch.
+ *
+ * Uses std::chrono::steady_clock, so the epoch is unspecified and values
+ * are only meaningful when subtracted from one another to get an interval.
+ */
+inline double get_time() {
+  using namespace std::chrono;
+  return duration_cast<duration<double>>(
+    steady_clock::now().time_since_epoch()).count();
+}
+
+} // namespace Al