Skip to content

Commit

Permalink
Add a means to check for hangs in MPIAlState, enabled with AL_DEBUG_H…
Browse files Browse the repository at this point in the history
…ANG_CHECK.
  • Loading branch information
ndryden committed Jan 22, 2019
1 parent 288a3d0 commit 4b35f36
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/Al.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "Al_config.hpp"
#include "base.hpp"
#include "tuning_params.hpp"
#include "utils.hpp"

namespace Al {

Expand Down
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set_full_path(THIS_DIR_HEADERS
mempool.hpp
mpi_impl.hpp
tuning_params.hpp
utils.hpp
)
set_full_path(THIS_DIR_CXX_SOURCES
Al.cpp
Expand Down
18 changes: 18 additions & 0 deletions src/mpi_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -497,18 +497,36 @@ class MPIAlState : public AlState {
int tag;
/** Requests for send_recv: [0] is the receive, [1] is the send. */
MPI_Request send_recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
#ifdef AL_DEBUG_HANG_CHECK
/** Set once test_send_recv has printed a possible-hang message. */
bool hang_reported = false;
/**
 * Time, as returned by get_time(), recorded by start_send_recv.
 * Initialized to the largest double, so "now minus start" is negative
 * until a send/recv has actually been started.
 */
double send_recv_start = std::numeric_limits<double>::max();
#endif

/** Start a send/recv. */
void start_send_recv(
const void* send, int send_count, int dest,
void* recv, int recv_count, int source) {
MPI_Irecv(recv, recv_count, type, source, tag, comm, &(send_recv_reqs[0]));
MPI_Isend(send, send_count, type, dest, tag, comm, &(send_recv_reqs[1]));
#ifdef AL_DEBUG_HANG_CHECK
send_recv_start = get_time();
#endif
}
/**
 * Return true if the outstanding send/recv has completed.
 *
 * Tests both requests in send_recv_reqs with MPI_Testall; does not block.
 * When AL_DEBUG_HANG_CHECK is defined and the requests are still pending,
 * a warning is printed (at most once, guarded by hang_reported) after more
 * than 10 + rank seconds have elapsed since send_recv_start.
 */
bool test_send_recv() {
  int flag = 0;
  MPI_Testall(2, send_recv_reqs, &flag, MPI_STATUSES_IGNORE);
#ifdef AL_DEBUG_HANG_CHECK
  if (!flag && !hang_reported) {
    // Elapsed seconds since start_send_recv recorded send_recv_start.
    const double elapsed = get_time() - send_recv_start;
    // Choice of 10 + rank is arbitrary, but seems reasonable.
    // The + rank part helps ensure printing from ranks isn't interleaved.
    if (elapsed > 10.0 + rank) {
      std::cout << rank << ": Possible send/recv hang detected, tag=" << tag << std::endl;
      hang_reported = true;
    }
  }
#endif
  return flag != 0;
}
};
Expand Down
41 changes: 41 additions & 0 deletions src/utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the
// Lawrence Livermore National Laboratory in collaboration with University of
// Illinois Urbana-Champaign.
//
// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
// the CONTRIBUTORS file. <[email protected]>
//
// LLNL-CODE-756777.
// All rights reserved.
//
// This file is part of Aluminum GPU-aware Communication Library. For details, see
// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you
// may not use this file except in compliance with the License. You may
// obtain a copy of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the license.
////////////////////////////////////////////////////////////////////////////////

#pragma once

#include <chrono>

namespace Al {

/**
 * Return the current std::chrono::steady_clock reading as fractional seconds
 * since that clock's epoch.
 */
inline double get_time() {
  const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
  // Converting to a double-based duration yields seconds with a fractional part.
  const std::chrono::duration<double> seconds(since_epoch);
  return seconds.count();
}

} // namespace Al

0 comments on commit 4b35f36

Please sign in to comment.