Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check for hanging in MPIAlState #33

Merged
merged 1 commit into from
Jan 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Al.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "Al_config.hpp"
#include "base.hpp"
#include "tuning_params.hpp"
#include "utils.hpp"

namespace Al {

Expand Down
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set_full_path(THIS_DIR_HEADERS
mempool.hpp
mpi_impl.hpp
tuning_params.hpp
utils.hpp
)
set_full_path(THIS_DIR_CXX_SOURCES
Al.cpp
Expand Down
18 changes: 18 additions & 0 deletions src/mpi_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -497,18 +497,36 @@ class MPIAlState : public AlState {
int tag;
/** Requests for send_recv. */
MPI_Request send_recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
#ifdef AL_DEBUG_HANG_CHECK
/**
 * Whether a possible send/recv hang has already been reported.
 * Set to true by test_send_recv after it prints its diagnostic, which
 * suppresses further reports.
 */
bool hang_reported = false;
/**
 * Time (as returned by get_time()) at which start_send_recv last started a
 * send/recv. Initialized to the largest double so that subtracting it from
 * any current time gives a negative elapsed time until a send/recv has
 * actually been started.
 */
double send_recv_start = std::numeric_limits<double>::max();
#endif

/** Start a send/recv. */
void start_send_recv(
const void* send, int send_count, int dest,
void* recv, int recv_count, int source) {
MPI_Irecv(recv, recv_count, type, source, tag, comm, &(send_recv_reqs[0]));
MPI_Isend(send, send_count, type, dest, tag, comm, &(send_recv_reqs[1]));
#ifdef AL_DEBUG_HANG_CHECK
send_recv_start = get_time();
#endif
}
/**
 * Return true if the outstanding send/recv has completed.
 *
 * Tests both requests in send_recv_reqs with a single MPI_Testall call.
 *
 * When built with AL_DEBUG_HANG_CHECK, additionally prints a one-time
 * diagnostic to std::cout if the requests are still incomplete more than
 * (10 + rank) seconds after start_send_recv recorded send_recv_start.
 *
 * @return true once both the send and the receive have completed.
 */
bool test_send_recv() {
  // Initialized so the returned value is never indeterminate, even if
  // MPI_Testall were to return without writing the flag.
  int flag = 0;
  MPI_Testall(2, send_recv_reqs, &flag, MPI_STATUSES_IGNORE);
#ifdef AL_DEBUG_HANG_CHECK
  if (!flag && !hang_reported) {
    const double now = get_time();
    // Choice of 10 + rank is arbitrary, but seems reasonable.
    // The + rank part helps ensure printing from ranks isn't interleaved.
    if (now - send_recv_start > 10.0 + rank) {
      // std::endl (flush) is intentional: if the process really is hung,
      // buffered output might otherwise never be seen.
      std::cout << rank << ": Possible send/recv hang detected, tag=" << tag << std::endl;
      hang_reported = true;
    }
  }
#endif
  return flag;
}
};
Expand Down
41 changes: 41 additions & 0 deletions src/utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the
// Lawrence Livermore National Laboratory in collaboration with University of
// Illinois Urbana-Champaign.
//
// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
// the CONTRIBUTORS file. <[email protected]>
//
// LLNL-CODE-756777.
// All rights reserved.
//
// This file is part of Aluminum GPU-aware Communication Library. For details, see
// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you
// may not use this file except in compliance with the License. You may
// obtain a copy of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the license.
////////////////////////////////////////////////////////////////////////////////

#pragma once

#include <chrono>

namespace Al {

/**
 * Return the current time in (fractional) seconds since the epoch of
 * std::chrono::steady_clock.
 *
 * The clock is monotonic, so differences between two returned values measure
 * elapsed time; the absolute value is not a wall-clock time.
 */
inline double get_time() {
  using fractional_seconds = std::chrono::duration<double>;
  const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
  return std::chrono::duration_cast<fractional_seconds>(since_epoch).count();
}

} // namespace Al