From 90e6af5d3af9012f1daae37f12c4ac3a15fd1e07 Mon Sep 17 00:00:00 2001 From: liuyuang Date: Thu, 9 Dec 2021 17:04:15 +0800 Subject: [PATCH 1/2] fix overlap hang --- paddle/fluid/distributed/fleet_executor/carrier.h | 2 ++ paddle/fluid/distributed/fleet_executor/compute_interceptor.cc | 3 +++ 2 files changed, 5 insertions(+) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 0c54201c94034..91c78e478beb6 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -75,6 +75,8 @@ class Carrier final { bool IsInit() const; + std::mutex run; + DISABLE_COPY_AND_ASSIGN(Carrier); private: diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 35905125a0a43..98583de84e7ea 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h" +#include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/executor_gc_helper.h" @@ -169,6 +170,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { } void ComputeInterceptor::RunOps() { + Carrier& carrier_instance = Carrier::Instance(); + std::unique_lock lock(carrier_instance.run); VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the " << step_ + 1 << " time."; for (auto op : node_->ops()) { From 76a5b0981a9fc8ba67b149f89c2208d31adc8d7b Mon Sep 17 00:00:00 2001 From: liuyuang Date: Thu, 9 Dec 2021 18:45:22 +0800 Subject: [PATCH 2/2] add annotation --- paddle/fluid/distributed/fleet_executor/carrier.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 91c78e478beb6..f9411aa73fad4 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -75,6 +75,9 @@ class Carrier final { bool IsInit() const; + // NOTE: This mutex will be used in interceptor's RunOps function. + // This mutex is used for avoiding forward ops and backward ops run + // simultaneously, which will lead to a random hang for some sync ops. std::mutex run; DISABLE_COPY_AND_ASSIGN(Carrier);