From 7f7d3337a05f2de8c82a5862e74e1bd7def2aff7 Mon Sep 17 00:00:00 2001 From: Siyao Meng Date: Tue, 3 Nov 2020 13:47:28 -0800 Subject: [PATCH] HDDS-4404. Datanode can go OOM when a Recon or SCM Server is very slow in processing reports Change-Id: I94df595074a0b3cff7acdd4e220302d6f7bd49b0 --- .../common/states/endpoint/HeartbeatEndpointTask.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java index da2034d93c2d..911b4b1536c2 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java @@ -147,8 +147,12 @@ public EndpointStateMachine.EndPointStates call() throws Exception { rpcEndpoint.setLastSuccessfulHeartbeat(ZonedDateTime.now()); rpcEndpoint.zeroMissedCount(); } catch (IOException ex) { - // put back the reports which failed to be sent - putBackReports(requestBuilder); + // don't resend reports to recon as it could be down for days + // DN is expected to work fine without recon and not go OOM + if (!rpcEndpoint.isPassive()) { + // put back the reports which failed to be sent + putBackReports(requestBuilder); + } rpcEndpoint.logIfNeeded(ex); } finally {