diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/DatanodeTable.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/DatanodeTable.java index 94b8b0569978..ec9359fb3725 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/DatanodeTable.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/DatanodeTable.java @@ -90,7 +90,7 @@ public void deleteWithBatch(BatchOperation batch, KEY key) } @Override - public String getName() throws IOException { + public String getName() { return table.getName(); } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/CommandForDatanode.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/CommandForDatanode.java index 0326df66a9f6..cd4c05fc6eea 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/CommandForDatanode.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/CommandForDatanode.java @@ -26,8 +26,7 @@ /** * Command for the datanode with the destination address. */ -public class CommandForDatanode implements - IdentifiableEventPayload { +public final class CommandForDatanode implements IdentifiableEventPayload { private final DatanodeID datanodeId; private final SCMCommand command; @@ -54,4 +53,9 @@ public SCMCommand getCommand() { public long getId() { return command.getId(); } + + @Override + public String toString() { + return "CommandForDatanode{" + datanodeId + ", " + command + '}'; + } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/SCMCommand.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/SCMCommand.java index 6c3406a14357..cc0915aac00b 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/SCMCommand.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/SCMCommand.java @@ -127,4 +127,9 @@ public boolean hasExpired(long currentEpochMs) { public boolean contributesToQueueSize() { return true; } + + @Override + public String toString() { + return getType() + "(id=" + id + ", term=" + term + ')'; + } } diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/Table.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/Table.java index 18f98a08ed63..9552f327e2e8 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/Table.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/Table.java @@ -173,9 +173,8 @@ default VALUE getReadCopy(KEY key) throws IOException { /** * Returns the Name of this Table. * @return - Table Name. - * @throws IOException on failure. */ - String getName() throws IOException; + String getName(); /** * Returns the key count of this Table. Note the result can be inaccurate. diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/NodeEndpoint.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/NodeEndpoint.java index a0bad57d2a51..cb7693054a39 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/NodeEndpoint.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/NodeEndpoint.java @@ -48,12 +48,12 @@ import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.DatanodeID; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState; import org.apache.hadoop.hdds.scm.container.ContainerID; import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; import org.apache.hadoop.hdds.scm.container.placement.metrics.SCMNodeStat; +import org.apache.hadoop.hdds.scm.node.DatanodeInfo; import org.apache.hadoop.hdds.scm.node.NodeStatus; import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException; import org.apache.hadoop.hdds.scm.pipeline.Pipeline; @@ -115,13 +115,10 @@ public Response getDatanodes() { } catch (NodeNotFoundException e) { LOG.warn("Cannot get nodeState for datanode {}", datanode, e); } - final NodeOperationalState nodeOpState = datanode.getPersistedOpState(); - String hostname = datanode.getHostName(); Set pipelineIDs = nodeManager.getPipelines(datanode); List pipelines = new ArrayList<>(); AtomicInteger leaderCount = new AtomicInteger(); AtomicInteger openContainers = new AtomicInteger(); - DatanodeMetadata.Builder builder = DatanodeMetadata.newBuilder(); pipelineIDs.forEach(pipelineID -> { try { @@ -140,39 +137,32 @@ public Response getDatanodes() { openContainers.getAndAdd(openContainerPerPipeline); } catch (PipelineNotFoundException ex) { LOG.warn("Cannot get pipeline {} for datanode {}, pipeline not found", - pipelineID.getId(), hostname, ex); + pipelineID.getId(), datanode, ex); } catch (IOException ioEx) { LOG.warn("Cannot get leader node of pipeline with id {}.", pipelineID.getId(), ioEx); } }); + + final DatanodeMetadata.Builder builder = DatanodeMetadata.newBuilder() + .setOpenContainers(openContainers.get()); + try { builder.setContainers(nodeManager.getContainerCount(datanode)); - builder.setOpenContainers(openContainers.get()); } catch (NodeNotFoundException ex) { - LOG.warn("Cannot get containers, datanode {} not found.", - datanode.getUuid(), ex); + LOG.warn("Failed to getContainerCount for {}", datanode, ex); } - datanodes.add(builder.setHostname(nodeManager.getHostName(datanode)) + datanodes.add(builder.setDatanode(datanode) .setDatanodeStorageReport(storageReport) .setLastHeartbeat(nodeManager.getLastHeartbeat(datanode)) .setState(nodeState) - .setOperationalState(nodeOpState) .setPipelines(pipelines) .setLeaderCount(leaderCount.get()) - .setUuid(datanode.getUuidString()) - .setVersion(nodeManager.getVersion(datanode)) - .setSetupTime(nodeManager.getSetupTime(datanode)) - .setRevision(nodeManager.getRevision(datanode)) - .setLayoutVersion(datanode.getLastKnownLayoutVersion().getMetadataLayoutVersion()) - .setNetworkLocation(datanode.getNetworkLocation()) .build()); }); - DatanodesResponse datanodesResponse = - new DatanodesResponse(datanodes.size(), datanodes); - return Response.ok(datanodesResponse).build(); + return Response.ok(new DatanodesResponse(datanodes)).build(); } /** @@ -209,26 +199,22 @@ public Response removeDatanodes(List uuids) { Preconditions.checkArgument(!uuids.isEmpty(), "Datanode list argument should not be empty"); try { for (String uuid : uuids) { - DatanodeDetails nodeByUuid = nodeManager.getNode(DatanodeID.fromUuidString(uuid)); - try { + final DatanodeInfo nodeByUuid = nodeManager.getNode(DatanodeID.fromUuidString(uuid)); + if (nodeByUuid != null) { + final DatanodeMetadata metadata = DatanodeMetadata.newBuilder() + .setDatanode(nodeByUuid) + .setState(nodeManager.getNodeStatus(nodeByUuid).getHealth()) + .build(); + if (preChecksSuccess(nodeByUuid, failedNodeErrorResponseMap)) { - removedDatanodes.add(DatanodeMetadata.newBuilder() - .setHostname(nodeManager.getHostName(nodeByUuid)) - .setUuid(uuid) - .setState(nodeManager.getNodeStatus(nodeByUuid).getHealth()) - .build()); + removedDatanodes.add(metadata); nodeManager.removeNode(nodeByUuid); LOG.info("Node {} removed successfully !!!", uuid); } else { - failedDatanodes.add(DatanodeMetadata.newBuilder() - .setHostname(nodeManager.getHostName(nodeByUuid)) - .setUuid(uuid) - .setOperationalState(nodeByUuid.getPersistedOpState()) - .setState(nodeManager.getNodeStatus(nodeByUuid).getHealth()) - .build()); + failedDatanodes.add(metadata); } - } catch (NodeNotFoundException nnfe) { - LOG.error("Selected node {} not found : {} ", uuid, nnfe); + } else { + LOG.error("Node not found: {}", uuid); notFoundDatanodes.add(DatanodeMetadata.newBuilder() .setHostname("") .setState(NodeState.DEAD) @@ -236,7 +222,7 @@ public Response removeDatanodes(List uuids) { } } } catch (Exception exp) { - LOG.error("Unexpected Error while removing datanodes : {} ", exp); + LOG.error("Unexpected Error while removing datanodes {}", uuids, exp); throw new WebApplicationException(exp, Response.Status.INTERNAL_SERVER_ERROR); } @@ -250,25 +236,19 @@ public Response removeDatanodes(List uuids) { } if (!notFoundDatanodes.isEmpty()) { - DatanodesResponse notFoundNodesResp = - new DatanodesResponse(notFoundDatanodes.size(), notFoundDatanodes); + final DatanodesResponse notFoundNodesResp = new DatanodesResponse(notFoundDatanodes); removeDataNodesResponseWrapper.getDatanodesResponseMap().put("notFoundDatanodes", notFoundNodesResp); } if (!removedDatanodes.isEmpty()) { - DatanodesResponse removedNodesResp = - new DatanodesResponse(removedDatanodes.size(), removedDatanodes); + final DatanodesResponse removedNodesResp = new DatanodesResponse(removedDatanodes); removeDataNodesResponseWrapper.getDatanodesResponseMap().put("removedDatanodes", removedNodesResp); } return Response.ok(removeDataNodesResponseWrapper).build(); } - private boolean preChecksSuccess(DatanodeDetails nodeByUuid, Map failedNodeErrorResponseMap) - throws NodeNotFoundException { - if (null == nodeByUuid) { - throw new NodeNotFoundException(); - } - NodeStatus nodeStatus = null; + private boolean preChecksSuccess(DatanodeDetails nodeByUuid, Map failedNodeErrorResponseMap) { + final NodeStatus nodeStatus; AtomicBoolean isContainerOrPipeLineOpen = new AtomicBoolean(false); try { nodeStatus = nodeManager.getNodeStatus(nodeByUuid); @@ -300,9 +280,8 @@ private void checkPipelines(DatanodeDetails nodeByUuid, AtomicBoolean isContaine final Pipeline pipeline = pipelineManager.getPipeline(id); if (pipeline.isOpen()) { LOG.warn("Pipeline : {} is still open for datanode: {}, pre-check failed, datanode not eligible " + - "for remove.", id.getId(), nodeByUuid.getUuid()); + "for remove.", id.getId(), nodeByUuid); isContainerOrPipeLineOpen.set(true); - return; } } catch (PipelineNotFoundException pipelineNotFoundException) { LOG.warn("Pipeline {} is not managed by PipelineManager.", id, pipelineNotFoundException); @@ -317,10 +296,8 @@ private void checkContainers(DatanodeDetails nodeByUuid, AtomicBoolean isContain try { final ContainerInfo container = reconContainerManager.getContainer(id); if (container.getState() == HddsProtos.LifeCycleState.OPEN) { - LOG.warn("Container : {} is still open for datanode: {}, pre-check failed, datanode not eligible " + - "for remove.", container.getContainerID(), nodeByUuid.getUuid()); + LOG.warn("Failed to remove datanode {} due to OPEN container: {}", nodeByUuid, container); isContainerOrPipeLineOpen.set(true); - return; } } catch (ContainerNotFoundException cnfe) { LOG.warn("Container {} is not managed by ContainerManager.", diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/DatanodeMetadata.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/DatanodeMetadata.java index 78133f5a8895..5f6b4ef86a31 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/DatanodeMetadata.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/DatanodeMetadata.java @@ -25,6 +25,7 @@ import javax.xml.bind.annotation.XmlElement; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState; +import org.apache.hadoop.hdds.scm.node.DatanodeInfo; /** * Metadata object that represents a Datanode. @@ -214,11 +215,6 @@ public Builder setState(NodeState state) { return this; } - public Builder setOperationalState(NodeOperationalState operationalState) { - this.opState = operationalState; - return this; - } - public Builder setLastHeartbeat(long lastHeartbeat) { this.lastHeartbeat = lastHeartbeat; return this; @@ -255,28 +251,18 @@ public Builder setUuid(String uuid) { return this; } - public Builder setVersion(String version) { - this.version = version; - return this; - } - - public Builder setSetupTime(long setupTime) { - this.setupTime = setupTime; - return this; - } + public Builder setDatanode(DatanodeInfo datanode) { + this.uuid = datanode.getUuidString(); + this.hostname = datanode.getHostName(); + this.networkLocation = datanode.getNetworkLocation(); - public Builder setRevision(String revision) { - this.revision = revision; - return this; - } + this.opState = datanode.getPersistedOpState(); - public Builder setLayoutVersion(int layoutVersion) { - this.layoutVersion = layoutVersion; - return this; - } + this.version = datanode.getVersion(); + this.revision = datanode.getRevision(); + this.layoutVersion = datanode.getLastKnownLayoutVersion().getMetadataLayoutVersion(); - public Builder setNetworkLocation(String networkLocation) { - this.networkLocation = networkLocation; + this.setupTime = datanode.getSetupTime(); return this; } diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconNodeManager.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconNodeManager.java index 119db2cf88c5..e5cdf65c9103 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconNodeManager.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconNodeManager.java @@ -23,12 +23,11 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableSet; import java.io.IOException; -import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.UUID; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.protocol.DatanodeDetails; @@ -81,14 +80,9 @@ public class ReconNodeManager extends SCMNodeManager { * Map that contains mapping between datanodes * and their last heartbeat time. */ - private Map datanodeHeartbeatMap = new HashMap<>(); - private Map inMemDatanodeDetails = new HashMap<>(); + private Map datanodeHeartbeatMap = new HashMap<>(); - private long reconDatanodeOutdatedTime; - private static int reconStaleDatanodeMultiplier = 3; - - private static final DatanodeDetails EMPTY_DATANODE_DETAILS = - DatanodeDetails.newBuilder().setUuid(UUID.randomUUID()).build(); + private final long reconDatanodeOutdatedTime; public ReconNodeManager(OzoneConfiguration conf, SCMStorageConfig scmStorageConfig, @@ -98,6 +92,7 @@ public ReconNodeManager(OzoneConfiguration conf, HDDSLayoutVersionManager scmLayoutVersionManager) { super(conf, scmStorageConfig, eventPublisher, networkTopology, SCMContext.emptyContext(), scmLayoutVersionManager); + final int reconStaleDatanodeMultiplier = 3; this.reconDatanodeOutdatedTime = reconStaleDatanodeMultiplier * HddsServerUtil.getReconHeartbeatInterval(conf); this.nodeDB = nodeDB; @@ -145,7 +140,7 @@ public VersionResponse getVersion(SCMVersionRequestProto versionRequest) { */ public void addNodeToDB(DatanodeDetails datanodeDetails) throws IOException { nodeDB.put(datanodeDetails.getID(), datanodeDetails); - LOG.info("Adding new node {} to Node DB.", datanodeDetails.getUuid()); + LOG.info("Added a new node to db: {}", datanodeDetails); } /** @@ -156,51 +151,7 @@ public void addNodeToDB(DatanodeDetails datanodeDetails) throws IOException { */ @Override public long getLastHeartbeat(DatanodeDetails datanodeDetails) { - return datanodeHeartbeatMap.getOrDefault(datanodeDetails.getUuid(), 0L); - } - - /** - * Returns the hostname of the given node. - * - * @param datanodeDetails DatanodeDetails - * @return hostname - */ - public String getHostName(DatanodeDetails datanodeDetails) { - return inMemDatanodeDetails.getOrDefault(datanodeDetails.getUuid(), - EMPTY_DATANODE_DETAILS).getHostName(); - } - - /** - * Returns the version of the given node. - * - * @param datanodeDetails DatanodeDetails - * @return setTime - */ - public String getVersion(DatanodeDetails datanodeDetails) { - return inMemDatanodeDetails.getOrDefault(datanodeDetails.getUuid(), - EMPTY_DATANODE_DETAILS).getVersion(); - } - - /** - * Returns the setupTime of the given node. - * - * @param datanodeDetails DatanodeDetails - * @return setupTime - */ - public long getSetupTime(DatanodeDetails datanodeDetails) { - return inMemDatanodeDetails.getOrDefault(datanodeDetails.getUuid(), - EMPTY_DATANODE_DETAILS).getSetupTime(); - } - - /** - * Returns the revision of the given node. - * - * @param datanodeDetails DatanodeDetails - * @return revision - */ - public String getRevision(DatanodeDetails datanodeDetails) { - return inMemDatanodeDetails.getOrDefault(datanodeDetails.getUuid(), - EMPTY_DATANODE_DETAILS).getRevision(); + return datanodeHeartbeatMap.getOrDefault(datanodeDetails.getID(), 0L); } @Override @@ -212,7 +163,7 @@ public void onMessage(CommandForDatanode commandForDatanode, } else { LOG.debug("Ignoring unsupported command {} for Datanode {}.", commandForDatanode.getCommand().getType(), - commandForDatanode.getDatanodeId()); + commandForDatanode); } } @@ -225,19 +176,15 @@ public void onMessage(CommandForDatanode commandForDatanode, @Override public List> processHeartbeat(DatanodeDetails datanodeDetails, CommandQueueReportProto queueReport) { - List> cmds = new ArrayList<>(); long currentTime = Time.now(); - if (needUpdate(datanodeDetails, currentTime)) { - cmds.add(new ReregisterCommand()); - LOG.info("Sending ReregisterCommand() for " + - datanodeDetails.getHostName()); - datanodeHeartbeatMap.put(datanodeDetails.getUuid(), Time.now()); - return cmds; + final Long lastHeartbeat = datanodeHeartbeatMap.put(datanodeDetails.getID(), currentTime); + final boolean needUpdate = lastHeartbeat == null + || currentTime - lastHeartbeat >= reconDatanodeOutdatedTime; + if (needUpdate) { + LOG.info("Sending ReregisterCommand() for {}", datanodeDetails); + return Collections.singletonList(new ReregisterCommand()); } - // Update heartbeat map with current time - datanodeHeartbeatMap.put(datanodeDetails.getUuid(), Time.now()); - cmds.addAll(super.processHeartbeat(datanodeDetails, queueReport)); - return cmds.stream() + return super.processHeartbeat(datanodeDetails, queueReport).stream() .filter(c -> ALLOWED_COMMANDS.contains(c.getType())) .collect(toList()); } @@ -266,14 +213,12 @@ public RegisteredCommand register( DatanodeDetails datanodeDetails, NodeReportProto nodeReport, PipelineReportsProto pipelineReportsProto, LayoutVersionProto layoutInfo) { - inMemDatanodeDetails.put(datanodeDetails.getUuid(), datanodeDetails); if (isNodeRegistered(datanodeDetails)) { try { nodeDB.put(datanodeDetails.getID(), datanodeDetails); - LOG.info("Updating nodeDB for " + datanodeDetails.getHostName()); + LOG.info("Updated {} db table for {}", nodeDB.getName(), datanodeDetails); } catch (IOException e) { - LOG.error("Can not update node {} to Node DB.", - datanodeDetails.getUuid()); + LOG.error("Failed to update {} db table for {}", nodeDB.getName(), datanodeDetails, e); } } try { @@ -313,12 +258,6 @@ public void updateNodeOperationalStateFromScm(HddsProtos.Node scmNode, } } - private boolean needUpdate(DatanodeDetails datanodeDetails, - long currentTime) { - return currentTime - getLastHeartbeat(datanodeDetails) >= - reconDatanodeOutdatedTime; - } - public void reinitialize(Table nodeTable) { this.nodeDB = nodeTable; loadExistingNodes(); @@ -349,19 +288,12 @@ public void removeNode(DatanodeDetails datanodeDetails) throws NodeNotFoundExcep try { super.removeNode(datanodeDetails); nodeDB.delete(datanodeDetails.getID()); - } catch (IOException ioException) { - LOG.error("Node {} deletion fails from Node DB.", datanodeDetails.getUuid()); - throw ioException; + } catch (IOException e) { + throw new IOException("Failed to delete from nodeDB for " + datanodeDetails, e); } - datanodeHeartbeatMap.remove(datanodeDetails.getUuid()); - inMemDatanodeDetails.remove(datanodeDetails.getUuid()); + datanodeHeartbeatMap.remove(datanodeDetails.getID()); LOG.info("Removed existing node {} from Node DB and NodeManager data structures in memory ", - datanodeDetails.getUuid()); - } - - @VisibleForTesting - public ReconContext getReconContext() { - return reconContext; + datanodeDetails); } @Override