HDDS-5916. Datanodes stuck in leader election in Kubernetes #3186
```diff
@@ -125,7 +125,7 @@ private void persistContainerDatanodeDetails() {
     File idPath = new File(dataNodeIDPath);
     DatanodeDetails datanodeDetails = this.context.getParent()
         .getDatanodeDetails();
-    if (datanodeDetails != null && !idPath.exists()) {
+    if (datanodeDetails != null) {
       try {
         ContainerUtils.writeDatanodeDetailsTo(datanodeDetails, idPath);
       } catch (IOException ex) {
```

**Contributor:** What's the motivation for dropping this check?

**Contributor (author):** When the datanode is restarted in k8s, its IP changes, so the original info in this file is no longer accurate. Dropping the check makes sure we always update the file with the latest info. And when we are not using k8s, it is not harmful to rewrite this file whenever the node restarts.
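As a side note, here is a minimal, self-contained sketch of the "always rewrite on startup" pattern this hunk adopts. The `IdFileDemo` class, file path, and string payload are hypothetical; the real code writes a `DatanodeDetails` object via `ContainerUtils.writeDatanodeDetailsTo`.

```java
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;

/** Sketch of unconditionally rewriting an ID file on every start. */
public final class IdFileDemo {

  static void rewriteIdFile(Path idFile, String currentAddress)
      throws IOException {
    // Write to a temp file first, then move it into place, so a crash
    // mid-write cannot leave a truncated ID file behind.
    Path tmp = idFile.resolveSibling(idFile.getFileName() + ".tmp");
    Files.writeString(tmp, currentAddress);
    Files.move(tmp, idFile, StandardCopyOption.REPLACE_EXISTING,
        StandardCopyOption.ATOMIC_MOVE);
  }

  public static void main(String[] args) throws IOException {
    Path idFile = Path.of("datanode.id");
    // Rewriting on every start means a changed address (e.g. a rescheduled
    // k8s pod) can never survive a restart in the persisted file.
    rewriteIdFile(idFile, "10.0.3.17");
    System.out.println(Files.readString(idFile));
  }
}
```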
```diff
@@ -26,6 +26,9 @@
 import java.util.stream.Collectors;
 import java.util.stream.Stream;

+import com.google.gson.ExclusionStrategy;
+import com.google.gson.FieldAttributes;
+import org.apache.hadoop.hdds.scm.net.NodeImpl;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Time;
```

```diff
@@ -58,10 +61,29 @@ public class EventQueue implements EventPublisher, AutoCloseable {

   private boolean isRunning = true;

-  private static final Gson TRACING_SERIALIZER = new GsonBuilder().create();
+  private static final Gson TRACING_SERIALIZER = new GsonBuilder()
+      .setExclusionStrategies(new DatanodeDetailsGsonExclusionStrategy())
+      .create();

   private boolean isSilent = false;

+  // The field parent in DatanodeDetails class has the circular reference
+  // which will result in Gson infinite recursive parsing. We need to exclude
+  // this field when generating json string for DatanodeDetails object
+  static class DatanodeDetailsGsonExclusionStrategy
+      implements ExclusionStrategy {
+    @Override
+    public boolean shouldSkipField(FieldAttributes f) {
+      return f.getDeclaringClass() == NodeImpl.class
+          && f.getName().equals("parent");
+    }
+
+    @Override
+    public boolean shouldSkipClass(Class<?> aClass) {
+      return false;
+    }
+  }

   /**
    * Add new handler to the event queue.
    * <p>
```

**Contributor** (on `DatanodeDetailsGsonExclusionStrategy`): This change can be merged as a quick PR rather than waiting on this one.
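To see why the exclusion strategy is needed, here is a small, self-contained Gson example. The `Node` class is a hypothetical stand-in for `NodeImpl`/`DatanodeDetails`; the strategy logic mirrors the patch.

```java
import com.google.gson.ExclusionStrategy;
import com.google.gson.FieldAttributes;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

/** Demonstrates the cycle-breaking exclusion strategy used in the patch. */
public final class CircularGsonDemo {
  /** Hypothetical stand-in for a node type with a parent back-reference. */
  static final class Node {
    String name;
    Node parent; // back-reference: serializing it naively never terminates
    Node(String name, Node parent) {
      this.name = name;
      this.parent = parent;
    }
  }

  public static void main(String[] args) {
    Node root = new Node("root", null);
    Node child = new Node("child", root);
    root.parent = child; // close the cycle: child -> root -> child -> ...

    Gson safe = new GsonBuilder()
        .setExclusionStrategies(new ExclusionStrategy() {
          @Override
          public boolean shouldSkipField(FieldAttributes f) {
            // Cut the back-reference, as the patch does for NodeImpl#parent.
            return f.getDeclaringClass() == Node.class
                && f.getName().equals("parent");
          }

          @Override
          public boolean shouldSkipClass(Class<?> clazz) {
            return false;
          }
        })
        .create();

    // new Gson().toJson(child) would recurse until StackOverflowError;
    // with the exclusion strategy it prints {"name":"child"}.
    System.out.println(safe.toJson(child));
  }
}
```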
```diff
@@ -56,6 +56,7 @@ public NewNodeHandler(PipelineManager pipelineManager,
   public void onMessage(DatanodeDetails datanodeDetails,
       EventPublisher publisher) {
     try {
+      pipelineManager.closeStalePipelines(datanodeDetails);
       serviceManager.notifyEventTriggered(Event.NEW_NODE_HANDLER_TRIGGERED);

       if (datanodeDetails.getPersistedOpState()
```

**Contributor:** Is `closeStalePipelines` necessary here? When SCM processes the register command, it should be able to distinguish a new node from an updated node, and this handler should only be responsible for the new-node case.

**Contributor (author):** Yes, this is necessary. I believe from my testing that if a datanode is dead for a long time, SCM will remove it from the registration list. When the node comes up with a different IP, it first registers with SCM, and SCM treats it as a new node, but the old pipelines with the old IPs may still be there. Another way to achieve this would be to delete the pipelines when SCM removes the dead nodes, but I am not that familiar with that part of the code and may need to take a further look.

**Contributor:** According to your implementation, when the node comes up with a different IP, it registers first with SCM, and the SCM node manager will find it as long as its UUID is unchanged.

**Contributor (author):** Sorry, I misstated my case. You are right: when a datanode is dead for a long time, SCM actually won't remove it from its registration list, so when the node with the same UUID comes up again with a different IP, it falls into the update-address path instead of registering as a new node. However, there is another case: if SCM also restarts, it loses its in-memory node registration map, but it still has all the old pipelines, since pipelines are read from persistent storage. In that case, if the datanode changes its IP and registers with SCM, SCM treats it as a new node instead of a known node with a different IP, so we still need to close all the stale pipelines that carry the old IPs for this datanode. Please let me know if this makes sense. Thanks!

**Contributor:** Makes sense, thanks for the explanation.
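A self-contained sketch of the staleness check the thread describes: a pipeline is stale for a re-registered datanode when it contains the node's UUID under a different address. The `Dn` and `Pipeline` records below are stand-ins, not the Ozone types, and this is not the actual `closeStalePipelines` implementation.

```java
import java.util.List;
import java.util.UUID;

/** Sketch of the stale-pipeline check (Java 16+ records for brevity). */
public final class StalePipelineDemo {
  record Dn(UUID uuid, String ip) {}            // stand-in for DatanodeDetails
  record Pipeline(String id, List<Dn> nodes) {} // stand-in for Pipeline

  /** True if the pipeline references this datanode under an old address. */
  static boolean isStaleFor(Pipeline p, Dn reported) {
    return p.nodes().stream().anyMatch(member ->
        member.uuid().equals(reported.uuid())
            && !member.ip().equals(reported.ip()));
  }

  public static void main(String[] args) {
    UUID id = UUID.randomUUID();
    Dn recorded = new Dn(id, "10.0.0.5");  // address stored in the pipeline
    Dn reported = new Dn(id, "10.0.0.9");  // address after the pod restart
    Pipeline p = new Pipeline("p1", List.of(recorded));
    System.out.println(isStaleFor(p, reported)); // true -> close the pipeline
  }
}
```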
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,69 @@ | ||
| /** | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with this | ||
| * work for additional information regarding copyright ownership. The ASF | ||
| * licenses this file to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * <p> | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * <p> | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
| * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
| * License for the specific language governing permissions and limitations under | ||
| * the License. | ||
| */ | ||
| package org.apache.hadoop.hdds.scm.node; | ||
|
|
||
| import org.apache.hadoop.hdds.protocol.DatanodeDetails; | ||
| import org.apache.hadoop.hdds.scm.ha.SCMService; | ||
| import org.apache.hadoop.hdds.scm.ha.SCMServiceManager; | ||
| import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException; | ||
| import org.apache.hadoop.hdds.scm.pipeline.PipelineManager; | ||
| import org.apache.hadoop.hdds.server.events.EventHandler; | ||
| import org.apache.hadoop.hdds.server.events.EventPublisher; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| /** | ||
| * Handles datanode ip or hostname change event. | ||
| */ | ||
| public class NodeAddressUpdateHandler | ||
| implements EventHandler<DatanodeDetails> { | ||
| private static final Logger LOG = | ||
| LoggerFactory.getLogger(NodeAddressUpdateHandler.class); | ||
|
|
||
| private final PipelineManager pipelineManager; | ||
| private final NodeDecommissionManager decommissionManager; | ||
| private final SCMServiceManager serviceManager; | ||
|
|
||
| public NodeAddressUpdateHandler(PipelineManager pipelineManager, | ||
| NodeDecommissionManager | ||
| decommissionManager, | ||
| SCMServiceManager serviceManager) { | ||
| this.pipelineManager = pipelineManager; | ||
| this.decommissionManager = decommissionManager; | ||
| this.serviceManager = serviceManager; | ||
| } | ||
|
|
||
| @Override | ||
| public void onMessage(DatanodeDetails datanodeDetails, | ||
| EventPublisher publisher) { | ||
| try { | ||
| LOG.info("Closing stale pipelines for datanode: {}", datanodeDetails); | ||
| pipelineManager.closeStalePipelines(datanodeDetails); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is closing pipelines necessary even if using datanode hostname instead of IP? If two or three datanodes of some pipeline are restarted, a new pipeline is created as they register, and the first one or two pipelines are then closed very soon.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The problem is that pipeline is using IP address. The datanode host name config only impact the ratis address. Ideally I think we should use hostname in pipeline. However, pipeline info is kept in persistent, if we we change the code to only support the hostname, then it will be not backward compatible with the pipelines already in the DB. If we decide to support both hostname and IP address, then the logic would be more complex: we should figure out one IP is corresponding to one hostname so that pipeline will not be created duplicately. Use hostname in pipeline is a big change. We need to consider more about it. Therefore, I think it should be another PR if we decide to make the change.
Is this a question or statement?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a statement based on behavior I saw while checking kubernetes test results where 3 datanodes are restarted. See the log above. Pipeline But I guess quick restarts are less realistic in non-test environment, so it may be OK for now.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the above log is before the restarting. After restarting, the right order is that: first, datanode update its IP address in SCM; second, SCM closes all pipelines belonging to this datanode's old IP address; third, SCM create new pipeline(s) for this datanode with the new IP
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, pipeline
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I got what you mean. This is because the restarts are so quick that SCM thinks these 3 datanodes keep alive. So when the first DN re-registers and tries to create the new pipeline, SCM thinks the other two DN are still good, and thus uses the old info (because the other two DNs have not re-registered yet). Same thing happens when the 2nd DN re-registeres, where SCM thinks the 3rd DN is good and uses its old info. For quick restarts, I do not think there is a good way for SCM to recognize these DN's are dead/restarted. Therefore, we rely on the logic of this PR for waiting ALLOCATED pipelines to be OPEN. In such way, the block allocation can still get the correct pipeline eventually. |
||
| serviceManager.notifyEventTriggered(SCMService.Event | ||
| .NODE_ADDRESS_UPDATE_HANDLER_TRIGGERED); | ||
|
|
||
| decommissionManager.continueAdminForNode(datanodeDetails); | ||
| } catch (NodeNotFoundException e) { | ||
| // Should not happen, as the node has just registered to call this event | ||
| // handler. | ||
| LOG.error( | ||
| "NodeNotFound when updating the node Ip or host name to the " + | ||
| "decommissionManager", | ||
| e); | ||
| } | ||
| } | ||
| } | ||
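For context, a hedged sketch of how such a handler would be wired into the `EventQueue` shown earlier. The event name and the surrounding variables (`pipelineManager`, `decommissionManager`, `serviceManager`, `reRegisteredDn`) are assumptions for illustration; the actual registration happens during SCM startup, which is not part of this diff.

```java
// Sketch only: assumed names throughout, not the PR's actual wiring.
EventQueue eventQueue = new EventQueue();

// Hypothetical typed event for "known datanode re-registered with a new
// ip/hostname"; the constant the PR actually adds may be named differently.
TypedEvent<DatanodeDetails> addressUpdateEvent =
    new TypedEvent<>(DatanodeDetails.class, "Node_Address_Update");

eventQueue.addHandler(addressUpdateEvent,
    new NodeAddressUpdateHandler(pipelineManager, decommissionManager,
        serviceManager));

// Per the review thread, the node manager picks the event at registration:
//   unknown UUID            -> NEW_NODE (NewNodeHandler, which now also
//                              closes stale pipelines, covering the
//                              SCM-restart case)
//   known UUID, new address -> the address-update event handled above
eventQueue.fireEvent(addressUpdateEvent, reRegisteredDn);
```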