diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAMetrics.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAMetrics.java new file mode 100644 index 00000000000..2b18babc7b2 --- /dev/null +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAMetrics.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.ozone.om; + +import org.apache.hadoop.ozone.om.ha.OMHAMetrics; +import org.apache.ozone.test.GenericTestUtils; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_RATIS_SERVER_FAILURE_TIMEOUT_DURATION_DEFAULT; + +/** + * Test Ozone Manager HA Metrics. + */ +public class TestOzoneManagerHAMetrics extends TestOzoneManagerHA { + + @Test + public void testOMHAMetrics() throws Exception { + waitForLeaderToBeReady(); + + // Get leader OM + OzoneManager leaderOM = getCluster().getOMLeader(); + // Store current leader's node ID, + // to use it after restarting the OM + String leaderOMId = leaderOM.getOMNodeId(); + // Get a list of all OMs + List omList = getCluster().getOzoneManagersList(); + // Check metrics for all OMs + checkOMHAMetricsForAllOMs(omList, leaderOMId); + + // Restart leader OM + getCluster().shutdownOzoneManager(leaderOM); + getCluster().restartOzoneManager(leaderOM, true); + waitForLeaderToBeReady(); + + // Get the new leader + OzoneManager newLeaderOM = getCluster().getOMLeader(); + String newLeaderOMId = newLeaderOM.getOMNodeId(); + // Get a list of all OMs again + omList = getCluster().getOzoneManagersList(); + // New state for the old leader + int newState = leaderOMId.equals(newLeaderOMId) ? 1 : 0; + + // Get old leader + OzoneManager oldLeader = getCluster().getOzoneManager(leaderOMId); + // Get old leader's metrics + OMHAMetrics omhaMetrics = oldLeader.getOmhaMetrics(); + + Assertions.assertEquals(newState, + omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); + + // Check that metrics for all OMs have been updated + checkOMHAMetricsForAllOMs(omList, newLeaderOMId); + } + + private void checkOMHAMetricsForAllOMs(List omList, + String leaderOMId) { + for (OzoneManager om : omList) { + // Get OMHAMetrics for the current OM + OMHAMetrics omhaMetrics = om.getOmhaMetrics(); + String nodeId = om.getOMNodeId(); + + // If current OM is leader, state should be 1 + int expectedState = nodeId + .equals(leaderOMId) ? 1 : 0; + Assertions.assertEquals(expectedState, + omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); + + Assertions.assertEquals(nodeId, omhaMetrics.getOmhaInfoNodeId()); + } + } + + + /** + * After restarting OMs we need to wait + * for a leader to be elected and ready. + */ + private void waitForLeaderToBeReady() + throws InterruptedException, TimeoutException { + // Wait for Leader Election timeout + int timeout = OZONE_OM_RATIS_SERVER_FAILURE_TIMEOUT_DURATION_DEFAULT + .toIntExact(TimeUnit.MILLISECONDS); + GenericTestUtils.waitFor(() -> + getCluster().getOMLeader() != null, 500, timeout); + } +} diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java index 6345eb18156..f39ffc73411 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java @@ -28,7 +28,6 @@ import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.ozone.om.ha.HadoopRpcOMFailoverProxyProvider; -import org.apache.hadoop.ozone.om.ha.OMHAMetrics; import org.apache.hadoop.ozone.om.helpers.OmMultipartInfo; import org.apache.hadoop.ozone.om.helpers.OmMultipartUploadCompleteInfo; import org.apache.ozone.test.GenericTestUtils; @@ -42,7 +41,6 @@ import java.util.List; import java.util.Map; import java.util.UUID; -import java.util.concurrent.TimeoutException; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.hadoop.ozone.MiniOzoneHAClusterImpl.NODE_FAILURE_TIMEOUT; @@ -106,85 +104,6 @@ public void testMultipartUpload() throws Exception { testMultipartUploadWithOneOmNodeDown(); } - @Test - @Flaky("HDDS-8035") - public void testOMHAMetrics() throws InterruptedException, - TimeoutException, IOException { - waitForLeaderToBeReady(); - - // Get leader OM - OzoneManager leaderOM = getCluster().getOMLeader(); - // Store current leader's node ID, - // to use it after restarting the OM - String leaderOMId = leaderOM.getOMNodeId(); - // Get a list of all OMs - List omList = getCluster().getOzoneManagersList(); - - // Check metrics for all OMs - checkOMHAMetricsForAllOMs(omList, leaderOMId); - - // Restart current leader OM - leaderOM.stop(); - leaderOM.restart(); - - waitForLeaderToBeReady(); - - // Get the new leader - OzoneManager newLeaderOM = getCluster().getOMLeader(); - String newLeaderOMId = newLeaderOM.getOMNodeId(); - // Get a list of all OMs again - omList = getCluster().getOzoneManagersList(); - - // New state for the old leader - int newState = leaderOMId.equals(newLeaderOMId) ? 1 : 0; - - // Get old leader - OzoneManager oldLeader = getCluster().getOzoneManager(leaderOMId); - // Get old leader's metrics - OMHAMetrics omhaMetrics = oldLeader.getOmhaMetrics(); - - Assertions.assertEquals(newState, - omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); - - // Check that metrics for all OMs have been updated - checkOMHAMetricsForAllOMs(omList, newLeaderOMId); - } - - private void checkOMHAMetricsForAllOMs(List omList, - String leaderOMId) { - for (OzoneManager om : omList) { - // Get OMHAMetrics for the current OM - OMHAMetrics omhaMetrics = om.getOmhaMetrics(); - String nodeId = om.getOMNodeId(); - - // If current OM is leader, state should be 1 - int expectedState = nodeId - .equals(leaderOMId) ? 1 : 0; - - Assertions.assertEquals(expectedState, - omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); - - Assertions.assertEquals(nodeId, omhaMetrics.getOmhaInfoNodeId()); - } - } - - - /** - * Some tests are stopping or restarting OMs. - * There are test cases where we might need to - * wait for a leader to be elected and ready. - */ - private void waitForLeaderToBeReady() - throws InterruptedException, TimeoutException { - GenericTestUtils.waitFor(() -> { - try { - return getCluster().getOMLeader().isLeaderReady(); - } catch (Exception e) { - return false; - } - }, 1000, 80000); - } - @Test public void testFileOperationsAndDelete() throws Exception { testFileOperationsWithRecursive(); @@ -528,7 +447,7 @@ public void testOMRestart() throws Exception { followerOM1.stop(); // Do more transactions. Stopped OM should miss these transactions and - // the logs corresponding to atleast some of the missed transactions + // the logs corresponding to at least some missed transactions // should be purged. This will force the OM to install snapshot when // restarted. long minNewTxIndex = followerOM1LastAppliedIndex + getLogPurgeGap() * 10L; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 4de1120a44c..010ad1606ef 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -1909,17 +1909,6 @@ public void updatePeerList(List newPeers) { } } } - RaftPeer leader = omRatisServer.getLeader(); - if (Objects.nonNull(leader)) { - // If we have any leader information, its id cannot be null. - String leaderId = leader.getId().toString(); - omHAMetricsInit(leaderId); - } else { - LOG.error("OzoneManagerRatisServer leader is null, " + - "unregistering OMHAMetrics."); - // Unregister, to get rid of stale metrics - OMHAMetrics.unRegister(); - } } /** @@ -2218,6 +2207,10 @@ public void stop() { if (certClient != null) { certClient.close(); } + + if (omhaMetrics != null) { + OMHAMetrics.unRegister(); + } } catch (Exception e) { LOG.error("OzoneManager stop failed.", e); } @@ -2941,9 +2934,9 @@ public String getRatisRoles() { /** * Create OMHAMetrics instance. */ - private void omHAMetricsInit(String leaderId) { + public void omHAMetricsInit(String leaderId) { // unregister, in case metrics already exist - // so that the metric tags will get updated. + // so that the metrics will get updated. OMHAMetrics.unRegister(); omhaMetrics = OMHAMetrics .create(getOMNodeId(), leaderId); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java index 7bcc6706a99..a7775f5bc00 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java @@ -55,6 +55,7 @@ import org.apache.ratis.protocol.Message; import org.apache.ratis.protocol.RaftClientRequest; import org.apache.ratis.protocol.RaftGroupId; +import org.apache.ratis.protocol.RaftGroupMemberId; import org.apache.ratis.protocol.RaftPeerId; import org.apache.ratis.protocol.exceptions.StateMachineException; import org.apache.ratis.server.RaftServer; @@ -156,6 +157,13 @@ public SnapshotInfo getLatestSnapshot() { return snapshotInfo; } + @Override + public void notifyLeaderChanged(RaftGroupMemberId groupMemberId, + RaftPeerId newLeaderId) { + // Initialize OMHAMetrics + ozoneManager.omHAMetricsInit(newLeaderId.toString()); + } + /** * Called to notify state machine about indexes which are processed * internally by Raft Server, this currently happens when conf entries are