diff --git a/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-compose.yaml b/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-compose.yaml index e9ec65935c1b..7332d7c7f33f 100644 --- a/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-compose.yaml +++ b/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-compose.yaml @@ -45,7 +45,7 @@ services: ENSURE_OM_INITIALIZED: /data/metadata/om/current/VERSION <<: *replication ports: - - 9874 + - 9874:9874 - 9862 hostname: om1 command: ["ozone","om"] @@ -74,7 +74,7 @@ services: scm1: <<: *common-config ports: - - 9876 + - 9876:9876 environment: ENSURE_SCM_INITIALIZED: /data/metadata/scm/current/VERSION OZONE-SITE.XML_hdds.scm.safemode.min.datanode: ${OZONE_SAFEMODE_MIN_DATANODES:-1} diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java index fbfe6d36e3e5..f51872e1810e 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java @@ -28,11 +28,12 @@ import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.ozone.om.ha.HadoopRpcOMFailoverProxyProvider; +import org.apache.hadoop.ozone.om.ha.OMHAMetrics; import org.apache.hadoop.ozone.om.helpers.OmMultipartInfo; import org.apache.hadoop.ozone.om.helpers.OmMultipartUploadCompleteInfo; import org.apache.ozone.test.GenericTestUtils; import org.apache.ozone.test.tag.Flaky; -import org.junit.Assert; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.io.IOException; @@ -41,6 +42,7 @@ import java.util.List; import java.util.Map; import java.util.UUID; +import java.util.concurrent.TimeoutException; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.hadoop.ozone.MiniOzoneHAClusterImpl.NODE_FAILURE_TIMEOUT; @@ -104,6 +106,84 @@ public void testMultipartUpload() throws Exception { testMultipartUploadWithOneOmNodeDown(); } + @Test + public void testOMHAMetrics() throws InterruptedException, + TimeoutException, IOException { + waitForLeaderToBeReady(); + + // Get leader OM + OzoneManager leaderOM = getCluster().getOMLeader(); + // Store current leader's node ID, + // to use it after restarting the OM + String leaderOMId = leaderOM.getOMNodeId(); + // Get a list of all OMs + List omList = getCluster().getOzoneManagersList(); + + // Check metrics for all OMs + checkOMHAMetricsForAllOMs(omList, leaderOMId); + + // Restart current leader OM + leaderOM.stop(); + leaderOM.restart(); + + waitForLeaderToBeReady(); + + // Get the new leader + OzoneManager newLeaderOM = getCluster().getOMLeader(); + String newLeaderOMId = newLeaderOM.getOMNodeId(); + // Get a list of all OMs again + omList = getCluster().getOzoneManagersList(); + + // New state for the old leader + int newState = leaderOMId.equals(newLeaderOMId) ? 1 : 0; + + // Get old leader + OzoneManager oldLeader = getCluster().getOzoneManager(leaderOMId); + // Get old leader's metrics + OMHAMetrics omhaMetrics = oldLeader.getOmhaMetrics(); + + Assertions.assertEquals(newState, + omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); + + // Check that metrics for all OMs have been updated + checkOMHAMetricsForAllOMs(omList, newLeaderOMId); + } + + private void checkOMHAMetricsForAllOMs(List omList, + String leaderOMId) { + for (OzoneManager om : omList) { + // Get OMHAMetrics for the current OM + OMHAMetrics omhaMetrics = om.getOmhaMetrics(); + String nodeId = om.getOMNodeId(); + + // If current OM is leader, state should be 1 + int expectedState = nodeId + .equals(leaderOMId) ? 1 : 0; + + Assertions.assertEquals(expectedState, + omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); + + Assertions.assertEquals(nodeId, omhaMetrics.getOmhaInfoNodeId()); + } + } + + + /** + * Some tests are stopping or restarting OMs. + * There are test cases where we might need to + * wait for a leader to be elected and ready. + */ + private void waitForLeaderToBeReady() + throws InterruptedException, TimeoutException { + GenericTestUtils.waitFor(() -> { + try { + return getCluster().getOMLeader().isLeaderReady(); + } catch (Exception e) { + return false; + } + }, 1000, 80000); + } + @Test public void testFileOperationsAndDelete() throws Exception { testFileOperationsWithRecursive(); @@ -135,7 +215,7 @@ private void testFileOperationsWithRecursive() throws Exception { testCreateFile(ozoneBucket, keyName, data, true, false); fail("testFileOperationsWithRecursive"); } catch (OMException ex) { - Assert.assertEquals(FILE_ALREADY_EXISTS, ex.getResult()); + Assertions.assertEquals(FILE_ALREADY_EXISTS, ex.getResult()); } // Try now with a file name which is same as a directory. @@ -145,7 +225,7 @@ private void testFileOperationsWithRecursive() throws Exception { testCreateFile(ozoneBucket, keyName, data, true, false); fail("testFileOperationsWithNonRecursive"); } catch (OMException ex) { - Assert.assertEquals(NOT_A_FILE, ex.getResult()); + Assertions.assertEquals(NOT_A_FILE, ex.getResult()); } } @@ -185,7 +265,7 @@ private void testKeysDelete() throws Exception { } catch (OMException ex) { // The expected exception PARTIAL_DELETE, as if not able to delete, we // return error codee PARTIAL_DElETE. - Assert.assertEquals(PARTIAL_DELETE, ex.getResult()); + Assertions.assertEquals(PARTIAL_DELETE, ex.getResult()); } } @@ -206,7 +286,7 @@ private void testFileOperationsWithNonRecursive() throws Exception { try { testCreateFile(ozoneBucket, keyName, data, false, false); } catch (OMException ex) { - Assert.assertEquals(DIRECTORY_NOT_FOUND, ex.getResult()); + Assertions.assertEquals(DIRECTORY_NOT_FOUND, ex.getResult()); } // create directory, now this should pass. @@ -221,7 +301,7 @@ private void testFileOperationsWithNonRecursive() throws Exception { testCreateFile(ozoneBucket, keyName, data, false, false); fail("testFileOperationsWithRecursive"); } catch (OMException ex) { - Assert.assertEquals(FILE_ALREADY_EXISTS, ex.getResult()); + Assertions.assertEquals(FILE_ALREADY_EXISTS, ex.getResult()); } @@ -241,7 +321,7 @@ private void testFileOperationsWithNonRecursive() throws Exception { testCreateFile(ozoneBucket, keyName, data, false, false); fail("testFileOperationsWithNonRecursive"); } catch (OMException ex) { - Assert.assertEquals(NOT_A_FILE, ex.getResult()); + Assertions.assertEquals(NOT_A_FILE, ex.getResult()); } } @@ -274,7 +354,7 @@ private void testMultipartUploadWithOneOmNodeDown() throws Exception { String newLeaderOMNodeId = omFailoverProxyProvider.getCurrentProxyOMNodeId(); - Assert.assertTrue(!leaderOMNodeId.equals(newLeaderOMNodeId)); + Assertions.assertTrue(!leaderOMNodeId.equals(newLeaderOMNodeId)); } private String initiateMultipartUpload(OzoneBucket ozoneBucket, @@ -286,7 +366,7 @@ private String initiateMultipartUpload(OzoneBucket ozoneBucket, ReplicationFactor.ONE); String uploadID = omMultipartInfo.getUploadID(); - Assert.assertTrue(uploadID != null); + Assertions.assertTrue(uploadID != null); return uploadID; } @@ -305,15 +385,15 @@ private void createMultipartKeyAndReadKey(OzoneBucket ozoneBucket, OmMultipartUploadCompleteInfo omMultipartUploadCompleteInfo = ozoneBucket.completeMultipartUpload(keyName, uploadID, partsMap); - Assert.assertTrue(omMultipartUploadCompleteInfo != null); - Assert.assertTrue(omMultipartUploadCompleteInfo.getHash() != null); + Assertions.assertTrue(omMultipartUploadCompleteInfo != null); + Assertions.assertTrue(omMultipartUploadCompleteInfo.getHash() != null); OzoneInputStream ozoneInputStream = ozoneBucket.readKey(keyName); byte[] fileContent = new byte[value.getBytes(UTF_8).length]; ozoneInputStream.read(fileContent); - Assert.assertEquals(value, new String(fileContent, UTF_8)); + Assertions.assertEquals(value, new String(fileContent, UTF_8)); } @Test @@ -368,10 +448,10 @@ public void testOMRatisSnapshot() throws Exception { long smLastAppliedIndex = ozoneManager.getOmRatisServer().getLastAppliedTermIndex().getIndex(); long ratisSnapshotIndex = ozoneManager.getRatisSnapshotIndex(); - Assert.assertTrue("LastAppliedIndex on OM State Machine (" - + smLastAppliedIndex + ") is less than the saved snapshot index(" - + ratisSnapshotIndex + ").", - smLastAppliedIndex >= ratisSnapshotIndex); + Assertions.assertTrue(smLastAppliedIndex >= ratisSnapshotIndex, + "LastAppliedIndex on OM State Machine (" + + smLastAppliedIndex + ") is less than the saved snapshot index(" + + ratisSnapshotIndex + ")."); // Add more transactions to Ratis to trigger another snapshot while (appliedLogIndex <= (smLastAppliedIndex + getSnapshotThreshold())) { @@ -393,8 +473,9 @@ public void testOMRatisSnapshot() throws Exception { // The new snapshot index must be greater than the previous snapshot index long ratisSnapshotIndexNew = ozoneManager.getRatisSnapshotIndex(); - Assert.assertTrue("Latest snapshot index must be greater than previous " + - "snapshot indices", ratisSnapshotIndexNew > ratisSnapshotIndex); + Assertions.assertTrue(ratisSnapshotIndexNew > ratisSnapshotIndex, + "Latest snapshot index must be greater than previous " + + "snapshot indices"); } @@ -459,7 +540,7 @@ public void testOMRestart() throws Exception { final long leaderOMSnaphsotIndex = leaderOM.getRatisSnapshotIndex(); // The stopped OM should be lagging behind the leader OM. - Assert.assertTrue(followerOM1LastAppliedIndex < leaderOMSnaphsotIndex); + Assertions.assertTrue(followerOM1LastAppliedIndex < leaderOMSnaphsotIndex); // Restart the stopped OM. followerOM1.restart(); @@ -478,7 +559,8 @@ public void testOMRestart() throws Exception { final long followerOM1LastAppliedIndexNew = followerOM1.getOmRatisServer().getLastAppliedTermIndex().getIndex(); - Assert.assertTrue(followerOM1LastAppliedIndexNew > leaderOMSnaphsotIndex); + Assertions.assertTrue( + followerOM1LastAppliedIndexNew > leaderOMSnaphsotIndex); } @Test @@ -522,15 +604,15 @@ private void validateListParts(OzoneBucket ozoneBucket, String keyName, List partInfoList = ozoneMultipartUploadPartListParts.getPartInfoList(); - Assert.assertTrue(partInfoList.size() == partsMap.size()); + Assertions.assertTrue(partInfoList.size() == partsMap.size()); for (int i = 0; i < partsMap.size(); i++) { - Assert.assertEquals(partsMap.get(partInfoList.get(i).getPartNumber()), + Assertions.assertEquals(partsMap.get(partInfoList.get(i).getPartNumber()), partInfoList.get(i).getPartName()); } - Assert.assertFalse(ozoneMultipartUploadPartListParts.isTruncated()); + Assertions.assertFalse(ozoneMultipartUploadPartListParts.isTruncated()); } /** diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 0982c6d482a1..20ba92fbbc3c 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -51,6 +51,7 @@ import java.util.stream.Collectors; import com.google.common.base.Optional; +import com.google.common.base.Strings; import com.google.common.collect.ImmutableSortedSet; import com.google.common.collect.Lists; import org.apache.hadoop.conf.Configuration; @@ -82,6 +83,7 @@ import org.apache.hadoop.hdds.utils.db.Table.KeyValue; import org.apache.hadoop.hdds.utils.db.TableIterator; import org.apache.hadoop.ozone.OzoneManagerVersion; +import org.apache.hadoop.ozone.om.ha.OMHAMetrics; import org.apache.hadoop.ozone.om.helpers.KeyInfoWithVolumeContext; import org.apache.hadoop.ozone.om.helpers.SnapshotInfo; import org.apache.hadoop.ozone.om.request.OMClientRequest; @@ -347,6 +349,7 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl private final OzoneAdmins s3OzoneAdmins; private final OMMetrics metrics; + private OMHAMetrics omhaMetrics; private final ProtocolMessageMetrics omClientProtocolMetrics; private OzoneManagerHttpServer httpServer; @@ -1894,6 +1897,26 @@ public void updatePeerList(List newPeers) { } } } + RaftPeer leader = null; + try { + leader = omRatisServer.getLeader(); + } catch (IOException ex) { + LOG.error("IOException while getting the " + + "Ratis server leader.", ex); + } + if (Objects.nonNull(leader)) { + String leaderId = leader.getId().toString(); + + // If leaderId is empty, then leader is undefined + // and current OM is neither leader nor follower. + // OMHAMetrics shouldn't be registered in that case. + if (!Strings.isNullOrEmpty(leaderId)) { + omHAMetricsInit(leaderId); + } else { + // unregister, to get rid of stale metrics + OMHAMetrics.unRegister(); + } + } } /** @@ -2160,6 +2183,7 @@ public void stop() { if (omRatisServer != null) { omRatisServer.stop(); omRatisServer = null; + OMHAMetrics.unRegister(); } isOmRpcServerRunning = false; if (isOmGrpcServerEnabled) { @@ -2911,6 +2935,22 @@ public String getRatisRoles() { } } + /** + * Create OMHAMetrics instance. + */ + private void omHAMetricsInit(String leaderId) { + // unregister, in case metrics already exist + // so that the metric tags will get updated. + OMHAMetrics.unRegister(); + omhaMetrics = OMHAMetrics + .create(getOMNodeId(), leaderId); + } + + @VisibleForTesting + public OMHAMetrics getOmhaMetrics() { + return omhaMetrics; + } + public String getRatisLogDirectory() { return OzoneManagerRatisUtils.getOMRatisDirectory(configuration); } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java new file mode 100644 index 000000000000..7fde2943bbee --- /dev/null +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java @@ -0,0 +1,132 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS,WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.ozone.om.ha; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.metrics2.MetricsCollector; +import org.apache.hadoop.metrics2.MetricsInfo; +import org.apache.hadoop.metrics2.MetricsSource; +import org.apache.hadoop.metrics2.MetricsRecordBuilder; +import org.apache.hadoop.metrics2.annotation.Metrics; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.metrics2.lib.Interns; +import org.apache.hadoop.metrics2.lib.MetricsRegistry; +import org.apache.hadoop.ozone.OzoneConsts; + +/** + * Class to maintain metrics and info related to OM HA. + */ +@Metrics(about = "OzoneManager HA Metrics", context = OzoneConsts.OZONE) +public final class OMHAMetrics implements MetricsSource { + + /** + * Private nested class to hold the values + * of MetricsInfo for OMHAMetrics. + */ + private static final class OMHAMetricsInfo { + + private static final MetricsInfo OZONE_MANAGER_HA_LEADER_STATE = + Interns.info("OzoneManagerHALeaderState", + "Leader active state of OzoneManager node (1 leader, 0 follower)"); + + private static final MetricsInfo NODE_ID = + Interns.info("NodeId", "OM node Id"); + + private int ozoneManagerHALeaderState; + private String nodeId; + + OMHAMetricsInfo() { + this.ozoneManagerHALeaderState = 0; + this.nodeId = ""; + } + + public int getOzoneManagerHALeaderState() { + return ozoneManagerHALeaderState; + } + + public void setOzoneManagerHALeaderState(int ozoneManagerHALeaderState) { + this.ozoneManagerHALeaderState = ozoneManagerHALeaderState; + } + + public String getNodeId() { + return nodeId; + } + + public void setNodeId(String nodeId) { + this.nodeId = nodeId; + } + } + + public static final String SOURCE_NAME = + OMHAMetrics.class.getSimpleName(); + private final OMHAMetricsInfo omhaMetricsInfo = new OMHAMetricsInfo(); + private MetricsRegistry metricsRegistry; + + private String currNodeId; + private String leaderId; + + private OMHAMetrics(String currNodeId, String leaderId) { + this.currNodeId = currNodeId; + this.leaderId = leaderId; + this.metricsRegistry = new MetricsRegistry(SOURCE_NAME); + } + + /** + * Create and return OMHAMetrics instance. + * @return OMHAMetrics + */ + public static OMHAMetrics create( + String nodeId, String leaderId) { + OMHAMetrics metrics = new OMHAMetrics(nodeId, leaderId); + return DefaultMetricsSystem.instance() + .register(SOURCE_NAME, "Metrics for OM HA", metrics); + } + + /** + * Unregister the metrics instance. + */ + public static void unRegister() { + DefaultMetricsSystem.instance().unregisterSource(SOURCE_NAME); + } + + @Override + public synchronized void getMetrics(MetricsCollector collector, boolean all) { + + MetricsRecordBuilder recordBuilder = collector.addRecord(SOURCE_NAME); + + // Check current node state (1 leader, 0 follower) + int state = currNodeId.equals(leaderId) ? 1 : 0; + omhaMetricsInfo.setNodeId(currNodeId); + omhaMetricsInfo.setOzoneManagerHALeaderState(state); + + recordBuilder + .tag(OMHAMetricsInfo.NODE_ID, currNodeId) + .addGauge(OMHAMetricsInfo.OZONE_MANAGER_HA_LEADER_STATE, state); + + recordBuilder.endRecord(); + } + + @VisibleForTesting + public String getOmhaInfoNodeId() { + return omhaMetricsInfo.getNodeId(); + } + + @VisibleForTesting + public int getOmhaInfoOzoneManagerHALeaderState() { + return omhaMetricsInfo.getOzoneManagerHALeaderState(); + } +} diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMHAMetrics.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMHAMetrics.java new file mode 100644 index 000000000000..16192c4902cf --- /dev/null +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMHAMetrics.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.ozone.om.ha; + +import org.apache.commons.lang3.RandomStringUtils; +import org.apache.hadoop.metrics2.impl.MetricsCollectorImpl; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +/** + * Tests for {@link OMHAMetrics}. + */ +public class TestOMHAMetrics { + + private static final MetricsCollectorImpl METRICS_COLLECTOR = + new MetricsCollectorImpl(); + private static final String NODE_ID = + "om" + RandomStringUtils.randomNumeric(5); + private OMHAMetrics omhaMetrics; + private String leaderId; + + @AfterEach + public void cleanUp() { + OMHAMetrics.unRegister(); + } + + @Test + public void testGetMetricsWithLeader() { + leaderId = NODE_ID; + omhaMetrics = OMHAMetrics.create(NODE_ID, leaderId); + + omhaMetrics.getMetrics(METRICS_COLLECTOR, true); + Assertions.assertEquals(1, + omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); + } + + @Test + public void testGetMetricsWithFollower() { + leaderId = "om" + RandomStringUtils.randomNumeric(5); + omhaMetrics = OMHAMetrics.create(NODE_ID, leaderId); + + omhaMetrics.getMetrics(METRICS_COLLECTOR, true); + Assertions.assertEquals(0, + omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); + } +}