From 492048afcdc5660bdf4a81c8348788c5a6e1f317 Mon Sep 17 00:00:00 2001 From: xBis7 Date: Tue, 3 Jan 2023 18:36:37 +0200 Subject: [PATCH 01/12] Make OM Ratis roles available in /prom endpoint --- .../ozone/om/TestOzoneManagerHAWithData.java | 63 ++++++++++------ .../apache/hadoop/ozone/om/OzoneManager.java | 54 ++++++++++++-- .../hadoop/ozone/om/ha/OMHAMetrics.java | 72 +++++++++++++++++++ 3 files changed, 162 insertions(+), 27 deletions(-) create mode 100644 hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java index fbfe6d36e3e5..62ae81e8432f 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java @@ -28,11 +28,12 @@ import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.ozone.om.ha.HadoopRpcOMFailoverProxyProvider; +import org.apache.hadoop.ozone.om.ha.OMHAMetrics; import org.apache.hadoop.ozone.om.helpers.OmMultipartInfo; import org.apache.hadoop.ozone.om.helpers.OmMultipartUploadCompleteInfo; import org.apache.ozone.test.GenericTestUtils; import org.apache.ozone.test.tag.Flaky; -import org.junit.Assert; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.io.IOException; @@ -104,6 +105,20 @@ public void testMultipartUpload() throws Exception { testMultipartUploadWithOneOmNodeDown(); } + @Test + public void testOMHAMetrics() throws InterruptedException { + Thread.sleep(2000); + OzoneManager om = getCluster().getOzoneManager(1); + OMHAMetrics omhaMetrics = om.getOmhaMetrics(); + + String omRoles = omhaMetrics.getMetricsRegistry() + .getTag("OMRoles").value(); + + Assertions.assertEquals(om.getRatisRoles(), omRoles); + Assertions.assertEquals(getNumOfOMs(), + omhaMetrics.getNumOfOMNodes().value()); + } + @Test public void testFileOperationsAndDelete() throws Exception { testFileOperationsWithRecursive(); @@ -135,7 +150,7 @@ private void testFileOperationsWithRecursive() throws Exception { testCreateFile(ozoneBucket, keyName, data, true, false); fail("testFileOperationsWithRecursive"); } catch (OMException ex) { - Assert.assertEquals(FILE_ALREADY_EXISTS, ex.getResult()); + Assertions.assertEquals(FILE_ALREADY_EXISTS, ex.getResult()); } // Try now with a file name which is same as a directory. @@ -145,7 +160,7 @@ private void testFileOperationsWithRecursive() throws Exception { testCreateFile(ozoneBucket, keyName, data, true, false); fail("testFileOperationsWithNonRecursive"); } catch (OMException ex) { - Assert.assertEquals(NOT_A_FILE, ex.getResult()); + Assertions.assertEquals(NOT_A_FILE, ex.getResult()); } } @@ -185,7 +200,7 @@ private void testKeysDelete() throws Exception { } catch (OMException ex) { // The expected exception PARTIAL_DELETE, as if not able to delete, we // return error codee PARTIAL_DElETE. - Assert.assertEquals(PARTIAL_DELETE, ex.getResult()); + Assertions.assertEquals(PARTIAL_DELETE, ex.getResult()); } } @@ -206,7 +221,7 @@ private void testFileOperationsWithNonRecursive() throws Exception { try { testCreateFile(ozoneBucket, keyName, data, false, false); } catch (OMException ex) { - Assert.assertEquals(DIRECTORY_NOT_FOUND, ex.getResult()); + Assertions.assertEquals(DIRECTORY_NOT_FOUND, ex.getResult()); } // create directory, now this should pass. @@ -221,7 +236,7 @@ private void testFileOperationsWithNonRecursive() throws Exception { testCreateFile(ozoneBucket, keyName, data, false, false); fail("testFileOperationsWithRecursive"); } catch (OMException ex) { - Assert.assertEquals(FILE_ALREADY_EXISTS, ex.getResult()); + Assertions.assertEquals(FILE_ALREADY_EXISTS, ex.getResult()); } @@ -241,7 +256,7 @@ private void testFileOperationsWithNonRecursive() throws Exception { testCreateFile(ozoneBucket, keyName, data, false, false); fail("testFileOperationsWithNonRecursive"); } catch (OMException ex) { - Assert.assertEquals(NOT_A_FILE, ex.getResult()); + Assertions.assertEquals(NOT_A_FILE, ex.getResult()); } } @@ -274,7 +289,7 @@ private void testMultipartUploadWithOneOmNodeDown() throws Exception { String newLeaderOMNodeId = omFailoverProxyProvider.getCurrentProxyOMNodeId(); - Assert.assertTrue(!leaderOMNodeId.equals(newLeaderOMNodeId)); + Assertions.assertTrue(!leaderOMNodeId.equals(newLeaderOMNodeId)); } private String initiateMultipartUpload(OzoneBucket ozoneBucket, @@ -286,7 +301,7 @@ private String initiateMultipartUpload(OzoneBucket ozoneBucket, ReplicationFactor.ONE); String uploadID = omMultipartInfo.getUploadID(); - Assert.assertTrue(uploadID != null); + Assertions.assertTrue(uploadID != null); return uploadID; } @@ -305,15 +320,15 @@ private void createMultipartKeyAndReadKey(OzoneBucket ozoneBucket, OmMultipartUploadCompleteInfo omMultipartUploadCompleteInfo = ozoneBucket.completeMultipartUpload(keyName, uploadID, partsMap); - Assert.assertTrue(omMultipartUploadCompleteInfo != null); - Assert.assertTrue(omMultipartUploadCompleteInfo.getHash() != null); + Assertions.assertTrue(omMultipartUploadCompleteInfo != null); + Assertions.assertTrue(omMultipartUploadCompleteInfo.getHash() != null); OzoneInputStream ozoneInputStream = ozoneBucket.readKey(keyName); byte[] fileContent = new byte[value.getBytes(UTF_8).length]; ozoneInputStream.read(fileContent); - Assert.assertEquals(value, new String(fileContent, UTF_8)); + Assertions.assertEquals(value, new String(fileContent, UTF_8)); } @Test @@ -368,10 +383,10 @@ public void testOMRatisSnapshot() throws Exception { long smLastAppliedIndex = ozoneManager.getOmRatisServer().getLastAppliedTermIndex().getIndex(); long ratisSnapshotIndex = ozoneManager.getRatisSnapshotIndex(); - Assert.assertTrue("LastAppliedIndex on OM State Machine (" - + smLastAppliedIndex + ") is less than the saved snapshot index(" - + ratisSnapshotIndex + ").", - smLastAppliedIndex >= ratisSnapshotIndex); + Assertions.assertTrue(smLastAppliedIndex >= ratisSnapshotIndex, + "LastAppliedIndex on OM State Machine (" + + smLastAppliedIndex + ") is less than the saved snapshot index(" + + ratisSnapshotIndex + ")."); // Add more transactions to Ratis to trigger another snapshot while (appliedLogIndex <= (smLastAppliedIndex + getSnapshotThreshold())) { @@ -393,8 +408,9 @@ public void testOMRatisSnapshot() throws Exception { // The new snapshot index must be greater than the previous snapshot index long ratisSnapshotIndexNew = ozoneManager.getRatisSnapshotIndex(); - Assert.assertTrue("Latest snapshot index must be greater than previous " + - "snapshot indices", ratisSnapshotIndexNew > ratisSnapshotIndex); + Assertions.assertTrue(ratisSnapshotIndexNew > ratisSnapshotIndex, + "Latest snapshot index must be greater than previous " + + "snapshot indices"); } @@ -459,7 +475,7 @@ public void testOMRestart() throws Exception { final long leaderOMSnaphsotIndex = leaderOM.getRatisSnapshotIndex(); // The stopped OM should be lagging behind the leader OM. - Assert.assertTrue(followerOM1LastAppliedIndex < leaderOMSnaphsotIndex); + Assertions.assertTrue(followerOM1LastAppliedIndex < leaderOMSnaphsotIndex); // Restart the stopped OM. followerOM1.restart(); @@ -478,7 +494,8 @@ public void testOMRestart() throws Exception { final long followerOM1LastAppliedIndexNew = followerOM1.getOmRatisServer().getLastAppliedTermIndex().getIndex(); - Assert.assertTrue(followerOM1LastAppliedIndexNew > leaderOMSnaphsotIndex); + Assertions.assertTrue( + followerOM1LastAppliedIndexNew > leaderOMSnaphsotIndex); } @Test @@ -522,15 +539,15 @@ private void validateListParts(OzoneBucket ozoneBucket, String keyName, List partInfoList = ozoneMultipartUploadPartListParts.getPartInfoList(); - Assert.assertTrue(partInfoList.size() == partsMap.size()); + Assertions.assertTrue(partInfoList.size() == partsMap.size()); for (int i = 0; i < partsMap.size(); i++) { - Assert.assertEquals(partsMap.get(partInfoList.get(i).getPartNumber()), + Assertions.assertEquals(partsMap.get(partInfoList.get(i).getPartNumber()), partInfoList.get(i).getPartName()); } - Assert.assertFalse(ozoneMultipartUploadPartListParts.isTruncated()); + Assertions.assertFalse(ozoneMultipartUploadPartListParts.isTruncated()); } /** diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 15b579c9c958..1e1e6c995fe3 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -72,6 +72,7 @@ import org.apache.hadoop.hdds.utils.db.Table.KeyValue; import org.apache.hadoop.hdds.utils.db.TableIterator; import org.apache.hadoop.ozone.OzoneManagerVersion; +import org.apache.hadoop.ozone.om.ha.OMHAMetrics; import org.apache.hadoop.ozone.om.helpers.KeyInfoWithVolumeContext; import org.apache.hadoop.ozone.om.service.OMRangerBGSyncService; import org.apache.hadoop.ozone.om.upgrade.OMLayoutFeature; @@ -341,6 +342,7 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl private final OzoneAdmins s3OzoneAdmins; private final OMMetrics metrics; + private OMHAMetrics omhaMetrics; private final ProtocolMessageMetrics omClientProtocolMetrics; private OzoneManagerHttpServer httpServer; @@ -1826,6 +1828,16 @@ public void updatePeerList(List newPeers) { } } } + if (isRatisEnabled) { + List serviceList = new ArrayList<>(); + try { + serviceList = getServiceList(); + } catch (IOException ex) { + LOG.error("IOException while getting the ServiceInfo list.", ex); + } + String ratisRoles = ratisRolesToString(); + omHAMetricsInit(serviceList, ratisRoles); + } } /** @@ -2092,6 +2104,7 @@ public void stop() { if (omRatisServer != null) { omRatisServer.stop(); omRatisServer = null; + OMHAMetrics.unRegister(); } isOmRpcServerRunning = false; if (isOmGrpcServerEnabled) { @@ -2979,23 +2992,56 @@ public String getRpcPort() { @Override public String getRatisRoles() { - List serviceList = null; + return ratisRolesToString(); + } + + private String ratisRolesToString() { + List serviceList; int port = omNodeDetails.getRatisPort(); - RaftPeer leaderId; + RaftPeer leader; if (isRatisEnabled) { try { - leaderId = omRatisServer.getLeader(); + leader = omRatisServer.getLeader(); serviceList = getServiceList(); } catch (IOException e) { LOG.error("IO-Exception Occurred", e); return "Exception: " + e.toString(); } - return OmUtils.format(serviceList, port, leaderId.getId().toString()); + String leaderId = ""; + if (Objects.nonNull(leader)) { + leaderId += leader.getId().toString(); + } + return OmUtils.format(serviceList, port, leaderId); } else { return "Ratis-Disabled"; } } + /** + * Create OMHAMetrics instance and set the number of + * OM nodes that are up and running. + */ + private void omHAMetricsInit(List serviceInfoList, + String ratisRoles) { + // unregister, in case metrics already exist + // so that the metric tags will get updated. + OMHAMetrics.unRegister(); + omhaMetrics = OMHAMetrics + .create(ratisRoles); + + int omCount = 0; + for (ServiceInfo serviceInfo : serviceInfoList) { + if (serviceInfo.getNodeType().equals(HddsProtos.NodeType.OM)) { + omCount++; + } + } + omhaMetrics.setNumOfOMNodes(omCount); + } + + public OMHAMetrics getOmhaMetrics() { + return omhaMetrics; + } + public String getRatisLogDirectory() { return OzoneManagerRatisUtils.getOMRatisDirectory(configuration); } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java new file mode 100644 index 000000000000..08fd3257b4c6 --- /dev/null +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java @@ -0,0 +1,72 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS,WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.ozone.om.ha; + +import org.apache.hadoop.metrics2.annotation.Metric; +import org.apache.hadoop.metrics2.annotation.Metrics; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.metrics2.lib.MetricsRegistry; +import org.apache.hadoop.metrics2.lib.MutableGaugeInt; +import org.apache.hadoop.ozone.OzoneConsts; + +/** + * Class to maintain metrics and info related to OM HA. + */ +@Metrics(about = "OM HA Metrics", context = OzoneConsts.OZONE) +public class OMHAMetrics { + + public static final String SOURCE_NAME = + OMHAMetrics.class.getSimpleName(); + private final MetricsRegistry metricsRegistry; + + public OMHAMetrics(String ratisRoles) { + this.metricsRegistry = new MetricsRegistry(SOURCE_NAME) + .tag("OMRoles", "OM roles", ratisRoles); + } + + /** + * Create and return OMHAMetrics instance. + * @return OMHAMetrics + */ + public static synchronized OMHAMetrics create(String ratisRoles) { + OMHAMetrics metrics = new OMHAMetrics(ratisRoles); + return DefaultMetricsSystem.instance() + .register(SOURCE_NAME, "Metrics for OM HA", metrics); + } + + /** + * Unregister the metrics instance. + */ + public static void unRegister() { + DefaultMetricsSystem.instance().unregisterSource(SOURCE_NAME); + } + + @Metric("Number of OM nodes") + private MutableGaugeInt numOfOMNodes; + + public void setNumOfOMNodes(int count) { + numOfOMNodes.set(count); + } + + public MutableGaugeInt getNumOfOMNodes() { + return numOfOMNodes; + } + + public MetricsRegistry getMetricsRegistry() { + return metricsRegistry; + } +} From 12d620b5b2a056446d55bb1b13a589adf9ce4517 Mon Sep 17 00:00:00 2001 From: xBis7 Date: Tue, 3 Jan 2023 21:43:37 +0200 Subject: [PATCH 02/12] om, scm http-address setup for ozone-ha --- .../dist/src/main/compose/ozone-ha/docker-compose.yaml | 4 ++-- hadoop-ozone/dist/src/main/compose/ozone-ha/docker-config | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-compose.yaml b/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-compose.yaml index e9ec65935c1b..7332d7c7f33f 100644 --- a/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-compose.yaml +++ b/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-compose.yaml @@ -45,7 +45,7 @@ services: ENSURE_OM_INITIALIZED: /data/metadata/om/current/VERSION <<: *replication ports: - - 9874 + - 9874:9874 - 9862 hostname: om1 command: ["ozone","om"] @@ -74,7 +74,7 @@ services: scm1: <<: *common-config ports: - - 9876 + - 9876:9876 environment: ENSURE_SCM_INITIALIZED: /data/metadata/scm/current/VERSION OZONE-SITE.XML_hdds.scm.safemode.min.datanode: ${OZONE_SAFEMODE_MIN_DATANODES:-1} diff --git a/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-config b/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-config index c22505ff9a42..07fd0481a42f 100644 --- a/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-config +++ b/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-config @@ -23,6 +23,9 @@ OZONE-SITE.XML_ozone.om.address.omservice.om2=om2 OZONE-SITE.XML_ozone.om.address.omservice.om3=om3 OZONE-SITE.XML_ozone.om.ratis.enable=true +OZONE-SITE.XML_ozone.om.http-address=0.0.0.0:9874 +OZONE-SITE.XML_ozone.scm.http-address=0.0.0.0:9876 + OZONE-SITE.XML_ozone.scm.service.ids=scmservice OZONE-SITE.XML_ozone.scm.nodes.scmservice=scm1,scm2,scm3 OZONE-SITE.XML_ozone.scm.address.scmservice.scm1=scm1 From abf2de1d74c8ad1edd20a48cf3510b9270cf4442 Mon Sep 17 00:00:00 2001 From: xBis7 Date: Wed, 4 Jan 2023 15:39:04 +0200 Subject: [PATCH 03/12] OM refactor OMHAMetricsInit() call --- .../main/java/org/apache/hadoop/ozone/om/OzoneManager.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 1e1e6c995fe3..b8ea530d9eb6 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -1828,16 +1828,16 @@ public void updatePeerList(List newPeers) { } } } + List serviceList = new ArrayList<>(); if (isRatisEnabled) { - List serviceList = new ArrayList<>(); try { serviceList = getServiceList(); } catch (IOException ex) { LOG.error("IOException while getting the ServiceInfo list.", ex); } - String ratisRoles = ratisRolesToString(); - omHAMetricsInit(serviceList, ratisRoles); } + String ratisRoles = ratisRolesToString(); + omHAMetricsInit(serviceList, ratisRoles); } /** From b731eea6ec94222492aa370e4db85c9e82b366f4 Mon Sep 17 00:00:00 2001 From: xBis7 Date: Fri, 13 Jan 2023 16:57:04 +0200 Subject: [PATCH 04/12] each OM holds info only about itself - value presented as gauge --- .../ozone/om/TestOzoneManagerHAWithData.java | 26 ++-- .../apache/hadoop/ozone/om/OzoneManager.java | 47 +++---- .../hadoop/ozone/om/ha/OMHAMetrics.java | 132 +++++++++++++++--- 3 files changed, 149 insertions(+), 56 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java index 62ae81e8432f..9d3b16296121 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java @@ -108,15 +108,25 @@ public void testMultipartUpload() throws Exception { @Test public void testOMHAMetrics() throws InterruptedException { Thread.sleep(2000); - OzoneManager om = getCluster().getOzoneManager(1); - OMHAMetrics omhaMetrics = om.getOmhaMetrics(); - String omRoles = omhaMetrics.getMetricsRegistry() - .getTag("OMRoles").value(); - - Assertions.assertEquals(om.getRatisRoles(), omRoles); - Assertions.assertEquals(getNumOfOMs(), - omhaMetrics.getNumOfOMNodes().value()); + OzoneManager leaderOM = getCluster().getOMLeader(); + OzoneManager randomOM = getCluster().getOzoneManager(1); + + // Get OMHAMetrics + OMHAMetrics omhaMetrics = randomOM.getOmhaMetrics(); + + if (randomOM.getOMNodeId() + .equals(leaderOM.getOMNodeId())) { + Assertions.assertEquals("leader", + omhaMetrics.getOmhaInfoState()); + Assertions.assertEquals(1L, + omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); + } else { + Assertions.assertEquals("follower", + omhaMetrics.getOmhaInfoState()); + Assertions.assertEquals(0L, + omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); + } } @Test diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index b8ea530d9eb6..f2f1bea850bd 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -1828,16 +1828,20 @@ public void updatePeerList(List newPeers) { } } } - List serviceList = new ArrayList<>(); + String leaderId = ""; if (isRatisEnabled) { + RaftPeer leader = null; try { - serviceList = getServiceList(); + leader = omRatisServer.getLeader(); } catch (IOException ex) { - LOG.error("IOException while getting the ServiceInfo list.", ex); + LOG.error("IOException while getting the " + + "Ratis server leader.", ex); + } + if (Objects.nonNull(leader)) { + leaderId += leader.getId().toString(); } } - String ratisRoles = ratisRolesToString(); - omHAMetricsInit(serviceList, ratisRoles); + omHAMetricsInit(leaderId); } /** @@ -2992,52 +2996,35 @@ public String getRpcPort() { @Override public String getRatisRoles() { - return ratisRolesToString(); - } - - private String ratisRolesToString() { - List serviceList; + List serviceList = null; int port = omNodeDetails.getRatisPort(); - RaftPeer leader; + RaftPeer leaderId; if (isRatisEnabled) { try { - leader = omRatisServer.getLeader(); + leaderId = omRatisServer.getLeader(); serviceList = getServiceList(); } catch (IOException e) { LOG.error("IO-Exception Occurred", e); return "Exception: " + e.toString(); } - String leaderId = ""; - if (Objects.nonNull(leader)) { - leaderId += leader.getId().toString(); - } - return OmUtils.format(serviceList, port, leaderId); + return OmUtils.format(serviceList, port, leaderId.getId().toString()); } else { return "Ratis-Disabled"; } } /** - * Create OMHAMetrics instance and set the number of - * OM nodes that are up and running. + * Create OMHAMetrics instance. */ - private void omHAMetricsInit(List serviceInfoList, - String ratisRoles) { + private void omHAMetricsInit(String leaderId) { // unregister, in case metrics already exist // so that the metric tags will get updated. OMHAMetrics.unRegister(); omhaMetrics = OMHAMetrics - .create(ratisRoles); - - int omCount = 0; - for (ServiceInfo serviceInfo : serviceInfoList) { - if (serviceInfo.getNodeType().equals(HddsProtos.NodeType.OM)) { - omCount++; - } - } - omhaMetrics.setNumOfOMNodes(omCount); + .create(getOMNodeId(), leaderId); } + @VisibleForTesting public OMHAMetrics getOmhaMetrics() { return omhaMetrics; } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java index 08fd3257b4c6..402c0d3207e9 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java @@ -16,34 +16,103 @@ */ package org.apache.hadoop.ozone.om.ha; -import org.apache.hadoop.metrics2.annotation.Metric; +import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.metrics2.MetricsCollector; +import org.apache.hadoop.metrics2.MetricsInfo; +import org.apache.hadoop.metrics2.MetricsSource; +import org.apache.hadoop.metrics2.MetricsRecordBuilder; import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.metrics2.lib.MetricsRegistry; -import org.apache.hadoop.metrics2.lib.MutableGaugeInt; import org.apache.hadoop.ozone.OzoneConsts; /** * Class to maintain metrics and info related to OM HA. */ -@Metrics(about = "OM HA Metrics", context = OzoneConsts.OZONE) -public class OMHAMetrics { +@Metrics(about = "OzoneManager HA Metrics", context = OzoneConsts.OZONE) +public final class OMHAMetrics implements MetricsSource { + + private enum OMHAMetricsInfo implements MetricsInfo { + + OzoneManagerHALeaderState("Leader active state " + + "of OzoneManager node (1 leader, 0 follower)"), + State("OM State (leader or follower)"), + NodeId("OM node Id"); + + private final String description; + + OMHAMetricsInfo(String description) { + this.description = description; + } + + @Override + public String description() { + return description; + } + } + + /** + * Private nested class to hold + * the values of OMHAMetricsInfo. + */ + private static final class OMHAInfo { + + private long ozoneManagerHALeaderState; + private String state; + private String nodeId; + + OMHAInfo() { + this.ozoneManagerHALeaderState = 0L; + this.state = ""; + this.nodeId = ""; + } + + public long getOzoneManagerHALeaderState() { + return ozoneManagerHALeaderState; + } + + public void setOzoneManagerHALeaderState(long ozoneManagerHALeaderState) { + this.ozoneManagerHALeaderState = ozoneManagerHALeaderState; + } + + public String getState() { + return state; + } + + public void setState(String state) { + this.state = state; + } + + public String getNodeId() { + return nodeId; + } + + public void setNodeId(String nodeId) { + this.nodeId = nodeId; + } + } public static final String SOURCE_NAME = OMHAMetrics.class.getSimpleName(); - private final MetricsRegistry metricsRegistry; + private final OMHAInfo omhaInfo = new OMHAInfo(); + private MetricsRegistry metricsRegistry; - public OMHAMetrics(String ratisRoles) { - this.metricsRegistry = new MetricsRegistry(SOURCE_NAME) - .tag("OMRoles", "OM roles", ratisRoles); + private String currNodeId; + private String leaderId; + + public OMHAMetrics(String currNodeId, String leaderId) { + this.currNodeId = currNodeId; + this.leaderId = leaderId; + this.metricsRegistry = new MetricsRegistry(SOURCE_NAME); } /** * Create and return OMHAMetrics instance. * @return OMHAMetrics */ - public static synchronized OMHAMetrics create(String ratisRoles) { - OMHAMetrics metrics = new OMHAMetrics(ratisRoles); + public static synchronized OMHAMetrics create( + String nodeId, String leaderId) { + OMHAMetrics metrics = new OMHAMetrics(nodeId, leaderId); return DefaultMetricsSystem.instance() .register(SOURCE_NAME, "Metrics for OM HA", metrics); } @@ -55,18 +124,45 @@ public static void unRegister() { DefaultMetricsSystem.instance().unregisterSource(SOURCE_NAME); } - @Metric("Number of OM nodes") - private MutableGaugeInt numOfOMNodes; + @Override + public synchronized void getMetrics(MetricsCollector collector, boolean all) { + + MetricsRecordBuilder recordBuilder = collector.addRecord(SOURCE_NAME); + + if (currNodeId.equals(leaderId)) { + omhaInfo.setNodeId(currNodeId); + omhaInfo.setState("leader"); + omhaInfo.setOzoneManagerHALeaderState(1); + + recordBuilder + .tag(OMHAMetricsInfo.NodeId, currNodeId) + .tag(OMHAMetricsInfo.State, "leader") + .addGauge(OMHAMetricsInfo.OzoneManagerHALeaderState, 1); + } else { + omhaInfo.setNodeId(currNodeId); + omhaInfo.setState("follower"); + omhaInfo.setOzoneManagerHALeaderState(0); + + recordBuilder + .tag(OMHAMetricsInfo.NodeId, currNodeId) + .tag(OMHAMetricsInfo.State, "follower") + .addGauge(OMHAMetricsInfo.OzoneManagerHALeaderState, 0); + } + recordBuilder.endRecord(); + } - public void setNumOfOMNodes(int count) { - numOfOMNodes.set(count); + @VisibleForTesting + public String getOmhaInfoNodeId() { + return omhaInfo.getNodeId(); } - public MutableGaugeInt getNumOfOMNodes() { - return numOfOMNodes; + @VisibleForTesting + public String getOmhaInfoState() { + return omhaInfo.getState(); } - public MetricsRegistry getMetricsRegistry() { - return metricsRegistry; + @VisibleForTesting + public long getOmhaInfoOzoneManagerHALeaderState() { + return omhaInfo.getOzoneManagerHALeaderState(); } } From 732bd36b0e593152f83334423a83f023cef619b0 Mon Sep 17 00:00:00 2001 From: xBis7 Date: Fri, 20 Jan 2023 13:41:58 +0200 Subject: [PATCH 05/12] remove sleep from test --- .../src/main/compose/ozone-ha/docker-config | 3 --- .../ozone/om/TestOzoneManagerHAWithData.java | 22 +++++++++++++++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-config b/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-config index 07fd0481a42f..c22505ff9a42 100644 --- a/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-config +++ b/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-config @@ -23,9 +23,6 @@ OZONE-SITE.XML_ozone.om.address.omservice.om2=om2 OZONE-SITE.XML_ozone.om.address.omservice.om3=om3 OZONE-SITE.XML_ozone.om.ratis.enable=true -OZONE-SITE.XML_ozone.om.http-address=0.0.0.0:9874 -OZONE-SITE.XML_ozone.scm.http-address=0.0.0.0:9876 - OZONE-SITE.XML_ozone.scm.service.ids=scmservice OZONE-SITE.XML_ozone.scm.nodes.scmservice=scm1,scm2,scm3 OZONE-SITE.XML_ozone.scm.address.scmservice.scm1=scm1 diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java index 9d3b16296121..e407816d6e0a 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java @@ -42,6 +42,7 @@ import java.util.List; import java.util.Map; import java.util.UUID; +import java.util.concurrent.TimeoutException; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.hadoop.ozone.MiniOzoneHAClusterImpl.NODE_FAILURE_TIMEOUT; @@ -106,8 +107,9 @@ public void testMultipartUpload() throws Exception { } @Test - public void testOMHAMetrics() throws InterruptedException { - Thread.sleep(2000); + public void testOMHAMetrics() + throws InterruptedException, TimeoutException { + waitForLeaderToBeReady(); OzoneManager leaderOM = getCluster().getOMLeader(); OzoneManager randomOM = getCluster().getOzoneManager(1); @@ -129,6 +131,22 @@ public void testOMHAMetrics() throws InterruptedException { } } + /** + * Some tests are stopping or restarting OMs. + * There are test cases where we might need to + * wait for a leader to be elected and ready. + */ + private void waitForLeaderToBeReady() + throws InterruptedException, TimeoutException { + GenericTestUtils.waitFor(() -> { + try { + return getCluster().getOMLeader().isLeaderReady(); + } catch (Exception e) { + return false; + } + }, 1000, 20000); + } + @Test public void testFileOperationsAndDelete() throws Exception { testFileOperationsWithRecursive(); From e836659ea975c95f02e57ab90deb801e9c0f7eeb Mon Sep 17 00:00:00 2001 From: xBis7 Date: Thu, 26 Jan 2023 18:31:36 +0200 Subject: [PATCH 06/12] removing state tag --- .../ozone/om/TestOzoneManagerHAWithData.java | 4 ---- .../hadoop/ozone/om/ha/OMHAMetrics.java | 20 ------------------- 2 files changed, 24 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java index e407816d6e0a..359ade6261ed 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java @@ -119,13 +119,9 @@ public void testOMHAMetrics() if (randomOM.getOMNodeId() .equals(leaderOM.getOMNodeId())) { - Assertions.assertEquals("leader", - omhaMetrics.getOmhaInfoState()); Assertions.assertEquals(1L, omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); } else { - Assertions.assertEquals("follower", - omhaMetrics.getOmhaInfoState()); Assertions.assertEquals(0L, omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java index 402c0d3207e9..80efcc940262 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java @@ -36,7 +36,6 @@ private enum OMHAMetricsInfo implements MetricsInfo { OzoneManagerHALeaderState("Leader active state " + "of OzoneManager node (1 leader, 0 follower)"), - State("OM State (leader or follower)"), NodeId("OM node Id"); private final String description; @@ -58,12 +57,10 @@ public String description() { private static final class OMHAInfo { private long ozoneManagerHALeaderState; - private String state; private String nodeId; OMHAInfo() { this.ozoneManagerHALeaderState = 0L; - this.state = ""; this.nodeId = ""; } @@ -75,14 +72,6 @@ public void setOzoneManagerHALeaderState(long ozoneManagerHALeaderState) { this.ozoneManagerHALeaderState = ozoneManagerHALeaderState; } - public String getState() { - return state; - } - - public void setState(String state) { - this.state = state; - } - public String getNodeId() { return nodeId; } @@ -131,21 +120,17 @@ public synchronized void getMetrics(MetricsCollector collector, boolean all) { if (currNodeId.equals(leaderId)) { omhaInfo.setNodeId(currNodeId); - omhaInfo.setState("leader"); omhaInfo.setOzoneManagerHALeaderState(1); recordBuilder .tag(OMHAMetricsInfo.NodeId, currNodeId) - .tag(OMHAMetricsInfo.State, "leader") .addGauge(OMHAMetricsInfo.OzoneManagerHALeaderState, 1); } else { omhaInfo.setNodeId(currNodeId); - omhaInfo.setState("follower"); omhaInfo.setOzoneManagerHALeaderState(0); recordBuilder .tag(OMHAMetricsInfo.NodeId, currNodeId) - .tag(OMHAMetricsInfo.State, "follower") .addGauge(OMHAMetricsInfo.OzoneManagerHALeaderState, 0); } recordBuilder.endRecord(); @@ -156,11 +141,6 @@ public String getOmhaInfoNodeId() { return omhaInfo.getNodeId(); } - @VisibleForTesting - public String getOmhaInfoState() { - return omhaInfo.getState(); - } - @VisibleForTesting public long getOmhaInfoOzoneManagerHALeaderState() { return omhaInfo.getOzoneManagerHALeaderState(); From 69ca1ac8065c63c1993e78e0a1f30fd8168911cb Mon Sep 17 00:00:00 2001 From: xBis7 Date: Wed, 1 Feb 2023 16:37:33 +0200 Subject: [PATCH 07/12] reduce code duplication - expand testOMHAMetrics --- .../ozone/om/TestOzoneManagerHAWithData.java | 64 +++++++++++++++---- .../apache/hadoop/ozone/om/OzoneManager.java | 2 +- .../hadoop/ozone/om/ha/OMHAMetrics.java | 24 +++---- 3 files changed, 62 insertions(+), 28 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java index 359ade6261ed..66da0a16b4e1 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java @@ -107,26 +107,66 @@ public void testMultipartUpload() throws Exception { } @Test - public void testOMHAMetrics() - throws InterruptedException, TimeoutException { + public void testOMHAMetrics() throws InterruptedException, + TimeoutException, IOException { waitForLeaderToBeReady(); + // Get leader OM OzoneManager leaderOM = getCluster().getOMLeader(); - OzoneManager randomOM = getCluster().getOzoneManager(1); + // Store current leader's node ID, + // to use it after restarting the OM + String leaderNodeId = leaderOM.getOMNodeId(); + // Get a list of all OMs + List omList = getCluster().getOzoneManagersList(); - // Get OMHAMetrics - OMHAMetrics omhaMetrics = randomOM.getOmhaMetrics(); + // Check metrics for all OMs + checkOMHAMetricsForAllOMs(omList, leaderOM); - if (randomOM.getOMNodeId() - .equals(leaderOM.getOMNodeId())) { - Assertions.assertEquals(1L, - omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); - } else { - Assertions.assertEquals(0L, + // Restart current leader OM + leaderOM.stop(); + leaderOM.restart(); + + waitForLeaderToBeReady(); + + // Get the new leader + OzoneManager newLeaderOM = getCluster().getOMLeader(); + // Get a list of all OMs again + omList = getCluster().getOzoneManagersList(); + + // New state for the old leader + int newState = leaderNodeId.equals(newLeaderOM.getOMNodeId()) ? 1 : 0; + + // Get old leader + OzoneManager oldLeader = getCluster().getOzoneManager(leaderNodeId); + // Get old leader's metrics + OMHAMetrics omhaMetrics = oldLeader.getOmhaMetrics(); + + Assertions.assertEquals(newState, + omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); + + // Check that metrics for all OMs have been updated + checkOMHAMetricsForAllOMs(omList, newLeaderOM); + } + + private void checkOMHAMetricsForAllOMs(List omList, + OzoneManager leaderOM) { + for (OzoneManager om : omList) { + // Get OMHAMetrics for the current OM + OMHAMetrics omhaMetrics = om.getOmhaMetrics(); + String nodeId = om.getOMNodeId(); + + // If current OM is leader, state should be 1 + int expectedState = nodeId + .equals(leaderOM.getOMNodeId()) ? 1 : 0; + + Assertions.assertEquals(expectedState, omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); + + Assertions.assertEquals(nodeId, omhaMetrics.getOmhaInfoNodeId()); } } + /** * Some tests are stopping or restarting OMs. * There are test cases where we might need to @@ -140,7 +180,7 @@ private void waitForLeaderToBeReady() } catch (Exception e) { return false; } - }, 1000, 20000); + }, 1000, 80000); } @Test diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index f2f1bea850bd..90181f51b2f8 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -1838,7 +1838,7 @@ public void updatePeerList(List newPeers) { "Ratis server leader.", ex); } if (Objects.nonNull(leader)) { - leaderId += leader.getId().toString(); + leaderId = leader.getId().toString(); } } omHAMetricsInit(leaderId); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java index 80efcc940262..5a729c37be0f 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java @@ -118,21 +118,15 @@ public synchronized void getMetrics(MetricsCollector collector, boolean all) { MetricsRecordBuilder recordBuilder = collector.addRecord(SOURCE_NAME); - if (currNodeId.equals(leaderId)) { - omhaInfo.setNodeId(currNodeId); - omhaInfo.setOzoneManagerHALeaderState(1); - - recordBuilder - .tag(OMHAMetricsInfo.NodeId, currNodeId) - .addGauge(OMHAMetricsInfo.OzoneManagerHALeaderState, 1); - } else { - omhaInfo.setNodeId(currNodeId); - omhaInfo.setOzoneManagerHALeaderState(0); - - recordBuilder - .tag(OMHAMetricsInfo.NodeId, currNodeId) - .addGauge(OMHAMetricsInfo.OzoneManagerHALeaderState, 0); - } + // Check current node state (1 leader, 0 follower) + int state = currNodeId.equals(leaderId) ? 1 : 0; + omhaInfo.setNodeId(currNodeId); + omhaInfo.setOzoneManagerHALeaderState(state); + + recordBuilder + .tag(OMHAMetricsInfo.NodeId, currNodeId) + .addGauge(OMHAMetricsInfo.OzoneManagerHALeaderState, state); + recordBuilder.endRecord(); } From 4d608b0d9e33d2253f0a1e93b7acb399b8f4ef49 Mon Sep 17 00:00:00 2001 From: xBis7 Date: Thu, 2 Feb 2023 13:51:15 +0200 Subject: [PATCH 08/12] OMHAMetrics cleanup --- .../hadoop/ozone/om/ha/OMHAMetrics.java | 52 ++++++++----------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java index 5a729c37be0f..2c75848b8ff8 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java @@ -23,6 +23,7 @@ import org.apache.hadoop.metrics2.MetricsRecordBuilder; import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.metrics2.lib.Interns; import org.apache.hadoop.metrics2.lib.MetricsRegistry; import org.apache.hadoop.ozone.OzoneConsts; @@ -32,34 +33,23 @@ @Metrics(about = "OzoneManager HA Metrics", context = OzoneConsts.OZONE) public final class OMHAMetrics implements MetricsSource { - private enum OMHAMetricsInfo implements MetricsInfo { - - OzoneManagerHALeaderState("Leader active state " + - "of OzoneManager node (1 leader, 0 follower)"), - NodeId("OM node Id"); - - private final String description; - - OMHAMetricsInfo(String description) { - this.description = description; - } - - @Override - public String description() { - return description; - } - } - /** - * Private nested class to hold - * the values of OMHAMetricsInfo. + * Private nested class to hold the values + * of MetricsInfo for OMHAMetrics. */ - private static final class OMHAInfo { + private static final class OMHAMetricsInfo { + + private static final MetricsInfo OZONE_MANAGER_HA_LEADER_STATE = + Interns.info("OzoneManagerHALeaderState", + "Leader active state of OzoneManager node (1 leader, 0 follower)"); + + private static final MetricsInfo NODE_ID = + Interns.info("NodeId", "OM node Id"); private long ozoneManagerHALeaderState; private String nodeId; - OMHAInfo() { + OMHAMetricsInfo() { this.ozoneManagerHALeaderState = 0L; this.nodeId = ""; } @@ -83,13 +73,13 @@ public void setNodeId(String nodeId) { public static final String SOURCE_NAME = OMHAMetrics.class.getSimpleName(); - private final OMHAInfo omhaInfo = new OMHAInfo(); + private final OMHAMetricsInfo omhaMetricsInfo = new OMHAMetricsInfo(); private MetricsRegistry metricsRegistry; private String currNodeId; private String leaderId; - public OMHAMetrics(String currNodeId, String leaderId) { + private OMHAMetrics(String currNodeId, String leaderId) { this.currNodeId = currNodeId; this.leaderId = leaderId; this.metricsRegistry = new MetricsRegistry(SOURCE_NAME); @@ -99,7 +89,7 @@ public OMHAMetrics(String currNodeId, String leaderId) { * Create and return OMHAMetrics instance. * @return OMHAMetrics */ - public static synchronized OMHAMetrics create( + public static OMHAMetrics create( String nodeId, String leaderId) { OMHAMetrics metrics = new OMHAMetrics(nodeId, leaderId); return DefaultMetricsSystem.instance() @@ -120,23 +110,23 @@ public synchronized void getMetrics(MetricsCollector collector, boolean all) { // Check current node state (1 leader, 0 follower) int state = currNodeId.equals(leaderId) ? 1 : 0; - omhaInfo.setNodeId(currNodeId); - omhaInfo.setOzoneManagerHALeaderState(state); + omhaMetricsInfo.setNodeId(currNodeId); + omhaMetricsInfo.setOzoneManagerHALeaderState(state); recordBuilder - .tag(OMHAMetricsInfo.NodeId, currNodeId) - .addGauge(OMHAMetricsInfo.OzoneManagerHALeaderState, state); + .tag(OMHAMetricsInfo.NODE_ID, currNodeId) + .addGauge(OMHAMetricsInfo.OZONE_MANAGER_HA_LEADER_STATE, state); recordBuilder.endRecord(); } @VisibleForTesting public String getOmhaInfoNodeId() { - return omhaInfo.getNodeId(); + return omhaMetricsInfo.getNodeId(); } @VisibleForTesting public long getOmhaInfoOzoneManagerHALeaderState() { - return omhaInfo.getOzoneManagerHALeaderState(); + return omhaMetricsInfo.getOzoneManagerHALeaderState(); } } From 579778afbf5690ed89a085f72d72efbe05256b39 Mon Sep 17 00:00:00 2001 From: xBis7 Date: Thu, 16 Feb 2023 14:29:52 +0200 Subject: [PATCH 09/12] cleanup --- .../ozone/om/TestOzoneManagerHAWithData.java | 15 +++++++------ .../apache/hadoop/ozone/om/OzoneManager.java | 22 +++++++++---------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java index 66da0a16b4e1..f51872e1810e 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAWithData.java @@ -115,12 +115,12 @@ public void testOMHAMetrics() throws InterruptedException, OzoneManager leaderOM = getCluster().getOMLeader(); // Store current leader's node ID, // to use it after restarting the OM - String leaderNodeId = leaderOM.getOMNodeId(); + String leaderOMId = leaderOM.getOMNodeId(); // Get a list of all OMs List omList = getCluster().getOzoneManagersList(); // Check metrics for all OMs - checkOMHAMetricsForAllOMs(omList, leaderOM); + checkOMHAMetricsForAllOMs(omList, leaderOMId); // Restart current leader OM leaderOM.stop(); @@ -130,14 +130,15 @@ public void testOMHAMetrics() throws InterruptedException, // Get the new leader OzoneManager newLeaderOM = getCluster().getOMLeader(); + String newLeaderOMId = newLeaderOM.getOMNodeId(); // Get a list of all OMs again omList = getCluster().getOzoneManagersList(); // New state for the old leader - int newState = leaderNodeId.equals(newLeaderOM.getOMNodeId()) ? 1 : 0; + int newState = leaderOMId.equals(newLeaderOMId) ? 1 : 0; // Get old leader - OzoneManager oldLeader = getCluster().getOzoneManager(leaderNodeId); + OzoneManager oldLeader = getCluster().getOzoneManager(leaderOMId); // Get old leader's metrics OMHAMetrics omhaMetrics = oldLeader.getOmhaMetrics(); @@ -145,11 +146,11 @@ public void testOMHAMetrics() throws InterruptedException, omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); // Check that metrics for all OMs have been updated - checkOMHAMetricsForAllOMs(omList, newLeaderOM); + checkOMHAMetricsForAllOMs(omList, newLeaderOMId); } private void checkOMHAMetricsForAllOMs(List omList, - OzoneManager leaderOM) { + String leaderOMId) { for (OzoneManager om : omList) { // Get OMHAMetrics for the current OM OMHAMetrics omhaMetrics = om.getOmhaMetrics(); @@ -157,7 +158,7 @@ private void checkOMHAMetricsForAllOMs(List omList, // If current OM is leader, state should be 1 int expectedState = nodeId - .equals(leaderOM.getOMNodeId()) ? 1 : 0; + .equals(leaderOMId) ? 1 : 0; Assertions.assertEquals(expectedState, omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 90181f51b2f8..7db0098a225b 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -1829,18 +1829,18 @@ public void updatePeerList(List newPeers) { } } String leaderId = ""; - if (isRatisEnabled) { - RaftPeer leader = null; - try { - leader = omRatisServer.getLeader(); - } catch (IOException ex) { - LOG.error("IOException while getting the " + - "Ratis server leader.", ex); - } - if (Objects.nonNull(leader)) { - leaderId = leader.getId().toString(); - } + RaftPeer leader = null; + try { + leader = omRatisServer.getLeader(); + } catch (IOException ex) { + LOG.error("IOException while getting the " + + "Ratis server leader.", ex); + } + if (Objects.nonNull(leader)) { + leaderId = leader.getId().toString(); } + // If leaderId is empty, then for some reason there is + // no leader and all OMs will present as followers. omHAMetricsInit(leaderId); } From dd60ffc27002405cb2d792055a5c0bff1d6e4518 Mon Sep 17 00:00:00 2001 From: xBis7 Date: Thu, 16 Feb 2023 15:38:31 +0200 Subject: [PATCH 10/12] unit tests --- .../hadoop/fs/ozone/TestOzoneFsSnapshot.java | 2 +- .../hadoop/ozone/om/ha/OMHAMetrics.java | 10 +-- .../hadoop/ozone/om/ha/TestOMHAMetrics.java | 71 +++++++++++++++++++ 3 files changed, 77 insertions(+), 6 deletions(-) create mode 100644 hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMHAMetrics.java diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/fs/ozone/TestOzoneFsSnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/fs/ozone/TestOzoneFsSnapshot.java index dcbdebf9bf0a..3cb28e42cb56 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/fs/ozone/TestOzoneFsSnapshot.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/fs/ozone/TestOzoneFsSnapshot.java @@ -63,7 +63,7 @@ public class TestOzoneFsSnapshot { private static OzoneManager ozoneManager; private static OzoneFsShell shell; private static final String VOLUME = - "vol-" + RandomStringUtils.randomNumeric(5);; + "vol-" + RandomStringUtils.randomNumeric(5); private static final String BUCKET = "buck-" + RandomStringUtils.randomNumeric(5); private static final String KEY = diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java index 2c75848b8ff8..7fde2943bbee 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHAMetrics.java @@ -46,19 +46,19 @@ private static final class OMHAMetricsInfo { private static final MetricsInfo NODE_ID = Interns.info("NodeId", "OM node Id"); - private long ozoneManagerHALeaderState; + private int ozoneManagerHALeaderState; private String nodeId; OMHAMetricsInfo() { - this.ozoneManagerHALeaderState = 0L; + this.ozoneManagerHALeaderState = 0; this.nodeId = ""; } - public long getOzoneManagerHALeaderState() { + public int getOzoneManagerHALeaderState() { return ozoneManagerHALeaderState; } - public void setOzoneManagerHALeaderState(long ozoneManagerHALeaderState) { + public void setOzoneManagerHALeaderState(int ozoneManagerHALeaderState) { this.ozoneManagerHALeaderState = ozoneManagerHALeaderState; } @@ -126,7 +126,7 @@ public String getOmhaInfoNodeId() { } @VisibleForTesting - public long getOmhaInfoOzoneManagerHALeaderState() { + public int getOmhaInfoOzoneManagerHALeaderState() { return omhaMetricsInfo.getOzoneManagerHALeaderState(); } } diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMHAMetrics.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMHAMetrics.java new file mode 100644 index 000000000000..ff4c00158deb --- /dev/null +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMHAMetrics.java @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.ozone.om.ha; + +import org.apache.commons.lang3.RandomStringUtils; +import org.apache.hadoop.metrics2.impl.MetricsCollectorImpl; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +/** + * Tests for {@link OMHAMetrics}. + */ +public class TestOMHAMetrics { + + private static final MetricsCollectorImpl METRICS_COLLECTOR = + new MetricsCollectorImpl(); + private static final String NODE_ID = + "om" + RandomStringUtils.randomNumeric(5); + private OMHAMetrics omhaMetrics; + private String leaderId; + + @AfterEach + public void cleanUp() { + OMHAMetrics.unRegister(); + } + + @Test + public void testGetMetricsWithLeader() { + leaderId = NODE_ID; + omhaMetrics = OMHAMetrics.create(NODE_ID, leaderId); + + omhaMetrics.getMetrics(METRICS_COLLECTOR, true); + Assertions.assertEquals(1, + omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); + } + + @Test + public void testGetMetricsWithFollower() { + leaderId = "om" + RandomStringUtils.randomNumeric(5); + omhaMetrics = OMHAMetrics.create(NODE_ID, leaderId); + + omhaMetrics.getMetrics(METRICS_COLLECTOR, true); + Assertions.assertEquals(0, + omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); + } + + @Test + public void testGetMetricsWithEmptyLeaderId() { + leaderId = ""; + omhaMetrics = OMHAMetrics.create(NODE_ID, leaderId); + + omhaMetrics.getMetrics(METRICS_COLLECTOR, true); + Assertions.assertEquals(0, + omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); + } +} From ac49ca72851f7b8c20ca8db84a24107592e1d386 Mon Sep 17 00:00:00 2001 From: xBis7 Date: Tue, 21 Feb 2023 12:41:20 +0200 Subject: [PATCH 11/12] OMHAMetrics not registered if leaderId is empty --- .../hadoop/fs/ozone/TestOzoneFsSnapshot.java | 2 +- .../org/apache/hadoop/ozone/om/OzoneManager.java | 14 +++++++++----- .../apache/hadoop/ozone/om/ha/TestOMHAMetrics.java | 10 ---------- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/fs/ozone/TestOzoneFsSnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/fs/ozone/TestOzoneFsSnapshot.java index 3cb28e42cb56..dcbdebf9bf0a 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/fs/ozone/TestOzoneFsSnapshot.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/fs/ozone/TestOzoneFsSnapshot.java @@ -63,7 +63,7 @@ public class TestOzoneFsSnapshot { private static OzoneManager ozoneManager; private static OzoneFsShell shell; private static final String VOLUME = - "vol-" + RandomStringUtils.randomNumeric(5); + "vol-" + RandomStringUtils.randomNumeric(5);; private static final String BUCKET = "buck-" + RandomStringUtils.randomNumeric(5); private static final String KEY = diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 2705d3cdb285..92a1d5045c0b 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -51,6 +51,7 @@ import java.util.stream.Collectors; import com.google.common.base.Optional; +import com.google.common.base.Strings; import com.google.common.collect.ImmutableSortedSet; import com.google.common.collect.Lists; import org.apache.hadoop.conf.Configuration; @@ -1896,7 +1897,6 @@ public void updatePeerList(List newPeers) { } } } - String leaderId = ""; RaftPeer leader = null; try { leader = omRatisServer.getLeader(); @@ -1905,11 +1905,15 @@ public void updatePeerList(List newPeers) { "Ratis server leader.", ex); } if (Objects.nonNull(leader)) { - leaderId = leader.getId().toString(); + String leaderId = leader.getId().toString(); + + // If leaderId is empty, then leader is undefined + // and current OM is neither leader nor follower. + // OMHAMetrics shouldn't be registered in that case. + if (!Strings.isNullOrEmpty(leaderId)) { + omHAMetricsInit(leaderId); + } } - // If leaderId is empty, then for some reason there is - // no leader and all OMs will present as followers. - omHAMetricsInit(leaderId); } /** diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMHAMetrics.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMHAMetrics.java index ff4c00158deb..16192c4902cf 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMHAMetrics.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMHAMetrics.java @@ -58,14 +58,4 @@ public void testGetMetricsWithFollower() { Assertions.assertEquals(0, omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); } - - @Test - public void testGetMetricsWithEmptyLeaderId() { - leaderId = ""; - omhaMetrics = OMHAMetrics.create(NODE_ID, leaderId); - - omhaMetrics.getMetrics(METRICS_COLLECTOR, true); - Assertions.assertEquals(0, - omhaMetrics.getOmhaInfoOzoneManagerHALeaderState()); - } } From 3100c35ab944652d353be5ee36e84db2c317921c Mon Sep 17 00:00:00 2001 From: xBis7 Date: Tue, 21 Feb 2023 20:02:43 +0200 Subject: [PATCH 12/12] if leaderid is empty unregister metrics --- .../src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 92a1d5045c0b..20ba92fbbc3c 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -1912,6 +1912,9 @@ public void updatePeerList(List newPeers) { // OMHAMetrics shouldn't be registered in that case. if (!Strings.isNullOrEmpty(leaderId)) { omHAMetricsInit(leaderId); + } else { + // unregister, to get rid of stale metrics + OMHAMetrics.unRegister(); } } }