Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ services:
ENSURE_OM_INITIALIZED: /data/metadata/om/current/VERSION
<<: *replication
ports:
- 9874
- 9874:9874
- 9862
hostname: om1
command: ["ozone","om"]
Expand Down Expand Up @@ -74,7 +74,7 @@ services:
scm1:
<<: *common-config
ports:
- 9876
- 9876:9876
environment:
ENSURE_SCM_INITIALIZED: /data/metadata/scm/current/VERSION
OZONE-SITE.XML_hdds.scm.safemode.min.datanode: ${OZONE_SAFEMODE_MIN_DATANODES:-1}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@
import org.apache.hadoop.ozone.client.io.OzoneOutputStream;
import org.apache.hadoop.ozone.om.exceptions.OMException;
import org.apache.hadoop.ozone.om.ha.HadoopRpcOMFailoverProxyProvider;
import org.apache.hadoop.ozone.om.ha.OMHAMetrics;
import org.apache.hadoop.ozone.om.helpers.OmMultipartInfo;
import org.apache.hadoop.ozone.om.helpers.OmMultipartUploadCompleteInfo;
import org.apache.ozone.test.GenericTestUtils;
import org.apache.ozone.test.tag.Flaky;
import org.junit.Assert;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

import java.io.IOException;
Expand All @@ -41,6 +42,7 @@
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.TimeoutException;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.hadoop.ozone.MiniOzoneHAClusterImpl.NODE_FAILURE_TIMEOUT;
Expand Down Expand Up @@ -104,6 +106,84 @@ public void testMultipartUpload() throws Exception {
testMultipartUploadWithOneOmNodeDown();
}

@Test
public void testOMHAMetrics() throws InterruptedException,
TimeoutException, IOException {
waitForLeaderToBeReady();

// Get leader OM
OzoneManager leaderOM = getCluster().getOMLeader();
// Store current leader's node ID,
// to use it after restarting the OM
String leaderOMId = leaderOM.getOMNodeId();
// Get a list of all OMs
List<OzoneManager> omList = getCluster().getOzoneManagersList();

// Check metrics for all OMs
checkOMHAMetricsForAllOMs(omList, leaderOMId);

// Restart current leader OM
leaderOM.stop();
leaderOM.restart();

waitForLeaderToBeReady();

// Get the new leader
OzoneManager newLeaderOM = getCluster().getOMLeader();
String newLeaderOMId = newLeaderOM.getOMNodeId();
// Get a list of all OMs again
omList = getCluster().getOzoneManagersList();

// New state for the old leader
int newState = leaderOMId.equals(newLeaderOMId) ? 1 : 0;

// Get old leader
OzoneManager oldLeader = getCluster().getOzoneManager(leaderOMId);
// Get old leader's metrics
OMHAMetrics omhaMetrics = oldLeader.getOmhaMetrics();

Assertions.assertEquals(newState,
omhaMetrics.getOmhaInfoOzoneManagerHALeaderState());

// Check that metrics for all OMs have been updated
checkOMHAMetricsForAllOMs(omList, newLeaderOMId);
}

private void checkOMHAMetricsForAllOMs(List<OzoneManager> omList,
String leaderOMId) {
for (OzoneManager om : omList) {
// Get OMHAMetrics for the current OM
OMHAMetrics omhaMetrics = om.getOmhaMetrics();
String nodeId = om.getOMNodeId();

// If current OM is leader, state should be 1
int expectedState = nodeId
.equals(leaderOMId) ? 1 : 0;

Assertions.assertEquals(expectedState,
omhaMetrics.getOmhaInfoOzoneManagerHALeaderState());

Assertions.assertEquals(nodeId, omhaMetrics.getOmhaInfoNodeId());
}
}


/**
* Some tests are stopping or restarting OMs.
* There are test cases where we might need to
* wait for a leader to be elected and ready.
*/
private void waitForLeaderToBeReady()
throws InterruptedException, TimeoutException {
GenericTestUtils.waitFor(() -> {
try {
return getCluster().getOMLeader().isLeaderReady();
} catch (Exception e) {
return false;
}
}, 1000, 80000);
}

@Test
public void testFileOperationsAndDelete() throws Exception {
testFileOperationsWithRecursive();
Expand Down Expand Up @@ -135,7 +215,7 @@ private void testFileOperationsWithRecursive() throws Exception {
testCreateFile(ozoneBucket, keyName, data, true, false);
fail("testFileOperationsWithRecursive");
} catch (OMException ex) {
Assert.assertEquals(FILE_ALREADY_EXISTS, ex.getResult());
Assertions.assertEquals(FILE_ALREADY_EXISTS, ex.getResult());
}

// Try now with a file name which is same as a directory.
Expand All @@ -145,7 +225,7 @@ private void testFileOperationsWithRecursive() throws Exception {
testCreateFile(ozoneBucket, keyName, data, true, false);
fail("testFileOperationsWithNonRecursive");
} catch (OMException ex) {
Assert.assertEquals(NOT_A_FILE, ex.getResult());
Assertions.assertEquals(NOT_A_FILE, ex.getResult());
}

}
Expand Down Expand Up @@ -185,7 +265,7 @@ private void testKeysDelete() throws Exception {
} catch (OMException ex) {
// The expected exception PARTIAL_DELETE, as if not able to delete, we
// return error codee PARTIAL_DElETE.
Assert.assertEquals(PARTIAL_DELETE, ex.getResult());
Assertions.assertEquals(PARTIAL_DELETE, ex.getResult());
}
}

Expand All @@ -206,7 +286,7 @@ private void testFileOperationsWithNonRecursive() throws Exception {
try {
testCreateFile(ozoneBucket, keyName, data, false, false);
} catch (OMException ex) {
Assert.assertEquals(DIRECTORY_NOT_FOUND, ex.getResult());
Assertions.assertEquals(DIRECTORY_NOT_FOUND, ex.getResult());
}

// create directory, now this should pass.
Expand All @@ -221,7 +301,7 @@ private void testFileOperationsWithNonRecursive() throws Exception {
testCreateFile(ozoneBucket, keyName, data, false, false);
fail("testFileOperationsWithRecursive");
} catch (OMException ex) {
Assert.assertEquals(FILE_ALREADY_EXISTS, ex.getResult());
Assertions.assertEquals(FILE_ALREADY_EXISTS, ex.getResult());
}


Expand All @@ -241,7 +321,7 @@ private void testFileOperationsWithNonRecursive() throws Exception {
testCreateFile(ozoneBucket, keyName, data, false, false);
fail("testFileOperationsWithNonRecursive");
} catch (OMException ex) {
Assert.assertEquals(NOT_A_FILE, ex.getResult());
Assertions.assertEquals(NOT_A_FILE, ex.getResult());
}

}
Expand Down Expand Up @@ -274,7 +354,7 @@ private void testMultipartUploadWithOneOmNodeDown() throws Exception {
String newLeaderOMNodeId =
omFailoverProxyProvider.getCurrentProxyOMNodeId();

Assert.assertTrue(!leaderOMNodeId.equals(newLeaderOMNodeId));
Assertions.assertTrue(!leaderOMNodeId.equals(newLeaderOMNodeId));
}

private String initiateMultipartUpload(OzoneBucket ozoneBucket,
Expand All @@ -286,7 +366,7 @@ private String initiateMultipartUpload(OzoneBucket ozoneBucket,
ReplicationFactor.ONE);

String uploadID = omMultipartInfo.getUploadID();
Assert.assertTrue(uploadID != null);
Assertions.assertTrue(uploadID != null);
return uploadID;
}

Expand All @@ -305,15 +385,15 @@ private void createMultipartKeyAndReadKey(OzoneBucket ozoneBucket,
OmMultipartUploadCompleteInfo omMultipartUploadCompleteInfo =
ozoneBucket.completeMultipartUpload(keyName, uploadID, partsMap);

Assert.assertTrue(omMultipartUploadCompleteInfo != null);
Assert.assertTrue(omMultipartUploadCompleteInfo.getHash() != null);
Assertions.assertTrue(omMultipartUploadCompleteInfo != null);
Assertions.assertTrue(omMultipartUploadCompleteInfo.getHash() != null);


OzoneInputStream ozoneInputStream = ozoneBucket.readKey(keyName);

byte[] fileContent = new byte[value.getBytes(UTF_8).length];
ozoneInputStream.read(fileContent);
Assert.assertEquals(value, new String(fileContent, UTF_8));
Assertions.assertEquals(value, new String(fileContent, UTF_8));
}

@Test
Expand Down Expand Up @@ -368,10 +448,10 @@ public void testOMRatisSnapshot() throws Exception {
long smLastAppliedIndex =
ozoneManager.getOmRatisServer().getLastAppliedTermIndex().getIndex();
long ratisSnapshotIndex = ozoneManager.getRatisSnapshotIndex();
Assert.assertTrue("LastAppliedIndex on OM State Machine ("
+ smLastAppliedIndex + ") is less than the saved snapshot index("
+ ratisSnapshotIndex + ").",
smLastAppliedIndex >= ratisSnapshotIndex);
Assertions.assertTrue(smLastAppliedIndex >= ratisSnapshotIndex,
"LastAppliedIndex on OM State Machine ("
+ smLastAppliedIndex + ") is less than the saved snapshot index("
+ ratisSnapshotIndex + ").");

// Add more transactions to Ratis to trigger another snapshot
while (appliedLogIndex <= (smLastAppliedIndex + getSnapshotThreshold())) {
Expand All @@ -393,8 +473,9 @@ public void testOMRatisSnapshot() throws Exception {

// The new snapshot index must be greater than the previous snapshot index
long ratisSnapshotIndexNew = ozoneManager.getRatisSnapshotIndex();
Assert.assertTrue("Latest snapshot index must be greater than previous " +
"snapshot indices", ratisSnapshotIndexNew > ratisSnapshotIndex);
Assertions.assertTrue(ratisSnapshotIndexNew > ratisSnapshotIndex,
"Latest snapshot index must be greater than previous " +
"snapshot indices");

}

Expand Down Expand Up @@ -459,7 +540,7 @@ public void testOMRestart() throws Exception {
final long leaderOMSnaphsotIndex = leaderOM.getRatisSnapshotIndex();

// The stopped OM should be lagging behind the leader OM.
Assert.assertTrue(followerOM1LastAppliedIndex < leaderOMSnaphsotIndex);
Assertions.assertTrue(followerOM1LastAppliedIndex < leaderOMSnaphsotIndex);

// Restart the stopped OM.
followerOM1.restart();
Expand All @@ -478,7 +559,8 @@ public void testOMRestart() throws Exception {

final long followerOM1LastAppliedIndexNew =
followerOM1.getOmRatisServer().getLastAppliedTermIndex().getIndex();
Assert.assertTrue(followerOM1LastAppliedIndexNew > leaderOMSnaphsotIndex);
Assertions.assertTrue(
followerOM1LastAppliedIndexNew > leaderOMSnaphsotIndex);
}

@Test
Expand Down Expand Up @@ -522,15 +604,15 @@ private void validateListParts(OzoneBucket ozoneBucket, String keyName,
List<OzoneMultipartUploadPartListParts.PartInfo> partInfoList =
ozoneMultipartUploadPartListParts.getPartInfoList();

Assert.assertTrue(partInfoList.size() == partsMap.size());
Assertions.assertTrue(partInfoList.size() == partsMap.size());

for (int i = 0; i < partsMap.size(); i++) {
Assert.assertEquals(partsMap.get(partInfoList.get(i).getPartNumber()),
Assertions.assertEquals(partsMap.get(partInfoList.get(i).getPartNumber()),
partInfoList.get(i).getPartName());

}

Assert.assertFalse(ozoneMultipartUploadPartListParts.isTruncated());
Assertions.assertFalse(ozoneMultipartUploadPartListParts.isTruncated());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
import java.util.stream.Collectors;

import com.google.common.base.Optional;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
Expand Down Expand Up @@ -82,6 +83,7 @@
import org.apache.hadoop.hdds.utils.db.Table.KeyValue;
import org.apache.hadoop.hdds.utils.db.TableIterator;
import org.apache.hadoop.ozone.OzoneManagerVersion;
import org.apache.hadoop.ozone.om.ha.OMHAMetrics;
import org.apache.hadoop.ozone.om.helpers.KeyInfoWithVolumeContext;
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
import org.apache.hadoop.ozone.om.request.OMClientRequest;
Expand Down Expand Up @@ -347,6 +349,7 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl
private final OzoneAdmins s3OzoneAdmins;

private final OMMetrics metrics;
private OMHAMetrics omhaMetrics;
private final ProtocolMessageMetrics<ProtocolMessageEnum>
omClientProtocolMetrics;
private OzoneManagerHttpServer httpServer;
Expand Down Expand Up @@ -1894,6 +1897,26 @@ public void updatePeerList(List<String> newPeers) {
}
}
}
RaftPeer leader = null;
try {
leader = omRatisServer.getLeader();
} catch (IOException ex) {
LOG.error("IOException while getting the " +
"Ratis server leader.", ex);
}
if (Objects.nonNull(leader)) {
String leaderId = leader.getId().toString();

// If leaderId is empty, then leader is undefined
// and current OM is neither leader nor follower.
// OMHAMetrics shouldn't be registered in that case.
if (!Strings.isNullOrEmpty(leaderId)) {
Comment on lines +1900 to +1913
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@neils-dev @xBis7

I had to update this part of the code after merging the PR due to a conflicting change that had been just merged (HDDS-6743, getLeader() no longer throws IOException).

Upon closer inspection, I think these if-else blocks have a problem. Please correct me if I'm wrong, but if leader is undefined then leader will be null, so the unregistration will not happen. (This seems to have been the case even before HDDS-6743.) If we have any leader information, its id cannot be null.

I think it should be:

if (Objects.nonNull(leader)) {
  // init
} else {
  // unregister
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@adoroszlai You are right. This was missed because the code was

String leaderId = "";
try{
  leader = 
} catch() {

}

if (Objects.nonNull(leader)) {

}

// leaderId could be deliberately left empty down here due to failure to get the leader
// after refactoring `String leaderId = "";` was removed.

If we have any leader information, its id cannot be null.

I didn't know that.

How can we handle this now since the code has been merged?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @xBis7 for checking.

If we have any leader information, its id cannot be null.

I didn't know that.

https://github.com/apache/ratis/blob/27f5a59cc697ce51f1c5ab6d7b6c63eb58390b79/ratis-common/src/main/java/org/apache/ratis/protocol/RaftPeer.java#L175

How can we handle this now since the code has been merged?

HDDS-8009

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@adoroszlai Thanks! I'll create a patch shortly for HDDS-8009.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @adoroszlai for merging and uncovering this change. With getLeader() no longer throwing an exception, we can cleanup the null check and handling of that condition.

omHAMetricsInit(leaderId);
} else {
// unregister, to get rid of stale metrics
OMHAMetrics.unRegister();
}
}
}

/**
Expand Down Expand Up @@ -2160,6 +2183,7 @@ public void stop() {
if (omRatisServer != null) {
omRatisServer.stop();
omRatisServer = null;
OMHAMetrics.unRegister();
}
isOmRpcServerRunning = false;
if (isOmGrpcServerEnabled) {
Expand Down Expand Up @@ -2911,6 +2935,22 @@ public String getRatisRoles() {
}
}

/**
* Create OMHAMetrics instance.
*/
private void omHAMetricsInit(String leaderId) {
// unregister, in case metrics already exist
// so that the metric tags will get updated.
OMHAMetrics.unRegister();
omhaMetrics = OMHAMetrics
.create(getOMNodeId(), leaderId);
}

@VisibleForTesting
public OMHAMetrics getOmhaMetrics() {
return omhaMetrics;
}

public String getRatisLogDirectory() {
return OzoneManagerRatisUtils.getOMRatisDirectory(configuration);
}
Expand Down
Loading