diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/ServerNotLeaderException.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/ServerNotLeaderException.java
new file mode 100644
index 000000000000..71b26f587882
--- /dev/null
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/ServerNotLeaderException.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.hadoop.hdds.ratis;
+
+import org.apache.hadoop.hdds.HddsUtils;
+import org.apache.ratis.protocol.RaftPeerId;
+import org.apache.ratis.protocol.exceptions.NotLeaderException;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Exception thrown when a server is not the leader of a Ratis group.
+ */
+public class ServerNotLeaderException extends IOException {
+ private final String currentPeerId;
+ private final String leader;
+ private static final Pattern CURRENT_PEER_ID_PATTERN =
+ Pattern.compile("Server:(.*) is not the leader[.]+.*", Pattern.DOTALL);
+ private static final Pattern SUGGESTED_LEADER_PATTERN =
+ Pattern.compile(".*Suggested leader is Server:([^.]*).*", Pattern.DOTALL);
+
+ public ServerNotLeaderException(RaftPeerId currentPeerId) {
+ super("Server:" + currentPeerId + " is not the leader. Could not " +
+ "determine the leader node.");
+ this.currentPeerId = currentPeerId.toString();
+ this.leader = null;
+ }
+
+ public ServerNotLeaderException(RaftPeerId currentPeerId,
+ String suggestedLeader) {
+ super("Server:" + currentPeerId + " is not the leader. Suggested leader is"
+ + " Server:" + suggestedLeader + ".");
+ this.currentPeerId = currentPeerId.toString();
+ this.leader = suggestedLeader;
+ }
+
+ public ServerNotLeaderException(String message) {
+ super(message);
+
+ Matcher currentPeerMatcher = CURRENT_PEER_ID_PATTERN.matcher(message);
+ if (currentPeerMatcher.matches()) {
+ this.currentPeerId = currentPeerMatcher.group(1);
+
+ Matcher suggestedLeaderMatcher =
+ SUGGESTED_LEADER_PATTERN.matcher(message);
+ if (suggestedLeaderMatcher.matches()) {
+ this.leader = suggestedLeaderMatcher.group(1);
+ } else {
+ this.leader = null;
+ }
+ } else {
+ this.currentPeerId = null;
+ this.leader = null;
+ }
+ }
+
+ public String getSuggestedLeader() {
+ return leader;
+ }
+
+ /**
+ * Convert {@link org.apache.ratis.protocol.exceptions.NotLeaderException}
+ * to {@link ServerNotLeaderException}.
+ * @param notLeaderException the Ratis NotLeaderException to convert
+ * @param currentPeer the Raft peer id of this SCM
+ * @param port the RPC port to advertise for the suggested leader
+ * @return ServerNotLeaderException
+ */
+ public static ServerNotLeaderException convertToNotLeaderException(
+ NotLeaderException notLeaderException,
+ RaftPeerId currentPeer, String port) {
+ String suggestedLeader = notLeaderException.getSuggestedLeader() != null ?
+ HddsUtils
+ .getHostName(notLeaderException.getSuggestedLeader().getAddress())
+ .get() :
+ null;
+ ServerNotLeaderException serverNotLeaderException;
+ if (suggestedLeader != null) {
+ String suggestedLeaderHostPort = suggestedLeader + ":" + port;
+ serverNotLeaderException =
+ new ServerNotLeaderException(currentPeer, suggestedLeaderHostPort);
+ } else {
+ serverNotLeaderException = new ServerNotLeaderException(currentPeer);
+ }
+ return serverNotLeaderException;
+ }
+}
\ No newline at end of file
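Between the two message-building constructors and the regex pair above, the exception's fields survive an RPC boundary as plain message text. A minimal, self-contained sketch of that round trip (the `scm1`/`scm3` node ids and port `9863` are made-up values for illustration):

```java
import org.apache.hadoop.hdds.ratis.ServerNotLeaderException;

public final class NotLeaderMessageParseDemo {
  public static void main(String[] args) {
    // Message in the exact shape produced by the two-argument constructor.
    String message = "Server:scm3 is not the leader. "
        + "Suggested leader is Server:scm1:9863.";
    ServerNotLeaderException snle = new ServerNotLeaderException(message);
    // CURRENT_PEER_ID_PATTERN captured "scm3"; SUGGESTED_LEADER_PATTERN
    // captured everything up to the next '.', i.e. "scm1:9863".
    System.out.println(snle.getSuggestedLeader()); // scm1:9863
  }
}
```

One caveat worth noting: `SUGGESTED_LEADER_PATTERN` stops its capture at the first `.`, so a dotted FQDN such as `scm1.example.com:9863` would be truncated at `scm1`; the demo deliberately uses a dot-free host id.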
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/NonRetriableException.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/NonRetriableException.java
new file mode 100644
index 000000000000..52da99e67488
--- /dev/null
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/NonRetriableException.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdds.scm.ha;
+
+import java.io.IOException;
+
+/**
+ * Exception for which there should be no retry.
+ */
+public class NonRetriableException extends IOException {
+
+ public NonRetriableException(IOException exception) {
+ super(exception);
+ }
+}
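A hedged sketch of how this wrapper is meant to be used on the server side (the `validate` helper and its message are invented): wrapping the cause in `NonRetriableException` tells the client-side retry policy, via `SCMHAUtils.getRetryAction` further down in this patch, to fail immediately instead of retrying or failing over.

```java
import java.io.IOException;

import org.apache.hadoop.hdds.scm.ha.NonRetriableException;

public final class NonRetriableDemo {

  // Hypothetical server-side check: a permanent failure that no retry
  // or failover can fix is surfaced as NonRetriableException.
  static void validate(boolean ok) throws IOException {
    if (!ok) {
      throw new NonRetriableException(
          new IOException("request failed validation"));
    }
  }

  public static void main(String[] args) throws IOException {
    validate(true);  // passes silently
    validate(false); // throws NonRetriableException
  }
}
```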
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java
index 0db2b5a54092..fb6dc96a5def 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java
@@ -24,15 +24,18 @@
import org.apache.hadoop.hdds.conf.ConfigurationException;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.ratis.ServerNotLeaderException;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.server.ServerUtils;
import org.apache.hadoop.io.retry.RetryPolicy;
+import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.ozone.ha.ConfUtils;
import org.apache.ratis.protocol.exceptions.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
+import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
@@ -198,12 +201,40 @@ public static Collection<String> getSCMNodeIds(
return getSCMNodeIds(configuration, scmServiceId);
}
+ private static Throwable unwrapException(Exception e) {
+ IOException ioException = null;
+ Throwable cause = e.getCause();
+ if (cause instanceof RemoteException) {
+ ioException = ((RemoteException) cause).unwrapRemoteException();
+ }
+ return ioException == null ? e : ioException;
+ }
+
+ /**
+ * Checks if the underlying exception is of type StateMachineException.
+ * Used by SCM clients.
+ */
+ public static boolean isNonRetriableException(Exception e) {
+ Throwable t =
+ getExceptionForClass(e, StateMachineException.class);
+ return t != null;
+ }
+
+ /**
+ * Checks if the underlying exception is of type NonRetriableException.
+ * Used by SCM clients.
+ */
+ public static boolean checkNonRetriableException(Exception e) {
+ Throwable t = unwrapException(e);
+ return NonRetriableException.class.isInstance(t);
+ }
+
// Unwraps the given exception and checks whether it matches any of the
// exceptions in the retriable-with-no-failover list; returns true if it
// does, false otherwise.
public static boolean isRetriableWithNoFailoverException(Exception e) {
Throwable t = e;
- while (t != null && t.getCause() != null) {
+ while (t != null) {
for (Class<? extends Exception> clazz :
getRetriableWithNoFailoverExceptionList()) {
if (clazz.isInstance(t)) {
@@ -215,6 +246,41 @@ public static boolean isRetriableWithNoFailoverException(Exception e) {
return false;
}
+ /**
+ * Checks if the underlying exception is of type
+ * RetriableWithNoFailoverException. Used by SCM clients.
+ */
+ public static boolean checkRetriableWithNoFailoverException(Exception e) {
+ Throwable t = unwrapException(e);
+ return RetriableWithNoFailoverException.class.isInstance(t);
+ }
+
+ public static Throwable getNotLeaderException(Exception e) {
+ return getExceptionForClass(e, NotLeaderException.class);
+ }
+
+ public static Throwable getServerNotLeaderException(Exception e) {
+ return getExceptionForClass(e, ServerNotLeaderException.class);
+ }
+
+ // Returns the underlying exception matching the given class after
+ // unwrapping any RemoteException; returns null if none is found.
+ public static Throwable getExceptionForClass(Exception e,
+ Class<? extends Exception> clazz) {
+ IOException ioException = null;
+ Throwable cause = e.getCause();
+ if (cause instanceof RemoteException) {
+ ioException = ((RemoteException) cause).unwrapRemoteException();
+ }
+ Throwable t = ioException == null ? e : ioException;
+ while (t != null) {
+ if (clazz.isInstance(t)) {
+ return t;
+ }
+ t = t.getCause();
+ }
+ return null;
+ }
+
public static List<Class<? extends
    Exception>> getRetriableWithNoFailoverExceptionList() {
return RETRIABLE_WITH_NO_FAILOVER_EXCEPTION_LIST;
@@ -222,13 +288,15 @@ Exception>> getRetriableWithNoFailoverExceptionList() {
public static RetryPolicy.RetryAction getRetryAction(int failovers, int retry,
Exception e, int maxRetryCount, long retryInterval) {
- if (SCMHAUtils.isRetriableWithNoFailoverException(e)) {
+ if (SCMHAUtils.checkRetriableWithNoFailoverException(e)) {
if (retry < maxRetryCount) {
return new RetryPolicy.RetryAction(
RetryPolicy.RetryAction.RetryDecision.RETRY, retryInterval);
} else {
return RetryPolicy.RetryAction.FAIL;
}
+ } else if (SCMHAUtils.checkNonRetriableException(e)) {
+ return RetryPolicy.RetryAction.FAIL;
} else {
if (failovers < maxRetryCount) {
return new RetryPolicy.RetryAction(
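`getExceptionForClass` above is the workhorse: unwrap a `RemoteException` if one wraps the real cause, then walk the cause chain looking for the target type. A standalone sketch of that walk using plain JDK exception types (the three-level chain here is invented for illustration):

```java
import java.io.IOException;

public final class CauseChainWalkDemo {

  // Mirrors the cause-chain walk in SCMHAUtils.getExceptionForClass,
  // minus the RemoteException unwrapping (which needs a Hadoop runtime).
  static Throwable findInChain(Throwable t, Class<? extends Exception> clazz) {
    while (t != null) {
      if (clazz.isInstance(t)) {
        return t;
      }
      t = t.getCause();
    }
    return null;
  }

  public static void main(String[] args) {
    Exception e = new IOException("rpc failed",
        new RuntimeException("wrapper",
            new IllegalStateException("not the leader")));
    // Finds the innermost IllegalStateException two levels down.
    System.out.println(findInChain(e, IllegalStateException.class));
  }
}
```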
diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index ac8169e35c50..0524b285b949 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -2764,6 +2764,14 @@
+  <property>
+    <name>ozone.client.key.provider.cache.expiry</name>
+    <tag>OZONE, CLIENT, SECURITY</tag>
+    <value>10d</value>
+    <description>Ozone client security key provider cache expiration time.
+    </description>
+  </property>
+
   <property>
     <name>ozone.scm.info.wait.duration</name>
     <tag>OZONE, SCM, OM</tag>
@@ -2773,12 +2781,4 @@
-  <property>
-    <name>ozone.client.key.provider.cache.expiry</name>
-    <tag>OZONE, CLIENT, SECURITY</tag>
-    <value>10d</value>
-    <description>Ozone client security key provider cache expiration time.
-    </description>
-  </property>
-
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/ScmBlockLocationProtocolClientSideTranslatorPB.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/ScmBlockLocationProtocolClientSideTranslatorPB.java
index d93f7f63057b..412b2ec88c00 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/ScmBlockLocationProtocolClientSideTranslatorPB.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/ScmBlockLocationProtocolClientSideTranslatorPB.java
@@ -29,7 +29,6 @@
import org.apache.hadoop.hdds.client.StandaloneReplicationConfig;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
-import org.apache.hadoop.hdds.protocol.proto.ScmBlockLocationProtocolProtos;
import org.apache.hadoop.hdds.protocol.proto.ScmBlockLocationProtocolProtos.SCMBlockLocationRequest;
import org.apache.hadoop.hdds.protocol.proto.ScmBlockLocationProtocolProtos.SCMBlockLocationResponse;
import org.apache.hadoop.hdds.protocol.proto.ScmBlockLocationProtocolProtos.Type;
@@ -119,11 +118,6 @@ private SCMBlockLocationResponse submitRequest(
try {
SCMBlockLocationResponse response =
rpcProxy.send(NULL_RPC_CONTROLLER, req);
- if (response.getStatus() ==
- ScmBlockLocationProtocolProtos.Status.SCM_NOT_LEADER) {
- failoverProxyProvider
- .performFailoverToAssignedLeader(response.getLeaderSCMNodeId());
- }
return response;
} catch (ServiceException e) {
throw ProtobufHelper.getRemoteException(e);
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMBlockLocationFailoverProxyProvider.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMBlockLocationFailoverProxyProvider.java
index 4253d040877d..b0188facea26 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMBlockLocationFailoverProxyProvider.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMBlockLocationFailoverProxyProvider.java
@@ -22,6 +22,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdds.conf.ConfigurationException;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.ratis.ServerNotLeaderException;
import org.apache.hadoop.hdds.scm.ha.SCMHAUtils;
import org.apache.hadoop.hdds.scm.ha.SCMNodeInfo;
import org.apache.hadoop.hdds.scm.protocolPB.ScmBlockLocationProtocolPB;
@@ -29,7 +30,6 @@
import org.apache.hadoop.io.retry.FailoverProxyProvider;
import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy;
-import org.apache.hadoop.io.retry.RetryPolicy.RetryAction;
import org.apache.hadoop.ipc.ProtobufRpcEngine;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.net.NetUtils;
@@ -103,7 +103,7 @@ public SCMBlockLocationFailoverProxyProvider(ConfigurationSource conf) {
this.retryInterval = config.getRetryInterval();
}
- private void loadConfigs() {
+ private synchronized void loadConfigs() {
scmNodeIds = new ArrayList<>();
List<SCMNodeInfo> scmNodeInfoList = SCMNodeInfo.buildNodeInfo(conf);
@@ -131,7 +131,14 @@ private void loadConfigs() {
}
@VisibleForTesting
- public synchronized String getCurrentProxyOMNodeId() {
+ public synchronized void changeCurrentProxy(String nodeId) {
+ currentProxyIndex = scmNodeIds.indexOf(nodeId);
+ currentProxySCMNodeId = nodeId;
+ nextProxyIndex();
+ }
+
+ @VisibleForTesting
+ public synchronized String getCurrentProxySCMNodeId() {
return currentProxySCMNodeId;
}
@@ -143,15 +150,28 @@ public synchronized ProxyInfo getProxy() {
}
@Override
- public void performFailover(ScmBlockLocationProtocolPB newLeader) {
+ public synchronized void performFailover(
+ ScmBlockLocationProtocolPB newLeader) {
// Should do nothing here.
- LOG.debug("Failing over to next proxy. {}", getCurrentProxyOMNodeId());
+ LOG.debug("Failing over to next proxy. {}", getCurrentProxySCMNodeId());
}
- public void performFailoverToAssignedLeader(String newLeader) {
+ public synchronized void performFailoverToAssignedLeader(String newLeader,
+ Exception e) {
+ ServerNotLeaderException snle =
+ (ServerNotLeaderException) SCMHAUtils.getServerNotLeaderException(e);
+ if (snle != null && snle.getSuggestedLeader() != null) {
+ newLeader = scmProxyInfoMap.values().stream().filter(
+ proxyInfo -> NetUtils.getHostPortString(proxyInfo.getAddress())
+ .equals(snle.getSuggestedLeader())).findFirst().get().getNodeId();
+ LOG.debug("Performing failover to suggested leader {}, nodeId {}",
+ snle.getSuggestedLeader(), newLeader);
+ }
if (newLeader == null) {
// If newLeader is not assigned, it will fail over to next proxy.
nextProxyIndex();
+ LOG.debug("Performing failover to next proxy node {}",
+ currentProxySCMNodeId);
} else {
if (!assignLeaderToNode(newLeader)) {
LOG.debug("Failing over SCM proxy to nodeId: {}", newLeader);
@@ -249,8 +269,8 @@ public RetryPolicy getSCMBlockLocationRetryPolicy(String newLeader) {
@Override
public RetryAction shouldRetry(Exception e, int retry,
int failover, boolean b) {
- if (!SCMHAUtils.isRetriableWithNoFailoverException(e)) {
- performFailoverToAssignedLeader(newLeader);
+ if (!SCMHAUtils.checkRetriableWithNoFailoverException(e)) {
+ performFailoverToAssignedLeader(newLeader, e);
}
return SCMHAUtils.getRetryAction(failover, retry, e, maxRetryCount,
getRetryInterval());
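The new `performFailoverToAssignedLeader` maps the suggested leader's `host:port` (via `NetUtils.getHostPortString`) back to an SCM node id by scanning the proxy map. A self-contained sketch of that lookup with invented node ids and addresses; note the provider's own stream ends in `findFirst().get()`, which would throw if the suggested address matches no configured proxy, whereas the sketch keeps the `Optional` and falls back explicitly:

```java
import java.util.Map;
import java.util.Optional;

public final class SuggestedLeaderLookupDemo {
  public static void main(String[] args) {
    // nodeId -> "host:port", standing in for scmProxyInfoMap's addresses.
    Map<String, String> addresses = Map.of(
        "scm1", "scm1:9863",
        "scm2", "scm2:9863",
        "scm3", "scm3:9863");
    String suggestedLeader = "scm2:9863"; // from getSuggestedLeader()
    Optional<String> newLeader = addresses.entrySet().stream()
        .filter(entry -> entry.getValue().equals(suggestedLeader))
        .map(Map.Entry::getKey)
        .findFirst();
    // Present -> fail over to that node; empty -> round robin to next proxy.
    System.out.println(newLeader.orElse("<next proxy>")); // scm2
  }
}
```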
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMClientConfig.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMClientConfig.java
index 65acfaea3a1c..df48b233f622 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMClientConfig.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMClientConfig.java
@@ -37,6 +37,8 @@ public class SCMClientConfig {
public static final String SCM_CLIENT_RPC_TIME_OUT = "rpc.timeout";
public static final String SCM_CLIENT_FAILOVER_MAX_RETRY =
"failover.max.retry";
+ public static final String SCM_CLIENT_MAX_RETRY_TIMEOUT =
+ "max.retry.timeout";
public static final String SCM_CLIENT_RETRY_INTERVAL =
"failover.retry.interval";
@@ -54,6 +56,16 @@ public class SCMClientConfig {
)
private long rpcTimeOut = 15 * 60 * 1000;
+ @Config(key = SCM_CLIENT_MAX_RETRY_TIMEOUT,
+ defaultValue = "10m",
+ type = ConfigType.TIME,
+ timeUnit = TimeUnit.MILLISECONDS,
+ tags = {OZONE, SCM, CLIENT},
+ description = "Max retry timeout for SCM Client"
+ )
+ private long maxRetryTimeout = 10 * 60 * 1000;
+
@Config(key = SCM_CLIENT_FAILOVER_MAX_RETRY,
defaultValue = "15",
type = ConfigType.INT,
@@ -86,7 +98,16 @@ public void setRpcTimeOut(long timeOut) {
}
public int getRetryCount() {
- return retryCount;
+ long duration = getMaxRetryTimeout();
+ int retryCountFromMaxTimeOut = (int) (duration / getRetryInterval());
+ // If the max retry timeout yields a smaller count, fall back to the
+ // configured retry count.
+ return Math.max(retryCountFromMaxTimeOut, retryCount);
+ }
+
+ public long getMaxRetryTimeout() {
+ return maxRetryTimeout;
}
public void setRetryCount(int retryCount) {
@@ -100,4 +121,8 @@ public long getRetryInterval() {
public void setRetryInterval(long retryInterval) {
this.retryInterval = retryInterval;
}
+
+ public void setMaxRetryTimeout(long timeout) {
+ this.maxRetryTimeout = timeout;
+ }
}
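With the new `max.retry.timeout` knob, the effective retry count is whichever is larger: the configured `failover.max.retry` or the count implied by the timeout divided by the retry interval. A quick arithmetic sketch using the same values `TestFailoverWithSCMHA` (further down) pushes through `setRetryInterval(100)` and `setMaxRetryTimeout(1500)`:

```java
public final class RetryCountDemo {
  public static void main(String[] args) {
    long maxRetryTimeout = 1500; // ms, as set in TestFailoverWithSCMHA
    long retryInterval = 100;    // ms
    int configuredRetryCount = 1;
    int fromTimeout = (int) (maxRetryTimeout / retryInterval); // 15
    // getRetryCount() keeps the larger of the two values.
    System.out.println(Math.max(fromTimeout, configuredRetryCount)); // 15
  }
}
```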
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMContainerLocationFailoverProxyProvider.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMContainerLocationFailoverProxyProvider.java
index 4c919ae4839a..96e627555535 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMContainerLocationFailoverProxyProvider.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMContainerLocationFailoverProxyProvider.java
@@ -22,6 +22,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdds.conf.ConfigurationException;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.ratis.ServerNotLeaderException;
import org.apache.hadoop.hdds.scm.ha.SCMHAUtils;
import org.apache.hadoop.hdds.scm.ha.SCMNodeInfo;
import org.apache.hadoop.hdds.scm.protocolPB.StorageContainerLocationProtocolPB;
@@ -99,7 +100,7 @@ public SCMContainerLocationFailoverProxyProvider(ConfigurationSource conf) {
}
@VisibleForTesting
- protected void loadConfigs() {
+ protected synchronized void loadConfigs() {
List<SCMNodeInfo> scmNodeInfoList = SCMNodeInfo.buildNodeInfo(conf);
scmNodeIds = new ArrayList<>();
@@ -131,6 +132,13 @@ public synchronized String getCurrentProxySCMNodeId() {
return currentProxySCMNodeId;
}
+ @VisibleForTesting
+ public synchronized void changeCurrentProxy(String nodeId) {
+ currentProxyIndex = scmNodeIds.indexOf(nodeId);
+ currentProxySCMNodeId = nodeId;
+ nextProxyIndex();
+ }
+
@Override
public synchronized ProxyInfo<StorageContainerLocationProtocolPB> getProxy() {
ProxyInfo<StorageContainerLocationProtocolPB> currentProxyInfo
@@ -148,19 +156,33 @@ public synchronized List getProxies() {
}
@Override
- public synchronized void performFailover(
+ public void performFailover(
StorageContainerLocationProtocolPB newLeader) {
// Should do nothing here.
LOG.debug("Failing over to next proxy. {}", getCurrentProxySCMNodeId());
}
- public synchronized void performFailoverToAssignedLeader(String newLeader) {
+ public synchronized void performFailoverToAssignedLeader(String newLeader,
+ Exception e) {
+ ServerNotLeaderException snle =
+ (ServerNotLeaderException) SCMHAUtils.getServerNotLeaderException(e);
+ if (snle != null && snle.getSuggestedLeader() != null) {
+ newLeader = scmProxyInfoMap.values().stream().filter(
+ proxyInfo -> NetUtils.getHostPortString(proxyInfo.getAddress())
+ .equals(snle.getSuggestedLeader())).findFirst().get().getNodeId();
+ LOG.debug("Performing failover to suggested leader {}, nodeId {}",
+ snle.getSuggestedLeader(), newLeader);
+ }
if (newLeader == null) {
- // If newLeader is not assigned, fail over to next proxy.
- nextProxyIndex();
- } else if (!assignLeaderToNode(newLeader)) {
- // If failed to fail over to newLeader, fail over to next proxy.
+ // If newLeader is not assigned, it will fail over to next proxy.
nextProxyIndex();
+ LOG.debug("Performing failover to next proxy node {}",
+ currentProxySCMNodeId);
+ } else {
+ if (!assignLeaderToNode(newLeader)) {
+ LOG.debug("Failing over SCM proxy to nodeId: {}", newLeader);
+ nextProxyIndex();
+ }
}
}
@@ -181,37 +203,19 @@ public synchronized void close() throws IOException {
}
}
- public RetryPolicy.RetryAction getRetryAction(int failovers, int retry,
- Exception e) {
- if (SCMHAUtils.isRetriableWithNoFailoverException(e)) {
- if (retry < maxRetryCount) {
- return new RetryPolicy.RetryAction(
- RetryPolicy.RetryAction.RetryDecision.RETRY, getRetryInterval());
- } else {
- return RetryPolicy.RetryAction.FAIL;
- }
- } else if (failovers < maxRetryCount) {
- return new RetryPolicy.RetryAction(
- RetryPolicy.RetryAction.RetryDecision.FAILOVER_AND_RETRY,
- getRetryInterval());
- } else {
- return RetryPolicy.RetryAction.FAIL;
- }
- }
-
private long getRetryInterval() {
// TODO add exponential backoff
return retryInterval;
}
- private int nextProxyIndex() {
+ private synchronized int nextProxyIndex() {
// round robin the next proxy
currentProxyIndex = (currentProxyIndex + 1) % scmProxies.size();
currentProxySCMNodeId = scmNodeIds.get(currentProxyIndex);
return currentProxyIndex;
}
- private boolean assignLeaderToNode(String newLeaderNodeId) {
+ private synchronized boolean assignLeaderToNode(String newLeaderNodeId) {
if (!currentProxySCMNodeId.equals(newLeaderNodeId)
&& scmProxies.containsKey(newLeaderNodeId)) {
currentProxySCMNodeId = newLeaderNodeId;
@@ -220,7 +224,6 @@ private boolean assignLeaderToNode(String newLeaderNodeId) {
LOG.debug("Failing over SCM proxy to nodeId: {}", newLeaderNodeId);
return true;
}
-
return false;
}
@@ -271,8 +274,8 @@ public RetryPolicy getRetryPolicy() {
@Override
public RetryAction shouldRetry(Exception e, int retry,
int failover, boolean b) {
- if (!SCMHAUtils.isRetriableWithNoFailoverException(e)) {
- performFailoverToAssignedLeader(null);
+ if (!SCMHAUtils.checkRetriableWithNoFailoverException(e)) {
+ performFailoverToAssignedLeader(null, e);
}
return SCMHAUtils.getRetryAction(failover, retry, e, maxRetryCount,
getRetryInterval());
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMSecurityProtocolFailoverProxyProvider.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMSecurityProtocolFailoverProxyProvider.java
index 13f1bc04c4ef..23f3a3e6cbea 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMSecurityProtocolFailoverProxyProvider.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMSecurityProtocolFailoverProxyProvider.java
@@ -22,6 +22,7 @@
import org.apache.hadoop.hdds.conf.ConfigurationException;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.protocolPB.SCMSecurityProtocolPB;
+import org.apache.hadoop.hdds.ratis.ServerNotLeaderException;
import org.apache.hadoop.hdds.scm.ha.SCMHAUtils;
import org.apache.hadoop.hdds.scm.ha.SCMNodeInfo;
import org.apache.hadoop.hdds.utils.HAUtils;
@@ -99,7 +100,7 @@ public SCMSecurityProtocolFailoverProxyProvider(ConfigurationSource conf,
this.retryInterval = scmClientConfig.getRetryInterval();
}
- protected void loadConfigs() {
+ protected synchronized void loadConfigs() {
List<SCMNodeInfo> scmNodeInfoList = SCMNodeInfo.buildNodeInfo(conf);
scmNodeIds = new ArrayList<>();
@@ -173,7 +174,7 @@ private SCMSecurityProtocolPB createSCMProxy(InetSocketAddress scmAddress)
@Override
- public void performFailover(SCMSecurityProtocolPB currentProxy) {
+ public synchronized void performFailover(SCMSecurityProtocolPB currentProxy) {
if (LOG.isDebugEnabled()) {
int currentIndex = getCurrentProxyIndex();
LOG.debug("Failing over SCM Security proxy to index: {}, nodeId: {}",
@@ -181,10 +182,46 @@ public void performFailover(SCMSecurityProtocolPB currentProxy) {
}
}
+ public synchronized void performFailoverToAssignedLeader(String newLeader,
+ Exception e) {
+ ServerNotLeaderException snle =
+ (ServerNotLeaderException) SCMHAUtils.getServerNotLeaderException(e);
+ if (snle != null && snle.getSuggestedLeader() != null) {
+ newLeader = scmProxyInfoMap.values().stream().filter(
+ proxyInfo -> NetUtils.getHostPortString(proxyInfo.getAddress())
+ .equals(snle.getSuggestedLeader())).findFirst().get().getNodeId();
+ LOG.debug("Performing failover to suggested leader {}, nodeId {}",
+ snle.getSuggestedLeader(), newLeader);
+ }
+ if (newLeader == null) {
+ // If newLeader is not assigned, it will fail over to next proxy.
+ performFailoverToNextProxy();
+ LOG.debug("Performing failover to next proxy node {}",
+ currentProxySCMNodeId);
+ } else {
+ if (!assignLeaderToNode(newLeader)) {
+ LOG.debug("Failing over SCM proxy to nodeId: {}", newLeader);
+ performFailoverToNextProxy();
+ }
+ }
+ }
+
+ private synchronized boolean assignLeaderToNode(String newLeaderNodeId) {
+ if (!currentProxySCMNodeId.equals(newLeaderNodeId)
+ && scmProxies.containsKey(newLeaderNodeId)) {
+ currentProxySCMNodeId = newLeaderNodeId;
+ currentProxyIndex = scmNodeIds.indexOf(currentProxySCMNodeId);
+
+ LOG.debug("Failing over SCM proxy to nodeId: {}", newLeaderNodeId);
+ return true;
+ }
+
+ return false;
+ }
/**
* Performs fail-over to the next proxy.
*/
- public void performFailoverToNextProxy() {
+ public synchronized void performFailoverToNextProxy() {
int newProxyIndex = incrementProxyIndex();
if (LOG.isDebugEnabled()) {
LOG.debug("Incrementing SCM Security proxy index to {}, nodeId: {}",
@@ -228,11 +265,8 @@ public RetryAction shouldRetry(Exception exception, int retries,
return RetryAction.FAIL;
}
- // Perform fail over to next proxy, as right now we don't have any
- // suggested leader ID from server, we fail over to next one.
- // TODO: Act based on server response if leader id is passed.
- if (!SCMHAUtils.isRetriableWithNoFailoverException(exception)) {
- performFailoverToNextProxy();
+ if (!SCMHAUtils.checkRetriableWithNoFailoverException(exception)) {
+ performFailoverToAssignedLeader(null, exception);
}
return SCMHAUtils
.getRetryAction(failovers, retries, exception, maxRetryCount,
@@ -250,7 +284,7 @@ public Class< SCMSecurityProtocolPB > getInterface() {
}
@Override
- public void close() throws IOException {
+ public synchronized void close() throws IOException {
for (Map.Entry<String, SCMSecurityProtocolPB> proxy :
scmProxies.entrySet()) {
if (proxy.getValue() != null) {
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
index 112843181fba..b3f4c8dbbe6d 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
@@ -79,8 +79,7 @@ public final class HAUtils {
private HAUtils() {
}
- public static ScmInfo getScmInfo(OzoneConfiguration conf)
- throws IOException {
+ public static ScmInfo getScmInfo(OzoneConfiguration conf) throws IOException {
OzoneConfiguration configuration = new OzoneConfiguration(conf);
try {
long duration = conf.getTimeDuration(OZONE_SCM_INFO_WAIT_DURATION,
@@ -88,15 +87,13 @@ public static ScmInfo getScmInfo(OzoneConfiguration conf)
SCMClientConfig scmClientConfig =
configuration.getObject(SCMClientConfig.class);
int retryCount =
- (int) (duration / (scmClientConfig.getRetryInterval()/1000));
-
+ (int) (duration / (scmClientConfig.getRetryInterval() / 1000));
// Use the count derived from the wait duration only if it exceeds the
// configured default retry count.
if (retryCount > scmClientConfig.getRetryCount()) {
scmClientConfig.setRetryCount(retryCount);
configuration.setFromObject(scmClientConfig);
}
-
return getScmBlockClient(configuration).getScmInfo();
} catch (IOException e) {
throw e;
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java
index e3a255041828..d8a4d9475d00 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java
@@ -81,10 +81,10 @@
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL_DEFAULT;
-import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_INFO_WAIT_DURATION;
-import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_INFO_WAIT_DURATION_DEFAULT;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL_DEFAULT;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_INFO_WAIT_DURATION;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_INFO_WAIT_DURATION_DEFAULT;
import static org.apache.hadoop.hdds.server.ServerUtils.sanitizeUserArgs;
import org.rocksdb.RocksDBException;
@@ -467,10 +467,9 @@ public static SCMSecurityProtocolClientSideTranslatorPB getScmSecurityClient(
OzoneConfiguration configuration = new OzoneConfiguration(conf);
long duration = conf.getTimeDuration(OZONE_SCM_INFO_WAIT_DURATION,
OZONE_SCM_INFO_WAIT_DURATION_DEFAULT, TimeUnit.SECONDS);
- SCMClientConfig scmClientConfig =
- conf.getObject(SCMClientConfig.class);
+ SCMClientConfig scmClientConfig = conf.getObject(SCMClientConfig.class);
int retryCount =
- (int) (duration / (scmClientConfig.getRetryInterval()/1000));
+ (int) (duration / (scmClientConfig.getRetryInterval() / 1000));
// Use the count derived from the wait duration only if it exceeds the
// configured default retry count.
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/RatisUtil.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/RatisUtil.java
index 75db7bf1c034..16394e6d63ad 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/RatisUtil.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/RatisUtil.java
@@ -18,18 +18,22 @@
package org.apache.hadoop.hdds.scm.ha;
import com.google.common.base.Strings;
+import com.google.protobuf.ServiceException;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.ratis.ServerNotLeaderException;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.server.ServerUtils;
import org.apache.ratis.RaftConfigKeys;
import org.apache.ratis.conf.RaftProperties;
import org.apache.ratis.grpc.GrpcConfigKeys;
+import org.apache.ratis.protocol.exceptions.NotLeaderException;
import org.apache.ratis.rpc.RpcType;
import org.apache.ratis.server.RaftServerConfigKeys;
import org.apache.ratis.util.SizeInBytes;
import org.apache.ratis.util.TimeDuration;
import java.io.File;
+import java.io.IOException;
import java.util.Collections;
import java.util.concurrent.TimeUnit;
@@ -174,4 +178,18 @@ private static void setRaftSnapshotProperties(
conf.getRatisSnapshotThreshold());
}
+ public static void checkRatisException(IOException e, String port,
+ String scmId) throws ServiceException {
+ if (SCMHAUtils.isNonRetriableException(e)) {
+ throw new ServiceException(new NonRetriableException(e));
+ } else if (SCMHAUtils.isRetriableWithNoFailoverException(e)) {
+ throw new ServiceException(new RetriableWithNoFailoverException(e));
+ } else if (SCMHAUtils.getNotLeaderException(e) != null) {
+ NotLeaderException nle =
+ (NotLeaderException) SCMHAUtils.getNotLeaderException(e);
+ throw new ServiceException(ServerNotLeaderException
+ .convertToNotLeaderException(nle,
+ SCMRatisServerImpl.getSelfPeerId(scmId), port));
+ }
+ }
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java
index 8b016d5b9434..4cbe25ffbec6 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java
@@ -54,6 +54,7 @@
import org.apache.ratis.protocol.exceptions.NotLeaderException;
import org.apache.ratis.protocol.SetConfigurationRequest;
import org.apache.ratis.server.RaftServer;
+import org.apache.ratis.thirdparty.com.google.protobuf.ByteString;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -246,8 +247,15 @@ public List getRatisRoles() throws IOException {
*/
@Override
public NotLeaderException triggerNotLeaderException() {
- return new NotLeaderException(
- division.getMemberId(), null, division.getGroup().getPeers());
+ ByteString leaderId =
+ division.getInfo().getRoleInfoProto().getFollowerInfo().getLeaderInfo()
+ .getId().getId();
+ RaftPeer suggestedLeader = leaderId.isEmpty() ?
+ null :
+ division.getRaftConf().getPeer(RaftPeerId.valueOf(leaderId));
+ return new NotLeaderException(division.getMemberId(),
+ suggestedLeader,
+ division.getGroup().getPeers());
}
@Override
@@ -289,7 +297,7 @@ private static RaftGroup buildRaftGroup(SCMNodeDetails details,
String scmId, String clusterId) {
Preconditions.checkNotNull(scmId);
final RaftGroupId groupId = buildRaftGroupId(clusterId);
- RaftPeerId selfPeerId = RaftPeerId.getRaftPeerId(scmId);
+ RaftPeerId selfPeerId = getSelfPeerId(scmId);
RaftPeer localRaftPeer = RaftPeer.newBuilder().setId(selfPeerId)
// TODO : Should we use IP instead of hostname??
@@ -303,6 +311,10 @@ private static RaftGroup buildRaftGroup(SCMNodeDetails details,
return group;
}
+ public static RaftPeerId getSelfPeerId(String scmId) {
+ return RaftPeerId.getRaftPeerId(scmId);
+ }
+
@VisibleForTesting
public static RaftGroupId buildRaftGroupId(String clusterId) {
Preconditions.checkNotNull(clusterId);
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/SCMSecurityProtocolServerSideTranslatorPB.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/SCMSecurityProtocolServerSideTranslatorPB.java
index 6829f959e925..088f74d3845a 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/SCMSecurityProtocolServerSideTranslatorPB.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/SCMSecurityProtocolServerSideTranslatorPB.java
@@ -37,8 +37,7 @@
import org.apache.hadoop.hdds.protocol.proto.SCMSecurityProtocolProtos.SCMSecurityResponse;
import org.apache.hadoop.hdds.protocol.proto.SCMSecurityProtocolProtos.Status;
import org.apache.hadoop.hdds.protocolPB.SCMSecurityProtocolPB;
-import org.apache.hadoop.hdds.scm.ha.RetriableWithNoFailoverException;
-import org.apache.hadoop.hdds.scm.ha.SCMHAUtils;
+import org.apache.hadoop.hdds.scm.ha.RatisUtil;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.hdds.security.exception.SCMSecurityException;
import org.apache.hadoop.hdds.security.x509.crl.CRLInfo;
@@ -48,7 +47,6 @@
import com.google.protobuf.ProtocolMessageEnum;
import com.google.protobuf.RpcController;
import com.google.protobuf.ServiceException;
-import org.apache.ratis.protocol.exceptions.NotLeaderException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -89,9 +87,9 @@ public SCMSecurityResponse submitRequest(RpcController controller,
// primary SCM may not be leader SCM.
if (!request.getCmdType().equals(GetSCMCertificate)) {
if (!scm.checkLeader()) {
- throw new ServiceException(scm.getScmHAManager()
- .getRatisServer()
- .triggerNotLeaderException());
+ RatisUtil.checkRatisException(
+ scm.getScmHAManager().getRatisServer().triggerNotLeaderException(),
+ scm.getSecurityProtocolRpcPort(), scm.getScmId());
}
}
return dispatcher.processRequest(request, this::processRequest,
@@ -148,11 +146,8 @@ public SCMSecurityResponse processRequest(SCMSecurityRequest request)
"Unknown request type: " + request.getCmdType());
}
} catch (IOException e) {
- if (SCMHAUtils.isRetriableWithNoFailoverException(e)) {
- throw new ServiceException(new RetriableWithNoFailoverException(e));
- } else if (e instanceof NotLeaderException) {
- throw new ServiceException(e);
- }
+ RatisUtil.checkRatisException(e, scm.getSecurityProtocolRpcPort(),
+ scm.getScmId());
scmSecurityResponse.setSuccess(false);
scmSecurityResponse.setStatus(exceptionToResponseStatus(e));
// If actual cause is set in SCMSecurityException, set message with
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/ScmBlockLocationProtocolServerSideTranslatorPB.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/ScmBlockLocationProtocolServerSideTranslatorPB.java
index 950e6da925e4..1938afe94a5a 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/ScmBlockLocationProtocolServerSideTranslatorPB.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/ScmBlockLocationProtocolServerSideTranslatorPB.java
@@ -42,8 +42,7 @@
import org.apache.hadoop.hdds.scm.container.common.helpers.AllocatedBlock;
import org.apache.hadoop.hdds.scm.container.common.helpers.ExcludeList;
import org.apache.hadoop.hdds.scm.exceptions.SCMException;
-import org.apache.hadoop.hdds.scm.ha.RetriableWithNoFailoverException;
-import org.apache.hadoop.hdds.scm.ha.SCMHAUtils;
+import org.apache.hadoop.hdds.scm.ha.RatisUtil;
import org.apache.hadoop.hdds.scm.protocolPB.ScmBlockLocationProtocolPB;
import org.apache.hadoop.hdds.scm.protocolPB.StorageContainerLocationProtocolPB;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
@@ -55,7 +54,6 @@
import com.google.protobuf.ProtocolMessageEnum;
import com.google.protobuf.RpcController;
import com.google.protobuf.ServiceException;
-import org.apache.ratis.protocol.exceptions.NotLeaderException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -107,9 +105,9 @@ private SCMBlockLocationResponse.Builder createSCMBlockResponse(
public SCMBlockLocationResponse send(RpcController controller,
SCMBlockLocationRequest request) throws ServiceException {
if (!scm.getScmContext().isLeader()) {
- throw new ServiceException(scm.getScmHAManager()
- .getRatisServer()
- .triggerNotLeaderException());
+ RatisUtil.checkRatisException(
+ scm.getScmHAManager().getRatisServer().triggerNotLeaderException(),
+ scm.getBlockProtocolRpcPort(), scm.getScmId());
}
return dispatcher.processRequest(
request,
@@ -155,11 +153,8 @@ private SCMBlockLocationResponse processMessage(
" in ScmBlockLocationProtocol");
}
} catch (IOException e) {
- if (SCMHAUtils.isRetriableWithNoFailoverException(e)) {
- throw new ServiceException(new RetriableWithNoFailoverException(e));
- } else if (e instanceof NotLeaderException) {
- throw new ServiceException(e);
- }
+ RatisUtil.checkRatisException(e, scm.getBlockProtocolRpcPort(),
+ scm.getScmId());
response.setSuccess(false);
response.setStatus(exceptionToResponseStatus(e));
if (e.getMessage() != null) {
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java
index 68046fe28e78..6690c7f67e19 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java
@@ -78,6 +78,7 @@
import org.apache.hadoop.hdds.scm.ScmInfo;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.common.helpers.ContainerWithPipeline;
+import org.apache.hadoop.hdds.scm.ha.RatisUtil;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
import org.apache.hadoop.hdds.scm.protocolPB.StorageContainerLocationProtocolPB;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
@@ -140,9 +141,9 @@ public ScmContainerLocationResponse submitRequest(RpcController controller,
// Reject the request if this SCM is not the leader and the command
// is not an admin command.
if (!scm.getScmContext().isLeader()
&& !ADMIN_COMMAND_TYPE.contains(request.getCmdType())) {
- throw new ServiceException(scm.getScmHAManager()
- .getRatisServer()
- .triggerNotLeaderException());
+ RatisUtil.checkRatisException(
+ scm.getScmHAManager().getRatisServer().triggerNotLeaderException(),
+ scm.getClientRpcPort(), scm.getScmId());
}
return dispatcher
.processRequest(request, this::processRequest, request.getCmdType(),
@@ -329,6 +330,8 @@ public ScmContainerLocationResponse processRequest(
"Unknown command type: " + request.getCmdType());
}
} catch (IOException e) {
+ RatisUtil
+ .checkRatisException(e, scm.getClientRpcPort(), scm.getScmId());
throw new ServiceException(e);
}
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
index 30df2c08aad3..13ee92161a1d 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
@@ -1098,6 +1098,16 @@ public String getClientRpcPort() {
return addr == null ? "0" : Integer.toString(addr.getPort());
}
+ public String getBlockProtocolRpcPort() {
+ InetSocketAddress addr = getBlockProtocolServer().getBlockRpcAddress();
+ return addr == null ? "0" : Integer.toString(addr.getPort());
+ }
+
+ public String getSecurityProtocolRpcPort() {
+ InetSocketAddress addr = getSecurityProtocolServer().getRpcAddress();
+ return addr == null ? "0" : Integer.toString(addr.getPort());
+ }
+
/**
* Returns listening address of StorageDatanode Protocol RPC server.
*
diff --git a/hadoop-ozone/dist/src/main/compose/compatibility/docker-config b/hadoop-ozone/dist/src/main/compose/compatibility/docker-config
index 940f5e9818e2..3fd4ca02eced 100644
--- a/hadoop-ozone/dist/src/main/compose/compatibility/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/compatibility/docker-config
@@ -27,6 +27,7 @@ OZONE-SITE.XML_ozone.recon.db.dir=/data/metadata/recon
OZONE-SITE.XML_ozone.scm.client.address=scm
OZONE-SITE.XML_hdds.datanode.dir=/data/hdds
OZONE-SITE.XML_ozone.recon.address=recon:9891
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
HADOOP_OPTS="-Dhadoop.opts=test"
HDFS_STORAGECONTAINERMANAGER_OPTS="-Dhdfs.scm.opts=test"
diff --git a/hadoop-ozone/dist/src/main/compose/ozone-csi/docker-config b/hadoop-ozone/dist/src/main/compose/ozone-csi/docker-config
index 1d1db1365a65..2488153c0974 100644
--- a/hadoop-ozone/dist/src/main/compose/ozone-csi/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/ozone-csi/docker-config
@@ -31,6 +31,7 @@ OZONE-SITE.XML_ozone.metadata.dirs=/data/metadata
OZONE-SITE.XML_ozone.recon.db.dir=/data/metadata/recon
OZONE-SITE.XML_ozone.scm.client.address=scm
OZONE-SITE.XML_hdds.datanode.dir=/data/hdds
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
OZONE_CONF_DIR=/etc/hadoop
OZONE_LOG_DIR=/var/log/hadoop
diff --git a/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-config b/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-config
index 70895a0d3fc0..bdc0c4331ccf 100644
--- a/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/ozone-ha/docker-config
@@ -34,6 +34,7 @@ OZONE-SITE.XML_ozone.scm.container.size=1GB
OZONE-SITE.XML_ozone.metadata.dirs=/data/metadata
OZONE-SITE.XML_hdds.datanode.dir=/data/hdds
OZONE-SITE.XML_ozone.datanode.pipeline.limit=1
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
OZONE_CONF_DIR=/etc/hadoop
OZONE_LOG_DIR=/var/log/hadoop
diff --git a/hadoop-ozone/dist/src/main/compose/ozone-om-ha/docker-config b/hadoop-ozone/dist/src/main/compose/ozone-om-ha/docker-config
index cd8ea0c47c08..34c241cff53e 100644
--- a/hadoop-ozone/dist/src/main/compose/ozone-om-ha/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/ozone-om-ha/docker-config
@@ -33,6 +33,7 @@ OZONE-SITE.XML_ozone.replication=1
OZONE-SITE.XML_ozone.client.failover.max.attempts=6
OZONE-SITE.XML_hdds.datanode.dir=/data/hdds
OZONE-SITE.XML_hdds.profiler.endpoint.enabled=true
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
HDFS-SITE.XML_rpc.metrics.quantile.enable=true
HDFS-SITE.XML_rpc.metrics.percentiles.intervals=60,300
ASYNC_PROFILER_HOME=/opt/profiler
diff --git a/hadoop-ozone/dist/src/main/compose/ozone-topology/docker-config b/hadoop-ozone/dist/src/main/compose/ozone-topology/docker-config
index ff78482573ea..e46ca589684f 100644
--- a/hadoop-ozone/dist/src/main/compose/ozone-topology/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/ozone-topology/docker-config
@@ -35,6 +35,7 @@ OZONE-SITE.XML_ozone.scm.container.placement.impl=org.apache.hadoop.hdds.scm.con
OZONE-SITE.XML_net.topology.node.switch.mapping.impl=org.apache.hadoop.net.TableMapping
OZONE-SITE.XML_net.topology.table.file.name=/opt/hadoop/compose/ozone-topology/network-config
OZONE-SITE.XML_ozone.network.topology.aware.read=true
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
HDFS-SITE.XML_rpc.metrics.quantile.enable=true
HDFS-SITE.XML_rpc.metrics.percentiles.intervals=60,300
diff --git a/hadoop-ozone/dist/src/main/compose/ozone/docker-config b/hadoop-ozone/dist/src/main/compose/ozone/docker-config
index 47112238424d..c9669a1b9fdf 100644
--- a/hadoop-ozone/dist/src/main/compose/ozone/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/ozone/docker-config
@@ -32,6 +32,7 @@ OZONE-SITE.XML_hdds.datanode.dir=/data/hdds
OZONE-SITE.XML_ozone.recon.address=recon:9891
OZONE-SITE.XML_ozone.recon.om.snapshot.task.interval.delay=1m
OZONE-SITE.XML_ozone.datanode.pipeline.limit=1
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
OZONE_CONF_DIR=/etc/hadoop
OZONE_LOG_DIR=/var/log/hadoop
diff --git a/hadoop-ozone/dist/src/main/compose/ozoneblockade/docker-config b/hadoop-ozone/dist/src/main/compose/ozoneblockade/docker-config
index 1df06e419bb8..762bfeaa4b32 100644
--- a/hadoop-ozone/dist/src/main/compose/ozoneblockade/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/ozoneblockade/docker-config
@@ -37,6 +37,7 @@ OZONE-SITE.XML_hdds.scm.replication.thread.interval=6s
OZONE-SITE.XML_hdds.scm.replication.event.timeout=10s
OZONE-SITE.XML_dfs.ratis.server.failure.duration=35s
OZONE-SITE.XML_hdds.scm.safemode.min.datanode=3
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
HDFS-SITE.XML_rpc.metrics.quantile.enable=true
HDFS-SITE.XML_rpc.metrics.percentiles.intervals=60,300
diff --git a/hadoop-ozone/dist/src/main/compose/ozones3-haproxy/docker-config b/hadoop-ozone/dist/src/main/compose/ozones3-haproxy/docker-config
index 73f775bf39da..711ac7c1c564 100644
--- a/hadoop-ozone/dist/src/main/compose/ozones3-haproxy/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/ozones3-haproxy/docker-config
@@ -27,6 +27,7 @@ OZONE-SITE.XML_ozone.metadata.dirs=/data/metadata
OZONE-SITE.XML_ozone.scm.client.address=scm
OZONE-SITE.XML_ozone.replication=3
OZONE-SITE.XML_hdds.datanode.dir=/data/hdds
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
HDFS-SITE.XML_rpc.metrics.quantile.enable=true
HDFS-SITE.XML_rpc.metrics.percentiles.intervals=60,300
diff --git a/hadoop-ozone/dist/src/main/compose/ozonescripts/docker-config b/hadoop-ozone/dist/src/main/compose/ozonescripts/docker-config
index 45c0c2dbc824..101236b18952 100644
--- a/hadoop-ozone/dist/src/main/compose/ozonescripts/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/ozonescripts/docker-config
@@ -25,6 +25,7 @@ OZONE-SITE.XML_ozone.metadata.dirs=/data/metadata
OZONE-SITE.XML_ozone.scm.client.address=scm
OZONE-SITE.XML_ozone.replication=1
OZONE-SITE.XML_hdds.datanode.dir=/data/hdds
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
OZONE_CONF_DIR=/etc/hadoop
OZONE_LOG_DIR=/var/log/hadoop
diff --git a/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/docker-config b/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/docker-config
index 6e67b4072f7c..ae028efcca36 100644
--- a/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/docker-config
@@ -47,6 +47,7 @@ OZONE-SITE.XML_ozone.scm.client.address=scm
OZONE-SITE.XML_hdds.block.token.enabled=true
OZONE-SITE.XML_hdds.grpc.tls.enabled=true
OZONE-SITE.XML_ozone.replication=3
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
OZONE-SITE.XML_ozone.recon.om.snapshot.task.interval.delay=1m
OZONE-SITE.XML_ozone.recon.db.dir=/data/metadata/recon
diff --git a/hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config b/hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config
index 2f201478e7fa..14f88c279255 100644
--- a/hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config
@@ -27,6 +27,7 @@ OZONE-SITE.XML_ozone.handler.type=distributed
OZONE-SITE.XML_ozone.scm.client.address=scm
OZONE-SITE.XML_hdds.block.token.enabled=true
OZONE-SITE.XML_ozone.replication=3
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
OZONE-SITE.XML_hdds.scm.kerberos.principal=scm/scm@EXAMPLE.COM
OZONE-SITE.XML_hdds.scm.kerberos.keytab.file=/etc/security/keytabs/scm.keytab
OZONE-SITE.XML_ozone.om.kerberos.principal=om/om@EXAMPLE.COM
diff --git a/hadoop-ozone/dist/src/main/compose/ozonesecure/docker-config b/hadoop-ozone/dist/src/main/compose/ozonesecure/docker-config
index c8d3643e9736..1b45367d6199 100644
--- a/hadoop-ozone/dist/src/main/compose/ozonesecure/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/ozonesecure/docker-config
@@ -32,6 +32,7 @@ OZONE-SITE.XML_hdds.block.token.enabled=true
OZONE-SITE.XML_hdds.grpc.tls.enabled=true
OZONE-SITE.XML_ozone.replication=3
OZONE-SITE.XML_ozone.datanode.pipeline.limit=1
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
OZONE-SITE.XML_ozone.recon.om.snapshot.task.interval.delay=1m
OZONE-SITE.XML_ozone.recon.db.dir=/data/metadata/recon
diff --git a/hadoop-ozone/dist/src/main/compose/restart/docker-config b/hadoop-ozone/dist/src/main/compose/restart/docker-config
index 83530bb334e0..f3a46541e99c 100644
--- a/hadoop-ozone/dist/src/main/compose/restart/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/restart/docker-config
@@ -28,6 +28,7 @@ OZONE-SITE.XML_ozone.scm.client.address=scm
OZONE-SITE.XML_hdds.datanode.dir=/data/hdds
OZONE-SITE.XML_ozone.recon.address=recon:9891
OZONE-SITE.XML_ozone.recon.om.snapshot.task.interval.delay=1m
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
OZONE_CONF_DIR=/etc/hadoop
OZONE_LOG_DIR=/var/log/hadoop
diff --git a/hadoop-ozone/dist/src/main/compose/upgrade/docker-config b/hadoop-ozone/dist/src/main/compose/upgrade/docker-config
index 0a489ae9c6d6..e55477d73644 100644
--- a/hadoop-ozone/dist/src/main/compose/upgrade/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/upgrade/docker-config
@@ -30,6 +30,7 @@ OZONE-SITE.XML_ozone.scm.client.address=scm
OZONE-SITE.XML_hdds.datanode.dir=/data/hdds
OZONE-SITE.XML_ozone.recon.address=recon:9891
OZONE-SITE.XML_ozone.recon.om.snapshot.task.interval.delay=1m
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
OZONE_CONF_DIR=/etc/hadoop
OZONE_LOG_DIR=/var/log/hadoop
diff --git a/hadoop-ozone/dist/src/main/compose/xcompat/docker-config b/hadoop-ozone/dist/src/main/compose/xcompat/docker-config
index 5519807176c9..0599498cf238 100644
--- a/hadoop-ozone/dist/src/main/compose/xcompat/docker-config
+++ b/hadoop-ozone/dist/src/main/compose/xcompat/docker-config
@@ -29,5 +29,6 @@ OZONE-SITE.XML_ozone.scm.datanode.id.dir=/data
OZONE-SITE.XML_ozone.scm.names=scm
OZONE-SITE.XML_ozone.scm.pipeline.owner.container.count=1
OZONE-SITE.XML_recon.om.snapshot.task.interval.delay=1m
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
no_proxy=om,recon,scm,s3g,kdc,localhost,127.0.0.1
diff --git a/hadoop-ozone/fault-injection-test/network-tests/src/test/compose/docker-config b/hadoop-ozone/fault-injection-test/network-tests/src/test/compose/docker-config
index da46f8e0b3ab..eabc95c812a6 100644
--- a/hadoop-ozone/fault-injection-test/network-tests/src/test/compose/docker-config
+++ b/hadoop-ozone/fault-injection-test/network-tests/src/test/compose/docker-config
@@ -25,6 +25,7 @@ OZONE-SITE.XML_ozone.scm.client.address=scm
OZONE-SITE.XML_ozone.scm.dead.node.interval=5m
OZONE-SITE.XML_ozone.replication=1
OZONE-SITE.XML_hdds.datanode.dir=/data/hdds
+OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
HDFS-SITE.XML_rpc.metrics.quantile.enable=true
HDFS-SITE.XML_rpc.metrics.percentiles.intervals=60,300
LOG4J.PROPERTIES_log4j.rootLogger=INFO, stdout
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java
index 40d99a798f3c..2da2211d8c42 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java
@@ -50,6 +50,7 @@
import org.apache.hadoop.hdds.scm.ha.SCMRatisServerImpl;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
import org.apache.hadoop.hdds.scm.protocolPB.StorageContainerLocationProtocolClientSideTranslatorPB;
+import org.apache.hadoop.hdds.scm.proxy.SCMClientConfig;
import org.apache.hadoop.hdds.scm.proxy.SCMContainerLocationFailoverProxyProvider;
import org.apache.hadoop.hdds.scm.safemode.HealthyPipelineSafeModeRule;
import org.apache.hadoop.hdds.scm.server.OzoneStorageContainerManager;
@@ -675,6 +676,10 @@ protected void initializeConfiguration() throws IOException {
pipelineNumLimit : DEFAULT_PIPELIME_LIMIT);
conf.setTimeDuration(OMConfigKeys.OZONE_OM_RATIS_MINIMUM_TIMEOUT_KEY,
DEFAULT_RATIS_RPC_TIMEOUT_SEC, TimeUnit.SECONDS);
+ SCMClientConfig scmClientConfig = conf.getObject(SCMClientConfig.class);
+ // Lower the max retry timeout to 30s for test runs.
+ scmClientConfig.setMaxRetryTimeout(30 * 1000);
+ conf.setFromObject(scmClientConfig);
configureTrace();
}
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestFailoverWithSCMHA.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestFailoverWithSCMHA.java
new file mode 100644
index 000000000000..1f0e71fbcd7a
--- /dev/null
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestFailoverWithSCMHA.java
@@ -0,0 +1,147 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.hadoop.ozone.scm;
+
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.ha.SCMHAConfiguration;
+import org.apache.hadoop.hdds.scm.protocol.ScmBlockLocationProtocol;
+import org.apache.hadoop.hdds.scm.protocol.StorageContainerLocationProtocol;
+import org.apache.hadoop.hdds.scm.protocolPB.ScmBlockLocationProtocolClientSideTranslatorPB;
+import org.apache.hadoop.hdds.scm.protocolPB.StorageContainerLocationProtocolClientSideTranslatorPB;
+import org.apache.hadoop.hdds.scm.proxy.SCMBlockLocationFailoverProxyProvider;
+import org.apache.hadoop.hdds.scm.proxy.SCMClientConfig;
+import org.apache.hadoop.hdds.scm.proxy.SCMContainerLocationFailoverProxyProvider;
+import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
+import org.apache.hadoop.hdds.tracing.TracingUtil;
+import org.apache.hadoop.ozone.MiniOzoneCluster;
+import org.apache.hadoop.ozone.MiniOzoneHAClusterImpl;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.junit.Assert;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.slf4j.event.Level;
+
+import java.io.IOException;
+import java.util.UUID;
+
+/**
+ * Tests failover with SCM HA setup.
+ */
+public class TestFailoverWithSCMHA {
+ private MiniOzoneHAClusterImpl cluster = null;
+ private OzoneConfiguration conf;
+ private String clusterId;
+ private String scmId;
+ private String omServiceId;
+ private String scmServiceId;
+ private int numOfOMs = 1;
+ private int numOfSCMs = 3;
+
+ private static final long SNAPSHOT_THRESHOLD = 5;
+
+ /**
+ * Create a MiniOzoneCluster for testing.
+ *
+ * @throws Exception
+ */
+ @BeforeEach
+ public void init() throws Exception {
+ conf = new OzoneConfiguration();
+ clusterId = UUID.randomUUID().toString();
+ scmId = UUID.randomUUID().toString();
+ omServiceId = "om-service-test1";
+ scmServiceId = "scm-service-test1";
+ SCMHAConfiguration scmhaConfiguration =
+ conf.getObject(SCMHAConfiguration.class);
+ scmhaConfiguration.setRatisSnapshotThreshold(SNAPSHOT_THRESHOLD);
+ conf.setFromObject(scmhaConfiguration);
+
+ cluster = (MiniOzoneHAClusterImpl) MiniOzoneCluster.newHABuilder(conf)
+ .setClusterId(clusterId).setScmId(scmId).setOMServiceId(omServiceId)
+ .setSCMServiceId(scmServiceId).setNumOfOzoneManagers(numOfOMs)
+ .setNumOfStorageContainerManagers(numOfSCMs).setNumOfActiveSCMs(3)
+ .build();
+ cluster.waitForClusterToBeReady();
+ }
+
+ /**
+ * Shutdown the MiniOzoneCluster.
+ */
+ @AfterEach
+ public void shutdown() {
+ if (cluster != null) {
+ cluster.shutdown();
+ }
+ }
+
+ @Test
+ public void testFailover() throws Exception {
+ SCMClientConfig scmClientConfig =
+ conf.getObject(SCMClientConfig.class);
+ scmClientConfig.setRetryCount(1);
+ scmClientConfig.setRetryInterval(100);
+ scmClientConfig.setMaxRetryTimeout(1500);
+ Assert.assertEquals(15, scmClientConfig.getRetryCount());
+ conf.setFromObject(scmClientConfig);
+ StorageContainerManager scm = getLeader(cluster);
+ Assert.assertNotNull(scm);
+ SCMBlockLocationFailoverProxyProvider failoverProxyProvider =
+ new SCMBlockLocationFailoverProxyProvider(conf);
+ failoverProxyProvider.changeCurrentProxy(scm.getSCMNodeId());
+ ScmBlockLocationProtocolClientSideTranslatorPB scmBlockLocationClient =
+ new ScmBlockLocationProtocolClientSideTranslatorPB(
+ failoverProxyProvider);
+ GenericTestUtils
+ .setLogLevel(SCMBlockLocationFailoverProxyProvider.LOG, Level.DEBUG);
+ GenericTestUtils.LogCapturer logCapture = GenericTestUtils.LogCapturer
+ .captureLogs(SCMBlockLocationFailoverProxyProvider.LOG);
+ ScmBlockLocationProtocol scmBlockLocationProtocol = TracingUtil
+ .createProxy(scmBlockLocationClient, ScmBlockLocationProtocol.class,
+ conf);
+ scmBlockLocationProtocol.getScmInfo();
+ Assert.assertTrue(logCapture.getOutput()
+ .contains("Performing failover to suggested leader"));
+ scm = getLeader(cluster);
+ SCMContainerLocationFailoverProxyProvider proxyProvider =
+ new SCMContainerLocationFailoverProxyProvider(conf);
+ GenericTestUtils.setLogLevel(SCMContainerLocationFailoverProxyProvider.LOG,
+ Level.DEBUG);
+ logCapture = GenericTestUtils.LogCapturer
+ .captureLogs(SCMContainerLocationFailoverProxyProvider.LOG);
+ proxyProvider.changeCurrentProxy(scm.getSCMNodeId());
+ StorageContainerLocationProtocol scmContainerClient =
+ TracingUtil.createProxy(
+ new StorageContainerLocationProtocolClientSideTranslatorPB(
+ proxyProvider), StorageContainerLocationProtocol.class, conf);
+
+ scmContainerClient.allocateContainer(HddsProtos.ReplicationType.RATIS,
+ HddsProtos.ReplicationFactor.ONE, "ozone");
+ Assert.assertTrue(logCapture.getOutput()
+ .contains("Performing failover to suggested leader"));
+ }
+
+ static StorageContainerManager getLeader(MiniOzoneHAClusterImpl impl) {
+ for (StorageContainerManager scm : impl.getStorageContainerManagers()) {
+ if (scm.checkLeader()) {
+ return scm;
+ }
+ }
+ return null;
+ }
+}