From e309ac247f2cf51c9db9cfe70a55e59b57a90001 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Thu, 6 Aug 2020 13:17:11 -0700 Subject: [PATCH 1/5] HDDS-4075. Retry request on different OM on AccessControlException --- .../java/org/apache/hadoop/ozone/OmUtils.java | 26 ++++++++ .../ozone/om/ha/OMFailoverProxyProvider.java | 18 ++++- .../om/protocolPB/Hadoop3OmTransport.java | 65 ++++--------------- 3 files changed, 56 insertions(+), 53 deletions(-) diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/OmUtils.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/OmUtils.java index 93e0e7f7dec0..2a34580a954c 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/OmUtils.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/OmUtils.java @@ -17,6 +17,7 @@ package org.apache.hadoop.ozone; +import com.google.protobuf.ServiceException; import java.io.File; import java.io.IOException; import java.net.InetSocketAddress; @@ -36,12 +37,15 @@ import org.apache.hadoop.hdds.conf.ConfigurationSource; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.scm.client.HddsClientUtils; +import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.ozone.conf.OMClientConfig; import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.ozone.om.helpers.OmKeyInfo; import org.apache.hadoop.ozone.om.helpers.RepeatedOmKeyInfo; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos; +import org.apache.hadoop.security.AccessControlException; +import org.apache.hadoop.security.token.SecretManager; import com.google.common.base.Joiner; import org.apache.commons.lang3.StringUtils; @@ -59,6 +63,7 @@ import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_NODES_KEY; import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_PORT_DEFAULT; import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SERVICE_IDS_KEY; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -575,4 +580,25 @@ public static String getOzoneManagerServiceId(OzoneConfiguration conf) return serviceId; } } + + /** + * Unwrap exception to check if it is some kind of access control problem + * ({@link AccessControlException} or {@link SecretManager.InvalidToken}). + */ + public static boolean isAccessControlException(Exception ex) { + if (ex instanceof ServiceException) { + Throwable t = ex.getCause(); + if (t instanceof RemoteException) { + t = ((RemoteException) t).unwrapRemoteException(); + } + while (t != null) { + if (t instanceof AccessControlException || + t instanceof SecretManager.InvalidToken) { + return true; + } + t = t.getCause(); + } + } + return false; + } } diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java index 1abe5abfdb1b..86325c0326b2 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java @@ -80,7 +80,6 @@ public class OMFailoverProxyProvider implements private final String omServiceId; - // OMFailoverProxyProvider, on encountering certain exception, tries each OM // once in a round robin fashion. After that it waits for configured time // before attempting to contact all the OMs again. For other exceptions @@ -90,6 +89,7 @@ public class OMFailoverProxyProvider implements private String lastAttemptedOM; private int numAttemptsOnSameOM = 0; private final long waitBetweenRetries; + private Set accessControlExceptionOMs = new HashSet<>(); public OMFailoverProxyProvider(ConfigurationSource configuration, UserGroupInformation ugi, String omServiceId) throws IOException { @@ -351,7 +351,7 @@ private synchronized int getCurrentProxyIndex() { public synchronized long getWaitTime() { if (currentProxyOMNodeId.equals(lastAttemptedOM)) { - // Clear attemptedOMs list as round robin has been broken. Add only the + // Clear attemptedOMs list as round robin has been broken. attemptedOMs.clear(); // The same OM will be contacted again. So wait and then retry. @@ -375,6 +375,20 @@ public synchronized long getWaitTime() { return waitBetweenRetries; } + public synchronized boolean shouldFailover(Exception ex) { + if (OmUtils.isAccessControlException(ex)) { + // Retry all available OMs once before failing with + // AccessControlException. + if (accessControlExceptionOMs.contains(omNodeIDList) || + accessControlExceptionOMs.contains(currentProxyOMNodeId)) { + accessControlExceptionOMs.clear(); + return false; + } + accessControlExceptionOMs.add(currentProxyOMNodeId); + } + return true; + } + /** * Close all the proxy objects which have been opened over the lifetime of * the proxy provider. diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/Hadoop3OmTransport.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/Hadoop3OmTransport.java index 33d22b45b43e..5360de21e4be 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/Hadoop3OmTransport.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/Hadoop3OmTransport.java @@ -37,9 +37,7 @@ import org.apache.hadoop.ozone.om.ha.OMFailoverProxyProvider; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OMRequest; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OMResponse; -import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.UserGroupInformation; -import org.apache.hadoop.security.token.SecretManager; import com.google.common.annotations.VisibleForTesting; import com.google.protobuf.RpcController; @@ -128,18 +126,17 @@ private OzoneManagerProtocolPB createRetryProxy( public RetryAction shouldRetry(Exception exception, int retries, int failovers, boolean isIdempotentOrAtMostOnce) throws Exception { - if (isAccessControlException(exception)) { - return RetryAction.FAIL; // do not retry + + if (LOG.isDebugEnabled()) { + LOG.debug("RetryProxy: {}", exception.getCause() != null ? + exception.getCause().getMessage() : exception.getMessage()); } + retryExceptions.add(getExceptionMsg(exception, failovers)); + if (exception instanceof ServiceException) { OMNotLeaderException notLeaderException = getNotLeaderException(exception); if (notLeaderException != null) { - retryExceptions.add(getExceptionMsg(notLeaderException, failovers)); - if (LOG.isDebugEnabled()) { - LOG.debug("RetryProxy: {}", notLeaderException.getMessage()); - } - // TODO: NotLeaderException should include the host // address of the suggested leader along with the nodeID. // Failing over just based on nodeID is not very robust. @@ -152,35 +149,22 @@ public RetryAction shouldRetry(Exception exception, int retries, OMLeaderNotReadyException leaderNotReadyException = getLeaderNotReadyException(exception); - // As in this case, current OM node is leader, but it is not ready. - // OMFailoverProxyProvider#performFailover() is a dummy call and - // does not perform any failover. - // So Just retry with same OM node. if (leaderNotReadyException != null) { - retryExceptions.add(getExceptionMsg(leaderNotReadyException, - failovers)); - if (LOG.isDebugEnabled()) { - LOG.debug("RetryProxy: {}", leaderNotReadyException.getMessage()); - } - // HDDS-3465. OM index will not change, but LastOmID will be - // updated to currentOMId, so that waitTime calculation will - // know lastOmID and currentID are same and need to increment - // wait time in between. + // Retry on same OM again as leader OM is not ready. + // Failing over to same OM so that wait time between retries is + // incremented omFailoverProxyProvider.performFailoverIfRequired( omFailoverProxyProvider.getCurrentProxyOMNodeId()); return getRetryAction(RetryDecision.FAILOVER_AND_RETRY, failovers); } } - // For all other exceptions other than LeaderNotReadyException and - // NotLeaderException fail over manually to the next OM Node proxy. - // OMFailoverProxyProvider#performFailover() is a dummy call and - // does not perform any failover. - retryExceptions.add(getExceptionMsg(exception, failovers)); - if (LOG.isDebugEnabled()) { - LOG.debug("RetryProxy: {}", exception.getCause() != null ? - exception.getCause().getMessage() : exception.getMessage()); + if (!omFailoverProxyProvider.shouldFailover(exception)) { + return RetryAction.FAIL; // do not retry } + + // For all other exceptions, fail over manually to the next OM Node + // proxy. omFailoverProxyProvider.performFailoverToNextProxy(); return getRetryAction(RetryDecision.FAILOVER_AND_RETRY, failovers); } @@ -245,27 +229,6 @@ private OMLeaderNotReadyException getLeaderNotReadyException( return null; } - /** - * Unwrap exception to check if it is some kind of access control problem - * ({@link AccessControlException} or {@link SecretManager.InvalidToken}). - */ - private boolean isAccessControlException(Exception ex) { - if (ex instanceof ServiceException) { - Throwable t = ex.getCause(); - if (t instanceof RemoteException) { - t = ((RemoteException) t).unwrapRemoteException(); - } - while (t != null) { - if (t instanceof AccessControlException || - t instanceof SecretManager.InvalidToken) { - return true; - } - t = t.getCause(); - } - } - return false; - } - /** * Check if exception is a OMNotLeaderException. * From debb7ffc1c140825e0f576572aa45fccdacfb136 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Fri, 7 Aug 2020 11:55:43 -0700 Subject: [PATCH 2/5] integration test and findbug fix --- .../hadoop/ozone/om/ha/OMFailoverProxyProvider.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java index 86325c0326b2..71794f1dc384 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java @@ -379,12 +379,15 @@ public synchronized boolean shouldFailover(Exception ex) { if (OmUtils.isAccessControlException(ex)) { // Retry all available OMs once before failing with // AccessControlException. - if (accessControlExceptionOMs.contains(omNodeIDList) || - accessControlExceptionOMs.contains(currentProxyOMNodeId)) { + if (accessControlExceptionOMs.contains(currentProxyOMNodeId)) { accessControlExceptionOMs.clear(); return false; + } else { + accessControlExceptionOMs.add(currentProxyOMNodeId); + if (accessControlExceptionOMs.containsAll(omNodeIDList)) { + return false; + } } - accessControlExceptionOMs.add(currentProxyOMNodeId); } return true; } From 2ae8fd0e48a2494d32aaf5d8090f671f00fa0aad Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Tue, 8 Sep 2020 17:22:24 -0700 Subject: [PATCH 3/5] unit test using Mock FailoverProxyProvider --- .../ozone/om/ha/OMFailoverProxyProvider.java | 166 ++++++++++++++++-- .../om/protocolPB/Hadoop3OmTransport.java | 138 +-------------- .../ozone/om/failover/TestOMFailovers.java | 127 ++++++++++++++ 3 files changed, 283 insertions(+), 148 deletions(-) create mode 100644 hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java index 71794f1dc384..c57da37bde91 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java @@ -18,6 +18,9 @@ package org.apache.hadoop.ozone.om.ha; +import com.google.common.annotations.VisibleForTesting; +import com.google.protobuf.ServiceException; + import java.io.Closeable; import java.io.IOException; import java.net.InetSocketAddress; @@ -32,28 +35,31 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdds.conf.ConfigurationSource; -import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.utils.LegacyHadoopConfigurationSource; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.retry.FailoverProxyProvider; import org.apache.hadoop.io.retry.RetryInvocationHandler; import org.apache.hadoop.io.retry.RetryPolicies; import org.apache.hadoop.io.retry.RetryPolicy; +import org.apache.hadoop.io.retry.RetryPolicy.RetryAction.RetryDecision; import org.apache.hadoop.ipc.ProtobufRpcEngine; import org.apache.hadoop.ipc.RPC; +import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.ozone.OmUtils; import org.apache.hadoop.ozone.OzoneConfigKeys; import org.apache.hadoop.ozone.OzoneConsts; +import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException; +import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException; import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolClientSideTranslatorPB; import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB; import org.apache.hadoop.security.UserGroupInformation; -import com.google.common.annotations.VisibleForTesting; -import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_ADDRESS_KEY; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_ADDRESS_KEY; + /** * A failover proxy provider implementation which allows clients to configure * multiple OMs to connect to. In case of OM failover, client can try @@ -66,9 +72,9 @@ public class OMFailoverProxyProvider implements LoggerFactory.getLogger(OMFailoverProxyProvider.class); // Map of OMNodeID to its proxy - private Map> omProxies; - private Map omProxyInfos; - private List omNodeIDList; + protected Map> omProxies; + protected Map omProxyInfos; + protected List omNodeIDList; private String currentProxyOMNodeId; private int currentProxyIndex; @@ -80,6 +86,8 @@ public class OMFailoverProxyProvider implements private final String omServiceId; + private List retryExceptions = new ArrayList<>(); + // OMFailoverProxyProvider, on encountering certain exception, tries each OM // once in a round robin fashion. After that it waits for configured time // before attempting to contact all the OMs again. For other exceptions @@ -108,12 +116,7 @@ public OMFailoverProxyProvider(ConfigurationSource configuration, OzoneConfigKeys.OZONE_CLIENT_WAIT_BETWEEN_RETRIES_MILLIS_DEFAULT); } - public OMFailoverProxyProvider(OzoneConfiguration configuration, - UserGroupInformation ugi) throws IOException { - this(configuration, ugi, null); - } - - private void loadOMClientConfigs(ConfigurationSource config, String omSvcId) + protected void loadOMClientConfigs(ConfigurationSource config, String omSvcId) throws IOException { this.omProxies = new HashMap<>(); this.omProxyInfos = new HashMap<>(); @@ -203,7 +206,7 @@ public synchronized ProxyInfo getProxy() { /** * Creates proxy object if it does not already exist. */ - private void createOMProxyIfNeeded(ProxyInfo proxyInfo, + protected void createOMProxyIfNeeded(ProxyInfo proxyInfo, String nodeId) { if (proxyInfo.proxy == null) { InetSocketAddress address = omProxyInfos.get(nodeId).getAddress(); @@ -223,11 +226,93 @@ private void createOMProxyIfNeeded(ProxyInfo proxyInfo, } } + @VisibleForTesting + public RetryPolicy getRetryPolicy(int maxFailovers) { + // Client attempts contacting each OM ipc.client.connect.max.retries + // (default = 10) times before failing over to the next OM, if + // available. + // Client will attempt upto maxFailovers number of failovers between + // available OMs before throwing exception. + RetryPolicy retryPolicy = new RetryPolicy() { + @Override + public RetryAction shouldRetry(Exception exception, int retries, + int failovers, boolean isIdempotentOrAtMostOnce) + throws Exception { + + if (LOG.isDebugEnabled()) { + if (exception.getCause() != null) { + LOG.debug("RetryProxy: OM {}: {}: {}", getCurrentProxyOMNodeId(), + exception.getCause().getClass().getSimpleName(), + exception.getCause().getMessage()); + } else { + LOG.debug("RetryProxy: OM {}: {}", getCurrentProxyOMNodeId(), + exception.getMessage()); + } + } + retryExceptions.add(getExceptionMsg(exception, failovers)); + + if (exception instanceof ServiceException) { + OMNotLeaderException notLeaderException = + getNotLeaderException(exception); + if (notLeaderException != null) { + // TODO: NotLeaderException should include the host + // address of the suggested leader along with the nodeID. + // Failing over just based on nodeID is not very robust. + + // OMFailoverProxyProvider#performFailover() is a dummy call and + // does not perform any failover. Failover manually to the next OM. + performFailoverToNextProxy(); + return getRetryAction(RetryDecision.FAILOVER_AND_RETRY, failovers); + } + + OMLeaderNotReadyException leaderNotReadyException = + getLeaderNotReadyException(exception); + if (leaderNotReadyException != null) { + // Retry on same OM again as leader OM is not ready. + // Failing over to same OM so that wait time between retries is + // incremented + performFailoverIfRequired(getCurrentProxyOMNodeId()); + return getRetryAction(RetryDecision.FAILOVER_AND_RETRY, failovers); + } + } + + if (!shouldFailover(exception)) { + return RetryAction.FAIL; // do not retry + } + + // For all other exceptions, fail over manually to the next OM Node + // proxy. + performFailoverToNextProxy(); + return getRetryAction(RetryDecision.FAILOVER_AND_RETRY, failovers); + } + + private RetryAction getRetryAction(RetryDecision fallbackAction, + int failovers) { + if (failovers < maxFailovers) { + return new RetryAction(fallbackAction, getWaitTime()); + } else { + StringBuilder allRetryExceptions = new StringBuilder(); + allRetryExceptions.append("\n"); + retryExceptions.stream().forEach(e -> allRetryExceptions.append(e) + .append("\n")); + LOG.error("Failed to connect to OMs: {}. Attempted {} failovers. " + + "Got following exceptions during retries: {}", + getOMProxyInfos(), maxFailovers, + allRetryExceptions.toString()); + retryExceptions.clear(); + return RetryAction.FAIL; + } + } + }; + + return retryPolicy; + } + public Text getCurrentProxyDelegationToken() { return delegationTokenService; } - private Text computeDelegationTokenService() { + protected Text computeDelegationTokenService() { // For HA, this will return "," separated address of all OM's. List addresses = new ArrayList<>(); @@ -415,5 +500,58 @@ public List getOMProxies() { public List getOMProxyInfos() { return new ArrayList(omProxyInfos.values()); } + + private static String getExceptionMsg(Exception e, int retryAttempt) { + StringBuilder exceptionMsg = new StringBuilder() + .append("Retry Attempt ") + .append(retryAttempt) + .append(" Exception - "); + if (e.getCause() == null) { + exceptionMsg.append(e.getClass().getCanonicalName()) + .append(": ") + .append(e.getMessage()); + } else { + exceptionMsg.append(e.getCause().getClass().getCanonicalName()) + .append(": ") + .append(e.getCause().getMessage()); + } + return exceptionMsg.toString(); + } + + /** + * Check if exception is OMLeaderNotReadyException. + * + * @param exception + * @return OMLeaderNotReadyException + */ + private static OMLeaderNotReadyException getLeaderNotReadyException( + Exception exception) { + Throwable cause = exception.getCause(); + if (cause instanceof RemoteException) { + IOException ioException = + ((RemoteException) cause).unwrapRemoteException(); + if (ioException instanceof OMLeaderNotReadyException) { + return (OMLeaderNotReadyException) ioException; + } + } + return null; + } + + /** + * Check if exception is a OMNotLeaderException. + * + * @return OMNotLeaderException. + */ + public static OMNotLeaderException getNotLeaderException(Exception exception) { + Throwable cause = exception.getCause(); + if (cause instanceof RemoteException) { + IOException ioException = + ((RemoteException) cause).unwrapRemoteException(); + if (ioException instanceof OMNotLeaderException) { + return (OMNotLeaderException) ioException; + } + } + return null; + } } diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/Hadoop3OmTransport.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/Hadoop3OmTransport.java index 5360de21e4be..ea0e534582cd 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/Hadoop3OmTransport.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/Hadoop3OmTransport.java @@ -18,21 +18,15 @@ package org.apache.hadoop.ozone.om.protocolPB; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; import org.apache.hadoop.hdds.conf.ConfigurationSource; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.retry.RetryPolicy; -import org.apache.hadoop.io.retry.RetryPolicy.RetryAction.RetryDecision; import org.apache.hadoop.io.retry.RetryProxy; import org.apache.hadoop.ipc.ProtobufHelper; import org.apache.hadoop.ipc.ProtobufRpcEngine; import org.apache.hadoop.ipc.RPC; -import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.ozone.OzoneConfigKeys; -import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException; import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException; import org.apache.hadoop.ozone.om.ha.OMFailoverProxyProvider; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OMRequest; @@ -61,7 +55,6 @@ public class Hadoop3OmTransport implements OmTransport { private final OMFailoverProxyProvider omFailoverProxyProvider; private final OzoneManagerProtocolPB rpcProxy; - private List retryExceptions = new ArrayList<>(); public Hadoop3OmTransport(ConfigurationSource conf, UserGroupInformation ugi, String omServiceId) throws IOException { @@ -95,7 +88,8 @@ public OMResponse submitRequest(OMRequest payload) throws IOException { } return omResponse; } catch (ServiceException e) { - OMNotLeaderException notLeaderException = getNotLeaderException(e); + OMNotLeaderException notLeaderException = + OMFailoverProxyProvider.getNotLeaderException(e); if (notLeaderException == null) { throw ProtobufHelper.getRemoteException(e); } @@ -116,136 +110,12 @@ public Text getDelegationTokenService() { private OzoneManagerProtocolPB createRetryProxy( OMFailoverProxyProvider failoverProxyProvider, int maxFailovers) { - // Client attempts contacting each OM ipc.client.connect.max.retries - // (default = 10) times before failing over to the next OM, if - // available. - // Client will attempt upto maxFailovers number of failovers between - // available OMs before throwing exception. - RetryPolicy retryPolicy = new RetryPolicy() { - @Override - public RetryAction shouldRetry(Exception exception, int retries, - int failovers, boolean isIdempotentOrAtMostOnce) - throws Exception { - - if (LOG.isDebugEnabled()) { - LOG.debug("RetryProxy: {}", exception.getCause() != null ? - exception.getCause().getMessage() : exception.getMessage()); - } - retryExceptions.add(getExceptionMsg(exception, failovers)); - - if (exception instanceof ServiceException) { - OMNotLeaderException notLeaderException = - getNotLeaderException(exception); - if (notLeaderException != null) { - // TODO: NotLeaderException should include the host - // address of the suggested leader along with the nodeID. - // Failing over just based on nodeID is not very robust. - - // OMFailoverProxyProvider#performFailover() is a dummy call and - // does not perform any failover. Failover manually to the next OM. - omFailoverProxyProvider.performFailoverToNextProxy(); - return getRetryAction(RetryDecision.FAILOVER_AND_RETRY, failovers); - } - - OMLeaderNotReadyException leaderNotReadyException = - getLeaderNotReadyException(exception); - if (leaderNotReadyException != null) { - // Retry on same OM again as leader OM is not ready. - // Failing over to same OM so that wait time between retries is - // incremented - omFailoverProxyProvider.performFailoverIfRequired( - omFailoverProxyProvider.getCurrentProxyOMNodeId()); - return getRetryAction(RetryDecision.FAILOVER_AND_RETRY, failovers); - } - } - - if (!omFailoverProxyProvider.shouldFailover(exception)) { - return RetryAction.FAIL; // do not retry - } - - // For all other exceptions, fail over manually to the next OM Node - // proxy. - omFailoverProxyProvider.performFailoverToNextProxy(); - return getRetryAction(RetryDecision.FAILOVER_AND_RETRY, failovers); - } - - private RetryAction getRetryAction(RetryDecision fallbackAction, - int failovers) { - if (failovers < maxFailovers) { - return new RetryAction(fallbackAction, - omFailoverProxyProvider.getWaitTime()); - } else { - StringBuilder allRetryExceptions = new StringBuilder(); - allRetryExceptions.append("\n"); - retryExceptions.stream().forEach(e -> allRetryExceptions.append(e)); - LOG.error("Failed to connect to OMs: {}. Attempted {} failovers. " + - "Got following exceptions during retries: {}", - omFailoverProxyProvider.getOMProxyInfos(), maxFailovers, - allRetryExceptions.toString()); - retryExceptions.clear(); - return RetryAction.FAIL; - } - } - }; - OzoneManagerProtocolPB proxy = (OzoneManagerProtocolPB) RetryProxy.create( - OzoneManagerProtocolPB.class, failoverProxyProvider, retryPolicy); + OzoneManagerProtocolPB.class, failoverProxyProvider, + failoverProxyProvider.getRetryPolicy(maxFailovers)); return proxy; } - private String getExceptionMsg(Exception e, int retryAttempt) { - StringBuilder exceptionMsg = new StringBuilder() - .append("Retry Attempt ") - .append(retryAttempt) - .append(" Exception - "); - if (e.getCause() == null) { - exceptionMsg.append(e.getClass().getCanonicalName()) - .append(": ") - .append(e.getMessage()); - } else { - exceptionMsg.append(e.getCause().getClass().getCanonicalName()) - .append(": ") - .append(e.getCause().getMessage()); - } - return exceptionMsg.toString(); - } - - /** - * Check if exception is OMLeaderNotReadyException. - * - * @param exception - * @return OMLeaderNotReadyException - */ - private OMLeaderNotReadyException getLeaderNotReadyException( - Exception exception) { - Throwable cause = exception.getCause(); - if (cause instanceof RemoteException) { - IOException ioException = - ((RemoteException) cause).unwrapRemoteException(); - if (ioException instanceof OMLeaderNotReadyException) { - return (OMLeaderNotReadyException) ioException; - } - } - return null; - } - - /** - * Check if exception is a OMNotLeaderException. - * - * @return OMNotLeaderException. - */ - private OMNotLeaderException getNotLeaderException(Exception exception) { - Throwable cause = exception.getCause(); - if (cause instanceof RemoteException) { - IOException ioException = - ((RemoteException) cause).unwrapRemoteException(); - if (ioException instanceof OMNotLeaderException) { - return (OMNotLeaderException) ioException; - } - } - return null; - } - @VisibleForTesting public OMFailoverProxyProvider getOmFailoverProxyProvider() { return omFailoverProxyProvider; diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java new file mode 100644 index 000000000000..0f8e92e66e78 --- /dev/null +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java @@ -0,0 +1,127 @@ +package org.apache.hadoop.ozone.om.failover; + +import com.google.protobuf.RpcController; +import com.google.protobuf.ServiceException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; + +import org.apache.hadoop.hdds.conf.ConfigurationSource; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.retry.RetryProxy; +import org.apache.hadoop.ozone.om.ha.OMFailoverProxyProvider; +import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB; +import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos; +import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OMResponse; +import org.apache.hadoop.security.AccessControlException; +import org.apache.hadoop.test.GenericTestUtils; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.event.Level; + +/** + * Tests OM failover protocols using a Mock Failover provider and a Mock OM + * Protocol. + */ +public class TestOMFailovers { + + ConfigurationSource conf = new OzoneConfiguration(); + Exception testException; + + @Test + public void test1() throws Exception { + + testException = new AccessControlException(); + + GenericTestUtils.setLogLevel(OMFailoverProxyProvider.LOG, Level.DEBUG); + GenericTestUtils.LogCapturer logCapturer = GenericTestUtils.LogCapturer + .captureLogs(OMFailoverProxyProvider.LOG); + + MockFailoverProxyProvider failoverProxyProvider = + new MockFailoverProxyProvider(conf); + + OzoneManagerProtocolPB proxy = (OzoneManagerProtocolPB) RetryProxy + .create(OzoneManagerProtocolPB.class, failoverProxyProvider, + failoverProxyProvider.getRetryPolicy(9)); + + try { + proxy.submitRequest(null, null); + Assert.fail("Request should fail with AccessControlException"); + } catch (Exception ex) { + Assert.assertTrue(ex instanceof ServiceException); + + // Request should try all OMs one be one and fail when the last OM also + // throws AccessControlException. + GenericTestUtils.assertExceptionContains("ServiceException of " + + "type class org.apache.hadoop.security.AccessControlException for " + + "om3", ex); + Assert.assertTrue(ex.getCause() instanceof AccessControlException); + + logCapturer.getOutput().contains(getRetryProxyDebugMsg("om1")); + logCapturer.getOutput().contains(getRetryProxyDebugMsg("om2")); + logCapturer.getOutput().contains(getRetryProxyDebugMsg("om3")); + } + } + + private String getRetryProxyDebugMsg(String omNodeId) { + return "RetryProxy: OM " + omNodeId +": AccessControlException: " + + "Permission denied."; + } + + private class MockOzoneManagerProtocol implements OzoneManagerProtocolPB { + + final String omNodeId; + // Exception to throw when submitMockRequest is called + final Exception exception; + + private MockOzoneManagerProtocol(String nodeId, Exception ex) { + omNodeId = nodeId; + exception = ex; + } + + @Override + public OMResponse submitRequest(RpcController controller, + OzoneManagerProtocolProtos.OMRequest request) throws ServiceException { + throw new ServiceException("ServiceException of type " + + exception.getClass() + " for "+ omNodeId, exception); + } + } + + private class MockFailoverProxyProvider extends OMFailoverProxyProvider { + + private MockFailoverProxyProvider(ConfigurationSource configuration) + throws IOException { + super(configuration, null, null); + } + + @Override + protected void createOMProxyIfNeeded(ProxyInfo proxyInfo, + String nodeId) { + if (proxyInfo.proxy == null) { + proxyInfo.proxy = new MockOzoneManagerProtocol(nodeId, + testException); + } + } + + @Override + protected void loadOMClientConfigs(ConfigurationSource config, + String omSvcId) { + this.omProxies = new HashMap<>(); + this.omProxyInfos = new HashMap<>(); + this.omNodeIDList = new ArrayList<>(); + + for (int i = 1; i <= 3; i++) { + String nodeId = "om" + i; + omProxies.put(nodeId, new ProxyInfo<>(null, nodeId)); + omProxyInfos.put(nodeId, null); + omNodeIDList.add(nodeId); + } + } + + @Override + protected Text computeDelegationTokenService() { + return null; + } + } +} From 389c401a1b0ed4736418beb79cf335cf2033c1c8 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Tue, 15 Sep 2020 09:16:21 -0700 Subject: [PATCH 4/5] rat and checkstyle fixes --- .../ozone/om/ha/OMFailoverProxyProvider.java | 19 ++++++--- .../ozone/om/failover/TestOMFailovers.java | 39 ++++++++++++++----- 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java index c57da37bde91..034354fd4ce4 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java @@ -72,9 +72,9 @@ public class OMFailoverProxyProvider implements LoggerFactory.getLogger(OMFailoverProxyProvider.class); // Map of OMNodeID to its proxy - protected Map> omProxies; - protected Map omProxyInfos; - protected List omNodeIDList; + private Map> omProxies; + private Map omProxyInfos; + private List omNodeIDList; private String currentProxyOMNodeId; private int currentProxyIndex; @@ -228,9 +228,6 @@ protected void createOMProxyIfNeeded(ProxyInfo proxyInfo, @VisibleForTesting public RetryPolicy getRetryPolicy(int maxFailovers) { - // Client attempts contacting each OM ipc.client.connect.max.retries - // (default = 10) times before failing over to the next OM, if - // available. // Client will attempt upto maxFailovers number of failovers between // available OMs before throwing exception. RetryPolicy retryPolicy = new RetryPolicy() { @@ -553,5 +550,15 @@ public static OMNotLeaderException getNotLeaderException(Exception exception) { } return null; } + + @VisibleForTesting + protected void setProxiesForTesting( + Map> testOMProxies, + Map testOMProxyInfos, + List testOMNodeIDList) { + this.omProxies = testOMProxies; + this.omProxyInfos = testOMProxyInfos; + this.omNodeIDList = testOMNodeIDList; + } } diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java index 0f8e92e66e78..553b48dcf3ce 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java @@ -1,3 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.hadoop.ozone.om.failover; import com.google.protobuf.RpcController; @@ -11,6 +29,7 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.io.retry.RetryProxy; import org.apache.hadoop.ozone.om.ha.OMFailoverProxyProvider; +import org.apache.hadoop.ozone.om.ha.OMProxyInfo; import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OMResponse; @@ -26,8 +45,8 @@ */ public class TestOMFailovers { - ConfigurationSource conf = new OzoneConfiguration(); - Exception testException; + private ConfigurationSource conf = new OzoneConfiguration(); + private Exception testException; @Test public void test1() throws Exception { @@ -69,11 +88,11 @@ private String getRetryProxyDebugMsg(String omNodeId) { "Permission denied."; } - private class MockOzoneManagerProtocol implements OzoneManagerProtocolPB { + private final class MockOzoneManagerProtocol implements OzoneManagerProtocolPB { - final String omNodeId; + private final String omNodeId; // Exception to throw when submitMockRequest is called - final Exception exception; + private final Exception exception; private MockOzoneManagerProtocol(String nodeId, Exception ex) { omNodeId = nodeId; @@ -88,7 +107,7 @@ public OMResponse submitRequest(RpcController controller, } } - private class MockFailoverProxyProvider extends OMFailoverProxyProvider { + private final class MockFailoverProxyProvider extends OMFailoverProxyProvider { private MockFailoverProxyProvider(ConfigurationSource configuration) throws IOException { @@ -107,9 +126,10 @@ protected void createOMProxyIfNeeded(ProxyInfo proxyInfo, @Override protected void loadOMClientConfigs(ConfigurationSource config, String omSvcId) { - this.omProxies = new HashMap<>(); - this.omProxyInfos = new HashMap<>(); - this.omNodeIDList = new ArrayList<>(); + HashMap> omProxies = + new HashMap<>();; + HashMap omProxyInfos = new HashMap<>();; + ArrayList omNodeIDList = new ArrayList<>(); for (int i = 1; i <= 3; i++) { String nodeId = "om" + i; @@ -117,6 +137,7 @@ protected void loadOMClientConfigs(ConfigurationSource config, omProxyInfos.put(nodeId, null); omNodeIDList.add(nodeId); } + setProxiesForTesting(omProxies, omProxyInfos, omNodeIDList); } @Override From 9866e1cf4f0de2439eb248c261c4b0f3850b528c Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Tue, 15 Sep 2020 11:46:06 -0700 Subject: [PATCH 5/5] review fixes --- .../ozone/om/ha/OMFailoverProxyProvider.java | 3 ++- .../ozone/om/failover/TestOMFailovers.java | 18 +++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java index 034354fd4ce4..acafa7cd3c48 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java @@ -539,7 +539,8 @@ private static OMLeaderNotReadyException getLeaderNotReadyException( * * @return OMNotLeaderException. */ - public static OMNotLeaderException getNotLeaderException(Exception exception) { + public static OMNotLeaderException getNotLeaderException( + Exception exception) { Throwable cause = exception.getCause(); if (cause instanceof RemoteException) { IOException ioException = diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java index 553b48dcf3ce..860f3ed916ff 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.retry.RetryProxy; +import org.apache.hadoop.ozone.OzoneConfigKeys; import org.apache.hadoop.ozone.om.ha.OMFailoverProxyProvider; import org.apache.hadoop.ozone.om.ha.OMProxyInfo; import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB; @@ -49,7 +50,7 @@ public class TestOMFailovers { private Exception testException; @Test - public void test1() throws Exception { + public void testAccessContorlExceptionFailovers() throws Exception { testException = new AccessControlException(); @@ -62,7 +63,8 @@ public void test1() throws Exception { OzoneManagerProtocolPB proxy = (OzoneManagerProtocolPB) RetryProxy .create(OzoneManagerProtocolPB.class, failoverProxyProvider, - failoverProxyProvider.getRetryPolicy(9)); + failoverProxyProvider.getRetryPolicy( + OzoneConfigKeys.OZONE_CLIENT_FAILOVER_MAX_ATTEMPTS_DEFAULT)); try { proxy.submitRequest(null, null); @@ -84,11 +86,12 @@ public void test1() throws Exception { } private String getRetryProxyDebugMsg(String omNodeId) { - return "RetryProxy: OM " + omNodeId +": AccessControlException: " + + return "RetryProxy: OM " + omNodeId + ": AccessControlException: " + "Permission denied."; } - private final class MockOzoneManagerProtocol implements OzoneManagerProtocolPB { + private final class MockOzoneManagerProtocol + implements OzoneManagerProtocolPB { private final String omNodeId; // Exception to throw when submitMockRequest is called @@ -107,7 +110,8 @@ public OMResponse submitRequest(RpcController controller, } } - private final class MockFailoverProxyProvider extends OMFailoverProxyProvider { + private final class MockFailoverProxyProvider + extends OMFailoverProxyProvider { private MockFailoverProxyProvider(ConfigurationSource configuration) throws IOException { @@ -127,8 +131,8 @@ protected void createOMProxyIfNeeded(ProxyInfo proxyInfo, protected void loadOMClientConfigs(ConfigurationSource config, String omSvcId) { HashMap> omProxies = - new HashMap<>();; - HashMap omProxyInfos = new HashMap<>();; + new HashMap<>(); + HashMap omProxyInfos = new HashMap<>(); ArrayList omNodeIDList = new ArrayList<>(); for (int i = 1; i <= 3; i++) {