diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java index e5958b7b9046..d2a1330469af 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java @@ -246,6 +246,11 @@ public final class ScmConfigKeys { public static final int OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT = 15; + public static final String OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL = + "ozone.scm.heartbeat.rpc-retry-interval"; + public static final String OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL_DEFAULT = + "1s"; + /** * Defines how frequently we will log the missing of heartbeat to a specific * SCM. In the default case we will write a warning message for each 10 diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml index 9311937ce629..1dece14fc471 100644 --- a/hadoop-hdds/common/src/main/resources/ozone-default.xml +++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml @@ -969,7 +969,17 @@ OZONE, MANAGEMENT Retry count for the RPC from Datanode to SCM. The rpc-retry-interval - is 1s. Make sure rpc-retry-count * (rpc-timeout + rpc-retry-interval) + is 1s by default. Make sure rpc-retry-count * (rpc-timeout + + rpc-retry-interval) is less than hdds.heartbeat.interval. + + + + ozone.scm.heartbeat.rpc-retry-interval + 1s + OZONE, MANAGEMENT + + Retry interval for the RPC from Datanode to SCM. + Make sure rpc-retry-count * (rpc-timeout + rpc-retry-interval) is less than hdds.heartbeat.interval. 
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java index c7dd9c65ecf5..ffca599f8024 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java @@ -47,6 +47,7 @@ import static java.util.Collections.unmodifiableList; import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmRpcTimeOutInMilliseconds; import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmRpcRetryCount; +import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmRpcRetryInterval; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -151,8 +152,8 @@ public void addSCMServer(InetSocketAddress address) throws IOException { RetryPolicy retryPolicy = RetryPolicies.retryUpToMaximumCountWithFixedSleep( - getScmRpcRetryCount(conf), - 1000, TimeUnit.MILLISECONDS); + getScmRpcRetryCount(conf), getScmRpcRetryInterval(conf), + TimeUnit.MILLISECONDS); StorageContainerDatanodeProtocolPB rpcProxy = RPC.getProtocolProxy( StorageContainerDatanodeProtocolPB.class, version, @@ -196,8 +197,8 @@ public void addReconServer(InetSocketAddress address) throws IOException { RetryPolicy retryPolicy = RetryPolicies.retryUpToMaximumCountWithFixedSleep( - getScmRpcRetryCount(conf), - 1000, TimeUnit.MILLISECONDS); + getScmRpcRetryCount(conf), getScmRpcRetryInterval(conf), + TimeUnit.MILLISECONDS); ReconDatanodeProtocolPB rpcProxy = RPC.getProtocolProxy( ReconDatanodeProtocolPB.class, version, address, UserGroupInformation.getCurrentUser(), hadoopConfig, diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java 
b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java index d2273c373f91..2c2b5fcc3245 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java @@ -67,6 +67,8 @@ import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_TIMEOUT_DEFAULT; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL_DEFAULT; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL_DEFAULT; import static org.apache.hadoop.hdds.server.ServerUtils.sanitizeUserArgs; @@ -339,6 +341,18 @@ public static int getScmRpcRetryCount(ConfigurationSource conf) { OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT); } + /** + * Fixed datanode rpc retry interval, which is used by datanode to connect + * to the SCM. + * + * @param conf - Ozone Config + * @return - Rpc retry interval in milliseconds. + */ + public static long getScmRpcRetryInterval(ConfigurationSource conf) { + return conf.getTimeDuration(OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL, + OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL_DEFAULT, TimeUnit.MILLISECONDS); + } + /** * Log Warn interval. *