Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@ public final class ScmConfigKeys {
public static final int OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT =
15;

/**
 * Configuration key for the retry interval of the RPC from Datanode to SCM.
 */
public static final String OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL =
"ozone.scm.heartbeat.rpc-retry-interval";
/**
 * Default value for {@link #OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL},
 * expressed as a duration string (one second).
 */
public static final String OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL_DEFAULT =
"1s";

/**
* Defines how frequently we will log the missing of heartbeat to a specific
* SCM. In the default case we will write a warning message for each 10
Expand Down
12 changes: 11 additions & 1 deletion hadoop-hdds/common/src/main/resources/ozone-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -969,7 +969,17 @@
<tag>OZONE, MANAGEMENT</tag>
<description>
Retry count for the RPC from Datanode to SCM. The rpc-retry-interval
is 1s. Make sure rpc-retry-count * (rpc-timeout + rpc-retry-interval)
is 1s by default. Make sure rpc-retry-count * (rpc-timeout +
rpc-retry-interval) is less than hdds.heartbeat.interval.
</description>
</property>
<property>
<name>ozone.scm.heartbeat.rpc-retry-interval</name>
<value>1s</value>
<tag>OZONE, MANAGEMENT</tag>
<description>
Retry interval for the RPC from Datanode to SCM.
Make sure rpc-retry-count * (rpc-timeout + rpc-retry-interval)
is less than hdds.heartbeat.interval.
</description>
</property>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import static java.util.Collections.unmodifiableList;
import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmRpcTimeOutInMilliseconds;
import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmRpcRetryCount;
import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmRpcRetryInterval;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -151,8 +152,8 @@ public void addSCMServer(InetSocketAddress address) throws IOException {

RetryPolicy retryPolicy =
RetryPolicies.retryUpToMaximumCountWithFixedSleep(
getScmRpcRetryCount(conf),
1000, TimeUnit.MILLISECONDS);
getScmRpcRetryCount(conf), getScmRpcRetryInterval(conf),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we just reuse the default DN heartbeat interval (HddsConfigKeys#HDDS_HEARTBEAT_INTERVAL_DEFAULT, 30s) rather than defining a new RPC retry interval here? Would that be a better way?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we just reuse the default DN heartbeat interval (HddsConfigKeys#HDDS_HEARTBEAT_INTERVAL_DEFAULT, 30s) rather than defining a new RPC retry interval here? Would that be a better way?

The retry interval is currently only 1 second, which lets the datanode connect to the SCM quickly. The default heartbeat interval may be too long.
Actually, the retry count is not effective, since the DatanodeStateMachine keeps retrying after the 15 retries finish.
It seems the current retry policy still needs to be changed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, got it.
There is another place in this class that can also be updated to use getScmRpcRetryInterval(conf). Can you update it (SCMConnectionManager.java#L200)?

  /**
   * Adds a new Recon server to the set of endpoints.
   * @param address Recon address.
   * @throws IOException
   */
  public void addReconServer(InetSocketAddress address) throws IOException {
    LOG.info("Adding Recon Server : {}", address.toString());
    writeLock();
    try {
      if (scmMachines.containsKey(address)) {
        LOG.warn("Trying to add an existing SCM Machine to Machines group. " +
            "Ignoring the request.");
        return;
      }
      Configuration hadoopConfig =
          LegacyHadoopConfigurationSource.asHadoopConfiguration(this.conf);
      RPC.setProtocolEngine(hadoopConfig, ReconDatanodeProtocolPB.class,
          ProtobufRpcEngine.class);
      long version =
          RPC.getProtocolVersion(ReconDatanodeProtocolPB.class);

      RetryPolicy retryPolicy =
          RetryPolicies.retryUpToMaximumCountWithFixedSleep(
              getScmRpcRetryCount(conf),
              1000, TimeUnit.MILLISECONDS);  <======
...
}

TimeUnit.MILLISECONDS);

StorageContainerDatanodeProtocolPB rpcProxy = RPC.getProtocolProxy(
StorageContainerDatanodeProtocolPB.class, version,
Expand Down Expand Up @@ -196,8 +197,8 @@ public void addReconServer(InetSocketAddress address) throws IOException {

RetryPolicy retryPolicy =
RetryPolicies.retryUpToMaximumCountWithFixedSleep(
getScmRpcRetryCount(conf),
1000, TimeUnit.MILLISECONDS);
getScmRpcRetryCount(conf), getScmRpcRetryInterval(conf),
TimeUnit.MILLISECONDS);
ReconDatanodeProtocolPB rpcProxy = RPC.getProtocolProxy(
ReconDatanodeProtocolPB.class, version,
address, UserGroupInformation.getCurrentUser(), hadoopConfig,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_TIMEOUT_DEFAULT;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL_DEFAULT;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL_DEFAULT;
import static org.apache.hadoop.hdds.server.ServerUtils.sanitizeUserArgs;
Expand Down Expand Up @@ -339,6 +341,18 @@ public static int getScmRpcRetryCount(ConfigurationSource conf) {
OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT);
}

/**
 * Looks up the fixed interval a datanode waits between RPC retries while
 * connecting to the SCM.
 *
 * @param conf - Ozone Config
 * @return - Rpc retry interval, converted to milliseconds.
 */
public static long getScmRpcRetryInterval(ConfigurationSource conf) {
  // The configured value is a duration string; request it in milliseconds.
  final long retryIntervalMs = conf.getTimeDuration(
      OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL,
      OZONE_SCM_HEARTBEAT_RPC_RETRY_INTERVAL_DEFAULT,
      TimeUnit.MILLISECONDS);
  return retryIntervalMs;
}

/**
* Log Warn interval.
*
Expand Down