diff --git a/hadoop-hdds/client/pom.xml b/hadoop-hdds/client/pom.xml
index 608839e82dd6..e1b51e8bba98 100644
--- a/hadoop-hdds/client/pom.xml
+++ b/hadoop-hdds/client/pom.xml
@@ -51,11 +51,6 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
<scope>test</scope>
</dependency>
-    <dependency>
-      <groupId>io.netty</groupId>
-      <artifactId>netty-all</artifactId>
-    </dependency>
-
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdds-hadoop-dependency-test</artifactId>
diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/OzoneClientConfig.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/OzoneClientConfig.java
index 2a79edbe31eb..b3c774a2c22f 100644
--- a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/OzoneClientConfig.java
+++ b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/OzoneClientConfig.java
@@ -21,6 +21,7 @@
import org.apache.hadoop.hdds.conf.ConfigGroup;
import org.apache.hadoop.hdds.conf.ConfigTag;
import org.apache.hadoop.hdds.conf.ConfigType;
+import org.apache.hadoop.hdds.conf.PostConstruct;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ChecksumType;
import org.apache.hadoop.ozone.OzoneConfigKeys;
@@ -111,9 +112,7 @@ public class OzoneClientConfig {
tags = ConfigTag.CLIENT)
private boolean checksumVerify = true;
- public OzoneClientConfig() {
- }
-
+ @PostConstruct
private void validate() {
Preconditions.checkState(streamBufferSize > 0);
Preconditions.checkState(streamBufferFlushSize > 0);
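Note: a minimal sketch (not part of the patch) of how such a config object is materialized; it assumes the hdds config framework invokes the @PostConstruct method after injecting the annotated fields, which is what makes the explicit constructor unnecessary:

    import org.apache.hadoop.hdds.conf.OzoneConfiguration;
    import org.apache.hadoop.hdds.scm.OzoneClientConfig;

    public final class ClientConfigSketch {
      public static void main(String[] args) {
        OzoneConfiguration conf = new OzoneConfiguration();
        // Field injection happens first; the @PostConstruct validate() method
        // is then invoked by the config framework, so bad values fail fast here.
        OzoneClientConfig clientConfig = conf.getObject(OzoneClientConfig.class);
        System.out.println("client config loaded: " + clientConfig);
      }
    }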
diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientRatis.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientRatis.java
index ced9df7fb664..6e99bf3553d4 100644
--- a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientRatis.java
+++ b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientRatis.java
@@ -217,12 +217,12 @@ private CompletableFuture<RaftClientReply> sendRequestAsync(
if (LOG.isDebugEnabled()) {
LOG.debug("sendCommandAsync ReadOnly {}", message);
}
- return getClient().sendReadOnlyAsync(message);
+ return getClient().async().sendReadOnly(message);
} else {
if (LOG.isDebugEnabled()) {
LOG.debug("sendCommandAsync {}", message);
}
- return getClient().sendAsync(message);
+ return getClient().async().send(message);
}
}
@@ -258,8 +258,8 @@ public XceiverClientReply watchForCommit(long index)
}
RaftClientReply reply;
try {
- CompletableFuture<RaftClientReply> replyFuture = getClient()
- .sendWatchAsync(index, RaftProtos.ReplicationLevel.ALL_COMMITTED);
+ CompletableFuture<RaftClientReply> replyFuture = getClient().async()
+ .watch(index, RaftProtos.ReplicationLevel.ALL_COMMITTED);
replyFuture.get();
} catch (Exception e) {
Throwable t = HddsClientUtils.checkForException(e);
@@ -267,8 +267,8 @@ public XceiverClientReply watchForCommit(long index)
if (t instanceof GroupMismatchException) {
throw e;
}
- reply = getClient()
- .sendWatchAsync(index, RaftProtos.ReplicationLevel.MAJORITY_COMMITTED)
+ reply = getClient().async()
+ .watch(index, RaftProtos.ReplicationLevel.MAJORITY_COMMITTED)
.get();
List commitInfoProtoList =
reply.getCommitInfos().stream()
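Note: for reference, a minimal sketch (not part of the patch) of the Ratis AsyncApi calls that replace the removed send*Async helpers; building the RaftClient itself is assumed and out of scope:

    import java.util.concurrent.CompletableFuture;
    import org.apache.ratis.client.RaftClient;
    import org.apache.ratis.proto.RaftProtos;
    import org.apache.ratis.protocol.Message;
    import org.apache.ratis.protocol.RaftClientReply;

    final class RatisAsyncApiSketch {
      private RatisAsyncApiSketch() { }

      // The RaftClient is assumed to be already built for the target pipeline.
      static CompletableFuture<RaftClientReply> send(RaftClient client, Message msg) {
        return client.async().send(msg);             // was client.sendAsync(msg)
      }

      static CompletableFuture<RaftClientReply> sendReadOnly(RaftClient client, Message msg) {
        return client.async().sendReadOnly(msg);     // was client.sendReadOnlyAsync(msg)
      }

      static CompletableFuture<RaftClientReply> watchAllCommitted(RaftClient client, long index) {
        // was client.sendWatchAsync(index, ReplicationLevel.ALL_COMMITTED)
        return client.async().watch(index, RaftProtos.ReplicationLevel.ALL_COMMITTED);
      }
    }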
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/RatisHelper.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/RatisHelper.java
index 324774d7d77f..c910dd5acea8 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/RatisHelper.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/RatisHelper.java
@@ -104,12 +104,18 @@ public static RaftPeerId toRaftPeerId(DatanodeDetails id) {
}
public static RaftPeer toRaftPeer(DatanodeDetails id) {
- return new RaftPeer(toRaftPeerId(id), toRaftPeerAddressString(id));
+ return RaftPeer.newBuilder()
+ .setId(toRaftPeerId(id))
+ .setAddress(toRaftPeerAddressString(id))
+ .build();
}
public static RaftPeer toRaftPeer(DatanodeDetails id, int priority) {
- return new RaftPeer(
- toRaftPeerId(id), toRaftPeerAddressString(id), priority);
+ return RaftPeer.newBuilder()
+ .setId(toRaftPeerId(id))
+ .setAddress(toRaftPeerAddressString(id))
+ .setPriority(priority)
+ .build();
}
private static List<RaftPeer> toRaftPeers(Pipeline pipeline) {
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
index 7b01e0797f3e..0e16968def56 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
@@ -273,6 +273,16 @@ public final class ScmConfigKeys {
// able to send back a new list to the datanodes.
public static final String OZONE_SCM_NAMES = "ozone.scm.names";
+ public static final String OZONE_SCM_INTERNAL_SERVICE_ID =
+ "ozone.scm.internal.service.id";
+
+ public static final String OZONE_SCM_SERVICE_IDS_KEY =
+ "ozone.scm.service.ids";
+ public static final String OZONE_SCM_NODES_KEY =
+ "ozone.scm.nodes";
+ public static final String OZONE_SCM_NODE_ID_KEY =
+ "ozone.scm.node.id";
+
public static final int OZONE_SCM_DEFAULT_PORT =
OZONE_SCM_DATANODE_PORT_DEFAULT;
// The path where datanode ID is to be written to.
@@ -364,6 +374,83 @@ public final class ScmConfigKeys {
public static final String HDDS_TRACING_ENABLED = "hdds.tracing.enabled";
public static final boolean HDDS_TRACING_ENABLED_DEFAULT = false;
+ // SCM Ratis related
+ public static final String OZONE_SCM_HA_ENABLE_KEY
+ = "ozone.scm.ratis.enable";
+ public static final boolean OZONE_SCM_HA_ENABLE_DEFAULT
+ = false;
+ public static final String OZONE_SCM_RATIS_PORT_KEY
+ = "ozone.scm.ratis.port";
+ public static final int OZONE_SCM_RATIS_PORT_DEFAULT
+ = 9864;
+ public static final String OZONE_SCM_RATIS_RPC_TYPE_KEY
+ = "ozone.scm.ratis.rpc.type";
+ public static final String OZONE_SCM_RATIS_RPC_TYPE_DEFAULT
+ = "GRPC";
+
+ // SCM Ratis Log configurations
+ public static final String OZONE_SCM_RATIS_STORAGE_DIR
+ = "ozone.scm.ratis.storage.dir";
+ public static final String OZONE_SCM_RATIS_SEGMENT_SIZE_KEY
+ = "ozone.scm.ratis.segment.size";
+ public static final String OZONE_SCM_RATIS_SEGMENT_SIZE_DEFAULT
+ = "16KB";
+ public static final String OZONE_SCM_RATIS_SEGMENT_PREALLOCATED_SIZE_KEY
+ = "ozone.scm.ratis.segment.preallocated.size";
+ public static final String OZONE_SCM_RATIS_SEGMENT_PREALLOCATED_SIZE_DEFAULT
+ = "16KB";
+
+ // SCM Ratis Log Appender configurations
+ public static final String
+ OZONE_SCM_RATIS_LOG_APPENDER_QUEUE_NUM_ELEMENTS =
+ "ozone.scm.ratis.log.appender.queue.num-elements";
+ public static final int
+ OZONE_SCM_RATIS_LOG_APPENDER_QUEUE_NUM_ELEMENTS_DEFAULT = 1024;
+ public static final String OZONE_SCM_RATIS_LOG_APPENDER_QUEUE_BYTE_LIMIT =
+ "ozone.scm.ratis.log.appender.queue.byte-limit";
+ public static final String
+ OZONE_SCM_RATIS_LOG_APPENDER_QUEUE_BYTE_LIMIT_DEFAULT = "32MB";
+ public static final String OZONE_SCM_RATIS_LOG_PURGE_GAP =
+ "ozone.scm.ratis.log.purge.gap";
+ public static final int OZONE_SCM_RATIS_LOG_PURGE_GAP_DEFAULT = 1000000;
+
+ // SCM Ratis server configurations
+ public static final String OZONE_SCM_RATIS_SERVER_REQUEST_TIMEOUT_KEY
+ = "ozone.scm.ratis.server.request.timeout";
+ public static final TimeDuration
+ OZONE_SCM_RATIS_SERVER_REQUEST_TIMEOUT_DEFAULT
+ = TimeDuration.valueOf(3000, TimeUnit.MILLISECONDS);
+ public static final String
+ OZONE_SCM_RATIS_SERVER_RETRY_CACHE_TIMEOUT_KEY
+ = "ozone.scm.ratis.server.retry.cache.timeout";
+ public static final TimeDuration
+ OZONE_SCM_RATIS_SERVER_RETRY_CACHE_TIMEOUT_DEFAULT
+ = TimeDuration.valueOf(600000, TimeUnit.MILLISECONDS);
+ public static final String OZONE_SCM_RATIS_MINIMUM_TIMEOUT_KEY
+ = "ozone.scm.ratis.minimum.timeout";
+ public static final TimeDuration OZONE_SCM_RATIS_MINIMUM_TIMEOUT_DEFAULT
+ = TimeDuration.valueOf(1, TimeUnit.SECONDS);
+
+ // SCM Ratis Leader Election configurations
+ public static final String
+ OZONE_SCM_LEADER_ELECTION_MINIMUM_TIMEOUT_DURATION_KEY =
+ "ozone.scm.ratis.leader.election.minimum.timeout.duration";
+ public static final TimeDuration
+ OZONE_SCM_LEADER_ELECTION_MINIMUM_TIMEOUT_DURATION_DEFAULT =
+ TimeDuration.valueOf(1, TimeUnit.SECONDS);
+ public static final String OZONE_SCM_RATIS_SERVER_FAILURE_TIMEOUT_DURATION_KEY
+ = "ozone.scm.ratis.server.failure.timeout.duration";
+ public static final TimeDuration
+ OZONE_SCM_RATIS_SERVER_FAILURE_TIMEOUT_DURATION_DEFAULT
+ = TimeDuration.valueOf(120, TimeUnit.SECONDS);
+
+ // SCM Leader server role check interval
+ public static final String OZONE_SCM_RATIS_SERVER_ROLE_CHECK_INTERVAL_KEY
+ = "ozone.scm.ratis.server.role.check.interval";
+ public static final TimeDuration
+ OZONE_SCM_RATIS_SERVER_ROLE_CHECK_INTERVAL_DEFAULT
+ = TimeDuration.valueOf(15, TimeUnit.SECONDS);
+
/**
* Never constructed.
*/
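Note: a hedged sketch (not part of the patch) of reading one of the new duration-valued keys; it assumes the standard getTimeDuration helper on the configuration and converts the Ratis TimeDuration default into plain milliseconds:

    import java.util.concurrent.TimeUnit;
    import org.apache.hadoop.hdds.conf.OzoneConfiguration;
    import org.apache.hadoop.hdds.scm.ScmConfigKeys;

    public final class ScmRatisConfSketch {
      public static void main(String[] args) {
        OzoneConfiguration conf = new OzoneConfiguration();
        // Values such as "3s" or "3000ms" are parsed into the requested unit.
        long requestTimeoutMs = conf.getTimeDuration(
            ScmConfigKeys.OZONE_SCM_RATIS_SERVER_REQUEST_TIMEOUT_KEY,
            ScmConfigKeys.OZONE_SCM_RATIS_SERVER_REQUEST_TIMEOUT_DEFAULT
                .toLong(TimeUnit.MILLISECONDS),
            TimeUnit.MILLISECONDS);
        System.out.println("ratis request timeout = " + requestTimeoutMs + " ms");
      }
    }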
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmInfo.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmInfo.java
index 6236febb7b12..b9d823e8d817 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmInfo.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmInfo.java
@@ -18,6 +18,9 @@
package org.apache.hadoop.hdds.scm;
+import java.util.ArrayList;
+import java.util.List;
+
/**
* ScmInfo wraps the result returned from SCM#getScmInfo which
* contains clusterId and the SCM Id.
@@ -25,6 +28,7 @@
public final class ScmInfo {
private String clusterId;
private String scmId;
+ private List<String> peerRoles;
/**
* Builder for ScmInfo.
@@ -32,6 +36,11 @@ public final class ScmInfo {
public static class Builder {
private String clusterId;
private String scmId;
+ private List<String> peerRoles;
+
+ public Builder() {
+ peerRoles = new ArrayList<>();
+ }
/**
* sets the cluster id.
@@ -53,14 +62,25 @@ public Builder setScmId(String id) {
return this;
}
+ /**
+ * Set peer addresses in SCM HA.
+ * @param roles Ratis peer addresses in the format of [ip|hostname]:port
+ * @return Builder for ScmInfo
+ */
+ public Builder setRatisPeerRoles(List<String> roles) {
+ peerRoles.addAll(roles);
+ return this;
+ }
+
public ScmInfo build() {
- return new ScmInfo(clusterId, scmId);
+ return new ScmInfo(clusterId, scmId, peerRoles);
}
}
- private ScmInfo(String clusterId, String scmId) {
+ private ScmInfo(String clusterId, String scmId, List<String> peerRoles) {
this.clusterId = clusterId;
this.scmId = scmId;
+ this.peerRoles = peerRoles;
}
/**
@@ -78,4 +98,12 @@ public String getClusterId() {
public String getScmId() {
return scmId;
}
+
+ /**
+ * Gets the list of peer roles (currently addresses) in SCM HA.
+ * @return List of peer addresses
+ */
+ public List<String> getRatisPeerRoles() {
+ return peerRoles;
+ }
}
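Note: a small usage sketch (not part of the patch) of the extended builder; the addresses are illustrative placeholders and the setClusterId/setScmId setters are assumed to be the existing ones on this builder:

    import java.util.Arrays;
    import org.apache.hadoop.hdds.scm.ScmInfo;

    public final class ScmInfoSketch {
      public static void main(String[] args) {
        ScmInfo info = new ScmInfo.Builder()
            .setClusterId("CID-example")
            .setScmId("scm-node-1")
            .setRatisPeerRoles(Arrays.asList(
                "scm1.example.com:9872", "scm2.example.com:9872", "scm3.example.com:9872"))
            .build();
        info.getRatisPeerRoles().forEach(System.out::println);
      }
    }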
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java
index e4369fa86272..7c3c94cb7ae1 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java
@@ -246,5 +246,8 @@ Map<String, Pair<Boolean, String>> getSafeModeRuleStatuses()
*/
boolean getReplicationManagerStatus() throws IOException;
-
+ /**
+ * Returns the list of Ratis peer roles. Currently only includes the peer address.
+ */
+ List<String> getScmRatisRoles() throws IOException;
}
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerID.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerID.java
index bb44da4e78e5..1a6be9660ce0 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerID.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerID.java
@@ -23,6 +23,7 @@
import org.apache.commons.lang3.builder.CompareToBuilder;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
/**
* Container ID is an integer that is a value between 1..MAX_CONTAINER ID.
@@ -34,13 +35,14 @@ public final class ContainerID implements Comparable<ContainerID> {
private final long id;
- // TODO: make this private.
/**
* Constructs ContainerID.
*
* @param id int
*/
- public ContainerID(long id) {
+ private ContainerID(long id) {
+ Preconditions.checkState(id > 0,
+ "Container ID should be a positive. %s.", id);
this.id = id;
}
@@ -49,9 +51,7 @@ public ContainerID(long id) {
* @param containerID long
* @return ContainerID.
*/
- public static ContainerID valueof(final long containerID) {
- Preconditions.checkState(containerID > 0,
- "Container ID should be a positive long. "+ containerID);
+ public static ContainerID valueOf(final long containerID) {
return new ContainerID(containerID);
}
@@ -60,14 +60,30 @@ public static ContainerID valueof(final long containerID) {
*
* @return int
*/
+ @Deprecated
+ /*
+ * Don't expose the int value.
+ */
public long getId() {
return id;
}
+ /**
+ * Use the protobuf message ({@link #getProtobuf()}) instead.
+ */
+ @Deprecated
public byte[] getBytes() {
return Longs.toByteArray(id);
}
+ public HddsProtos.ContainerID getProtobuf() {
+ return HddsProtos.ContainerID.newBuilder().setId(id).build();
+ }
+
+ public static ContainerID getFromProtobuf(HddsProtos.ContainerID proto) {
+ return ContainerID.valueOf(proto.getId());
+ }
+
@Override
public boolean equals(final Object o) {
if (this == o) {
@@ -81,14 +97,14 @@ public boolean equals(final Object o) {
final ContainerID that = (ContainerID) o;
return new EqualsBuilder()
- .append(getId(), that.getId())
+ .append(id, that.id)
.isEquals();
}
@Override
public int hashCode() {
return new HashCodeBuilder(61, 71)
- .append(getId())
+ .append(id)
.toHashCode();
}
@@ -96,7 +112,7 @@ public int hashCode() {
public int compareTo(final ContainerID that) {
Preconditions.checkNotNull(that);
return new CompareToBuilder()
- .append(this.getId(), that.getId())
+ .append(this.id, that.id)
.build();
}
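Note: a short sketch (not part of the patch) of the factory-and-protobuf round trip that replaces the public constructor:

    import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
    import org.apache.hadoop.hdds.scm.container.ContainerID;

    public final class ContainerIdSketch {
      public static void main(String[] args) {
        ContainerID id = ContainerID.valueOf(42L);        // constructor is now private
        HddsProtos.ContainerID proto = id.getProtobuf();  // preferred over getBytes()
        ContainerID restored = ContainerID.getFromProtobuf(proto);
        System.out.println(id.equals(restored));          // prints: true
      }
    }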
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerInfo.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerInfo.java
index b8f1a926f186..e621a4f54eac 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerInfo.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerInfo.java
@@ -121,6 +121,11 @@ public static ContainerInfo fromProtobuf(HddsProtos.ContainerInfoProto info) {
.build();
}
+ /**
+ * This method is deprecated, use {@code containerID()} which returns
+ * a {@link ContainerID} object.
+ */
+ @Deprecated
public long getContainerID() {
return containerID;
}
@@ -179,7 +184,7 @@ public void updateSequenceId(long sequenceID) {
}
public ContainerID containerID() {
- return new ContainerID(getContainerID());
+ return ContainerID.valueOf(containerID);
}
/**
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/common/helpers/ExcludeList.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/common/helpers/ExcludeList.java
index 803aa0367045..824a1f5833ab 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/common/helpers/ExcludeList.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/common/helpers/ExcludeList.java
@@ -91,7 +91,7 @@ public static ExcludeList getFromProtoBuf(
HddsProtos.ExcludeListProto excludeListProto) {
ExcludeList excludeList = new ExcludeList();
excludeListProto.getContainerIdsList().forEach(id -> {
- excludeList.addConatinerId(ContainerID.valueof(id));
+ excludeList.addConatinerId(ContainerID.valueOf(id));
});
DatanodeDetails.Builder builder = DatanodeDetails.newBuilder();
excludeListProto.getDatanodesList().forEach(dn -> {
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/exceptions/SCMException.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/exceptions/SCMException.java
index 48a8e059d97b..82e3034454c2 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/exceptions/SCMException.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/exceptions/SCMException.java
@@ -124,6 +124,7 @@ public enum ResultCodes {
FAILED_TO_ALLOCATE_ENOUGH_BLOCKS,
INTERNAL_ERROR,
FAILED_TO_INIT_PIPELINE_CHOOSE_POLICY,
- FAILED_TO_INIT_LEADER_CHOOSE_POLICY
+ FAILED_TO_INIT_LEADER_CHOOSE_POLICY,
+ SCM_NOT_LEADER
}
}
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConsts.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConsts.java
index a7aca164b001..03da6dd9dab6 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConsts.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConsts.java
@@ -379,6 +379,13 @@ private OzoneConsts() {
public static final String CONTAINER_DB_TYPE_ROCKSDB = "RocksDB";
public static final String CONTAINER_DB_TYPE_LEVELDB = "LevelDB";
+ // SCM HA
+ public static final String SCM_SERVICE_ID_DEFAULT = "scmServiceIdDefault";
+
+ // SCM Ratis snapshot file to store the last applied index
+ public static final String SCM_RATIS_SNAPSHOT_INDEX = "scmRatisSnapshotIndex";
+
+ public static final String SCM_RATIS_SNAPSHOT_TERM = "scmRatisSnapshotTerm";
// An on-disk transient marker file used when replacing DB with checkpoint
public static final String DB_TRANSIENT_MARKER = "dbInconsistentMarker";
}
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/common/StorageInfo.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/common/StorageInfo.java
index 55911fcfd994..c88aaa9b25d5 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/common/StorageInfo.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/common/StorageInfo.java
@@ -27,7 +27,6 @@
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.Properties;
-import java.util.UUID;
/**
* Common class for storage information. This class defines the common
@@ -198,7 +197,11 @@ private Properties readFrom(File from) throws IOException {
* @return new clusterID
*/
public static String newClusterID() {
- return "CID-" + UUID.randomUUID().toString();
+ // TODO:
+ // Please check https://issues.apache.org/jira/browse/HDDS-4538
+ // clusterID and scmUuid are hard-coded as part of HDDS-2823,
+ // so that multiple SCMs won't cause chaos on the Datanode side.
+ return "CID-1df51ed9-19f1-4283-8f61-5d90a84c196c";
}
}
diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index d8402f7b9df6..76d959eefb0d 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -1839,6 +1839,186 @@
<tag>OZONE, HDDS, SECURITY</tag>
<description>SCM security server port.</description>
</property>
+  <property>
+    <name>ozone.scm.service.ids</name>
+    <value/>
+    <tag>OZONE, SCM, HA</tag>
+    <description>
+      Comma-separated list of SCM service Ids. This property allows the
+      client to figure out the quorum of StorageContainerManager addresses.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.internal.service.id</name>
+    <value/>
+    <tag>OZONE, SCM, HA</tag>
+    <description>
+      Service ID of the SCM. If this is not set, fall back to
+      ozone.scm.service.ids to find the service ID this SCM belongs to.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.nodes.EXAMPLESCMSERVICEID</name>
+    <value/>
+    <tag>OZONE, SCM, HA</tag>
+    <description>
+      Comma-separated list of SCM node Ids for a given SCM service ID (e.g.
+      EXAMPLESCMSERVICEID). The SCM service ID should be the value (one of the
+      values if there are multiple) set for the parameter ozone.scm.service.ids.
+
+      Unique identifiers for each SCM Node, delimited by commas. This will be
+      used by SCMs in an HA setup to determine all the SCMs belonging to the
+      same SCM service in the cluster. For example, if you used "scmService1"
+      as the SCM service ID previously, and you wanted to use "scm1", "scm2"
+      and "scm3" as the individual IDs of the SCMs, you would configure a
+      property ozone.scm.nodes.scmService1, and its value "scm1,scm2,scm3".
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.node.id</name>
+    <value/>
+    <tag>OZONE, SCM, HA</tag>
+    <description>
+      The ID of this SCM node. If the SCM node ID is not configured, it
+      is determined automatically by matching the local node's address
+      with the configured address.
+
+      If the node ID is not deterministic from the configuration, then it is
+      set to the scmId from the SCM version file.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.enable</name>
+    <value>false</value>
+    <tag>OZONE, SCM, HA, RATIS</tag>
+    <description>Property to enable or disable Ratis server on SCM.
+      Please note - this is a temporary property to disable SCM Ratis server.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.port</name>
+    <value>9872</value>
+    <tag>OZONE, SCM, HA, RATIS</tag>
+    <description>
+      The port number of the SCM's Ratis server.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.rpc.type</name>
+    <value>GRPC</value>
+    <tag>OZONE, SCM, HA, RATIS</tag>
+    <description>Ratis supports different kinds of transports like netty, GRPC,
+      Hadoop RPC etc. This picks one of those for this cluster.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.storage.dir</name>
+    <value/>
+    <tag>OZONE, SCM, HA, RATIS, STORAGE</tag>
+    <description>This directory is used for storing SCM's Ratis metadata like
+      logs. If this is not set then the default metadata dir is used. A warning
+      will be logged if this is not set. Ideally, this should be mapped to a
+      fast disk like an SSD.
+      If undefined, the SCM Ratis storage dir will fall back to
+      ozone.metadata.dirs. This fallback approach is not recommended for
+      production environments.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.segment.size</name>
+    <value>16KB</value>
+    <tag>OZONE, SCM, HA, RATIS, PERFORMANCE</tag>
+    <description>The size of the raft segment used by Apache Ratis on SCM.
+      (16 KB by default)
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.segment.preallocated.size</name>
+    <value>16KB</value>
+    <tag>OZONE, SCM, HA, RATIS, PERFORMANCE</tag>
+    <description>The size of the buffer which is preallocated for raft segment
+      used by Apache Ratis on SCM. (16 KB by default)
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.log.appender.queue.num-elements</name>
+    <value>1024</value>
+    <tag>OZONE, DEBUG, SCM, HA, RATIS</tag>
+    <description>Number of operations pending with Raft's Log Worker.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.log.appender.queue.byte-limit</name>
+    <value>32MB</value>
+    <tag>OZONE, DEBUG, SCM, HA, RATIS</tag>
+    <description>Byte limit for Raft's Log Worker queue.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.log.purge.gap</name>
+    <value>1000000</value>
+    <tag>OZONE, SCM, HA, RATIS</tag>
+    <description>The minimum gap between log indices for Raft server to purge
+      its log segments after taking snapshot.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.server.request.timeout</name>
+    <value>3s</value>
+    <tag>OZONE, SCM, HA, RATIS</tag>
+    <description>The timeout duration for SCM's Ratis server request.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.server.retry.cache.timeout</name>
+    <value>600000ms</value>
+    <tag>OZONE, SCM, HA, RATIS</tag>
+    <description>Retry cache entry timeout for SCM's Ratis server.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.minimum.timeout</name>
+    <value>1s</value>
+    <tag>OZONE, SCM, HA, RATIS</tag>
+    <description>The minimum timeout duration for SCM's Ratis server rpc.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.leader.election.minimum.timeout.duration</name>
+    <value>1s</value>
+    <tag>OZONE, SCM, HA, RATIS</tag>
+    <description>The minimum timeout duration for SCM Ratis leader election.
+      Default is 1s.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.server.failure.timeout.duration</name>
+    <value>120s</value>
+    <tag>OZONE, SCM, HA, RATIS</tag>
+    <description>The timeout duration for Ratis server failure detection.
+      Once the threshold is reached, the Ratis state machine will be informed
+      about the failure in the Ratis ring.
+    </description>
+  </property>
+  <property>
+    <name>ozone.scm.ratis.server.role.check.interval</name>
+    <value>15s</value>
+    <tag>OZONE, SCM, HA, RATIS</tag>
+    <description>The interval at which the SCM leader performs a role
+      check on its Ratis server. The Ratis server informs the SCM if it
+      loses the leader role. The scheduled check is a secondary
+      check to ensure that the leader role is updated periodically.
+    </description>
+  </property>
<property>
<name>hdds.metadata.dir</name>
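Note: a hedged sketch (not part of the patch) showing the new keys wired together for a three-node SCM service in code; the service and node IDs are illustrative only:

    import org.apache.hadoop.hdds.conf.OzoneConfiguration;
    import org.apache.hadoop.hdds.scm.ScmConfigKeys;

    public final class ScmHaConfSketch {
      public static void main(String[] args) {
        OzoneConfiguration conf = new OzoneConfiguration();
        conf.setBoolean(ScmConfigKeys.OZONE_SCM_HA_ENABLE_KEY, true);
        // One SCM service with three node IDs, mirroring the example above.
        conf.set(ScmConfigKeys.OZONE_SCM_SERVICE_IDS_KEY, "scmService1");
        conf.set(ScmConfigKeys.OZONE_SCM_NODES_KEY + ".scmService1", "scm1,scm2,scm3");
        // On each SCM host, identify which node this process is.
        conf.set(ScmConfigKeys.OZONE_SCM_NODE_ID_KEY, "scm1");
      }
    }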
diff --git a/hadoop-hdds/config/src/main/java/org/apache/hadoop/hdds/conf/Config.java b/hadoop-hdds/config/src/main/java/org/apache/hadoop/hdds/conf/Config.java
index 316c867e9944..5d4b4774a5a1 100644
--- a/hadoop-hdds/config/src/main/java/org/apache/hadoop/hdds/conf/Config.java
+++ b/hadoop-hdds/config/src/main/java/org/apache/hadoop/hdds/conf/Config.java
@@ -55,5 +55,10 @@
*/
TimeUnit timeUnit() default TimeUnit.MILLISECONDS;
+ /**
+ * If type == SIZE the unit should be defined with this attribute.
+ */
+ StorageUnit sizeUnit() default StorageUnit.BYTES;
+
ConfigTag[] tags();
}
diff --git a/hadoop-hdds/config/src/main/java/org/apache/hadoop/hdds/conf/ConfigTag.java b/hadoop-hdds/config/src/main/java/org/apache/hadoop/hdds/conf/ConfigTag.java
index 3d1d689e36dc..39dcabab6687 100644
--- a/hadoop-hdds/config/src/main/java/org/apache/hadoop/hdds/conf/ConfigTag.java
+++ b/hadoop-hdds/config/src/main/java/org/apache/hadoop/hdds/conf/ConfigTag.java
@@ -43,5 +43,6 @@ public enum ConfigTag {
S3GATEWAY,
DATANODE,
RECON,
- DELETION
+ DELETION,
+ HA
}
diff --git a/hadoop-hdds/container-service/pom.xml b/hadoop-hdds/container-service/pom.xml
index b71f8e3471e7..aaa5302b4b60 100644
--- a/hadoop-hdds/container-service/pom.xml
+++ b/hadoop-hdds/container-service/pom.xml
@@ -46,6 +46,7 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdds-common</artifactId>
<type>test-jar</type>
+      <scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
@@ -55,6 +56,10 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdds-client</artifactId>
</dependency>
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+    </dependency>
<dependency>
<groupId>io.dropwizard.metrics</groupId>
<artifactId>metrics-core</artifactId>
@@ -98,11 +103,11 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
<scope>test</scope>
</dependency>
<dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-hdfs</artifactId>
-      <type>test-jar</type>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <scope>test</scope>
</dependency>
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/StateContext.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/StateContext.java
index 4cd769f4d245..f39755ffe8fc 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/StateContext.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/StateContext.java
@@ -23,6 +23,7 @@
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
+import java.util.Optional;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
@@ -31,6 +32,7 @@
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
@@ -80,6 +82,18 @@ public class StateContext {
private boolean shutdownGracefully = false;
private final AtomicLong threadPoolNotAvailableCount;
+ /**
+ * Term of the latest leader SCM, extracted from SCMCommand.
+ *
+ * Only a leader SCM (whether latest or stale) can send out SCMCommands,
+ * and it saves its term in each SCMCommand. Since the latest leader SCM
+ * always has the highest term, the term can be used to detect SCMCommands
+ * from a stale leader SCM.
+ *
+ * In non-HA mode, the term of every SCMCommand is 0.
+ */
+ private Optional<Long> termOfLeaderSCM = Optional.empty();
+
/**
* Starting with a 2 sec heartbeat frequency which will be updated to the
* real HB frequency after scm registration. With this method the
@@ -470,6 +484,65 @@ public void execute(ExecutorService service, long time, TimeUnit unit)
}
}
+ /**
+ * After startup, the datanode needs to detect the latest leader SCM before
+ * handling any SCMCommand, so that it won't be disturbed by a stale leader SCM.
+ *
+ * The rule is: once a majority of SCMs are in HEARTBEAT state and the datanode
+ * has heard from a leader SCM (commandQueue is not empty), the datanode
+ * initializes termOfLeaderSCM with the max term found in the commandQueue.
+ *
+ * The init process also works in non-HA mode. In that case, the term of every
+ * SCMCommand is 0.
+ */
+ private void initTermOfLeaderSCM() {
+ // only init once
+ if (termOfLeaderSCM.isPresent()) {
+ return;
+ }
+
+ AtomicInteger scmNum = new AtomicInteger(0);
+ AtomicInteger activeScmNum = new AtomicInteger(0);
+
+ getParent().getConnectionManager().getValues()
+ .forEach(endpoint -> {
+ if (endpoint.isPassive()) {
+ return;
+ }
+ scmNum.incrementAndGet();
+ if (endpoint.getState()
+ == EndpointStateMachine.EndPointStates.HEARTBEAT) {
+ activeScmNum.incrementAndGet();
+ }
+ });
+
+ // majority SCMs should be in HEARTBEAT state.
+ if (activeScmNum.get() < scmNum.get() / 2 + 1) {
+ return;
+ }
+
+ // if commandQueue is not empty, init termOfLeaderSCM
+ // with the largest term found in commandQueue
+ commandQueue.stream()
+ .mapToLong(SCMCommand::getTerm)
+ .max()
+ .ifPresent(term -> termOfLeaderSCM = Optional.of(term));
+ }
+
+ /**
+ * Monotonically increases termOfLeaderSCM, always recording the latest
+ * term it has seen.
+ */
+ private void updateTermOfLeaderSCM(SCMCommand<?> command) {
+ if (!termOfLeaderSCM.isPresent()) {
+ LOG.error("should init termOfLeaderSCM before update it.");
+ return;
+ }
+
+ termOfLeaderSCM = Optional.of(
+ Long.max(termOfLeaderSCM.get(), command.getTerm()));
+ }
+
/**
* Returns the next command or null if it is empty.
*
@@ -478,7 +551,26 @@ public void execute(ExecutorService service, long time, TimeUnit unit)
public SCMCommand getNextCommand() {
lock.lock();
try {
- return commandQueue.poll();
+ initTermOfLeaderSCM();
+ if (!termOfLeaderSCM.isPresent()) {
+ return null; // not ready yet
+ }
+
+ while (true) {
+ SCMCommand<?> command = commandQueue.poll();
+ if (command == null) {
+ return null;
+ }
+
+ updateTermOfLeaderSCM(command);
+ if (command.getTerm() == termOfLeaderSCM.get()) {
+ return command;
+ }
+
+ LOG.warn("Detect and drop a SCMCommand {} from stale leader SCM," +
+ " stale term {}, latest term {}.",
+ command, command.getTerm(), termOfLeaderSCM.get());
+ }
} finally {
lock.unlock();
}
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CreatePipelineCommandHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CreatePipelineCommandHandler.java
index 4ad05de2cd48..db4bd76cc25f 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CreatePipelineCommandHandler.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CreatePipelineCommandHandler.java
@@ -96,7 +96,7 @@ public void handle(SCMCommand command, OzoneContainer ozoneContainer,
final RaftPeer peer = RatisHelper.toRaftPeer(d);
try (RaftClient client = RatisHelper.newRaftClient(peer, conf,
ozoneContainer.getTlsClientConfig())) {
- client.groupAdd(group, peer.getId());
+ client.getGroupManagementApi(peer.getId()).add(group);
} catch (AlreadyExistsException ae) {
// do not log
} catch (IOException ioe) {
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java
index da2034d93c2d..eac7b37e3383 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java
@@ -272,6 +272,9 @@ private void processResponse(SCMHeartbeatResponseProto response,
DeleteBlocksCommand db = DeleteBlocksCommand
.getFromProtobuf(
commandResponseProto.getDeleteBlocksCommandProto());
+ if (commandResponseProto.hasTerm()) {
+ db.setTerm(commandResponseProto.getTerm());
+ }
if (!db.blocksTobeDeleted().isEmpty()) {
if (LOG.isDebugEnabled()) {
LOG.debug(DeletedContainerBlocksSummary
@@ -285,6 +288,9 @@ private void processResponse(SCMHeartbeatResponseProto response,
CloseContainerCommand closeContainer =
CloseContainerCommand.getFromProtobuf(
commandResponseProto.getCloseContainerCommandProto());
+ if (commandResponseProto.hasTerm()) {
+ closeContainer.setTerm(commandResponseProto.getTerm());
+ }
if (LOG.isDebugEnabled()) {
LOG.debug("Received SCM container close request for container {}",
closeContainer.getContainerID());
@@ -295,6 +301,9 @@ private void processResponse(SCMHeartbeatResponseProto response,
ReplicateContainerCommand replicateContainerCommand =
ReplicateContainerCommand.getFromProtobuf(
commandResponseProto.getReplicateContainerCommandProto());
+ if (commandResponseProto.hasTerm()) {
+ replicateContainerCommand.setTerm(commandResponseProto.getTerm());
+ }
if (LOG.isDebugEnabled()) {
LOG.debug("Received SCM container replicate request for container {}",
replicateContainerCommand.getContainerID());
@@ -305,6 +314,9 @@ private void processResponse(SCMHeartbeatResponseProto response,
DeleteContainerCommand deleteContainerCommand =
DeleteContainerCommand.getFromProtobuf(
commandResponseProto.getDeleteContainerCommandProto());
+ if (commandResponseProto.hasTerm()) {
+ deleteContainerCommand.setTerm(commandResponseProto.getTerm());
+ }
if (LOG.isDebugEnabled()) {
LOG.debug("Received SCM delete container request for container {}",
deleteContainerCommand.getContainerID());
@@ -315,6 +327,9 @@ private void processResponse(SCMHeartbeatResponseProto response,
CreatePipelineCommand createPipelineCommand =
CreatePipelineCommand.getFromProtobuf(
commandResponseProto.getCreatePipelineCommandProto());
+ if (commandResponseProto.hasTerm()) {
+ createPipelineCommand.setTerm(commandResponseProto.getTerm());
+ }
if (LOG.isDebugEnabled()) {
LOG.debug("Received SCM create pipeline request {}",
createPipelineCommand.getPipelineID());
@@ -325,6 +340,9 @@ private void processResponse(SCMHeartbeatResponseProto response,
ClosePipelineCommand closePipelineCommand =
ClosePipelineCommand.getFromProtobuf(
commandResponseProto.getClosePipelineCommandProto());
+ if (commandResponseProto.hasTerm()) {
+ closePipelineCommand.setTerm(commandResponseProto.getTerm());
+ }
if (LOG.isDebugEnabled()) {
LOG.debug("Received SCM close pipeline request {}",
closePipelineCommand.getPipelineID());
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java
index 89ab976bc88e..1a87ce55e26d 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java
@@ -700,7 +700,7 @@ private synchronized void updateLastApplied() {
* @param index index of the log entry
*/
@Override
- public void notifyIndexUpdate(long term, long index) {
+ public void notifyTermIndexUpdated(long term, long index) {
applyTransactionCompletionMap.put(index, term);
// We need to call updateLastApplied here because now in ratis when a
// node becomes leader, it is checking stateMachineIndex >=
@@ -844,7 +844,7 @@ public void evictStateMachineCache() {
}
@Override
- public void notifySlowness(RoleInfoProto roleInfoProto) {
+ public void notifyFollowerSlowness(RoleInfoProto roleInfoProto) {
ratisServer.handleNodeSlowness(gid, roleInfoProto);
}
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/ContainerCache.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/ContainerCache.java
index a7fa54a1797f..c56c7432adcb 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/ContainerCache.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/ContainerCache.java
@@ -157,7 +157,7 @@ public ReferenceCountedDB getDB(long containerID, String containerDBType,
try {
long start = Time.monotonicNow();
DatanodeStore store = BlockUtils.getUncachedDatanodeStore(containerID,
- containerDBPath, schemaVersion, conf);
+ containerDBPath, schemaVersion, conf, false);
db = new ReferenceCountedDB(store, containerDBPath);
metrics.incDbOpenLatency(Time.monotonicNow() - start);
} catch (Exception e) {
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
index 66cd6573dc33..1dee1bac0e8a 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java
@@ -27,6 +27,8 @@
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.fs.StorageType;
+import org.apache.hadoop.hdds.annotation.InterfaceAudience;
+import org.apache.hadoop.hdds.annotation.InterfaceStability;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.fs.SpaceUsageCheckFactory;
import org.apache.hadoop.hdfs.server.datanode.StorageLocation;
@@ -40,8 +42,6 @@
import org.apache.hadoop.util.Time;
import com.google.common.base.Preconditions;
-import org.apache.yetus.audience.InterfaceAudience;
-import org.apache.yetus.audience.InterfaceStability;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainer.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainer.java
index 1fff7494e87c..a239b5fbd8a5 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainer.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainer.java
@@ -528,6 +528,9 @@ public void exportContainerData(OutputStream destination,
+ getContainerData().getContainerID() + " is in state " + state);
}
compactDB();
+ // Close DB (and remove from cache) to avoid concurrent modification while
+ // packing it.
+ BlockUtils.removeDB(containerData, config);
packer.pack(this, destination);
}
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/helpers/BlockUtils.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/helpers/BlockUtils.java
index 0a8d692afd95..e842d17f2ace 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/helpers/BlockUtils.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/helpers/BlockUtils.java
@@ -61,15 +61,15 @@ private BlockUtils() {
*/
public static DatanodeStore getUncachedDatanodeStore(long containerID,
String containerDBPath, String schemaVersion,
- ConfigurationSource conf) throws IOException {
+ ConfigurationSource conf, boolean readOnly) throws IOException {
DatanodeStore store;
if (schemaVersion.equals(OzoneConsts.SCHEMA_V1)) {
store = new DatanodeStoreSchemaOneImpl(conf,
- containerID, containerDBPath);
+ containerID, containerDBPath, readOnly);
} else if (schemaVersion.equals(OzoneConsts.SCHEMA_V2)) {
store = new DatanodeStoreSchemaTwoImpl(conf,
- containerID, containerDBPath);
+ containerID, containerDBPath, readOnly);
} else {
throw new IllegalArgumentException(
"Unrecognized database schema version: " + schemaVersion);
@@ -88,11 +88,11 @@ public static DatanodeStore getUncachedDatanodeStore(long containerID,
* @throws IOException
*/
public static DatanodeStore getUncachedDatanodeStore(
- KeyValueContainerData containerData, ConfigurationSource conf)
- throws IOException {
+ KeyValueContainerData containerData, ConfigurationSource conf,
+ boolean readOnly) throws IOException {
return getUncachedDatanodeStore(containerData.getContainerID(),
containerData.getDbFile().getAbsolutePath(),
- containerData.getSchemaVersion(), conf);
+ containerData.getSchemaVersion(), conf, readOnly);
}
/**
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/helpers/KeyValueContainerUtil.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/helpers/KeyValueContainerUtil.java
index 1780b1ebf0e3..7c75108d7d83 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/helpers/KeyValueContainerUtil.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/helpers/KeyValueContainerUtil.java
@@ -106,10 +106,10 @@ public static void createContainerMetaData(long containerID,
DatanodeStore store;
if (schemaVersion.equals(OzoneConsts.SCHEMA_V1)) {
store = new DatanodeStoreSchemaOneImpl(conf,
- containerID, dbFile.getAbsolutePath());
+ containerID, dbFile.getAbsolutePath(), false);
} else if (schemaVersion.equals(OzoneConsts.SCHEMA_V2)) {
store = new DatanodeStoreSchemaTwoImpl(conf,
- containerID, dbFile.getAbsolutePath());
+ containerID, dbFile.getAbsolutePath(), false);
} else {
throw new IllegalArgumentException(
"Unrecognized schema version for container: " + schemaVersion);
@@ -192,7 +192,8 @@ public static void parseKVContainerData(KeyValueContainerData kvContainerData,
DatanodeStore store = null;
try {
try {
- store = BlockUtils.getUncachedDatanodeStore(kvContainerData, config);
+ store = BlockUtils.getUncachedDatanodeStore(
+ kvContainerData, config, true);
} catch (IOException e) {
// If an exception is thrown, then it may indicate the RocksDB is
// already open in the container cache. As this code is only executed at
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/statemachine/background/BlockDeletingService.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/statemachine/background/BlockDeletingService.java
index a373c21e89a0..b03b7d7ad657 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/statemachine/background/BlockDeletingService.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/statemachine/background/BlockDeletingService.java
@@ -29,7 +29,6 @@
import java.util.stream.Collectors;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
-import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException;
import org.apache.hadoop.hdds.scm.pipeline.PipelineID;
@@ -41,9 +40,7 @@
import org.apache.hadoop.hdds.utils.MetadataKeyFilters;
import org.apache.hadoop.hdds.utils.MetadataKeyFilters.KeyPrefixFilter;
import org.apache.hadoop.hdds.utils.db.Table;
-import org.apache.hadoop.ozone.OzoneConsts;
import org.apache.hadoop.ozone.container.common.helpers.BlockData;
-import org.apache.hadoop.ozone.container.common.helpers.ChunkInfoList;
import org.apache.hadoop.ozone.container.common.impl.ContainerData;
import org.apache.hadoop.ozone.container.common.impl.TopNOrderedContainerDeletionChoosingPolicy;
import org.apache.hadoop.ozone.container.common.interfaces.Container;
@@ -293,29 +290,15 @@ public BackgroundTaskResult call() throws Exception {
}
}
- // Once files are deleted... replace deleting entries with deleted
- // entries
+ // Once blocks are deleted... remove the blockID from blockDataTable.
try(BatchOperation batch = meta.getStore().getBatchHandler()
.initBatchOperation()) {
- Table< String, ChunkInfoList > deletedBlocksTable =
- meta.getStore().getDeletedBlocksTable();
for (String entry : succeedBlocks) {
- List< ContainerProtos.ChunkInfo > chunkList =
- blockDataTable.get(entry).getChunks();
- String blockId = entry.substring(
- OzoneConsts.DELETING_KEY_PREFIX.length());
-
- deletedBlocksTable.putWithBatch(
- batch, blockId,
- new ChunkInfoList(chunkList));
blockDataTable.deleteWithBatch(batch, entry);
}
-
int deleteBlockCount = succeedBlocks.size();
containerData.updateAndCommitDBCounters(meta, batch,
deleteBlockCount);
-
-
// update count of pending deletion blocks and block count in
// in-memory container status.
containerData.decrPendingDeletionBlocks(deleteBlockCount);
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/AbstractDatanodeStore.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/AbstractDatanodeStore.java
index efbc24730af7..12921af1ead3 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/AbstractDatanodeStore.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/AbstractDatanodeStore.java
@@ -77,6 +77,7 @@ public abstract class AbstractDatanodeStore implements DatanodeStore {
private static final DBProfile DEFAULT_PROFILE = DBProfile.DISK;
private static final Map
OPTIONS_CACHE = new ConcurrentHashMap<>();
+ private final boolean openReadOnly;
/**
* Constructs the metadata store and starts the DB services.
@@ -85,7 +86,8 @@ public abstract class AbstractDatanodeStore implements DatanodeStore {
* @throws IOException - on Failure.
*/
protected AbstractDatanodeStore(ConfigurationSource config, long containerID,
- AbstractDatanodeDBDefinition dbDef) throws IOException {
+ AbstractDatanodeDBDefinition dbDef, boolean openReadOnly)
+ throws IOException {
// The same config instance is used on each datanode, so we can share the
// corresponding column family options, providing a single shared cache
@@ -97,6 +99,7 @@ protected AbstractDatanodeStore(ConfigurationSource config, long containerID,
this.dbDef = dbDef;
this.containerID = containerID;
+ this.openReadOnly = openReadOnly;
start(config);
}
@@ -121,6 +124,7 @@ public void start(ConfigurationSource config)
this.store = DBStoreBuilder.newBuilder(config, dbDef)
.setDBOptions(options)
.setDefaultCFOptions(cfOptions)
+ .setOpenReadOnly(openReadOnly)
.build();
// Use the DatanodeTable wrapper to disable the table iterator on
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/DatanodeStoreSchemaOneImpl.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/DatanodeStoreSchemaOneImpl.java
index 97b9b25e275d..b72f19eeeb51 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/DatanodeStoreSchemaOneImpl.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/DatanodeStoreSchemaOneImpl.java
@@ -35,9 +35,10 @@ public class DatanodeStoreSchemaOneImpl extends AbstractDatanodeStore {
* @throws IOException - on Failure.
*/
public DatanodeStoreSchemaOneImpl(ConfigurationSource config,
- long containerID, String dbPath)
- throws IOException {
- super(config, containerID, new DatanodeSchemaOneDBDefinition(dbPath));
+ long containerID, String dbPath, boolean openReadOnly)
+ throws IOException {
+ super(config, containerID, new DatanodeSchemaOneDBDefinition(dbPath),
+ openReadOnly);
}
@Override
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/DatanodeStoreSchemaTwoImpl.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/DatanodeStoreSchemaTwoImpl.java
index fd8e4fa9d087..df9b8c06712d 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/DatanodeStoreSchemaTwoImpl.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/metadata/DatanodeStoreSchemaTwoImpl.java
@@ -37,8 +37,9 @@ public class DatanodeStoreSchemaTwoImpl extends AbstractDatanodeStore {
* @throws IOException - on Failure.
*/
public DatanodeStoreSchemaTwoImpl(ConfigurationSource config,
- long containerID, String dbPath)
- throws IOException {
- super(config, containerID, new DatanodeSchemaTwoDBDefinition(dbPath));
+ long containerID, String dbPath, boolean openReadOnly)
+ throws IOException {
+ super(config, containerID, new DatanodeSchemaTwoDBDefinition(dbPath),
+ openReadOnly);
}
}
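Note: a hedged sketch (not part of the patch) of the read-only open path the new flag enables; the caller is expected to stop/close the returned store when done (that method is not shown here):

    import org.apache.hadoop.hdds.conf.OzoneConfiguration;
    import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;
    import org.apache.hadoop.ozone.container.keyvalue.helpers.BlockUtils;
    import org.apache.hadoop.ozone.container.metadata.DatanodeStore;

    public final class ReadOnlyStoreSketch {
      private ReadOnlyStoreSketch() { }

      // Opens the container DB without write access, so inspection cannot
      // collide with a writer that already holds the RocksDB handle.
      static DatanodeStore openForInspection(KeyValueContainerData data,
          OzoneConfiguration conf) throws Exception {
        return BlockUtils.getUncachedDatanodeStore(data, conf, true /* readOnly */);
      }
    }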
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
index a44ef384362b..5fd1690c1f72 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
@@ -25,6 +25,7 @@
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
@@ -85,6 +86,7 @@ public class OzoneContainer {
private List dataScanners;
private final BlockDeletingService blockDeletingService;
private final GrpcTlsConfig tlsClientConfig;
+ private final AtomicBoolean isStarted;
/**
* Construct OzoneContainer object.
@@ -152,6 +154,8 @@ public OzoneContainer(DatanodeDetails datanodeDetails, ConfigurationSource
TimeUnit.MILLISECONDS, config);
tlsClientConfig = RatisHelper.createTlsClientConfig(
secConf, certClient != null ? certClient.getCACertificate() : null);
+
+ isStarted = new AtomicBoolean(false);
}
public GrpcTlsConfig getTlsClientConfig() {
@@ -240,6 +244,10 @@ private void stopContainerScrub() {
* @throws IOException
*/
public void start(String scmId) throws IOException {
+ if (!isStarted.compareAndSet(false, true)) {
+ LOG.info("Ignore. OzoneContainer already started.");
+ return;
+ }
LOG.info("Attempting to start container services.");
startContainerScrub();
writeChannel.start();
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/GrpcOutputStream.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/GrpcOutputStream.java
index 4303bb16bab8..c09c8f6743e7 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/GrpcOutputStream.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/GrpcOutputStream.java
@@ -44,7 +44,7 @@ class GrpcOutputStream extends OutputStream {
private final int bufferSize;
- private int writtenBytes;
+ private long writtenBytes;
GrpcOutputStream(
StreamObserver responseObserver,
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/SCMCommand.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/SCMCommand.java
index 3c4e05b424af..4d87bb096cb6 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/SCMCommand.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/SCMCommand.java
@@ -30,7 +30,13 @@
*/
public abstract class SCMCommand implements
IdentifiableEventPayload {
- private long id;
+ private final long id;
+
+ // Under HA mode, holds term of underlying RaftServer iff current
+ // SCM is a leader, otherwise, holds term 0.
+ // Note that the first elected leader starts from term 1; term 0,
+ // as the initial value of currentTerm, is never used under HA mode.
+ private long term = 0;
SCMCommand() {
this.id = HddsIdFactory.getLongId();
@@ -59,4 +65,18 @@ public long getId() {
return id;
}
+ /**
+ * Get term of this command.
+ * @return term
+ */
+ public long getTerm() {
+ return term;
+ }
+
+ /**
+ * Set term of this command.
+ */
+ public void setTerm(long term) {
+ this.term = term;
+ }
}
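Note: a hedged sketch (not part of the patch) of the leader-side pattern this field supports: the SCM stamps its current Raft term on a command before queueing it for a datanode. The CloseContainerCommand constructor arguments and the currentTerm parameter are illustrative assumptions:

    import org.apache.hadoop.hdds.scm.pipeline.PipelineID;
    import org.apache.hadoop.ozone.protocol.commands.CloseContainerCommand;
    import org.apache.hadoop.ozone.protocol.commands.SCMCommand;

    final class CommandTermSketch {
      private CommandTermSketch() { }

      static SCMCommand stampTerm(long containerId, PipelineID pipelineId, long currentTerm) {
        SCMCommand command = new CloseContainerCommand(containerId, pipelineId);
        command.setTerm(currentTerm);  // datanodes drop commands carrying a stale term
        return command;
      }
    }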
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestBlockDeletingService.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestBlockDeletingService.java
index 2fb577c79f98..2eb6a394e060 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestBlockDeletingService.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestBlockDeletingService.java
@@ -20,8 +20,7 @@
import java.io.File;
import java.io.IOException;
-import java.time.Duration;
-import java.util.Iterator;
+import java.nio.ByteBuffer;
import java.util.List;
import java.util.Map;
import java.util.UUID;
@@ -38,37 +37,40 @@
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.utils.BackgroundService;
import org.apache.hadoop.hdds.utils.MetadataKeyFilters;
-import org.apache.hadoop.hdds.utils.db.Table;
import org.apache.hadoop.ozone.OzoneConsts;
import org.apache.hadoop.ozone.common.Checksum;
+import org.apache.hadoop.ozone.common.ChunkBuffer;
import org.apache.hadoop.ozone.container.ContainerTestHelper;
import org.apache.hadoop.ozone.container.common.helpers.BlockData;
-import org.apache.hadoop.ozone.container.common.helpers.ChunkInfoList;
+import org.apache.hadoop.ozone.container.common.helpers.ChunkInfo;
+import org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics;
import org.apache.hadoop.ozone.container.common.impl.ChunkLayOutVersion;
import org.apache.hadoop.ozone.container.common.impl.ContainerData;
import org.apache.hadoop.ozone.container.common.impl.ContainerSet;
import org.apache.hadoop.ozone.container.common.impl.TopNOrderedContainerDeletionChoosingPolicy;
import org.apache.hadoop.ozone.container.common.interfaces.Container;
import org.apache.hadoop.ozone.container.common.interfaces.ContainerDispatcher;
-import org.apache.hadoop.ozone.container.common.interfaces.Handler;
-import org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration;
+import org.apache.hadoop.ozone.container.common.transport.server.ratis.DispatcherContext;
import org.apache.hadoop.ozone.container.common.utils.ReferenceCountedDB;
import org.apache.hadoop.ozone.container.common.volume.MutableVolumeSet;
import org.apache.hadoop.ozone.container.common.volume.RoundRobinVolumeChoosingPolicy;
+import org.apache.hadoop.ozone.container.common.volume.VolumeSet;
import org.apache.hadoop.ozone.container.keyvalue.ChunkLayoutTestInfo;
import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainer;
import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;
import org.apache.hadoop.ozone.container.keyvalue.KeyValueHandler;
import org.apache.hadoop.ozone.container.keyvalue.helpers.BlockUtils;
+import org.apache.hadoop.ozone.container.keyvalue.impl.FilePerBlockStrategy;
+import org.apache.hadoop.ozone.container.keyvalue.impl.FilePerChunkStrategy;
+import org.apache.hadoop.ozone.container.keyvalue.interfaces.ChunkManager;
import org.apache.hadoop.ozone.container.keyvalue.statemachine.background.BlockDeletingService;
import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer;
import org.apache.hadoop.ozone.container.testutils.BlockDeletingServiceTestImpl;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.test.GenericTestUtils.LogCapturer;
+import static org.apache.commons.lang3.RandomStringUtils.randomAlphanumeric;
-import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_CONTAINER_LIMIT_PER_INTERVAL;
-import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
@@ -76,12 +78,15 @@
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
-
+import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_CONTAINER_LIMIT_PER_INTERVAL;
+import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER;
+import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER_DEFAULT;
+import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_INTERVAL;
+import static org.apache.hadoop.ozone.container.common.impl.ChunkLayOutVersion.FILE_PER_BLOCK;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.times;
-import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
+import static java.nio.charset.StandardCharsets.UTF_8;
/**
* Tests to test block deleting service.
@@ -92,9 +97,12 @@ public class TestBlockDeletingService {
private static File testRoot;
private static String scmId;
private static String clusterID;
- private Handler handler;
+ private static String datanodeUuid;
+ private static MutableConfigurationSource conf;
private final ChunkLayOutVersion layout;
+ private int blockLimitPerTask;
+ private static VolumeSet volumeSet;
public TestBlockDeletingService(ChunkLayOutVersion layout) {
this.layout = layout;
@@ -114,6 +122,10 @@ public static void init() throws IOException {
}
scmId = UUID.randomUUID().toString();
clusterID = UUID.randomUUID().toString();
+ conf = new OzoneConfiguration();
+ conf.set(ScmConfigKeys.HDDS_DATANODE_DIR_KEY, testRoot.getAbsolutePath());
+ datanodeUuid = UUID.randomUUID().toString();
+ volumeSet = new MutableVolumeSet(datanodeUuid, conf);
}
@AfterClass
@@ -121,31 +133,45 @@ public static void cleanup() throws IOException {
FileUtils.deleteDirectory(testRoot);
}
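+ // Each chunk in this test is written in two stages, a data write followed
+ // by a commit, using these two dispatcher contexts.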
+ private static final DispatcherContext WRITE_STAGE =
+ new DispatcherContext.Builder()
+ .setStage(DispatcherContext.WriteChunkStage.WRITE_DATA).build();
+
+ private static final DispatcherContext COMMIT_STAGE =
+ new DispatcherContext.Builder()
+ .setStage(DispatcherContext.WriteChunkStage.COMMIT_DATA).build();
+
/**
* A helper method to create some blocks and put them under deletion
* state for testing. This method directly updates container.db and
* creates some fake chunk files for testing.
*/
private void createToDeleteBlocks(ContainerSet containerSet,
- MutableConfigurationSource conf, int numOfContainers,
+ int numOfContainers,
int numOfBlocksPerContainer,
int numOfChunksPerBlock) throws IOException {
+ ChunkManager chunkManager;
+ if (layout == FILE_PER_BLOCK) {
+ chunkManager = new FilePerBlockStrategy(true);
+ } else {
+ chunkManager = new FilePerChunkStrategy(true, null);
+ }
+ byte[] arr = randomAlphanumeric(1048576).getBytes(UTF_8);
+ ChunkBuffer buffer = ChunkBuffer.wrap(ByteBuffer.wrap(arr));
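+ // A single 1 MB buffer of random data is shared by all writes below; each
+ // chunk write uses a duplicated slice of it.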
for (int x = 0; x < numOfContainers; x++) {
- conf.set(ScmConfigKeys.HDDS_DATANODE_DIR_KEY, testRoot.getAbsolutePath());
long containerID = ContainerTestHelper.getTestContainerID();
- KeyValueContainerData data = new KeyValueContainerData(containerID,
- layout,
- ContainerTestHelper.CONTAINER_MAX_SIZE, UUID.randomUUID().toString(),
- UUID.randomUUID().toString());
+ KeyValueContainerData data =
+ new KeyValueContainerData(containerID, layout,
+ ContainerTestHelper.CONTAINER_MAX_SIZE,
+ UUID.randomUUID().toString(), datanodeUuid);
data.closeContainer();
KeyValueContainer container = new KeyValueContainer(data, conf);
- container.create(new MutableVolumeSet(scmId, clusterID, conf),
+ container.create(volumeSet,
new RoundRobinVolumeChoosingPolicy(), scmId);
containerSet.addContainer(container);
data = (KeyValueContainerData) containerSet.getContainer(
containerID).getContainerData();
-
- long blockLength = 100;
+ long chunkLength = 100;
try(ReferenceCountedDB metadata = BlockUtils.getDB(data, conf)) {
for (int j = 0; j < numOfBlocksPerContainer; j++) {
BlockID blockID =
@@ -155,30 +181,35 @@ private void createToDeleteBlocks(ContainerSet containerSet,
BlockData kd = new BlockData(blockID);
List<ContainerProtos.ChunkInfo> chunks = Lists.newArrayList();
for (int k = 0; k < numOfChunksPerBlock; k++) {
+ final String chunkName = String.format("block.%d.chunk.%d", j, k);
+ final long offset = k * chunkLength;
ContainerProtos.ChunkInfo info =
ContainerProtos.ChunkInfo.newBuilder()
- .setChunkName(blockID.getLocalID() + "_chunk_" + k)
- .setLen(blockLength)
- .setOffset(0)
+ .setChunkName(chunkName)
+ .setLen(chunkLength)
+ .setOffset(offset)
.setChecksumData(Checksum.getNoChecksumDataProto())
.build();
chunks.add(info);
+ ChunkInfo chunkInfo = new ChunkInfo(chunkName, offset, chunkLength);
+ ChunkBuffer chunkData = buffer.duplicate(0, (int) chunkLength);
+ chunkManager.writeChunk(container, blockID, chunkInfo, chunkData,
+ WRITE_STAGE);
+ chunkManager.writeChunk(container, blockID, chunkInfo, chunkData,
+ COMMIT_STAGE);
}
kd.setChunks(chunks);
metadata.getStore().getBlockDataTable().put(
deleteStateName, kd);
container.getContainerData().incrPendingDeletionBlocks(1);
}
-
container.getContainerData().setKeyCount(numOfBlocksPerContainer);
- container.getContainerData().setBytesUsed(
- blockLength * numOfBlocksPerContainer);
// Set block count, bytes used and pending delete block count.
metadata.getStore().getMetadataTable().put(
OzoneConsts.BLOCK_COUNT, (long)numOfBlocksPerContainer);
metadata.getStore().getMetadataTable().put(
OzoneConsts.CONTAINER_BYTES_USED,
- blockLength * numOfBlocksPerContainer);
+ chunkLength * numOfChunksPerBlock * numOfBlocksPerContainer);
metadata.getStore().getMetadataTable().put(
OzoneConsts.PENDING_DELETE_BLOCK_COUNT,
(long)numOfBlocksPerContainer);
@@ -207,21 +238,23 @@ private int getUnderDeletionBlocksCount(ReferenceCountedDB meta)
MetadataKeyFilters.getDeletingKeyFilter()).size();
}
- private int getDeletedBlocksCount(ReferenceCountedDB db) throws IOException {
- return db.getStore().getDeletedBlocksTable()
- .getRangeKVs(null, 100).size();
- }
@Test
public void testBlockDeletion() throws Exception {
- OzoneConfiguration conf = new OzoneConfiguration();
conf.setInt(OZONE_BLOCK_DELETING_CONTAINER_LIMIT_PER_INTERVAL, 10);
conf.setInt(OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER, 2);
+ this.blockLimitPerTask =
+ conf.getInt(OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER,
+ OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER_DEFAULT);
ContainerSet containerSet = new ContainerSet();
- createToDeleteBlocks(containerSet, conf, 1, 3, 1);
-
+ createToDeleteBlocks(containerSet, 1, 3, 1);
+ ContainerMetrics metrics = ContainerMetrics.create(conf);
+ KeyValueHandler keyValueHandler =
+ new KeyValueHandler(conf, datanodeUuid, containerSet, volumeSet,
+ metrics, c -> {
+ });
BlockDeletingServiceTestImpl svc =
- getBlockDeletingService(containerSet, conf);
+ getBlockDeletingService(containerSet, conf, keyValueHandler);
svc.start();
GenericTestUtils.waitFor(svc::isStarted, 100, 3000);
@@ -240,40 +273,43 @@ public void testBlockDeletion() throws Exception {
.get(containerData.get(0).getContainerID()).getContainerData())
.getDeleteTransactionId();
-
+ long containerSpace = containerData.get(0).getBytesUsed();
// Number of deleted blocks in container should be equal to 0 before
// block delete
+
Assert.assertEquals(0, transactionId);
// Ensure there are 3 blocks under deletion and 0 deleted blocks
Assert.assertEquals(3, getUnderDeletionBlocksCount(meta));
- Assert.assertEquals(3,
- meta.getStore().getMetadataTable()
- .get(OzoneConsts.PENDING_DELETE_BLOCK_COUNT).longValue());
- Assert.assertEquals(0, getDeletedBlocksCount(meta));
+ Assert.assertEquals(3, meta.getStore().getMetadataTable()
+ .get(OzoneConsts.PENDING_DELETE_BLOCK_COUNT).longValue());
+
+ // Container contains 3 blocks. So, space used by the container
+ // should be greater than zero.
+ Assert.assertTrue(containerSpace > 0);
// An interval will delete 1 * 2 blocks
deleteAndWait(svc, 1);
- Assert.assertEquals(1, getUnderDeletionBlocksCount(meta));
- Assert.assertEquals(2, getDeletedBlocksCount(meta));
- deleteAndWait(svc, 2);
- Assert.assertEquals(0, getUnderDeletionBlocksCount(meta));
- Assert.assertEquals(3, getDeletedBlocksCount(meta));
+ // After the first interval 2 blocks will be deleted. Hence, the current
+ // space used by the container should be less than the space used by the
+ // container initially (before running the deletion service).
+ Assert.assertTrue(containerData.get(0).getBytesUsed() < containerSpace);
- deleteAndWait(svc, 3);
- Assert.assertEquals(0, getUnderDeletionBlocksCount(meta));
- Assert.assertEquals(3, getDeletedBlocksCount(meta));
+ deleteAndWait(svc, 2);
+ // After deletion of all 3 blocks, the space used by the container
+ // should be zero.
+ containerSpace = containerData.get(0).getBytesUsed();
+ Assert.assertEquals(0, containerSpace);
// Check finally DB counters.
// Not checking bytes used, as handler is a mock call.
+ Assert.assertEquals(0, meta.getStore().getMetadataTable()
+ .get(OzoneConsts.PENDING_DELETE_BLOCK_COUNT).longValue());
Assert.assertEquals(0,
- meta.getStore().getMetadataTable()
- .get(OzoneConsts.PENDING_DELETE_BLOCK_COUNT).longValue());
- Assert.assertEquals(0,
- meta.getStore().getMetadataTable()
- .get(OzoneConsts.BLOCK_COUNT).longValue());
+ meta.getStore().getMetadataTable().get(OzoneConsts.BLOCK_COUNT)
+ .longValue());
}
svc.shutdown();
@@ -282,19 +318,20 @@ public void testBlockDeletion() throws Exception {
@Test
@SuppressWarnings("java:S2699") // waitFor => assertion with timeout
public void testShutdownService() throws Exception {
- OzoneConfiguration conf = new OzoneConfiguration();
- DatanodeConfiguration datanodeConfiguration = conf.getObject(
- DatanodeConfiguration.class);
- datanodeConfiguration.setBlockDeletionInterval(Duration.ofMillis(500));
- conf.setFromObject(datanodeConfiguration);
+ conf.setTimeDuration(OZONE_BLOCK_DELETING_SERVICE_INTERVAL, 500,
+ TimeUnit.MILLISECONDS);
conf.setInt(OZONE_BLOCK_DELETING_CONTAINER_LIMIT_PER_INTERVAL, 10);
conf.setInt(OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER, 10);
ContainerSet containerSet = new ContainerSet();
// Create 1 container with 100 blocks
- createToDeleteBlocks(containerSet, conf, 1, 100, 1);
-
+ createToDeleteBlocks(containerSet, 1, 100, 1);
+ ContainerMetrics metrics = ContainerMetrics.create(conf);
+ KeyValueHandler keyValueHandler =
+ new KeyValueHandler(conf, datanodeUuid, containerSet, volumeSet,
+ metrics, c -> {
+ });
BlockDeletingServiceTestImpl service =
- getBlockDeletingService(containerSet, conf);
+ getBlockDeletingService(containerSet, conf, keyValueHandler);
service.start();
GenericTestUtils.waitFor(service::isStarted, 100, 3000);
@@ -309,15 +346,19 @@ public void testShutdownService() throws Exception {
@Test
public void testBlockDeletionTimeout() throws Exception {
- OzoneConfiguration conf = new OzoneConfiguration();
conf.setInt(OZONE_BLOCK_DELETING_CONTAINER_LIMIT_PER_INTERVAL, 10);
conf.setInt(OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER, 2);
ContainerSet containerSet = new ContainerSet();
- createToDeleteBlocks(containerSet, conf, 1, 3, 1);
-
+ createToDeleteBlocks(containerSet, 1, 3, 1);
+ ContainerMetrics metrics = ContainerMetrics.create(conf);
+ KeyValueHandler keyValueHandler =
+ new KeyValueHandler(conf, datanodeUuid, containerSet, volumeSet,
+ metrics, c -> {
+ });
// set timeout value as 1ns to trigger timeout behavior
long timeout = 1;
- OzoneContainer ozoneContainer = mockDependencies(containerSet);
+ OzoneContainer ozoneContainer =
+ mockDependencies(containerSet, keyValueHandler);
BlockDeletingService svc = new BlockDeletingService(ozoneContainer,
TimeUnit.MILLISECONDS.toNanos(1000), timeout, TimeUnit.NANOSECONDS,
conf);
@@ -338,7 +379,7 @@ public void testBlockDeletionTimeout() throws Exception {
// test for normal case that doesn't have timeout limitation
timeout = 0;
- createToDeleteBlocks(containerSet, conf, 1, 3, 1);
+ createToDeleteBlocks(containerSet, 1, 3, 1);
svc = new BlockDeletingService(ozoneContainer,
TimeUnit.MILLISECONDS.toNanos(1000), timeout, TimeUnit.MILLISECONDS,
conf);
@@ -369,19 +410,21 @@ public void testBlockDeletionTimeout() throws Exception {
}
private BlockDeletingServiceTestImpl getBlockDeletingService(
- ContainerSet containerSet, ConfigurationSource conf) {
- OzoneContainer ozoneContainer = mockDependencies(containerSet);
- return new BlockDeletingServiceTestImpl(ozoneContainer, 1000, conf);
+ ContainerSet containerSet, ConfigurationSource config,
+ KeyValueHandler keyValueHandler) {
+ OzoneContainer ozoneContainer =
+ mockDependencies(containerSet, keyValueHandler);
+ return new BlockDeletingServiceTestImpl(ozoneContainer, 1000, config);
}
- private OzoneContainer mockDependencies(ContainerSet containerSet) {
+ private OzoneContainer mockDependencies(ContainerSet containerSet,
+ KeyValueHandler keyValueHandler) {
OzoneContainer ozoneContainer = mock(OzoneContainer.class);
when(ozoneContainer.getContainerSet()).thenReturn(containerSet);
when(ozoneContainer.getWriteChannel()).thenReturn(null);
ContainerDispatcher dispatcher = mock(ContainerDispatcher.class);
when(ozoneContainer.getDispatcher()).thenReturn(dispatcher);
- handler = mock(KeyValueHandler.class);
- when(dispatcher.getHandler(any())).thenReturn(handler);
+ when(dispatcher.getHandler(any())).thenReturn(keyValueHandler);
return ozoneContainer;
}
@@ -396,7 +439,6 @@ public void testContainerThrottle() throws Exception {
//
// Each time only 1 container can be processed, so each time
// 1 block from 1 container can be deleted.
- OzoneConfiguration conf = new OzoneConfiguration();
// Process 1 container per interval
conf.set(
ScmConfigKeys.OZONE_SCM_KEY_VALUE_CONTAINER_DELETION_CHOOSING_POLICY,
@@ -404,28 +446,54 @@ public void testContainerThrottle() throws Exception {
conf.setInt(OZONE_BLOCK_DELETING_CONTAINER_LIMIT_PER_INTERVAL, 1);
conf.setInt(OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER, 1);
ContainerSet containerSet = new ContainerSet();
+
int containerCount = 2;
int chunksPerBlock = 10;
int blocksPerContainer = 1;
- createToDeleteBlocks(containerSet, conf, containerCount, blocksPerContainer,
+ createToDeleteBlocks(containerSet, containerCount, blocksPerContainer,
chunksPerBlock);
-
+ ContainerMetrics metrics = ContainerMetrics.create(conf);
+ KeyValueHandler keyValueHandler =
+ new KeyValueHandler(conf, datanodeUuid, containerSet, volumeSet,
+ metrics, c -> {
+ });
BlockDeletingServiceTestImpl service =
- getBlockDeletingService(containerSet, conf);
+ getBlockDeletingService(containerSet, conf, keyValueHandler);
service.start();
-
+ List<ContainerData> containerData = Lists.newArrayList();
+ containerSet.listContainer(0L, containerCount, containerData);
try {
GenericTestUtils.waitFor(service::isStarted, 100, 3000);
- for (int i = 1; i <= containerCount; i++) {
- deleteAndWait(service, i);
- verify(handler, times(i * blocksPerContainer))
- .deleteBlock(any(), any());
- }
+
+ // Delete one of the two containers and its single block.
+ // Hence, the space used by the container whose block has been
+ // deleted should be zero.
+ deleteAndWait(service, 1);
+ Assert.assertTrue((containerData.get(0).getBytesUsed() == 0)
+ || containerData.get(1).getBytesUsed() == 0);
+
+ Assert.assertFalse((containerData.get(0).getBytesUsed() == 0) && (
+ containerData.get(1).getBytesUsed() == 0));
+
+ // Deleting the second container. Hence, space used by both the
+ // containers should be zero.
+ deleteAndWait(service, 2);
+
+ Assert.assertTrue((containerData.get(0).getBytesUsed() == 0) && (
+ containerData.get(1).getBytesUsed() == 0));
} finally {
service.shutdown();
}
}
+ public long currentBlockSpace(List<ContainerData> containerData,
+ int totalContainers) {
+ long totalSpaceUsed = 0;
+ for (int i = 0; i < totalContainers; i++) {
+ totalSpaceUsed += containerData.get(i).getBytesUsed();
+ }
+ return totalSpaceUsed;
+ }
@Test(timeout = 30000)
public void testBlockThrottle() throws Exception {
@@ -439,92 +507,54 @@ public void testBlockThrottle() throws Exception {
// Each time containers can be all scanned, but only 2 blocks
// per container can be actually deleted. So it requires 2 waves
// to cleanup all blocks.
- OzoneConfiguration conf = new OzoneConfiguration();
conf.setInt(OZONE_BLOCK_DELETING_CONTAINER_LIMIT_PER_INTERVAL, 10);
- int blockLimitPerTask = 2;
+ blockLimitPerTask = 2;
conf.setInt(OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER, blockLimitPerTask);
ContainerSet containerSet = new ContainerSet();
+ ContainerMetrics metrics = ContainerMetrics.create(conf);
+ KeyValueHandler keyValueHandler =
+ new KeyValueHandler(conf, datanodeUuid, containerSet, volumeSet,
+ metrics, c -> {
+ });
int containerCount = 5;
int blocksPerContainer = 3;
- createToDeleteBlocks(containerSet, conf, containerCount,
+ createToDeleteBlocks(containerSet, containerCount,
blocksPerContainer, 1);
BlockDeletingServiceTestImpl service =
- getBlockDeletingService(containerSet, conf);
+ getBlockDeletingService(containerSet, conf, keyValueHandler);
service.start();
-
+ List<ContainerData> containerData = Lists.newArrayList();
+ containerSet.listContainer(0L, containerCount, containerData);
+ long blockSpace = containerData.get(0).getBytesUsed() / blocksPerContainer;
+ long totalContainerSpace =
+ containerCount * containerData.get(0).getBytesUsed();
try {
GenericTestUtils.waitFor(service::isStarted, 100, 3000);
// Total blocks = 3 * 5 = 15
// block per task = 2
// number of containers = 5
// each interval will at most runDeletingTasks 5 * 2 = 10 blocks
+
+ // Deleted space of 10 blocks should be equal to (initial total space
+ // of container - current total space of container).
deleteAndWait(service, 1);
- verify(handler, times(blockLimitPerTask * containerCount))
- .deleteBlock(any(), any());
+ Assert.assertEquals(blockLimitPerTask * containerCount * blockSpace,
+ (totalContainerSpace - currentBlockSpace(containerData,
+ containerCount)));
// There are only 5 blocks left to runDeletingTasks
+
+ // (Deleted space of the previous 10 blocks + these remaining 5 blocks)
+ // should be equal to (initial total space of the containers
+ // - current total space of the containers, which will be zero once all
+ // blocks in all the containers are deleted).
deleteAndWait(service, 2);
- verify(handler, times(
- blocksPerContainer * containerCount))
- .deleteBlock(any(), any());
+ Assert.assertEquals(blocksPerContainer * containerCount * blockSpace,
+ (totalContainerSpace - currentBlockSpace(containerData,
+ containerCount)));
} finally {
service.shutdown();
}
}
-
- @Test
- public void testDeletedChunkInfo() throws Exception {
- OzoneConfiguration conf = new OzoneConfiguration();
- conf.setInt(OZONE_BLOCK_DELETING_CONTAINER_LIMIT_PER_INTERVAL, 10);
- conf.setInt(OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER, 2);
- ContainerSet containerSet = new ContainerSet();
- createToDeleteBlocks(containerSet, conf, 1, 2, 3);
-
- List<ContainerData> containerData = Lists.newArrayList();
- containerSet.listContainer(0L, 1, containerData);
-
- try(ReferenceCountedDB meta = BlockUtils.getDB(
- (KeyValueContainerData) containerData.get(0), conf)) {
-
- // Collect all ChunkInfo from blocks marked for deletion.
- List<? extends Table.KeyValue<String, BlockData>> deletingBlocks =
- meta.getStore().getBlockDataTable()
- .getRangeKVs(null, 100,
- MetadataKeyFilters.getDeletingKeyFilter());
-
- // Delete all blocks marked for deletion.
- BlockDeletingServiceTestImpl svc =
- getBlockDeletingService(containerSet, conf);
- svc.start();
- GenericTestUtils.waitFor(svc::isStarted, 100, 3000);
- deleteAndWait(svc, 1);
- svc.shutdown();
-
- // Get deleted blocks from their table, and check their ChunkInfo lists
- // against those we saved for them before deletion.
- List<? extends Table.KeyValue<String, ChunkInfoList>> deletedBlocks =
- meta.getStore().getDeletedBlocksTable()
- .getRangeKVs(null, 100);
-
- Assert.assertEquals(deletingBlocks.size(), deletedBlocks.size());
-
- Iterator<? extends Table.KeyValue<String, BlockData>>
- deletingBlocksIter = deletingBlocks.iterator();
- Iterator<? extends Table.KeyValue<String, ChunkInfoList>>
- deletedBlocksIter = deletedBlocks.iterator();
-
- while(deletingBlocksIter.hasNext() && deletedBlocksIter.hasNext()) {
- List<ContainerProtos.ChunkInfo> deletingChunks =
- deletingBlocksIter.next().getValue().getChunks();
- List<ContainerProtos.ChunkInfo> deletedChunks =
- deletedBlocksIter.next().getValue().asList();
-
- // On each element of each list, this call uses the equals method
- // for ChunkInfos generated by protobuf.
- // This checks their internal fields for equality.
- Assert.assertEquals(deletingChunks, deletedChunks);
- }
- }
- }
}
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestContainerCache.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestContainerCache.java
index 3a47120181ff..e7f6388cee02 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestContainerCache.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestContainerCache.java
@@ -54,7 +54,7 @@ public class TestContainerCache {
private void createContainerDB(OzoneConfiguration conf, File dbFile)
throws Exception {
DatanodeStore store = new DatanodeStoreSchemaTwoImpl(
- conf, 1, dbFile.getAbsolutePath());
+ conf, 1, dbFile.getAbsolutePath(), false);
// we close since the SCM pre-creates containers.
// we will open and put Db handle into a cache when keys are being created
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestSchemaOneBackwardsCompatibility.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestSchemaOneBackwardsCompatibility.java
index 01fa3bf372c4..00ebcb011207 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestSchemaOneBackwardsCompatibility.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestSchemaOneBackwardsCompatibility.java
@@ -27,11 +27,14 @@
import org.apache.hadoop.ozone.OzoneConsts;
import org.apache.hadoop.ozone.container.common.helpers.BlockData;
import org.apache.hadoop.ozone.container.common.helpers.ChunkInfoList;
+import org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics;
import org.apache.hadoop.ozone.container.common.impl.ContainerDataYaml;
import org.apache.hadoop.ozone.container.common.impl.ContainerSet;
import org.apache.hadoop.ozone.container.common.interfaces.BlockIterator;
import org.apache.hadoop.ozone.container.common.interfaces.ContainerDispatcher;
import org.apache.hadoop.ozone.container.common.utils.ReferenceCountedDB;
+import org.apache.hadoop.ozone.container.common.volume.MutableVolumeSet;
+import org.apache.hadoop.ozone.container.common.volume.VolumeSet;
import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainer;
import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;
import org.apache.hadoop.ozone.container.keyvalue.KeyValueHandler;
@@ -52,11 +55,19 @@
import java.io.File;
import java.io.IOException;
import java.net.URL;
-import java.util.*;
+import java.util.List;
+import java.util.UUID;
+import java.util.ArrayList;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Arrays;
import java.util.stream.Collectors;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_CONTAINER_LIMIT_PER_INTERVAL;
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
@@ -225,8 +236,22 @@ public void testReadWithoutMetadata() throws Exception {
@Test
public void testDelete() throws Exception {
final long numBlocksToDelete = TestDB.NUM_PENDING_DELETION_BLOCKS;
+ String datanodeUuid = UUID.randomUUID().toString();
+ ContainerSet containerSet = makeContainerSet();
+ VolumeSet volumeSet = new MutableVolumeSet(datanodeUuid, conf);
+ ContainerMetrics metrics = ContainerMetrics.create(conf);
+ KeyValueHandler keyValueHandler =
+ new KeyValueHandler(conf, datanodeUuid, containerSet, volumeSet,
+ metrics, c -> {
+ });
+ long initialTotalSpace = newKvData().getBytesUsed();
+ long blockSpace = initialTotalSpace / TestDB.KEY_COUNT;
+
+ runBlockDeletingService(keyValueHandler);
- runBlockDeletingService();
+ long currentTotalSpace = newKvData().getBytesUsed();
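+ // Blocks in the archived test DB are assumed to be of equal size, so the
+ // space freed divided by the per-block space gives the number of deleted blocks.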
+ long numberOfBlocksDeleted =
+ (initialTotalSpace - currentTotalSpace) / blockSpace;
// Expected values after blocks with #deleting# prefix in original DB are
// deleted.
@@ -242,7 +267,7 @@ public void testDelete() throws Exception {
assertEquals(expectedDeletingBlocks,
countDeletingBlocks(refCountedDB));
assertEquals(expectedDeletedBlocks,
- countDeletedBlocks(refCountedDB));
+ TestDB.NUM_DELETED_BLOCKS + numberOfBlocksDeleted);
assertEquals(expectedRegularBlocks,
countUnprefixedBlocks(refCountedDB));
@@ -269,6 +294,14 @@ public void testDelete() throws Exception {
*/
@Test
public void testReadDeletedBlockChunkInfo() throws Exception {
+ String datanodeUuid = UUID.randomUUID().toString();
+ ContainerSet containerSet = makeContainerSet();
+ VolumeSet volumeSet = new MutableVolumeSet(datanodeUuid, conf);
+ ContainerMetrics metrics = ContainerMetrics.create(conf);
+ KeyValueHandler keyValueHandler =
+ new KeyValueHandler(conf, datanodeUuid, containerSet, volumeSet,
+ metrics, c -> {
+ });
try(ReferenceCountedDB refCountedDB = BlockUtils.getDB(newKvData(), conf)) {
// Read blocks that were already deleted before the upgrade.
List<? extends Table.KeyValue<String, ChunkInfoList>> deletedBlocks =
@@ -290,25 +323,22 @@ public void testReadDeletedBlockChunkInfo() throws Exception {
Assert.assertEquals(TestDB.NUM_DELETED_BLOCKS, preUpgradeBlocks.size());
- runBlockDeletingService();
+ long initialTotalSpace = newKvData().getBytesUsed();
+ long blockSpace = initialTotalSpace / TestDB.KEY_COUNT;
- // After the block deleting service runs, get the updated list of
- // deleted blocks.
- deletedBlocks = refCountedDB.getStore()
- .getDeletedBlocksTable().getRangeKVs(null, 100);
+ runBlockDeletingService(keyValueHandler);
- int numPostUpgradeDeletesFound = 0;
- for (Table.KeyValue<String, ChunkInfoList> chunkListKV : deletedBlocks) {
- if (!preUpgradeBlocks.contains(chunkListKV.getKey())) {
- numPostUpgradeDeletesFound++;
- Assert.assertNotNull(chunkListKV.getValue());
- }
- }
+ long currentTotalSpace = newKvData().getBytesUsed();
+
+ // After the block deleting service runs, get the number of
+ // deleted blocks.
+ long numberOfBlocksDeleted =
+ (initialTotalSpace - currentTotalSpace) / blockSpace;
// The blocks that were originally marked for deletion should now be
// deleted.
Assert.assertEquals(TestDB.NUM_PENDING_DELETION_BLOCKS,
- numPostUpgradeDeletesFound);
+ numberOfBlocksDeleted);
}
}
@@ -448,21 +478,22 @@ public void testReadDeletedBlocks() throws Exception {
}
}
- private void runBlockDeletingService() throws Exception {
+ private void runBlockDeletingService(KeyValueHandler keyValueHandler)
+ throws Exception {
conf.setInt(OZONE_BLOCK_DELETING_CONTAINER_LIMIT_PER_INTERVAL, 10);
conf.setInt(OzoneConfigKeys.OZONE_BLOCK_DELETING_LIMIT_PER_CONTAINER, 2);
conf.set(ScmConfigKeys.HDDS_DATANODE_DIR_KEY,
- metadataDir.getAbsolutePath());
+ metadataDir.getAbsolutePath());
- OzoneContainer container = makeMockOzoneContainer();
+ OzoneContainer container = makeMockOzoneContainer(keyValueHandler);
BlockDeletingServiceTestImpl service =
- new BlockDeletingServiceTestImpl(container, 1000, conf);
+ new BlockDeletingServiceTestImpl(container, 1000, conf);
service.start();
GenericTestUtils.waitFor(service::isStarted, 100, 3000);
service.runDeletingTasks();
- GenericTestUtils.waitFor(() -> service.getTimesOfProcessed() == 1,
- 100, 3000);
+ GenericTestUtils
+ .waitFor(() -> service.getTimesOfProcessed() == 1, 100, 3000);
}
private ContainerSet makeContainerSet() throws Exception {
@@ -473,7 +504,8 @@ private ContainerSet makeContainerSet() throws Exception {
return containerSet;
}
- private OzoneContainer makeMockOzoneContainer() throws Exception {
+ private OzoneContainer makeMockOzoneContainer(KeyValueHandler keyValueHandler)
+ throws Exception {
ContainerSet containerSet = makeContainerSet();
OzoneContainer ozoneContainer = mock(OzoneContainer.class);
@@ -481,8 +513,7 @@ private OzoneContainer makeMockOzoneContainer() throws Exception {
when(ozoneContainer.getWriteChannel()).thenReturn(null);
ContainerDispatcher dispatcher = mock(ContainerDispatcher.class);
when(ozoneContainer.getDispatcher()).thenReturn(dispatcher);
- KeyValueHandler handler = mock(KeyValueHandler.class);
- when(dispatcher.getHandler(any())).thenReturn(handler);
+ when(dispatcher.getHandler(any())).thenReturn(keyValueHandler);
return ozoneContainer;
}
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestCreatePipelineCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestCreatePipelineCommandHandler.java
index ede0b94de476..febd1c3bd0df 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestCreatePipelineCommandHandler.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestCreatePipelineCommandHandler.java
@@ -34,6 +34,7 @@
import org.apache.hadoop.ozone.protocol.commands.CreatePipelineCommand;
import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
import org.apache.ratis.client.RaftClient;
+import org.apache.ratis.client.api.GroupManagementApi;
import org.apache.ratis.conf.RaftProperties;
import org.apache.ratis.protocol.ClientId;
import org.apache.ratis.protocol.RaftGroup;
@@ -64,6 +65,7 @@ public class TestCreatePipelineCommandHandler {
private StateContext stateContext;
private SCMConnectionManager connectionManager;
private RaftClient raftClient;
+ private GroupManagementApi raftClientGroupManager;
@Before
public void setup() throws Exception {
@@ -71,8 +73,11 @@ public void setup() throws Exception {
stateContext = Mockito.mock(StateContext.class);
connectionManager = Mockito.mock(SCMConnectionManager.class);
raftClient = Mockito.mock(RaftClient.class);
+ raftClientGroupManager = Mockito.mock(GroupManagementApi.class);
final RaftClient.Builder builder = mockRaftClientBuilder();
Mockito.when(builder.build()).thenReturn(raftClient);
+ Mockito.when(raftClient.getGroupManagementApi(
+ Mockito.any(RaftPeerId.class))).thenReturn(raftClientGroupManager);
PowerMockito.mockStatic(RaftClient.class);
PowerMockito.when(RaftClient.newBuilder()).thenReturn(builder);
}
@@ -121,8 +126,8 @@ public void testPipelineCreation() throws IOException {
Mockito.verify(writeChanel, Mockito.times(1))
.addGroup(pipelineID.getProtobuf(), datanodes, priorityList);
- Mockito.verify(raftClient, Mockito.times(2))
- .groupAdd(Mockito.any(RaftGroup.class), Mockito.any(RaftPeerId.class));
+ Mockito.verify(raftClientGroupManager, Mockito.times(2))
+ .add(Mockito.any(RaftGroup.class));
}
@Test
@@ -150,8 +155,8 @@ public void testCommandIdempotency() throws IOException {
Mockito.verify(writeChanel, Mockito.times(0))
.addGroup(pipelineID.getProtobuf(), datanodes);
- Mockito.verify(raftClient, Mockito.times(0))
- .groupAdd(Mockito.any(RaftGroup.class), Mockito.any(RaftPeerId.class));
+ Mockito.verify(raftClientGroupManager, Mockito.times(0))
+ .add(Mockito.any(RaftGroup.class));
}
private List<DatanodeDetails> getDatanodes() {
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainer.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainer.java
index c2b487be2933..25d8b1d25edf 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainer.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainer.java
@@ -199,7 +199,6 @@ public void testContainerImportExport() throws Exception {
metadataStore.getStore().getMetadataTable()
.put(OzoneConsts.BLOCK_COUNT, numberOfKeysToWrite);
}
- BlockUtils.removeDB(keyValueContainerData, CONF);
Map metadata = new HashMap<>();
metadata.put("key1", "value1");
diff --git a/hadoop-hdds/docs/content/concept/OzoneManager.md b/hadoop-hdds/docs/content/concept/OzoneManager.md
index 0930ec95e380..5cf520ca2195 100644
--- a/hadoop-hdds/docs/content/concept/OzoneManager.md
+++ b/hadoop-hdds/docs/content/concept/OzoneManager.md
@@ -97,7 +97,7 @@ the data from the data node.
For a detailed view of Ozone Manager this section gives a quick overview about the provided network services and the stored persisted data.
-**Network services provided by Ozone Manager:**
+### Network services provided by Ozone Manager:
Ozone provides a network service for the client and for administration commands. The main service calls
@@ -115,7 +115,7 @@ Ozone provides a network service for the client and for administration commands.
* ServiceList (used for service discovery)
* DBUpdates (used by [Recon]({{< ref "feature/Recon.md" >}}) downloads snapshots)
-**Persisted state**
+### Persisted state
The following data is persisted in Ozone Manager side in a specific RocksDB directory:
diff --git a/hadoop-hdds/docs/content/concept/Recon.md b/hadoop-hdds/docs/content/concept/Recon.md
new file mode 100644
index 000000000000..902c865be8fa
--- /dev/null
+++ b/hadoop-hdds/docs/content/concept/Recon.md
@@ -0,0 +1,163 @@
+---
+title: "Recon"
+date: "2020-10-27"
+weight: 8
+menu:
+ main:
+ parent: Architecture
+summary: Recon serves as a management and monitoring console for Ozone.
+---
+
+
+Recon serves as a management and monitoring console for Ozone. It gives a
+bird's-eye view of Ozone and helps users troubleshoot issues by presenting
+the current state of the cluster through REST-based APIs and a rich web UI.
+
+
+## High Level Design
+
+![Recon High Level Design](ReconHighLevelDesign.png)
+
+
+
+At a high level, Recon collects and aggregates metadata from Ozone Manager (OM),
+Storage Container Manager (SCM) and Datanodes (DN) and acts as a central
+management and monitoring console. Ozone administrators can use Recon to query
+the current state of the system without overloading OM or SCM.
+
+Recon maintains multiple databases to enable batch processing, faster querying
+and to persist aggregate information. It maintains a local copy of OM db and
+SCM db along with a SQL database for persisting aggregate information.
+
+Recon also integrates with Prometheus to provide an HTTP endpoint to query Prometheus
+for Ozone metrics, and to display a few crucial point-in-time metrics in
+the web UI.
+
+## Recon and Ozone Manager
+
+![Recon and Ozone Manager](ReconOmDesign.png)
+
+
+
+Recon initially gets a full snapshot of the OM RocksDB from the leader OM's HTTP
+endpoint, untars the file and initializes a local RocksDB for querying. The
+database is kept in sync by periodically requesting delta updates from the leader
+OM via RPC calls, starting from the last applied sequence id. If, for any reason, the delta
+updates cannot be retrieved or applied to the local db, a full snapshot is
+requested again to keep the local db in sync with the OM db. Because of this, Recon can
+show stale information, since the local db will not always be in sync.
+
+The db updates retrieved from OM are then converted into a batch of events for
+further processing by OM db tasks via the [Recon Task Framework](#task-framework).
+
+
+## Recon and Storage Container Manager
+
+![Recon and Storage Container Manager](ReconScmDesign.png)
+
+
+
+Recon also acts as a passive SCM for datanodes. When Recon is configured in the
+cluster, all the datanodes register with Recon and send heartbeats, container
+reports, incremental container reports etc. to Recon, just as they do to SCM. Recon uses
+all the information it gets from datanodes to construct its own copy of the SCM RocksDB
+locally. Recon never sends any commands to the datanodes in response and just acts as
+a passive SCM for faster lookup of SCM metadata.
+
+## Task Framework
+
+Recon has its own Task framework to enable batch processing of data obtained
+from OM and SCM. A task can listen to and act upon db events such as `PUT`, `DELETE`,
+`UPDATE`, etc. on either OM db or SCM db. Based on this, a task either implements
+`org.apache.hadoop.ozone.recon.tasks.ReconOmTask` or extends
+`org.apache.hadoop.ozone.recon.scm.ReconScmTask`.
+
+An example `ReconOmTask` is `ContainerKeyMapperTask`, which persists the container -> key
+mapping in RocksDB. This is useful for understanding which keys were part of a container
+when the container is reported missing or is in a bad health state. Another example is
+`FileSizeCountTask`, which keeps track of the count of files within a given file size range in
+a SQL database. These tasks have implementations for two scenarios:
+
+ - Full snapshot (reprocess())
+ - Delta updates (process())
+
+When a full snapshot of the OM db is obtained from the leader OM, reprocess()
+is called on all the registered OM tasks. On subsequent delta updates, process()
+is called on these OM tasks, as sketched below.
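+
+The following is a rough sketch of what such a task looks like. It is illustrative
+only: the exact `ReconOmTask` method signatures and imports may differ between
+Ozone versions, and `ExampleCountTask` is a hypothetical task name, not one that
+ships with Recon.
+
+```java
+// Illustrative sketch only; the interface methods shown here are assumptions
+// based on the description above, not the exact Recon API.
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.hadoop.ozone.om.OMMetadataManager;
+import org.apache.hadoop.ozone.recon.tasks.OMUpdateEventBatch;
+import org.apache.hadoop.ozone.recon.tasks.ReconOmTask;
+
+public class ExampleCountTask implements ReconOmTask {
+
+  @Override
+  public String getTaskName() {
+    return "ExampleCountTask";
+  }
+
+  // Full snapshot path: rebuild the aggregate from scratch by iterating the
+  // OM metadata loaded from the snapshot.
+  @Override
+  public Pair<String, Boolean> reprocess(OMMetadataManager omMetadataManager) {
+    // ... recompute and persist the aggregate here ...
+    return new ImmutablePair<>(getTaskName(), true);
+  }
+
+  // Delta path: apply only the PUT/DELETE/UPDATE events in this batch.
+  @Override
+  public Pair<String, Boolean> process(OMUpdateEventBatch events) {
+    // ... update the persisted aggregate incrementally here ...
+    return new ImmutablePair<>(getTaskName(), true);
+  }
+}
+```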
+
+An example `ReconScmTask` is `ContainerHealthTask`, which runs at a configurable
+interval to scan the list of all the containers and to persist the state of
+unhealthy containers (`MISSING`, `MIS_REPLICATED`, `UNDER_REPLICATED`, `OVER_REPLICATED`)
+in a SQL table. This information is used to determine if there are any missing
+containers in the cluster.
+
+## Recon and Prometheus
+
+Recon can integrate with any Prometheus instance configured to collect Ozone metrics,
+and can display useful information in the Recon UI on the Datanodes and Pipelines pages.
+Recon also exposes a proxy endpoint ([/metrics]({{< ref "interface/ReconApi.md#metrics" >}}))
+to query Prometheus. This integration can be enabled by setting the configuration `ozone.recon.prometheus.http.endpoint`
+to the Prometheus endpoint, e.g. `ozone.recon.prometheus.http.endpoint=localhost:9090`.
+
+## API Reference
+
+[Link to complete API Reference]({{< ref "interface/ReconApi.md" >}})
+
+## Persisted state
+
+ * A local copy of [OM database]({{< ref "concept/OzoneManager.md#persisted-state" >}})
+ * A local copy of [SCM database]({{< ref "concept/StorageContainerManager.md#persisted-state" >}})
+ * The following data is persisted in Recon in the specified RocksDB directory:
+ * ContainerKey table
+ * Stores the mapping (container, key) -> count
+ * ContainerKeyCount table
+ * Stores containerID -> no. of keys count within the container
+
+ * The following data is stored in the configured SQL database (default is Derby):
+ * GlobalStats table
+ * A Key -> Value table to store aggregate information like total
+ number of volumes / buckets / keys present in the cluster
+ * FileCountBySize table
+ * Keeps track of the number of files present within a file size range in the cluster
+ * ReconTaskStatus table
+ * Keeps track of the status and last run timestamp of the registered OM and SCM
+ db tasks in the [Recon Task Framework](#task-framework)
+ * ContainerHistory table
+ * Stores ContainerReplica -> Datanode mapping with last known timestamp. This
+ is used to determine the last known datanodes when a container is reported missing
+ * UnhealthyContainers table
+ * Keeps track of all the Unhealthy Containers (MISSING, UNDER_REPLICATED,
+ OVER_REPLICATED, MIS_REPLICATED) in the cluster at any given time
+
+
+## Notable configurations
+
+key | default | description
+----|---------|------------
+ozone.recon.http-address | 0.0.0.0:9888 | The address and the base port where the Recon web UI will listen on.
+ozone.recon.address | 0.0.0.0:9891 | RPC address of the Recon.
+ozone.recon.db.dir | none | Directory where the Recon Server stores its metadata.
+ozone.recon.om.db.dir | none | Directory where the Recon Server stores its OM snapshot DB.
+ozone.recon.om.snapshot.task.interval.delay | 10m | Interval in MINUTES by Recon to request OM DB Snapshot / delta updates.
+ozone.recon.task.missingcontainer.interval | 300s | Time interval of the periodic check for Unhealthy Containers in the cluster.
+ozone.recon.sql.db.jooq.dialect | DERBY | Please refer to [SQL Dialect](https://www.jooq.org/javadoc/latest/org.jooq/org/jooq/SQLDialect.html) to specify a different dialect.
+ozone.recon.sql.db.jdbc.url | jdbc:derby:${ozone.recon.db.dir}/ozone_recon_derby.db | Recon SQL database jdbc url.
+ozone.recon.sql.db.username | none | Recon SQL database username.
+ozone.recon.sql.db.password | none | Recon SQL database password.
+ozone.recon.sql.db.driver | org.apache.derby.jdbc.EmbeddedDriver | Recon SQL database jdbc driver.
+
diff --git a/hadoop-hdds/docs/content/concept/ReconHighLevelDesign.png b/hadoop-hdds/docs/content/concept/ReconHighLevelDesign.png
new file mode 100644
index 000000000000..3bd6443d84c2
Binary files /dev/null and b/hadoop-hdds/docs/content/concept/ReconHighLevelDesign.png differ
diff --git a/hadoop-hdds/docs/content/concept/ReconOmDesign.png b/hadoop-hdds/docs/content/concept/ReconOmDesign.png
new file mode 100644
index 000000000000..20ea6a3360ed
Binary files /dev/null and b/hadoop-hdds/docs/content/concept/ReconOmDesign.png differ
diff --git a/hadoop-hdds/docs/content/concept/ReconScmDesign.png b/hadoop-hdds/docs/content/concept/ReconScmDesign.png
new file mode 100644
index 000000000000..32d07e02d2c4
Binary files /dev/null and b/hadoop-hdds/docs/content/concept/ReconScmDesign.png differ
diff --git a/hadoop-hdds/docs/content/concept/StorageContainerManager.md b/hadoop-hdds/docs/content/concept/StorageContainerManager.md
index 9636af5ec7cb..8922f89bc5d9 100644
--- a/hadoop-hdds/docs/content/concept/StorageContainerManager.md
+++ b/hadoop-hdds/docs/content/concept/StorageContainerManager.md
@@ -56,7 +56,7 @@ token infrastructure depends on this certificate infrastructure.
For a detailed view of Storage Container Manager this section gives a quick overview about the provided network services and the stored persisted data.
-**Network services provided by Storage Container Manager:**
+### Network services provided by Storage Container Manager:
* Pipelines: List/Delete/Activate/Deactivate
* pipelines are set of datanodes to form replication groups
@@ -74,8 +74,7 @@ For a detailed view of Storage Container Manager this section gives a quick over
Note: client doesn't connect directly to the SCM
-**Persisted state**
-
+### Persisted state
The following data is persisted in Storage Container Manager side in a specific RocksDB directory
diff --git a/hadoop-hdds/docs/content/feature/HA.zh.md b/hadoop-hdds/docs/content/feature/HA.zh.md
index cb89530ff560..b1975712b28d 100644
--- a/hadoop-hdds/docs/content/feature/HA.zh.md
+++ b/hadoop-hdds/docs/content/feature/HA.zh.md
@@ -3,7 +3,7 @@ title: "高可用"
weight: 1
menu:
main:
- parent: 特性
+ parent: 特点
summary: Ozone 用于避免单点故障的高可用设置
---
@@ -87,7 +87,7 @@ Ozone 有两个leader节点(用于键管理的 *Ozone Manager* 和用于块空
```
-基于 [客户端接口]({{< ref path="interface/_index.md" lang="en">}}) ,定义好的 `serviceId` 就可用于替代单个 OM 主机。
+基于 [客户端接口]({{< ref path="interface/_index.zh.md" lang="zh">}}) ,定义好的 `serviceId` 就可用于替代单个 OM 主机。
例如,使用 `o3fs://`
@@ -114,4 +114,4 @@ RocksDB 由后台的批处理事务线程负责更新(这也就是所谓的"
## 参考文档
* 查看 [该页面]({{< ref path="design/omha.md" lang="en">}}) 以获取详细设计文档;
-* Ozone 的分发包中的 compose/ozone-om-ha 目录下提供了一个配置 OM 高可用的示例,可以借助 [docker-compose]({{< ref path="start/RunningViaDocker.md" lang="en">}}) 进行测试。
+* Ozone 的分发包中的 `compose/ozone-om-ha` 目录下提供了一个配置 OM 高可用的示例,可以借助 [docker-compose]({{< ref path="start/RunningViaDocker.md" lang="en">}}) 进行测试。
diff --git a/hadoop-hdds/docs/content/feature/Quota.md b/hadoop-hdds/docs/content/feature/Quota.md
index 5be9f4db4d0c..933bbb50aec3 100644
--- a/hadoop-hdds/docs/content/feature/Quota.md
+++ b/hadoop-hdds/docs/content/feature/Quota.md
@@ -31,7 +31,12 @@ So far, we know that Ozone allows users to create volumes, buckets, and keys. A
## Currently supported
1. Storage Space level quota
-Administrators should be able to define how much storage space a Volume or Bucket can use.
+Administrators should be able to define how much storage space a Volume or Bucket can use. The following settings for storage space quota are currently supported:
+
+a. By default, the quota for volumes and buckets is not enabled.
+b. When a volume quota is enabled, the total size of the bucket quotas under it cannot exceed the volume quota.
+c. A bucket quota can be set separately, without enabling a volume quota. In that case the size of the bucket quota is unrestricted.
+d. A volume quota is not currently enforced on its own; it takes effect only if bucket quotas are set, because Ozone only checks the usedBytes of the bucket when a key is written.
+
## Client usage
### Storage Space level quota
@@ -59,7 +64,7 @@ bin/ozone sh bucket setquota --space-quota 10GB /volume1/bucket1
```
This behavior changes the quota for Bucket1 to 10GB
-A bucket quota should not be greater than its Volume quota. Let's look at an example. If we have a 10MB Volume and create five buckets under that Volume with a quota of 5MB, the total quota is 25MB. In this case, the bucket creation will always succeed, and we check the quota for bucket and volume when the data is actually written. Each write needs to check whether the current bucket is exceeding the limit and the current total volume usage is exceeding the limit.
+Total bucket quota should not be greater than the volume quota. If we have a 10MB Volume, the sum of the sizes of all buckets under this volume cannot exceed 10MB; otherwise setting the bucket quota fails.
#### Clear the quota for Volume1. The Bucket cleanup command is similar.
```shell
diff --git a/hadoop-hdds/docs/content/feature/Quota.zh.md b/hadoop-hdds/docs/content/feature/Quota.zh.md
index 4cc1371668d6..b3f0c3c3187e 100644
--- a/hadoop-hdds/docs/content/feature/Quota.zh.md
+++ b/hadoop-hdds/docs/content/feature/Quota.zh.md
@@ -29,7 +29,11 @@ menu:
## 目前支持的
1. Storage space级别配额
- 管理员应该能够定义一个Volume或Bucket可以使用多少存储空间。
+ 管理员应该能够定义一个Volume或Bucket可以使用多少存储空间。目前支持以下storage space quota的设置:
+ a. 默认情况下volume和bucket的quota不启用。
+ b. 当volume quota启用时,bucket quota的总大小不能超过volume。
+ c. 可以在不启用volume quota的情况下单独给bucket设置quota。此时bucket quota的大小是不受限制的。
+ d. 目前不支持单独设置volume quota,只有在设置了bucket quota的情况下volume quota才会生效。因为ozone在写入key时只检查bucket的usedBytes。
## 客户端用法
### Storage space级别配额
@@ -56,7 +60,7 @@ bin/ozone sh bucket setquota --space-quota 10GB /volume1/bucket1
```
该行为将bucket1的配额更改为10GB
-一个bucket配额 不应大于其Volume的配额。让我们看一个例子,如果我们有一个10MB的Volume,并在该Volume下创建5个Bucket,配额为5MB,则总配额为25MB。在这种情况下,创建存储桶将始终成功,我们会在数据真正写入时检查bucket和volume的quota。每次写入需要检查当前bucket的是否超上限,当前总的volume使用量是否超上限。
+bucket的总配额 不应大于其Volume的配额。让我们看一个例子,如果我们有一个10MB的Volume,该volume下所有bucket的大小之和不能超过10MB,否则设置bucket quota将失败。
#### 清除Volume1的配额, Bucket清除命令与此类似
```shell
diff --git a/hadoop-hdds/docs/content/feature/Recon.md b/hadoop-hdds/docs/content/feature/Recon.md
index 9fa3f8c7cdec..be434a7e517d 100644
--- a/hadoop-hdds/docs/content/feature/Recon.md
+++ b/hadoop-hdds/docs/content/feature/Recon.md
@@ -1,5 +1,5 @@
---
-title: "Recon"
+title: "Recon Server"
weight: 7
menu:
main:
@@ -23,25 +23,19 @@ summary: Recon is the Web UI and analysis service for Ozone
limitations under the License.
-->
-Recon is the Web UI and analytics service for Ozone. It's an optional component, but strongly recommended as it can add additional visibility.
+Recon serves as a management and monitoring console for Ozone.
+It's an optional component, but it is strongly recommended to add it to the cluster
+since Recon can help with troubleshooting the cluster at critical times.
+Refer to [Recon Architecture]({{< ref "concept/Recon.md" >}}) for detailed architecture overview and
+[Recon API]({{< ref "interface/ReconApi.md" >}}) documentation
+for HTTP API reference.
-Recon collects all the data from an Ozone cluster and **store** them in a SQL database for further analyses.
-
- 1. Ozone Manager data is downloaded in the background by an async process. A RocksDB snapshots are created on OM side periodically, and the incremental data is copied to Recon and processed.
- 2. Datanodes can send Heartbeats not just to SCM but Recon. Recon can be a read-only listener of the Heartbeats and updates the local database based on the received information.
-
-Once Recon is configured, we are ready to start the service.
+Recon is a service that brings its own HTTP web server and can be started by
+the following command.
{{< highlight bash >}}
ozone --daemon start recon
{{< /highlight >}}
-## Notable configurations
-key | default | description
-----|---------|------------
-ozone.recon.http-address | 0.0.0.0:9888 | The address and the base port where the Recon web UI will listen on.
-ozone.recon.address | 0.0.0.0:9891 | RPC address of the Recon.
-ozone.recon.db.dir | none | Directory where the Recon Server stores its metadata.
-ozone.recon.om.db.dir | none | Directory where the Recon Server stores its OM snapshot DB.
-ozone.recon.om.snapshot.task.interval.delay | 10m | Interval in MINUTES by Recon to request OM DB Snapshot.
+
diff --git a/hadoop-hdds/docs/content/interface/ReconApi.md b/hadoop-hdds/docs/content/interface/ReconApi.md
new file mode 100644
index 000000000000..dd033f39f0ca
--- /dev/null
+++ b/hadoop-hdds/docs/content/interface/ReconApi.md
@@ -0,0 +1,511 @@
+---
+title: Recon API
+weight: 4
+menu:
+ main:
+ parent: "Client Interfaces"
+summary: Recon server supports HTTP endpoints to help troubleshoot and monitor Ozone cluster.
+---
+
+
+
+The Recon API v1 is a set of HTTP endpoints that help you understand the current
+state of an Ozone cluster and troubleshoot it if needed.
+
+### HTTP Endpoints
+
+#### Containers
+
+* **/containers**
+
+ **URL Structure**
+ ```
+ GET /api/v1/containers
+ ```
+
+ **Parameters**
+
+ * prevKey (optional)
+
+ Only returns the containers with ID greater than the given prevKey.
+ Example: prevKey=1
+
+ * limit (optional)
+
+ Returns at most the given number of results. The default limit is 1000.
+
+ **Returns**
+
+ Returns all the ContainerMetadata objects.
+
+ ```json
+ {
+ "data": {
+ "totalCount": 3,
+ "containers": [
+ {
+ "ContainerID": 1,
+ "NumberOfKeys": 834
+ },
+ {
+ "ContainerID": 2,
+ "NumberOfKeys": 833
+ },
+ {
+ "ContainerID": 3,
+ "NumberOfKeys": 833
+ }
+ ]
+ }
+ }
+ ```
+
+* **/containers/:id/keys**
+
+ **URL Structure**
+ ```
+ GET /api/v1/containers/:id/keys
+ ```
+
+ **Parameters**
+
+ * prevKey (optional)
+
+ Only returns the keys that are present after the given prevKey key prefix.
+ Example: prevKey=/vol1/bucket1/key1
+
+ * limit (optional)
+
+ Returns at most the given number of results. The default limit is 1000.
+
+ **Returns**
+
+ Returns all the KeyMetadata objects for the given ContainerID.
+
+ ```json
+ {
+ "totalCount":7,
+ "keys": [
+ {
+ "Volume":"vol-1-73141",
+ "Bucket":"bucket-3-35816",
+ "Key":"key-0-43637",
+ "DataSize":1000,
+ "Versions":[0],
+ "Blocks": {
+ "0": [
+ {
+ "containerID":1,
+ "localID":105232659753992201
+ }
+ ]
+ },
+ "CreationTime":"2020-11-18T18:09:17.722Z",
+ "ModificationTime":"2020-11-18T18:09:30.405Z"
+ },
+ ...
+ ]
+ }
+ ```
+
+* **/containers/missing**
+
+ **URL Structure**
+ ```
+ GET /api/v1/containers/missing
+ ```
+
+ **Parameters**
+
+ No parameters.
+
+ **Returns**
+
+ Returns the MissingContainerMetadata objects for all the missing containers.
+
+ ```json
+ {
+ "totalCount": 26,
+ "containers": [{
+ "containerID": 1,
+ "missingSince": 1605731029145,
+ "keys": 7,
+ "pipelineID": "88646d32-a1aa-4e1a",
+ "replicas": [{
+ "containerId": 1,
+ "datanodeHost": "localhost-1",
+ "firstReportTimestamp": 1605724047057,
+ "lastReportTimestamp": 1605731201301
+ },
+ ...
+ ]
+ },
+ ...
+ ]
+ }
+ ```
+
+* **/containers/:id/replicaHistory**
+
+ **URL Structure**
+ ```
+ GET /api/v1/containers/:id/replicaHistory
+ ```
+
+ **Parameters**
+
+ No parameters.
+
+ **Returns**
+
+ Returns all the ContainerHistory objects for the given ContainerID.
+
+ ```json
+ [
+ {
+ "containerId": 1,
+ "datanodeHost": "localhost-1",
+ "firstReportTimestamp": 1605724047057,
+ "lastReportTimestamp": 1605730421294
+ },
+ ...
+ ]
+ ```
+
+* **/containers/unhealthy**
+
+ **URL Structure**
+ ```
+ GET /api/v1/containers/unhealthy
+ ```
+
+ **Parameters**
+
+ * batchNum (optional)
+
+ The batch number (like "page number") of results to return.
+ Passing 1 will return records 1 to limit; 2 will return
+ limit + 1 to 2 * limit, and so on.
+
+ * limit (optional)
+
+ Returns at most the given number of results. The default limit is 1000.
+
+ **Returns**
+
+ Returns the UnhealthyContainerMetadata objects for all the unhealthy containers.
+
+ ```json
+ {
+ "missingCount": 2,
+ "underReplicatedCount": 0,
+ "overReplicatedCount": 0,
+ "misReplicatedCount": 0,
+ "containers": [{
+ "containerID": 1,
+ "containerState": "MISSING",
+ "unhealthySince": 1605731029145,
+ "expectedReplicaCount": 3,
+ "actualReplicaCount": 0,
+ "replicaDeltaCount": 3,
+ "reason": null,
+ "keys": 7,
+ "pipelineID": "88646d32-a1aa-4e1a",
+ "replicas": [{
+ "containerId": 1,
+ "datanodeHost": "localhost-1",
+ "firstReportTimestamp": 1605722960125,
+ "lastReportTimestamp": 1605731230509
+ },
+ ...
+ ]
+ },
+ ...
+ ]
+ }
+ ```
+
+* **/containers/unhealthy/:state**
+
+ **URL Structure**
+ ```
+ GET /api/v1/containers/unhealthy/:state
+ ```
+
+ **Parameters**
+
+ * batchNum (optional)
+
+ The batch number (like "page number") of results to return.
+ Passing 1 will return records 1 to limit; 2 will return
+ limit + 1 to 2 * limit, and so on.
+
+ * limit (optional)
+
+ Returns at most the given number of results. The default limit is 1000.
+
+ **Returns**
+
+ Returns the UnhealthyContainerMetadata objects for the containers in the given state.
+ Possible unhealthy container states are `MISSING`, `MIS_REPLICATED`, `UNDER_REPLICATED`, `OVER_REPLICATED`.
+ The response structure is the same as `/containers/unhealthy`.
+
+#### ClusterState
+
+* **/clusterState**
+
+ **URL Structure**
+ ```
+ GET /api/v1/clusterState
+ ```
+
+ **Parameters**
+
+ No parameters.
+
+ **Returns**
+
+ Returns a summary of the current state of the Ozone cluster.
+
+ ```json
+ {
+ "pipelines": 5,
+ "totalDatanodes": 4,
+ "healthyDatanodes": 4,
+ "storageReport": {
+ "capacity": 1081719668736,
+ "used": 1309212672,
+ "remaining": 597361258496
+ },
+ "containers": 26,
+ "volumes": 6,
+ "buckets": 26,
+ "keys": 25
+ }
+ ```
+
+#### Datanodes
+
+* **/datanodes**
+
+ **URL Structure**
+ ```
+ GET /api/v1/datanodes
+ ```
+
+ **Parameters**
+
+ No parameters.
+
+ **Returns**
+
+ Returns all the datanodes in the cluster.
+
+ ```json
+ {
+ "totalCount": 4,
+ "datanodes": [{
+ "uuid": "f8f8cb45-3ab2-4123",
+ "hostname": "localhost-1",
+ "state": "HEALTHY",
+ "lastHeartbeat": 1605738400544,
+ "storageReport": {
+ "capacity": 270429917184,
+ "used": 358805504,
+ "remaining": 119648149504
+ },
+ "pipelines": [{
+ "pipelineID": "b9415b20-b9bd-4225",
+ "replicationType": "RATIS",
+ "replicationFactor": 3,
+ "leaderNode": "localhost-2"
+ }, {
+ "pipelineID": "3bf4a9e9-69cc-4d20",
+ "replicationType": "RATIS",
+ "replicationFactor": 1,
+ "leaderNode": "localhost-1"
+ }],
+ "containers": 17,
+ "leaderCount": 1
+ },
+ ...
+ ]
+ }
+ ```
+
+#### Pipelines
+
+* **/pipelines**
+
+ **URL Structure**
+ ```
+ GET /api/v1/pipelines
+ ```
+
+ **Parameters**
+
+ No parameters.
+
+ **Returns**
+
+ Returns all the pipelines in the cluster.
+
+ ```json
+ {
+ "totalCount": 5,
+ "pipelines": [{
+ "pipelineId": "b9415b20-b9bd-4225",
+ "status": "OPEN",
+ "leaderNode": "localhost-1",
+ "datanodes": ["localhost-1", "localhost-2", "localhost-3"],
+ "lastLeaderElection": 0,
+ "duration": 23166128,
+ "leaderElections": 0,
+ "replicationType": "RATIS",
+ "replicationFactor": 3,
+ "containers": 0
+ },
+ ...
+ ]
+ }
+ ```
+
+#### Tasks
+
+* **/task/status**
+
+ **URL Structure**
+ ```
+ GET /api/v1/task/status
+ ```
+
+ **Parameters**
+
+ No parameters.
+
+ **Returns**
+
+ Returns the status of all the Recon tasks.
+
+ ```json
+ [
+ {
+ "taskName": "OmDeltaRequest",
+ "lastUpdatedTimestamp": 1605724099147,
+ "lastUpdatedSeqNumber": 186
+ },
+ ...
+ ]
+ ```
+
+#### Utilization
+
+* **/utilization/fileCount**
+
+ **URL Structure**
+ ```
+ GET /api/v1/utilization/fileCount
+ ```
+
+ **Parameters**
+
+ * volume (optional)
+
+ Filters the results based on the given volume name.
+
+ * bucket (optional)
+
+ Filters the results based on the given bucket name.
+
+ * fileSize (optional)
+
+ Filters the results based on the given fileSize.
+
+ **Returns**
+
+ Returns the file counts within different file size ranges, with `fileSize` in the
+ response object being the upper bound of each range.
+
+ ```json
+ [{
+ "volume": "vol-2-04168",
+ "bucket": "bucket-0-11685",
+ "fileSize": 1024,
+ "count": 1
+ }, {
+ "volume": "vol-2-04168",
+ "bucket": "bucket-1-41795",
+ "fileSize": 1024,
+ "count": 1
+ }, {
+ "volume": "vol-2-04168",
+ "bucket": "bucket-2-93377",
+ "fileSize": 1024,
+ "count": 1
+ }, {
+ "volume": "vol-2-04168",
+ "bucket": "bucket-3-50336",
+ "fileSize": 1024,
+ "count": 2
+ }]
+ ```
+
+#### Metrics
+
+* **/metrics/:api**
+
+ **URL Structure**
+ ```
+ GET /api/v1/metrics/:api
+ ```
+
+ **Parameters**
+
+ Refer to [Prometheus HTTP API Reference](https://prometheus.io/docs/prometheus/latest/querying/api/)
+ for complete documentation on querying.
+
+ **Returns**
+
+ This is a proxy endpoint for Prometheus and returns the same response as
+ the Prometheus endpoint.
+ Example: /api/v1/metrics/query?query=ratis_leader_election_electionCount
+
+ ```json
+ {
+ "status": "success",
+ "data": {
+ "resultType": "vector",
+ "result": [
+ {
+ "metric": {
+ "__name__": "ratis_leader_election_electionCount",
+ "exported_instance": "33a5ac1d-8c65-4c74-a0b8-9314dfcccb42",
+ "group": "group-03CA9397D54B",
+ "instance": "ozone_datanode_1:9882",
+ "job": "ozone"
+ },
+ "value": [
+ 1599159384.455,
+ "5"
+ ]
+ }
+ ]
+ }
+ }
+ ```
+
\ No newline at end of file
diff --git a/hadoop-hdds/framework/pom.xml b/hadoop-hdds/framework/pom.xml
index 91eb43c83465..4f9866995750 100644
--- a/hadoop-hdds/framework/pom.xml
+++ b/hadoop-hdds/framework/pom.xml
@@ -51,6 +51,22 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdds-hadoop-dependency-server</artifactId>
    </dependency>
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-util</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-server</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-servlet</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-webapp</artifactId>
+    </dependency>
    <dependency>
      <artifactId>ratis-server</artifactId>
      <groupId>org.apache.ratis</groupId>
@@ -69,10 +85,6 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
    </dependency>
-    <dependency>
-      <groupId>org.eclipse.jetty</groupId>
-      <artifactId>jetty-util</artifactId>
-    </dependency>
    <dependency>
      <groupId>org.rocksdb</groupId>
      <artifactId>rocksdbjni</artifactId>
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/conf/DatanodeRatisServerConfig.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/conf/DatanodeRatisServerConfig.java
index 19084f179cb8..8392789735f1 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/conf/DatanodeRatisServerConfig.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/conf/DatanodeRatisServerConfig.java
@@ -88,7 +88,7 @@ public void setNoLeaderTimeout(Duration duration) {
this.noLeaderTimeout = duration.toMillis();
}
- @Config(key = "rpcslowness.timeout",
+ @Config(key = "rpc.slowness.timeout",
defaultValue = "300s",
type = ConfigType.TIME,
tags = {OZONE, DATANODE, RATIS},
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/ScmBlockLocationProtocolClientSideTranslatorPB.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/ScmBlockLocationProtocolClientSideTranslatorPB.java
index e86ee81ddb86..12c51f6ca49b 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/ScmBlockLocationProtocolClientSideTranslatorPB.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/ScmBlockLocationProtocolClientSideTranslatorPB.java
@@ -26,6 +26,7 @@
import org.apache.hadoop.hdds.client.ContainerBlockID;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.protocol.proto.ScmBlockLocationProtocolProtos;
import org.apache.hadoop.hdds.protocol.proto.ScmBlockLocationProtocolProtos.SCMBlockLocationRequest;
import org.apache.hadoop.hdds.protocol.proto.ScmBlockLocationProtocolProtos.SCMBlockLocationResponse;
import org.apache.hadoop.hdds.protocol.proto.ScmBlockLocationProtocolProtos.Type;
@@ -45,10 +46,11 @@
import org.apache.hadoop.hdds.scm.exceptions.SCMException;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
import org.apache.hadoop.hdds.scm.protocol.ScmBlockLocationProtocol;
+import org.apache.hadoop.hdds.scm.proxy.SCMBlockLocationFailoverProxyProvider;
import org.apache.hadoop.hdds.tracing.TracingUtil;
+import org.apache.hadoop.io.retry.RetryProxy;
import org.apache.hadoop.ipc.ProtobufHelper;
import org.apache.hadoop.ipc.ProtocolTranslator;
-import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ozone.common.BlockGroup;
import org.apache.hadoop.ozone.common.DeleteBlockGroupResult;
@@ -73,15 +75,21 @@ public final class ScmBlockLocationProtocolClientSideTranslatorPB
private static final RpcController NULL_RPC_CONTROLLER = null;
private final ScmBlockLocationProtocolPB rpcProxy;
+ private SCMBlockLocationFailoverProxyProvider failoverProxyProvider;
/**
* Creates a new StorageContainerLocationProtocolClientSideTranslatorPB.
*
- * @param rpcProxy {@link StorageContainerLocationProtocolPB} RPC proxy
+ * @param proxyProvider {@link SCMBlockLocationFailoverProxyProvider}
+ * failover proxy provider.
*/
public ScmBlockLocationProtocolClientSideTranslatorPB(
- ScmBlockLocationProtocolPB rpcProxy) {
- this.rpcProxy = rpcProxy;
+ SCMBlockLocationFailoverProxyProvider proxyProvider) {
+ Preconditions.checkState(proxyProvider != null);
+ this.failoverProxyProvider = proxyProvider;
+ this.rpcProxy = (ScmBlockLocationProtocolPB) RetryProxy.create(
+ ScmBlockLocationProtocolPB.class, failoverProxyProvider,
+ failoverProxyProvider.getSCMBlockLocationRetryPolicy(null));
}
/**
@@ -105,6 +113,11 @@ private SCMBlockLocationResponse submitRequest(
try {
SCMBlockLocationResponse response =
rpcProxy.send(NULL_RPC_CONTROLLER, req);
+ if (response.getStatus() ==
+ ScmBlockLocationProtocolProtos.Status.SCM_NOT_LEADER) {
+ failoverProxyProvider
+ .performFailoverToAssignedLeader(response.getLeaderSCMNodeId());
+ }
return response;
} catch (ServiceException e) {
throw ProtobufHelper.getRemoteException(e);
@@ -267,7 +280,7 @@ public Object getUnderlyingProxyObject() {
}
@Override
- public void close() {
- RPC.stopProxy(rpcProxy);
+ public void close() throws IOException {
+ failoverProxyProvider.close();
}
}
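
A minimal usage sketch of the new constructor wiring (the configuration setup
and variable names here are assumptions, not part of this patch):

```java
// Assumes an OzoneConfiguration carrying the SCM addresses described above.
OzoneConfiguration conf = new OzoneConfiguration();
SCMBlockLocationFailoverProxyProvider proxyProvider =
    new SCMBlockLocationFailoverProxyProvider(conf);
// The translator now builds its retrying proxy internally via RetryProxy.
ScmBlockLocationProtocol scmBlockClient =
    new ScmBlockLocationProtocolClientSideTranslatorPB(proxyProvider);
```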
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java
index 0733940deb2c..e5ee1234e335 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java
@@ -68,7 +68,9 @@
import org.apache.hadoop.hdds.scm.container.common.helpers.ContainerWithPipeline;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
import org.apache.hadoop.hdds.scm.protocol.StorageContainerLocationProtocol;
+import org.apache.hadoop.hdds.scm.proxy.SCMContainerLocationFailoverProxyProvider;
import org.apache.hadoop.hdds.tracing.TracingUtil;
+import org.apache.hadoop.io.retry.RetryProxy;
import org.apache.hadoop.ipc.ProtobufHelper;
import org.apache.hadoop.ipc.ProtocolTranslator;
import org.apache.hadoop.ipc.RPC;
@@ -92,15 +94,20 @@ public final class StorageContainerLocationProtocolClientSideTranslatorPB
private static final RpcController NULL_RPC_CONTROLLER = null;
private final StorageContainerLocationProtocolPB rpcProxy;
+ private final SCMContainerLocationFailoverProxyProvider failoverProxyProvider;
/**
* Creates a new StorageContainerLocationProtocolClientSideTranslatorPB.
*
- * @param rpcProxy {@link StorageContainerLocationProtocolPB} RPC proxy
+ * @param proxyProvider {@link SCMContainerLocationFailoverProxyProvider}
*/
public StorageContainerLocationProtocolClientSideTranslatorPB(
- StorageContainerLocationProtocolPB rpcProxy) {
- this.rpcProxy = rpcProxy;
+ SCMContainerLocationFailoverProxyProvider proxyProvider) {
+ Preconditions.checkNotNull(proxyProvider);
+ this.failoverProxyProvider = proxyProvider;
+ this.rpcProxy = (StorageContainerLocationProtocolPB) RetryProxy.create(
+ StorageContainerLocationProtocolPB.class, failoverProxyProvider,
+ failoverProxyProvider.getSCMContainerLocationRetryPolicy(null));
}
/**
@@ -127,7 +134,13 @@ private ScmContainerLocationResponse submitRequest(
private ScmContainerLocationResponse submitRpcRequest(
ScmContainerLocationRequest wrapper) throws ServiceException {
- return rpcProxy.submitRequest(NULL_RPC_CONTROLLER, wrapper);
+ ScmContainerLocationResponse response =
+ rpcProxy.submitRequest(NULL_RPC_CONTROLLER, wrapper);
+ if (response.getStatus() ==
+ ScmContainerLocationResponse.Status.SCM_NOT_LEADER) {
+ failoverProxyProvider.performFailoverToAssignedLeader(null);
+ }
+ return response;
}
/**
@@ -444,7 +457,9 @@ public ScmInfo getScmInfo() throws IOException {
.getGetScmInfoResponse();
ScmInfo.Builder builder = new ScmInfo.Builder()
.setClusterId(resp.getClusterId())
- .setScmId(resp.getScmId());
+ .setScmId(resp.getScmId())
+ .setRatisPeerRoles(resp.getPeerRolesList());
+
return builder.build();
}
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMBlockLocationFailoverProxyProvider.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMBlockLocationFailoverProxyProvider.java
new file mode 100644
index 000000000000..bcc1a01c13c4
--- /dev/null
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMBlockLocationFailoverProxyProvider.java
@@ -0,0 +1,279 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.proxy;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
+import org.apache.hadoop.hdds.scm.protocolPB.ScmBlockLocationProtocolPB;
+import org.apache.hadoop.hdds.utils.LegacyHadoopConfigurationSource;
+import org.apache.hadoop.io.retry.FailoverProxyProvider;
+import org.apache.hadoop.io.retry.RetryPolicy;
+import org.apache.hadoop.io.retry.RetryPolicy.RetryAction;
+import org.apache.hadoop.ipc.ProtobufRpcEngine;
+import org.apache.hadoop.ipc.RPC;
+import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_NAMES;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_SERVICE_IDS_KEY;
+import static org.apache.hadoop.hdds.HddsUtils.getScmAddressForBlockClients;
+import static org.apache.hadoop.hdds.HddsUtils.getPortNumberFromConfigKeys;
+import static org.apache.hadoop.hdds.HddsUtils.getHostName;
+
+/**
+ * Failover proxy provider for SCM block location.
+ */
+public class SCMBlockLocationFailoverProxyProvider implements
+ FailoverProxyProvider<ScmBlockLocationProtocolPB>, Closeable {
+ public static final Logger LOG =
+ LoggerFactory.getLogger(SCMBlockLocationFailoverProxyProvider.class);
+
+ private Map<String, ProxyInfo<ScmBlockLocationProtocolPB>> scmProxies;
+ private Map<String, SCMProxyInfo> scmProxyInfoMap;
+ private List<String> scmNodeIDList;
+
+ private String currentProxySCMNodeId;
+ private int currentProxyIndex;
+
+ private final ConfigurationSource conf;
+ private final long scmVersion;
+
+ private final String scmServiceId;
+
+ private String lastAttemptedLeader;
+
+ private final int maxRetryCount;
+ private final long retryInterval;
+
+ public static final String SCM_DUMMY_NODEID_PREFIX = "scm";
+
+ public SCMBlockLocationFailoverProxyProvider(ConfigurationSource conf) {
+ this.conf = conf;
+ this.scmVersion = RPC.getProtocolVersion(ScmBlockLocationProtocolPB.class);
+ this.scmServiceId = conf.getTrimmed(OZONE_SCM_SERVICE_IDS_KEY);
+ this.scmProxies = new HashMap<>();
+ this.scmProxyInfoMap = new HashMap<>();
+ this.scmNodeIDList = new ArrayList<>();
+ loadConfigs();
+
+
+ this.currentProxyIndex = 0;
+ currentProxySCMNodeId = scmNodeIDList.get(currentProxyIndex);
+
+ SCMClientConfig config = conf.getObject(SCMClientConfig.class);
+ this.maxRetryCount = config.getRetryCount();
+ this.retryInterval = config.getRetryInterval();
+ }
+
+ @VisibleForTesting
+ protected Collection getSCMAddressList() {
+ Collection scmAddressList =
+ conf.getTrimmedStringCollection(OZONE_SCM_NAMES);
+ Collection resultList = new ArrayList<>();
+ if (!scmAddressList.isEmpty()) {
+ final int port = getPortNumberFromConfigKeys(conf,
+ ScmConfigKeys.OZONE_SCM_BLOCK_CLIENT_ADDRESS_KEY)
+ .orElse(ScmConfigKeys.OZONE_SCM_BLOCK_CLIENT_PORT_DEFAULT);
+ for (String scmAddress : scmAddressList) {
+ LOG.info("SCM Address for proxy is {}", scmAddress);
+
+ Optional hostname = getHostName(scmAddress);
+ if (hostname.isPresent()) {
+ resultList.add(NetUtils.createSocketAddr(
+ hostname.get() + ":" + port));
+ }
+ }
+ }
+ if (resultList.isEmpty()) {
+ // fall back
+ resultList.add(getScmAddressForBlockClients(conf));
+ }
+ return resultList;
+ }
+
+ private void loadConfigs() {
+ Collection scmAddressList = getSCMAddressList();
+ int scmNodeIndex = 1;
+ for (InetSocketAddress scmAddress : scmAddressList) {
+ String nodeId = SCM_DUMMY_NODEID_PREFIX + scmNodeIndex;
+ if (scmAddress == null) {
+ LOG.error("Failed to create SCM proxy for {}.", nodeId);
+ continue;
+ }
+ scmNodeIndex++;
+ SCMProxyInfo scmProxyInfo = new SCMProxyInfo(
+ scmServiceId, nodeId, scmAddress);
+ ProxyInfo proxy = new ProxyInfo<>(
+ null, scmProxyInfo.toString());
+ scmProxies.put(nodeId, proxy);
+ scmProxyInfoMap.put(nodeId, scmProxyInfo);
+ scmNodeIDList.add(nodeId);
+ }
+
+ if (scmProxies.isEmpty()) {
+ throw new IllegalArgumentException("Could not find any configured " +
+ "addresses for SCM. Please configure the system with "
+ + OZONE_SCM_NAMES);
+ }
+ }
+
+ @VisibleForTesting
+ public synchronized String getCurrentProxyOMNodeId() {
+ return currentProxySCMNodeId;
+ }
+
+ @Override
+ public synchronized ProxyInfo getProxy() {
+ ProxyInfo currentProxyInfo = scmProxies.get(currentProxySCMNodeId);
+ createSCMProxyIfNeeded(currentProxyInfo, currentProxySCMNodeId);
+ return currentProxyInfo;
+ }
+
+ @Override
+ public void performFailover(ScmBlockLocationProtocolPB newLeader) {
+ // Should do nothing here.
+ LOG.debug("Failing over to next proxy. {}", getCurrentProxyOMNodeId());
+ }
+
+ public void performFailoverToAssignedLeader(String newLeader) {
+ if (newLeader == null) {
+ // If newLeader is not assigned, it will fail over to next proxy.
+ nextProxyIndex();
+ } else {
+ if (!assignLeaderToNode(newLeader)) {
+ LOG.debug("Failing over OM proxy to nodeId: {}", newLeader);
+ nextProxyIndex();
+ }
+ }
+ }
+
+ @Override
+ public Class getInterface() {
+ return ScmBlockLocationProtocolPB.class;
+ }
+
+ @Override
+ public synchronized void close() throws IOException {
+ for (ProxyInfo proxy : scmProxies.values()) {
+ ScmBlockLocationProtocolPB scmProxy = proxy.proxy;
+ if (scmProxy != null) {
+ RPC.stopProxy(scmProxy);
+ }
+ }
+ }
+
+ public RetryAction getRetryAction(int failovers) {
+ if (failovers < maxRetryCount) {
+ return new RetryAction(RetryAction.RetryDecision.FAILOVER_AND_RETRY,
+ getRetryInterval());
+ } else {
+ return RetryAction.FAIL;
+ }
+ }
+
+ private synchronized long getRetryInterval() {
+ // TODO add exponential backoff
+ return retryInterval;
+ }
+
+ private synchronized int nextProxyIndex() {
+ lastAttemptedLeader = currentProxySCMNodeId;
+
+ // round robin the next proxy
+ currentProxyIndex = (currentProxyIndex + 1) % scmProxies.size();
+ currentProxySCMNodeId = scmNodeIDList.get(currentProxyIndex);
+ return currentProxyIndex;
+ }
+
+ private synchronized boolean assignLeaderToNode(String newLeaderNodeId) {
+ if (!currentProxySCMNodeId.equals(newLeaderNodeId)) {
+ if (scmProxies.containsKey(newLeaderNodeId)) {
+ lastAttemptedLeader = currentProxySCMNodeId;
+ currentProxySCMNodeId = newLeaderNodeId;
+ currentProxyIndex = scmNodeIDList.indexOf(currentProxySCMNodeId);
+ return true;
+ }
+ } else {
+ lastAttemptedLeader = currentProxySCMNodeId;
+ }
+ return false;
+ }
+
+ /**
+ * Creates proxy object if it does not already exist.
+ */
+ private void createSCMProxyIfNeeded(ProxyInfo proxyInfo,
+ String nodeId) {
+ if (proxyInfo.proxy == null) {
+ InetSocketAddress address = scmProxyInfoMap.get(nodeId).getAddress();
+ try {
+ ScmBlockLocationProtocolPB proxy = createSCMProxy(address);
+ try {
+ proxyInfo.proxy = proxy;
+ } catch (IllegalAccessError iae) {
+ scmProxies.put(nodeId,
+ new ProxyInfo<>(proxy, proxyInfo.proxyInfo));
+ }
+ } catch (IOException ioe) {
+ LOG.error("{} Failed to create RPC proxy to SCM at {}",
+ this.getClass().getSimpleName(), address, ioe);
+ throw new RuntimeException(ioe);
+ }
+ }
+ }
+
+ private ScmBlockLocationProtocolPB createSCMProxy(
+ InetSocketAddress scmAddress) throws IOException {
+ Configuration hadoopConf =
+ LegacyHadoopConfigurationSource.asHadoopConfiguration(conf);
+ RPC.setProtocolEngine(hadoopConf, ScmBlockLocationProtocolPB.class,
+ ProtobufRpcEngine.class);
+ return RPC.getProxy(ScmBlockLocationProtocolPB.class, scmVersion,
+ scmAddress, UserGroupInformation.getCurrentUser(), hadoopConf,
+ NetUtils.getDefaultSocketFactory(hadoopConf),
+ (int)conf.getObject(SCMClientConfig.class).getRpcTimeOut());
+ }
+
+ public RetryPolicy getSCMBlockLocationRetryPolicy(String newLeader) {
+ RetryPolicy retryPolicy = new RetryPolicy() {
+ @Override
+ public RetryAction shouldRetry(Exception e, int retry,
+ int failover, boolean b) {
+ performFailoverToAssignedLeader(newLeader);
+ return getRetryAction(failover);
+ }
+ };
+ return retryPolicy;
+ }
+}
+
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMClientConfig.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMClientConfig.java
new file mode 100644
index 000000000000..99dc4461f00b
--- /dev/null
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMClientConfig.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.hadoop.hdds.scm.proxy;
+
+import org.apache.hadoop.hdds.conf.Config;
+import org.apache.hadoop.hdds.conf.ConfigGroup;
+import org.apache.hadoop.hdds.conf.ConfigType;
+
+import java.util.concurrent.TimeUnit;
+
+import static org.apache.hadoop.hdds.conf.ConfigTag.CLIENT;
+import static org.apache.hadoop.hdds.conf.ConfigTag.OZONE;
+import static org.apache.hadoop.hdds.conf.ConfigTag.SCM;
+
+/**
+ * Config for SCM Block Client.
+ */
+@ConfigGroup(prefix = "hdds.scmclient")
+public class SCMClientConfig {
+ public static final String SCM_CLIENT_RPC_TIME_OUT = "rpc.timeout";
+ public static final String SCM_CLIENT_FAILOVER_MAX_RETRY =
+ "failover.max.retry";
+ public static final String SCM_CLIENT_RETRY_INTERVAL =
+ "failover.retry.interval";
+
+ @Config(key = SCM_CLIENT_RPC_TIME_OUT,
+ defaultValue = "15m",
+ type = ConfigType.TIME,
+ tags = {OZONE, SCM, CLIENT},
+ timeUnit = TimeUnit.MILLISECONDS,
+ description = "RpcClient timeout on waiting for the response from " +
+ "SCM. The default value is set to 15 minutes. " +
+ "If ipc.client.ping is set to true and this rpc-timeout " +
+ "is greater than the value of ipc.ping.interval, the effective " +
+ "value of the rpc-timeout is rounded up to multiple of " +
+ "ipc.ping.interval."
+ )
+ private long rpcTimeOut = 15 * 60 * 1000;
+
+ @Config(key = SCM_CLIENT_FAILOVER_MAX_RETRY,
+ defaultValue = "15",
+ type = ConfigType.INT,
+ tags = {OZONE, SCM, CLIENT},
+ description = "Max retry count for SCM Client when failover happens."
+ )
+ private int retryCount = 15;
+
+ @Config(key = SCM_CLIENT_RETRY_INTERVAL,
+ defaultValue = "2s",
+ type = ConfigType.TIME,
+ tags = {OZONE, SCM, CLIENT},
+ timeUnit = TimeUnit.MILLISECONDS,
+ description = "SCM Client timeout on waiting for the next connection " +
+ "retry to other SCM IP. The default value is set to 2 minutes. "
+ )
+ private long retryInterval = 2 * 1000;
+
+ public long getRpcTimeOut() {
+ return rpcTimeOut;
+ }
+
+ public void setRpcTimeOut(long timeOut) {
+ // This value must not exceed Integer.MAX_VALUE, because the underlying
+ // RPC layer's SocketTimeout parameter is an int.
+ if (timeOut > Integer.MAX_VALUE) {
+ timeOut = Integer.MAX_VALUE;
+ }
+ this.rpcTimeOut = timeOut;
+ }
+
+ public int getRetryCount() {
+ return retryCount;
+ }
+
+ public void setRetryCount(int retryCount) {
+ this.retryCount = retryCount;
+ }
+
+ public long getRetryInterval() {
+ return retryInterval;
+ }
+
+ public void setRetryInterval(long retryInterval) {
+ this.retryInterval = retryInterval;
+ }
+}
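
A hedged sketch of how this config group is consumed (the OzoneConfiguration
setup is an assumption; `conf.getObject` is the loader used elsewhere in this
patch). With the `hdds.scmclient` prefix, the keys expand to
`hdds.scmclient.rpc.timeout`, `hdds.scmclient.failover.max.retry` and
`hdds.scmclient.failover.retry.interval`.

```java
OzoneConfiguration conf = new OzoneConfiguration();
SCMClientConfig scmClientConfig = conf.getObject(SCMClientConfig.class);
long rpcTimeoutMs = scmClientConfig.getRpcTimeOut();       // default: 15 minutes
int maxFailoverRetries = scmClientConfig.getRetryCount();  // default: 15
long retryIntervalMs = scmClientConfig.getRetryInterval(); // default: 2 seconds
```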
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMContainerLocationFailoverProxyProvider.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMContainerLocationFailoverProxyProvider.java
new file mode 100644
index 000000000000..a04a66f4f278
--- /dev/null
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMContainerLocationFailoverProxyProvider.java
@@ -0,0 +1,284 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.proxy;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
+import org.apache.hadoop.hdds.scm.protocolPB.StorageContainerLocationProtocolPB;
+import org.apache.hadoop.hdds.utils.LegacyHadoopConfigurationSource;
+import org.apache.hadoop.io.retry.FailoverProxyProvider;
+import org.apache.hadoop.io.retry.RetryPolicy;
+import org.apache.hadoop.ipc.ProtobufRpcEngine;
+import org.apache.hadoop.ipc.RPC;
+import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+
+import static org.apache.hadoop.hdds.HddsUtils.getHostName;
+import static org.apache.hadoop.hdds.HddsUtils.getPortNumberFromConfigKeys;
+import static org.apache.hadoop.hdds.HddsUtils.getScmAddressForClients;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_NAMES;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_SERVICE_IDS_KEY;
+
+/**
+ * Failover proxy provider for SCM container location.
+ */
+public class SCMContainerLocationFailoverProxyProvider implements
+ FailoverProxyProvider<StorageContainerLocationProtocolPB>, Closeable {
+ public static final Logger LOG =
+ LoggerFactory.getLogger(SCMContainerLocationFailoverProxyProvider.class);
+
+ private Map<String, ProxyInfo<StorageContainerLocationProtocolPB>> scmProxies;
+ private Map<String, SCMProxyInfo> scmProxyInfoMap;
+ private List<String> scmNodeIDList;
+
+ private String currentProxySCMNodeId;
+ private int currentProxyIndex;
+
+ private final ConfigurationSource conf;
+ private final SCMClientConfig scmClientConfig;
+ private final long scmVersion;
+
+ private final String scmServiceId;
+
+ private final int maxRetryCount;
+ private final long retryInterval;
+
+ public static final String SCM_DUMMY_NODEID_PREFIX = "scm";
+
+ public SCMContainerLocationFailoverProxyProvider(ConfigurationSource conf) {
+ this.conf = conf;
+ this.scmVersion = RPC.getProtocolVersion(
+ StorageContainerLocationProtocolPB.class);
+ this.scmServiceId = conf.getTrimmed(OZONE_SCM_SERVICE_IDS_KEY);
+ this.scmProxies = new HashMap<>();
+ this.scmProxyInfoMap = new HashMap<>();
+ this.scmNodeIDList = new ArrayList<>();
+ loadConfigs();
+
+ this.currentProxyIndex = 0;
+ currentProxySCMNodeId = scmNodeIDList.get(currentProxyIndex);
+ scmClientConfig = conf.getObject(SCMClientConfig.class);
+ this.maxRetryCount = scmClientConfig.getRetryCount();
+ this.retryInterval = scmClientConfig.getRetryInterval();
+ }
+
+ @VisibleForTesting
+ protected Collection getSCMAddressList() {
+ Collection scmAddressList =
+ conf.getTrimmedStringCollection(OZONE_SCM_NAMES);
+ Collection resultList = new ArrayList<>();
+ if (!scmAddressList.isEmpty()) {
+ final int port = getPortNumberFromConfigKeys(conf,
+ ScmConfigKeys.OZONE_SCM_CLIENT_ADDRESS_KEY)
+ .orElse(ScmConfigKeys.OZONE_SCM_CLIENT_PORT_DEFAULT);
+ for (String scmAddress : scmAddressList) {
+ LOG.debug("SCM Address for proxy is {}", scmAddress);
+
+ Optional hostname = getHostName(scmAddress);
+ if (hostname.isPresent()) {
+ resultList.add(NetUtils.createSocketAddr(
+ hostname.get() + ":" + port));
+ }
+ }
+ }
+ if (resultList.isEmpty()) {
+ // fall back
+ resultList.add(getScmAddressForClients(conf));
+ }
+ return resultList;
+ }
+
+ private void loadConfigs() {
+ Collection scmAddressList = getSCMAddressList();
+ int scmNodeIndex = 1;
+ for (InetSocketAddress scmAddress : scmAddressList) {
+ String nodeId = SCM_DUMMY_NODEID_PREFIX + scmNodeIndex;
+ if (scmAddress == null) {
+ LOG.error("Failed to create SCM proxy for {}.", nodeId);
+ continue;
+ }
+ scmNodeIndex++;
+ SCMProxyInfo scmProxyInfo = new SCMProxyInfo(
+ scmServiceId, nodeId, scmAddress);
+ ProxyInfo proxy
+ = new ProxyInfo<>(null, scmProxyInfo.toString());
+ scmProxies.put(nodeId, proxy);
+ scmProxyInfoMap.put(nodeId, scmProxyInfo);
+ scmNodeIDList.add(nodeId);
+ }
+
+ if (scmProxies.isEmpty()) {
+ throw new IllegalArgumentException("Could not find any configured " +
+ "addresses for SCM. Please configure the system with "
+ + OZONE_SCM_NAMES);
+ }
+ }
+
+ @VisibleForTesting
+ public synchronized String getCurrentProxyOMNodeId() {
+ return currentProxySCMNodeId;
+ }
+
+ @Override
+ public synchronized ProxyInfo getProxy() {
+ ProxyInfo currentProxyInfo = scmProxies.get(currentProxySCMNodeId);
+ createSCMProxyIfNeeded(currentProxyInfo, currentProxySCMNodeId);
+ return currentProxyInfo;
+ }
+
+ @Override
+ public void performFailover(
+ StorageContainerLocationProtocolPB newLeader) {
+ // Should do nothing here.
+ LOG.debug("Failing over to next proxy. {}", getCurrentProxyOMNodeId());
+ }
+
+ public void performFailoverToAssignedLeader(String newLeader) {
+ if (newLeader == null) {
+ // If newLeader is not assigned, it will fail over to next proxy.
+ nextProxyIndex();
+ } else {
+ if (!assignLeaderToNode(newLeader)) {
+ LOG.debug("Failing over OM proxy to nodeId: {}", newLeader);
+ nextProxyIndex();
+ }
+ }
+ }
+
+ @Override
+ public Class<
+ StorageContainerLocationProtocolPB> getInterface() {
+ return StorageContainerLocationProtocolPB.class;
+ }
+
+ @Override
+ public synchronized void close() throws IOException {
+ for (ProxyInfo
+ proxy : scmProxies.values()) {
+ StorageContainerLocationProtocolPB scmProxy =
+ proxy.proxy;
+ if (scmProxy != null) {
+ RPC.stopProxy(scmProxy);
+ }
+ }
+ }
+
+ public RetryPolicy.RetryAction getRetryAction(int failovers) {
+ if (failovers < maxRetryCount) {
+ return new RetryPolicy.RetryAction(
+ RetryPolicy.RetryAction.RetryDecision.FAILOVER_AND_RETRY,
+ getRetryInterval());
+ } else {
+ return RetryPolicy.RetryAction.FAIL;
+ }
+ }
+
+ private synchronized long getRetryInterval() {
+ // TODO add exponential backoff
+ return retryInterval;
+ }
+
+ private synchronized int nextProxyIndex() {
+
+ // round robin the next proxy
+ currentProxyIndex = (currentProxyIndex + 1) % scmProxies.size();
+ currentProxySCMNodeId = scmNodeIDList.get(currentProxyIndex);
+ return currentProxyIndex;
+ }
+
+ synchronized boolean assignLeaderToNode(String newLeaderNodeId) {
+ if (!currentProxySCMNodeId.equals(newLeaderNodeId)) {
+ if (scmProxies.containsKey(newLeaderNodeId)) {
+ currentProxySCMNodeId = newLeaderNodeId;
+ currentProxyIndex = scmNodeIDList.indexOf(currentProxySCMNodeId);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Creates proxy object if it does not already exist.
+ */
+ private void createSCMProxyIfNeeded(ProxyInfo proxyInfo,
+ String nodeId) {
+ if (proxyInfo.proxy == null) {
+ InetSocketAddress address = scmProxyInfoMap.get(nodeId).getAddress();
+ try {
+ StorageContainerLocationProtocolPB proxy =
+ createSCMProxy(address);
+ try {
+ proxyInfo.proxy = proxy;
+ } catch (IllegalAccessError iae) {
+ scmProxies.put(nodeId,
+ new ProxyInfo<>(proxy, proxyInfo.proxyInfo));
+ }
+ } catch (IOException ioe) {
+ LOG.error("{} Failed to create RPC proxy to SCM at {}",
+ this.getClass().getSimpleName(), address, ioe);
+ throw new RuntimeException(ioe);
+ }
+ }
+ }
+
+ private StorageContainerLocationProtocolPB createSCMProxy(
+ InetSocketAddress scmAddress) throws IOException {
+ Configuration hadoopConf =
+ LegacyHadoopConfigurationSource.asHadoopConfiguration(conf);
+ RPC.setProtocolEngine(hadoopConf, StorageContainerLocationProtocolPB.class,
+ ProtobufRpcEngine.class);
+ return RPC.getProxy(
+ StorageContainerLocationProtocolPB.class,
+ scmVersion, scmAddress, UserGroupInformation.getCurrentUser(),
+ hadoopConf, NetUtils.getDefaultSocketFactory(hadoopConf),
+ (int)scmClientConfig.getRpcTimeOut());
+ }
+
+ public RetryPolicy getSCMContainerLocationRetryPolicy(
+ String suggestedLeader) {
+ RetryPolicy retryPolicy = new RetryPolicy() {
+ @Override
+ public RetryAction shouldRetry(Exception e, int retry,
+ int failover, boolean b) {
+ performFailoverToAssignedLeader(suggestedLeader);
+ return getRetryAction(failover);
+ }
+ };
+ return retryPolicy;
+ }
+}
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMProxyInfo.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMProxyInfo.java
new file mode 100644
index 000000000000..ec2a5b01ce34
--- /dev/null
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMProxyInfo.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.proxy;
+
+import com.google.common.base.Preconditions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.InetSocketAddress;
+
+/**
+ * Class to store SCM proxy info.
+ */
+public class SCMProxyInfo {
+ private String serviceId;
+ private String nodeId;
+ private String rpcAddrStr;
+ private InetSocketAddress rpcAddr;
+
+ private static final Logger LOG =
+ LoggerFactory.getLogger(SCMProxyInfo.class);
+
+ public SCMProxyInfo(String serviceID, String nodeID,
+ InetSocketAddress rpcAddress) {
+ Preconditions.checkNotNull(rpcAddress);
+ this.serviceId = serviceID;
+ this.nodeId = nodeID;
+ this.rpcAddrStr = rpcAddress.toString();
+ this.rpcAddr = rpcAddress;
+ if (rpcAddr.isUnresolved()) {
+ LOG.warn("SCM address {} for serviceID {} remains unresolved " +
+ "for node ID {} Check your ozone-site.xml file to ensure scm " +
+ "addresses are configured properly.",
+ rpcAddress, serviceId, nodeId);
+ }
+ }
+
+ public String toString() {
+ return new StringBuilder()
+ .append("nodeId=")
+ .append(nodeId)
+ .append(",nodeAddress=")
+ .append(rpcAddrStr).toString();
+ }
+
+ public InetSocketAddress getAddress() {
+ return rpcAddr;
+ }
+
+ public String getServiceId() {
+ return serviceId;
+ }
+
+ public String getNodeId() {
+ return nodeId;
+ }
+}
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/package-info.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/package-info.java
new file mode 100644
index 000000000000..e3bb05895e99
--- /dev/null
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/package-info.java
@@ -0,0 +1,22 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdds.scm.proxy;
+
+/**
+ * This package contains classes related to scm proxy.
+ */
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/DBStoreBuilder.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/DBStoreBuilder.java
index 5b907afd9f82..ad48a19927a7 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/DBStoreBuilder.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/DBStoreBuilder.java
@@ -87,6 +87,8 @@ public final class DBStoreBuilder {
private CodecRegistry registry;
private String rocksDbStat;
private RocksDBConfiguration rocksDBConfiguration;
+ // Flag to indicate if the RocksDB should be opened readonly.
+ private boolean openReadOnly = false;
/**
* Create DBStoreBuilder from a generic DBDefinition.
@@ -187,7 +189,7 @@ public DBStore build() throws IOException {
}
return new RDBStore(dbFile, rocksDBOption, writeOptions, tableConfigs,
- registry);
+ registry, openReadOnly);
}
public DBStoreBuilder setName(String name) {
@@ -227,6 +229,11 @@ public DBStoreBuilder setPath(Path path) {
return this;
}
+ public DBStoreBuilder setOpenReadOnly(boolean readOnly) {
+ this.openReadOnly = readOnly;
+ return this;
+ }
+
/**
* Set the {@link DBOptions} and default {@link ColumnFamilyOptions} based
* on {@code prof}.
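
A hedged sketch of the new read-only flag in use (the store name, path, and
the `newBuilder(conf)` factory call are assumptions, not introduced by this
patch):

```java
OzoneConfiguration conf = new OzoneConfiguration();
DBStore store = DBStoreBuilder.newBuilder(conf)
    .setName("scm.db")
    .setPath(Paths.get("/tmp/scm-metadata"))
    .setOpenReadOnly(true)   // opens RocksDB via RocksDB.openReadOnly
    .build();
```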
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/RDBStore.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/RDBStore.java
index 0890a81d8fb8..adbd2eb39ead 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/RDBStore.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/RDBStore.java
@@ -71,12 +71,13 @@ public class RDBStore implements DBStore {
@VisibleForTesting
public RDBStore(File dbFile, DBOptions options,
Set families) throws IOException {
- this(dbFile, options, new WriteOptions(), families, new CodecRegistry());
+ this(dbFile, options, new WriteOptions(), families, new CodecRegistry(),
+ false);
}
public RDBStore(File dbFile, DBOptions options,
WriteOptions writeOptions, Set families,
- CodecRegistry registry)
+ CodecRegistry registry, boolean readOnly)
throws IOException {
Preconditions.checkNotNull(dbFile, "DB file location cannot be null");
Preconditions.checkNotNull(families);
@@ -108,8 +109,13 @@ public RDBStore(File dbFile, DBOptions options,
extraCf.forEach(cf -> columnFamilyDescriptors.add(cf.getDescriptor()));
}
- db = RocksDB.open(dbOptions, dbLocation.getAbsolutePath(),
- columnFamilyDescriptors, columnFamilyHandles);
+ if (readOnly) {
+ db = RocksDB.openReadOnly(dbOptions, dbLocation.getAbsolutePath(),
+ columnFamilyDescriptors, columnFamilyHandles);
+ } else {
+ db = RocksDB.open(dbOptions, dbLocation.getAbsolutePath(),
+ columnFamilyDescriptors, columnFamilyHandles);
+ }
for (int x = 0; x < columnFamilyHandles.size(); x++) {
handleTable.put(
diff --git a/hadoop-hdds/hadoop-dependency-server/pom.xml b/hadoop-hdds/hadoop-dependency-server/pom.xml
index 06f0f87da478..642898885ae2 100644
--- a/hadoop-hdds/hadoop-dependency-server/pom.xml
+++ b/hadoop-hdds/hadoop-dependency-server/pom.xml
@@ -83,6 +83,12 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
hadoop-hdfs${hadoop.version}compile
+
+
+ io.netty
+ *
+
+
diff --git a/hadoop-hdds/hadoop-dependency-test/pom.xml b/hadoop-hdds/hadoop-dependency-test/pom.xml
index 0dcbcc4fcaf0..c45421e95cd5 100644
--- a/hadoop-hdds/hadoop-dependency-test/pom.xml
+++ b/hadoop-hdds/hadoop-dependency-test/pom.xml
@@ -35,12 +35,24 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
hadoop-common${hadoop.version}test-jar
+
+
+ *
+ *
+
+ org.apache.hadoophadoop-hdfs${hadoop.version}test-jar
+
+
+ *
+ *
+
+
diff --git a/hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto b/hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto
index 91dbebe33b88..739377551fea 100644
--- a/hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto
+++ b/hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto
@@ -100,6 +100,7 @@ message ScmContainerLocationResponse {
OK = 1;
CONTAINER_ALREADY_EXISTS = 2;
CONTAINER_IS_MISSING = 3;
+ SCM_NOT_LEADER = 4;
}
}
@@ -147,6 +148,7 @@ message ContainerResponseProto {
success = 1;
errorContainerAlreadyExists = 2;
errorContainerMissing = 3;
+ scmNotLeader = 4;
}
required Error errorCode = 1;
required ContainerWithPipeline containerWithPipeline = 2;
diff --git a/hadoop-hdds/interface-client/src/main/proto/hdds.proto b/hadoop-hdds/interface-client/src/main/proto/hdds.proto
index b43a74cd0679..f0c9b37a6758 100644
--- a/hadoop-hdds/interface-client/src/main/proto/hdds.proto
+++ b/hadoop-hdds/interface-client/src/main/proto/hdds.proto
@@ -83,6 +83,10 @@ message PipelineID {
optional UUID uuid128 = 100;
}
+message ContainerID {
+ required uint64 id = 1;
+}
+
enum PipelineState {
PIPELINE_ALLOCATED = 1;
PIPELINE_OPEN = 2;
@@ -181,6 +185,7 @@ enum LifeCycleEvent {
}
message ContainerInfoProto {
+ // Replace int64 with ContainerID message
required int64 containerID = 1;
required LifeCycleState state = 2;
optional PipelineID pipelineID = 3;
@@ -206,6 +211,7 @@ message GetScmInfoRequestProto {
message GetScmInfoResponseProto {
required string clusterId = 1;
required string scmId = 2;
+ repeated string peerRoles = 3;
}
@@ -236,6 +242,7 @@ enum ScmOps {
message ExcludeListProto {
repeated string datanodes = 1;
+ // Replace int64 with ContainerID message
repeated int64 containerIds = 2;
repeated PipelineID pipelineIds = 3;
}
@@ -244,6 +251,7 @@ message ExcludeListProto {
* Block ID that uniquely identify a block by SCM.
*/
message ContainerBlockID {
+ // Replace int64 with ContainerID message
required int64 containerID = 1;
required int64 localID = 2;
}
diff --git a/hadoop-hdds/interface-server/src/main/proto/SCMRatisProtocol.proto b/hadoop-hdds/interface-server/src/main/proto/SCMRatisProtocol.proto
new file mode 100644
index 000000000000..1107016fcd09
--- /dev/null
+++ b/hadoop-hdds/interface-server/src/main/proto/SCMRatisProtocol.proto
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+option java_package = "org.apache.hadoop.hdds.protocol.proto";
+option java_outer_classname = "SCMRatisProtocol";
+option java_generate_equals_and_hash = true;
+
+enum RequestType {
+ PIPELINE = 1;
+ CONTAINER = 2;
+}
+
+message Method {
+ required string name = 1;
+ repeated MethodArgument args = 2;
+}
+
+message MethodArgument {
+ required string type = 1;
+ required bytes value = 2;
+}
+
+message SCMRatisRequestProto {
+ required RequestType type = 1;
+ required Method method = 2;
+}
+
+message SCMRatisResponseProto {
+ required string type = 2;
+ required bytes value = 3;
+}
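
A hedged sketch of building a request from these messages with the generated
Java classes (imports from org.apache.hadoop.hdds.protocol.proto and
com.google.protobuf are assumed; the method name and argument encoding are
purely illustrative):

```java
SCMRatisProtocol.SCMRatisRequestProto request =
    SCMRatisProtocol.SCMRatisRequestProto.newBuilder()
        .setType(SCMRatisProtocol.RequestType.PIPELINE)
        .setMethod(SCMRatisProtocol.Method.newBuilder()
            .setName("createPipeline")            // hypothetical method name
            .addArgs(SCMRatisProtocol.MethodArgument.newBuilder()
                .setType(String.class.getName())
                .setValue(ByteString.copyFromUtf8("RATIS/THREE"))
                .build())
            .build())
        .build();
```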
diff --git a/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto b/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto
index 4f610ff24b1a..973789a35369 100644
--- a/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto
+++ b/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto
@@ -303,6 +303,12 @@ message SCMCommandProto {
optional ReplicateContainerCommandProto replicateContainerCommandProto = 6;
optional CreatePipelineCommandProto createPipelineCommandProto = 7;
optional ClosePipelineCommandProto closePipelineCommandProto = 8;
+
+ // In HA mode, holds the term of the underlying RaftServer if the current
+ // SCM is the leader; otherwise it holds term 0.
+ // Note that the first elected leader starts at term 1, so term 0 (the
+ // initial value of currentTerm) is never used in HA mode.
+ optional uint64 term = 15;
}
/**
diff --git a/hadoop-hdds/interface-server/src/main/proto/ScmServerProtocol.proto b/hadoop-hdds/interface-server/src/main/proto/ScmServerProtocol.proto
index 7d59bd72ef4c..bc5193f9a4b8 100644
--- a/hadoop-hdds/interface-server/src/main/proto/ScmServerProtocol.proto
+++ b/hadoop-hdds/interface-server/src/main/proto/ScmServerProtocol.proto
@@ -70,6 +70,8 @@ message SCMBlockLocationResponse {
optional string leaderOMNodeId = 6;
+ optional string leaderSCMNodeId = 7;
+
optional AllocateScmBlockResponseProto allocateScmBlockResponse = 11;
optional DeleteScmKeyBlocksResponseProto deleteScmKeyBlocksResponse = 12;
optional hadoop.hdds.GetScmInfoResponseProto getScmInfoResponse = 13;
@@ -116,6 +118,7 @@ enum Status {
INTERNAL_ERROR = 29;
FAILED_TO_INIT_PIPELINE_CHOOSE_POLICY = 30;
FAILED_TO_INIT_LEADER_CHOOSE_POLICY = 31;
+ SCM_NOT_LEADER = 32;
}
/**
diff --git a/hadoop-hdds/pom.xml b/hadoop-hdds/pom.xml
index 6ebccf7bc8cf..f2f6c7d444fa 100644
--- a/hadoop-hdds/pom.xml
+++ b/hadoop-hdds/pom.xml
@@ -160,6 +160,7 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
hadoop-hdds-common${hdds.version}test-jar
+ test
@@ -167,6 +168,7 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
hadoop-hdds-container-service${hdds.version}test-jar
+ test
@@ -174,6 +176,7 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
hadoop-hdds-server-scmtest-jar${hdds.version}
+ test
diff --git a/hadoop-hdds/server-scm/dev-support/findbugsExcludeFile.xml b/hadoop-hdds/server-scm/dev-support/findbugsExcludeFile.xml
new file mode 100644
index 000000000000..3571a8929e3f
--- /dev/null
+++ b/hadoop-hdds/server-scm/dev-support/findbugsExcludeFile.xml
@@ -0,0 +1,21 @@
+
+
+
+
+
+
diff --git a/hadoop-hdds/server-scm/pom.xml b/hadoop-hdds/server-scm/pom.xml
index c007ef1ec214..5c2aacce38b8 100644
--- a/hadoop-hdds/server-scm/pom.xml
+++ b/hadoop-hdds/server-scm/pom.xml
@@ -90,6 +90,12 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
testtest-jar
+
+ org.apache.hadoop
+ hadoop-hdds-common
+ test-jar
+ test
+ org.hamcrest
@@ -129,6 +135,11 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
hadoop-hdds-hadoop-dependency-testtest
+
+ com.google.protobuf
+ protobuf-java
+ compile
+
@@ -164,6 +175,13 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+ com.github.spotbugs
+ spotbugs-maven-plugin
+
+ ${basedir}/dev-support/findbugsExcludeFile.xml
+
+
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ScmUtils.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ScmUtils.java
index 426341a32f40..bb48654e8d53 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ScmUtils.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ScmUtils.java
@@ -25,6 +25,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.File;
+
/**
* SCM utility class.
*/
@@ -48,4 +50,14 @@ public static void preCheck(ScmOps operation, Precheck... preChecks)
}
}
+ /**
+ * Create SCM directory file based on given path.
+ */
+ public static File createSCMDir(String dirPath) {
+ File dirFile = new File(dirPath);
+ if (!dirFile.mkdirs() && !dirFile.exists()) {
+ throw new IllegalArgumentException("Unable to create path: " + dirFile);
+ }
+ return dirFile;
+ }
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/BlockManagerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/BlockManagerImpl.java
index 014c76c6e64c..8c2323738c6a 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/BlockManagerImpl.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/BlockManagerImpl.java
@@ -58,6 +58,8 @@
import static org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes.INVALID_BLOCK_SIZE;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT_DEFAULT;
+
+import org.apache.ratis.protocol.exceptions.NotLeaderException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -255,7 +257,8 @@ public AllocatedBlock allocateBlock(final long size, ReplicationType type,
* @param containerInfo - Container Info.
* @return AllocatedBlock
*/
- private AllocatedBlock newBlock(ContainerInfo containerInfo) {
+ private AllocatedBlock newBlock(ContainerInfo containerInfo)
+ throws NotLeaderException {
try {
final Pipeline pipeline = pipelineManager
.getPipeline(containerInfo.getPipelineID());
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java
index aa554808f499..2fe558f55a6f 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java
@@ -192,7 +192,7 @@ public void commitTransactions(
long txID = transactionResult.getTxID();
// set of dns which have successfully committed transaction txId.
dnsWithCommittedTxn = transactionToDNsCommitMap.get(txID);
- final ContainerID containerId = ContainerID.valueof(
+ final ContainerID containerId = ContainerID.valueOf(
transactionResult.getContainerID());
if (dnsWithCommittedTxn == null) {
// Mostly likely it's a retried delete command response.
@@ -326,7 +326,7 @@ private void getTransaction(DeletedBlocksTransaction tx,
DatanodeDeletedBlockTransactions transactions) {
try {
Set replicas = containerManager
- .getContainerReplicas(ContainerID.valueof(tx.getContainerID()));
+ .getContainerReplicas(ContainerID.valueOf(tx.getContainerID()));
for (ContainerReplica replica : replicas) {
UUID dnID = replica.getDatanodeDetails().getUuid();
Set dnsWithTransactionCommitted =
@@ -358,7 +358,7 @@ public DatanodeDeletedBlockTransactions getTransactions(
Table.KeyValue keyValue =
iter.next();
DeletedBlocksTransaction txn = keyValue.getValue();
- final ContainerID id = ContainerID.valueof(txn.getContainerID());
+ final ContainerID id = ContainerID.valueOf(txn.getContainerID());
if (txn.getCount() > -1 && txn.getCount() <= maxRetry
&& !containerManager.getContainer(id).isOpen()) {
numBlocksAdded += txn.getLocalIDCount();
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java
index 2d91bd60adad..fbf56543aeef 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java
@@ -141,7 +141,7 @@ public EmptyTaskResult call() throws Exception {
// We should stop caching new commands if num of un-processed
// command is bigger than a limit, e.g 50. In case datanode goes
// offline for sometime, the cached commands be flooded.
- eventPublisher.fireEvent(SCMEvents.RETRIABLE_DATANODE_COMMAND,
+ eventPublisher.fireEvent(SCMEvents.DATANODE_COMMAND,
new CommandForDatanode<>(dnId,
new DeleteBlocksCommand(dnTXs)));
if (LOG.isDebugEnabled()) {
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
index 1b190a22da1b..02dc3f50aeff 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
@@ -75,7 +75,7 @@ protected void processContainerReplica(final DatanodeDetails datanodeDetails,
final ContainerReplicaProto replicaProto, final EventPublisher publisher)
throws IOException {
final ContainerID containerId = ContainerID
- .valueof(replicaProto.getContainerID());
+ .valueOf(replicaProto.getContainerID());
if (logger.isDebugEnabled()) {
logger.debug("Processing replica of container {} from datanode {}",
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/CloseContainerEventHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/CloseContainerEventHandler.java
index fd73711003bf..da221934bff0 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/CloseContainerEventHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/CloseContainerEventHandler.java
@@ -29,6 +29,7 @@
import org.apache.hadoop.hdds.server.events.EventPublisher;
import org.apache.hadoop.ozone.protocol.commands.CloseContainerCommand;
import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode;
+import org.apache.ratis.protocol.exceptions.NotLeaderException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -98,7 +99,7 @@ public void onMessage(ContainerID containerID, EventPublisher publisher) {
* @throws ContainerNotFoundException
*/
private List getNodes(final ContainerInfo container)
- throws ContainerNotFoundException {
+ throws ContainerNotFoundException, NotLeaderException {
try {
return pipelineManager.getPipeline(container.getPipelineID()).getNodes();
} catch (PipelineNotFoundException ex) {
@@ -109,5 +110,4 @@ private List getNodes(final ContainerInfo container)
.collect(Collectors.toList());
}
}
-
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerActionsHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerActionsHandler.java
index e79f268974cf..3d53e292172c 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerActionsHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerActionsHandler.java
@@ -45,7 +45,7 @@ public void onMessage(
DatanodeDetails dd = containerReportFromDatanode.getDatanodeDetails();
for (ContainerAction action : containerReportFromDatanode.getReport()
.getContainerActionsList()) {
- ContainerID containerId = ContainerID.valueof(action.getContainerID());
+ ContainerID containerId = ContainerID.valueOf(action.getContainerID());
switch (action.getAction()) {
case CLOSE:
if (LOG.isDebugEnabled()) {
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java
new file mode 100644
index 000000000000..3477eea2c758
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java
@@ -0,0 +1,286 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.hdds.scm.container;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.stream.Collectors;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ContainerInfoProto;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
+import org.apache.hadoop.hdds.scm.ha.SCMHAManager;
+import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
+import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
+import org.apache.hadoop.hdds.utils.UniqueId;
+import org.apache.hadoop.hdds.utils.db.Table;
+import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
+import org.apache.hadoop.util.Time;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * {@link ContainerManagerV2} implementation that delegates container state
+ * to {@link ContainerStateManagerImpl}.
+ */
+public class ContainerManagerImpl implements ContainerManagerV2 {
+
+ /*
+ * TODO: Introduce container level locks.
+ */
+
+ /**
+ * Logger for this class.
+ */
+ private static final Logger LOG = LoggerFactory.getLogger(
+ ContainerManagerImpl.class);
+
+ /**
+ * Lock protecting reads and writes of container state.
+ */
+ //Can we move this lock to ContainerStateManager?
+ private final ReadWriteLock lock;
+
+ /**
+ * Pipeline manager used to resolve pipelines for containers.
+ */
+ private final PipelineManager pipelineManager;
+
+ /**
+ * State manager tracking containers and their replicas.
+ */
+ private final ContainerStateManagerV2 containerStateManager;
+
+ /**
+ * Constructs a ContainerManagerImpl backed by the given container store.
+ */
+ public ContainerManagerImpl(
+ final Configuration conf,
+ final SCMHAManager scmHaManager,
+ final PipelineManager pipelineManager,
+ final Table containerStore)
+ throws IOException {
+ // Introduce builder for this class?
+ this.lock = new ReentrantReadWriteLock();
+ this.pipelineManager = pipelineManager;
+ this.containerStateManager = ContainerStateManagerImpl.newBuilder()
+ .setConfiguration(conf)
+ .setPipelineManager(pipelineManager)
+ .setRatisServer(scmHaManager.getRatisServer())
+ .setContainerStore(containerStore)
+ .build();
+ }
+
+ @Override
+ public ContainerInfo getContainer(final ContainerID id)
+ throws ContainerNotFoundException {
+ lock.readLock().lock();
+ try {
+ return Optional.ofNullable(containerStateManager
+ .getContainer(id.getProtobuf()))
+ .orElseThrow(() -> new ContainerNotFoundException("ID " + id));
+ } finally {
+ lock.readLock().unlock();
+ }
+ }
+
+ @Override
+ public List listContainers(final ContainerID startID,
+ final int count) {
+ lock.readLock().lock();
+ try {
+ final long start = startID == null ? 0 : startID.getId();
+ final List containersIds =
+ new ArrayList<>(containerStateManager.getContainerIDs());
+ Collections.sort(containersIds);
+ return containersIds.stream()
+ .filter(id -> id.getId() > start).limit(count)
+ .map(ContainerID::getProtobuf)
+ .map(containerStateManager::getContainer)
+ .collect(Collectors.toList());
+ } finally {
+ lock.readLock().unlock();
+ }
+ }
+
+ @Override
+ public List listContainers(final LifeCycleState state) {
+ lock.readLock().lock();
+ try {
+ return containerStateManager.getContainerIDs(state).stream()
+ .map(ContainerID::getProtobuf)
+ .map(containerStateManager::getContainer)
+ .filter(Objects::nonNull).collect(Collectors.toList());
+ } finally {
+ lock.readLock().unlock();
+ }
+ }
+
+ @Override
+ public ContainerInfo allocateContainer(final ReplicationType type,
+ final ReplicationFactor replicationFactor, final String owner)
+ throws IOException {
+ lock.writeLock().lock();
+ try {
+ final List<Pipeline> pipelines = pipelineManager
+ .getPipelines(type, replicationFactor, Pipeline.PipelineState.OPEN);
+
+ if (pipelines.isEmpty()) {
+ throw new IOException("Could not allocate container. Cannot get any" +
+ " matching pipeline for Type:" + type + ", Factor:" +
+ replicationFactor + ", State:PipelineState.OPEN");
+ }
+
+ // TODO: Replace this with Distributed unique id generator.
+ final ContainerID containerID = ContainerID.valueOf(UniqueId.next());
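+ // Spread new containers across the matching open pipelines by using
+ // the container ID as an index into the pipeline list.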
+ final Pipeline pipeline = pipelines.get(
+ (int) containerID.getId() % pipelines.size());
+
+ final ContainerInfoProto containerInfo = ContainerInfoProto.newBuilder()
+ .setState(LifeCycleState.OPEN)
+ .setPipelineID(pipeline.getId().getProtobuf())
+ .setUsedBytes(0)
+ .setNumberOfKeys(0)
+ .setStateEnterTime(Time.now())
+ .setOwner(owner)
+ .setContainerID(containerID.getId())
+ .setDeleteTransactionId(0)
+ .setReplicationFactor(pipeline.getFactor())
+ .setReplicationType(pipeline.getType())
+ .build();
+ containerStateManager.addContainer(containerInfo);
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("New container allocated: {}", containerInfo);
+ }
+ return containerStateManager.getContainer(containerID.getProtobuf());
+ } finally {
+ lock.writeLock().unlock();
+ }
+ }
+
+ @Override
+ public void updateContainerState(final ContainerID id,
+ final LifeCycleEvent event)
+ throws IOException, InvalidStateTransitionException {
+ final HddsProtos.ContainerID cid = id.getProtobuf();
+ lock.writeLock().lock();
+ try {
+ checkIfContainerExist(cid);
+ containerStateManager.updateContainerState(cid, event);
+ } finally {
+ lock.writeLock().unlock();
+ }
+ }
+
+ @Override
+ public Set<ContainerReplica> getContainerReplicas(final ContainerID id)
+ throws ContainerNotFoundException {
+ lock.readLock().lock();
+ try {
+ return Optional.ofNullable(containerStateManager
+ .getContainerReplicas(id.getProtobuf()))
+ .orElseThrow(() -> new ContainerNotFoundException("ID " + id));
+ } finally {
+ lock.readLock().unlock();
+ }
+ }
+
+ @Override
+ public void updateContainerReplica(final ContainerID id,
+ final ContainerReplica replica)
+ throws ContainerNotFoundException {
+ final HddsProtos.ContainerID cid = id.getProtobuf();
+ lock.writeLock().lock();
+ try {
+ checkIfContainerExist(cid);
+ containerStateManager.updateContainerReplica(cid, replica);
+ } finally {
+ lock.writeLock().unlock();
+ }
+ }
+
+ @Override
+ public void removeContainerReplica(final ContainerID id,
+ final ContainerReplica replica)
+ throws ContainerNotFoundException, ContainerReplicaNotFoundException {
+ final HddsProtos.ContainerID cid = id.getProtobuf();
+ lock.writeLock().lock();
+ try {
+ checkIfContainerExist(cid);
+ containerStateManager.removeContainerReplica(cid, replica);
+ } finally {
+ lock.writeLock().unlock();
+ }
+ }
+
+ @Override
+ public void updateDeleteTransactionId(
+ final Map<Long, Long> deleteTransactionMap) throws IOException {
+ throw new UnsupportedOperationException("Not yet implemented!");
+ }
+
+ @Override
+ public ContainerInfo getMatchingContainer(final long size, final String owner,
+ final Pipeline pipeline, final List<ContainerID> excludedContainerIDS) {
+ throw new UnsupportedOperationException("Not yet implemented!");
+ }
+
+ @Override
+ public void notifyContainerReportProcessing(final boolean isFullReport,
+ final boolean success) {
+ throw new UnsupportedOperationException("Not yet implemented!");
+ }
+
+ @Override
+ public void deleteContainer(final ContainerID id)
+ throws IOException {
+ final HddsProtos.ContainerID cid = id.getProtobuf();
+ lock.writeLock().lock();
+ try {
+ checkIfContainerExist(cid);
+ containerStateManager.removeContainer(cid);
+ } finally {
+ lock.writeLock().unlock();
+ }
+ }
+
+ private void checkIfContainerExist(final HddsProtos.ContainerID id)
+ throws ContainerNotFoundException {
+ if (!containerStateManager.contains(id)) {
+ throw new ContainerNotFoundException("Container with id #" +
+ id.getId() + " not found.");
+ }
+ }
+
+ @Override
+ public void close() throws Exception {
+ containerStateManager.close();
+ }
+
+}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerV2.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerV2.java
new file mode 100644
index 000000000000..dcedb6cedac3
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerV2.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.hadoop.hdds.scm.container;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
+import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
+import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
+
+/**
+ * TODO: Add extensive javadoc.
+ *
+ * ContainerManager class contains the mapping from a name to a pipeline
+ * mapping. This is used by SCM when allocating new locations and when
+ * looking up a key.
+ */
+public interface ContainerManagerV2 extends AutoCloseable {
+ // TODO: Rename this to ContainerManager
+
+
+ /**
+ * Returns the ContainerInfo from the container ID.
+ *
+ */
+ ContainerInfo getContainer(ContainerID containerID)
+ throws ContainerNotFoundException;
+
+ /**
+ * Returns containers under certain conditions.
+ * Searches container IDs starting from the given start ID (exclusive).
+ * The max size of the searching range cannot exceed the
+ * value of count.
+ *
+ * @param startID start containerID, >=0,
+ * start searching at the head if 0.
+ * @param count count must be >= 0
+ * Usually the count will be replaced with a very big
+ * value instead of being unlimited in case the db is very big.
+ *
+ * @return a list of containers.
+ */
+ List<ContainerInfo> listContainers(ContainerID startID, int count);
+
+
+ /**
+ * Returns all the containers which are in the specified state.
+ *
+ * @return List of ContainerInfo
+ */
+ List<ContainerInfo> listContainers(LifeCycleState state);
+
+ /**
+ * Allocates a new container with the given replication type and factor.
+ *
+ * @param type - replication type of the container.
+ * @param replicationFactor - replication factor of the container.
+ * @param owner - owner of the container.
+ * @return - ContainerInfo.
+ * @throws IOException
+ */
+ ContainerInfo allocateContainer(ReplicationType type,
+ ReplicationFactor replicationFactor,
+ String owner) throws IOException;
+
+ /**
+ * Update container state.
+ * @param containerID - Container ID
+ * @param event - container life cycle event
+ * @throws IOException
+ * @throws InvalidStateTransitionException
+ */
+ void updateContainerState(ContainerID containerID,
+ LifeCycleEvent event)
+ throws IOException, InvalidStateTransitionException;
+
+ /**
+ * Returns the latest list of replicas for given containerId.
+ *
+ * @param containerID Container ID
+ * @return Set of ContainerReplica
+ */
+ Set<ContainerReplica> getContainerReplicas(ContainerID containerID)
+ throws ContainerNotFoundException;
+
+ /**
+ * Adds a container Replica for the given Container.
+ *
+ * @param containerID Container ID
+ * @param replica ContainerReplica
+ */
+ void updateContainerReplica(ContainerID containerID, ContainerReplica replica)
+ throws ContainerNotFoundException;
+
+ /**
+ * Removes a container replica from the given container.
+ *
+ * @param containerID Container ID
+ * @param replica ContainerReplica
+ */
+ void removeContainerReplica(ContainerID containerID, ContainerReplica replica)
+ throws ContainerNotFoundException, ContainerReplicaNotFoundException;
+
+ /**
+ * Update deleteTransactionId according to deleteTransactionMap.
+ *
+ * @param deleteTransactionMap Maps the containerId to latest delete
+ * transaction id for the container.
+ * @throws IOException
+ */
+ void updateDeleteTransactionId(Map<Long, Long> deleteTransactionMap)
+ throws IOException;
+
+ /**
+ * Returns ContainerInfo which matches the requirements.
+ * @param size - the amount of space required in the container
+ * @param owner - the user which requires space in its owned container
+ * @param pipeline - pipeline to which the container should belong.
+ * @param excludedContainerIDS - containerIds to be excluded.
+ * @return ContainerInfo for the matching container.
+ */
+ ContainerInfo getMatchingContainer(long size, String owner,
+ Pipeline pipeline,
+ List<ContainerID> excludedContainerIDS);
+
+ /**
+ * Called once the report processor handler completes, so that the
+ * container manager can update its metrics.
+ * @param isFullReport
+ * @param success
+ */
+ // Is it possible to remove this from the Interface?
+ void notifyContainerReportProcessing(boolean isFullReport, boolean success);
+
+ /**
+ * Deletes a container from SCM.
+ *
+ * @param containerID - Container ID
+ * @throws IOException
+ */
+ void deleteContainer(ContainerID containerID)
+ throws IOException;
+}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReportHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReportHandler.java
index 7bca64f635b5..18dffe72895d 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReportHandler.java
@@ -120,7 +120,7 @@ public void onMessage(final ContainerReportFromDatanode reportFromDatanode,
final Set containersInDn = replicas.parallelStream()
.map(ContainerReplicaProto::getContainerID)
- .map(ContainerID::valueof).collect(Collectors.toSet());
+ .map(ContainerID::valueOf).collect(Collectors.toSet());
final Set missingReplicas = new HashSet<>(containersInSCM);
missingReplicas.removeAll(containersInDn);
@@ -167,7 +167,7 @@ private void processContainerReplicas(final DatanodeDetails datanodeDetails,
} else if (unknownContainerHandleAction.equals(
UNKNOWN_CONTAINER_ACTION_DELETE)) {
final ContainerID containerId = ContainerID
- .valueof(replicaProto.getContainerID());
+ .valueOf(replicaProto.getContainerID());
deleteReplica(containerId, datanodeDetails, publisher, "unknown");
}
} catch (IOException e) {
@@ -221,7 +221,7 @@ private void updateDeleteTransaction(final DatanodeDetails datanodeDetails,
for (ContainerReplicaProto replica : replicas) {
try {
final ContainerInfo containerInfo = containerManager.getContainer(
- ContainerID.valueof(replica.getContainerID()));
+ ContainerID.valueOf(replica.getContainerID()));
if (containerInfo.getDeleteTransactionId() >
replica.getDeleteTransactionId()) {
pendingDeleteStatusList.addPendingDeleteStatus(
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManager.java
index e575c60566b1..0c3772f44825 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManager.java
@@ -329,7 +329,7 @@ public void addContainerInfo(long containerID,
// In Recon, while adding a 'new' CLOSED container, pipeline will be a
// random ID, and hence be passed down as null.
pipelineManager.addContainerToPipeline(pipeline.getId(),
- ContainerID.valueof(containerID));
+ ContainerID.valueOf(containerID));
}
containerStateCount.incrementAndGet(containerInfo.getState());
}
@@ -371,12 +371,8 @@ void updateContainerState(final ContainerID containerID,
void updateDeleteTransactionId(
final Map deleteTransactionMap) {
deleteTransactionMap.forEach((k, v) -> {
- try {
- containers.getContainerInfo(ContainerID.valueof(k))
- .updateDeleteTransactionId(v);
- } catch (ContainerNotFoundException e) {
- LOG.warn("Exception while updating delete transaction id.", e);
- }
+ containers.getContainerInfo(ContainerID.valueOf(k))
+ .updateDeleteTransactionId(v);
});
}
@@ -432,18 +428,13 @@ ContainerInfo getMatchingContainer(final long size, String owner,
private ContainerInfo findContainerWithSpace(final long size,
final NavigableSet searchSet, final String owner,
final PipelineID pipelineID) {
- try {
- // Get the container with space to meet our request.
- for (ContainerID id : searchSet) {
- final ContainerInfo containerInfo = containers.getContainerInfo(id);
- if (containerInfo.getUsedBytes() + size <= this.containerSize) {
- containerInfo.updateLastUsedTime();
- return containerInfo;
- }
+ // Get the container with space to meet our request.
+ for (ContainerID id : searchSet) {
+ final ContainerInfo containerInfo = containers.getContainerInfo(id);
+ if (containerInfo.getUsedBytes() + size <= this.containerSize) {
+ containerInfo.updateLastUsedTime();
+ return containerInfo;
}
- } catch (ContainerNotFoundException e) {
- // This should not happen!
- LOG.warn("Exception while finding container with space", e);
}
return null;
}
@@ -496,7 +487,11 @@ NavigableSet getMatchingContainerIDs(final String owner,
*/
ContainerInfo getContainer(final ContainerID containerID)
throws ContainerNotFoundException {
- return containers.getContainerInfo(containerID);
+ final ContainerInfo container = containers.getContainerInfo(containerID);
+ if (container != null) {
+ return container;
+ }
+ throw new ContainerNotFoundException(containerID.toString());
}
void close() throws IOException {
@@ -540,6 +535,9 @@ void removeContainerReplica(final ContainerID containerID,
void removeContainer(final ContainerID containerID)
throws ContainerNotFoundException {
+ if (containers.getContainerInfo(containerID) == null) {
+ throw new ContainerNotFoundException(containerID.toString());
+ }
containers.removeContainer(containerID);
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerImpl.java
new file mode 100644
index 000000000000..7f42a971cac4
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerImpl.java
@@ -0,0 +1,399 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.hdds.scm.container;
+
+import java.io.IOException;
+import java.lang.reflect.Proxy;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.NavigableSet;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+import com.google.common.base.Preconditions;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.StorageUnit;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ContainerInfoProto;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
+import org.apache.hadoop.hdds.protocol.proto.SCMRatisProtocol.RequestType;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
+import org.apache.hadoop.hdds.scm.container.states.ContainerState;
+import org.apache.hadoop.hdds.scm.container.states.ContainerStateMap;
+import org.apache.hadoop.hdds.scm.ha.SCMHAInvocationHandler;
+import org.apache.hadoop.hdds.scm.ha.SCMRatisServer;
+import org.apache.hadoop.hdds.scm.pipeline.PipelineID;
+import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
+import org.apache.hadoop.hdds.scm.pipeline.PipelineNotFoundException;
+import org.apache.hadoop.hdds.utils.db.Table;
+import org.apache.hadoop.hdds.utils.db.Table.KeyValue;
+import org.apache.hadoop.hdds.utils.db.TableIterator;
+import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
+import org.apache.hadoop.ozone.common.statemachine.StateMachine;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent.FINALIZE;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent.QUASI_CLOSE;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent.CLOSE;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent.FORCE_CLOSE;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent.DELETE;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent.CLEANUP;
+
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.OPEN;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.CLOSING;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.QUASI_CLOSED;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.CLOSED;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.DELETING;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.DELETED;
+
+/**
+ * Default implementation of ContainerStateManager. This implementation
+ * holds the Container States in memory, backed by a persistent store.
+ * The persistent store is always kept in sync with the in-memory state changes.
+ *
+ * This class is NOT thread safe. All the calls are idempotent.
+ */
+public final class ContainerStateManagerImpl
+ implements ContainerStateManagerV2 {
+
+ /**
+ * Logger instance of ContainerStateManagerImpl.
+ */
+ private static final Logger LOG = LoggerFactory.getLogger(
+ ContainerStateManagerImpl.class);
+
+ /**
+ * Configured container size.
+ */
+ private final long containerSize;
+
+ /**
+ * In-memory representation of Container States.
+ */
+ private final ContainerStateMap containers;
+
+ /**
+ * Persistent store for Container States.
+ */
+ private Table<ContainerID, ContainerInfo> containerStore;
+
+ /**
+ * PipelineManager instance.
+ */
+ private final PipelineManager pipelineManager;
+
+ /**
+ * Container lifecycle state machine.
+ */
+ private final StateMachine<LifeCycleState, LifeCycleEvent> stateMachine;
+
+ /**
+ * We use the containers in round-robin fashion for operations like block
+ * allocation. This map is used for remembering the last used container.
+ */
+ private final ConcurrentHashMap<ContainerState, ContainerID> lastUsedMap;
+
+ /**
+ * Constructs ContainerStateManagerImpl instance and loads the containers
+ * from the persistent storage.
+ *
+ * @param conf the Configuration
+ * @param pipelineManager the {@link PipelineManager} instance
+ * @param containerStore the persistent storage
+ * @throws IOException in case of error while loading the containers
+ */
+ private ContainerStateManagerImpl(final Configuration conf,
+ final PipelineManager pipelineManager,
+ final Table<ContainerID, ContainerInfo> containerStore)
+ throws IOException {
+ this.pipelineManager = pipelineManager;
+ this.containerStore = containerStore;
+ this.stateMachine = newStateMachine();
+ this.containerSize = getConfiguredContainerSize(conf);
+ this.containers = new ContainerStateMap();
+ this.lastUsedMap = new ConcurrentHashMap<>();
+
+ initialize();
+ }
+
+ /**
+ * Creates and initializes a new Container Lifecycle StateMachine.
+ *
+ * @return the Container Lifecycle StateMachine
+ */
+ private StateMachine<LifeCycleState, LifeCycleEvent> newStateMachine() {
+
+ final Set<LifeCycleState> finalStates = new HashSet<>();
+
+ // These are the steady states of a container.
+ finalStates.add(CLOSED);
+ finalStates.add(DELETED);
+
+ final StateMachine<LifeCycleState, LifeCycleEvent> containerLifecycleSM =
+ new StateMachine<>(OPEN, finalStates);
+
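+ // Forward transitions of the container lifecycle; see the state
+ // diagram documented in ContainerStateManagerV2.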
+ containerLifecycleSM.addTransition(OPEN, CLOSING, FINALIZE);
+ containerLifecycleSM.addTransition(CLOSING, QUASI_CLOSED, QUASI_CLOSE);
+ containerLifecycleSM.addTransition(CLOSING, CLOSED, CLOSE);
+ containerLifecycleSM.addTransition(QUASI_CLOSED, CLOSED, FORCE_CLOSE);
+ containerLifecycleSM.addTransition(CLOSED, DELETING, DELETE);
+ containerLifecycleSM.addTransition(DELETING, DELETED, CLEANUP);
+
+ /* The following set of transitions are to make state machine
+ * transition idempotent.
+ */
+ makeStateTransitionIdempotent(containerLifecycleSM, FINALIZE,
+ CLOSING, QUASI_CLOSED, CLOSED, DELETING, DELETED);
+ makeStateTransitionIdempotent(containerLifecycleSM, QUASI_CLOSE,
+ QUASI_CLOSED, CLOSED, DELETING, DELETED);
+ makeStateTransitionIdempotent(containerLifecycleSM, CLOSE,
+ CLOSED, DELETING, DELETED);
+ makeStateTransitionIdempotent(containerLifecycleSM, FORCE_CLOSE,
+ CLOSED, DELETING, DELETED);
+ makeStateTransitionIdempotent(containerLifecycleSM, DELETE,
+ DELETING, DELETED);
+ makeStateTransitionIdempotent(containerLifecycleSM, CLEANUP, DELETED);
+
+ return containerLifecycleSM;
+ }
+
+ private void makeStateTransitionIdempotent(
+ final StateMachine<LifeCycleState, LifeCycleEvent> sm,
+ final LifeCycleEvent event, final LifeCycleState... states) {
+ for (LifeCycleState state : states) {
+ sm.addTransition(state, state, event);
+ }
+ }
+
+ /**
+ * Returns the configured container size.
+ *
+ * @return the max size of container
+ */
+ private long getConfiguredContainerSize(final Configuration conf) {
+ return (long) conf.getStorageSize(
+ ScmConfigKeys.OZONE_SCM_CONTAINER_SIZE,
+ ScmConfigKeys.OZONE_SCM_CONTAINER_SIZE_DEFAULT,
+ StorageUnit.BYTES);
+ }
+
+ /**
+ * Loads the containers from container store into memory.
+ *
+ * @throws IOException in case of error while loading the containers
+ */
+ private void initialize() throws IOException {
+ TableIterator<ContainerID, ? extends KeyValue<ContainerID, ContainerInfo>>
+ iterator = containerStore.iterator();
+
+ while (iterator.hasNext()) {
+ final ContainerInfo container = iterator.next().getValue();
+ Preconditions.checkNotNull(container);
+ containers.addContainer(container);
+ if (container.getState() == LifeCycleState.OPEN) {
+ try {
+ pipelineManager.addContainerToPipeline(container.getPipelineID(),
+ container.containerID());
+ } catch (PipelineNotFoundException ex) {
+ LOG.warn("Found container {} which is in OPEN state with " +
+ "pipeline {} that does not exist. Marking container for " +
+ "closing.", container, container.getPipelineID());
+ try {
+ updateContainerState(container.containerID().getProtobuf(),
+ LifeCycleEvent.FINALIZE);
+ } catch (InvalidStateTransitionException e) {
+ // This cannot happen.
+ LOG.warn("Unable to finalize Container {}.", container);
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ public Set<ContainerID> getContainerIDs() {
+ return containers.getAllContainerIDs();
+ }
+
+ @Override
+ public Set<ContainerID> getContainerIDs(final LifeCycleState state) {
+ return containers.getContainerIDsByState(state);
+ }
+
+ @Override
+ public ContainerInfo getContainer(final HddsProtos.ContainerID id) {
+ return containers.getContainerInfo(
+ ContainerID.getFromProtobuf(id));
+ }
+
+ @Override
+ public void addContainer(final ContainerInfoProto containerInfo)
+ throws IOException {
+
+ // Change the exception thrown to PipelineNotFound and
+ // ClosedPipelineException once ClosedPipelineException is introduced
+ // in PipelineManager.
+
+ Preconditions.checkNotNull(containerInfo);
+ final ContainerInfo container = ContainerInfo.fromProtobuf(containerInfo);
+ final ContainerID containerID = container.containerID();
+ final PipelineID pipelineID = container.getPipelineID();
+
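+ // Persist the container first and then update the in-memory state;
+ // if the in-memory update fails, both changes are rolled back so the
+ // store and the state map stay consistent.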
+ if (!containers.contains(containerID)) {
+ containerStore.put(containerID, container);
+ try {
+ containers.addContainer(container);
+ pipelineManager.addContainerToPipeline(pipelineID, containerID);
+ } catch (Exception ex) {
+ containers.removeContainer(containerID);
+ containerStore.delete(containerID);
+ throw ex;
+ }
+ }
+ }
+
+ @Override
+ public boolean contains(final HddsProtos.ContainerID id) {
+ // TODO: Remove the protobuf conversion after fixing ContainerStateMap.
+ return containers.contains(ContainerID.getFromProtobuf(id));
+ }
+
+ public void updateContainerState(final HddsProtos.ContainerID containerID,
+ final LifeCycleEvent event)
+ throws IOException, InvalidStateTransitionException {
+ // TODO: Remove the protobuf conversion after fixing ContainerStateMap.
+ final ContainerID id = ContainerID.getFromProtobuf(containerID);
+ if (containers.contains(id)) {
+ final ContainerInfo info = containers.getContainerInfo(id);
+ final LifeCycleState oldState = info.getState();
+ final LifeCycleState newState = stateMachine.getNextState(
+ info.getState(), event);
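+ // Apply the event only if it moves the container forward in the
+ // lifecycle; replaying an already applied event becomes a no-op,
+ // which keeps this call idempotent.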
+ if (newState.getNumber() > oldState.getNumber()) {
+ containers.updateState(id, info.getState(), newState);
+ }
+ }
+ }
+
+
+ @Override
+ public Set<ContainerReplica> getContainerReplicas(
+ final HddsProtos.ContainerID id) {
+ return containers.getContainerReplicas(
+ ContainerID.getFromProtobuf(id));
+ }
+
+ @Override
+ public void updateContainerReplica(final HddsProtos.ContainerID id,
+ final ContainerReplica replica) {
+ containers.updateContainerReplica(ContainerID.getFromProtobuf(id),
+ replica);
+ }
+
+ @Override
+ public void removeContainerReplica(final HddsProtos.ContainerID id,
+ final ContainerReplica replica) {
+ containers.removeContainerReplica(ContainerID.getFromProtobuf(id),
+ replica);
+
+ }
+
+ void updateDeleteTransactionId(
+ final Map<Long, Long> deleteTransactionMap) {
+ throw new UnsupportedOperationException("Not yet implemented!");
+ }
+
+ ContainerInfo getMatchingContainer(final long size, String owner,
+ PipelineID pipelineID, NavigableSet<ContainerID> containerIDs) {
+ throw new UnsupportedOperationException("Not yet implemented!");
+ }
+
+ NavigableSet<ContainerID> getMatchingContainerIDs(final String owner,
+ final ReplicationType type, final ReplicationFactor factor,
+ final LifeCycleState state) {
+ throw new UnsupportedOperationException("Not yet implemented!");
+ }
+
+ public void removeContainer(final HddsProtos.ContainerID id) {
+ containers.removeContainer(ContainerID.getFromProtobuf(id));
+ }
+
+ @Override
+ public void close() throws Exception {
+ containerStore.close();
+ }
+
+ public static Builder newBuilder() {
+ return new Builder();
+ }
+
+ /**
+ * Builder for ContainerStateManager.
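+ *
+ * <p>Typical usage (a sketch; the arguments come from the caller):
+ * <pre>{@code
+ *   ContainerStateManagerV2 csm = ContainerStateManagerImpl.newBuilder()
+ *       .setConfiguration(conf)
+ *       .setPipelineManager(pipelineManager)
+ *       .setRatisServer(scmHaManager.getRatisServer())
+ *       .setContainerStore(containerStore)
+ *       .build();
+ * }</pre>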
+ */
+ public static class Builder {
+ private Configuration conf;
+ private PipelineManager pipelineMgr;
+ private SCMRatisServer scmRatisServer;
+ private Table<ContainerID, ContainerInfo> table;
+
+ public Builder setConfiguration(final Configuration config) {
+ conf = config;
+ return this;
+ }
+
+ public Builder setPipelineManager(final PipelineManager pipelineManager) {
+ pipelineMgr = pipelineManager;
+ return this;
+ }
+
+ public Builder setRatisServer(final SCMRatisServer ratisServer) {
+ scmRatisServer = ratisServer;
+ return this;
+ }
+
+ public Builder setContainerStore(
+ final Table<ContainerID, ContainerInfo> containerStore) {
+ table = containerStore;
+ return this;
+ }
+
+ public ContainerStateManagerV2 build() throws IOException {
+ Preconditions.checkNotNull(conf);
+ Preconditions.checkNotNull(pipelineMgr);
+ Preconditions.checkNotNull(scmRatisServer);
+ Preconditions.checkNotNull(table);
+
+ final ContainerStateManagerV2 csm = new ContainerStateManagerImpl(
+ conf, pipelineMgr, table);
+
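+ // Wrap the state manager in a dynamic proxy; the invocation handler
+ // routes the mutating calls (marked with @Replicate) through the SCM
+ // Ratis server so the change is replicated when SCM HA is enabled.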
+ final SCMHAInvocationHandler invocationHandler =
+ new SCMHAInvocationHandler(RequestType.CONTAINER, csm,
+ scmRatisServer);
+
+ return (ContainerStateManagerV2) Proxy.newProxyInstance(
+ SCMHAInvocationHandler.class.getClassLoader(),
+ new Class<?>[]{ContainerStateManagerV2.class}, invocationHandler);
+ }
+
+ }
+}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerV2.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerV2.java
new file mode 100644
index 000000000000..3a0cf2111f01
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerV2.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.hdds.scm.container;
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ContainerInfoProto;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
+import org.apache.hadoop.hdds.scm.metadata.Replicate;
+import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
+
+/**
+ * A ContainerStateManager is responsible for keeping track of all the
+ * containers and their states inside SCM; it also exposes methods to read
+ * and modify the containers and their states.
+ *
+ * All the mutation operations are marked with {@link Replicate} annotation so
+ * that when SCM-HA is enabled, the mutations are replicated from leader SCM
+ * to the followers.
+ *
+ * When a method is marked with {@link Replicate} annotation it should follow
+ * the below rules.
+ *
+ * 1. The method call should be Idempotent
+ * 2. Arguments should be protobuf objects
+ * 3. The return type should be a protobuf object
+ * 4. The declaration should throw RaftException
+ *
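+ * For example, {@link #updateContainerState} below follows these rules:
+ * the call is idempotent and both of its arguments are protobuf objects.
+ *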
+ */
+public interface ContainerStateManagerV2 {
+
+ //TODO: Rename this to ContainerStateManager
+
+ /* **********************************************************************
+ * Container Life Cycle *
+ * *
+ * Event and State Transition Mapping: *
+ * *
+ * State: OPEN ----------------> CLOSING *
+ * Event: FINALIZE *
+ * *
+ * State: CLOSING ----------------> QUASI_CLOSED *
+ * Event: QUASI_CLOSE *
+ * *
+ * State: CLOSING ----------------> CLOSED *
+ * Event: CLOSE *
+ * *
+ * State: QUASI_CLOSED ----------------> CLOSED *
+ * Event: FORCE_CLOSE *
+ * *
+ * State: CLOSED ----------------> DELETING *
+ * Event: DELETE *
+ * *
+ * State: DELETING ----------------> DELETED *
+ * Event: CLEANUP *
+ * *
+ * *
+ * Container State Flow: *
+ * *
+ * [OPEN]--------------->[CLOSING]--------------->[QUASI_CLOSED] *
+ * (FINALIZE) | (QUASI_CLOSE) | *
+ * | | *
+ * | | *
+ * (CLOSE) | (FORCE_CLOSE) | *
+ * | | *
+ * | | *
+ * +--------->[CLOSED]<--------+ *
+ * | *
+ * (DELETE)| *
+ * | *
+ * | *
+ * [DELETING] *
+ * | *
+ * (CLEANUP) | *
+ * | *
+ * V *
+ * [DELETED] *
+ * *
+ ************************************************************************/
+
+ /**
+ * Returns true if a container with the given ID exists.
+ */
+ boolean contains(HddsProtos.ContainerID containerID);
+
+ /**
+ * Returns the IDs of all the managed containers.
+ *
+ * @return Set of {@link ContainerID}
+ */
+ Set<ContainerID> getContainerIDs();
+
+ /**
+ * Returns the IDs of all the containers that are in the given state.
+ */
+ Set<ContainerID> getContainerIDs(LifeCycleState state);
+
+ /**
+ * Returns the ContainerInfo of the container with the given ID.
+ */
+ ContainerInfo getContainer(HddsProtos.ContainerID id);
+
+ /**
+ * Returns the replicas of the given container.
+ */
+ Set<ContainerReplica> getContainerReplicas(HddsProtos.ContainerID id);
+
+ /**
+ * Adds or updates a replica of the given container.
+ */
+ void updateContainerReplica(HddsProtos.ContainerID id,
+ ContainerReplica replica);
+
+ /**
+ * Removes the given replica of a container.
+ */
+ void removeContainerReplica(HddsProtos.ContainerID id,
+ ContainerReplica replica);
+
+ /**
+ * Adds a new container to SCM.
+ */
+ @Replicate
+ void addContainer(ContainerInfoProto containerInfo)
+ throws IOException;
+
+ /**
+ * Updates the state of a container based on the given lifecycle event.
+ */
+ @Replicate
+ void updateContainerState(HddsProtos.ContainerID id,
+ HddsProtos.LifeCycleEvent event)
+ throws IOException, InvalidStateTransitionException;
+
+ /**
+ * Removes the given container from SCM.
+ */
+ @Replicate
+ void removeContainer(HddsProtos.ContainerID containerID)
+ throws IOException;
+
+ /**
+ * Releases the resources held by this ContainerStateManager.
+ */
+ void close() throws Exception;
+}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/IncrementalContainerReportHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/IncrementalContainerReportHandler.java
index 5ecbed3fe7d9..1c8ff60d0f58 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/IncrementalContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/IncrementalContainerReportHandler.java
@@ -71,7 +71,7 @@ public void onMessage(final IncrementalContainerReportFromDatanode report,
for (ContainerReplicaProto replicaProto :
report.getReport().getReportList()) {
try {
- final ContainerID id = ContainerID.valueof(
+ final ContainerID id = ContainerID.valueOf(
replicaProto.getContainerID());
if (!replicaProto.getState().equals(
ContainerReplicaProto.State.DELETED)) {
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
index bab885173627..ed6924ca8b03 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
@@ -441,8 +441,8 @@ private boolean isContainerEmpty(final ContainerInfo container,
*/
private boolean isContainerUnderReplicated(final ContainerInfo container,
final Set replicas) {
- if (container.getState() != LifeCycleState.CLOSED &&
- container.getState() != LifeCycleState.QUASI_CLOSED) {
+ if (container.getState() == LifeCycleState.DELETING ||
+ container.getState() == LifeCycleState.DELETED) {
return false;
}
boolean misReplicated = !getPlacementStatus(
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/SCMContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/SCMContainerManager.java
index 19a5ab20dc46..f59e4014d9c3 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/SCMContainerManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/SCMContainerManager.java
@@ -130,7 +130,7 @@ private void loadExistingContainers() throws IOException {
try {
if (container.getState() == LifeCycleState.OPEN) {
pipelineManager.addContainerToPipeline(container.getPipelineID(),
- ContainerID.valueof(container.getContainerID()));
+ ContainerID.valueOf(container.getContainerID()));
}
} catch (PipelineNotFoundException ex) {
LOG.warn("Found a Container {} which is in {} state with pipeline {} " +
@@ -216,7 +216,9 @@ public ContainerInfo getContainer(final ContainerID containerID)
public boolean exists(ContainerID containerID) {
lock.lock();
try {
- return (containerStateManager.getContainer(containerID) != null);
+ Preconditions.checkNotNull(
+ containerStateManager.getContainer(containerID));
+ return true;
} catch (ContainerNotFoundException e) {
return false;
} finally {
@@ -290,7 +292,7 @@ public ContainerInfo allocateContainer(final ReplicationType type,
// PipelineStateManager.
pipelineManager.removeContainerFromPipeline(
containerInfo.getPipelineID(),
- new ContainerID(containerInfo.getContainerID()));
+ containerInfo.containerID());
throw ex;
}
return containerInfo;
@@ -387,13 +389,13 @@ private HddsProtos.LifeCycleState updateContainerState(
}
}
- /**
- * Update deleteTransactionId according to deleteTransactionMap.
- *
- * @param deleteTransactionMap Maps the containerId to latest delete
- * transaction id for the container.
- * @throws IOException
- */
+ /**
+ * Update deleteTransactionId according to deleteTransactionMap.
+ *
+ * @param deleteTransactionMap Maps the containerId to latest delete
+ * transaction id for the container.
+ * @throws IOException
+ */
public void updateDeleteTransactionId(Map deleteTransactionMap)
throws IOException {
@@ -404,7 +406,8 @@ public void updateDeleteTransactionId(Map deleteTransactionMap)
try(BatchOperation batchOperation = batchHandler.initBatchOperation()) {
for (Map.Entry< Long, Long > entry : deleteTransactionMap.entrySet()) {
long containerID = entry.getKey();
- ContainerID containerIdObject = new ContainerID(containerID);
+
+ ContainerID containerIdObject = ContainerID.valueOf(containerID);
ContainerInfo containerInfo =
containerStore.get(containerIdObject);
ContainerInfo containerInfoInMem = containerStateManager
@@ -493,7 +496,7 @@ protected void addContainerToDB(ContainerInfo containerInfo)
throws IOException {
try {
containerStore
- .put(new ContainerID(containerInfo.getContainerID()), containerInfo);
+ .put(containerInfo.containerID(), containerInfo);
// Incrementing here, as allocateBlock to create a container calls
// getMatchingContainer() and finally calls this API to add newly
// created container to DB.
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/states/ContainerAttribute.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/states/ContainerAttribute.java
index af44a8a043e5..61cff09daa7e 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/states/ContainerAttribute.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/states/ContainerAttribute.java
@@ -153,7 +153,7 @@ public boolean hasContainerID(T key, ContainerID id) {
* @return true or false
*/
public boolean hasContainerID(T key, int id) {
- return hasContainerID(key, ContainerID.valueof(id));
+ return hasContainerID(key, ContainerID.valueOf(id));
}
/**
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/states/ContainerStateMap.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/states/ContainerStateMap.java
index 8cef966995eb..4d143e0db2f7 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/states/ContainerStateMap.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/states/ContainerStateMap.java
@@ -18,31 +18,29 @@
package org.apache.hadoop.hdds.scm.container.states;
+import java.util.Set;
+import java.util.Collections;
+import java.util.Map;
+import java.util.NavigableSet;
+import java.util.TreeSet;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.concurrent.ConcurrentHashMap;
+
import com.google.common.base.Preconditions;
import org.apache.hadoop.hdds.scm.container.ContainerID;
-import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
-import org.apache.hadoop.hdds.scm.container.ContainerReplicaNotFoundException;
import org.apache.hadoop.hdds.scm.exceptions.SCMException;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.util.Set;
-import java.util.Collections;
-import java.util.Map;
-import java.util.NavigableSet;
-import java.util.TreeSet;
-import java.util.concurrent.locks.ReadWriteLock;
-import java.util.concurrent.locks.ReentrantReadWriteLock;
-import java.util.concurrent.ConcurrentHashMap;
-import static org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes
- .CONTAINER_EXISTS;
import static org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes
.FAILED_TO_CHANGE_CONTAINER_STATE;
@@ -76,6 +74,8 @@
* select a container that belongs to user1, with Ratis replication which can
* make 3 copies of data. The fact that we will look for open containers by
* default and if we cannot find them we will add new containers.
+ *
+ * All the calls are idempotent.
*/
public class ContainerStateMap {
private static final Logger LOG =
@@ -95,6 +95,7 @@ public class ContainerStateMap {
// Container State Map lock should be held before calling into
// Update ContainerAttributes. The consistency of ContainerAttributes is
// protected by this lock.
+ // Can we remove this lock?
private final ReadWriteLock lock;
/**
@@ -120,56 +121,57 @@ public ContainerStateMap() {
public void addContainer(final ContainerInfo info)
throws SCMException {
Preconditions.checkNotNull(info, "Container Info cannot be null");
- Preconditions.checkArgument(info.getReplicationFactor().getNumber() > 0,
- "ExpectedReplicaCount should be greater than 0");
-
lock.writeLock().lock();
try {
final ContainerID id = info.containerID();
- if (containerMap.putIfAbsent(id, info) != null) {
- LOG.debug("Duplicate container ID detected. {}", id);
- throw new
- SCMException("Duplicate container ID detected.",
- CONTAINER_EXISTS);
+ if (!contains(id)) {
+ containerMap.put(id, info);
+ lifeCycleStateMap.insert(info.getState(), id);
+ ownerMap.insert(info.getOwner(), id);
+ factorMap.insert(info.getReplicationFactor(), id);
+ typeMap.insert(info.getReplicationType(), id);
+ replicaMap.put(id, ConcurrentHashMap.newKeySet());
+
+ // Flush the cache of this container type, will be added later when
+ // get container queries are executed.
+ flushCache(info);
+ LOG.trace("Container {} added to ContainerStateMap.", id);
}
-
- lifeCycleStateMap.insert(info.getState(), id);
- ownerMap.insert(info.getOwner(), id);
- factorMap.insert(info.getReplicationFactor(), id);
- typeMap.insert(info.getReplicationType(), id);
- replicaMap.put(id, ConcurrentHashMap.newKeySet());
-
- // Flush the cache of this container type, will be added later when
- // get container queries are executed.
- flushCache(info);
- LOG.trace("Created container with {} successfully.", id);
} finally {
lock.writeLock().unlock();
}
}
+ public boolean contains(final ContainerID id) {
+ lock.readLock().lock();
+ try {
+ return containerMap.containsKey(id);
+ } finally {
+ lock.readLock().unlock();
+ }
+ }
+
/**
* Removes a Container Entry from ContainerStateMap.
*
- * @param containerID - ContainerID
- * @throws SCMException - throws if create failed.
+ * @param id - ContainerID
*/
- public void removeContainer(final ContainerID containerID)
- throws ContainerNotFoundException {
- Preconditions.checkNotNull(containerID, "ContainerID cannot be null");
+ public void removeContainer(final ContainerID id) {
+ Preconditions.checkNotNull(id, "ContainerID cannot be null");
lock.writeLock().lock();
try {
- checkIfContainerExist(containerID);
- // Should we revert back to the original state if any of the below
- // remove operation fails?
- final ContainerInfo info = containerMap.remove(containerID);
- lifeCycleStateMap.remove(info.getState(), containerID);
- ownerMap.remove(info.getOwner(), containerID);
- factorMap.remove(info.getReplicationFactor(), containerID);
- typeMap.remove(info.getReplicationType(), containerID);
- // Flush the cache of this container type.
- flushCache(info);
- LOG.trace("Removed container with {} successfully.", containerID);
+ if (contains(id)) {
+ // Should we revert back to the original state if any of the below
+ // remove operation fails?
+ final ContainerInfo info = containerMap.remove(id);
+ lifeCycleStateMap.remove(info.getState(), id);
+ ownerMap.remove(info.getOwner(), id);
+ factorMap.remove(info.getReplicationFactor(), id);
+ typeMap.remove(info.getReplicationType(), id);
+ // Flush the cache of this container type.
+ flushCache(info);
+ LOG.trace("Container {} removed from ContainerStateMap.", id);
+ }
} finally {
lock.writeLock().unlock();
}
@@ -179,13 +181,11 @@ public void removeContainer(final ContainerID containerID)
* Returns the latest state of Container from SCM's Container State Map.
*
* @param containerID - ContainerID
- * @return container info, if found.
+ * @return container info if found, else null.
*/
- public ContainerInfo getContainerInfo(final ContainerID containerID)
- throws ContainerNotFoundException {
+ public ContainerInfo getContainerInfo(final ContainerID containerID) {
lock.readLock().lock();
try {
- checkIfContainerExist(containerID);
return containerMap.get(containerID);
} finally {
lock.readLock().unlock();
@@ -194,19 +194,18 @@ public ContainerInfo getContainerInfo(final ContainerID containerID)
/**
* Returns the latest list of DataNodes where replica for given containerId
- * exist. Throws an SCMException if no entry is found for given containerId.
+ * exist.
*
* @param containerID
* @return Set
*/
public Set getContainerReplicas(
- final ContainerID containerID) throws ContainerNotFoundException {
+ final ContainerID containerID) {
Preconditions.checkNotNull(containerID);
lock.readLock().lock();
try {
- checkIfContainerExist(containerID);
- return Collections
- .unmodifiableSet(replicaMap.get(containerID));
+ final Set<ContainerReplica> replicas = replicaMap.get(containerID);
+ return replicas == null ? null : Collections.unmodifiableSet(replicas);
} finally {
lock.readLock().unlock();
}
@@ -221,14 +220,15 @@ public Set getContainerReplicas(
* @param replica
*/
public void updateContainerReplica(final ContainerID containerID,
- final ContainerReplica replica) throws ContainerNotFoundException {
+ final ContainerReplica replica) {
Preconditions.checkNotNull(containerID);
lock.writeLock().lock();
try {
- checkIfContainerExist(containerID);
- Set replicas = replicaMap.get(containerID);
- replicas.remove(replica);
- replicas.add(replica);
+ if (contains(containerID)) {
+ final Set<ContainerReplica> replicas = replicaMap.get(containerID);
+ replicas.remove(replica);
+ replicas.add(replica);
+ }
} finally {
lock.writeLock().unlock();
}
@@ -242,18 +242,13 @@ public void updateContainerReplica(final ContainerID containerID,
* @return True of dataNode is removed successfully else false.
*/
public void removeContainerReplica(final ContainerID containerID,
- final ContainerReplica replica)
- throws ContainerNotFoundException, ContainerReplicaNotFoundException {
+ final ContainerReplica replica) {
Preconditions.checkNotNull(containerID);
Preconditions.checkNotNull(replica);
-
lock.writeLock().lock();
try {
- checkIfContainerExist(containerID);
- if(!replicaMap.get(containerID).remove(replica)) {
- throw new ContainerReplicaNotFoundException(
- "Container #"
- + containerID.getId() + ", replica: " + replica);
+ if (contains(containerID)) {
+ replicaMap.get(containerID).remove(replica);
}
} finally {
lock.writeLock().unlock();
@@ -264,15 +259,16 @@ public void removeContainerReplica(final ContainerID containerID,
* Just update the container State.
* @param info ContainerInfo.
*/
- public void updateContainerInfo(final ContainerInfo info)
- throws ContainerNotFoundException {
+ public void updateContainerInfo(final ContainerInfo info) {
+ Preconditions.checkNotNull(info);
+ final ContainerID id = info.containerID();
lock.writeLock().lock();
try {
- Preconditions.checkNotNull(info);
- checkIfContainerExist(info.containerID());
- final ContainerInfo currentInfo = containerMap.get(info.containerID());
- flushCache(info, currentInfo);
- containerMap.put(info.containerID(), info);
+ if (contains(id)) {
+ final ContainerInfo currentInfo = containerMap.get(id);
+ flushCache(info, currentInfo);
+ containerMap.put(id, info);
+ }
} finally {
lock.writeLock().unlock();
}
@@ -287,12 +283,16 @@ public void updateContainerInfo(final ContainerInfo info)
* @throws SCMException - in case of failure.
*/
public void updateState(ContainerID containerID, LifeCycleState currentState,
- LifeCycleState newState) throws SCMException, ContainerNotFoundException {
+ LifeCycleState newState) throws SCMException {
Preconditions.checkNotNull(currentState);
Preconditions.checkNotNull(newState);
lock.writeLock().lock();
try {
- checkIfContainerExist(containerID);
+ if (!contains(containerID)) {
+ return;
+ }
+
+ // TODO: Simplify this logic.
final ContainerInfo currentInfo = containerMap.get(containerID);
try {
currentInfo.setState(newState);
@@ -340,7 +340,12 @@ public void updateState(ContainerID containerID, LifeCycleState currentState,
}
public Set getAllContainerIDs() {
- return Collections.unmodifiableSet(containerMap.keySet());
+ lock.readLock().lock();
+ try {
+ return Collections.unmodifiableSet(containerMap.keySet());
+ } finally {
+ lock.readLock().unlock();
+ }
}
/**
@@ -535,12 +540,4 @@ private void flushCache(final ContainerInfo... containerInfos) {
}
}
- private void checkIfContainerExist(ContainerID containerID)
- throws ContainerNotFoundException {
- if (!containerMap.containsKey(containerID)) {
- throw new ContainerNotFoundException("Container with id #" +
- containerID.getId() + " not found.");
- }
- }
-
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/RatisUtil.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/RatisUtil.java
new file mode 100644
index 000000000000..1bc16974362f
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/RatisUtil.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.hdds.scm.ha;
+
+import com.google.common.base.Strings;
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.server.ServerUtils;
+import org.apache.ratis.RaftConfigKeys;
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.grpc.GrpcConfigKeys;
+import org.apache.ratis.rpc.RpcType;
+import org.apache.ratis.server.RaftServerConfigKeys;
+import org.apache.ratis.util.SizeInBytes;
+import org.apache.ratis.util.TimeDuration;
+
+import java.io.File;
+import java.util.Collections;
+import java.util.concurrent.TimeUnit;
+
+import static org.apache.ratis.server.RaftServerConfigKeys.Log;
+import static org.apache.ratis.server.RaftServerConfigKeys.RetryCache;
+import static org.apache.ratis.server.RaftServerConfigKeys.Rpc;
+import static org.apache.ratis.server.RaftServerConfigKeys.Snapshot;
+
+/**
+ * Ratis Util for SCM HA.
+ */
+public final class RatisUtil {
+
+ private RatisUtil() {
+ }
+
+
+ /**
+ * Constructs new Raft Properties instance using {@link SCMHAConfiguration}.
+ * @param haConf SCMHAConfiguration
+ * @param conf ConfigurationSource
+ */
+ public static RaftProperties newRaftProperties(
+ final SCMHAConfiguration haConf, final ConfigurationSource conf) {
+ //TODO: Remove ConfigurationSource!
+ // TODO: Check the default values.
+ final RaftProperties properties = new RaftProperties();
+ setRaftStorageDir(properties, haConf, conf);
+ setRaftRpcProperties(properties, haConf);
+ setRaftLogProperties(properties, haConf);
+ setRaftRetryCacheProperties(properties, haConf);
+ setRaftSnapshotProperties(properties, haConf);
+ return properties;
+ }
+
+ /**
+ * Set the local directory where ratis logs will be stored.
+ *
+ * @param properties RaftProperties instance which will be updated
+ * @param haConf SCMHAConfiguration
+ * @param conf ConfigurationSource
+ */
+ public static void setRaftStorageDir(final RaftProperties properties,
+ final SCMHAConfiguration haConf,
+ final ConfigurationSource conf) {
+ String storageDir = haConf.getRatisStorageDir();
+ if (Strings.isNullOrEmpty(storageDir)) {
+ storageDir = ServerUtils.getDefaultRatisDirectory(conf);
+ }
+ RaftServerConfigKeys.setStorageDir(properties,
+ Collections.singletonList(new File(storageDir)));
+ }
+
+ /**
+ * Set properties related to Raft RPC.
+ *
+ * @param properties RaftProperties instance which will be updated
+ * @param conf SCMHAConfiguration
+ */
+ private static void setRaftRpcProperties(final RaftProperties properties,
+ final SCMHAConfiguration conf) {
+ RaftConfigKeys.Rpc.setType(properties,
+ RpcType.valueOf(conf.getRatisRpcType()));
+ GrpcConfigKeys.Server.setPort(properties,
+ conf.getRatisBindAddress().getPort());
+ GrpcConfigKeys.setMessageSizeMax(properties,
+ SizeInBytes.valueOf("32m"));
+
+ Rpc.setRequestTimeout(properties, TimeDuration.valueOf(
+ conf.getRatisRequestTimeout(), TimeUnit.MILLISECONDS));
+ Rpc.setTimeoutMin(properties, TimeDuration.valueOf(
+ conf.getRatisRequestMinTimeout(), TimeUnit.MILLISECONDS));
+ Rpc.setTimeoutMax(properties, TimeDuration.valueOf(
+ conf.getRatisRequestMaxTimeout(), TimeUnit.MILLISECONDS));
+ Rpc.setSlownessTimeout(properties, TimeDuration.valueOf(
+ conf.getRatisNodeFailureTimeout(), TimeUnit.MILLISECONDS));
+ }
+
+ /**
+ * Set properties related to Raft Log.
+ *
+ * @param properties RaftProperties instance which will be updated
+ * @param conf SCMHAConfiguration
+ */
+ private static void setRaftLogProperties(final RaftProperties properties,
+ final SCMHAConfiguration conf) {
+ Log.setSegmentSizeMax(properties,
+ SizeInBytes.valueOf(conf.getRaftSegmentSize()));
+ Log.Appender.setBufferElementLimit(properties,
+ conf.getRaftLogAppenderQueueByteLimit());
+ Log.Appender.setBufferByteLimit(properties,
+ SizeInBytes.valueOf(conf.getRaftLogAppenderQueueByteLimit()));
+ Log.setPreallocatedSize(properties,
+ SizeInBytes.valueOf(conf.getRaftSegmentPreAllocatedSize()));
+ Log.Appender.setInstallSnapshotEnabled(properties, false);
+ Log.setPurgeGap(properties, conf.getRaftLogPurgeGap());
+ Log.setSegmentCacheNumMax(properties, 2);
+ }
+
+ /**
+ * Set properties related to Raft Retry Cache.
+ *
+ * @param properties RaftProperties instance which will be updated
+ * @param conf SCMHAConfiguration
+ */
+ private static void setRaftRetryCacheProperties(
+ final RaftProperties properties, final SCMHAConfiguration conf) {
+ RetryCache.setExpiryTime(properties, TimeDuration.valueOf(
+ conf.getRatisRetryCacheTimeout(), TimeUnit.MILLISECONDS));
+ }
+
+ /**
+ * Set properties related to Raft Snapshot.
+ *
+ * @param properties RaftProperties instance which will be updated
+ * @param conf SCMHAConfiguration
+ */
+ private static void setRaftSnapshotProperties(
+ final RaftProperties properties, final SCMHAConfiguration conf) {
+ Snapshot.setAutoTriggerEnabled(properties, true);
+ Snapshot.setAutoTriggerThreshold(properties, 400000);
+ }
+
+}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/ReflectionUtil.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/ReflectionUtil.java
new file mode 100644
index 000000000000..7c54723d7470
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/ReflectionUtil.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.hdds.scm.ha;
+
+import java.lang.reflect.Method;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Reflection util for SCM HA.
+ */
+public final class ReflectionUtil {
+
+ private static Map<String, Class<?>> classCache = new HashMap<>();
+
+ private ReflectionUtil() {
+ }
+
+ /**
+ * Returns the {@code Class} object associated with the given string name.
+ *
+ * @param className the fully qualified name of the desired class.
+ * @return the {@code Class} object for the class with the
+ * specified name.
+ * @throws ClassNotFoundException if the class cannot be located
+ */
+ public static Class<?> getClass(String className)
+ throws ClassNotFoundException {
+ if (!classCache.containsKey(className)) {
+ classCache.put(className, Class.forName(className));
+ }
+ return classCache.get(className);
+ }
+
+ /**
+ * Returns a {@code Method} object that reflects the specified public
+ * member method of the given {@code Class} object.
+ *
+ * @param clazz the class object which has the method
+ * @param methodName the name of the method
+ * @param arg the list of parameters
+ * @return the {@code Method} object that matches the specified
+ * {@code name} and {@code parameterTypes}
+ * @throws NoSuchMethodException if a matching method is not found
+ * or if the name is "<init>" or "<clinit>".
+ */
+ public static Method getMethod(
+ final Class<?> clazz, final String methodName, final Class<?>... arg)
+ throws NoSuchMethodException {
+ return clazz.getMethod(methodName, arg);
+ }
+}
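
Illustration, not part of the patch: a minimal usage sketch of the two lookups above, using stock JDK types purely to show the call shape (SCM HA callers would pass the fully qualified names of their own handler classes). Assumes the caller declares or handles the reflective checked exceptions.

    // Class objects are cached in the static map; Method objects are not.
    Class<?> listClass = ReflectionUtil.getClass("java.util.ArrayList");
    Method sizeMethod = ReflectionUtil.getMethod(listClass, "size");
    Object size = sizeMethod.invoke(new java.util.ArrayList<String>());
    // size is 0 for the empty list; a second getClass("java.util.ArrayList")
    // call is served from the cache.
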
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAConfiguration.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAConfiguration.java
new file mode 100644
index 000000000000..5fbf2688b1aa
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAConfiguration.java
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.hdds.scm.ha;
+
+import java.net.InetSocketAddress;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.hdds.conf.Config;
+import org.apache.hadoop.hdds.conf.ConfigGroup;
+import org.apache.hadoop.hdds.conf.ConfigType;
+import org.apache.hadoop.net.NetUtils;
+
+import static org.apache.hadoop.hdds.conf.ConfigTag.HA;
+import static org.apache.hadoop.hdds.conf.ConfigTag.OZONE;
+import static org.apache.hadoop.hdds.conf.ConfigTag.RATIS;
+import static org.apache.hadoop.hdds.conf.ConfigTag.SCM;
+
+/**
+ * Configuration used by SCM HA.
+ */
+@ConfigGroup(prefix = "ozone.scm.ha")
+public class SCMHAConfiguration {
+
+ @Config(key = "ratis.storage.dir",
+ type = ConfigType.STRING,
+ defaultValue = "",
+ tags = {OZONE, SCM, HA, RATIS},
+ description = "Storage directory used by SCM to write Ratis logs."
+ )
+ private String ratisStorageDir;
+
+ @Config(key = "ratis.bind.host",
+ type = ConfigType.STRING,
+ defaultValue = "0.0.0.0",
+ tags = {OZONE, SCM, HA, RATIS},
+ description = "Host used by SCM for binding Ratis Server."
+ )
+ private String ratisBindHost = "0.0.0.0";
+
+ @Config(key = "ratis.bind.port",
+ type = ConfigType.INT,
+ defaultValue = "9865",
+ tags = {OZONE, SCM, HA, RATIS},
+ description = "Port used by SCM for Ratis Server."
+ )
+ private int ratisBindPort = 9865;
+
+
+ @Config(key = "ratis.rpc.type",
+ type = ConfigType.STRING,
+ defaultValue = "GRPC",
+ tags = {SCM, OZONE, HA, RATIS},
+ description = "Ratis supports different kinds of transports like" +
+ " netty, GRPC, Hadoop RPC etc. This picks one of those for" +
+ " this cluster."
+ )
+ private String ratisRpcType;
+
+ @Config(key = "ratis.segment.size",
+ type = ConfigType.SIZE,
+ defaultValue = "16KB",
+ tags = {SCM, OZONE, HA, RATIS},
+ description = "The size of the raft segment used by Apache Ratis on" +
+ " SCM. (16 KB by default)"
+ )
+ private double raftSegmentSize = 16L * 1024L;
+
+ @Config(key = "ratis.segment.preallocated.size",
+ type = ConfigType.SIZE,
+ defaultValue = "16KB",
+ tags = {SCM, OZONE, HA, RATIS},
+ description = "The size of the buffer which is preallocated for" +
+ " raft segment used by Apache Ratis on SCM.(16 KB by default)"
+ )
+ private double raftSegmentPreAllocatedSize = 16 * 1024;
+
+ @Config(key = "ratis.log.appender.queue.num-elements",
+ type = ConfigType.INT,
+ defaultValue = "1024",
+ tags = {SCM, OZONE, HA, RATIS},
+ description = "Number of operation pending with Raft's Log Worker."
+ )
+ private int raftLogAppenderQueueNum = 1024;
+
+ @Config(key = "ratis.log.appender.queue.byte-limit",
+ type = ConfigType.SIZE,
+ defaultValue = "32MB",
+ tags = {SCM, OZONE, HA, RATIS},
+ description = "Byte limit for Raft's Log Worker queue."
+ )
+ private double raftLogAppenderQueueByteLimit = 32 * 1024 * 1024;
+
+ @Config(key = "ratis.log.purge.gap",
+ type = ConfigType.INT,
+ defaultValue = "1000000",
+ tags = {SCM, OZONE, HA, RATIS},
+ description = "The minimum gap between log indices for Raft server to" +
+ " purge its log segments after taking snapshot."
+ )
+ private int raftLogPurgeGap = 1000000;
+
+ @Config(key = "ratis.request.timeout",
+ type = ConfigType.TIME,
+ defaultValue = "3000ms",
+ tags = {SCM, OZONE, HA, RATIS},
+ description = "The timeout duration for SCM's Ratis server RPC."
+ )
+ private long ratisRequestTimeout = 3000L;
+
+ @Config(key = "ratis.server.retry.cache.timeout",
+ type = ConfigType.TIME,
+ defaultValue = "60s",
+ tags = {SCM, OZONE, HA, RATIS},
+ description = "Retry Cache entry timeout for SCM's ratis server."
+ )
+ private long ratisRetryCacheTimeout = 60 * 1000L;
+
+
+ @Config(key = "ratis.leader.election.timeout",
+ type = ConfigType.TIME,
+ defaultValue = "1s",
+ tags = {SCM, OZONE, HA, RATIS},
+ description = "The minimum timeout duration for SCM ratis leader" +
+ " election. Default is 1s."
+ )
+ private long ratisLeaderElectionTimeout = 1 * 1000L;
+
+ @Config(key = "ratis.server.failure.timeout.duration",
+ type = ConfigType.TIME,
+ defaultValue = "120s",
+ tags = {SCM, OZONE, HA, RATIS},
+ description = "The timeout duration for ratis server failure" +
+ " detection, once the threshold has reached, the ratis state" +
+ " machine will be informed about the failure in the ratis ring."
+ )
+ private long ratisNodeFailureTimeout = 120 * 1000L;
+
+ @Config(key = "ratis.server.role.check.interval",
+ type = ConfigType.TIME,
+ defaultValue = "15s",
+ tags = {SCM, OZONE, HA, RATIS},
+ description = "The interval between SCM leader performing a role" +
+ " check on its ratis server. Ratis server informs SCM if it loses" +
+ " the leader role. The scheduled check is an secondary check to" +
+ " ensure that the leader role is updated periodically"
+ )
+ private long ratisRoleCheckerInterval = 15 * 1000L;
+
+ public String getRatisStorageDir() {
+ return ratisStorageDir;
+ }
+
+ public InetSocketAddress getRatisBindAddress() {
+ return NetUtils.createSocketAddr(ratisBindHost, ratisBindPort);
+ }
+
+ public String getRatisRpcType() {
+ return ratisRpcType;
+ }
+
+ public long getRaftSegmentSize() {
+ return (long)raftSegmentSize;
+ }
+
+ public long getRaftSegmentPreAllocatedSize() {
+ return (long)raftSegmentPreAllocatedSize;
+ }
+
+ public int getRaftLogAppenderQueueNum() {
+ return raftLogAppenderQueueNum;
+ }
+
+ public int getRaftLogAppenderQueueByteLimit() {
+ return (int)raftLogAppenderQueueByteLimit;
+ }
+
+ public int getRaftLogPurgeGap() {
+ return raftLogPurgeGap;
+ }
+
+ public long getRatisRetryCacheTimeout() {
+ return ratisRetryCacheTimeout;
+ }
+
+ public long getRatisRequestTimeout() {
+ Preconditions.checkArgument(ratisRequestTimeout > 1000L,
+ "Ratis request timeout cannot be less than 1000ms.");
+ return ratisRequestTimeout;
+ }
+
+ public long getRatisRequestMinTimeout() {
+ return ratisRequestTimeout - 1000L;
+ }
+
+ public long getRatisRequestMaxTimeout() {
+ return ratisRequestTimeout + 1000L;
+ }
+
+ public long getRatisLeaderElectionTimeout() {
+ return ratisLeaderElectionTimeout;
+ }
+
+ public long getRatisNodeFailureTimeout() {
+ return ratisNodeFailureTimeout;
+ }
+
+ public long getRatisRoleCheckerInterval() {
+ return ratisRoleCheckerInterval;
+ }
+}
\ No newline at end of file
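
Illustration, not part of the patch: with the group prefix ozone.scm.ha, each @Config key above resolves to a full property name such as ozone.scm.ha.ratis.bind.port. A hedged sketch of materializing the object, mirroring the conf.getObject(...) call used by SCMHAManagerImpl later in this patch; OzoneConfiguration is assumed to be the usual ConfigurationSource implementation.

    // Populate two HA keys and read them back through the config object.
    OzoneConfiguration conf = new OzoneConfiguration();
    conf.set("ozone.scm.ha.ratis.rpc.type", "GRPC");
    conf.setInt("ozone.scm.ha.ratis.bind.port", 9865);

    SCMHAConfiguration haConf = conf.getObject(SCMHAConfiguration.class);
    // haConf.getRatisRpcType()               -> "GRPC"
    // haConf.getRatisBindAddress().getPort() -> 9865
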
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAInvocationHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAInvocationHandler.java
new file mode 100644
index 000000000000..cbe2ce38ef41
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAInvocationHandler.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.hdds.scm.ha;
+
+import java.lang.reflect.InvocationHandler;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+
+import org.apache.hadoop.hdds.protocol.proto.SCMRatisProtocol.RequestType;
+import org.apache.hadoop.hdds.scm.metadata.Replicate;
+import org.apache.hadoop.util.Time;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * InvocationHandler which checks for {@link Replicate} annotation and
+ * dispatches the request to Ratis Server.
+ */
+public class SCMHAInvocationHandler implements InvocationHandler {
+
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(SCMHAInvocationHandler.class);
+
+ private final RequestType requestType;
+ private final Object localHandler;
+ private final SCMRatisServer ratisHandler;
+
+ /**
+ * Constructs SCMHAInvocationHandler with the given request type,
+ * local handler and SCM Ratis server.
+ */
+ public SCMHAInvocationHandler(final RequestType requestType,
+ final Object localHandler,
+ final SCMRatisServer ratisHandler) {
+ this.requestType = requestType;
+ this.localHandler = localHandler;
+ this.ratisHandler = ratisHandler;
+ ratisHandler.registerStateMachineHandler(requestType, localHandler);
+ }
+
+ @Override
+ public Object invoke(final Object proxy, final Method method,
+ final Object[] args) throws Throwable {
+ try {
+ long startTime = Time.monotonicNow();
+ final Object result = method.isAnnotationPresent(Replicate.class) ?
+ invokeRatis(method, args) : invokeLocal(method, args);
+ LOG.debug("Call: {} took {} ms", method, Time.monotonicNow() - startTime);
+ return result;
+ } catch(InvocationTargetException iEx) {
+ throw iEx.getCause();
+ }
+ }
+
+ /**
+ * Invokes the method directly on the local handler.
+ */
+ private Object invokeLocal(Method method, Object[] args)
+ throws InvocationTargetException, IllegalAccessException {
+ LOG.trace("Invoking method {} on target {} with arguments {}",
+ method, localHandler, args);
+ return method.invoke(localHandler, args);
+ }
+
+ /**
+ * Submits the invocation as an SCMRatisRequest to the Ratis server and
+ * returns the replicated result.
+ */
+ private Object invokeRatis(Method method, Object[] args)
+ throws Exception {
+ LOG.trace("Invoking method {} on target {}", method, ratisHandler);
+ final SCMRatisResponse response = ratisHandler.submitRequest(
+ SCMRatisRequest.of(requestType, method.getName(), args));
+ if (response.isSuccess()) {
+ return response.getResult();
+ }
+ // Should we unwrap and throw proper exception from here?
+ throw response.getException();
+ }
+
+}
\ No newline at end of file
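
Illustration, not part of the patch: the handler is designed to sit behind a JDK dynamic proxy so that interface methods annotated with @Replicate are routed through invokeRatis() while everything else falls through to the local implementation. A hedged sketch of that wiring; MyStateManager, requestType, localImpl and ratisServer are placeholders, not names taken from this patch.

    // Wrap a local implementation so that @Replicate methods go through Ratis.
    MyStateManager proxy = (MyStateManager) Proxy.newProxyInstance(
        SCMHAInvocationHandler.class.getClassLoader(),
        new Class<?>[] {MyStateManager.class},
        new SCMHAInvocationHandler(requestType, localImpl, ratisServer));
    // proxy.someReplicatedMethod(...) -> invokeRatis(...)
    // proxy.someLocalMethod(...)      -> invokeLocal(...)
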
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManager.java
new file mode 100644
index 000000000000..59410b19c2df
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManager.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.hdds.scm.ha;
+
+import java.io.IOException;
+import java.util.Optional;
+
+/**
+ * SCMHAManager provides HA service for SCM.
+ */
+public interface SCMHAManager {
+
+ /**
+ * Starts HA service.
+ */
+ void start() throws IOException;
+
+ /**
+ * For HA mode, return an Optional that holds the term of the
+ * underlying RaftServer iff the current SCM is in the leader role.
+ * Otherwise, return an empty Optional.
+ *
+ * For non-HA mode, return an Optional that holds term 0.
+ */
+ Optional<Long> isLeader();
+
+ /**
+ * Returns RatisServer instance associated with the SCM instance.
+ */
+ SCMRatisServer getRatisServer();
+
+ /**
+ * Stops the HA service.
+ */
+ void shutdown() throws IOException;
+}
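
Illustration, not part of the patch: a caller-side sketch of the Optional contract described in the isLeader() javadoc; scmHAManager is a placeholder reference to any SCMHAManager.

    // An empty Optional means this SCM is not (or not known to be) the leader;
    // a present value carries the Raft term (term 0 in non-HA mode).
    Optional<Long> term = scmHAManager.isLeader();
    if (term.isPresent()) {
      // safe to act as leader; term.get() can be used for fencing
    } else {
      // defer the operation or redirect to the current leader
    }
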
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManagerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManagerImpl.java
new file mode 100644
index 000000000000..ae91fc2e8f72
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManagerImpl.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.hdds.scm.ha;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.ratis.proto.RaftProtos;
+import org.apache.ratis.server.RaftServer;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Optional;
+
+/**
+ * SCMHAManagerImpl uses Apache Ratis for its HA implementation. We will have
+ * a 2N+1 node Ratis ring. The ring will have one leader node and 2N follower
+ * nodes.
+ *
+ * TODO
+ *
+ */
+public class SCMHAManagerImpl implements SCMHAManager {
+
+ private static final Logger LOG =
+ LoggerFactory.getLogger(SCMHAManagerImpl.class);
+
+ private final SCMRatisServer ratisServer;
+ private final ConfigurationSource conf;
+
+ /**
+ * Creates SCMHAManager instance.
+ */
+ public SCMHAManagerImpl(final ConfigurationSource conf) throws IOException {
+ this.conf = conf;
+ this.ratisServer = new SCMRatisServerImpl(
+ conf.getObject(SCMHAConfiguration.class), conf);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void start() throws IOException {
+ ratisServer.start();
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public Optional<Long> isLeader() {
+ if (!SCMHAUtils.isSCMHAEnabled(conf)) {
+ // When SCM HA is not enabled, the current SCM is always the leader.
+ return Optional.of((long)0);
+ }
+ RaftServer server = ratisServer.getDivision().getRaftServer();
+ Preconditions.checkState(server instanceof RaftServerProxy);
+ try {
+ // SCM only has one raft group.
+ RaftServerImpl serverImpl = ((RaftServerProxy) server)
+ .getImpl(ratisServer.getDivision().getGroup().getGroupId());
+ if (serverImpl != null) {
+ // TODO: getRoleInfoProto() will be exposed from Division later.
+ RaftProtos.RoleInfoProto roleInfoProto = serverImpl.getRoleInfoProto();
+ return roleInfoProto.hasLeaderInfo()
+ ? Optional.of(roleInfoProto.getLeaderInfo().getTerm())
+ : Optional.empty();
+ }
+ } catch (IOException ioe) {
+ LOG.error("Fail to get RaftServer impl and therefore it's not clear " +
+ "whether it's leader. ", ioe);
+ }
+ return Optional.empty();
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public SCMRatisServer getRatisServer() {
+ return ratisServer;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void shutdown() throws IOException {
+ ratisServer.stop();
+ }
+}
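
Illustration, not part of the patch: the lifecycle implied by the implementation above, with conf standing in for whatever ConfigurationSource the SCM already holds.

    // Create, start and later stop the HA service.
    SCMHAManager haManager = new SCMHAManagerImpl(conf);
    haManager.start();       // starts the embedded SCM Ratis server
    // ... leadership can be probed at any point via haManager.isLeader() ...
    haManager.shutdown();    // stops the Ratis server
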
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java
new file mode 100644
index 000000000000..48946b487913
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.ha;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
+
+import java.util.Collection;
+
+/**
+ * Utility class used by SCM HA.
+ */
+public final class SCMHAUtils {
+ private SCMHAUtils() {
+ // not used
+ }
+
+ // Check if SCM HA is enabled.
+ public static boolean isSCMHAEnabled(ConfigurationSource conf) {
+ return conf.getBoolean(ScmConfigKeys.OZONE_SCM_HA_ENABLE_KEY,
+ ScmConfigKeys.OZONE_SCM_HA_ENABLE_DEFAULT);
+ }
+
+ /**
+ * Get a collection of all scmNodeIds for the given scmServiceId.
+ */
+ public static Collection<String> getSCMNodeIds(Configuration conf,
+ String scmServiceId) {
+ String key = addSuffix(ScmConfigKeys.OZONE_SCM_NODES_KEY, scmServiceId);
+ return conf.getTrimmedStringCollection(key);
+ }
+
+ public static String getLocalSCMNodeId(String scmServiceId) {
+ return addSuffix(ScmConfigKeys.OZONE_SCM_NODES_KEY, scmServiceId);
+ }
+
+ /**
+ * Add non empty and non null suffix to a key.
+ */
+ private static String addSuffix(String key, String suffix) {
+ if (suffix == null || suffix.isEmpty()) {
+ return key;
+ }
+ assert !suffix.startsWith(".") :
+ "suffix '" + suffix + "' should not already have '.' prepended.";
+ return key + "." + suffix;
+ }
+}
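
Illustration, not part of the patch: how addSuffix expands the base key, assuming ScmConfigKeys.OZONE_SCM_NODES_KEY is "ozone.scm.nodes" as its name suggests (the constant's value is not shown in this section).

    // addSuffix("ozone.scm.nodes", "scmservice") -> "ozone.scm.nodes.scmservice"
    Configuration conf = new Configuration();
    conf.set("ozone.scm.nodes.scmservice", "scm1,scm2,scm3");
    Collection<String> nodeIds = SCMHAUtils.getSCMNodeIds(conf, "scmservice");
    // nodeIds -> [scm1, scm2, scm3]
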
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMNodeDetails.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMNodeDetails.java
new file mode 100644
index 000000000000..2390cb3a87cd
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMNodeDetails.java
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *