iter = getBlockReportIterator();
- while (iter.hasNext()) {
- Block b = iter.next();
- if (b.getGenerationStamp() > maxGs) {
- maxGs = b.getGenerationStamp();
+ @InterfaceAudience.Private
+ public static class BlockReportReplica extends Block implements Replica {
+ private ReplicaState state;
+ private BlockReportReplica() {
+ }
+ public BlockReportReplica(Block block) {
+ super(block);
+ if (block instanceof BlockReportReplica) {
+ this.state = ((BlockReportReplica)block).getState();
+ } else {
+ this.state = ReplicaState.FINALIZED;
}
}
- return maxGs;
+ public void setState(ReplicaState state) {
+ this.state = state;
+ }
+ @Override
+ public ReplicaState getState() {
+ return state;
+ }
+ @Override
+ public long getBytesOnDisk() {
+ return getNumBytes();
+ }
+ @Override
+ public long getVisibleLength() {
+ throw new UnsupportedOperationException();
+ }
+ @Override
+ public String getStorageUuid() {
+ throw new UnsupportedOperationException();
+ }
+ @Override
+ public boolean isOnTransientStorage() {
+ throw new UnsupportedOperationException();
+ }
+ @Override
+ public boolean equals(Object o) {
+ return super.equals(o);
+ }
+ @Override
+ public int hashCode() {
+ return super.hashCode();
+ }
}
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java
index d4fe9038e3dce..bafb02b8d4acc 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java
@@ -798,12 +798,17 @@ public boolean setSafeMode(HdfsConstants.SafeModeAction action, boolean isChecke
*
* Saves current namespace into storage directories and reset edits log.
* Requires superuser privilege and safe mode.
- *
- * @throws AccessControlException if the superuser privilege is violated.
+ *
+ * @param timeWindow NameNode does a checkpoint if the latest checkpoint was
+ * done longer ago than the given time period (in seconds).
+ * @param txGap NameNode does a checkpoint if the gap between the latest
+ * checkpoint and the latest transaction id is greater than this gap.
+ * @return whether an extra checkpoint has been done
+ *
* @throws IOException if image creation failed.
*/
@AtMostOnce
- public void saveNamespace() throws AccessControlException, IOException;
+ public boolean saveNamespace(long timeWindow, long txGap) throws IOException;
/**
@@ -847,7 +852,7 @@ public boolean restoreFailedStorage(String arg)
/**
* Rolling upgrade operations.
- * @param action either query, start or finailze.
+ * @param action either query, prepare or finalize.
* @return rolling upgrade information.
*/
@Idempotent
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/DatanodeID.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/DatanodeID.java
index 779e3b905f1a9..f91696fb284ed 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/DatanodeID.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/DatanodeID.java
@@ -47,19 +47,23 @@ public class DatanodeID implements Comparable {
private int infoSecurePort; // info server port
private int ipcPort; // IPC server port
private String xferAddr;
- private int hashCode = -1;
/**
* UUID identifying a given datanode. For upgraded Datanodes this is the
* same as the StorageID that was previously used by this Datanode.
* For newly formatted Datanodes it is a UUID.
*/
- private String datanodeUuid = null;
+ private final String datanodeUuid;
public DatanodeID(DatanodeID from) {
+ this(from.getDatanodeUuid(), from);
+ }
+
+ @VisibleForTesting
+ public DatanodeID(String datanodeUuid, DatanodeID from) {
this(from.getIpAddr(),
from.getHostName(),
- from.getDatanodeUuid(),
+ datanodeUuid,
from.getXferPort(),
from.getInfoPort(),
from.getInfoSecurePort(),
@@ -81,19 +85,24 @@ public DatanodeID(DatanodeID from) {
*/
public DatanodeID(String ipAddr, String hostName, String datanodeUuid,
int xferPort, int infoPort, int infoSecurePort, int ipcPort) {
- this.ipAddr = ipAddr;
+ setIpAndXferPort(ipAddr, xferPort);
this.hostName = hostName;
this.datanodeUuid = checkDatanodeUuid(datanodeUuid);
- this.xferPort = xferPort;
this.infoPort = infoPort;
this.infoSecurePort = infoSecurePort;
this.ipcPort = ipcPort;
- updateXferAddrAndInvalidateHashCode();
}
public void setIpAddr(String ipAddr) {
+ // Updated during registration; preserve the former xferPort.
+ setIpAndXferPort(ipAddr, xferPort);
+ }
+
+ private void setIpAndXferPort(String ipAddr, int xferPort) {
+ // build xferAddr string to reduce cost of frequent use
this.ipAddr = ipAddr;
- updateXferAddrAndInvalidateHashCode();
+ this.xferPort = xferPort;
+ this.xferAddr = ipAddr + ":" + xferPort;
}
public void setPeerHostName(String peerHostName) {
@@ -107,12 +116,6 @@ public String getDatanodeUuid() {
return datanodeUuid;
}
- @VisibleForTesting
- public void setDatanodeUuidForTesting(String datanodeUuid) {
- this.datanodeUuid = datanodeUuid;
- updateXferAddrAndInvalidateHashCode();
- }
-
private String checkDatanodeUuid(String uuid) {
if (uuid == null || uuid.isEmpty()) {
return null;
@@ -242,11 +245,7 @@ public boolean equals(Object to) {
@Override
public int hashCode() {
- if (hashCode == -1) {
- int newHashCode = xferAddr.hashCode() ^ datanodeUuid.hashCode();
- hashCode = newHashCode & Integer.MAX_VALUE;
- }
- return hashCode;
+ return datanodeUuid.hashCode();
}
@Override
@@ -259,14 +258,12 @@ public String toString() {
* Note that this does not update storageID.
*/
public void updateRegInfo(DatanodeID nodeReg) {
- ipAddr = nodeReg.getIpAddr();
+ setIpAndXferPort(nodeReg.getIpAddr(), nodeReg.getXferPort());
hostName = nodeReg.getHostName();
peerHostName = nodeReg.getPeerHostName();
- xferPort = nodeReg.getXferPort();
infoPort = nodeReg.getInfoPort();
infoSecurePort = nodeReg.getInfoSecurePort();
ipcPort = nodeReg.getIpcPort();
- updateXferAddrAndInvalidateHashCode();
}
/**
@@ -279,13 +276,4 @@ public void updateRegInfo(DatanodeID nodeReg) {
public int compareTo(DatanodeID that) {
return getXferAddr().compareTo(that.getXferAddr());
}
-
- // NOTE: mutable hash codes are dangerous, however this class chooses to
- // use them. this method must be called when a value mutates that is used
- // to compute the hash, equality, or comparison of instances.
- private void updateXferAddrAndInvalidateHashCode() {
- xferAddr = ipAddr + ":" + xferPort;
- // can't compute new hash yet because uuid might still null...
- hashCode = -1;
- }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/DatanodeInfo.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/DatanodeInfo.java
index 9fcada734ae6d..5ded26b185232 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/DatanodeInfo.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/DatanodeInfo.java
@@ -49,6 +49,7 @@ public class DatanodeInfo extends DatanodeID implements Node {
private long cacheCapacity;
private long cacheUsed;
private long lastUpdate;
+ private long lastUpdateMonotonic;
private int xceiverCount;
private String location = NetworkTopology.DEFAULT_RACK;
private String softwareVersion;
@@ -91,6 +92,7 @@ public DatanodeInfo(DatanodeInfo from) {
this.cacheCapacity = from.getCacheCapacity();
this.cacheUsed = from.getCacheUsed();
this.lastUpdate = from.getLastUpdate();
+ this.lastUpdateMonotonic = from.getLastUpdateMonotonic();
this.xceiverCount = from.getXceiverCount();
this.location = from.getNetworkLocation();
this.adminState = from.getAdminState();
@@ -105,6 +107,7 @@ public DatanodeInfo(DatanodeID nodeID) {
this.cacheCapacity = 0L;
this.cacheUsed = 0L;
this.lastUpdate = 0L;
+ this.lastUpdateMonotonic = 0L;
this.xceiverCount = 0;
this.adminState = null;
}
@@ -117,13 +120,13 @@ public DatanodeInfo(DatanodeID nodeID, String location) {
public DatanodeInfo(DatanodeID nodeID, String location,
final long capacity, final long dfsUsed, final long remaining,
final long blockPoolUsed, final long cacheCapacity, final long cacheUsed,
- final long lastUpdate, final int xceiverCount,
- final AdminStates adminState) {
+ final long lastUpdate, final long lastUpdateMonotonic,
+ final int xceiverCount, final AdminStates adminState) {
this(nodeID.getIpAddr(), nodeID.getHostName(), nodeID.getDatanodeUuid(),
nodeID.getXferPort(), nodeID.getInfoPort(), nodeID.getInfoSecurePort(),
nodeID.getIpcPort(), capacity, dfsUsed, remaining, blockPoolUsed,
- cacheCapacity, cacheUsed, lastUpdate, xceiverCount, location,
- adminState);
+ cacheCapacity, cacheUsed, lastUpdate, lastUpdateMonotonic,
+ xceiverCount, location, adminState);
}
/** Constructor */
@@ -132,8 +135,9 @@ public DatanodeInfo(final String ipAddr, final String hostName,
final int infoSecurePort, final int ipcPort,
final long capacity, final long dfsUsed, final long remaining,
final long blockPoolUsed, final long cacheCapacity, final long cacheUsed,
- final long lastUpdate, final int xceiverCount,
- final String networkLocation, final AdminStates adminState) {
+ final long lastUpdate, final long lastUpdateMonotonic,
+ final int xceiverCount, final String networkLocation,
+ final AdminStates adminState) {
super(ipAddr, hostName, datanodeUuid, xferPort, infoPort,
infoSecurePort, ipcPort);
this.capacity = capacity;
@@ -143,6 +147,7 @@ public DatanodeInfo(final String ipAddr, final String hostName,
this.cacheCapacity = cacheCapacity;
this.cacheUsed = cacheUsed;
this.lastUpdate = lastUpdate;
+ this.lastUpdateMonotonic = lastUpdateMonotonic;
this.xceiverCount = xceiverCount;
this.location = networkLocation;
this.adminState = adminState;
@@ -223,9 +228,26 @@ public float getCacheRemainingPercent() {
return DFSUtil.getPercentRemaining(getCacheRemaining(), cacheCapacity);
}
- /** The time when this information was accurate. */
+ /**
+ * Get the last update timestamp.
+ * Return value is suitable for Date conversion.
+ */
public long getLastUpdate() { return lastUpdate; }
+ /**
+ * The time when this information was accurate.
+ * Note: the return value is ideal for calculating time differences
+ * and should not be converted to a Date.
+ */
+ public long getLastUpdateMonotonic() { return lastUpdateMonotonic;}
+
+ /**
+ * Set lastUpdate monotonic time
+ */
+ public void setLastUpdateMonotonic(long lastUpdateMonotonic) {
+ this.lastUpdateMonotonic = lastUpdateMonotonic;
+ }
+
/** number of active connections */
public int getXceiverCount() { return xceiverCount; }
@@ -437,7 +459,7 @@ public AdminStates getAdminState() {
* @return true if the node is stale
*/
public boolean isStale(long staleInterval) {
- return (Time.now() - lastUpdate) >= staleInterval;
+ return (Time.monotonicNow() - lastUpdateMonotonic) >= staleInterval;
}
/**
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LocatedBlock.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LocatedBlock.java
index 0d521918484b3..e729869878f82 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LocatedBlock.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LocatedBlock.java
@@ -44,9 +44,9 @@ public class LocatedBlock {
private long offset; // offset of the first byte of the block in the file
private final DatanodeInfoWithStorage[] locs;
/** Cached storage ID for each replica */
- private String[] storageIDs;
+ private final String[] storageIDs;
/** Cached storage type for each replica, if reported. */
- private StorageType[] storageTypes;
+ private final StorageType[] storageTypes;
// corrupt flag is true if all of the replicas of a block are corrupt.
// else false. If block has few corrupt replicas, they are filtered and
// their locations are not part of this object
@@ -62,16 +62,8 @@ public class LocatedBlock {
new DatanodeInfoWithStorage[0];
public LocatedBlock(ExtendedBlock b, DatanodeInfo[] locs) {
- this(b, locs, -1, false); // startOffset is unknown
- }
-
- public LocatedBlock(ExtendedBlock b, DatanodeInfo[] locs, long startOffset,
- boolean corrupt) {
- this(b, locs, null, null, startOffset, corrupt, EMPTY_LOCS);
- }
-
- public LocatedBlock(ExtendedBlock b, DatanodeStorageInfo[] storages) {
- this(b, storages, -1, false); // startOffset is unknown
+ // By default, startOffset is unknown (-1) and corrupt is false.
+ this(b, locs, null, null, -1, false, EMPTY_LOCS);
}
public LocatedBlock(ExtendedBlock b, DatanodeInfo[] locs,
@@ -170,11 +162,11 @@ public long getBlockSize() {
return b.getNumBytes();
}
- void setStartOffset(long value) {
+ public void setStartOffset(long value) {
this.offset = value;
}
- void setCorrupt(boolean corrupt) {
+ public void setCorrupt(boolean corrupt) {
this.corrupt = corrupt;
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LocatedBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LocatedBlocks.java
index fc739cf711e33..e35a43107444b 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LocatedBlocks.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LocatedBlocks.java
@@ -119,7 +119,7 @@ public FileEncryptionInfo getFileEncryptionInfo() {
public int findBlock(long offset) {
// create fake block of size 0 as a key
LocatedBlock key = new LocatedBlock(
- new ExtendedBlock(), new DatanodeInfo[0], 0L, false);
+ new ExtendedBlock(), new DatanodeInfo[0]);
key.setStartOffset(offset);
key.getBlock().setNumBytes(1);
Comparator comp =
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/RollingUpgradeInfo.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/RollingUpgradeInfo.java
index 98089bc6ef873..80e3e3478d3fb 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/RollingUpgradeInfo.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/RollingUpgradeInfo.java
@@ -29,12 +29,12 @@
@InterfaceStability.Evolving
public class RollingUpgradeInfo extends RollingUpgradeStatus {
private final long startTime;
- private final long finalizeTime;
+ private long finalizeTime;
private boolean createdRollbackImages;
public RollingUpgradeInfo(String blockPoolId, boolean createdRollbackImages,
long startTime, long finalizeTime) {
- super(blockPoolId);
+ super(blockPoolId, finalizeTime != 0);
this.createdRollbackImages = createdRollbackImages;
this.startTime = startTime;
this.finalizeTime = finalizeTime;
@@ -56,11 +56,23 @@ public boolean isStarted() {
public long getStartTime() {
return startTime;
}
-
+
+ @Override
public boolean isFinalized() {
return finalizeTime != 0;
}
+ /**
+ * Finalize the upgrade by recording the given finalize time.
+ * @param finalizeTime the time the upgrade was finalized; a value of 0 is ignored
+ */
+ public void finalize(long finalizeTime) {
+ if (finalizeTime != 0) {
+ this.finalizeTime = finalizeTime;
+ createdRollbackImages = false;
+ }
+ }
+
public long getFinalizeTime() {
return finalizeTime;
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/RollingUpgradeStatus.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/RollingUpgradeStatus.java
index 9925920250b4e..1f969fbb0c12b 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/RollingUpgradeStatus.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/RollingUpgradeStatus.java
@@ -27,15 +27,21 @@
@InterfaceStability.Evolving
public class RollingUpgradeStatus {
private final String blockPoolId;
+ private final boolean finalized;
- public RollingUpgradeStatus(String blockPoolId) {
+ public RollingUpgradeStatus(String blockPoolId, boolean finalized) {
this.blockPoolId = blockPoolId;
+ this.finalized = finalized;
}
public String getBlockPoolId() {
return blockPoolId;
}
+ public boolean isFinalized() {
+ return finalized;
+ }
+
@Override
public int hashCode() {
return blockPoolId.hashCode();
@@ -48,8 +54,9 @@ public boolean equals(Object obj) {
} else if (obj == null || !(obj instanceof RollingUpgradeStatus)) {
return false;
}
- final RollingUpgradeStatus that = (RollingUpgradeStatus)obj;
- return this.blockPoolId.equals(that.blockPoolId);
+ final RollingUpgradeStatus that = (RollingUpgradeStatus) obj;
+ return this.blockPoolId.equals(that.blockPoolId)
+ && this.isFinalized() == that.isFinalized();
}
@Override
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/DataTransferProtocol.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/DataTransferProtocol.java
index 4be42a8d35891..48e931d741a03 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/DataTransferProtocol.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/DataTransferProtocol.java
@@ -138,10 +138,13 @@ public void transferBlock(final ExtendedBlock blk,
* to use no slot id.
* @param maxVersion Maximum version of the block data the client
* can understand.
+ * @param supportsReceiptVerification True if the client supports
+ * receipt verification.
*/
public void requestShortCircuitFds(final ExtendedBlock blk,
final Token blockToken,
- SlotId slotId, int maxVersion) throws IOException;
+ SlotId slotId, int maxVersion, boolean supportsReceiptVerification)
+ throws IOException;
/**
* Release a pair of short-circuit FDs requested earlier.
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/PipelineAck.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/PipelineAck.java
index 35e5bb84dc7b0..9bd4115b59ff9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/PipelineAck.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/PipelineAck.java
@@ -130,13 +130,16 @@ public PipelineAck(long seqno, int[] replies) {
*/
public PipelineAck(long seqno, int[] replies,
long downstreamAckTimeNanos) {
- ArrayList replyList = Lists.newArrayList();
+ ArrayList statusList = Lists.newArrayList();
+ ArrayList flagList = Lists.newArrayList();
for (int r : replies) {
- replyList.add(r);
+ statusList.add(StatusFormat.getStatus(r));
+ flagList.add(r);
}
proto = PipelineAckProto.newBuilder()
.setSeqno(seqno)
- .addAllReply(replyList)
+ .addAllReply(statusList)
+ .addAllFlag(flagList)
.setDownstreamAckTimeNanos(downstreamAckTimeNanos)
.build();
}
@@ -158,11 +161,18 @@ public short getNumOfReplies() {
}
/**
- * get the ith reply
- * @return the the ith reply
+ * get the header flag of ith reply
*/
- public int getReply(int i) {
- return proto.getReply(i);
+ public int getHeaderFlag(int i) {
+ if (proto.getFlagCount() > 0) {
+ return proto.getFlag(i);
+ } else {
+ return combineHeader(ECN.DISABLED, proto.getReply(i));
+ }
+ }
+
+ public int getFlag(int i) {
+ return proto.getFlag(i);
}
/**
@@ -178,8 +188,8 @@ public long getDownstreamAckTimeNanos() {
* @return true if all statuses are SUCCESS
*/
public boolean isSuccess() {
- for (int reply : proto.getReplyList()) {
- if (StatusFormat.getStatus(reply) != Status.SUCCESS) {
+ for (Status s : proto.getReplyList()) {
+ if (s != Status.SUCCESS) {
return false;
}
}
@@ -196,10 +206,9 @@ public Status getOOBStatus() {
if (getSeqno() != UNKOWN_SEQNO) {
return null;
}
- for (int reply : proto.getReplyList()) {
+ for (Status s : proto.getReplyList()) {
// The following check is valid because protobuf guarantees to
// preserve the ordering of enum elements.
- Status s = StatusFormat.getStatus(reply);
if (s.getNumber() >= OOB_START && s.getNumber() <= OOB_END) {
return s;
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/Receiver.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/Receiver.java
index 7994027c6ec83..31bdc5e2a52a4 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/Receiver.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/Receiver.java
@@ -186,7 +186,7 @@ private void opRequestShortCircuitFds(DataInputStream in) throws IOException {
try {
requestShortCircuitFds(PBHelper.convert(proto.getHeader().getBlock()),
PBHelper.convert(proto.getHeader().getToken()),
- slotId, proto.getMaxVersion());
+ slotId, proto.getMaxVersion(), true);
} finally {
if (traceScope != null) traceScope.close();
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/Sender.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/Sender.java
index 7fea33efc59c5..df69125882bc6 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/Sender.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/Sender.java
@@ -181,7 +181,8 @@ public void transferBlock(final ExtendedBlock blk,
@Override
public void requestShortCircuitFds(final ExtendedBlock blk,
final Token blockToken,
- SlotId slotId, int maxVersion) throws IOException {
+ SlotId slotId, int maxVersion, boolean supportsReceiptVerification)
+ throws IOException {
OpRequestShortCircuitAccessProto.Builder builder =
OpRequestShortCircuitAccessProto.newBuilder()
.setHeader(DataTransferProtoUtil.buildBaseHeader(
@@ -189,6 +190,7 @@ public void requestShortCircuitFds(final ExtendedBlock blk,
if (slotId != null) {
builder.setSlotId(PBHelper.convert(slotId));
}
+ builder.setSupportsReceiptVerification(supportsReceiptVerification);
OpRequestShortCircuitAccessProto proto = builder.build();
send(out, Op.REQUEST_SHORT_CIRCUIT_FDS, proto);
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolServerSideTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolServerSideTranslatorPB.java
index ce8c3924e39a5..e26158b208df0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolServerSideTranslatorPB.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolServerSideTranslatorPB.java
@@ -277,10 +277,7 @@ public class ClientNamenodeProtocolServerSideTranslatorPB implements
private static final RenewLeaseResponseProto VOID_RENEWLEASE_RESPONSE =
RenewLeaseResponseProto.newBuilder().build();
- private static final SaveNamespaceResponseProto VOID_SAVENAMESPACE_RESPONSE =
- SaveNamespaceResponseProto.newBuilder().build();
-
- private static final RefreshNodesResponseProto VOID_REFRESHNODES_RESPONSE =
+ private static final RefreshNodesResponseProto VOID_REFRESHNODES_RESPONSE =
RefreshNodesResponseProto.newBuilder().build();
private static final FinalizeUpgradeResponseProto VOID_FINALIZEUPGRADE_RESPONSE =
@@ -748,14 +745,15 @@ public SetSafeModeResponseProto setSafeMode(RpcController controller,
public SaveNamespaceResponseProto saveNamespace(RpcController controller,
SaveNamespaceRequestProto req) throws ServiceException {
try {
- server.saveNamespace();
- return VOID_SAVENAMESPACE_RESPONSE;
+ final long timeWindow = req.hasTimeWindow() ? req.getTimeWindow() : 0;
+ final long txGap = req.hasTxGap() ? req.getTxGap() : 0;
+ boolean saved = server.saveNamespace(timeWindow, txGap);
+ return SaveNamespaceResponseProto.newBuilder().setSaved(saved).build();
} catch (IOException e) {
throw new ServiceException(e);
}
-
}
-
+
@Override
public RollEditsResponseProto rollEdits(RpcController controller,
RollEditsRequestProto request) throws ServiceException {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolTranslatorPB.java
index e970293ede10a..4ec6f9ea05e94 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolTranslatorPB.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolTranslatorPB.java
@@ -670,9 +670,11 @@ public boolean setSafeMode(SafeModeAction action, boolean isChecked) throws IOEx
}
@Override
- public void saveNamespace() throws AccessControlException, IOException {
+ public boolean saveNamespace(long timeWindow, long txGap) throws IOException {
try {
- rpcProxy.saveNamespace(null, VOID_SAVE_NAMESPACE_REQUEST);
+ SaveNamespaceRequestProto req = SaveNamespaceRequestProto.newBuilder()
+ .setTimeWindow(timeWindow).setTxGap(txGap).build();
+ return rpcProxy.saveNamespace(null, req).getSaved();
} catch (ServiceException e) {
throw ProtobufHelper.getRemoteException(e);
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java
index 192916f365d24..825e83586b323 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java
@@ -26,6 +26,7 @@
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
@@ -46,11 +47,13 @@
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.StorageBlockReportProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.StorageReceivedDeletedBlocksProto;
import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.VersionRequestProto;
+import org.apache.hadoop.hdfs.server.protocol.BlockReportContext;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
+import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo.Capability;
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport;
import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
@@ -64,6 +67,7 @@
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation;
+import com.google.common.annotations.VisibleForTesting;
import com.google.protobuf.RpcController;
import com.google.protobuf.ServiceException;
@@ -83,6 +87,11 @@ public class DatanodeProtocolClientSideTranslatorPB implements
VersionRequestProto.newBuilder().build();
private final static RpcController NULL_CONTROLLER = null;
+ @VisibleForTesting
+ public DatanodeProtocolClientSideTranslatorPB(DatanodeProtocolPB rpcProxy) {
+ this.rpcProxy = rpcProxy;
+ }
+
public DatanodeProtocolClientSideTranslatorPB(InetSocketAddress nameNodeAddr,
Configuration conf) throws IOException {
RPC.setProtocolEngine(conf, DatanodeProtocolPB.class,
@@ -161,20 +170,30 @@ public HeartbeatResponse sendHeartbeat(DatanodeRegistration registration,
@Override
public DatanodeCommand blockReport(DatanodeRegistration registration,
- String poolId, StorageBlockReport[] reports) throws IOException {
+ String poolId, StorageBlockReport[] reports, BlockReportContext context)
+ throws IOException {
BlockReportRequestProto.Builder builder = BlockReportRequestProto
.newBuilder().setRegistration(PBHelper.convert(registration))
.setBlockPoolId(poolId);
+ boolean useBlocksBuffer = registration.getNamespaceInfo()
+ .isCapabilitySupported(Capability.STORAGE_BLOCK_REPORT_BUFFERS);
+
for (StorageBlockReport r : reports) {
StorageBlockReportProto.Builder reportBuilder = StorageBlockReportProto
.newBuilder().setStorage(PBHelper.convert(r.getStorage()));
- long[] blocks = r.getBlocks();
- for (int i = 0; i < blocks.length; i++) {
- reportBuilder.addBlocks(blocks[i]);
+ BlockListAsLongs blocks = r.getBlocks();
+ if (useBlocksBuffer) {
+ reportBuilder.setNumberOfBlocks(blocks.getNumberOfBlocks());
+ reportBuilder.addAllBlocksBuffers(blocks.getBlocksBuffers());
+ } else {
+ for (long value : blocks.getBlockListAsLongs()) {
+ reportBuilder.addBlocks(value);
+ }
}
builder.addReports(reportBuilder.build());
}
+ builder.setContext(PBHelper.convert(context));
BlockReportResponseProto resp;
try {
resp = rpcProxy.blockReport(NULL_CONTROLLER, builder.build());
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolServerSideTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolServerSideTranslatorPB.java
index 1a89090f13e3e..873eb6d1708e7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolServerSideTranslatorPB.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolServerSideTranslatorPB.java
@@ -21,6 +21,7 @@
import java.io.IOException;
import java.util.List;
+import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.RollingUpgradeStatus;
@@ -58,6 +59,7 @@
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
+import com.google.common.base.Preconditions;
import com.google.protobuf.RpcController;
import com.google.protobuf.ServiceException;
@@ -145,17 +147,23 @@ public BlockReportResponseProto blockReport(RpcController controller,
int index = 0;
for (StorageBlockReportProto s : request.getReportsList()) {
- List blockIds = s.getBlocksList();
- long[] blocks = new long[blockIds.size()];
- for (int i = 0; i < blockIds.size(); i++) {
- blocks[i] = blockIds.get(i);
+ final BlockListAsLongs blocks;
+ if (s.hasNumberOfBlocks()) { // new style buffer based reports
+ int num = (int)s.getNumberOfBlocks();
+ Preconditions.checkState(s.getBlocksCount() == 0,
+ "cannot send both blocks list and buffers");
+ blocks = BlockListAsLongs.decodeBuffers(num, s.getBlocksBuffersList());
+ } else {
+ blocks = BlockListAsLongs.decodeLongs(s.getBlocksList());
}
report[index++] = new StorageBlockReport(PBHelper.convert(s.getStorage()),
blocks);
}
try {
cmd = impl.blockReport(PBHelper.convert(request.getRegistration()),
- request.getBlockPoolId(), report);
+ request.getBlockPoolId(), report,
+ request.hasContext() ?
+ PBHelper.convert(request.getContext()) : null);
} catch (IOException e) {
throw new ServiceException(e);
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java
index ee1603ced30cb..1942ea9e98da7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java
@@ -111,6 +111,7 @@
import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.SafeModeActionProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmIdProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmSlotProto;
+import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.BalancerBandwidthCommandProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.BlockCommandProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.BlockIdCommandProto;
@@ -123,6 +124,7 @@
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.ReceivedDeletedBlockInfoProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.RegisterCommandProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.VolumeFailureSummaryProto;
+import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.BlockReportContextProto;
import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos;
import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.BlockKeyProto;
import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.BlockProto;
@@ -194,6 +196,7 @@
import org.apache.hadoop.hdfs.server.protocol.BlockIdCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
+import org.apache.hadoop.hdfs.server.protocol.BlockReportContext;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations;
import org.apache.hadoop.hdfs.server.protocol.CheckpointCommand;
@@ -573,7 +576,7 @@ public static NamespaceInfo convert(NamespaceInfoProto info) {
StorageInfoProto storage = info.getStorageInfo();
return new NamespaceInfo(storage.getNamespceID(), storage.getClusterID(),
info.getBlockPoolID(), storage.getCTime(), info.getBuildVersion(),
- info.getSoftwareVersion());
+ info.getSoftwareVersion(), info.getCapabilities());
}
public static NamenodeCommand convert(NamenodeCommandProto cmd) {
@@ -642,8 +645,8 @@ static public DatanodeInfo convert(DatanodeInfoProto di) {
di.hasLocation() ? di.getLocation() : null ,
di.getCapacity(), di.getDfsUsed(), di.getRemaining(),
di.getBlockPoolUsed(), di.getCacheCapacity(), di.getCacheUsed(),
- di.getLastUpdate(), di.getXceiverCount(),
- PBHelper.convert(di.getAdminState()));
+ di.getLastUpdate(), di.getLastUpdateMonotonic(),
+ di.getXceiverCount(), PBHelper.convert(di.getAdminState()));
}
static public DatanodeInfoProto convertDatanodeInfo(DatanodeInfo di) {
@@ -704,6 +707,7 @@ public static DatanodeInfoProto convert(DatanodeInfo info) {
.setCacheCapacity(info.getCacheCapacity())
.setCacheUsed(info.getCacheUsed())
.setLastUpdate(info.getLastUpdate())
+ .setLastUpdateMonotonic(info.getLastUpdateMonotonic())
.setXceiverCount(info.getXceiverCount())
.setAdminState(PBHelper.convert(info.getAdminState()))
.build();
@@ -1233,7 +1237,9 @@ public static NamespaceInfoProto convert(NamespaceInfo info) {
.setBuildVersion(info.getBuildVersion())
.setUnused(0)
.setStorageInfo(PBHelper.convert((StorageInfo)info))
- .setSoftwareVersion(info.getSoftwareVersion()).build();
+ .setSoftwareVersion(info.getSoftwareVersion())
+ .setCapabilities(info.getCapabilities())
+ .build();
}
// Located Block Arrays and Lists
@@ -1680,11 +1686,13 @@ public static RollingUpgradeStatusProto convertRollingUpgradeStatus(
RollingUpgradeStatus status) {
return RollingUpgradeStatusProto.newBuilder()
.setBlockPoolId(status.getBlockPoolId())
+ .setFinalized(status.isFinalized())
.build();
}
public static RollingUpgradeStatus convert(RollingUpgradeStatusProto proto) {
- return new RollingUpgradeStatus(proto.getBlockPoolId());
+ return new RollingUpgradeStatus(proto.getBlockPoolId(),
+ proto.getFinalized());
}
public static RollingUpgradeInfoProto convert(RollingUpgradeInfo info) {
@@ -1722,21 +1730,49 @@ public static CorruptFileBlocksProto convert(CorruptFileBlocks c) {
public static ContentSummary convert(ContentSummaryProto cs) {
if (cs == null) return null;
- return new ContentSummary(
- cs.getLength(), cs.getFileCount(), cs.getDirectoryCount(), cs.getQuota(),
- cs.getSpaceConsumed(), cs.getSpaceQuota());
+ ContentSummary.Builder builder = new ContentSummary.Builder();
+ builder.length(cs.getLength()).
+ fileCount(cs.getFileCount()).
+ directoryCount(cs.getDirectoryCount()).
+ quota(cs.getQuota()).
+ spaceConsumed(cs.getSpaceConsumed()).
+ spaceQuota(cs.getSpaceQuota());
+ if (cs.hasTypeQuotaInfos()) {
+ for (HdfsProtos.StorageTypeQuotaInfoProto info :
+ cs.getTypeQuotaInfos().getTypeQuotaInfoList()) {
+ StorageType type = PBHelper.convertStorageType(info.getType());
+ builder.typeConsumed(type, info.getConsumed());
+ builder.typeQuota(type, info.getQuota());
+ }
+ }
+ return builder.build();
}
public static ContentSummaryProto convert(ContentSummary cs) {
if (cs == null) return null;
- return ContentSummaryProto.newBuilder().
- setLength(cs.getLength()).
+ ContentSummaryProto.Builder builder = ContentSummaryProto.newBuilder();
+ builder.setLength(cs.getLength()).
setFileCount(cs.getFileCount()).
setDirectoryCount(cs.getDirectoryCount()).
setQuota(cs.getQuota()).
setSpaceConsumed(cs.getSpaceConsumed()).
- setSpaceQuota(cs.getSpaceQuota()).
- build();
+ setSpaceQuota(cs.getSpaceQuota());
+
+ if (cs.isTypeQuotaSet() || cs.isTypeConsumedAvailable()) {
+ HdfsProtos.StorageTypeQuotaInfosProto.Builder isb =
+ HdfsProtos.StorageTypeQuotaInfosProto.newBuilder();
+ for (StorageType t: StorageType.getTypesSupportingQuota()) {
+ HdfsProtos.StorageTypeQuotaInfoProto info =
+ HdfsProtos.StorageTypeQuotaInfoProto.newBuilder().
+ setType(convertStorageType(t)).
+ setConsumed(cs.getTypeConsumed(t)).
+ setQuota(cs.getTypeQuota(t)).
+ build();
+ isb.addTypeQuotaInfo(info);
+ }
+ builder.setTypeQuotaInfos(isb);
+ }
+ return builder.build();
}
public static NNHAStatusHeartbeat convert(NNHAStatusHeartbeatProto s) {
@@ -3006,4 +3042,16 @@ public static boolean[] convertBooleanList(
return targetPinnings;
}
+ public static BlockReportContext convert(BlockReportContextProto proto) {
+ return new BlockReportContext(proto.getTotalRpcs(),
+ proto.getCurRpc(), proto.getId());
+ }
+
+ public static BlockReportContextProto convert(BlockReportContext context) {
+ return BlockReportContextProto.newBuilder().
+ setTotalRpcs(context.getTotalRpcs()).
+ setCurRpc(context.getCurRpc()).
+ setId(context.getReportId()).
+ build();
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java
index 90212a3858832..bc7e4489e0bc6 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java
@@ -672,7 +672,7 @@ static class Cli extends Configured implements Tool {
*/
@Override
public int run(String[] args) {
- final long startTime = Time.now();
+ final long startTime = Time.monotonicNow();
final Configuration conf = getConf();
try {
@@ -687,8 +687,10 @@ public int run(String[] args) {
System.out.println(e + ". Exiting ...");
return ExitStatus.INTERRUPTED.getExitCode();
} finally {
- System.out.format("%-24s ", DateFormat.getDateTimeInstance().format(new Date()));
- System.out.println("Balancing took " + time2Str(Time.now()-startTime));
+ System.out.format("%-24s ",
+ DateFormat.getDateTimeInstance().format(new Date()));
+ System.out.println("Balancing took "
+ + time2Str(Time.monotonicNow() - startTime));
}
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockCollection.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockCollection.java
index 15476118b9eec..e9baf8535bdac 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockCollection.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockCollection.java
@@ -36,7 +36,7 @@ public interface BlockCollection {
/**
* Get content summary.
*/
- public ContentSummary computeContentSummary();
+ public ContentSummary computeContentSummary(BlockStoragePolicySuite bsps);
/**
* @return the number of blocks
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfoContiguousUnderConstruction.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfoContiguousUnderConstruction.java
index 91b76ccb0e266..92153abb97686 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfoContiguousUnderConstruction.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfoContiguousUnderConstruction.java
@@ -315,7 +315,8 @@ public void initializeBlockRecovery(long recoveryId) {
continue;
}
final ReplicaUnderConstruction ruc = replicas.get(i);
- final long lastUpdate = ruc.getExpectedStorageLocation().getDatanodeDescriptor().getLastUpdate();
+ final long lastUpdate = ruc.getExpectedStorageLocation()
+ .getDatanodeDescriptor().getLastUpdateMonotonic();
if (lastUpdate > mostRecentLastUpdate) {
primaryNodeIndex = i;
primary = ruc;
@@ -383,6 +384,7 @@ public void appendStringTo(StringBuilder sb) {
private void appendUCParts(StringBuilder sb) {
sb.append("{UCState=").append(blockUCState)
+ .append(", truncateBlock=" + truncateBlock)
.append(", primaryNodeIndex=").append(primaryNodeIndex)
.append(", replicas=[");
if (replicas != null) {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index 58a8b94d30ede..d9aee62e7ef0b 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -47,7 +47,7 @@
import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
-import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportIterator;
+import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportReplica;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -69,6 +69,7 @@
import org.apache.hadoop.hdfs.server.namenode.Namesystem;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
+import org.apache.hadoop.hdfs.server.protocol.BlockReportContext;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
@@ -365,10 +366,10 @@ private static BlockTokenSecretManager createBlockTokenSecretManager(
if (!isEnabled) {
if (UserGroupInformation.isSecurityEnabled()) {
- LOG.error("Security is enabled but block access tokens " +
- "(via " + DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY + ") " +
- "aren't enabled. This may cause issues " +
- "when clients attempt to talk to a DataNode.");
+ LOG.error("Security is enabled but block access tokens " +
+ "(via " + DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY + ") " +
+ "aren't enabled. This may cause issues " +
+ "when clients attempt to talk to a DataNode.");
}
return null;
}
@@ -744,7 +745,7 @@ public LocatedBlock convertLastBlockToUnderConstruction(
// always decrement total blocks
-1);
- final long fileLength = bc.computeContentSummary().getLength();
+ final long fileLength = bc.computeContentSummary(getStoragePolicySuite()).getLength();
final long pos = fileLength - ucBlock.getNumBytes();
return createLocatedBlock(ucBlock, pos, AccessMode.WRITE);
}
@@ -1222,10 +1223,6 @@ private boolean invalidateBlock(BlockToMarkCorrupt b, DatanodeInfo dn
// Check how many copies we have of the block
NumberReplicas nr = countNodes(b.stored);
if (nr.replicasOnStaleNodes() > 0) {
- blockLog.info("BLOCK* invalidateBlocks: postponing " +
- "invalidation of " + b + " on " + dn + " because " +
- nr.replicasOnStaleNodes() + " replica(s) are located on nodes " +
- "with potentially out-of-date block reports");
blockLog.info("BLOCK* invalidateBlocks: postponing " +
"invalidation of {} on {} because {} replica(s) are located on " +
"nodes with potentially out-of-date block reports", b, dn,
@@ -1496,8 +1493,10 @@ int computeReplicationWorkForBlocks(List<List<Block>> blocksToReplicate) {
}
}
}
- blockLog.debug("BLOCK* neededReplications = {} pendingReplications = {}",
- neededReplications.size(), pendingReplications.size());
+ if (blockLog.isDebugEnabled()) {
+ blockLog.debug("BLOCK* neededReplications = {} pendingReplications = {}",
+ neededReplications.size(), pendingReplications.size());
+ }
return scheduledWork;
}
@@ -1638,7 +1637,8 @@ else if (excessBlocks != null && excessBlocks.contains(block)) {
// If so, do not select the node as src node
if ((nodesCorrupt != null) && nodesCorrupt.contains(node))
continue;
- if(priority != UnderReplicatedBlocks.QUEUE_HIGHEST_PRIORITY
+ if(priority != UnderReplicatedBlocks.QUEUE_HIGHEST_PRIORITY
+ && !node.isDecommissionInProgress()
&& node.getNumberOfBlocksToBeReplicated() >= maxReplicationStreams)
{
continue; // already reached replication limit
@@ -1653,13 +1653,12 @@ else if (excessBlocks != null && excessBlocks.contains(block)) {
// never use already decommissioned nodes
if(node.isDecommissioned())
continue;
- // we prefer nodes that are in DECOMMISSION_INPROGRESS state
- if(node.isDecommissionInProgress() || srcNode == null) {
+
+ // We got this far, current node is a reasonable choice
+ if (srcNode == null) {
srcNode = node;
continue;
}
- if(srcNode.isDecommissionInProgress())
- continue;
// switch to a different node randomly
// this to prevent from deterministically selecting the same node even
// if the node failed to replicate the block on previous iterations
@@ -1772,9 +1771,10 @@ public String toString() {
*/
public boolean processReport(final DatanodeID nodeID,
final DatanodeStorage storage,
- final BlockListAsLongs newReport) throws IOException {
+ final BlockListAsLongs newReport, BlockReportContext context,
+ boolean lastStorageInRpc) throws IOException {
namesystem.writeLock();
- final long startTime = Time.now(); //after acquiring write lock
+ final long startTime = Time.monotonicNow(); //after acquiring write lock
final long endTime;
DatanodeDescriptor node;
Collection<Block> invalidatedBlocks = null;
@@ -1811,8 +1811,31 @@ public boolean processReport(final DatanodeID nodeID,
}
storageInfo.receivedBlockReport();
+ if (context != null) {
+ storageInfo.setLastBlockReportId(context.getReportId());
+ if (lastStorageInRpc) {
+ int rpcsSeen = node.updateBlockReportContext(context);
+ if (rpcsSeen >= context.getTotalRpcs()) {
+ List<DatanodeStorageInfo> zombies = node.removeZombieStorages();
+ if (zombies.isEmpty()) {
+ LOG.debug("processReport 0x{}: no zombie storages found.",
+ Long.toHexString(context.getReportId()));
+ } else {
+ for (DatanodeStorageInfo zombie : zombies) {
+ removeZombieReplicas(context, zombie);
+ }
+ }
+ node.clearBlockReportContext();
+ } else {
+ LOG.debug("processReport 0x{}: {} more RPCs remaining in this " +
+ "report.", Long.toHexString(context.getReportId()),
+ (context.getTotalRpcs() - rpcsSeen)
+ );
+ }
+ }
+ }
} finally {
- endTime = Time.now();
+ endTime = Time.monotonicNow();
namesystem.writeUnlock();
}
@@ -1835,6 +1858,32 @@ public boolean processReport(final DatanodeID nodeID,
return !node.hasStaleStorages();
}
+ private void removeZombieReplicas(BlockReportContext context,
+ DatanodeStorageInfo zombie) {
+ LOG.warn("processReport 0x{}: removing zombie storage {}, which no " +
+ "longer exists on the DataNode.",
+ Long.toHexString(context.getReportId()), zombie.getStorageID());
+ assert(namesystem.hasWriteLock());
+ Iterator<BlockInfoContiguous> iter = zombie.getBlockIterator();
+ int prevBlocks = zombie.numBlocks();
+ while (iter.hasNext()) {
+ BlockInfoContiguous block = iter.next();
+ // We assume that a block can be on only one storage in a DataNode.
+ // That's why we pass in the DatanodeDescriptor rather than the
+ // DatanodeStorageInfo.
+ // TODO: remove this assumption in case we want to put a block on
+ // more than one storage on a datanode (and because it's a difficult
+ // assumption to really enforce)
+ removeStoredBlock(block, zombie.getDatanodeDescriptor());
+ invalidateBlocks.remove(zombie.getDatanodeDescriptor(), block);
+ }
+ assert(zombie.numBlocks() == 0);
+ LOG.warn("processReport 0x{}: removed {} replicas from storage {}, " +
+ "which no longer exists on the DataNode.",
+ Long.toHexString(context.getReportId()), prevBlocks,
+ zombie.getStorageID());
+ }
+
/**
* Rescan the list of blocks which were previously postponed.
*/
@@ -1842,7 +1891,7 @@ void rescanPostponedMisreplicatedBlocks() {
if (getPostponedMisreplicatedBlocksCount() == 0) {
return;
}
- long startTimeRescanPostponedMisReplicatedBlocks = Time.now();
+ long startTimeRescanPostponedMisReplicatedBlocks = Time.monotonicNow();
long startPostponedMisReplicatedBlocksCount =
getPostponedMisreplicatedBlocksCount();
namesystem.writeLock();
@@ -1902,7 +1951,7 @@ void rescanPostponedMisreplicatedBlocks() {
long endPostponedMisReplicatedBlocksCount =
getPostponedMisreplicatedBlocksCount();
LOG.info("Rescan of postponedMisreplicatedBlocks completed in " +
- (Time.now() - startTimeRescanPostponedMisReplicatedBlocks) +
+ (Time.monotonicNow() - startTimeRescanPostponedMisReplicatedBlocks) +
" msecs. " + endPostponedMisReplicatedBlocksCount +
" blocks are left. " + (startPostponedMisReplicatedBlocksCount -
endPostponedMisReplicatedBlocksCount) + " blocks are removed.");
@@ -1951,6 +2000,46 @@ private Collection<Block> processReport(
return toInvalidate;
}
+ /**
+ * Mark block replicas as corrupt except those on the storages in
+ * newStorages list.
+ */
+ public void markBlockReplicasAsCorrupt(BlockInfoContiguous block,
+ long oldGenerationStamp, long oldNumBytes,
+ DatanodeStorageInfo[] newStorages) throws IOException {
+ assert namesystem.hasWriteLock();
+ BlockToMarkCorrupt b = null;
+ if (block.getGenerationStamp() != oldGenerationStamp) {
+ b = new BlockToMarkCorrupt(block, oldGenerationStamp,
+ "genstamp does not match " + oldGenerationStamp
+ + " : " + block.getGenerationStamp(), Reason.GENSTAMP_MISMATCH);
+ } else if (block.getNumBytes() != oldNumBytes) {
+ b = new BlockToMarkCorrupt(block,
+ "length does not match " + oldNumBytes
+ + " : " + block.getNumBytes(), Reason.SIZE_MISMATCH);
+ } else {
+ return;
+ }
+
+ for (DatanodeStorageInfo storage : getStorages(block)) {
+ boolean isCorrupt = true;
+ if (newStorages != null) {
+ for (DatanodeStorageInfo newStorage : newStorages) {
+ if (newStorage!= null && storage.equals(newStorage)) {
+ isCorrupt = false;
+ break;
+ }
+ }
+ }
+ if (isCorrupt) {
+ blockLog.info("BLOCK* markBlockReplicasAsCorrupt: mark block replica" +
+ " {} on {} as corrupt because the dn is not in the new committed " +
+ "storage list.", b, storage.getDatanodeDescriptor());
+ markBlockAsCorrupt(b, storage, storage.getDatanodeDescriptor());
+ }
+ }
+ }
+
/**
* processFirstBlockReport is intended only for processing "initial" block
* reports, the first block report received from a DN after it registers.
@@ -1968,11 +2057,9 @@ private void processFirstBlockReport(
if (report == null) return;
assert (namesystem.hasWriteLock());
assert (storageInfo.numBlocks() == 0);
- BlockReportIterator itBR = report.getBlockReportIterator();
- while(itBR.hasNext()) {
- Block iblk = itBR.next();
- ReplicaState reportedState = itBR.getCurrentReplicaState();
+ for (BlockReportReplica iblk : report) {
+ ReplicaState reportedState = iblk.getState();
if (shouldPostponeBlocksFromFuture &&
namesystem.isGenStampInFuture(iblk)) {
@@ -2042,13 +2129,11 @@ private void reportDiff(DatanodeStorageInfo storageInfo,
int curIndex;
if (newReport == null) {
- newReport = new BlockListAsLongs();
+ newReport = BlockListAsLongs.EMPTY;
}
// scan the report and process newly reported blocks
- BlockReportIterator itBR = newReport.getBlockReportIterator();
- while(itBR.hasNext()) {
- Block iblk = itBR.next();
- ReplicaState iState = itBR.getCurrentReplicaState();
+ for (BlockReportReplica iblk : newReport) {
+ ReplicaState iState = iblk.getState();
BlockInfoContiguous storedBlock = processReportedBlock(storageInfo,
iblk, iState, toAdd, toInvalidate, toCorrupt, toUC);
@@ -2441,9 +2526,6 @@ private Block addStoredBlock(final BlockInfoContiguous block,
}
} else if (result == AddBlockResult.REPLACED) {
curReplicaDelta = 0;
- blockLog.warn("BLOCK* addStoredBlock: " + "block " + storedBlock
- + " moved to storageType " + storageInfo.getStorageType()
- + " on node " + node);
blockLog.warn("BLOCK* addStoredBlock: block {} moved to storageType " +
"{} on node {}", storedBlock, storageInfo.getStorageType(), node);
} else {
@@ -2617,7 +2699,7 @@ private void stopReplicationInitializer() {
private void processMisReplicatesAsync() throws InterruptedException {
long nrInvalid = 0, nrOverReplicated = 0;
long nrUnderReplicated = 0, nrPostponed = 0, nrUnderConstruction = 0;
- long startTimeMisReplicatedScan = Time.now();
+ long startTimeMisReplicatedScan = Time.monotonicNow();
Iterator<BlockInfoContiguous> blocksItr = blocksMap.getBlocks().iterator();
long totalBlocks = blocksMap.size();
replicationQueuesInitProgress = 0;
@@ -2675,7 +2757,8 @@ private void processMisReplicatesAsync() throws InterruptedException {
NameNode.stateChangeLog
.info("STATE* Replication Queue initialization "
+ "scan for invalid, over- and under-replicated blocks "
- + "completed in " + (Time.now() - startTimeMisReplicatedScan)
+ + "completed in "
+ + (Time.monotonicNow() - startTimeMisReplicatedScan)
+ " msec");
break;
}
@@ -3188,28 +3271,6 @@ int countLiveNodes(BlockInfoContiguous b) {
}
return live;
}
-
- private void logBlockReplicationInfo(Block block, DatanodeDescriptor srcNode,
- NumberReplicas num) {
- int curReplicas = num.liveReplicas();
- int curExpectedReplicas = getReplication(block);
- BlockCollection bc = blocksMap.getBlockCollection(block);
- StringBuilder nodeList = new StringBuilder();
- for(DatanodeStorageInfo storage : blocksMap.getStorages(block)) {
- final DatanodeDescriptor node = storage.getDatanodeDescriptor();
- nodeList.append(node);
- nodeList.append(" ");
- }
- LOG.info("Block: " + block + ", Expected Replicas: "
- + curExpectedReplicas + ", live replicas: " + curReplicas
- + ", corrupt replicas: " + num.corruptReplicas()
- + ", decommissioned replicas: " + num.decommissionedReplicas()
- + ", excess replicas: " + num.excessReplicas()
- + ", Is Open File: " + bc.isUnderConstruction()
- + ", Datanodes having this block: " + nodeList + ", Current Datanode: "
- + srcNode + ", Is current datanode decommissioning: "
- + srcNode.isDecommissionInProgress());
- }
/**
* On stopping decommission, check if the node has excess replicas.
@@ -3240,89 +3301,30 @@ void processOverReplicatedBlocksOnReCommission(
}
/**
- * Return true if there are any blocks on this node that have not
- * yet reached their replication factor. Otherwise returns false.
+ * Returns whether a node can be safely decommissioned based on its
+ * liveness. Dead nodes cannot always be safely decommissioned.
*/
- boolean isReplicationInProgress(DatanodeDescriptor srcNode) {
- boolean status = false;
- boolean firstReplicationLog = true;
- int underReplicatedBlocks = 0;
- int decommissionOnlyReplicas = 0;
- int underReplicatedInOpenFiles = 0;
- final Iterator<? extends Block> it = srcNode.getBlockIterator();
- while(it.hasNext()) {
- final Block block = it.next();
- BlockCollection bc = blocksMap.getBlockCollection(block);
-
- if (bc != null) {
- NumberReplicas num = countNodes(block);
- int curReplicas = num.liveReplicas();
- int curExpectedReplicas = getReplication(block);
-
- if (isNeededReplication(block, curExpectedReplicas, curReplicas)) {
- if (curExpectedReplicas > curReplicas) {
- if (bc.isUnderConstruction()) {
- if (block.equals(bc.getLastBlock()) && curReplicas > minReplication) {
- continue;
- }
- underReplicatedInOpenFiles++;
- }
-
- // Log info about one block for this node which needs replication
- if (!status) {
- status = true;
- if (firstReplicationLog) {
- logBlockReplicationInfo(block, srcNode, num);
- }
- // Allowing decommission as long as default replication is met
- if (curReplicas >= defaultReplication) {
- status = false;
- firstReplicationLog = false;
- }
- }
- underReplicatedBlocks++;
- if ((curReplicas == 0) && (num.decommissionedReplicas() > 0)) {
- decommissionOnlyReplicas++;
- }
- }
- if (!neededReplications.contains(block) &&
- pendingReplications.getNumReplicas(block) == 0 &&
- namesystem.isPopulatingReplQueues()) {
- //
- // These blocks have been reported from the datanode
- // after the startDecommission method has been executed. These
- // blocks were in flight when the decommissioning was started.
- // Process these blocks only when active NN is out of safe mode.
- //
- neededReplications.add(block,
- curReplicas,
- num.decommissionedReplicas(),
- curExpectedReplicas);
- }
- }
- }
+ boolean isNodeHealthyForDecommission(DatanodeDescriptor node) {
+ if (node.isAlive) {
+ return true;
}
- if (!status && !srcNode.isAlive) {
- updateState();
- if (pendingReplicationBlocksCount == 0 &&
- underReplicatedBlocksCount == 0) {
- LOG.info("srcNode {} is dead and there are no under-replicated" +
- " blocks or blocks pending replication. Marking as " +
- "decommissioned.");
- } else {
- LOG.warn("srcNode " + srcNode + " is dead " +
- "while decommission is in progress. Continuing to mark " +
- "it as decommission in progress so when it rejoins the " +
- "cluster it can continue the decommission process.");
- status = true;
- }
+ updateState();
+ if (pendingReplicationBlocksCount == 0 &&
+ underReplicatedBlocksCount == 0) {
+ LOG.info("Node {} is dead and there are no under-replicated" +
+ " blocks or blocks pending replication. Safe to decommission.",
+ node);
+ return true;
}
- srcNode.decommissioningStatus.set(underReplicatedBlocks,
- decommissionOnlyReplicas,
- underReplicatedInOpenFiles);
- return status;
+ LOG.warn("Node {} is dead " +
+ "while decommission is in progress. Cannot be safely " +
+ "decommissioned since there is risk of reduced " +
+ "data durability or data loss. Either restart the failed node or" +
+ " force decommissioning by removing, calling refreshNodes, " +
+ "then re-adding to the excludes files.", node);
+ return false;
}
public int getActiveBlockCount() {
@@ -3349,8 +3351,7 @@ public void removeBlock(Block block) {
// file already removes them from the block map below.
block.setNumBytes(BlockCommand.NO_ACK);
addToInvalidates(block);
- corruptReplicas.removeFromCorruptReplicasMap(block);
- blocksMap.removeBlock(block);
+ removeBlockFromMap(block);
// Remove the block from pendingReplications and neededReplications
pendingReplications.remove(block);
neededReplications.remove(block, UnderReplicatedBlocks.LEVEL);
@@ -3493,7 +3494,7 @@ boolean blockHasEnoughRacks(Block b) {
* A block needs replication if the number of replicas is less than expected
* or if it does not have enough racks.
*/
- private boolean isNeededReplication(Block b, int expected, int current) {
+ boolean isNeededReplication(Block b, int expected, int current) {
return current < expected || !blockHasEnoughRacks(b);
}
@@ -3526,11 +3527,30 @@ public int numCorruptReplicas(Block block) {
}
public void removeBlockFromMap(Block block) {
+ removeFromExcessReplicateMap(block);
blocksMap.removeBlock(block);
// If block is removed from blocksMap remove it from corruptReplicasMap
corruptReplicas.removeFromCorruptReplicasMap(block);
}
+ /**
+ * If a block is removed from blocksMap, remove it from excessReplicateMap.
+ */
+ private void removeFromExcessReplicateMap(Block block) {
+ for (DatanodeStorageInfo info : blocksMap.getStorages(block)) {
+ String uuid = info.getDatanodeDescriptor().getDatanodeUuid();
+ LightWeightLinkedSet<Block> excessReplicas = excessReplicateMap.get(uuid);
+ if (excessReplicas != null) {
+ if (excessReplicas.remove(block)) {
+ excessBlocksCount.decrementAndGet();
+ if (excessReplicas.isEmpty()) {
+ excessReplicateMap.remove(uuid);
+ }
+ }
+ }
+ }
+ }
+
public int getCapacity() {
return blocksMap.getCapacity();
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java
index cb17596173e52..3262772613834 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java
@@ -17,7 +17,7 @@
*/
package org.apache.hadoop.hdfs.server.blockmanagement;
-import static org.apache.hadoop.util.Time.now;
+import static org.apache.hadoop.util.Time.monotonicNow;
import java.util.*;
@@ -884,7 +884,7 @@ public DatanodeStorageInfo chooseReplicaToDelete(BlockCollection bc,
Collection<DatanodeStorageInfo> second,
final List<StorageType> excessTypes) {
long oldestHeartbeat =
- now() - heartbeatInterval * tolerateHeartbeatMultiplier;
+ monotonicNow() - heartbeatInterval * tolerateHeartbeatMultiplier;
DatanodeStorageInfo oldestHeartbeatStorage = null;
long minSpace = Long.MAX_VALUE;
DatanodeStorageInfo minSpaceStorage = null;
@@ -898,8 +898,8 @@ public DatanodeStorageInfo chooseReplicaToDelete(BlockCollection bc,
final DatanodeDescriptor node = storage.getDatanodeDescriptor();
long free = node.getRemaining();
- long lastHeartbeat = node.getLastUpdate();
- if(lastHeartbeat < oldestHeartbeat) {
+ long lastHeartbeat = node.getLastUpdateMonotonic();
+ if (lastHeartbeat < oldestHeartbeat) {
oldestHeartbeat = lastHeartbeat;
oldestHeartbeatStorage = storage;
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java
index c0a17b1c9525f..4731ad44c3116 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.hdfs.server.blockmanagement;
import java.util.ArrayList;
+import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
@@ -31,6 +32,7 @@
import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableList;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
@@ -40,6 +42,7 @@
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.namenode.CachedBlock;
+import org.apache.hadoop.hdfs.server.protocol.BlockReportContext;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
@@ -62,7 +65,25 @@ public class DatanodeDescriptor extends DatanodeInfo {
// Stores status of decommissioning.
// If node is not decommissioning, do not use this object for anything.
public final DecommissioningStatus decommissioningStatus = new DecommissioningStatus();
-
+
+ private long curBlockReportId = 0;
+
+ private BitSet curBlockReportRpcsSeen = null;
+
+ public int updateBlockReportContext(BlockReportContext context) {
+ if (curBlockReportId != context.getReportId()) {
+ curBlockReportId = context.getReportId();
+ curBlockReportRpcsSeen = new BitSet(context.getTotalRpcs());
+ }
+ curBlockReportRpcsSeen.set(context.getCurRpc());
+ return curBlockReportRpcsSeen.cardinality();
+ }
+
+ public void clearBlockReportContext() {
+ curBlockReportId = 0;
+ curBlockReportRpcsSeen = null;
+ }
+
/** Block and targets pair */
@InterfaceAudience.Private
@InterfaceStability.Evolving
@@ -282,6 +303,34 @@ boolean hasStaleStorages() {
}
}
static final private List<DatanodeStorageInfo> EMPTY_STORAGE_INFO_LIST =
ImmutableList.of();
+
List<DatanodeStorageInfo> removeZombieStorages() {
List<DatanodeStorageInfo> zombies = null;
+ synchronized (storageMap) {
Iterator<Map.Entry<String, DatanodeStorageInfo>> iter =
+ storageMap.entrySet().iterator();
+ while (iter.hasNext()) {
Map.Entry<String, DatanodeStorageInfo> entry = iter.next();
+ DatanodeStorageInfo storageInfo = entry.getValue();
+ if (storageInfo.getLastBlockReportId() != curBlockReportId) {
+ LOG.info(storageInfo.getStorageID() + " had lastBlockReportId 0x" +
+ Long.toHexString(storageInfo.getLastBlockReportId()) +
+ ", but curBlockReportId = 0x" +
+ Long.toHexString(curBlockReportId));
+ iter.remove();
+ if (zombies == null) {
zombies = new LinkedList<DatanodeStorageInfo>();
+ }
+ zombies.add(storageInfo);
+ }
+ storageInfo.setLastBlockReportId(0);
+ }
+ }
+ return zombies == null ? EMPTY_STORAGE_INFO_LIST : zombies;
+ }
+
/**
* Remove block from the list of blocks belonging to the data-node. Remove
* data-node from the block.
@@ -398,14 +447,17 @@ public void updateHeartbeatState(StorageReport[] reports, long cacheCapacity,
if (checkFailedStorages) {
LOG.info("Number of failed storage changes from "
+ this.volumeFailures + " to " + volFailures);
- failedStorageInfos = new HashSet<DatanodeStorageInfo>(
- storageMap.values());
+ synchronized (storageMap) {
+ failedStorageInfos =
+ new HashSet<DatanodeStorageInfo>(storageMap.values());
+ }
}
setCacheCapacity(cacheCapacity);
setCacheUsed(cacheUsed);
setXceiverCount(xceiverCount);
- setLastUpdate(Time.now());
+ setLastUpdate(Time.now());
+ setLastUpdateMonotonic(Time.monotonicNow());
this.volumeFailures = volFailures;
this.volumeFailureSummary = volumeFailureSummary;
for (StorageReport report : reports) {
@@ -420,7 +472,7 @@ public void updateHeartbeatState(StorageReport[] reports, long cacheCapacity,
totalBlockPoolUsed += report.getBlockPoolUsed();
totalDfsUsed += report.getDfsUsed();
}
- rollBlocksScheduled(getLastUpdate());
+ rollBlocksScheduled(getLastUpdateMonotonic());
// Update total metrics for the node.
setCapacity(totalCapacity);
@@ -430,8 +482,11 @@ public void updateHeartbeatState(StorageReport[] reports, long cacheCapacity,
if (checkFailedStorages) {
updateFailedStorage(failedStorageInfos);
}
-
- if (storageMap.size() != reports.length) {
+ long storageMapSize;
+ synchronized (storageMap) {
+ storageMapSize = storageMap.size();
+ }
+ if (storageMapSize != reports.length) {
pruneStorageMap(reports);
}
}
@@ -441,14 +496,14 @@ public void updateHeartbeatState(StorageReport[] reports, long cacheCapacity,
* as long as they have associated block replicas.
*/
private void pruneStorageMap(final StorageReport[] reports) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Number of storages reported in heartbeat=" + reports.length +
- "; Number of storages in storageMap=" + storageMap.size());
- }
+ synchronized (storageMap) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Number of storages reported in heartbeat=" + reports.length
+ + "; Number of storages in storageMap=" + storageMap.size());
+ }
- HashMap<String, DatanodeStorageInfo> excessStorages;
+ HashMap<String, DatanodeStorageInfo> excessStorages;
- synchronized (storageMap) {
// Init excessStorages with all known storages.
excessStorages = new HashMap<String, DatanodeStorageInfo>(storageMap);
@@ -465,8 +520,8 @@ private void pruneStorageMap(final StorageReport[] reports) {
LOG.info("Removed storage " + storageInfo + " from DataNode" + this);
} else if (LOG.isDebugEnabled()) {
// This can occur until all block reports are received.
- LOG.debug("Deferring removal of stale storage " + storageInfo +
- " with " + storageInfo.numBlocks() + " blocks");
+ LOG.debug("Deferring removal of stale storage " + storageInfo
+ + " with " + storageInfo.numBlocks() + " blocks");
}
}
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
index 45c56a8b5fdd3..f68c4fd275499 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
@@ -17,6 +17,8 @@
*/
package org.apache.hadoop.hdfs.server.blockmanagement;
+import static org.apache.hadoop.util.Time.monotonicNow;
+
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.net.InetAddresses;
@@ -42,9 +44,7 @@
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.net.*;
import org.apache.hadoop.net.NetworkTopology.InvalidTopologyException;
-import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.ReflectionUtils;
-import org.apache.hadoop.util.Time;
import java.io.IOException;
import java.io.PrintWriter;
@@ -53,8 +53,6 @@
import java.net.UnknownHostException;
import java.util.*;
-import static org.apache.hadoop.util.Time.now;
-
/**
* Manage datanodes, include decommission and other activities.
*/
@@ -65,9 +63,9 @@ public class DatanodeManager {
private final Namesystem namesystem;
private final BlockManager blockManager;
+ private final DecommissionManager decomManager;
private final HeartbeatManager heartbeatManager;
private final FSClusterStats fsClusterStats;
- private Daemon decommissionthread = null;
/**
* Stores the datanode -> block map.
@@ -110,7 +108,7 @@ public class DatanodeManager {
private final HostFileManager hostFileManager = new HostFileManager();
/** The period to wait for datanode heartbeat.*/
- private final long heartbeatExpireInterval;
+ private long heartbeatExpireInterval;
/** Ask Datanode only up to this many blocks to delete. */
final int blockInvalidateLimit;
@@ -184,6 +182,8 @@ public class DatanodeManager {
networktopology = NetworkTopology.getInstance(conf);
this.heartbeatManager = new HeartbeatManager(namesystem, blockManager, conf);
+ this.decomManager = new DecommissionManager(namesystem, blockManager,
+ heartbeatManager);
this.fsClusterStats = newFSClusterStats();
this.defaultXferPort = NetUtils.createSocketAddr(
@@ -307,25 +307,12 @@ private static long getStaleIntervalFromConf(Configuration conf,
}
void activate(final Configuration conf) {
- final DecommissionManager dm = new DecommissionManager(namesystem, blockManager);
- this.decommissionthread = new Daemon(dm.new Monitor(
- conf.getInt(DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY,
- DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_INTERVAL_DEFAULT),
- conf.getInt(DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_NODES_PER_INTERVAL_KEY,
- DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_NODES_PER_INTERVAL_DEFAULT)));
- decommissionthread.start();
-
+ decomManager.activate(conf);
heartbeatManager.activate(conf);
}
void close() {
- if (decommissionthread != null) {
- decommissionthread.interrupt();
- try {
- decommissionthread.join(3000);
- } catch (InterruptedException e) {
- }
- }
+ decomManager.close();
heartbeatManager.close();
}
@@ -339,6 +326,20 @@ HeartbeatManager getHeartbeatManager() {
return heartbeatManager;
}
+ @VisibleForTesting
+ public DecommissionManager getDecomManager() {
+ return decomManager;
+ }
+
+ HostFileManager getHostFileManager() {
+ return hostFileManager;
+ }
+
+ @VisibleForTesting
+ public void setHeartbeatExpireInterval(long expiryMs) {
+ this.heartbeatExpireInterval = expiryMs;
+ }
+
@VisibleForTesting
public FSClusterStats getFSClusterStats() {
return fsClusterStats;
@@ -581,8 +582,8 @@ void removeDeadDatanode(final DatanodeID nodeID) {
/** Is the datanode dead? */
boolean isDatanodeDead(DatanodeDescriptor node) {
- return (node.getLastUpdate() <
- (Time.now() - heartbeatExpireInterval));
+ return (node.getLastUpdateMonotonic() <
+ (monotonicNow() - heartbeatExpireInterval));
}
/** Add a datanode. */
@@ -826,63 +827,14 @@ private void removeDecomNodeFromList(final List nodeList) {
}
/**
- * Decommission the node if it is in exclude list.
+ * Decommission the node if it is in the host exclude list.
+ *
+ * @param nodeReg datanode
*/
- private void checkDecommissioning(DatanodeDescriptor nodeReg) {
+ void startDecommissioningIfExcluded(DatanodeDescriptor nodeReg) {
// If the registered node is in exclude list, then decommission it
- if (hostFileManager.isExcluded(nodeReg)) {
- startDecommission(nodeReg);
- }
- }
-
- /**
- * Change, if appropriate, the admin state of a datanode to
- * decommission completed. Return true if decommission is complete.
- */
- boolean checkDecommissionState(DatanodeDescriptor node) {
- // Check to see if all blocks in this decommissioned
- // node has reached their target replication factor.
- if (node.isDecommissionInProgress() && node.checkBlockReportReceived()) {
- if (!blockManager.isReplicationInProgress(node)) {
- node.setDecommissioned();
- LOG.info("Decommission complete for " + node);
- }
- }
- return node.isDecommissioned();
- }
-
- /** Start decommissioning the specified datanode. */
- @InterfaceAudience.Private
- @VisibleForTesting
- public void startDecommission(DatanodeDescriptor node) {
- if (!node.isDecommissionInProgress()) {
- if (!node.isAlive) {
- LOG.info("Dead node " + node + " is decommissioned immediately.");
- node.setDecommissioned();
- } else if (!node.isDecommissioned()) {
- for (DatanodeStorageInfo storage : node.getStorageInfos()) {
- LOG.info("Start Decommissioning " + node + " " + storage
- + " with " + storage.numBlocks() + " blocks");
- }
- heartbeatManager.startDecommission(node);
- node.decommissioningStatus.setStartTime(now());
-
- // all the blocks that reside on this node have to be replicated.
- checkDecommissionState(node);
- }
- }
- }
-
- /** Stop decommissioning the specified datanodes. */
- void stopDecommission(DatanodeDescriptor node) {
- if (node.isDecommissionInProgress() || node.isDecommissioned()) {
- LOG.info("Stop Decommissioning " + node);
- heartbeatManager.stopDecommission(node);
- // Over-replicated blocks will be detected and processed when
- // the dead node comes back and send in its full block report.
- if (node.isAlive) {
- blockManager.processOverReplicatedBlocksOnReCommission(node);
- }
+ if (getHostFileManager().isExcluded(nodeReg)) {
+ decomManager.startDecommission(nodeReg);
}
}
@@ -993,7 +945,7 @@ nodes with its data cleared (or user can just remove the StorageID
// also treat the registration message as a heartbeat
heartbeatManager.register(nodeS);
incrementVersionCount(nodeS.getSoftwareVersion());
- checkDecommissioning(nodeS);
+ startDecommissioningIfExcluded(nodeS);
success = true;
} finally {
if (!success) {
@@ -1029,7 +981,7 @@ nodes with its data cleared (or user can just remove the StorageID
// because its is done when the descriptor is created
heartbeatManager.addDatanode(nodeDescr);
incrementVersionCount(nodeReg.getSoftwareVersion());
- checkDecommissioning(nodeDescr);
+ startDecommissioningIfExcluded(nodeDescr);
success = true;
} finally {
if (!success) {
@@ -1092,9 +1044,9 @@ private void refreshDatanodes() {
node.setDisallowed(true); // case 2.
} else {
if (hostFileManager.isExcluded(node)) {
- startDecommission(node); // case 3.
+ decomManager.startDecommission(node); // case 3.
} else {
- stopDecommission(node); // case 4.
+ decomManager.stopDecommission(node); // case 4.
}
}
}
@@ -1348,7 +1300,7 @@ public List getDatanodeListForReport(
.getAddress().getHostAddress(), addr.getHostName(), "",
addr.getPort() == 0 ? defaultXferPort : addr.getPort(),
defaultInfoPort, defaultInfoSecurePort, defaultIpcPort));
- dn.setLastUpdate(0); // Consider this node dead for reporting
+ setDatanodeDead(dn);
nodes.add(dn);
}
}
@@ -1381,6 +1333,7 @@ private static boolean isNameResolved(InetAddress address) {
private void setDatanodeDead(DatanodeDescriptor node) {
node.setLastUpdate(0);
+ node.setLastUpdateMonotonic(0);
}
/** Handle heartbeat from datanodes. */
@@ -1486,7 +1439,7 @@ public DatanodeCommand[] handleHeartbeat(DatanodeRegistration nodeReg,
blockPoolId, blks));
}
boolean sendingCachingCommands = false;
- long nowMs = Time.monotonicNow();
+ long nowMs = monotonicNow();
if (shouldSendCachingCommands &&
((nowMs - nodeinfo.getLastCachingDirectiveSentTimeMs()) >=
timeBetweenResendingCachingDirectivesMs)) {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeStorageInfo.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeStorageInfo.java
index c4612a325ba0c..be16a8731490d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeStorageInfo.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeStorageInfo.java
@@ -115,6 +115,9 @@ public void remove() {
private volatile BlockInfoContiguous blockList = null;
private int numBlocks = 0;
+ // The ID of the last full block report which updated this storage.
+ private long lastBlockReportId = 0;
+
/** The number of block reports received */
private int blockReportCount = 0;
@@ -178,7 +181,15 @@ public void setUtilizationForTesting(long capacity, long dfsUsed,
this.remaining = remaining;
this.blockPoolUsed = blockPoolUsed;
}
-
+
+ long getLastBlockReportId() {
+ return lastBlockReportId;
+ }
+
+ void setLastBlockReportId(long lastBlockReportId) {
+ this.lastBlockReportId = lastBlockReportId;
+ }
+
State getState() {
return this.state;
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java
index a234cf545fcc2..9355329637f65 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java
@@ -17,88 +17,605 @@
*/
package org.apache.hadoop.hdfs.server.blockmanagement;
+import java.util.AbstractList;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
import java.util.Map;
+import java.util.Queue;
+import java.util.TreeMap;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.hadoop.classification.InterfaceAudience;
-import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.server.namenode.Namesystem;
+import org.apache.hadoop.hdfs.util.CyclicIteration;
+import org.apache.hadoop.util.ChunkedArrayList;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static org.apache.hadoop.util.Time.monotonicNow;
/**
- * Manage node decommissioning.
+ * Manages datanode decommissioning. A background monitor thread
+ * periodically checks the status of datanodes that are in-progress of
+ * decommissioning.
+ *
+ * A datanode can be decommissioned in a few situations:
+ *
+ * - If a DN is dead, it is decommissioned immediately.
+ * - If a DN is alive, it is decommissioned after all of its blocks
+ * are sufficiently replicated. Merely under-replicated blocks do not
+ * block decommissioning as long as they are above a replication
+ * threshold.
+ *
+ * In the second case, the datanode transitions to a
+ * decommission-in-progress state and is tracked by the monitor thread. The
+ * monitor periodically scans through the list of insufficiently replicated
+ * blocks on these datanodes to
+ * determine if they can be decommissioned. The monitor also prunes this list
+ * as blocks become replicated, so monitor scans will become more efficient
+ * over time.
+ *
+ * Decommission-in-progress nodes that become dead do not progress to
+ * decommissioned until they become live again. This prevents potential
+ * durability loss for singly-replicated blocks (see HDFS-6791).
+ *
+ * This class depends on the FSNamesystem lock for synchronization.
*/
@InterfaceAudience.Private
-@InterfaceStability.Evolving
-class DecommissionManager {
- static final Log LOG = LogFactory.getLog(DecommissionManager.class);
+public class DecommissionManager {
+ private static final Logger LOG = LoggerFactory.getLogger(DecommissionManager
+ .class);
private final Namesystem namesystem;
- private final BlockManager blockmanager;
+ private final BlockManager blockManager;
+ private final HeartbeatManager hbManager;
+ private final ScheduledExecutorService executor;
+
+ /**
+ * Map containing the decommission-in-progress datanodes that are being
+ * tracked so they can be be marked as decommissioned.
+ *
+ * This holds a set of references to the under-replicated blocks on the DN at
+ * the time the DN is added to the map, i.e. the blocks that are preventing
+ * the node from being marked as decommissioned. During a monitor tick, this
+ * list is pruned as blocks becomes replicated.
+ *
+ * Note also that the reference to the list of under-replicated blocks
+ * will be null on initial add
+ *
+ * However, this map can become out-of-date since it is not updated by block
+ * reports or other events. Before being finally marking as decommissioned,
+ * another check is done with the actual block map.
+ */
+ private final TreeMap<DatanodeDescriptor, AbstractList<BlockInfoContiguous>>
+ decomNodeBlocks;
+
+ /**
+ * Tracking a node in decomNodeBlocks consumes additional memory. To limit
+ * the impact on NN memory consumption, we limit the number of nodes in
+ * decomNodeBlocks. Additional nodes wait in pendingNodes.
+ */
+ private final Queue<DatanodeDescriptor> pendingNodes;
+
+ private Monitor monitor = null;
DecommissionManager(final Namesystem namesystem,
- final BlockManager blockmanager) {
+ final BlockManager blockManager, final HeartbeatManager hbManager) {
this.namesystem = namesystem;
- this.blockmanager = blockmanager;
+ this.blockManager = blockManager;
+ this.hbManager = hbManager;
+
+ executor = Executors.newScheduledThreadPool(1,
+ new ThreadFactoryBuilder().setNameFormat("DecommissionMonitor-%d")
+ .setDaemon(true).build());
+ decomNodeBlocks = new TreeMap<>();
+ pendingNodes = new LinkedList<>();
+ }
+
+ /**
+ * Start the decommission monitor thread.
+ * @param conf
+ */
+ void activate(Configuration conf) {
+ final int intervalSecs =
+ conf.getInt(DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY,
+ DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_INTERVAL_DEFAULT);
+ checkArgument(intervalSecs >= 0, "Cannot set a negative " +
+ "value for " + DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY);
+
+ // By default, the new configuration key overrides the deprecated one.
+ // No # node limit is set.
+ int blocksPerInterval = conf.getInt(
+ DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_BLOCKS_PER_INTERVAL_KEY,
+ DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_BLOCKS_PER_INTERVAL_DEFAULT);
+ int nodesPerInterval = Integer.MAX_VALUE;
+
+ // If the expected key isn't present and the deprecated one is,
+ // use the deprecated one into the new one. This overrides the
+ // default.
+ //
+ // Also print a deprecation warning.
+ final String deprecatedKey =
+ "dfs.namenode.decommission.nodes.per.interval";
+ final String strNodes = conf.get(deprecatedKey);
+ if (strNodes != null) {
+ nodesPerInterval = Integer.parseInt(strNodes);
+ blocksPerInterval = Integer.MAX_VALUE;
+ LOG.warn("Using deprecated configuration key {} value of {}.",
+ deprecatedKey, nodesPerInterval);
+ LOG.warn("Please update your configuration to use {} instead.",
+ DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_BLOCKS_PER_INTERVAL_KEY);
+ }
+ checkArgument(blocksPerInterval > 0,
+ "Must set a positive value for "
+ + DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_BLOCKS_PER_INTERVAL_KEY);
+
+ final int maxConcurrentTrackedNodes = conf.getInt(
+ DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_MAX_CONCURRENT_TRACKED_NODES,
+ DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_MAX_CONCURRENT_TRACKED_NODES_DEFAULT);
+ checkArgument(maxConcurrentTrackedNodes >= 0, "Cannot set a negative " +
+ "value for "
+ + DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_MAX_CONCURRENT_TRACKED_NODES);
+
+ monitor = new Monitor(blocksPerInterval,
+ nodesPerInterval, maxConcurrentTrackedNodes);
+ executor.scheduleAtFixedRate(monitor, intervalSecs, intervalSecs,
+ TimeUnit.SECONDS);
+
+ LOG.debug("Activating DecommissionManager with interval {} seconds, " +
+ "{} max blocks per interval, {} max nodes per interval, " +
+ "{} max concurrently tracked nodes.", intervalSecs,
+ blocksPerInterval, nodesPerInterval, maxConcurrentTrackedNodes);
+ }
+
+ /**
+ * Stop the decommission monitor thread, waiting briefly for it to terminate.
+ */
+ void close() {
+ executor.shutdownNow();
+ try {
+ executor.awaitTermination(3000, TimeUnit.MILLISECONDS);
+ } catch (InterruptedException e) {}
+ }
+
+ /**
+ * Start decommissioning the specified datanode.
+ * @param node
+ */
+ @VisibleForTesting
+ public void startDecommission(DatanodeDescriptor node) {
+ if (!node.isDecommissionInProgress()) {
+ if (!node.isAlive) {
+ LOG.info("Dead node {} is decommissioned immediately.", node);
+ node.setDecommissioned();
+ } else if (!node.isDecommissioned()) {
+ for (DatanodeStorageInfo storage : node.getStorageInfos()) {
+ LOG.info("Starting decommission of {} {} with {} blocks",
+ node, storage, storage.numBlocks());
+ }
+ // Update DN stats maintained by HeartbeatManager
+ hbManager.startDecommission(node);
+ node.decommissioningStatus.setStartTime(monotonicNow());
+ pendingNodes.add(node);
+ }
+ } else {
+ LOG.trace("startDecommission: Node {} is already decommission in "
+ + "progress, nothing to do.", node);
+ }
+ }
+
+ /**
+ * Stop decommissioning the specified datanode.
+ * @param node
+ */
+ void stopDecommission(DatanodeDescriptor node) {
+ if (node.isDecommissionInProgress() || node.isDecommissioned()) {
+ LOG.info("Stopping decommissioning of node {}", node);
+ // Update DN stats maintained by HeartbeatManager
+ hbManager.stopDecommission(node);
+ // Over-replicated blocks will be detected and processed when
+ // the dead node comes back and send in its full block report.
+ if (node.isAlive) {
+ blockManager.processOverReplicatedBlocksOnReCommission(node);
+ }
+ // Remove from tracking in DecommissionManager
+ pendingNodes.remove(node);
+ decomNodeBlocks.remove(node);
+ } else {
+ LOG.trace("stopDecommission: Node {} is not decommission in progress " +
+ "or decommissioned, nothing to do.", node);
+ }
+ }
+
+ private void setDecommissioned(DatanodeDescriptor dn) {
+ dn.setDecommissioned();
+ LOG.info("Decommissioning complete for node {}", dn);
}
- /** Periodically check decommission status. */
- class Monitor implements Runnable {
- /** recheckInterval is how often namenode checks
- * if a node has finished decommission
+ /**
+ * Checks whether a block is sufficiently replicated for decommissioning.
+ * Full-strength replication is not always necessary, hence "sufficient".
+ * @return true if sufficient, else false.
+ */
+ private boolean isSufficientlyReplicated(BlockInfoContiguous block,
+ BlockCollection bc,
+ NumberReplicas numberReplicas) {
+ final int numExpected = bc.getBlockReplication();
+ final int numLive = numberReplicas.liveReplicas();
+ if (!blockManager.isNeededReplication(block, numExpected, numLive)) {
+ // Block doesn't need replication. Skip.
+ LOG.trace("Block {} does not need replication.", block);
+ return true;
+ }
+
+ // Block is under-replicated
+ LOG.trace("Block {} numExpected={}, numLive={}", block, numExpected,
+ numLive);
+ if (numExpected > numLive) {
+ if (bc.isUnderConstruction() && block.equals(bc.getLastBlock())) {
+ // Can decom a UC block as long as there will still be minReplicas
+ if (numLive >= blockManager.minReplication) {
+ LOG.trace("UC block {} sufficiently-replicated since numLive ({}) "
+ + ">= minR ({})", block, numLive, blockManager.minReplication);
+ return true;
+ } else {
+ LOG.trace("UC block {} insufficiently-replicated since numLive "
+ + "({}) < minR ({})", block, numLive,
+ blockManager.minReplication);
+ }
+ } else {
+ // Can decom a non-UC as long as the default replication is met
+ if (numLive >= blockManager.defaultReplication) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ private static void logBlockReplicationInfo(Block block, BlockCollection bc,
+ DatanodeDescriptor srcNode, NumberReplicas num,
+ Iterable<DatanodeStorageInfo> storages) {
+ int curReplicas = num.liveReplicas();
+ int curExpectedReplicas = bc.getBlockReplication();
+ StringBuilder nodeList = new StringBuilder();
+ for (DatanodeStorageInfo storage : storages) {
+ final DatanodeDescriptor node = storage.getDatanodeDescriptor();
+ nodeList.append(node);
+ nodeList.append(" ");
+ }
+ LOG.info("Block: " + block + ", Expected Replicas: "
+ + curExpectedReplicas + ", live replicas: " + curReplicas
+ + ", corrupt replicas: " + num.corruptReplicas()
+ + ", decommissioned replicas: " + num.decommissionedReplicas()
+ + ", excess replicas: " + num.excessReplicas()
+ + ", Is Open File: " + bc.isUnderConstruction()
+ + ", Datanodes having this block: " + nodeList + ", Current Datanode: "
+ + srcNode + ", Is current datanode decommissioning: "
+ + srcNode.isDecommissionInProgress());
+ }
+
+ @VisibleForTesting
+ public int getNumPendingNodes() {
+ return pendingNodes.size();
+ }
+
+ @VisibleForTesting
+ public int getNumTrackedNodes() {
+ return decomNodeBlocks.size();
+ }
+
+ @VisibleForTesting
+ public int getNumNodesChecked() {
+ return monitor.numNodesChecked;
+ }
+
+ /**
+ * Checks to see if DNs have finished decommissioning.
+ *
+ * Since this is done while holding the namesystem lock,
+ * the amount of work per monitor tick is limited.
+ */
+ private class Monitor implements Runnable {
+ /**
+ * The maximum number of blocks to check per tick.
+ */
+ private final int numBlocksPerCheck;
+ /**
+ * The maximum number of nodes to check per tick.
*/
- private final long recheckInterval;
- /** The number of decommission nodes to check for each interval */
private final int numNodesPerCheck;
- /** firstkey can be initialized to anything. */
- private String firstkey = "";
+ /**
+ * The maximum number of nodes to track in decomNodeBlocks. A value of 0
+ * means no limit.
+ */
+ private final int maxConcurrentTrackedNodes;
+ /**
+ * The number of blocks that have been checked on this tick.
+ */
+ private int numBlocksChecked = 0;
+ /**
+ * The number of nodes that have been checked on this tick. Used for
+ * testing.
+ */
+ private int numNodesChecked = 0;
+ /**
+ * The last datanode in decomNodeBlocks that we've processed
+ */
+ private DatanodeDescriptor iterkey = new DatanodeDescriptor(new
+ DatanodeID("", "", "", 0, 0, 0, 0));
- Monitor(int recheckIntervalInSecond, int numNodesPerCheck) {
- this.recheckInterval = recheckIntervalInSecond * 1000L;
+ Monitor(int numBlocksPerCheck, int numNodesPerCheck, int
+ maxConcurrentTrackedNodes) {
+ this.numBlocksPerCheck = numBlocksPerCheck;
this.numNodesPerCheck = numNodesPerCheck;
+ this.maxConcurrentTrackedNodes = maxConcurrentTrackedNodes;
+ }
+
+ private boolean exceededNumBlocksPerCheck() {
+ LOG.trace("Processed {} blocks so far this tick", numBlocksChecked);
+ return numBlocksChecked >= numBlocksPerCheck;
+ }
+
+ @Deprecated
+ private boolean exceededNumNodesPerCheck() {
+ LOG.trace("Processed {} nodes so far this tick", numNodesChecked);
+ return numNodesChecked >= numNodesPerCheck;
}
- /**
- * Check decommission status of numNodesPerCheck nodes
- * for every recheckInterval milliseconds.
- */
@Override
public void run() {
- for(; namesystem.isRunning(); ) {
- namesystem.writeLock();
- try {
- check();
- } finally {
- namesystem.writeUnlock();
+ if (!namesystem.isRunning()) {
+ LOG.info("Namesystem is not running, skipping decommissioning checks"
+ + ".");
+ return;
+ }
+ // Reset the checked count at beginning of each iteration
+ numBlocksChecked = 0;
+ numNodesChecked = 0;
+ // Check decom progress
+ namesystem.writeLock();
+ try {
+ processPendingNodes();
+ check();
+ } finally {
+ namesystem.writeUnlock();
+ }
+ if (numBlocksChecked + numNodesChecked > 0) {
+ LOG.info("Checked {} blocks and {} nodes this tick", numBlocksChecked,
+ numNodesChecked);
+ }
+ }
+
+ /**
+ * Pop datanodes off the pending list and into decomNodeBlocks,
+ * subject to the maxConcurrentTrackedNodes limit.
+ */
+ private void processPendingNodes() {
+ while (!pendingNodes.isEmpty() &&
+ (maxConcurrentTrackedNodes == 0 ||
+ decomNodeBlocks.size() < maxConcurrentTrackedNodes)) {
+ decomNodeBlocks.put(pendingNodes.poll(), null);
+ }
+ }
+
+ private void check() {
+      final Iterator<Map.Entry<DatanodeDescriptor, AbstractList<BlockInfoContiguous>>>
+          it = new CyclicIteration<>(decomNodeBlocks, iterkey).iterator();
+      final LinkedList<DatanodeDescriptor> toRemove = new LinkedList<>();
+
+ while (it.hasNext()
+ && !exceededNumBlocksPerCheck()
+ && !exceededNumNodesPerCheck()) {
+ numNodesChecked++;
+        final Map.Entry<DatanodeDescriptor, AbstractList<BlockInfoContiguous>>
+            entry = it.next();
+ final DatanodeDescriptor dn = entry.getKey();
+        AbstractList<BlockInfoContiguous> blocks = entry.getValue();
+ boolean fullScan = false;
+ if (blocks == null) {
+ // This is a newly added datanode, run through its list to schedule
+ // under-replicated blocks for replication and collect the blocks
+ // that are insufficiently replicated for further tracking
+ LOG.debug("Newly-added node {}, doing full scan to find " +
+ "insufficiently-replicated blocks.", dn);
+ blocks = handleInsufficientlyReplicated(dn);
+ decomNodeBlocks.put(dn, blocks);
+ fullScan = true;
+ } else {
+ // This is a known datanode, check if its # of insufficiently
+ // replicated blocks has dropped to zero and if it can be decommed
+ LOG.debug("Processing decommission-in-progress node {}", dn);
+ pruneSufficientlyReplicated(dn, blocks);
}
-
- try {
- Thread.sleep(recheckInterval);
- } catch (InterruptedException ie) {
- LOG.warn(this.getClass().getSimpleName() + " interrupted: " + ie);
+ if (blocks.size() == 0) {
+ if (!fullScan) {
+ // If we didn't just do a full scan, need to re-check with the
+ // full block map.
+ //
+ // We've replicated all the known insufficiently replicated
+ // blocks. Re-check with the full block map before finally
+ // marking the datanode as decommissioned
+ LOG.debug("Node {} has finished replicating current set of "
+ + "blocks, checking with the full block map.", dn);
+ blocks = handleInsufficientlyReplicated(dn);
+ decomNodeBlocks.put(dn, blocks);
+ }
+ // If the full scan is clean AND the node liveness is okay,
+ // we can finally mark as decommissioned.
+ final boolean isHealthy =
+ blockManager.isNodeHealthyForDecommission(dn);
+ if (blocks.size() == 0 && isHealthy) {
+ setDecommissioned(dn);
+ toRemove.add(dn);
+ LOG.debug("Node {} is sufficiently replicated and healthy, "
+ + "marked as decommissioned.", dn);
+ } else {
+ if (LOG.isDebugEnabled()) {
+ StringBuilder b = new StringBuilder("Node {} ");
+ if (isHealthy) {
+ b.append("is ");
+ } else {
+ b.append("isn't ");
+ }
+ b.append("healthy and still needs to replicate {} more blocks," +
+ " decommissioning is still in progress.");
+ LOG.debug(b.toString(), dn, blocks.size());
+ }
+ }
+ } else {
+ LOG.debug("Node {} still has {} blocks to replicate "
+ + "before it is a candidate to finish decommissioning.",
+ dn, blocks.size());
}
+ iterkey = dn;
+ }
+ // Remove the datanodes that are decommissioned
+ for (DatanodeDescriptor dn : toRemove) {
+ Preconditions.checkState(dn.isDecommissioned(),
+ "Removing a node that is not yet decommissioned!");
+ decomNodeBlocks.remove(dn);
}
}
-
- private void check() {
- final DatanodeManager dm = blockmanager.getDatanodeManager();
- int count = 0;
-      for(Map.Entry<String, DatanodeDescriptor> entry
-          : dm.getDatanodeCyclicIteration(firstkey)) {
- final DatanodeDescriptor d = entry.getValue();
- firstkey = entry.getKey();
-
- if (d.isDecommissionInProgress()) {
- try {
- dm.checkDecommissionState(d);
- } catch(Exception e) {
- LOG.warn("entry=" + entry, e);
+
+ /**
+ * Removes sufficiently replicated blocks from the block list of a
+ * datanode.
+ */
+    private void pruneSufficientlyReplicated(final DatanodeDescriptor datanode,
+        AbstractList<BlockInfoContiguous> blocks) {
+ processBlocksForDecomInternal(datanode, blocks.iterator(), null, true);
+ }
+
+ /**
+ * Returns a list of blocks on a datanode that are insufficiently
+ * replicated, i.e. are under-replicated enough to prevent decommission.
+ *
+ * As part of this, it also schedules replication work for
+ * any under-replicated blocks.
+ *
+ * @param datanode
+ * @return List of insufficiently replicated blocks
+ */
+    private AbstractList<BlockInfoContiguous> handleInsufficientlyReplicated(
+        final DatanodeDescriptor datanode) {
+      AbstractList<BlockInfoContiguous> insufficient = new ChunkedArrayList<>();
+ processBlocksForDecomInternal(datanode, datanode.getBlockIterator(),
+ insufficient, false);
+ return insufficient;
+ }
+
+ /**
+ * Used while checking if decommission-in-progress datanodes can be marked
+ * as decommissioned. Combines shared logic of
+ * pruneSufficientlyReplicated and handleInsufficientlyReplicated.
+ *
+ * @param datanode Datanode
+ * @param it Iterator over the blocks on the
+ * datanode
+ * @param insufficientlyReplicated Return parameter. If it's not null,
+ * will contain the insufficiently
+ * replicated-blocks from the list.
+ * @param pruneSufficientlyReplicated whether to remove sufficiently
+ * replicated blocks from the iterator
+     * Side effects: updates {@code datanode.decommissioningStatus} and, when
+     *               non-null, fills {@code insufficientlyReplicated}.
+ */
+ private void processBlocksForDecomInternal(
+ final DatanodeDescriptor datanode,
+        final Iterator<BlockInfoContiguous> it,
+        final List<BlockInfoContiguous> insufficientlyReplicated,
+ boolean pruneSufficientlyReplicated) {
+ boolean firstReplicationLog = true;
+ int underReplicatedBlocks = 0;
+ int decommissionOnlyReplicas = 0;
+ int underReplicatedInOpenFiles = 0;
+ while (it.hasNext()) {
+ numBlocksChecked++;
+ final BlockInfoContiguous block = it.next();
+ // Remove the block from the list if it's no longer in the block map,
+ // e.g. the containing file has been deleted
+ if (blockManager.blocksMap.getStoredBlock(block) == null) {
+ LOG.trace("Removing unknown block {}", block);
+ it.remove();
+ continue;
+ }
+ BlockCollection bc = blockManager.blocksMap.getBlockCollection(block);
+ if (bc == null) {
+ // Orphan block, will be invalidated eventually. Skip.
+ continue;
+ }
+
+ final NumberReplicas num = blockManager.countNodes(block);
+ final int liveReplicas = num.liveReplicas();
+ final int curReplicas = liveReplicas;
+
+ // Schedule under-replicated blocks for replication if not already
+ // pending
+ if (blockManager.isNeededReplication(block, bc.getBlockReplication(),
+ liveReplicas)) {
+ if (!blockManager.neededReplications.contains(block) &&
+ blockManager.pendingReplications.getNumReplicas(block) == 0 &&
+ namesystem.isPopulatingReplQueues()) {
+ // Process these blocks only when active NN is out of safe mode.
+ blockManager.neededReplications.add(block,
+ curReplicas,
+ num.decommissionedReplicas(),
+ bc.getBlockReplication());
}
- if (++count == numNodesPerCheck) {
- return;
+ }
+
+ // Even if the block is under-replicated,
+ // it doesn't block decommission if it's sufficiently replicated
+ if (isSufficientlyReplicated(block, bc, num)) {
+ if (pruneSufficientlyReplicated) {
+ it.remove();
}
+ continue;
+ }
+
+ // We've found an insufficiently replicated block.
+ if (insufficientlyReplicated != null) {
+ insufficientlyReplicated.add(block);
+ }
+ // Log if this is our first time through
+ if (firstReplicationLog) {
+ logBlockReplicationInfo(block, bc, datanode, num,
+ blockManager.blocksMap.getStorages(block));
+ firstReplicationLog = false;
+ }
+ // Update various counts
+ underReplicatedBlocks++;
+ if (bc.isUnderConstruction()) {
+ underReplicatedInOpenFiles++;
+ }
+ if ((curReplicas == 0) && (num.decommissionedReplicas() > 0)) {
+ decommissionOnlyReplicas++;
}
}
+
+ datanode.decommissioningStatus.set(underReplicatedBlocks,
+ decommissionOnlyReplicas,
+ underReplicatedInOpenFiles);
}
}
+
+ @VisibleForTesting
+ void runMonitor() throws ExecutionException, InterruptedException {
+ Future f = executor.submit(monitor);
+ f.get();
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
index d60a39b277def..d2905a29b7aa6 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
@@ -353,7 +353,7 @@ private class Monitor implements Runnable {
public void run() {
while(namesystem.isRunning()) {
try {
- final long now = Time.now();
+ final long now = Time.monotonicNow();
if (lastHeartbeatCheck + heartbeatRecheckInterval < now) {
heartbeatCheck();
lastHeartbeatCheck = now;
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java
index 57c296262514a..796b878c92d41 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java
@@ -17,7 +17,7 @@
*/
package org.apache.hadoop.hdfs.server.blockmanagement;
-import static org.apache.hadoop.util.Time.now;
+import static org.apache.hadoop.util.Time.monotonicNow;
import java.io.PrintWriter;
import java.sql.Time;
@@ -177,7 +177,7 @@ static class PendingBlockInfo {
     private final List<DatanodeDescriptor> targets;
PendingBlockInfo(DatanodeDescriptor[] targets) {
- this.timeStamp = now();
+ this.timeStamp = monotonicNow();
       this.targets = targets == null ? new ArrayList<DatanodeDescriptor>()
           : new ArrayList<DatanodeDescriptor>(Arrays.asList(targets));
}
@@ -187,7 +187,7 @@ long getTimeStamp() {
}
void setTimeStamp() {
- timeStamp = now();
+ timeStamp = monotonicNow();
}
void incrementReplicas(DatanodeDescriptor... newTargets) {
@@ -234,7 +234,7 @@ void pendingReplicationCheck() {
synchronized (pendingReplications) {
       Iterator<Map.Entry<Block, PendingBlockInfo>> iter =
pendingReplications.entrySet().iterator();
- long now = now();
+ long now = monotonicNow();
if(LOG.isDebugEnabled()) {
LOG.debug("PendingReplicationMonitor checking Q");
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
index e6bd5b289332d..e6f099983d5a9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
@@ -672,7 +672,7 @@ public boolean isShared() {
*/
public void lock() throws IOException {
if (isShared()) {
- LOG.info("Locking is disabled");
+ LOG.info("Locking is disabled for " + this.root);
return;
}
FileLock newLock = tryLock();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java
index 86c881689b43d..da9642adfab37 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java
@@ -29,6 +29,7 @@
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
+import org.apache.hadoop.hdfs.protocol.RollingUpgradeStatus;
import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
import org.apache.hadoop.hdfs.server.protocol.*;
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo.BlockStatus;
@@ -470,15 +471,19 @@ List getBPServiceActors() {
/**
* Signal the current rolling upgrade status as indicated by the NN.
- * @param inProgress true if a rolling upgrade is in progress
+ * @param rollingUpgradeStatus rolling upgrade status
*/
- void signalRollingUpgrade(boolean inProgress) throws IOException {
+ void signalRollingUpgrade(RollingUpgradeStatus rollingUpgradeStatus)
+ throws IOException {
+ if (rollingUpgradeStatus == null) {
+ return;
+ }
String bpid = getBlockPoolId();
- if (inProgress) {
+ if (!rollingUpgradeStatus.isFinalized()) {
dn.getFSDataset().enableTrash(bpid);
dn.getFSDataset().setRollingUpgradeMarker(bpid);
} else {
- dn.getFSDataset().restoreTrash(bpid);
+ dn.getFSDataset().clearTrash(bpid);
dn.getFSDataset().clearRollingUpgradeMarker(bpid);
}
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
index ff1ad786d1da4..df582f1846af1 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
@@ -17,7 +17,7 @@
*/
package org.apache.hadoop.hdfs.server.datanode;
-import static org.apache.hadoop.util.Time.now;
+import static org.apache.hadoop.util.Time.monotonicNow;
import java.io.EOFException;
import java.io.IOException;
@@ -44,6 +44,7 @@
import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
import org.apache.hadoop.hdfs.server.common.IncorrectVersionException;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.protocol.BlockReportContext;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
@@ -57,7 +58,6 @@
import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RemoteException;
-import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.VersionInfo;
import org.apache.hadoop.util.VersionUtil;
@@ -82,12 +82,11 @@ class BPServiceActor implements Runnable {
final BPOfferService bpos;
- // lastBlockReport, lastDeletedReport and lastHeartbeat may be assigned/read
+ // lastBlockReport and lastHeartbeat may be assigned/read
// by testing threads (through BPServiceActor#triggerXXX), while also
// assigned/read by the actor thread. Thus they should be declared as volatile
// to make sure the "happens-before" consistency.
volatile long lastBlockReport = 0;
- volatile long lastDeletedReport = 0;
boolean resetBlockReportTime = true;
@@ -228,7 +227,7 @@ private void connectToNNAndHandshake() throws IOException {
bpos.verifyAndSetNamespaceInfo(nsInfo);
// Second phase of the handshake with the NN.
- register();
+ register(nsInfo);
}
// This is useful to make sure NN gets Heartbeat before Blockreport
@@ -249,7 +248,7 @@ void scheduleHeartbeat() {
*/
void scheduleBlockReport(long delay) {
if (delay > 0) { // send BR after random delay
- lastBlockReport = Time.now()
+ lastBlockReport = monotonicNow()
- ( dnConf.blockReportInterval - DFSUtil.getRandom().nextInt((int)(delay)));
} else { // send at next heartbeat
lastBlockReport = lastHeartbeat - dnConf.blockReportInterval;
@@ -291,12 +290,14 @@ private void reportReceivedDeletedBlocks() throws IOException {
// Send incremental block reports to the Namenode outside the lock
boolean success = false;
+ final long startTime = monotonicNow();
try {
bpNamenode.blockReceivedAndDeleted(bpRegistration,
bpos.getBlockPoolId(),
reports.toArray(new StorageReceivedDeletedBlocks[reports.size()]));
success = true;
} finally {
+ dn.getMetrics().addIncrementalBlockReport(monotonicNow() - startTime);
if (!success) {
synchronized (pendingIncrementalBRperStorage) {
for (StorageReceivedDeletedBlocks report : reports) {
@@ -415,10 +416,10 @@ void triggerHeartbeatForTests() {
@VisibleForTesting
void triggerDeletionReportForTests() {
synchronized (pendingIncrementalBRperStorage) {
- lastDeletedReport = 0;
+ sendImmediateIBR = true;
pendingIncrementalBRperStorage.notifyAll();
- while (lastDeletedReport == 0) {
+ while (sendImmediateIBR) {
try {
pendingIncrementalBRperStorage.wait(100);
} catch (InterruptedException e) {
@@ -433,6 +434,17 @@ boolean hasPendingIBR() {
return sendImmediateIBR;
}
+ private long prevBlockReportId = 0;
+
+ private long generateUniqueBlockReportId() {
+ long id = System.nanoTime();
+ if (id <= prevBlockReportId) {
+ id = prevBlockReportId + 1;
+ }
+ prevBlockReportId = id;
+ return id;
+ }
+
/**
* Report the list blocks to the Namenode
* @return DatanodeCommands returned by the NN. May be null.
@@ -440,7 +452,7 @@ boolean hasPendingIBR() {
*/
   List<DatanodeCommand> blockReport() throws IOException {
// send block report if timer has expired.
- final long startTime = now();
+ final long startTime = monotonicNow();
if (startTime - lastBlockReport <= dnConf.blockReportInterval) {
return null;
}
@@ -452,9 +464,8 @@ List blockReport() throws IOException {
// or we will report an RBW replica after the BlockReport already reports
// a FINALIZED one.
reportReceivedDeletedBlocks();
- lastDeletedReport = startTime;
- long brCreateStartTime = now();
+ long brCreateStartTime = monotonicNow();
     Map<DatanodeStorage, BlockListAsLongs> perVolumeBlockLists =
dn.getFSDataset().getBlockReports(bpos.getBlockPoolId());
@@ -466,8 +477,7 @@ List blockReport() throws IOException {
     for(Map.Entry<DatanodeStorage, BlockListAsLongs> kvPair : perVolumeBlockLists.entrySet()) {
BlockListAsLongs blockList = kvPair.getValue();
- reports[i++] = new StorageBlockReport(
- kvPair.getKey(), blockList.getBlockListAsLongs());
+ reports[i++] = new StorageBlockReport(kvPair.getKey(), blockList);
totalBlockCount += blockList.getNumberOfBlocks();
}
@@ -475,12 +485,14 @@ List blockReport() throws IOException {
int numReportsSent = 0;
int numRPCs = 0;
boolean success = false;
- long brSendStartTime = now();
+ long brSendStartTime = monotonicNow();
+ long reportId = generateUniqueBlockReportId();
try {
if (totalBlockCount < dnConf.blockReportSplitThreshold) {
// Below split threshold, send all reports in a single message.
DatanodeCommand cmd = bpNamenode.blockReport(
- bpRegistration, bpos.getBlockPoolId(), reports);
+ bpRegistration, bpos.getBlockPoolId(), reports,
+ new BlockReportContext(1, 0, reportId));
numRPCs = 1;
numReportsSent = reports.length;
if (cmd != null) {
@@ -488,10 +500,11 @@ List blockReport() throws IOException {
}
} else {
// Send one block report per message.
- for (StorageBlockReport report : reports) {
- StorageBlockReport singleReport[] = { report };
+ for (int r = 0; r < reports.length; r++) {
+ StorageBlockReport singleReport[] = { reports[r] };
DatanodeCommand cmd = bpNamenode.blockReport(
- bpRegistration, bpos.getBlockPoolId(), singleReport);
+ bpRegistration, bpos.getBlockPoolId(), singleReport,
+ new BlockReportContext(reports.length, r, reportId));
numReportsSent++;
numRPCs++;
if (cmd != null) {
@@ -502,16 +515,17 @@ List blockReport() throws IOException {
success = true;
} finally {
// Log the block report processing stats from Datanode perspective
- long brSendCost = now() - brSendStartTime;
+ long brSendCost = monotonicNow() - brSendStartTime;
long brCreateCost = brSendStartTime - brCreateStartTime;
dn.getMetrics().addBlockReport(brSendCost);
final int nCmds = cmds.size();
LOG.info((success ? "S" : "Uns") +
- "uccessfully sent " + numReportsSent +
- " of " + reports.length +
- " blockreports for " + totalBlockCount +
- " total blocks using " + numRPCs +
- " RPCs. This took " + brCreateCost +
+ "uccessfully sent block report 0x" +
+ Long.toHexString(reportId) + ", containing " + reports.length +
+ " storage report(s), of which we sent " + numReportsSent + "." +
+ " The reports had " + totalBlockCount +
+ " total blocks and used " + numRPCs +
+ " RPC(s). This took " + brCreateCost +
" msec to generate and " + brSendCost +
" msecs for RPC and NN processing." +
" Got back " +
@@ -538,7 +552,7 @@ private void scheduleNextBlockReport(long previousReportStartTime) {
* 1) normal like 9:20:18, next report should be at 10:20:14
* 2) unexpected like 11:35:43, next report should be at 12:20:14
*/
- lastBlockReport += (now() - lastBlockReport) /
+ lastBlockReport += (monotonicNow() - lastBlockReport) /
dnConf.blockReportInterval * dnConf.blockReportInterval;
}
}
@@ -550,7 +564,7 @@ DatanodeCommand cacheReport() throws IOException {
}
// send cache report if timer has expired.
DatanodeCommand cmd = null;
- final long startTime = Time.monotonicNow();
+ final long startTime = monotonicNow();
if (startTime - lastCacheReport > dnConf.cacheReportInterval) {
if (LOG.isDebugEnabled()) {
LOG.debug("Sending cacheReport from service actor: " + this);
@@ -559,10 +573,10 @@ DatanodeCommand cacheReport() throws IOException {
String bpid = bpos.getBlockPoolId();
     List<Long> blockIds = dn.getFSDataset().getCacheReport(bpid);
- long createTime = Time.monotonicNow();
+ long createTime = monotonicNow();
cmd = bpNamenode.cacheReport(bpRegistration, bpid, blockIds);
- long sendTime = Time.monotonicNow();
+ long sendTime = monotonicNow();
long createCost = createTime - startTime;
long sendCost = sendTime - createTime;
dn.getMetrics().addCacheReport(sendCost);
@@ -648,7 +662,7 @@ private void handleRollingUpgradeStatus(HeartbeatResponse resp) throws IOExcepti
" in HeartbeatResponse. Expected " +
bpos.getBlockPoolId());
} else {
- bpos.signalRollingUpgrade(rollingUpgradeStatus != null);
+ bpos.signalRollingUpgrade(rollingUpgradeStatus);
}
}
@@ -658,7 +672,6 @@ private void handleRollingUpgradeStatus(HeartbeatResponse resp) throws IOExcepti
*/
private void offerService() throws Exception {
LOG.info("For namenode " + nnAddr + " using"
- + " DELETEREPORT_INTERVAL of " + dnConf.deleteReportInterval + " msec "
+ " BLOCKREPORT_INTERVAL of " + dnConf.blockReportInterval + "msec"
+ " CACHEREPORT_INTERVAL of " + dnConf.cacheReportInterval + "msec"
+ " Initial delay: " + dnConf.initialBlockReportDelay + "msec"
@@ -669,12 +682,14 @@ private void offerService() throws Exception {
//
while (shouldRun()) {
try {
- final long startTime = now();
+ final long startTime = monotonicNow();
//
// Every so often, send heartbeat or block-report
//
- if (startTime - lastHeartbeat >= dnConf.heartBeatInterval) {
+ boolean sendHeartbeat =
+ startTime - lastHeartbeat >= dnConf.heartBeatInterval;
+ if (sendHeartbeat) {
//
// All heartbeat messages include following info:
// -- Datanode name
@@ -686,7 +701,7 @@ private void offerService() throws Exception {
if (!dn.areHeartbeatsDisabledForTests()) {
HeartbeatResponse resp = sendHeartBeat();
assert resp != null;
- dn.getMetrics().addHeartbeat(now() - startTime);
+ dn.getMetrics().addHeartbeat(monotonicNow() - startTime);
// If the state of this NN has changed (eg STANDBY->ACTIVE)
// then let the BPOfferService update itself.
@@ -702,10 +717,10 @@ private void offerService() throws Exception {
handleRollingUpgradeStatus(resp);
}
- long startProcessCommands = now();
+ long startProcessCommands = monotonicNow();
if (!processCommand(resp.getCommands()))
continue;
- long endProcessCommands = now();
+ long endProcessCommands = monotonicNow();
if (endProcessCommands - startProcessCommands > 2000) {
LOG.info("Took " + (endProcessCommands - startProcessCommands)
+ "ms to process " + resp.getCommands().length
@@ -713,10 +728,8 @@ private void offerService() throws Exception {
}
}
}
- if (sendImmediateIBR ||
- (startTime - lastDeletedReport > dnConf.deleteReportInterval)) {
+ if (sendImmediateIBR || sendHeartbeat) {
reportReceivedDeletedBlocks();
- lastDeletedReport = startTime;
}
       List<DatanodeCommand> cmds = blockReport();
@@ -730,7 +743,7 @@ private void offerService() throws Exception {
// or work arrives, and then iterate again.
//
long waitTime = dnConf.heartBeatInterval -
- (Time.now() - lastHeartbeat);
+ (monotonicNow() - lastHeartbeat);
synchronized(pendingIncrementalBRperStorage) {
if (waitTime > 0 && !sendImmediateIBR) {
try {
@@ -772,10 +785,11 @@ private void offerService() throws Exception {
*
* issued by the namenode to recognize registered datanodes.
*
+ * @param nsInfo current NamespaceInfo
* @see FSNamesystem#registerDatanode(DatanodeRegistration)
* @throws IOException
*/
- void register() throws IOException {
+ void register(NamespaceInfo nsInfo) throws IOException {
// The handshake() phase loaded the block pool storage
// off disk - so update the bpRegistration object from that info
bpRegistration = bpos.createRegistration();
@@ -786,6 +800,7 @@ void register() throws IOException {
try {
// Use returned registration from namenode with updated fields
bpRegistration = bpNamenode.registerDatanode(bpRegistration);
+ bpRegistration.setNamespaceInfo(nsInfo);
break;
} catch(EOFException e) { // namenode might have just restarted
LOG.info("Problem connecting to server: " + nnAddr + " :"
@@ -913,9 +928,9 @@ void reRegister() throws IOException {
if (shouldRun()) {
// re-retrieve namespace info to make sure that, if the NN
// was restarted, we still match its version (HDFS-2120)
- retrieveNamespaceInfo();
+ NamespaceInfo nsInfo = retrieveNamespaceInfo();
// and re-register
- register();
+ register(nsInfo);
scheduleHeartbeat();
}
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolManager.java
index 9f389952afea2..28a6cc7edbdae 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolManager.java
@@ -20,11 +20,8 @@
import java.io.IOException;
import java.net.InetSocketAddress;
import java.security.PrivilegedExceptionAction;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;
+import java.util.concurrent.CopyOnWriteArrayList;
import org.apache.commons.logging.Log;
import org.apache.hadoop.classification.InterfaceAudience;
@@ -53,7 +50,7 @@ class BlockPoolManager {
   private final Map<String, BPOfferService> bpByBlockPoolId =
       Maps.newHashMap();
   private final List<BPOfferService> offerServices =
- Lists.newArrayList();
+ new CopyOnWriteArrayList<>();
private final DataNode dn;
@@ -74,12 +71,14 @@ synchronized void addBlockPool(BPOfferService bpos) {
}
/**
- * Returns the array of BPOfferService objects.
+ * Returns a list of BPOfferService objects. The underlying list
+ * implementation is a CopyOnWriteArrayList so it can be safely
+ * iterated while BPOfferServices are being added or removed.
+ *
* Caution: The BPOfferService returned could be shutdown any time.
*/
- synchronized BPOfferService[] getAllNamenodeThreads() {
- BPOfferService[] bposArray = new BPOfferService[offerServices.size()];
- return offerServices.toArray(bposArray);
+  synchronized List<BPOfferService> getAllNamenodeThreads() {
+ return Collections.unmodifiableList(offerServices);
}
synchronized BPOfferService get(String bpid) {
@@ -110,15 +109,13 @@ synchronized void remove(BPOfferService t) {
}
}
- void shutDownAll(BPOfferService[] bposArray) throws InterruptedException {
- if (bposArray != null) {
- for (BPOfferService bpos : bposArray) {
- bpos.stop(); //interrupts the threads
- }
- //now join
- for (BPOfferService bpos : bposArray) {
- bpos.join();
- }
+  void shutDownAll(List<BPOfferService> bposList) throws InterruptedException {
+ for (BPOfferService bpos : bposList) {
+ bpos.stop(); //interrupts the threads
+ }
+ //now join
+ for (BPOfferService bpos : bposList) {
+ bpos.join();
}
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolSliceStorage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolSliceStorage.java
index 4076a8b4c621d..d26a9a591b894 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolSliceStorage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolSliceStorage.java
@@ -351,7 +351,8 @@ private void doTransition(DataNode datanode, StorageDirectory sd,
sd.getPreviousDir() + " and " + getTrashRootDir(sd) + " should not " +
" both be present.");
doRollback(sd, nsInfo); // rollback if applicable
- } else {
+ } else if (startOpt == StartupOption.ROLLBACK &&
+ !sd.getPreviousDir().exists()) {
// Restore all the files in the trash. The restored files are retained
// during rolling upgrade rollback. They are deleted during rolling
// upgrade downgrade.
@@ -378,6 +379,12 @@ private void doTransition(DataNode datanode, StorageDirectory sd,
&& this.cTime == nsInfo.getCTime()) {
return; // regular startup
}
+ if (this.layoutVersion > HdfsConstants.DATANODE_LAYOUT_VERSION) {
+ int restored = restoreBlockFilesFromTrash(getTrashRootDir(sd));
+ LOG.info("Restored " + restored + " block files from trash " +
+ "before the layout upgrade. These blocks will be moved to " +
+ "the previous directory during the upgrade");
+ }
if (this.layoutVersion > HdfsConstants.DATANODE_LAYOUT_VERSION
|| this.cTime < nsInfo.getCTime()) {
doUpgrade(datanode, sd, nsInfo); // upgrade
@@ -730,16 +737,12 @@ String getRestoreDirectory(File blockFile) {
/**
* Delete all files and directories in the trash directories.
*/
- public void restoreTrash() {
+ public void clearTrash() {
for (StorageDirectory sd : storageDirs) {
File trashRoot = getTrashRootDir(sd);
- try {
- Preconditions.checkState(!(trashRoot.exists() && sd.getPreviousDir().exists()));
- restoreBlockFilesFromTrash(trashRoot);
- FileUtil.fullyDelete(getTrashRootDir(sd));
- } catch (IOException ioe) {
- LOG.warn("Restoring trash failed for storage directory " + sd);
- }
+ Preconditions.checkState(!(trashRoot.exists() && sd.getPreviousDir().exists()));
+ FileUtil.fullyDelete(trashRoot);
+ LOG.info("Cleared trash for storage directory " + sd);
}
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
index 0a2b650f981e9..4e8ce94ab2e7c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
@@ -1372,7 +1372,7 @@ private void sendAckUpstreamUnprotected(PipelineAck ack, long seqno,
replies = new int[ackLen + 1];
replies[0] = myHeader;
for (int i = 0; i < ackLen; ++i) {
- replies[i + 1] = ack.getReply(i);
+ replies[i + 1] = ack.getHeaderFlag(i);
}
// If the mirror has reported that it received a corrupt packet,
// do self-destruct to mark myself bad, instead of making the
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockSender.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockSender.java
index f4cde11678adf..e76b93a5fd96a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockSender.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockSender.java
@@ -246,6 +246,13 @@ class BlockSender implements java.io.Closeable {
if (replica.getGenerationStamp() < block.getGenerationStamp()) {
throw new IOException("Replica gen stamp < block genstamp, block="
+ block + ", replica=" + replica);
+ } else if (replica.getGenerationStamp() > block.getGenerationStamp()) {
+ if (DataNode.LOG.isDebugEnabled()) {
+ DataNode.LOG.debug("Bumping up the client provided"
+ + " block's genstamp to latest " + replica.getGenerationStamp()
+ + " for block " + block);
+ }
+ block.setGenerationStamp(replica.getGenerationStamp());
}
if (replicaVisibleLength < 0) {
throw new IOException("Replica is not readable, block="
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java
index 67cd1ce1aae5b..3406f29f4c4cb 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java
@@ -82,7 +82,6 @@ public class DNConf {
final long heartBeatInterval;
final long blockReportInterval;
final long blockReportSplitThreshold;
- final long deleteReportInterval;
final long initialBlockReportDelay;
final long cacheReportInterval;
final long dfsclientSlowIoWarningThresholdMs;
@@ -164,7 +163,6 @@ public DNConf(Configuration conf) {
heartBeatInterval = conf.getLong(DFS_HEARTBEAT_INTERVAL_KEY,
DFS_HEARTBEAT_INTERVAL_DEFAULT) * 1000L;
- this.deleteReportInterval = 100 * heartBeatInterval;
// do we need to sync block file contents to disk when blockfile is closed?
this.syncOnClose = conf.getBoolean(DFS_DATANODE_SYNCONCLOSE_KEY,
DFS_DATANODE_SYNCONCLOSE_DEFAULT);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
index 92ddb7bd9606e..071aba1feb7ea 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
@@ -41,8 +41,6 @@
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_NETWORK_COUNTS_CACHE_MAX_SIZE_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_NETWORK_COUNTS_CACHE_MAX_SIZE_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_PLUGINS_KEY;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SCAN_PERIOD_HOURS_DEFAULT;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SCAN_PERIOD_HOURS_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_STARTUP_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_MAX_NUM_BLOCKS_TO_LOG_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_MAX_NUM_BLOCKS_TO_LOG_KEY;
@@ -53,6 +51,7 @@
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
+import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
@@ -73,6 +72,7 @@
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -157,6 +157,7 @@
import org.apache.hadoop.hdfs.server.datanode.SecureDataNodeStarter.SecureResources;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsVolumeImpl;
import org.apache.hadoop.hdfs.server.datanode.metrics.DataNodeMetrics;
import org.apache.hadoop.hdfs.server.datanode.web.DatanodeHttpServer;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
@@ -613,20 +614,16 @@ public IOException call() {
errorMessageBuilder.append(
String.format("FAILED to ADD: %s: %s%n", volume,
e.toString()));
+ LOG.error("Failed to add volume: " + volume, e);
}
}
}
- if (!changedVolumes.deactivateLocations.isEmpty()) {
- LOG.info("Deactivating volumes: " +
- Joiner.on(",").join(changedVolumes.deactivateLocations));
-
- data.removeVolumes(changedVolumes.deactivateLocations);
- try {
- storage.removeVolumes(changedVolumes.deactivateLocations);
- } catch (IOException e) {
- errorMessageBuilder.append(e.getMessage());
- }
+ try {
+ removeVolumes(changedVolumes.deactivateLocations);
+ } catch (IOException e) {
+ errorMessageBuilder.append(e.getMessage());
+ LOG.error("Failed to remove volume: " + e.getMessage(), e);
}
if (errorMessageBuilder.length() > 0) {
@@ -636,6 +633,83 @@ public IOException call() {
conf.set(DFS_DATANODE_DATA_DIR_KEY,
Joiner.on(",").join(effectiveVolumes));
dataDirs = getStorageLocations(conf);
+
+ // Send a full block report to let NN acknowledge the volume changes.
+ triggerBlockReport(new BlockReportOptions.Factory()
+ .setIncremental(false).build());
+ }
+ }
+
+ /**
+ * Remove volumes from DataNode.
+ * See {@link #removeVolumes(Set, boolean)} for details.
+ *
+ * @param locations the StorageLocations of the volumes to be removed.
+ * @throws IOException
+ */
+ private void removeVolumes(final Collection locations)
+ throws IOException {
+ if (locations.isEmpty()) {
+ return;
+ }
+ Set volumesToRemove = new HashSet<>();
+ for (StorageLocation loc : locations) {
+ volumesToRemove.add(loc.getFile().getAbsoluteFile());
+ }
+ removeVolumes(volumesToRemove, true);
+ }
+
+ /**
+ * Remove volumes from DataNode.
+ *
+ * It does three things:
+ *
+ * Remove volumes and block info from FsDataset.
+ * Remove volumes from DataStorage.
+ * Reset configuration DATA_DIR and {@link #dataDirs} to represent
+ * active volumes.
+ *
+ * @param absoluteVolumePaths the absolute path of volumes.
+ * @param clearFailure if true, clears the failure information related to the
+ * volumes.
+ * @throws IOException
+ */
+ private synchronized void removeVolumes(
+ final Set absoluteVolumePaths, boolean clearFailure)
+ throws IOException {
+ for (File vol : absoluteVolumePaths) {
+ Preconditions.checkArgument(vol.isAbsolute());
+ }
+
+ if (absoluteVolumePaths.isEmpty()) {
+ return;
+ }
+
+ LOG.info(String.format("Deactivating volumes (clear failure=%b): %s",
+ clearFailure, Joiner.on(",").join(absoluteVolumePaths)));
+
+ IOException ioe = null;
+ // Remove volumes and block infos from FsDataset.
+ data.removeVolumes(absoluteVolumePaths, clearFailure);
+
+ // Remove volumes from DataStorage.
+ try {
+ storage.removeVolumes(absoluteVolumePaths);
+ } catch (IOException e) {
+ ioe = e;
+ }
+
+ // Set configuration and dataDirs to reflect volume changes.
+ for (Iterator it = dataDirs.iterator(); it.hasNext(); ) {
+ StorageLocation loc = it.next();
+ if (absoluteVolumePaths.contains(loc.getFile().getAbsoluteFile())) {
+ it.remove();
+ }
+ }
+ conf.set(DFS_DATANODE_DATA_DIR_KEY, Joiner.on(",").join(dataDirs));
+
+ if (ioe != null) {
+ throw ioe;
}
}
@@ -1278,12 +1352,12 @@ void initBlockPool(BPOfferService bpos) throws IOException {
blockScanner.enableBlockPoolId(bpos.getBlockPoolId());
}
- BPOfferService[] getAllBpOs() {
+ List getAllBpOs() {
return blockPoolManager.getAllNamenodeThreads();
}
int getBpOsCount() {
- return blockPoolManager.getAllNamenodeThreads().length;
+ return blockPoolManager.getAllNamenodeThreads().size();
}
/**
@@ -1580,11 +1654,8 @@ public void shutdown() {
}
}
- // We need to make a copy of the original blockPoolManager#offerServices to
- // make sure blockPoolManager#shutDownAll() can still access all the
- // BPOfferServices, since after setting DataNode#shouldRun to false the
- // offerServices may be modified.
- BPOfferService[] bposArray = this.blockPoolManager == null ? null
+ List bposArray = (this.blockPoolManager == null)
+ ? new ArrayList()
: this.blockPoolManager.getAllNamenodeThreads();
// If shutdown is not for restart, set shouldRun to false early.
if (!shutdownForUpgrade) {
@@ -1653,8 +1724,9 @@ public void shutdown() {
// termination of receiver threads.
if (!this.shutdownForUpgrade ||
(this.shutdownForUpgrade && (Time.monotonicNow() - timeNotified
- > 2500))) {
+ > 1000))) {
this.threadGroup.interrupt();
+ break;
}
LOG.info("Waiting for threadgroup to exit, active threads is " +
this.threadGroup.activeCount());
@@ -1665,8 +1737,8 @@ public void shutdown() {
Thread.sleep(sleepMs);
} catch (InterruptedException e) {}
sleepMs = sleepMs * 3 / 2; // exponential backoff
- if (sleepMs > 1000) {
- sleepMs = 1000;
+ if (sleepMs > 200) {
+ sleepMs = 200;
}
}
this.threadGroup = null;
@@ -2263,8 +2335,7 @@ void join() {
while (shouldRun) {
try {
blockPoolManager.joinAll();
- if (blockPoolManager.getAllNamenodeThreads() != null
- && blockPoolManager.getAllNamenodeThreads().length == 0) {
+ if (blockPoolManager.getAllNamenodeThreads().size() == 0) {
shouldRun = false;
}
// Terminate if shutdown is complete or 2 seconds after all BPs
@@ -2424,6 +2495,10 @@ public BlockScanner getBlockScanner() {
return blockScanner;
}
+ @VisibleForTesting
+ DirectoryScanner getDirectoryScanner() {
+ return directoryScanner;
+ }
public static void secureMain(String args[], SecureResources resources) {
int errorCode = 0;
@@ -3076,10 +3151,20 @@ public ShortCircuitRegistry getShortCircuitRegistry() {
* Check the disk error
*/
private void checkDiskError() {
- try {
- data.checkDataDir();
- } catch (DiskErrorException de) {
- handleDiskError(de.getMessage());
+ Set unhealthyDataDirs = data.checkDataDir();
+ if (unhealthyDataDirs != null && !unhealthyDataDirs.isEmpty()) {
+ try {
+ // Remove all unhealthy volumes from DataNode.
+ removeVolumes(unhealthyDataDirs, false);
+ } catch (IOException e) {
+ LOG.warn("Error occurred when removing unhealthy storage dirs: "
+ + e.getMessage(), e);
+ }
+ StringBuilder sb = new StringBuilder("DataNode failed volumes:");
+ for (File dataDir : unhealthyDataDirs) {
+ sb.append(dataDir.getAbsolutePath() + ";");
+ }
+ handleDiskError(sb.toString());
}
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataStorage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataStorage.java
index 754df2c7353ce..77fcfedb96b81 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataStorage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataStorage.java
@@ -83,7 +83,6 @@
public class DataStorage extends Storage {
public final static String BLOCK_SUBDIR_PREFIX = "subdir";
- final static String BLOCK_FILE_PREFIX = "blk_";
final static String COPY_FILE_PREFIX = "dncp_";
final static String STORAGE_DIR_DETACHED = "detach";
public final static String STORAGE_DIR_RBW = "rbw";
@@ -169,11 +168,11 @@ public void enableTrash(String bpid) {
}
}
- public void restoreTrash(String bpid) {
+ public void clearTrash(String bpid) {
if (trashEnabledBpids.contains(bpid)) {
- getBPStorage(bpid).restoreTrash();
+ getBPStorage(bpid).clearTrash();
trashEnabledBpids.remove(bpid);
- LOG.info("Restored trash for bpid " + bpid);
+ LOG.info("Cleared trash for bpid " + bpid);
}
}
@@ -405,28 +404,23 @@ synchronized List addStorageLocations(DataNode datanode,
}
/**
- * Remove volumes from DataStorage. All volumes are removed even when the
+ * Remove storage dirs from DataStorage. All storage dirs are removed even when an
* IOException is thrown.
*
- * @param locations a collection of volumes.
+ * @param dirsToRemove a set of storage directories to be removed.
* @throws IOException if I/O error when unlocking storage directory.
*/
- synchronized void removeVolumes(Collection locations)
+ synchronized void removeVolumes(final Set dirsToRemove)
throws IOException {
- if (locations.isEmpty()) {
+ if (dirsToRemove.isEmpty()) {
return;
}
- Set dataDirs = new HashSet();
- for (StorageLocation sl : locations) {
- dataDirs.add(sl.getFile());
- }
-
StringBuilder errorMsgBuilder = new StringBuilder();
for (Iterator it = this.storageDirs.iterator();
it.hasNext(); ) {
StorageDirectory sd = it.next();
- if (dataDirs.contains(sd.getRoot())) {
+ if (dirsToRemove.contains(sd.getRoot())) {
// Remove the block pool level storage first.
for (Map.Entry entry :
this.bpStorageMap.entrySet()) {
@@ -1250,7 +1244,7 @@ static void linkBlocksHelper(File from, File to, int oldLV, HardLink hl,
String[] blockNames = from.list(new java.io.FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
- return name.startsWith(BLOCK_FILE_PREFIX);
+ return name.startsWith(Block.BLOCK_FILE_PREFIX);
}
});
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataXceiver.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataXceiver.java
index e9547a84e2ef6..cf1b6bebeab13 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataXceiver.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataXceiver.java
@@ -22,8 +22,10 @@
import static org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status.ERROR_INVALID;
import static org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status.ERROR_UNSUPPORTED;
import static org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status.SUCCESS;
+import static org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitFdResponse.USE_RECEIPT_VERIFICATION;
+import static org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitFdResponse.DO_NOT_USE_RECEIPT_VERIFICATION;
import static org.apache.hadoop.hdfs.server.datanode.DataNode.DN_CLIENTTRACE_FORMAT;
-import static org.apache.hadoop.util.Time.now;
+import static org.apache.hadoop.util.Time.monotonicNow;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
@@ -245,7 +247,7 @@ public void run() {
peer.setReadTimeout(dnConf.socketTimeout);
}
- opStartTime = now();
+ opStartTime = monotonicNow();
processOp(op);
++opsProcessed;
} while ((peer != null) &&
@@ -291,64 +293,83 @@ public void run() {
@Override
public void requestShortCircuitFds(final ExtendedBlock blk,
final Token token,
- SlotId slotId, int maxVersion) throws IOException {
+ SlotId slotId, int maxVersion, boolean supportsReceiptVerification)
+ throws IOException {
updateCurrentThreadName("Passing file descriptors for block " + blk);
BlockOpResponseProto.Builder bld = BlockOpResponseProto.newBuilder();
FileInputStream fis[] = null;
+ SlotId registeredSlotId = null;
+ boolean success = false;
try {
- if (peer.getDomainSocket() == null) {
- throw new IOException("You cannot pass file descriptors over " +
- "anything but a UNIX domain socket.");
- }
- if (slotId != null) {
- boolean isCached = datanode.data.
- isCached(blk.getBlockPoolId(), blk.getBlockId());
- datanode.shortCircuitRegistry.registerSlot(
- ExtendedBlockId.fromExtendedBlock(blk), slotId, isCached);
- }
try {
- fis = datanode.requestShortCircuitFdsForRead(blk, token, maxVersion);
- } finally {
- if ((fis == null) && (slotId != null)) {
- datanode.shortCircuitRegistry.unregisterSlot(slotId);
+ if (peer.getDomainSocket() == null) {
+ throw new IOException("You cannot pass file descriptors over " +
+ "anything but a UNIX domain socket.");
}
+ if (slotId != null) {
+ boolean isCached = datanode.data.
+ isCached(blk.getBlockPoolId(), blk.getBlockId());
+ datanode.shortCircuitRegistry.registerSlot(
+ ExtendedBlockId.fromExtendedBlock(blk), slotId, isCached);
+ registeredSlotId = slotId;
+ }
+ fis = datanode.requestShortCircuitFdsForRead(blk, token, maxVersion);
+ Preconditions.checkState(fis != null);
+ bld.setStatus(SUCCESS);
+ bld.setShortCircuitAccessVersion(DataNode.CURRENT_BLOCK_FORMAT_VERSION);
+ } catch (ShortCircuitFdsVersionException e) {
+ bld.setStatus(ERROR_UNSUPPORTED);
+ bld.setShortCircuitAccessVersion(DataNode.CURRENT_BLOCK_FORMAT_VERSION);
+ bld.setMessage(e.getMessage());
+ } catch (ShortCircuitFdsUnsupportedException e) {
+ bld.setStatus(ERROR_UNSUPPORTED);
+ bld.setMessage(e.getMessage());
+ } catch (InvalidToken e) {
+ bld.setStatus(ERROR_ACCESS_TOKEN);
+ bld.setMessage(e.getMessage());
+ } catch (IOException e) {
+ bld.setStatus(ERROR);
+ bld.setMessage(e.getMessage());
}
- bld.setStatus(SUCCESS);
- bld.setShortCircuitAccessVersion(DataNode.CURRENT_BLOCK_FORMAT_VERSION);
- } catch (ShortCircuitFdsVersionException e) {
- bld.setStatus(ERROR_UNSUPPORTED);
- bld.setShortCircuitAccessVersion(DataNode.CURRENT_BLOCK_FORMAT_VERSION);
- bld.setMessage(e.getMessage());
- } catch (ShortCircuitFdsUnsupportedException e) {
- bld.setStatus(ERROR_UNSUPPORTED);
- bld.setMessage(e.getMessage());
- } catch (InvalidToken e) {
- bld.setStatus(ERROR_ACCESS_TOKEN);
- bld.setMessage(e.getMessage());
- } catch (IOException e) {
- bld.setStatus(ERROR);
- bld.setMessage(e.getMessage());
- }
- try {
bld.build().writeDelimitedTo(socketOut);
if (fis != null) {
FileDescriptor fds[] = new FileDescriptor[fis.length];
for (int i = 0; i < fds.length; i++) {
fds[i] = fis[i].getFD();
}
- byte buf[] = new byte[] { (byte)0 };
- peer.getDomainSocket().
- sendFileDescriptors(fds, buf, 0, buf.length);
+ byte buf[] = new byte[1];
+ if (supportsReceiptVerification) {
+ buf[0] = (byte)USE_RECEIPT_VERIFICATION.getNumber();
+ } else {
+ buf[0] = (byte)DO_NOT_USE_RECEIPT_VERIFICATION.getNumber();
+ }
+ DomainSocket sock = peer.getDomainSocket();
+ sock.sendFileDescriptors(fds, buf, 0, buf.length);
+ if (supportsReceiptVerification) {
+ LOG.trace("Reading receipt verification byte for " + slotId);
+ int val = sock.getInputStream().read();
+ if (val < 0) {
+ throw new EOFException();
+ }
+ } else {
+ LOG.trace("Receipt verification is not enabled on the DataNode. " +
+ "Not verifying " + slotId);
+ }
+ success = true;
}
} finally {
+ if ((!success) && (registeredSlotId != null)) {
+ LOG.info("Unregistering " + registeredSlotId + " because the " +
+ "requestShortCircuitFdsForRead operation failed.");
+ datanode.shortCircuitRegistry.unregisterSlot(registeredSlotId);
+ }
if (ClientTraceLog.isInfoEnabled()) {
DatanodeRegistration dnR = datanode.getDNRegistrationForBP(blk
.getBlockPoolId());
BlockSender.ClientTraceLog.info(String.format(
"src: 127.0.0.1, dest: 127.0.0.1, op: REQUEST_SHORT_CIRCUIT_FDS," +
" blockid: %s, srvID: %s, success: %b",
- blk.getBlockId(), dnR.getDatanodeUuid(), (fis != null)
- ));
+ blk.getBlockId(), dnR.getDatanodeUuid(), success));
}
if (fis != null) {
IOUtils.cleanup(LOG, fis);
@@ -1181,7 +1202,7 @@ public void replaceBlock(final ExtendedBlock block,
}
private long elapsed() {
- return now() - opStartTime;
+ return monotonicNow() - opStartTime;
}
/**
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DirectoryScanner.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DirectoryScanner.java
index c7ee21e5ca6d7..61dfb14b9f4f9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DirectoryScanner.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DirectoryScanner.java
@@ -443,13 +443,14 @@ void scan() {
int d = 0; // index for blockpoolReport
int m = 0; // index for memReprot
while (m < memReport.length && d < blockpoolReport.length) {
- FinalizedReplica memBlock = memReport[Math.min(m, memReport.length - 1)];
- ScanInfo info = blockpoolReport[Math.min(
- d, blockpoolReport.length - 1)];
+ FinalizedReplica memBlock = memReport[m];
+ ScanInfo info = blockpoolReport[d];
if (info.getBlockId() < memBlock.getBlockId()) {
- // Block is missing in memory
- statsRecord.missingMemoryBlocks++;
- addDifference(diffRecord, statsRecord, info);
+ if (!dataset.isDeletingBlock(bpid, info.getBlockId())) {
+ // Block is missing in memory
+ statsRecord.missingMemoryBlocks++;
+ addDifference(diffRecord, statsRecord, info);
+ }
d++;
continue;
}
@@ -495,8 +496,11 @@ void scan() {
current.getBlockId(), current.getVolume());
}
while (d < blockpoolReport.length) {
- statsRecord.missingMemoryBlocks++;
- addDifference(diffRecord, statsRecord, blockpoolReport[d++]);
+ if (!dataset.isDeletingBlock(bpid, blockpoolReport[d].getBlockId())) {
+ statsRecord.missingMemoryBlocks++;
+ addDifference(diffRecord, statsRecord, blockpoolReport[d]);
+ }
+ d++;
}
LOG.info(statsRecord.toString());
} //end for
@@ -633,7 +637,7 @@ private LinkedList compileReport(FsVolumeSpi vol,
continue;
}
if (!Block.isBlockFilename(files[i])) {
- if (isBlockMetaFile("blk_", files[i].getName())) {
+ if (isBlockMetaFile(Block.BLOCK_FILE_PREFIX, files[i].getName())) {
long blockId = Block.getBlockId(files[i].getName());
verifyFileLocation(files[i].getParentFile(), bpFinalizedDir,
blockId);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ShortCircuitRegistry.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ShortCircuitRegistry.java
index 965b40a0c4504..b32c0d167c5a2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ShortCircuitRegistry.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ShortCircuitRegistry.java
@@ -30,6 +30,7 @@
import java.util.Iterator;
import java.util.Set;
+import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -83,7 +84,7 @@ public class ShortCircuitRegistry {
private static final int SHM_LENGTH = 8192;
- private static class RegisteredShm extends ShortCircuitShm
+ public static class RegisteredShm extends ShortCircuitShm
implements DomainSocketWatcher.Handler {
private final String clientName;
private final ShortCircuitRegistry registry;
@@ -176,7 +177,7 @@ public ShortCircuitRegistry(Configuration conf) throws IOException {
if (dswLoadingFailure != null) {
throw new IOException(dswLoadingFailure);
}
- watcher = new DomainSocketWatcher(interruptCheck);
+ watcher = new DomainSocketWatcher(interruptCheck, "datanode");
enabled = true;
if (LOG.isDebugEnabled()) {
LOG.debug("created new ShortCircuitRegistry with interruptCheck=" +
@@ -383,4 +384,14 @@ public void shutdown() {
}
IOUtils.closeQuietly(watcher);
}
+
+ public static interface Visitor {
+ void accept(HashMap segments,
+ HashMultimap slots);
+ }
+
+ @VisibleForTesting
+ public synchronized void visit(Visitor visitor) {
+ visitor.accept(segments, slots);
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java
index 10c83694f9e85..8a741de3a0ecc 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java
@@ -27,6 +27,7 @@
import java.util.Collection;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
@@ -113,9 +114,11 @@ public void addVolume(
* If the FSDataset supports block scanning, this function removes
* the volumes from the block scanner.
*
- * @param volumes The storage locations of the volumes to remove.
+ * @param volumes The paths of the volumes to be removed.
+ * @param clearFailure set true to clear the failure information about the
+ * volumes.
*/
- public void removeVolumes(Collection volumes);
+ public void removeVolumes(Set volumes, boolean clearFailure);
/** @return a storage with the given storage ID */
public DatanodeStorage getStorage(final String storageUuid);
@@ -388,9 +391,9 @@ public void checkBlock(ExtendedBlock b, long minLength, ReplicaState state)
/**
* Check if all the data directories are healthy
- * @throws DiskErrorException
+ * @return A set of unhealthy data directories.
*/
- public void checkDataDir() throws DiskErrorException;
+ public Set checkDataDir();
/**
* Shutdown the FSDataset
@@ -487,9 +490,9 @@ public HdfsBlocksMetadata getHdfsBlocksMetadata(String bpid,
public void enableTrash(String bpid);
/**
- * Restore trash
+ * Clear trash
*/
- public void restoreTrash(String bpid);
+ public void clearTrash(String bpid);
/**
* @return true when trash is enabled
@@ -543,4 +546,9 @@ public ReplicaInfo moveBlockAcrossStorage(final ExtendedBlock block,
* Check whether the block was pinned
*/
public boolean getPinning(ExtendedBlock block) throws IOException;
+
+ /**
+ * Confirm whether the block is being deleted
+ */
+ public boolean isDeletingBlock(String bpid, long blockId);
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/BlockPoolSlice.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/BlockPoolSlice.java
index 5a69e1e4d666d..6daf03944ecd0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/BlockPoolSlice.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/BlockPoolSlice.java
@@ -23,12 +23,12 @@
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
-import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.RandomAccessFile;
import java.io.Writer;
+import java.util.Iterator;
import java.util.Scanner;
import org.apache.commons.io.FileUtils;
@@ -39,6 +39,8 @@
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
+import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportReplica;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
import org.apache.hadoop.hdfs.server.datanode.DataStorage;
@@ -55,6 +57,7 @@
import org.apache.hadoop.util.ShutdownHookManager;
import org.apache.hadoop.util.Time;
+import com.google.common.io.Files;
/**
* A block pool slice represents a portion of a block pool stored on a volume.
* Taken together, all BlockPoolSlices sharing a block pool ID across a
@@ -77,7 +80,9 @@ class BlockPoolSlice {
private volatile boolean dfsUsedSaved = false;
private static final int SHUTDOWN_HOOK_PRIORITY = 30;
private final boolean deleteDuplicateReplicas;
-
+ private static final String REPLICA_CACHE_FILE = "replicas";
+ private final long replicaCacheExpiry = 5*60*1000;
+
// TODO:FEDERATION scalability issue - a thread per DU is needed
private final DU dfsUsage;
@@ -310,11 +315,14 @@ void getVolumeMap(ReplicaMap volumeMap,
FsDatasetImpl.LOG.info(
"Recovered " + numRecovered + " replicas from " + lazypersistDir);
}
-
- // add finalized replicas
- addToReplicasMap(volumeMap, finalizedDir, lazyWriteReplicaMap, true);
- // add rbw replicas
- addToReplicasMap(volumeMap, rbwDir, lazyWriteReplicaMap, false);
+
+ boolean success = readReplicasFromCache(volumeMap, lazyWriteReplicaMap);
+ if (!success) {
+ // add finalized replicas
+ addToReplicasMap(volumeMap, finalizedDir, lazyWriteReplicaMap, true);
+ // add rbw replicas
+ addToReplicasMap(volumeMap, rbwDir, lazyWriteReplicaMap, false);
+ }
}
/**
@@ -401,6 +409,75 @@ private int moveLazyPersistReplicasToFinalized(File source)
FileUtil.fullyDelete(source);
return numRecovered;
}
+
+ private void addReplicaToReplicasMap(Block block, ReplicaMap volumeMap,
+ final RamDiskReplicaTracker lazyWriteReplicaMap,boolean isFinalized)
+ throws IOException {
+ ReplicaInfo newReplica = null;
+ long blockId = block.getBlockId();
+ long genStamp = block.getGenerationStamp();
+ if (isFinalized) {
+ newReplica = new FinalizedReplica(blockId,
+ block.getNumBytes(), genStamp, volume, DatanodeUtil
+ .idToBlockDir(finalizedDir, blockId));
+ } else {
+ File file = new File(rbwDir, block.getBlockName());
+ boolean loadRwr = true;
+ File restartMeta = new File(file.getParent() +
+ File.pathSeparator + "." + file.getName() + ".restart");
+ Scanner sc = null;
+ try {
+ sc = new Scanner(restartMeta, "UTF-8");
+ // The restart meta file exists
+ if (sc.hasNextLong() && (sc.nextLong() > Time.now())) {
+ // It didn't expire. Load the replica as a RBW.
+ // We don't know the expected block length, so just use 0
+ // and don't reserve any more space for writes.
+ newReplica = new ReplicaBeingWritten(blockId,
+ validateIntegrityAndSetLength(file, genStamp),
+ genStamp, volume, file.getParentFile(), null, 0);
+ loadRwr = false;
+ }
+ sc.close();
+ if (!restartMeta.delete()) {
+ FsDatasetImpl.LOG.warn("Failed to delete restart meta file: " +
+ restartMeta.getPath());
+ }
+ } catch (FileNotFoundException fnfe) {
+ // nothing to do here; the restart meta file does not exist
+ } finally {
+ if (sc != null) {
+ sc.close();
+ }
+ }
+ // Restart meta doesn't exist or expired.
+ if (loadRwr) {
+ newReplica = new ReplicaWaitingToBeRecovered(blockId,
+ validateIntegrityAndSetLength(file, genStamp),
+ genStamp, volume, file.getParentFile());
+ }
+ }
+
+ ReplicaInfo oldReplica = volumeMap.get(bpid, newReplica.getBlockId());
+ if (oldReplica == null) {
+ volumeMap.add(bpid, newReplica);
+ } else {
+ // We have multiple replicas of the same block so decide which one
+ // to keep.
+ newReplica = resolveDuplicateReplicas(newReplica, oldReplica, volumeMap);
+ }
+
+ // If we are retaining a replica on transient storage make sure
+ // it is in the lazyWriteReplicaMap so it can be persisted
+ // eventually.
+ if (newReplica.getVolume().isTransientStorage()) {
+ lazyWriteReplicaMap.addReplica(bpid, blockId,
+ (FsVolumeImpl) newReplica.getVolume());
+ } else {
+ lazyWriteReplicaMap.discardReplica(bpid, blockId, false);
+ }
+ }
+
/**
* Add replicas under the given directory to the volume map
@@ -434,66 +511,9 @@ void addToReplicasMap(ReplicaMap volumeMap, File dir,
long genStamp = FsDatasetUtil.getGenerationStampFromFile(
files, file);
long blockId = Block.filename2id(file.getName());
- ReplicaInfo newReplica = null;
- if (isFinalized) {
- newReplica = new FinalizedReplica(blockId,
- file.length(), genStamp, volume, file.getParentFile());
- } else {
-
- boolean loadRwr = true;
- File restartMeta = new File(file.getParent() +
- File.pathSeparator + "." + file.getName() + ".restart");
- Scanner sc = null;
- try {
- sc = new Scanner(restartMeta, "UTF-8");
- // The restart meta file exists
- if (sc.hasNextLong() && (sc.nextLong() > Time.now())) {
- // It didn't expire. Load the replica as a RBW.
- // We don't know the expected block length, so just use 0
- // and don't reserve any more space for writes.
- newReplica = new ReplicaBeingWritten(blockId,
- validateIntegrityAndSetLength(file, genStamp),
- genStamp, volume, file.getParentFile(), null, 0);
- loadRwr = false;
- }
- sc.close();
- if (!restartMeta.delete()) {
- FsDatasetImpl.LOG.warn("Failed to delete restart meta file: " +
- restartMeta.getPath());
- }
- } catch (FileNotFoundException fnfe) {
- // nothing to do hereFile dir =
- } finally {
- if (sc != null) {
- sc.close();
- }
- }
- // Restart meta doesn't exist or expired.
- if (loadRwr) {
- newReplica = new ReplicaWaitingToBeRecovered(blockId,
- validateIntegrityAndSetLength(file, genStamp),
- genStamp, volume, file.getParentFile());
- }
- }
-
- ReplicaInfo oldReplica = volumeMap.get(bpid, newReplica.getBlockId());
- if (oldReplica == null) {
- volumeMap.add(bpid, newReplica);
- } else {
- // We have multiple replicas of the same block so decide which one
- // to keep.
- newReplica = resolveDuplicateReplicas(newReplica, oldReplica, volumeMap);
- }
-
- // If we are retaining a replica on transient storage make sure
- // it is in the lazyWriteReplicaMap so it can be persisted
- // eventually.
- if (newReplica.getVolume().isTransientStorage()) {
- lazyWriteReplicaMap.addReplica(bpid, blockId,
- (FsVolumeImpl) newReplica.getVolume());
- } else {
- lazyWriteReplicaMap.discardReplica(bpid, blockId, false);
- }
+ Block block = new Block(blockId, file.length(), genStamp);
+ addReplicaToReplicasMap(block, volumeMap, lazyWriteReplicaMap,
+ isFinalized);
}
}
@@ -649,9 +669,121 @@ public String toString() {
return currentDir.getAbsolutePath();
}
- void shutdown() {
+ void shutdown(BlockListAsLongs blocksListToPersist) {
+ saveReplicas(blocksListToPersist);
saveDfsUsed();
dfsUsedSaved = true;
dfsUsage.shutdown();
}
+
+ private boolean readReplicasFromCache(ReplicaMap volumeMap,
+ final RamDiskReplicaTracker lazyWriteReplicaMap) {
+ ReplicaMap tmpReplicaMap = new ReplicaMap(this);
+ File replicaFile = new File(currentDir, REPLICA_CACHE_FILE);
+ // Check whether the file exists or not.
+ if (!replicaFile.exists()) {
+ LOG.info("Replica Cache file: "+ replicaFile.getPath() +
+ " doesn't exist ");
+ return false;
+ }
+ long fileLastModifiedTime = replicaFile.lastModified();
+ if (System.currentTimeMillis() > fileLastModifiedTime + replicaCacheExpiry) {
+ LOG.info("Replica Cache file: " + replicaFile.getPath() +
+ " has gone stale");
+ // Just to make findbugs happy
+ if (!replicaFile.delete()) {
+ LOG.info("Replica Cache file: " + replicaFile.getPath() +
+ " cannot be deleted");
+ }
+ return false;
+ }
+ FileInputStream inputStream = null;
+ try {
+ inputStream = new FileInputStream(replicaFile);
+ BlockListAsLongs blocksList = BlockListAsLongs.readFrom(inputStream);
+ Iterator iterator = blocksList.iterator();
+ while (iterator.hasNext()) {
+ BlockReportReplica replica = iterator.next();
+ switch (replica.getState()) {
+ case FINALIZED:
+ addReplicaToReplicasMap(replica, tmpReplicaMap, lazyWriteReplicaMap, true);
+ break;
+ case RUR:
+ case RBW:
+ case RWR:
+ addReplicaToReplicasMap(replica, tmpReplicaMap, lazyWriteReplicaMap, false);
+ break;
+ default:
+ break;
+ }
+ }
+ inputStream.close();
+ // Now it is safe to add the replica into volumeMap
+ // In case of any exception during parsing this cache file, fall back
+ // to scan all the files on disk.
+ for (ReplicaInfo info: tmpReplicaMap.replicas(bpid)) {
+ volumeMap.add(bpid, info);
+ }
+ LOG.info("Successfully read replica from cache file : "
+ + replicaFile.getPath());
+ return true;
+ } catch (Exception e) {
+ // Any exception we need to revert back to read from disk
+ // Log the error and return false
+ LOG.info("Exception occured while reading the replicas cache file: "
+ + replicaFile.getPath(), e );
+ return false;
+ }
+ finally {
+ if (!replicaFile.delete()) {
+ LOG.info("Failed to delete replica cache file: " +
+ replicaFile.getPath());
+ }
+ // close the inputStream
+ IOUtils.closeStream(inputStream);
+ }
+ }
+
+ private void saveReplicas(BlockListAsLongs blocksListToPersist) {
+ if (blocksListToPersist == null ||
+ blocksListToPersist.getNumberOfBlocks()== 0) {
+ return;
+ }
+ File tmpFile = new File(currentDir, REPLICA_CACHE_FILE + ".tmp");
+ if (tmpFile.exists() && !tmpFile.delete()) {
+ LOG.warn("Failed to delete tmp replicas file in " +
+ tmpFile.getPath());
+ return;
+ }
+ File replicaCacheFile = new File(currentDir, REPLICA_CACHE_FILE);
+ if (replicaCacheFile.exists() && !replicaCacheFile.delete()) {
+ LOG.warn("Failed to delete replicas file in " +
+ replicaCacheFile.getPath());
+ return;
+ }
+
+ FileOutputStream out = null;
+ try {
+ out = new FileOutputStream(tmpFile);
+ blocksListToPersist.writeTo(out);
+ out.close();
+ // Renaming the tmp file to replicas
+ Files.move(tmpFile, replicaCacheFile);
+ } catch (Exception e) {
+ // If write failed, the volume might be bad. Since the cache file is
+ // not critical, log the error, delete both the files (tmp and cache)
+ // and continue.
+ LOG.warn("Failed to write replicas to cache ", e);
+ if (replicaCacheFile.exists() && !replicaCacheFile.delete()) {
+ LOG.warn("Failed to delete replicas file: " +
+ replicaCacheFile.getPath());
+ }
+ } finally {
+ IOUtils.closeStream(out);
+ if (tmpFile.exists() && !tmpFile.delete()) {
+ LOG.warn("Failed to delete tmp file in " +
+ tmpFile.getPath());
+ }
+ }
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetAsyncDiskService.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetAsyncDiskService.java
index 13e854f0b7c91..c1d3990e22bea 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetAsyncDiskService.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetAsyncDiskService.java
@@ -22,7 +22,10 @@
import java.io.FileDescriptor;
import java.io.IOException;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
@@ -64,9 +67,14 @@ class FsDatasetAsyncDiskService {
private static final long THREADS_KEEP_ALIVE_SECONDS = 60;
private final DataNode datanode;
+ private final FsDatasetImpl fsdatasetImpl;
private final ThreadGroup threadGroup;
private Map executors
= new HashMap();
+ private Map> deletedBlockIds
+ = new HashMap>();
+ private static final int MAX_DELETED_BLOCKS = 64;
+ private int numDeletedBlocks = 0;
/**
* Create a AsyncDiskServices with a set of volumes (specified by their
@@ -75,8 +83,9 @@ class FsDatasetAsyncDiskService {
* The AsyncDiskServices uses one ThreadPool per volume to do the async
* disk operations.
*/
- FsDatasetAsyncDiskService(DataNode datanode) {
+ FsDatasetAsyncDiskService(DataNode datanode, FsDatasetImpl fsdatasetImpl) {
this.datanode = datanode;
+ this.fsdatasetImpl = fsdatasetImpl;
this.threadGroup = new ThreadGroup(getClass().getSimpleName());
}
@@ -286,7 +295,27 @@ public void run() {
LOG.info("Deleted " + block.getBlockPoolId() + " "
+ block.getLocalBlock() + " file " + blockFile);
}
+ updateDeletedBlockId(block);
IOUtils.cleanup(null, volumeRef);
}
}
+
+ private synchronized void updateDeletedBlockId(ExtendedBlock block) {
+ Set blockIds = deletedBlockIds.get(block.getBlockPoolId());
+ if (blockIds == null) {
+ blockIds = new HashSet();
+ deletedBlockIds.put(block.getBlockPoolId(), blockIds);
+ }
+ blockIds.add(block.getBlockId());
+ numDeletedBlocks++;
+ if (numDeletedBlocks == MAX_DELETED_BLOCKS) {
+ for (Entry> e : deletedBlockIds.entrySet()) {
+ String bpid = e.getKey();
+ Set bs = e.getValue();
+ fsdatasetImpl.removeDeletedBlocks(bpid, bs);
+ bs.clear();
+ }
+ numDeletedBlocks = 0;
+ }
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java
index cc6220aa80eb5..f15f6493b3065 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java
@@ -46,6 +46,7 @@
import javax.management.ObjectName;
import javax.management.StandardMBean;
+import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
@@ -236,6 +237,7 @@ public LengthInputStream getMetaDataInputStream(ExtendedBlock b)
private volatile boolean fsRunning;
final ReplicaMap volumeMap;
+ final Map> deletingBlock;
final RamDiskReplicaTracker ramDiskReplicaTracker;
final RamDiskAsyncLazyPersistService asyncLazyPersistService;
@@ -274,8 +276,10 @@ public LengthInputStream getMetaDataInputStream(ExtendedBlock b)
this.validVolsRequired = volsConfigured - volFailuresTolerated;
if (volFailuresTolerated < 0 || volFailuresTolerated >= volsConfigured) {
- throw new DiskErrorException("Invalid volume failure "
- + " config value: " + volFailuresTolerated);
+ throw new DiskErrorException("Invalid value configured for "
+ + "dfs.datanode.failed.volumes.tolerated - " + volFailuresTolerated
+ + ". Value configured is either less than 0 or >= "
+ + "to the number of configured volumes (" + volsConfigured + ").");
}
if (volsFailed > volFailuresTolerated) {
throw new DiskErrorException("Too many failed volumes - "
@@ -297,8 +301,9 @@ public LengthInputStream getMetaDataInputStream(ExtendedBlock b)
VolumeChoosingPolicy.class), conf);
volumes = new FsVolumeList(volumeFailureInfos, datanode.getBlockScanner(),
blockChooserImpl);
- asyncDiskService = new FsDatasetAsyncDiskService(datanode);
+ asyncDiskService = new FsDatasetAsyncDiskService(datanode, this);
asyncLazyPersistService = new RamDiskAsyncLazyPersistService(datanode);
+ deletingBlock = new HashMap>();
for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) {
addVolume(dataLocations, storage.getStorageDir(idx));
@@ -375,6 +380,12 @@ private void addVolume(Collection dataLocations,
LOG.info("Added volume - " + dir + ", StorageType: " + storageType);
}
+ @VisibleForTesting
+ public FsVolumeImpl createFsVolume(String storageUuid, File currentDir,
+ StorageType storageType) throws IOException {
+ return new FsVolumeImpl(this, storageUuid, currentDir, conf, storageType);
+ }
+
@Override
public void addVolume(final StorageLocation location,
final List nsInfos)
@@ -394,8 +405,8 @@ public void addVolume(final StorageLocation location,
final Storage.StorageDirectory sd = builder.getStorageDirectory();
StorageType storageType = location.getStorageType();
- final FsVolumeImpl fsVolume = new FsVolumeImpl(
- this, sd.getStorageUuid(), sd.getCurrentDir(), this.conf, storageType);
+ final FsVolumeImpl fsVolume =
+ createFsVolume(sd.getStorageUuid(), sd.getCurrentDir(), storageType);
final ReplicaMap tempVolumeMap = new ReplicaMap(fsVolume);
ArrayList exceptions = Lists.newArrayList();
@@ -411,6 +422,11 @@ public void addVolume(final StorageLocation location,
}
}
if (!exceptions.isEmpty()) {
+ try {
+ sd.unlock();
+ } catch (IOException e) {
+ exceptions.add(e);
+ }
throw MultipleIOException.createIOException(exceptions);
}
@@ -431,41 +447,42 @@ public void addVolume(final StorageLocation location,
}
/**
- * Removes a collection of volumes from FsDataset.
- * @param volumes the root directories of the volumes.
+ * Removes a set of volumes from FsDataset.
+ * @param volumesToRemove a set of absolute root path of each volume.
+ * @param clearFailure set true to clear failure information.
*
* DataNode should call this function before calling
* {@link DataStorage#removeVolumes(java.util.Collection)}.
*/
@Override
- public synchronized void removeVolumes(Collection volumes) {
- Set volumeSet = new HashSet<>();
- for (StorageLocation sl : volumes) {
- volumeSet.add(sl.getFile().getAbsolutePath());
+ public synchronized void removeVolumes(
+ Set volumesToRemove, boolean clearFailure) {
+ // Make sure that all volumes are absolute path.
+ for (File vol : volumesToRemove) {
+ Preconditions.checkArgument(vol.isAbsolute(),
+ String.format("%s is not absolute path.", vol.getPath()));
}
for (int idx = 0; idx < dataStorage.getNumStorageDirs(); idx++) {
Storage.StorageDirectory sd = dataStorage.getStorageDir(idx);
- String volume = sd.getRoot().getAbsolutePath();
- if (volumeSet.contains(volume)) {
- LOG.info("Removing " + volume + " from FsDataset.");
+ final File absRoot = sd.getRoot().getAbsoluteFile();
+ if (volumesToRemove.contains(absRoot)) {
+ LOG.info("Removing " + absRoot + " from FsDataset.");
// Disable the volume from the service.
asyncDiskService.removeVolume(sd.getCurrentDir());
- this.volumes.removeVolume(sd.getRoot());
+ volumes.removeVolume(absRoot, clearFailure);
// Removed all replica information for the blocks on the volume. Unlike
// updating the volumeMap in addVolume(), this operation does not scan
// disks.
for (String bpid : volumeMap.getBlockPoolList()) {
- List blocks = new ArrayList();
for (Iterator it = volumeMap.replicas(bpid).iterator();
- it.hasNext(); ) {
+ it.hasNext(); ) {
ReplicaInfo block = it.next();
- String absBasePath =
- new File(block.getVolume().getBasePath()).getAbsolutePath();
- if (absBasePath.equals(volume)) {
+ final File absBasePath =
+ new File(block.getVolume().getBasePath()).getAbsoluteFile();
+ if (absBasePath.equals(absRoot)) {
invalidate(bpid, block);
- blocks.add(block);
it.remove();
}
}
@@ -1560,30 +1577,26 @@ public Map getBlockReports(String bpid) {
Map blockReportsMap =
new HashMap();
- Map> finalized =
- new HashMap>();
- Map> uc =
- new HashMap>();
+ Map builders =
+ new HashMap();
List curVolumes = getVolumes();
for (FsVolumeSpi v : curVolumes) {
- finalized.put(v.getStorageID(), new ArrayList());
- uc.put(v.getStorageID(), new ArrayList());
+ builders.put(v.getStorageID(), BlockListAsLongs.builder());
}
synchronized(this) {
for (ReplicaInfo b : volumeMap.replicas(bpid)) {
switch(b.getState()) {
case FINALIZED:
- finalized.get(b.getVolume().getStorageID()).add(b);
- break;
case RBW:
case RWR:
- uc.get(b.getVolume().getStorageID()).add(b);
+ builders.get(b.getVolume().getStorageID()).add(b);
break;
case RUR:
ReplicaUnderRecovery rur = (ReplicaUnderRecovery)b;
- uc.get(rur.getVolume().getStorageID()).add(rur.getOriginalReplica());
+ builders.get(rur.getVolume().getStorageID())
+ .add(rur.getOriginalReplica());
break;
case TEMPORARY:
break;
@@ -1594,10 +1607,8 @@ public Map getBlockReports(String bpid) {
}
for (FsVolumeImpl v : curVolumes) {
- ArrayList finalizedList = finalized.get(v.getStorageID());
- ArrayList ucList = uc.get(v.getStorageID());
blockReportsMap.put(v.toDatanodeStorage(),
- new BlockListAsLongs(finalizedList, ucList));
+ builders.get(v.getStorageID()).build());
}
return blockReportsMap;
@@ -1783,7 +1794,12 @@ public void invalidate(String bpid, Block invalidBlks[]) throws IOException {
+ ". Parent not found for file " + f);
continue;
}
- volumeMap.remove(bpid, invalidBlks[i]);
+ ReplicaInfo removing = volumeMap.remove(bpid, invalidBlks[i]);
+ addDeletingBlock(bpid, removing.getBlockId());
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Block file " + removing.getBlockFile().getName()
+ + " is to be deleted");
+ }
}
if (v.isTransientStorage()) {
@@ -1956,50 +1972,14 @@ File getFile(final String bpid, final long blockId, boolean touch) {
/**
* check if a data directory is healthy
- * if some volumes failed - make sure to remove all the blocks that belong
- * to these volumes
- * @throws DiskErrorException
+ *
+ * if some volumes failed - the caller must remove all the blocks that belong
+ * to these failed volumes.
+ * @return the failed volumes. Returns null if no volume failed.
*/
@Override // FsDatasetSpi
- public void checkDataDir() throws DiskErrorException {
- long totalBlocks=0, removedBlocks=0;
- List