Commit 76bc412

HBASE-26347 Support detect and exclude slow DNs in fan-out of WAL (#3800)
Signed-off-by: Duo Zhang <[email protected]>
1 parent 2b05e68 commit 76bc412

File tree: 18 files changed, +592 −72 lines

hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/AsyncFSOutputHelper.java

Lines changed: 3 additions & 2 deletions

@@ -23,6 +23,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.StreamCapabilities;
+import org.apache.hadoop.hbase.io.asyncfs.monitor.StreamSlowMonitor;
 import org.apache.hadoop.hbase.util.CommonFSUtils;
 import org.apache.hadoop.hbase.util.CommonFSUtils.StreamLacksCapabilityException;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
@@ -47,11 +48,11 @@ private AsyncFSOutputHelper() {
    */
   public static AsyncFSOutput createOutput(FileSystem fs, Path f, boolean overwrite,
       boolean createParent, short replication, long blockSize, EventLoopGroup eventLoopGroup,
-      Class<? extends Channel> channelClass)
+      Class<? extends Channel> channelClass, StreamSlowMonitor monitor)
       throws IOException, CommonFSUtils.StreamLacksCapabilityException {
     if (fs instanceof DistributedFileSystem) {
       return FanOutOneBlockAsyncDFSOutputHelper.createOutput((DistributedFileSystem) fs, f,
-        overwrite, createParent, replication, blockSize, eventLoopGroup, channelClass);
+        overwrite, createParent, replication, blockSize, eventLoopGroup, channelClass, monitor);
     }
     final FSDataOutputStream out;
     int bufferSize = fs.getConf().getInt(CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY,
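The only change here is threading a StreamSlowMonitor through createOutput. A minimal caller sketch, assuming a StreamSlowMonitor.create(Configuration, String) factory added elsewhere in this commit (not shown in this excerpt); the path, replication, and block size values are illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.asyncfs.AsyncFSOutput;
import org.apache.hadoop.hbase.io.asyncfs.AsyncFSOutputHelper;
import org.apache.hadoop.hbase.io.asyncfs.monitor.StreamSlowMonitor;
import org.apache.hbase.thirdparty.io.netty.channel.nio.NioEventLoopGroup;
import org.apache.hbase.thirdparty.io.netty.channel.socket.nio.NioSocketChannel;

public class CreateOutputSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    // Assumed factory: the monitor owns the ExcludeDatanodeManager that
    // createOutput consults when asking the namenode for a block.
    StreamSlowMonitor monitor = StreamSlowMonitor.create(conf, "example-wal");
    AsyncFSOutput out = AsyncFSOutputHelper.createOutput(fs, new Path("/example/wal"),
      true /* overwrite */, false /* createParent */, (short) 3, 64 * 1024 * 1024L,
      new NioEventLoopGroup(), NioSocketChannel.class, monitor);
    try {
      out.write(new byte[] { 1, 2, 3 });
      out.flush(false); // returns a CompletableFuture<Long> of the acked length
    } finally {
      out.close();
    }
  }
}

The per-DN latency tracking that feeds the monitor lives in FanOutOneBlockAsyncDFSOutput, the next file.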

hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/FanOutOneBlockAsyncDFSOutput.java

Lines changed: 38 additions & 18 deletions

@@ -32,7 +32,7 @@
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Iterator;
-import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ConcurrentHashMap;
@@ -45,7 +45,9 @@
 import org.apache.hadoop.crypto.Encryptor;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.io.asyncfs.FanOutOneBlockAsyncDFSOutputHelper.CancelOnClose;
+import org.apache.hadoop.hbase.io.asyncfs.monitor.StreamSlowMonitor;
 import org.apache.hadoop.hbase.util.CancelableProgressable;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils;
 import org.apache.hadoop.hdfs.DFSClient;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
@@ -68,6 +70,7 @@
 import org.apache.hbase.thirdparty.io.netty.channel.ChannelHandler.Sharable;
 import org.apache.hbase.thirdparty.io.netty.channel.ChannelHandlerContext;
 import org.apache.hbase.thirdparty.io.netty.channel.ChannelId;
+import org.apache.hbase.thirdparty.io.netty.channel.ChannelOutboundInvoker;
 import org.apache.hbase.thirdparty.io.netty.channel.SimpleChannelInboundHandler;
 import org.apache.hbase.thirdparty.io.netty.handler.codec.protobuf.ProtobufVarint32FrameDecoder;
 import org.apache.hbase.thirdparty.io.netty.handler.timeout.IdleStateEvent;
@@ -121,7 +124,7 @@ public class FanOutOneBlockAsyncDFSOutput implements AsyncFSOutput {
 
   private final Encryptor encryptor;
 
-  private final List<Channel> datanodeList;
+  private final Map<Channel, DatanodeInfo> datanodeInfoMap;
 
   private final DataChecksum summer;
 
@@ -137,17 +140,22 @@ private static final class Callback {
 
     // should be backed by a thread safe collection
     private final Set<ChannelId> unfinishedReplicas;
+    private final long packetDataLen;
+    private final long flushTimestamp;
+    private long lastAckTimestamp = -1;
 
     public Callback(CompletableFuture<Long> future, long ackedLength,
-        Collection<Channel> replicas) {
+        final Collection<Channel> replicas, long packetDataLen) {
       this.future = future;
       this.ackedLength = ackedLength;
+      this.packetDataLen = packetDataLen;
+      this.flushTimestamp = EnvironmentEdgeManager.currentTime();
       if (replicas.isEmpty()) {
         this.unfinishedReplicas = Collections.emptySet();
       } else {
         this.unfinishedReplicas =
           Collections.newSetFromMap(new ConcurrentHashMap<ChannelId, Boolean>(replicas.size()));
-        replicas.stream().map(c -> c.id()).forEachOrdered(unfinishedReplicas::add);
+        replicas.stream().map(Channel::id).forEachOrdered(unfinishedReplicas::add);
       }
     }
   }
@@ -177,13 +185,19 @@ private enum State {
 
   private volatile State state;
 
+  private final StreamSlowMonitor streamSlowMonitor;
+
   // all lock-free to make it run faster
   private void completed(Channel channel) {
     for (Iterator<Callback> iter = waitingAckQueue.iterator(); iter.hasNext();) {
       Callback c = iter.next();
       // if the current unfinished replicas does not contain us then it means that we have already
       // acked this one, let's iterate to find the one we have not acked yet.
       if (c.unfinishedReplicas.remove(channel.id())) {
+        long current = EnvironmentEdgeManager.currentTime();
+        streamSlowMonitor.checkProcessTimeAndSpeed(datanodeInfoMap.get(channel), c.packetDataLen,
+          current - c.flushTimestamp, c.lastAckTimestamp, c.unfinishedReplicas.size());
+        c.lastAckTimestamp = current;
         if (c.unfinishedReplicas.isEmpty()) {
           // we need to remove first before complete the future. It is possible that after we
           // complete the future the upper layer will call close immediately before we remove the
@@ -246,7 +260,7 @@ private synchronized void failed(Channel channel, Supplier<Throwable> errorSuppl
       }
       break;
     }
-    datanodeList.forEach(ch -> ch.close());
+    datanodeInfoMap.keySet().forEach(ChannelOutboundInvoker::close);
   }
 
   @Sharable
@@ -314,7 +328,7 @@ public void userEventTriggered(ChannelHandlerContext ctx, Object evt) throws Exc
 
   private void setupReceiver(int timeoutMs) {
     AckHandler ackHandler = new AckHandler(timeoutMs);
-    for (Channel ch : datanodeList) {
+    for (Channel ch : datanodeInfoMap.keySet()) {
       ch.pipeline().addLast(
         new IdleStateHandler(timeoutMs, timeoutMs / 2, 0, TimeUnit.MILLISECONDS),
         new ProtobufVarint32FrameDecoder(),
@@ -325,8 +339,8 @@ private void setupReceiver(int timeoutMs) {
 
   FanOutOneBlockAsyncDFSOutput(Configuration conf, DistributedFileSystem dfs,
       DFSClient client, ClientProtocol namenode, String clientName, String src, long fileId,
-      LocatedBlock locatedBlock, Encryptor encryptor, List<Channel> datanodeList,
-      DataChecksum summer, ByteBufAllocator alloc) {
+      LocatedBlock locatedBlock, Encryptor encryptor, Map<Channel, DatanodeInfo> datanodeInfoMap,
+      DataChecksum summer, ByteBufAllocator alloc, StreamSlowMonitor streamSlowMonitor) {
     this.conf = conf;
     this.dfs = dfs;
     this.client = client;
@@ -337,13 +351,14 @@ private void setupReceiver(int timeoutMs) {
     this.block = locatedBlock.getBlock();
     this.locations = locatedBlock.getLocations();
     this.encryptor = encryptor;
-    this.datanodeList = datanodeList;
+    this.datanodeInfoMap = datanodeInfoMap;
     this.summer = summer;
     this.maxDataLen = MAX_DATA_LEN - (MAX_DATA_LEN % summer.getBytesPerChecksum());
     this.alloc = alloc;
     this.buf = alloc.directBuffer(sendBufSizePRedictor.initialSize());
     this.state = State.STREAMING;
     setupReceiver(conf.getInt(DFS_CLIENT_SOCKET_TIMEOUT_KEY, READ_TIMEOUT));
+    this.streamSlowMonitor = streamSlowMonitor;
   }
 
   @Override
@@ -395,7 +410,8 @@ private void flushBuffer(CompletableFuture<Long> future, ByteBuf dataBuf,
     ByteBuf headerBuf = alloc.buffer(headerLen);
     header.putInBuffer(headerBuf.nioBuffer(0, headerLen));
     headerBuf.writerIndex(headerLen);
-    Callback c = new Callback(future, nextPacketOffsetInBlock + dataLen, datanodeList);
+    Callback c = new Callback(future, nextPacketOffsetInBlock + dataLen,
+      datanodeInfoMap.keySet(), dataLen);
     waitingAckQueue.addLast(c);
     // recheck again after we pushed the callback to queue
     if (state != State.STREAMING && waitingAckQueue.peekFirst() == c) {
@@ -404,7 +420,9 @@ private void flushBuffer(CompletableFuture<Long> future, ByteBuf dataBuf,
       waitingAckQueue.removeFirst();
       return;
     }
-    datanodeList.forEach(ch -> {
+    // TODO: we should perhaps measure time taken per DN here;
+    // we could collect statistics per DN, and/or exclude bad nodes in createOutput.
+    datanodeInfoMap.keySet().forEach(ch -> {
       ch.write(headerBuf.retainedDuplicate());
       ch.write(checksumBuf.retainedDuplicate());
       ch.writeAndFlush(dataBuf.retainedDuplicate());
@@ -426,7 +444,7 @@ private void flush0(CompletableFuture<Long> future, boolean syncBlock) {
     long lengthAfterFlush = nextPacketOffsetInBlock + dataLen;
     Callback lastFlush = waitingAckQueue.peekLast();
     if (lastFlush != null) {
-      Callback c = new Callback(future, lengthAfterFlush, Collections.emptyList());
+      Callback c = new Callback(future, lengthAfterFlush, Collections.emptySet(), dataLen);
       waitingAckQueue.addLast(c);
       // recheck here if we have already removed the previous callback from the queue
       if (waitingAckQueue.peekFirst() == c) {
@@ -526,8 +544,8 @@ private void endBlock() throws IOException {
     header.putInBuffer(headerBuf.nioBuffer(0, headerLen));
     headerBuf.writerIndex(headerLen);
     CompletableFuture<Long> future = new CompletableFuture<>();
-    waitingAckQueue.add(new Callback(future, finalizedLength, datanodeList));
-    datanodeList.forEach(ch -> ch.writeAndFlush(headerBuf.retainedDuplicate()));
+    waitingAckQueue.add(new Callback(future, finalizedLength, datanodeInfoMap.keySet(), 0));
+    datanodeInfoMap.keySet().forEach(ch -> ch.writeAndFlush(headerBuf.retainedDuplicate()));
     headerBuf.release();
     try {
       future.get();
@@ -544,13 +562,14 @@ private void endBlock() throws IOException {
    * The close method when error occurred. Now we just call recoverFileLease.
    */
   @Override
+  @SuppressWarnings("FutureReturnValueIgnored")
   public void recoverAndClose(CancelableProgressable reporter) throws IOException {
     if (buf != null) {
       buf.release();
       buf = null;
     }
-    datanodeList.forEach(ch -> ch.close());
-    datanodeList.forEach(ch -> ch.closeFuture().awaitUninterruptibly());
+    datanodeInfoMap.keySet().forEach(ChannelOutboundInvoker::close);
+    datanodeInfoMap.keySet().forEach(ch -> ch.closeFuture().awaitUninterruptibly());
     endFileLease(client, fileId);
     RecoverLeaseFSUtils.recoverFileLease(dfs, new Path(src), conf,
       reporter == null ? new CancelOnClose(client) : reporter);
@@ -561,11 +580,12 @@ public void recoverAndClose(CancelableProgressable reporter) throws IOException
    * {@link #recoverAndClose(CancelableProgressable)} if this method throws an exception.
    */
   @Override
+  @SuppressWarnings("FutureReturnValueIgnored")
   public void close() throws IOException {
     endBlock();
     state = State.CLOSED;
-    datanodeList.forEach(ch -> ch.close());
-    datanodeList.forEach(ch -> ch.closeFuture().awaitUninterruptibly());
+    datanodeInfoMap.keySet().forEach(ChannelOutboundInvoker::close);
+    datanodeInfoMap.keySet().forEach(ch -> ch.closeFuture().awaitUninterruptibly());
     block.setNumBytes(ackedBlockLength);
     completeFile(client, namenode, src, clientName, block, fileId);
   }
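The Callback changes above turn every flushed packet into a timing probe: the constructor records the flush time and payload size, and each per-replica ack in completed() hands the elapsed time, packet size, previous ack time, and remaining replica count to the monitor. A standalone toy model of that bookkeeping (illustrative only; it stands in for the HBase classes and swaps EnvironmentEdgeManager for System.currentTimeMillis):

// Toy model of the timing fields added to Callback; names mirror the diff but
// this is not the HBase implementation.
public final class PacketTiming {
  final long packetDataLen;            // payload bytes in this packet
  final long flushTimestamp;           // taken when the packet is flushed to all DNs
  volatile long lastAckTimestamp = -1; // -1 until the first replica acks

  PacketTiming(long packetDataLen) {
    this.packetDataLen = packetDataLen;
    this.flushTimestamp = System.currentTimeMillis(); // EnvironmentEdgeManager in HBase
  }

  // Mirrors the hook in completed(Channel): called once per replica ack.
  void onReplicaAck(String datanode, int unfinishedReplicas) {
    long current = System.currentTimeMillis();
    long processTimeMs = current - flushTimestamp;
    // The real code hands these numbers to
    // streamSlowMonitor.checkProcessTimeAndSpeed(...), which can move the DN
    // into the exclude cache when it is consistently slow.
    System.out.printf("dn=%s bytes=%d tookMs=%d stillWaiting=%d%n",
      datanode, packetDataLen, processTimeMs, unfinishedReplicas);
    lastAckTimestamp = current;
  }

  public static void main(String[] args) {
    PacketTiming t = new PacketTiming(64 * 1024);
    t.onReplicaAck("dn-1", 2);
    t.onReplicaAck("dn-2", 1);
    t.onReplicaAck("dn-3", 0);
  }
}

Because unfinishedReplicas shrinks with each ack, the last replica to respond is reported with a count of zero, which lets the monitor single out the straggler in an otherwise healthy pipeline.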

hbase-asyncfs/src/main/java/org/apache/hadoop/hbase/io/asyncfs/FanOutOneBlockAsyncDFSOutputHelper.java

Lines changed: 24 additions & 13 deletions

@@ -33,9 +33,12 @@
 import java.lang.reflect.Method;
 import java.util.ArrayList;
 import java.util.EnumSet;
+import java.util.HashSet;
+import java.util.IdentityHashMap;
 import java.util.List;
+import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
-import org.apache.commons.lang3.ArrayUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.crypto.CryptoProtocolVersion;
 import org.apache.hadoop.crypto.Encryptor;
@@ -47,6 +50,8 @@
 import org.apache.hadoop.fs.UnresolvedLinkException;
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.hbase.client.ConnectionUtils;
+import org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager;
+import org.apache.hadoop.hbase.io.asyncfs.monitor.StreamSlowMonitor;
 import org.apache.hadoop.hbase.util.CancelableProgressable;
 import org.apache.hadoop.hdfs.DFSClient;
 import org.apache.hadoop.hdfs.DFSOutputStream;
@@ -128,8 +133,6 @@ private FanOutOneBlockAsyncDFSOutputHelper() {
   // Timeouts for communicating with DataNode for streaming writes/reads
   public static final int READ_TIMEOUT = 60 * 1000;
 
-  private static final DatanodeInfo[] EMPTY_DN_ARRAY = new DatanodeInfo[0];
-
   private interface LeaseManager {
 
     void begin(DFSClient client, long inodeId);
@@ -511,15 +514,20 @@ private static EnumSetWritable<CreateFlag> getCreateFlags(boolean overwrite) {
 
   private static FanOutOneBlockAsyncDFSOutput createOutput(DistributedFileSystem dfs, String src,
       boolean overwrite, boolean createParent, short replication, long blockSize,
-      EventLoopGroup eventLoopGroup, Class<? extends Channel> channelClass) throws IOException {
+      EventLoopGroup eventLoopGroup, Class<? extends Channel> channelClass,
+      StreamSlowMonitor monitor) throws IOException {
     Configuration conf = dfs.getConf();
     DFSClient client = dfs.getClient();
     String clientName = client.getClientName();
     ClientProtocol namenode = client.getNamenode();
     int createMaxRetries = conf.getInt(ASYNC_DFS_OUTPUT_CREATE_MAX_RETRIES,
       DEFAULT_ASYNC_DFS_OUTPUT_CREATE_MAX_RETRIES);
-    DatanodeInfo[] excludesNodes = EMPTY_DN_ARRAY;
+    ExcludeDatanodeManager excludeDatanodeManager = monitor.getExcludeDatanodeManager();
+    Set<DatanodeInfo> toExcludeNodes =
+      new HashSet<>(excludeDatanodeManager.getExcludeDNs().keySet());
     for (int retry = 0;; retry++) {
+      LOG.debug("When create output stream for {}, exclude list is {}, retry={}", src,
+        toExcludeNodes, retry);
       HdfsFileStatus stat;
       try {
         stat = FILE_CREATOR.create(namenode, src,
@@ -539,24 +547,26 @@ private static FanOutOneBlockAsyncDFSOutput createOutput(DistributedFileSystem d
       List<Future<Channel>> futureList = null;
       try {
         DataChecksum summer = createChecksum(client);
-        locatedBlock = namenode.addBlock(src, client.getClientName(), null, excludesNodes,
-          stat.getFileId(), null, null);
-        List<Channel> datanodeList = new ArrayList<>();
+        locatedBlock = namenode.addBlock(src, client.getClientName(), null,
+          toExcludeNodes.toArray(new DatanodeInfo[0]), stat.getFileId(), null, null);
+        Map<Channel, DatanodeInfo> datanodes = new IdentityHashMap<>();
         futureList = connectToDataNodes(conf, client, clientName, locatedBlock, 0L, 0L,
           PIPELINE_SETUP_CREATE, summer, eventLoopGroup, channelClass);
         for (int i = 0, n = futureList.size(); i < n; i++) {
+          DatanodeInfo datanodeInfo = locatedBlock.getLocations()[i];
           try {
-            datanodeList.add(futureList.get(i).syncUninterruptibly().getNow());
+            datanodes.put(futureList.get(i).syncUninterruptibly().getNow(), datanodeInfo);
           } catch (Exception e) {
             // exclude the broken DN next time
-            excludesNodes = ArrayUtils.add(excludesNodes, locatedBlock.getLocations()[i]);
+            toExcludeNodes.add(datanodeInfo);
+            excludeDatanodeManager.tryAddExcludeDN(datanodeInfo, "connect error");
             throw e;
           }
         }
         Encryptor encryptor = createEncryptor(conf, stat, client);
         FanOutOneBlockAsyncDFSOutput output =
           new FanOutOneBlockAsyncDFSOutput(conf, dfs, client, namenode, clientName, src,
-            stat.getFileId(), locatedBlock, encryptor, datanodeList, summer, ALLOC);
+            stat.getFileId(), locatedBlock, encryptor, datanodes, summer, ALLOC, monitor);
         succ = true;
         return output;
       } catch (RemoteException e) {
@@ -607,14 +617,15 @@ public void operationComplete(Future<Channel> future) throws Exception {
    */
   public static FanOutOneBlockAsyncDFSOutput createOutput(DistributedFileSystem dfs, Path f,
       boolean overwrite, boolean createParent, short replication, long blockSize,
-      EventLoopGroup eventLoopGroup, Class<? extends Channel> channelClass) throws IOException {
+      EventLoopGroup eventLoopGroup, Class<? extends Channel> channelClass,
+      final StreamSlowMonitor monitor) throws IOException {
     return new FileSystemLinkResolver<FanOutOneBlockAsyncDFSOutput>() {
 
       @Override
       public FanOutOneBlockAsyncDFSOutput doCall(Path p)
          throws IOException, UnresolvedLinkException {
        return createOutput(dfs, p.toUri().getPath(), overwrite, createParent, replication,
-          blockSize, eventLoopGroup, channelClass);
+          blockSize, eventLoopGroup, channelClass, monitor);
      }
 
      @Override
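The helper now seeds its per-create exclude set from the shared ExcludeDatanodeManager and grows it on connect failures, reporting each broken DN back through tryAddExcludeDN so later creates also avoid it. A skeleton of that control flow under stand-in types (DatanodeInfo, the addBlock RPC, and the netty connect are stubbed; only the loop shape and the two exclusion paths follow the diff):

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Skeleton of the retry/exclude loop in createOutput; stand-ins, not HBase code.
public final class ExcludeRetrySketch {

  interface Namenode {
    List<String> addBlock(Set<String> toExclude) throws IOException; // namenode.addBlock(...)
  }

  interface ExcludeManager {
    Set<String> currentlyExcluded();                // getExcludeDNs().keySet()
    void tryAddExcludeDN(String dn, String reason); // matches the call in the diff
  }

  static void connect(String dn) throws IOException {
    // stands in for futureList.get(i).syncUninterruptibly() on the netty future
  }

  static void createWithRetries(Namenode nn, ExcludeManager mgr, int createMaxRetries)
      throws IOException {
    // Seed from the shared manager, then grow the set locally across retries.
    Set<String> toExclude = new HashSet<>(mgr.currentlyExcluded());
    for (int retry = 0;; retry++) {
      List<String> pipeline = nn.addBlock(toExclude);
      String broken = null;
      for (String dn : pipeline) {
        try {
          connect(dn);
        } catch (IOException e) {
          broken = dn;
          break;
        }
      }
      if (broken == null) {
        return; // all replicas connected; the real code builds the output here
      }
      toExclude.add(broken);                        // skip it on this create's next attempt
      mgr.tryAddExcludeDN(broken, "connect error"); // and on future creates
      if (retry >= createMaxRetries) {
        throw new IOException("no healthy pipeline after " + createMaxRetries + " retries");
      }
    }
  }
}

In the real method the pipeline entries are netty Channels mapped to their DatanodeInfo in an IdentityHashMap, and that map is what gives completed() in FanOutOneBlockAsyncDFSOutput the DN identity to report per ack.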
