From cb64f907032597467b77b1f0a154c0669ba04797 Mon Sep 17 00:00:00 2001 From: huaxiangsun Date: Wed, 8 Apr 2020 09:28:51 -0700 Subject: [PATCH] HBASE-24120 Flakey Test: TestReplicationAdminWithClusters timeout (#1441) Signed-off-by: stack Signed-off-by: Nick Dimiduk Signed-off-by: Duo Zhang --- .../regionserver/wal/ProtobufLogReader.java | 2 +- .../ReplicationRuntimeException.java | 40 +++++++++++++++++++ .../ReplicationSourceManager.java | 9 ++++- .../ReplicationSourceShipper.java | 5 ++- 4 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationRuntimeException.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/ProtobufLogReader.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/ProtobufLogReader.java index 96be61c99b4d..c48caad7c7ba 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/ProtobufLogReader.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/ProtobufLogReader.java @@ -445,7 +445,7 @@ private IOException extractHiddenEof(Exception ex) { && ex.getCause() != null && ex.getCause() instanceof IOException) { ioEx = (IOException)ex.getCause(); } - if (ioEx != null) { + if ((ioEx != null) && (ioEx.getMessage() != null)) { if (ioEx.getMessage().contains("EOF")) return ioEx; return null; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationRuntimeException.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationRuntimeException.java new file mode 100644 index 000000000000..81ec0d9129cd --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationRuntimeException.java @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication.regionserver; + +import org.apache.yetus.audience.InterfaceAudience; + +/** + * This exception is thrown when a replication source is terminated and source threads got + * interrupted. + * + * It is inherited from RuntimeException so that it can skip all the following processing logic + * and be propagated to the most top level and handled there. + */ +@InterfaceAudience.Private +public class ReplicationRuntimeException extends RuntimeException { + private static final long serialVersionUID = 1L; + + public ReplicationRuntimeException(String m, Throwable t) { + super(m, t); + } + + public ReplicationRuntimeException(String m) { + super(m); + } +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java index 60c49edacab1..16ff8e753d8a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java @@ -474,8 +474,13 @@ private void interruptOrAbortWhenFail(ReplicationQueueOperation op) { if (e.getCause() != null && e.getCause() instanceof KeeperException.SystemErrorException && e.getCause().getCause() != null && e.getCause() .getCause() instanceof InterruptedException) { - throw new RuntimeException( - "Thread is interrupted, the replication source may be terminated"); + // ReplicationRuntimeException(a RuntimeException) is thrown out here. The reason is + // that thread is interrupted deep down in the stack, it should pass the following + // processing logic and propagate to the most top layer which can handle this exception + // properly. In this specific case, the top layer is ReplicationSourceShipper#run(). + throw new ReplicationRuntimeException( + "Thread is interrupted, the replication source may be terminated", + e.getCause().getCause()); } server.abort("Failed to operate on replication queue", e); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java index 1e686478944f..9874c46b3e21 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java @@ -117,8 +117,9 @@ public final void run() { } else { shipEdits(entryBatch); } - } catch (InterruptedException e) { - LOG.trace("Interrupted while waiting for next replication entry batch", e); + } catch (InterruptedException | ReplicationRuntimeException e) { + // It is interrupted and needs to quit. + LOG.warn("Interrupted while waiting for next replication entry batch", e); Thread.currentThread().interrupt(); } }