diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AsyncFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AsyncFSWAL.java
index c4ccb21d2a7a..47b254baeaaf 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AsyncFSWAL.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AsyncFSWAL.java
@@ -51,6 +51,7 @@
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.io.asyncfs.AsyncFSOutput;
 import org.apache.hadoop.hbase.io.asyncfs.monitor.StreamSlowMonitor;
+import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils;
 import org.apache.hadoop.hbase.wal.AsyncFSWALProvider;
 import org.apache.hadoop.hbase.wal.WALEdit;
 import org.apache.hadoop.hbase.wal.WALKeyImpl;
@@ -716,13 +717,27 @@ private void waitForSafePoint() {
     }
   }
 
+  /**
+   * Attempts to recover the file lease of {@code p}; called after the writer of that file failed
+   * to close. An {@link IOException} raised by the recovery is logged and not propagated.
+   */
+  private void recoverLease(FileSystem fs, Path p, Configuration conf) {
+    try {
+      RecoverLeaseFSUtils.recoverFileLease(fs, p, conf, null);
+    } catch (IOException ex) {
+      LOG.error("Unable to recover lease of " + p + " after several attempts. Give up.", ex);
+    }
+  }
+
   private void closeWriter(AsyncWriter writer, Path path) {
     inflightWALClosures.put(path.getName(), writer);
     closeExecutor.execute(() -> {
       try {
         writer.close();
       } catch (IOException e) {
-        LOG.warn("close old writer failed", e);
+        LOG.warn("close old writer failed for " + path, e);
+        // close failed, so fall back to recovering the file lease (best effort, errors are logged)
+        recoverLease(this.fs, path, conf);
       } finally {
         // call this even if the above close fails, as there is no other chance we can set closed to
         // true, it will not cause big problems.
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
index 42f0235f67bc..eb3b089c148a 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
@@ -49,6 +49,7 @@
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.ClassSize;
 import org.apache.hadoop.hbase.util.CommonFSUtils;
+import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.hbase.wal.FSHLogProvider;
 import org.apache.hadoop.hbase.wal.WALEdit;
@@ -455,15 +456,26 @@ private void closeWriter(Writer writer, Path path, boolean syncCloseCall) throws
       writer.close();
       span.addEvent("writer closed");
     } catch (IOException ioe) {
+      // Best effort only. NOTE(review): recoverFileLease appears to return normally both when it
+      // is a no-op (non-HDFS file systems) and when it gives up after its timeout - confirm.
+      // Either way a normal return must not suppress the unflushed entries / error tolerance
+      // checks below, so they run whatever the outcome of the recovery.
+      try {
+        RecoverLeaseFSUtils.recoverFileLease(fs, path, conf, null);
+      } catch (IOException ex) {
+        LOG.error("Unable to recover lease of WAL " + path + " after several attempts. Give up.",
+          ex);
+        // keep the recovery failure attached to the close failure that may be rethrown below
+        ioe.addSuppressed(ex);
+      }
       int errors = closeErrorCount.incrementAndGet();
       boolean hasUnflushedEntries = isUnflushedEntries();
       if (syncCloseCall && (hasUnflushedEntries || (errors > this.closeErrorsTolerated))) {
         LOG.error("Close of WAL " + path + " failed. Cause=\"" + ioe.getMessage() + "\", errors="
           + errors + ", hasUnflushedEntries=" + hasUnflushedEntries);
         throw ioe;
       }
       LOG.warn("Riding over failed WAL close of " + path
         + "; THIS FILE WAS NOT CLOSED BUT ALL EDITS SYNCED SO SHOULD BE OK", ioe);
     }
   }
 