Skip to content

Commit 98213df

Browse files
authored
Report more details of unobtainable ShardLock (#61255)
Today a common reason for a `ShardLockObtainFailedException` is when a shard is removed from a node and then assigned straight back to it again before the node has had a chance to shut the previous shard instance down. For instance, this can happen if a node briefly leaves the cluster holding a primary with no in-sync replicas. The message in this case is typically as follows: obtaining shard lock timed out after 5000ms, previous lock details: [shard creation] trying to lock for [shard creation] This is pretty hard to interpret, and doesn't raise the important question: "why didn't the shard shut down sooner?" With this change we reword the message a bit, report the age of the shard lock, and adjust the details to report that the lock is held by a closing shard: obtaining shard lock for [starting shard] timed out after [5000ms], lock already held for [closing shard] with age [12345ms] Relates #38807
1 parent f4f3a0b commit 98213df

File tree

4 files changed

+33
-7
lines changed

4 files changed

+33
-7
lines changed

server/src/main/java/org/elasticsearch/env/NodeEnvironment.java

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import org.elasticsearch.common.Randomness;
4242
import org.elasticsearch.common.SuppressForbidden;
4343
import org.elasticsearch.common.UUIDs;
44+
import org.elasticsearch.common.collect.Tuple;
4445
import org.elasticsearch.common.io.FileSystemUtils;
4546
import org.elasticsearch.common.lease.Releasable;
4647
import org.elasticsearch.common.settings.Setting;
@@ -786,6 +787,11 @@ protected void closeInternal() {
786787
shardLock.release();
787788
logger.trace("released shard lock for [{}]", shardId);
788789
}
790+
791+
@Override
792+
public void setDetails(String details) {
793+
shardLock.setDetails(details);
794+
}
789795
};
790796
}
791797

@@ -817,13 +823,13 @@ private final class InternalShardLock {
817823
*/
818824
private final Semaphore mutex = new Semaphore(1);
819825
private int waitCount = 1; // guarded by shardLocks
820-
private String lockDetails;
821826
private final ShardId shardId;
827+
private volatile Tuple<Long, String> lockDetails;
822828

823829
InternalShardLock(final ShardId shardId, final String details) {
824830
this.shardId = shardId;
825831
mutex.acquireUninterruptibly();
826-
lockDetails = details;
832+
lockDetails = Tuple.tuple(System.nanoTime(), details);
827833
}
828834

829835
protected void release() {
@@ -854,17 +860,23 @@ private void decWaitCount() {
854860
void acquire(long timeoutInMillis, final String details) throws ShardLockObtainFailedException {
855861
try {
856862
if (mutex.tryAcquire(timeoutInMillis, TimeUnit.MILLISECONDS)) {
857-
lockDetails = details;
863+
setDetails(details);
858864
} else {
865+
final Tuple<Long, String> lockDetails = this.lockDetails; // single volatile read
859866
throw new ShardLockObtainFailedException(shardId,
860-
"obtaining shard lock timed out after " + timeoutInMillis + "ms, previous lock details: [" + lockDetails +
861-
"] trying to lock for [" + details + "]");
867+
"obtaining shard lock for [" + details + "] timed out after [" + timeoutInMillis +
868+
"ms], lock already held for [" + lockDetails.v2() + "] with age [" +
869+
TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - lockDetails.v1()) + "ms]");
862870
}
863871
} catch (InterruptedException e) {
864872
Thread.currentThread().interrupt();
865873
throw new ShardLockObtainFailedException(shardId, "thread interrupted while trying to obtain shard lock", e);
866874
}
867875
}
876+
877+
public void setDetails(String details) {
878+
lockDetails = Tuple.tuple(System.nanoTime(), details);
879+
}
868880
}
869881

870882
public boolean hasNodeFile() {

server/src/main/java/org/elasticsearch/env/ShardLock.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,14 @@ public final void close() {
5454
}
5555
}
5656

57-
protected abstract void closeInternal();
57+
protected abstract void closeInternal();
58+
59+
/**
60+
* Update the details of the holder of this lock. These details are displayed alongside a {@link ShardLockObtainFailedException}. Must
61+
* only be called by the holder of this lock.
62+
*/
63+
public void setDetails(String details) {
64+
}
5865

5966
@Override
6067
public String toString() {

server/src/main/java/org/elasticsearch/index/IndexService.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ public synchronized IndexShard createShard(
406406
IndexShard indexShard = null;
407407
ShardLock lock = null;
408408
try {
409-
lock = nodeEnv.shardLock(shardId, "shard creation", TimeUnit.SECONDS.toMillis(5));
409+
lock = nodeEnv.shardLock(shardId, "starting shard", TimeUnit.SECONDS.toMillis(5));
410410
eventListener.beforeIndexShardCreated(shardId, indexSettings);
411411
ShardPath path;
412412
try {
@@ -515,6 +515,9 @@ public synchronized void removeShard(int shardId, String reason) {
515515
private void closeShard(String reason, ShardId sId, IndexShard indexShard, Store store, IndexEventListener listener) {
516516
final int shardId = sId.id();
517517
final Settings indexSettings = this.getIndexSettings().getSettings();
518+
if (store != null) {
519+
store.beforeClose();
520+
}
518521
try {
519522
try {
520523
listener.beforeIndexShardClosed(sId, indexShard, indexSettings);

server/src/main/java/org/elasticsearch/index/store/Store.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -677,6 +677,10 @@ public int refCount() {
677677
return refCounter.refCount();
678678
}
679679

680+
public void beforeClose() {
681+
shardLock.setDetails("closing shard");
682+
}
683+
680684
static final class StoreDirectory extends FilterDirectory {
681685

682686
private final Logger deletesLogger;

0 commit comments

Comments
 (0)