Skip to content

Commit 52e3387

Browse files
committed
Add details about what acquired the shard lock last (#38807)
This adds a `details` parameter to shard locking in `NodeEnvironment`. This is intended to be used for diagnosing issues such as ``` 1> [2019-02-11T14:34:19,262][INFO ][o.e.c.m.MetaDataDeleteIndexService] [node_s0] [.tasks/oSYOG0-9SHOx_pfAoiSExQ] deleting index 1> [2019-02-11T14:34:19,279][WARN ][o.e.i.IndicesService ] [node_s0] [.tasks/oSYOG0-9SHOx_pfAoiSExQ] failed to delete index 1> org.elasticsearch.env.ShardLockObtainFailedException: [.tasks][0]: obtaining shard lock timed out after 0ms 1> at org.elasticsearch.env.NodeEnvironment$InternalShardLock.acquire(NodeEnvironment.java:736) ~[main/:?] 1> at org.elasticsearch.env.NodeEnvironment.shardLock(NodeEnvironment.java:655) ~[main/:?] 1> at org.elasticsearch.env.NodeEnvironment.lockAllForIndex(NodeEnvironment.java:601) ~[main/:?] 1> at org.elasticsearch.env.NodeEnvironment.deleteIndexDirectorySafe(NodeEnvironment.java:554) ~[main/:?] ``` In the hope that we will be able to determine why the shard is still locked. Relates to #30290 as well as some other CI failures
1 parent 3afadaf commit 52e3387

File tree

8 files changed

+52
-43
lines changed

8 files changed

+52
-43
lines changed

server/src/main/java/org/elasticsearch/env/NodeEnvironment.java

Lines changed: 34 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,8 @@
1919

2020
package org.elasticsearch.env;
2121

22-
import java.io.UncheckedIOException;
23-
import java.util.Iterator;
24-
import java.util.stream.Collectors;
25-
import java.util.stream.Stream;
26-
import org.apache.logging.log4j.Logger;
2722
import org.apache.logging.log4j.LogManager;
23+
import org.apache.logging.log4j.Logger;
2824
import org.apache.logging.log4j.message.ParameterizedMessage;
2925
import org.apache.lucene.index.IndexWriter;
3026
import org.apache.lucene.index.SegmentInfos;
@@ -34,22 +30,22 @@
3430
import org.apache.lucene.store.LockObtainFailedException;
3531
import org.apache.lucene.store.NativeFSLockFactory;
3632
import org.apache.lucene.store.SimpleFSDirectory;
37-
import org.elasticsearch.common.CheckedFunction;
38-
import org.elasticsearch.common.lease.Releasable;
39-
import org.elasticsearch.core.internal.io.IOUtils;
4033
import org.elasticsearch.ElasticsearchException;
4134
import org.elasticsearch.cluster.metadata.IndexMetaData;
4235
import org.elasticsearch.cluster.node.DiscoveryNode;
36+
import org.elasticsearch.common.CheckedFunction;
4337
import org.elasticsearch.common.Randomness;
4438
import org.elasticsearch.common.SuppressForbidden;
4539
import org.elasticsearch.common.UUIDs;
4640
import org.elasticsearch.common.io.FileSystemUtils;
41+
import org.elasticsearch.common.lease.Releasable;
4742
import org.elasticsearch.common.settings.Setting;
4843
import org.elasticsearch.common.settings.Setting.Property;
4944
import org.elasticsearch.common.settings.Settings;
5045
import org.elasticsearch.common.unit.ByteSizeValue;
5146
import org.elasticsearch.common.unit.TimeValue;
5247
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
48+
import org.elasticsearch.core.internal.io.IOUtils;
5349
import org.elasticsearch.gateway.MetaDataStateFormat;
5450
import org.elasticsearch.index.Index;
5551
import org.elasticsearch.index.IndexSettings;
@@ -63,6 +59,7 @@
6359

6460
import java.io.Closeable;
6561
import java.io.IOException;
62+
import java.io.UncheckedIOException;
6663
import java.nio.file.AtomicMoveNotSupportedException;
6764
import java.nio.file.DirectoryStream;
6865
import java.nio.file.FileStore;
@@ -74,6 +71,7 @@
7471
import java.util.Collection;
7572
import java.util.HashMap;
7673
import java.util.HashSet;
74+
import java.util.Iterator;
7775
import java.util.List;
7876
import java.util.Locale;
7977
import java.util.Map;
@@ -84,6 +82,8 @@
8482
import java.util.concurrent.atomic.AtomicBoolean;
8583
import java.util.concurrent.atomic.AtomicReference;
8684
import java.util.function.Predicate;
85+
import java.util.stream.Collectors;
86+
import java.util.stream.Stream;
8787

8888
import static java.util.Collections.unmodifiableSet;
8989

@@ -440,7 +440,7 @@ private static String toString(Collection<String> items) {
440440
public void deleteShardDirectorySafe(ShardId shardId, IndexSettings indexSettings) throws IOException, ShardLockObtainFailedException {
441441
final Path[] paths = availableShardPaths(shardId);
442442
logger.trace("deleting shard {} directory, paths: [{}]", shardId, paths);
443-
try (ShardLock lock = shardLock(shardId)) {
443+
try (ShardLock lock = shardLock(shardId, "shard deletion under lock")) {
444444
deleteShardDirectoryUnderLock(lock, indexSettings);
445445
}
446446
}
@@ -532,7 +532,7 @@ private static boolean assertPathsDoNotExist(final Path[] paths) {
532532

533533
private boolean isShardLocked(ShardId id) {
534534
try {
535-
shardLock(id, 0).close();
535+
shardLock(id, "checking if shard is locked").close();
536536
return false;
537537
} catch (ShardLockObtainFailedException ex) {
538538
return true;
@@ -551,7 +551,7 @@ private boolean isShardLocked(ShardId id) {
551551
*/
552552
public void deleteIndexDirectorySafe(Index index, long lockTimeoutMS, IndexSettings indexSettings)
553553
throws IOException, ShardLockObtainFailedException {
554-
final List<ShardLock> locks = lockAllForIndex(index, indexSettings, lockTimeoutMS);
554+
final List<ShardLock> locks = lockAllForIndex(index, indexSettings, "deleting index directory", lockTimeoutMS);
555555
try {
556556
deleteIndexDirectoryUnderLock(index, indexSettings);
557557
} finally {
@@ -586,7 +586,8 @@ public void deleteIndexDirectoryUnderLock(Index index, IndexSettings indexSettin
586586
* @param lockTimeoutMS how long to wait for acquiring the indices shard locks
587587
* @return the {@link ShardLock} instances for this index.
588588
*/
589-
public List<ShardLock> lockAllForIndex(Index index, IndexSettings settings, long lockTimeoutMS) throws ShardLockObtainFailedException {
589+
public List<ShardLock> lockAllForIndex(final Index index, final IndexSettings settings,
590+
final String lockDetails, final long lockTimeoutMS) throws ShardLockObtainFailedException {
590591
final int numShards = settings.getNumberOfShards();
591592
if (numShards <= 0) {
592593
throw new IllegalArgumentException("settings must contain a non-null > 0 number of shards");
@@ -598,7 +599,7 @@ public List<ShardLock> lockAllForIndex(Index index, IndexSettings settings, long
598599
try {
599600
for (int i = 0; i < numShards; i++) {
600601
long timeoutLeftMS = Math.max(0, lockTimeoutMS - TimeValue.nsecToMSec((System.nanoTime() - startTimeNS)));
601-
allLocks.add(shardLock(new ShardId(index, i), timeoutLeftMS));
602+
allLocks.add(shardLock(new ShardId(index, i), lockDetails, timeoutLeftMS));
602603
}
603604
success = true;
604605
} finally {
@@ -619,10 +620,11 @@ public List<ShardLock> lockAllForIndex(Index index, IndexSettings settings, long
619620
* Note: this method will return immediately if the lock can't be acquired.
620621
*
621622
* @param id the shard ID to lock
623+
* @param details information about why the shard is being locked
622624
* @return the shard lock. Call {@link ShardLock#close()} to release the lock
623625
*/
624-
public ShardLock shardLock(ShardId id) throws ShardLockObtainFailedException {
625-
return shardLock(id, 0);
626+
public ShardLock shardLock(ShardId id, final String details) throws ShardLockObtainFailedException {
627+
return shardLock(id, details, 0);
626628
}
627629

628630
/**
@@ -631,11 +633,13 @@ public ShardLock shardLock(ShardId id) throws ShardLockObtainFailedException {
631633
* or recover from a different shard instance into it. If the shard lock can not be acquired
632634
* a {@link ShardLockObtainFailedException} is thrown
633635
* @param shardId the shard ID to lock
636+
* @param details information about why the shard is being locked
634637
* @param lockTimeoutMS the lock timeout in milliseconds
635638
* @return the shard lock. Call {@link ShardLock#close()} to release the lock
636639
*/
637-
public ShardLock shardLock(final ShardId shardId, long lockTimeoutMS) throws ShardLockObtainFailedException {
638-
logger.trace("acquiring node shardlock on [{}], timeout [{}]", shardId, lockTimeoutMS);
640+
public ShardLock shardLock(final ShardId shardId, final String details,
641+
final long lockTimeoutMS) throws ShardLockObtainFailedException {
642+
logger.trace("acquiring node shardlock on [{}], timeout [{}], details [{}]", shardId, lockTimeoutMS, details);
639643
final InternalShardLock shardLock;
640644
final boolean acquired;
641645
synchronized (shardLocks) {
@@ -644,15 +648,15 @@ public ShardLock shardLock(final ShardId shardId, long lockTimeoutMS) throws Sha
644648
shardLock.incWaitCount();
645649
acquired = false;
646650
} else {
647-
shardLock = new InternalShardLock(shardId);
651+
shardLock = new InternalShardLock(shardId, details);
648652
shardLocks.put(shardId, shardLock);
649653
acquired = true;
650654
}
651655
}
652656
if (acquired == false) {
653657
boolean success = false;
654658
try {
655-
shardLock.acquire(lockTimeoutMS);
659+
shardLock.acquire(lockTimeoutMS, details);
656660
success = true;
657661
} finally {
658662
if (success == false) {
@@ -671,11 +675,11 @@ protected void closeInternal() {
671675
}
672676

673677
/**
674-
* A functional interface that people can use to reference {@link #shardLock(ShardId, long)}
678+
* A functional interface that people can use to reference {@link #shardLock(ShardId, String, long)}
675679
*/
676680
@FunctionalInterface
677681
public interface ShardLocker {
678-
ShardLock lock(ShardId shardId, long lockTimeoutMS) throws ShardLockObtainFailedException;
682+
ShardLock lock(ShardId shardId, String lockDetails, long lockTimeoutMS) throws ShardLockObtainFailedException;
679683
}
680684

681685
/**
@@ -698,11 +702,13 @@ private final class InternalShardLock {
698702
*/
699703
private final Semaphore mutex = new Semaphore(1);
700704
private int waitCount = 1; // guarded by shardLocks
705+
private String lockDetails;
701706
private final ShardId shardId;
702707

703-
InternalShardLock(ShardId shardId) {
708+
InternalShardLock(final ShardId shardId, final String details) {
704709
this.shardId = shardId;
705710
mutex.acquireUninterruptibly();
711+
lockDetails = details;
706712
}
707713

708714
protected void release() {
@@ -730,11 +736,14 @@ private void decWaitCount() {
730736
}
731737
}
732738

733-
void acquire(long timeoutInMillis) throws ShardLockObtainFailedException {
739+
void acquire(long timeoutInMillis, final String details) throws ShardLockObtainFailedException {
734740
try {
735-
if (mutex.tryAcquire(timeoutInMillis, TimeUnit.MILLISECONDS) == false) {
741+
if (mutex.tryAcquire(timeoutInMillis, TimeUnit.MILLISECONDS)) {
742+
lockDetails = details;
743+
} else {
736744
throw new ShardLockObtainFailedException(shardId,
737-
"obtaining shard lock timed out after " + timeoutInMillis + "ms");
745+
"obtaining shard lock timed out after " + timeoutInMillis + "ms, previous lock details: [" + lockDetails +
746+
"] trying to lock for [" + details + "]");
738747
}
739748
} catch (InterruptedException e) {
740749
Thread.currentThread().interrupt();

server/src/main/java/org/elasticsearch/index/IndexService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ public synchronized IndexShard createShard(
334334
IndexShard indexShard = null;
335335
ShardLock lock = null;
336336
try {
337-
lock = nodeEnv.shardLock(shardId, TimeUnit.SECONDS.toMillis(5));
337+
lock = nodeEnv.shardLock(shardId, "shard creation", TimeUnit.SECONDS.toMillis(5));
338338
eventListener.beforeIndexShardCreated(shardId, indexSettings);
339339
ShardPath path;
340340
try {

server/src/main/java/org/elasticsearch/index/store/Store.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,7 @@ private void closeInternal() {
436436
*/
437437
public static MetadataSnapshot readMetadataSnapshot(Path indexLocation, ShardId shardId, NodeEnvironment.ShardLocker shardLocker,
438438
Logger logger) throws IOException {
439-
try (ShardLock lock = shardLocker.lock(shardId, TimeUnit.SECONDS.toMillis(5));
439+
try (ShardLock lock = shardLocker.lock(shardId, "read metadata snapshot", TimeUnit.SECONDS.toMillis(5));
440440
Directory dir = new SimpleFSDirectory(indexLocation)) {
441441
failIfCorrupted(dir, shardId);
442442
return new MetadataSnapshot(null, dir, logger);
@@ -457,7 +457,7 @@ public static MetadataSnapshot readMetadataSnapshot(Path indexLocation, ShardId
457457
*/
458458
public static void tryOpenIndex(Path indexLocation, ShardId shardId, NodeEnvironment.ShardLocker shardLocker,
459459
Logger logger) throws IOException, ShardLockObtainFailedException {
460-
try (ShardLock lock = shardLocker.lock(shardId, TimeUnit.SECONDS.toMillis(5));
460+
try (ShardLock lock = shardLocker.lock(shardId, "open index", TimeUnit.SECONDS.toMillis(5));
461461
Directory dir = new SimpleFSDirectory(indexLocation)) {
462462
failIfCorrupted(dir, shardId);
463463
SegmentInfos segInfo = Lucene.readSegmentInfos(dir);

server/src/main/java/org/elasticsearch/indices/IndicesService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1060,7 +1060,7 @@ public void processPendingDeletes(Index index, IndexSettings indexSettings, Time
10601060
throws IOException, InterruptedException, ShardLockObtainFailedException {
10611061
logger.debug("{} processing pending deletes", index);
10621062
final long startTimeNS = System.nanoTime();
1063-
final List<ShardLock> shardLocks = nodeEnv.lockAllForIndex(index, indexSettings, timeout.millis());
1063+
final List<ShardLock> shardLocks = nodeEnv.lockAllForIndex(index, indexSettings, "process pending deletes", timeout.millis());
10641064
int numRemoved = 0;
10651065
try {
10661066
Map<ShardId, ShardLock> locks = new HashMap<>();

server/src/test/java/org/elasticsearch/env/NodeEnvironmentTests.java

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,11 @@ public void testShardLock() throws Exception {
135135
final NodeEnvironment env = newNodeEnvironment();
136136

137137
Index index = new Index("foo", "fooUUID");
138-
ShardLock fooLock = env.shardLock(new ShardId(index, 0));
138+
ShardLock fooLock = env.shardLock(new ShardId(index, 0), "1");
139139
assertEquals(new ShardId(index, 0), fooLock.getShardId());
140140

141141
try {
142-
env.shardLock(new ShardId(index, 0));
142+
env.shardLock(new ShardId(index, 0), "2");
143143
fail("shard is locked");
144144
} catch (ShardLockObtainFailedException ex) {
145145
// expected
@@ -149,19 +149,19 @@ public void testShardLock() throws Exception {
149149
Files.createDirectories(path.resolve("1"));
150150
}
151151
try {
152-
env.lockAllForIndex(index, idxSettings, randomIntBetween(0, 10));
152+
env.lockAllForIndex(index, idxSettings, "3", randomIntBetween(0, 10));
153153
fail("shard 0 is locked");
154154
} catch (ShardLockObtainFailedException ex) {
155155
// expected
156156
}
157157

158158
fooLock.close();
159159
// can lock again?
160-
env.shardLock(new ShardId(index, 0)).close();
160+
env.shardLock(new ShardId(index, 0), "4").close();
161161

162-
List<ShardLock> locks = env.lockAllForIndex(index, idxSettings, randomIntBetween(0, 10));
162+
List<ShardLock> locks = env.lockAllForIndex(index, idxSettings, "5", randomIntBetween(0, 10));
163163
try {
164-
env.shardLock(new ShardId(index, 0));
164+
env.shardLock(new ShardId(index, 0), "6");
165165
fail("shard is locked");
166166
} catch (ShardLockObtainFailedException ex) {
167167
// expected
@@ -239,7 +239,7 @@ public void testResolveIndexFolders() throws Exception {
239239
public void testDeleteSafe() throws Exception {
240240
final NodeEnvironment env = newNodeEnvironment();
241241
final Index index = new Index("foo", "fooUUID");
242-
ShardLock fooLock = env.shardLock(new ShardId(index, 0));
242+
ShardLock fooLock = env.shardLock(new ShardId(index, 0), "1");
243243
assertEquals(new ShardId(index, 0), fooLock.getShardId());
244244

245245
for (Path path : env.indexPaths(index)) {
@@ -295,7 +295,7 @@ public void onFailure(Exception e) {
295295
@Override
296296
protected void doRun() throws Exception {
297297
start.await();
298-
try (ShardLock autoCloses = env.shardLock(new ShardId(index, 0))) {
298+
try (ShardLock autoCloses = env.shardLock(new ShardId(index, 0), "2")) {
299299
blockLatch.countDown();
300300
Thread.sleep(randomIntBetween(1, 10));
301301
}
@@ -353,7 +353,7 @@ public void run() {
353353
for (int i = 0; i < iters; i++) {
354354
int shard = randomIntBetween(0, counts.length - 1);
355355
try {
356-
try (ShardLock autoCloses = env.shardLock(new ShardId("foo", "fooUUID", shard),
356+
try (ShardLock autoCloses = env.shardLock(new ShardId("foo", "fooUUID", shard), "1",
357357
scaledRandomIntBetween(0, 10))) {
358358
counts[shard].value++;
359359
countsAtomic[shard].incrementAndGet();

server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ public void testFailShard() throws Exception {
263263
assertEquals(shardStateMetaData, getShardStateMetadata(shard));
264264
// but index can't be opened for a failed shard
265265
assertThat("store index should be corrupted", StoreUtils.canOpenIndex(logger, shardPath.resolveIndex(), shard.shardId(),
266-
(shardId, lockTimeoutMS) -> new DummyShardLock(shardId)),
266+
(shardId, lockTimeoutMS, details) -> new DummyShardLock(shardId)),
267267
equalTo(false));
268268
}
269269

server/src/test/java/org/elasticsearch/index/store/StoreTests.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -925,17 +925,17 @@ public void testCanOpenIndex() throws IOException {
925925
IndexWriterConfig iwc = newIndexWriterConfig();
926926
Path tempDir = createTempDir();
927927
final BaseDirectoryWrapper dir = newFSDirectory(tempDir);
928-
assertFalse(StoreUtils.canOpenIndex(logger, tempDir, shardId, (id, l) -> new DummyShardLock(id)));
928+
assertFalse(StoreUtils.canOpenIndex(logger, tempDir, shardId, (id, l, d) -> new DummyShardLock(id)));
929929
IndexWriter writer = new IndexWriter(dir, iwc);
930930
Document doc = new Document();
931931
doc.add(new StringField("id", "1", random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
932932
writer.addDocument(doc);
933933
writer.commit();
934934
writer.close();
935-
assertTrue(StoreUtils.canOpenIndex(logger, tempDir, shardId, (id, l) -> new DummyShardLock(id)));
935+
assertTrue(StoreUtils.canOpenIndex(logger, tempDir, shardId, (id, l, d) -> new DummyShardLock(id)));
936936
Store store = new Store(shardId, INDEX_SETTINGS, dir, new DummyShardLock(shardId));
937937
store.markStoreCorrupted(new CorruptIndexException("foo", "bar"));
938-
assertFalse(StoreUtils.canOpenIndex(logger, tempDir, shardId, (id, l) -> new DummyShardLock(id)));
938+
assertFalse(StoreUtils.canOpenIndex(logger, tempDir, shardId, (id, l, d) -> new DummyShardLock(id)));
939939
store.close();
940940
}
941941

test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2503,7 +2503,7 @@ public void assertAfterTest() throws IOException {
25032503
Set<ShardId> shardIds = env.lockedShards();
25042504
for (ShardId id : shardIds) {
25052505
try {
2506-
env.shardLock(id, TimeUnit.SECONDS.toMillis(5)).close();
2506+
env.shardLock(id, "InternalTestCluster assert after test", TimeUnit.SECONDS.toMillis(5)).close();
25072507
} catch (ShardLockObtainFailedException ex) {
25082508
fail("Shard " + id + " is still locked after 5 sec waiting");
25092509
}

0 commit comments

Comments
 (0)