Skip to content

Commit 843f977

Browse files
author
Vladimir Dolzhenko
committed
drop index.shard.check_on_startup: fix
Relates #31389
1 parent 7aa5365 commit 843f977

File tree

7 files changed

+167
-47
lines changed

7 files changed

+167
-47
lines changed

docs/reference/index-modules.asciidoc

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,12 +63,6 @@ corruption is detected, it will prevent the shard from being opened. Accepts:
6363
Check for both physical and logical corruption. This is much more
6464
expensive in terms of CPU and memory usage.
6565

66-
`fix`::
67-
68-
Check for both physical and logical corruption. Segments that were reported
69-
as corrupted will be automatically removed. This option *may result in data loss*.
70-
Use with extreme caution!
71-
7266
WARNING: Expert only. Checking shards may take a lot of time on large indices.
7367
--
7468

server/src/main/java/org/elasticsearch/index/IndexSettings.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,10 @@ public final class IndexSettings {
7474
switch(s) {
7575
case "false":
7676
case "true":
77-
case "fix":
7877
case "checksum":
7978
return s;
8079
default:
81-
throw new IllegalArgumentException("unknown value for [index.shard.check_on_startup] must be one of [true, false, fix, checksum] but was: " + s);
80+
throw new IllegalArgumentException("unknown value for [index.shard.check_on_startup] must be one of [true, false, checksum] but was: " + s);
8281
}
8382
}, Property.IndexScope);
8483

server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1298,7 +1298,7 @@ private void innerOpenEngineAndTranslog() throws IOException {
12981298
}
12991299
recoveryState.setStage(RecoveryState.Stage.VERIFY_INDEX);
13001300
// also check here, before we apply the translog
1301-
if (Booleans.isTrue(checkIndexOnStartup)) {
1301+
if (Booleans.isTrue(checkIndexOnStartup) || "checksum".equals(checkIndexOnStartup)) {
13021302
try {
13031303
checkIndex();
13041304
} catch (IOException ex) {
@@ -1890,6 +1890,9 @@ void checkIndex() throws IOException {
18901890
if (store.tryIncRef()) {
18911891
try {
18921892
doCheckIndex();
1893+
} catch (IOException e) {
1894+
store.markStoreCorrupted(e);
1895+
throw e;
18931896
} finally {
18941897
store.decRef();
18951898
}
@@ -1933,18 +1936,7 @@ private void doCheckIndex() throws IOException {
19331936
return;
19341937
}
19351938
logger.warn("check index [failure]\n{}", os.bytes().utf8ToString());
1936-
if ("fix".equals(checkIndexOnStartup)) {
1937-
if (logger.isDebugEnabled()) {
1938-
logger.debug("fixing index, writing new segments file ...");
1939-
}
1940-
store.exorciseIndex(status);
1941-
if (logger.isDebugEnabled()) {
1942-
logger.debug("index fixed, wrote new segments file \"{}\"", status.segmentsFileName);
1943-
}
1944-
} else {
1945-
// only throw a failure if we are not going to fix the index
1946-
throw new IllegalStateException("index check failure but can't fix it");
1947-
}
1939+
throw new IllegalStateException("index check failure");
19481940
}
19491941
}
19501942

server/src/main/java/org/elasticsearch/index/store/Store.java

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
134134
static final int VERSION_STACK_TRACE = 1; // we write the stack trace too since 1.4.0
135135
static final int VERSION_START = 0;
136136
static final int VERSION = VERSION_WRITE_THROWABLE;
137-
static final String CORRUPTED = "corrupted_";
137+
// public is for test purposes
138+
public static final String CORRUPTED = "corrupted_";
138139
public static final Setting<TimeValue> INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING =
139140
Setting.timeSetting("index.store.stats_refresh_interval", TimeValue.timeValueSeconds(10), Property.IndexScope);
140141

@@ -360,18 +361,6 @@ public CheckIndex.Status checkIndex(PrintStream out) throws IOException {
360361
}
361362
}
362363

363-
/**
364-
* Repairs the index using the previous returned status from {@link #checkIndex(PrintStream)}.
365-
*/
366-
public void exorciseIndex(CheckIndex.Status status) throws IOException {
367-
metadataLock.writeLock().lock();
368-
try (CheckIndex checkIndex = new CheckIndex(directory)) {
369-
checkIndex.exorciseIndex(status);
370-
} finally {
371-
metadataLock.writeLock().unlock();
372-
}
373-
}
374-
375364
public StoreStats stats() throws IOException {
376365
ensureOpen();
377366
return new StoreStats(directory.estimateSize());

server/src/test/java/org/elasticsearch/action/admin/indices/template/put/MetaDataIndexTemplateServiceTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ public void testIndexTemplateInvalidNumberOfShards() {
6969
containsString("Failed to parse value [0] for setting [index.number_of_shards] must be >= 1"));
7070
assertThat(throwables.get(0).getMessage(),
7171
containsString("unknown value for [index.shard.check_on_startup] " +
72-
"must be one of [true, false, fix, checksum] but was: blargh"));
72+
"must be one of [true, false, checksum] but was: blargh"));
7373
}
7474

7575
public void testIndexTemplateValidationAccumulatesValidationErrors() {

server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java

Lines changed: 109 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@
2222
import org.apache.lucene.index.CorruptIndexException;
2323
import org.apache.lucene.index.DirectoryReader;
2424
import org.apache.lucene.index.IndexCommit;
25+
import org.apache.lucene.index.IndexWriter;
2526
import org.apache.lucene.index.Term;
2627
import org.apache.lucene.search.IndexSearcher;
2728
import org.apache.lucene.search.TermQuery;
2829
import org.apache.lucene.search.TopDocs;
2930
import org.apache.lucene.store.AlreadyClosedException;
31+
import org.apache.lucene.store.BaseDirectoryWrapper;
3032
import org.apache.lucene.store.Directory;
3133
import org.apache.lucene.store.FilterDirectory;
3234
import org.apache.lucene.store.IOContext;
@@ -53,6 +55,7 @@
5355
import org.elasticsearch.cluster.routing.ShardRoutingState;
5456
import org.elasticsearch.cluster.routing.TestShardRouting;
5557
import org.elasticsearch.cluster.routing.UnassignedInfo;
58+
import org.elasticsearch.common.CheckedFunction;
5659
import org.elasticsearch.common.Strings;
5760
import org.elasticsearch.common.UUIDs;
5861
import org.elasticsearch.common.breaker.CircuitBreaker;
@@ -93,6 +96,7 @@
9396
import org.elasticsearch.index.seqno.SeqNoStats;
9497
import org.elasticsearch.index.seqno.SequenceNumbers;
9598
import org.elasticsearch.index.snapshots.IndexShardSnapshotStatus;
99+
import org.elasticsearch.index.store.DirectoryService;
96100
import org.elasticsearch.index.store.Store;
97101
import org.elasticsearch.index.store.StoreStats;
98102
import org.elasticsearch.index.translog.TestTranslog;
@@ -110,6 +114,7 @@
110114
import org.elasticsearch.snapshots.SnapshotId;
111115
import org.elasticsearch.snapshots.SnapshotInfo;
112116
import org.elasticsearch.snapshots.SnapshotShardFailure;
117+
import org.elasticsearch.test.CorruptionUtils;
113118
import org.elasticsearch.test.DummyShardLock;
114119
import org.elasticsearch.test.FieldMaskingReader;
115120
import org.elasticsearch.test.VersionUtils;
@@ -118,7 +123,11 @@
118123

119124
import java.io.IOException;
120125
import java.nio.charset.Charset;
126+
import java.nio.file.FileVisitResult;
127+
import java.nio.file.Files;
121128
import java.nio.file.Path;
129+
import java.nio.file.SimpleFileVisitor;
130+
import java.nio.file.attribute.BasicFileAttributes;
122131
import java.util.ArrayList;
123132
import java.util.Arrays;
124133
import java.util.Collections;
@@ -1216,7 +1225,7 @@ public String[] listAll() throws IOException {
12161225
};
12171226

12181227
try (Store store = createStore(shardId, new IndexSettings(metaData, Settings.EMPTY), directory)) {
1219-
IndexShard shard = newShard(shardRouting, shardPath, metaData, store,
1228+
IndexShard shard = newShard(shardRouting, shardPath, metaData, i -> store,
12201229
null, new InternalEngineFactory(), () -> {
12211230
}, EMPTY_EVENT_LISTENER);
12221231
AtomicBoolean failureCallbackTriggered = new AtomicBoolean(false);
@@ -2561,6 +2570,104 @@ public void testReadSnapshotConcurrently() throws IOException, InterruptedExcept
25612570
closeShards(newShard);
25622571
}
25632572

2573+
public void testIndexCheckChecksum() throws Exception {
2574+
final boolean primary = true;
2575+
2576+
IndexShard indexShard = newStartedShard(primary);
2577+
2578+
final long numDocs = between(10, 100);
2579+
for (long i = 0; i < numDocs; i++) {
2580+
indexDoc(indexShard, "_doc", Long.toString(i), "{}");
2581+
}
2582+
indexShard.flush(new FlushRequest());
2583+
closeShards(indexShard);
2584+
2585+
// start shard with checksum - it has to pass successfully
2586+
2587+
final ShardRouting shardRouting = ShardRoutingHelper.initWithSameId(indexShard.routingEntry(),
2588+
primary ? RecoverySource.StoreRecoverySource.EXISTING_STORE_INSTANCE : RecoverySource.PeerRecoverySource.INSTANCE
2589+
);
2590+
final IndexMetaData indexMetaData = IndexMetaData.builder(indexShard.indexSettings().getIndexMetaData())
2591+
.settings(Settings.builder()
2592+
.put(indexShard.indexSettings.getSettings())
2593+
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), "checksum"))
2594+
.build();
2595+
2596+
final ShardPath shardPath = indexShard.shardPath();
2597+
2598+
final IndexShard newShard = newShard(shardRouting, shardPath, indexMetaData,
2599+
null, null, indexShard.engineFactory,
2600+
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
2601+
2602+
closeShards(newStartedShard(p -> newShard, primary));
2603+
2604+
// corrupt files
2605+
final Path indexPath = shardPath.getDataPath().resolve("index");
2606+
final Path[] filesToCorrupt =
2607+
Files.walk(indexPath)
2608+
.filter(p -> Files.isRegularFile(p) && IndexWriter.WRITE_LOCK_NAME.equals(p.getFileName().toString()) == false)
2609+
.toArray(Path[]::new);
2610+
CorruptionUtils.corruptFile(random(), filesToCorrupt);
2611+
2612+
// check that corrupt marker is *NOT* there
2613+
final AtomicInteger corruptedMarkerCount = new AtomicInteger();
2614+
final SimpleFileVisitor<Path> corruptedVisitor = new SimpleFileVisitor<Path>() {
2615+
@Override
2616+
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
2617+
if (Files.isRegularFile(file) && file.getFileName().toString().startsWith(Store.CORRUPTED)) {
2618+
corruptedMarkerCount.incrementAndGet();
2619+
}
2620+
return FileVisitResult.CONTINUE;
2621+
}
2622+
};
2623+
Files.walkFileTree(indexPath, corruptedVisitor);
2624+
2625+
assertThat("store is clean",
2626+
corruptedMarkerCount.get(), equalTo(0));
2627+
2628+
// storeProvider that does not perform check index on close - it is corrupted
2629+
final CheckedFunction<IndexSettings, Store, IOException> storeProvider = indexSettings -> {
2630+
final ShardId shardId = shardPath.getShardId();
2631+
final DirectoryService directoryService = new DirectoryService(shardId, indexSettings) {
2632+
@Override
2633+
public Directory newDirectory() throws IOException {
2634+
final BaseDirectoryWrapper baseDirectoryWrapper = newFSDirectory(shardPath.resolveIndex());
2635+
// index is corrupted - don't even try to check index on close - it fails
2636+
baseDirectoryWrapper.setCheckIndexOnClose(false);
2637+
return baseDirectoryWrapper;
2638+
}
2639+
};
2640+
return new Store(shardId, indexSettings, directoryService, new DummyShardLock(shardId));
2641+
};
2642+
2643+
// try to start shard on corrupted files
2644+
final IndexShard corruptedShard = newShard(shardRouting, shardPath, indexMetaData,
2645+
storeProvider, null, indexShard.engineFactory,
2646+
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
2647+
2648+
expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard, primary));
2649+
closeShards(corruptedShard);
2650+
2651+
// check that corrupt marker is there
2652+
Files.walkFileTree(indexPath, corruptedVisitor);
2653+
assertThat("store has to be marked as corrupted",
2654+
corruptedMarkerCount.get(), equalTo(1));
2655+
2656+
// try to start another time shard on corrupted files
2657+
final IndexShard corruptedShard2 = newShard(shardRouting, shardPath, indexMetaData,
2658+
storeProvider, null, indexShard.engineFactory,
2659+
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
2660+
2661+
expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard2, primary));
2662+
closeShards(corruptedShard2);
2663+
2664+
// check that corrupt marker is there
2665+
corruptedMarkerCount.set(0);
2666+
Files.walkFileTree(indexPath, corruptedVisitor);
2667+
assertThat("store still has a single corrupt marker",
2668+
corruptedMarkerCount.get(), equalTo(1));
2669+
}
2670+
25642671
/**
25652672
* Simulates a scenario that happens when we are async fetching snapshot metadata from GatewayService
25662673
* and checking index concurrently. This should always be possible without any exception.
@@ -2584,7 +2691,7 @@ public void testReadSnapshotAndCheckIndexConcurrently() throws Exception {
25842691
final IndexMetaData indexMetaData = IndexMetaData.builder(indexShard.indexSettings().getIndexMetaData())
25852692
.settings(Settings.builder()
25862693
.put(indexShard.indexSettings.getSettings())
2587-
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("false", "true", "checksum", "fix")))
2694+
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("false", "true", "checksum")))
25882695
.build();
25892696
final IndexShard newShard = newShard(shardRouting, indexShard.shardPath(), indexMetaData,
25902697
null, null, indexShard.engineFactory, indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);

0 commit comments

Comments
 (0)