Skip to content

Commit 6de8c6b

Browse files
drop index.shard.check_on_startup: fix (#32279)
Relates #31389 (cherry picked from commit 3d82a30)
1 parent cf28ba6 commit 6de8c6b

File tree

5 files changed

+249
-56
lines changed

5 files changed

+249
-56
lines changed

docs/reference/index-modules.asciidoc

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,7 @@ corruption is detected, it will prevent the shard from being opened. Accepts:
6565

6666
`fix`::
6767

68-
Check for both physical and logical corruption. Segments that were reported
69-
as corrupted will be automatically removed. This option *may result in data loss*.
70-
Use with extreme caution!
68+
The same as `false`. This option is deprecated and will be completely removed in 7.0.
7169

7270
WARNING: Expert only. Checking shards may take a lot of time on large indices.
7371
--

server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,10 @@ public IndexShard(
297297
logger.debug("state: [CREATED]");
298298

299299
this.checkIndexOnStartup = indexSettings.getValue(IndexSettings.INDEX_CHECK_ON_STARTUP);
300+
if ("fix".equals(checkIndexOnStartup)) {
301+
deprecationLogger.deprecated("Setting [index.shard.check_on_startup] is set to deprecated value [fix], "
302+
+ "which has no effect and will not be accepted in future");
303+
}
300304
this.translogConfig = new TranslogConfig(shardId, shardPath().resolveTranslog(), indexSettings, bigArrays);
301305
final String aId = shardRouting.allocationId().getId();
302306
this.globalCheckpointListeners = new GlobalCheckpointListeners(shardId, threadPool.executor(ThreadPool.Names.LISTENER), logger);
@@ -1354,7 +1358,7 @@ private void innerOpenEngineAndTranslog() throws IOException {
13541358
}
13551359
recoveryState.setStage(RecoveryState.Stage.VERIFY_INDEX);
13561360
// also check here, before we apply the translog
1357-
if (Booleans.isTrue(checkIndexOnStartup)) {
1361+
if (Booleans.isTrue(checkIndexOnStartup) || "checksum".equals(checkIndexOnStartup)) {
13581362
try {
13591363
checkIndex();
13601364
} catch (IOException ex) {
@@ -1958,6 +1962,9 @@ void checkIndex() throws IOException {
19581962
if (store.tryIncRef()) {
19591963
try {
19601964
doCheckIndex();
1965+
} catch (IOException e) {
1966+
store.markStoreCorrupted(e);
1967+
throw e;
19611968
} finally {
19621969
store.decRef();
19631970
}
@@ -2001,18 +2008,7 @@ private void doCheckIndex() throws IOException {
20012008
return;
20022009
}
20032010
logger.warn("check index [failure]\n{}", os.bytes().utf8ToString());
2004-
if ("fix".equals(checkIndexOnStartup)) {
2005-
if (logger.isDebugEnabled()) {
2006-
logger.debug("fixing index, writing new segments file ...");
2007-
}
2008-
store.exorciseIndex(status);
2009-
if (logger.isDebugEnabled()) {
2010-
logger.debug("index fixed, wrote new segments file \"{}\"", status.segmentsFileName);
2011-
}
2012-
} else {
2013-
// only throw a failure if we are not going to fix the index
2014-
throw new IllegalStateException("index check failure but can't fix it");
2015-
}
2011+
throw new IOException("index check failure");
20162012
}
20172013
}
20182014

server/src/main/java/org/elasticsearch/index/store/Store.java

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
134134
static final int VERSION_STACK_TRACE = 1; // we write the stack trace too since 1.4.0
135135
static final int VERSION_START = 0;
136136
static final int VERSION = VERSION_WRITE_THROWABLE;
137-
static final String CORRUPTED = "corrupted_";
137+
// public is for test purposes
138+
public static final String CORRUPTED = "corrupted_";
138139
public static final Setting<TimeValue> INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING =
139140
Setting.timeSetting("index.store.stats_refresh_interval", TimeValue.timeValueSeconds(10), Property.IndexScope);
140141

@@ -360,18 +361,6 @@ public CheckIndex.Status checkIndex(PrintStream out) throws IOException {
360361
}
361362
}
362363

363-
/**
364-
* Repairs the index using the previous returned status from {@link #checkIndex(PrintStream)}.
365-
*/
366-
public void exorciseIndex(CheckIndex.Status status) throws IOException {
367-
metadataLock.writeLock().lock();
368-
try (CheckIndex checkIndex = new CheckIndex(directory)) {
369-
checkIndex.exorciseIndex(status);
370-
} finally {
371-
metadataLock.writeLock().unlock();
372-
}
373-
}
374-
375364
public StoreStats stats() throws IOException {
376365
ensureOpen();
377366
return new StoreStats(directory.estimateSize());

server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java

Lines changed: 155 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.lucene.index.CorruptIndexException;
2323
import org.apache.lucene.index.DirectoryReader;
2424
import org.apache.lucene.index.IndexCommit;
25+
import org.apache.lucene.index.IndexWriter;
2526
import org.apache.lucene.index.Term;
2627
import org.apache.lucene.search.IndexSearcher;
2728
import org.apache.lucene.search.TermQuery;
@@ -110,6 +111,7 @@
110111
import org.elasticsearch.snapshots.SnapshotId;
111112
import org.elasticsearch.snapshots.SnapshotInfo;
112113
import org.elasticsearch.snapshots.SnapshotShardFailure;
114+
import org.elasticsearch.test.CorruptionUtils;
113115
import org.elasticsearch.test.DummyShardLock;
114116
import org.elasticsearch.test.FieldMaskingReader;
115117
import org.elasticsearch.test.VersionUtils;
@@ -118,7 +120,11 @@
118120

119121
import java.io.IOException;
120122
import java.nio.charset.Charset;
123+
import java.nio.file.FileVisitResult;
124+
import java.nio.file.Files;
121125
import java.nio.file.Path;
126+
import java.nio.file.SimpleFileVisitor;
127+
import java.nio.file.attribute.BasicFileAttributes;
122128
import java.util.ArrayList;
123129
import java.util.Arrays;
124130
import java.util.Collections;
@@ -1229,7 +1235,7 @@ public String[] listAll() throws IOException {
12291235
};
12301236

12311237
try (Store store = createStore(shardId, new IndexSettings(metaData, Settings.EMPTY), directory)) {
1232-
IndexShard shard = newShard(shardRouting, shardPath, metaData, store,
1238+
IndexShard shard = newShard(shardRouting, shardPath, metaData, i -> store,
12331239
null, new InternalEngineFactory(), () -> {
12341240
}, EMPTY_EVENT_LISTENER);
12351241
AtomicBoolean failureCallbackTriggered = new AtomicBoolean(false);
@@ -2569,6 +2575,143 @@ public void testReadSnapshotConcurrently() throws IOException, InterruptedExcept
25692575
closeShards(newShard);
25702576
}
25712577

2578+
public void testIndexCheckOnStartup() throws Exception {
2579+
final IndexShard indexShard = newStartedShard(true);
2580+
2581+
final long numDocs = between(10, 100);
2582+
for (long i = 0; i < numDocs; i++) {
2583+
indexDoc(indexShard, "_doc", Long.toString(i), "{}");
2584+
}
2585+
indexShard.flush(new FlushRequest());
2586+
closeShards(indexShard);
2587+
2588+
final ShardPath shardPath = indexShard.shardPath();
2589+
2590+
final Path indexPath = corruptIndexFile(shardPath);
2591+
2592+
final AtomicInteger corruptedMarkerCount = new AtomicInteger();
2593+
final SimpleFileVisitor<Path> corruptedVisitor = new SimpleFileVisitor<Path>() {
2594+
@Override
2595+
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
2596+
if (Files.isRegularFile(file) && file.getFileName().toString().startsWith(Store.CORRUPTED)) {
2597+
corruptedMarkerCount.incrementAndGet();
2598+
}
2599+
return FileVisitResult.CONTINUE;
2600+
}
2601+
};
2602+
Files.walkFileTree(indexPath, corruptedVisitor);
2603+
2604+
assertThat("corruption marker should not be there", corruptedMarkerCount.get(), equalTo(0));
2605+
2606+
final ShardRouting shardRouting = ShardRoutingHelper.initWithSameId(indexShard.routingEntry(),
2607+
RecoverySource.StoreRecoverySource.EXISTING_STORE_INSTANCE
2608+
);
2609+
// start shard and perform index check on startup. It enforce shard to fail due to corrupted index files
2610+
final IndexMetaData indexMetaData = IndexMetaData.builder(indexShard.indexSettings().getIndexMetaData())
2611+
.settings(Settings.builder()
2612+
.put(indexShard.indexSettings.getSettings())
2613+
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("true", "checksum")))
2614+
.build();
2615+
2616+
IndexShard corruptedShard = newShard(shardRouting, shardPath, indexMetaData,
2617+
null, null, indexShard.engineFactory,
2618+
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
2619+
2620+
final IndexShardRecoveryException indexShardRecoveryException =
2621+
expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard, true));
2622+
assertThat(indexShardRecoveryException.getMessage(), equalTo("failed recovery"));
2623+
2624+
// check that corrupt marker is there
2625+
Files.walkFileTree(indexPath, corruptedVisitor);
2626+
assertThat("store has to be marked as corrupted", corruptedMarkerCount.get(), equalTo(1));
2627+
2628+
try {
2629+
closeShards(corruptedShard);
2630+
} catch (RuntimeException e) {
2631+
assertThat(e.getMessage(), equalTo("CheckIndex failed"));
2632+
}
2633+
}
2634+
2635+
public void testShardDoesNotStartIfCorruptedMarkerIsPresent() throws Exception {
2636+
final IndexShard indexShard = newStartedShard(true);
2637+
2638+
final long numDocs = between(10, 100);
2639+
for (long i = 0; i < numDocs; i++) {
2640+
indexDoc(indexShard, "_doc", Long.toString(i), "{}");
2641+
}
2642+
indexShard.flush(new FlushRequest());
2643+
closeShards(indexShard);
2644+
2645+
final ShardPath shardPath = indexShard.shardPath();
2646+
2647+
final ShardRouting shardRouting = ShardRoutingHelper.initWithSameId(indexShard.routingEntry(),
2648+
RecoverySource.StoreRecoverySource.EXISTING_STORE_INSTANCE
2649+
);
2650+
final IndexMetaData indexMetaData = indexShard.indexSettings().getIndexMetaData();
2651+
2652+
final Path indexPath = shardPath.getDataPath().resolve(ShardPath.INDEX_FOLDER_NAME);
2653+
2654+
// create corrupted marker
2655+
final String corruptionMessage = "fake ioexception";
2656+
try(Store store = createStore(indexShard.indexSettings(), shardPath)) {
2657+
store.markStoreCorrupted(new IOException(corruptionMessage));
2658+
}
2659+
2660+
// try to start shard on corrupted files
2661+
final IndexShard corruptedShard = newShard(shardRouting, shardPath, indexMetaData,
2662+
null, null, indexShard.engineFactory,
2663+
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
2664+
2665+
final IndexShardRecoveryException exception1 = expectThrows(IndexShardRecoveryException.class,
2666+
() -> newStartedShard(p -> corruptedShard, true));
2667+
assertThat(exception1.getCause().getMessage(), equalTo(corruptionMessage + " (resource=preexisting_corruption)"));
2668+
closeShards(corruptedShard);
2669+
2670+
final AtomicInteger corruptedMarkerCount = new AtomicInteger();
2671+
final SimpleFileVisitor<Path> corruptedVisitor = new SimpleFileVisitor<Path>() {
2672+
@Override
2673+
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
2674+
if (Files.isRegularFile(file) && file.getFileName().toString().startsWith(Store.CORRUPTED)) {
2675+
corruptedMarkerCount.incrementAndGet();
2676+
}
2677+
return FileVisitResult.CONTINUE;
2678+
}
2679+
};
2680+
Files.walkFileTree(indexPath, corruptedVisitor);
2681+
assertThat("store has to be marked as corrupted", corruptedMarkerCount.get(), equalTo(1));
2682+
2683+
// try to start another time shard on corrupted files
2684+
final IndexShard corruptedShard2 = newShard(shardRouting, shardPath, indexMetaData,
2685+
null, null, indexShard.engineFactory,
2686+
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
2687+
2688+
final IndexShardRecoveryException exception2 = expectThrows(IndexShardRecoveryException.class,
2689+
() -> newStartedShard(p -> corruptedShard2, true));
2690+
assertThat(exception2.getCause().getMessage(), equalTo(corruptionMessage + " (resource=preexisting_corruption)"));
2691+
closeShards(corruptedShard2);
2692+
2693+
// check that corrupt marker is there
2694+
corruptedMarkerCount.set(0);
2695+
Files.walkFileTree(indexPath, corruptedVisitor);
2696+
assertThat("store still has a single corrupt marker", corruptedMarkerCount.get(), equalTo(1));
2697+
}
2698+
2699+
private Path corruptIndexFile(ShardPath shardPath) throws IOException {
2700+
final Path indexPath = shardPath.getDataPath().resolve(ShardPath.INDEX_FOLDER_NAME);
2701+
final Path[] filesToCorrupt =
2702+
Files.walk(indexPath)
2703+
.filter(p -> {
2704+
final String name = p.getFileName().toString();
2705+
return Files.isRegularFile(p)
2706+
&& name.startsWith("extra") == false // Skip files added by Lucene's ExtrasFS
2707+
&& IndexWriter.WRITE_LOCK_NAME.equals(name) == false
2708+
&& name.startsWith("segments_") == false && name.endsWith(".si") == false;
2709+
})
2710+
.toArray(Path[]::new);
2711+
CorruptionUtils.corruptFile(random(), filesToCorrupt);
2712+
return indexPath;
2713+
}
2714+
25722715
/**
25732716
* Simulates a scenario that happens when we are async fetching snapshot metadata from GatewayService
25742717
* and checking index concurrently. This should always be possible without any exception.
@@ -2592,7 +2735,7 @@ public void testReadSnapshotAndCheckIndexConcurrently() throws Exception {
25922735
final IndexMetaData indexMetaData = IndexMetaData.builder(indexShard.indexSettings().getIndexMetaData())
25932736
.settings(Settings.builder()
25942737
.put(indexShard.indexSettings.getSettings())
2595-
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("false", "true", "checksum", "fix")))
2738+
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("false", "true", "checksum")))
25962739
.build();
25972740
final IndexShard newShard = newShard(shardRouting, indexShard.shardPath(), indexMetaData,
25982741
null, null, indexShard.engineFactory, indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
@@ -2634,6 +2777,16 @@ public void testReadSnapshotAndCheckIndexConcurrently() throws Exception {
26342777
closeShards(newShard);
26352778
}
26362779

2780+
public void testCheckOnStartupDeprecatedValue() throws Exception {
2781+
final Settings settings = Settings.builder().put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), "fix").build();
2782+
2783+
final IndexShard newShard = newShard(true, settings);
2784+
closeShards(newShard);
2785+
2786+
assertWarnings("Setting [index.shard.check_on_startup] is set to deprecated value [fix], "
2787+
+ "which has no effect and will not be accepted in future");
2788+
}
2789+
26372790
class Result {
26382791
private final int localCheckpoint;
26392792
private final int maxSeqNo;

0 commit comments

Comments
 (0)