Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
843f977
drop `index.shard.check_on_startup: fix`
Jul 23, 2018
4f01609
Merge remote-tracking branch 'remotes/origin/master' into fix/31389_1
Jul 31, 2018
153e4f2
create corrupted marker on `check_on_startup: true`; split testIndexC…
Aug 21, 2018
2964fef
Merge remote-tracking branch 'remotes/origin/master' into fix/31389_1
Aug 21, 2018
c71e306
create manually corruption marker (but don't corrupt index files) to …
Aug 21, 2018
a7668d6
checkstyle fix
Aug 21, 2018
97fa399
Merge remote-tracking branch 'remotes/origin/master' into fix/31389_1
Aug 24, 2018
c155b36
addressed unit test comments
Aug 27, 2018
85b7eef
keep `fix` for 6.x branch
Aug 27, 2018
3231803
added `fix` deprecation log message + test
Aug 28, 2018
c2b5b8a
added `fix` deprecation log message + test
Aug 28, 2018
14e6175
adjusted `fix` deprecation log message
Aug 28, 2018
fee8a5b
dropped `fix` to avoid deprecation warnings
Aug 28, 2018
5cee2b9
skip files added by Lucene's ExtrasFS
Aug 28, 2018
ad62da0
Merge remote-tracking branch 'remotes/origin/master' into fix/31389_1
Aug 28, 2018
6f6ca5a
Merge remote-tracking branch 'remotes/origin/master' into fix/31389_1
Aug 29, 2018
6763cf9
Merge remote-tracking branch 'remotes/origin/master' into fix/31389_1
Aug 29, 2018
5083e83
Merge remote-tracking branch 'remotes/origin/master' into fix/31389_1
Aug 31, 2018
2a9dbeb
resolved conflicts on Merge remote-tracking branch 'remotes/origin/ma…
Aug 31, 2018
aa16487
Merge remote-tracking branch 'remotes/origin/master' into fix/31389_1
Aug 31, 2018
d26fbfb
Merge remote-tracking branch 'remotes/origin/master' into fix/31389_1
Aug 31, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions docs/reference/index-modules.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,6 @@ corruption is detected, it will prevent the shard from being opened. Accepts:
Check for both physical and logical corruption. This is much more
expensive in terms of CPU and memory usage.

`fix`::

Check for both physical and logical corruption. Segments that were reported
as corrupted will be automatically removed. This option *may result in data loss*.
Use with extreme caution!

WARNING: Expert only. Checking shards may take a lot of time on large indices.
--

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,10 @@ public final class IndexSettings {
switch(s) {
case "false":
case "true":
case "fix":
case "checksum":
return s;
default:
throw new IllegalArgumentException("unknown value for [index.shard.check_on_startup] must be one of [true, false, fix, checksum] but was: " + s);
throw new IllegalArgumentException("unknown value for [index.shard.check_on_startup] must be one of [true, false, checksum] but was: " + s);
}
}, Property.IndexScope);

Expand Down
18 changes: 5 additions & 13 deletions server/src/main/java/org/elasticsearch/index/shard/IndexShard.java
Original file line number Diff line number Diff line change
Expand Up @@ -1298,7 +1298,7 @@ private void innerOpenEngineAndTranslog() throws IOException {
}
recoveryState.setStage(RecoveryState.Stage.VERIFY_INDEX);
// also check here, before we apply the translog
if (Booleans.isTrue(checkIndexOnStartup)) {
if (Booleans.isTrue(checkIndexOnStartup) || "checksum".equals(checkIndexOnStartup)) {
try {
checkIndex();
} catch (IOException ex) {
Expand Down Expand Up @@ -1890,6 +1890,9 @@ void checkIndex() throws IOException {
if (store.tryIncRef()) {
try {
doCheckIndex();
} catch (IOException e) {
store.markStoreCorrupted(e);
throw e;
} finally {
store.decRef();
}
Expand Down Expand Up @@ -1933,18 +1936,7 @@ private void doCheckIndex() throws IOException {
return;
}
logger.warn("check index [failure]\n{}", os.bytes().utf8ToString());
if ("fix".equals(checkIndexOnStartup)) {
if (logger.isDebugEnabled()) {
logger.debug("fixing index, writing new segments file ...");
}
store.exorciseIndex(status);
if (logger.isDebugEnabled()) {
logger.debug("index fixed, wrote new segments file \"{}\"", status.segmentsFileName);
}
} else {
// only throw a failure if we are not going to fix the index
throw new IllegalStateException("index check failure but can't fix it");
}
throw new IllegalStateException("index check failure");
}
}

Expand Down
15 changes: 2 additions & 13 deletions server/src/main/java/org/elasticsearch/index/store/Store.java
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
static final int VERSION_STACK_TRACE = 1; // we write the stack trace too since 1.4.0
static final int VERSION_START = 0;
static final int VERSION = VERSION_WRITE_THROWABLE;
static final String CORRUPTED = "corrupted_";
// public is for test purposes
public static final String CORRUPTED = "corrupted_";
public static final Setting<TimeValue> INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING =
Setting.timeSetting("index.store.stats_refresh_interval", TimeValue.timeValueSeconds(10), Property.IndexScope);

Expand Down Expand Up @@ -360,18 +361,6 @@ public CheckIndex.Status checkIndex(PrintStream out) throws IOException {
}
}

/**
* Repairs the index using the previous returned status from {@link #checkIndex(PrintStream)}.
*/
public void exorciseIndex(CheckIndex.Status status) throws IOException {
metadataLock.writeLock().lock();
try (CheckIndex checkIndex = new CheckIndex(directory)) {
checkIndex.exorciseIndex(status);
} finally {
metadataLock.writeLock().unlock();
}
}

public StoreStats stats() throws IOException {
ensureOpen();
return new StoreStats(directory.estimateSize());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public void testIndexTemplateInvalidNumberOfShards() {
containsString("Failed to parse value [0] for setting [index.number_of_shards] must be >= 1"));
assertThat(throwables.get(0).getMessage(),
containsString("unknown value for [index.shard.check_on_startup] " +
"must be one of [true, false, fix, checksum] but was: blargh"));
"must be one of [true, false, checksum] but was: blargh"));
}

public void testIndexTemplateValidationAccumulatesValidationErrors() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FilterDirectory;
import org.apache.lucene.store.IOContext;
Expand All @@ -53,6 +55,7 @@
import org.elasticsearch.cluster.routing.ShardRoutingState;
import org.elasticsearch.cluster.routing.TestShardRouting;
import org.elasticsearch.cluster.routing.UnassignedInfo;
import org.elasticsearch.common.CheckedFunction;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.breaker.CircuitBreaker;
Expand Down Expand Up @@ -93,6 +96,7 @@
import org.elasticsearch.index.seqno.SeqNoStats;
import org.elasticsearch.index.seqno.SequenceNumbers;
import org.elasticsearch.index.snapshots.IndexShardSnapshotStatus;
import org.elasticsearch.index.store.DirectoryService;
import org.elasticsearch.index.store.Store;
import org.elasticsearch.index.store.StoreStats;
import org.elasticsearch.index.translog.TestTranslog;
Expand All @@ -110,6 +114,7 @@
import org.elasticsearch.snapshots.SnapshotId;
import org.elasticsearch.snapshots.SnapshotInfo;
import org.elasticsearch.snapshots.SnapshotShardFailure;
import org.elasticsearch.test.CorruptionUtils;
import org.elasticsearch.test.DummyShardLock;
import org.elasticsearch.test.FieldMaskingReader;
import org.elasticsearch.test.VersionUtils;
Expand All @@ -118,7 +123,11 @@

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
Expand Down Expand Up @@ -1216,7 +1225,7 @@ public String[] listAll() throws IOException {
};

try (Store store = createStore(shardId, new IndexSettings(metaData, Settings.EMPTY), directory)) {
IndexShard shard = newShard(shardRouting, shardPath, metaData, store,
IndexShard shard = newShard(shardRouting, shardPath, metaData, i -> store,
null, new InternalEngineFactory(), () -> {
}, EMPTY_EVENT_LISTENER);
AtomicBoolean failureCallbackTriggered = new AtomicBoolean(false);
Expand Down Expand Up @@ -2558,6 +2567,104 @@ public void testReadSnapshotConcurrently() throws IOException, InterruptedExcept
closeShards(newShard);
}

public void testIndexCheckChecksum() throws Exception {
final boolean primary = true;

IndexShard indexShard = newStartedShard(primary);

final long numDocs = between(10, 100);
for (long i = 0; i < numDocs; i++) {
indexDoc(indexShard, "_doc", Long.toString(i), "{}");
}
indexShard.flush(new FlushRequest());
closeShards(indexShard);

// start shard with checksum - it has to pass successfully
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment seems misleading: we don't start the shard here, and we expect it not to pass successfully when we do.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense, addressed your comments


final ShardRouting shardRouting = ShardRoutingHelper.initWithSameId(indexShard.routingEntry(),
primary ? RecoverySource.StoreRecoverySource.EXISTING_STORE_INSTANCE : RecoverySource.PeerRecoverySource.INSTANCE
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

primary is always true here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

++

);
final IndexMetaData indexMetaData = IndexMetaData.builder(indexShard.indexSettings().getIndexMetaData())
.settings(Settings.builder()
.put(indexShard.indexSettings.getSettings())
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), "checksum"))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this test also work if we set this to true? If so, could we choose randomly between these two settings, and rename the test appropriately?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In case of corrupted index IndexShard with checksum option creates corrupted marker file, while in case of true it does not create it - seems due to https://github.com/elastic/elasticsearch/pull/32279/files#diff-49e1a1b834b522f4ae6997c5defe9eb0R1939 - as IllegalStateException is not subclass of IOException https://github.com/elastic/elasticsearch/pull/32279/files#diff-49e1a1b834b522f4ae6997c5defe9eb0R1894

Is it worth to change to IOException as well ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It sounds to me like it's doing the wrong thing here, but these things are subtle. @ywelsch might know: if check_on_startup: true finds a corruption it doesn't currently mark the shard as corrupted. I think it should. WDYT?

Copy link
Contributor Author

@vladimirdolzhenko vladimirdolzhenko Aug 21, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@DaveCTurner @ywelsch
https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules.html

  • checksum - Check for physical corruption.
  • true - Check for both physical and logical corruption. This is much more expensive in terms of CPU and memory usage.

true relies on Lucene CheckIndex check - that is indeed heavier operation - I would say we have to create a corruption marker

.build();

final ShardPath shardPath = indexShard.shardPath();

final IndexShard newShard = newShard(shardRouting, shardPath, indexMetaData,
null, null, indexShard.engineFactory,
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);

closeShards(newStartedShard(p -> newShard, primary));

// corrupt files
final Path indexPath = shardPath.getDataPath().resolve("index");
final Path[] filesToCorrupt =
Files.walk(indexPath)
.filter(p -> Files.isRegularFile(p) && IndexWriter.WRITE_LOCK_NAME.equals(p.getFileName().toString()) == false)
.toArray(Path[]::new);
CorruptionUtils.corruptFile(random(), filesToCorrupt);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this definitely corrupt a file that will be checked? It doesn't, for instance, hit a translog file in a generation that's early enough that it's no longer needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It indeed corrupts only index file due to
final Path indexPath = shardPath.getDataPath().resolve("index");

translog files are stored in another place - shardPath.getDataPath().resolve("translog");


// check that corrupt marker is *NOT* there
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this comment is necessary here. The check is the assertion a few lines further down, and it should be clear what that assertion is doing. I think the message in the assertion could use the phrase "corruption marker" rather than "clean" for consistency.

final AtomicInteger corruptedMarkerCount = new AtomicInteger();
final SimpleFileVisitor<Path> corruptedVisitor = new SimpleFileVisitor<Path>() {
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
if (Files.isRegularFile(file) && file.getFileName().toString().startsWith(Store.CORRUPTED)) {
corruptedMarkerCount.incrementAndGet();
}
return FileVisitResult.CONTINUE;
}
};
Files.walkFileTree(indexPath, corruptedVisitor);

assertThat("store is clean",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'd prefer statements to go all on one line if possible. There's a few places where that could be done, such as this one.

corruptedMarkerCount.get(), equalTo(0));

// storeProvider that does not perform check index on close - it is corrupted
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't fully understand this comment, or what this storeProvider is for. The heart of this test seems to be:

expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard, primary));

We also want the corruption marker to exist as soon as this is thrown, not after the shard is subsequently closed. If we need to test that opening a shard with a corruption marker fails, and that the corruption marker isn't removed when it's closed, then I think that should be a separate test.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, that heart of the test is
expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard, primary));

the problem with default store provider - that it checks index on close. In case of corrupted index - it fails on close (as it's corrupted) - and we have to close it to avoid resource leakage.

therefore the key point of adjusted storeProvider is

baseDirectoryWrapper.setCheckIndexOnClose(false);

Agree to split it into 2 tests - the 2nd one with opening a shard with a corruption marker fails

final CheckedFunction<IndexSettings, Store, IOException> storeProvider = indexSettings -> {
final ShardId shardId = shardPath.getShardId();
final DirectoryService directoryService = new DirectoryService(shardId, indexSettings) {
@Override
public Directory newDirectory() throws IOException {
final BaseDirectoryWrapper baseDirectoryWrapper = newFSDirectory(shardPath.resolveIndex());
// index is corrupted - don't even try to check index on close - it fails
baseDirectoryWrapper.setCheckIndexOnClose(false);
return baseDirectoryWrapper;
}
};
return new Store(shardId, indexSettings, directoryService, new DummyShardLock(shardId));
};

// try to start shard on corrupted files
final IndexShard corruptedShard = newShard(shardRouting, shardPath, indexMetaData,
storeProvider, null, indexShard.engineFactory,
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);

expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard, primary));
closeShards(corruptedShard);

// check that corrupt marker is there
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similarly - this comment isn't needed here.

Files.walkFileTree(indexPath, corruptedVisitor);
assertThat("store has to be marked as corrupted",
corruptedMarkerCount.get(), equalTo(1));

// try to start another time shard on corrupted files
final IndexShard corruptedShard2 = newShard(shardRouting, shardPath, indexMetaData,
storeProvider, null, indexShard.engineFactory,
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);

expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard2, primary));
closeShards(corruptedShard2);

// check that corrupt marker is there
corruptedMarkerCount.set(0);
Files.walkFileTree(indexPath, corruptedVisitor);
assertThat("store still has a single corrupt marker",
corruptedMarkerCount.get(), equalTo(1));
}

/**
* Simulates a scenario that happens when we are async fetching snapshot metadata from GatewayService
* and checking index concurrently. This should always be possible without any exception.
Expand All @@ -2581,7 +2688,7 @@ public void testReadSnapshotAndCheckIndexConcurrently() throws Exception {
final IndexMetaData indexMetaData = IndexMetaData.builder(indexShard.indexSettings().getIndexMetaData())
.settings(Settings.builder()
.put(indexShard.indexSettings.getSettings())
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("false", "true", "checksum", "fix")))
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("false", "true", "checksum")))
.build();
final IndexShard newShard = newShard(shardRouting, indexShard.shardPath(), indexMetaData,
null, null, indexShard.engineFactory, indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
Expand Down
Loading