Skip to content

Commit 7281b75

Browse files
committed
Add in-flight ops recovery test
This commit adds a test for performing a sequence-number-based recovery with some ops that are in-flight when the recovery starts.
1 parent 999ca91 commit 7281b75

File tree

1 file changed

+229
-0
lines changed

1 file changed

+229
-0
lines changed
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
package org.elasticsearch.indices.recovery;
2+
3+
import org.apache.lucene.analysis.Analyzer;
4+
import org.apache.lucene.analysis.Tokenizer;
5+
import org.elasticsearch.action.admin.indices.flush.FlushRequest;
6+
import org.elasticsearch.common.settings.Settings;
7+
import org.elasticsearch.common.util.concurrent.EsExecutors;
8+
import org.elasticsearch.common.xcontent.XContentBuilder;
9+
import org.elasticsearch.index.Index;
10+
import org.elasticsearch.index.analysis.AnalyzerProvider;
11+
import org.elasticsearch.index.analysis.AnalyzerScope;
12+
import org.elasticsearch.indices.IndicesService;
13+
import org.elasticsearch.indices.analysis.AnalysisModule;
14+
import org.elasticsearch.plugins.AnalysisPlugin;
15+
import org.elasticsearch.plugins.Plugin;
16+
import org.elasticsearch.test.ESIntegTestCase;
17+
import org.elasticsearch.test.InternalTestCluster;
18+
19+
import java.io.IOException;
20+
import java.util.ArrayList;
21+
import java.util.Collection;
22+
import java.util.Collections;
23+
import java.util.List;
24+
import java.util.Map;
25+
import java.util.concurrent.CountDownLatch;
26+
import java.util.concurrent.atomic.AtomicReference;
27+
28+
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
29+
import static org.hamcrest.Matchers.equalTo;
30+
31+
@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0)
32+
public class EvilRecoveryIT extends ESIntegTestCase {
33+
34+
private static AtomicReference<CountDownLatch> indexLatch = new AtomicReference<>();
35+
private static AtomicReference<CountDownLatch> waitForOpsToCompleteLatch = new AtomicReference<>();
36+
37+
@Override
38+
protected Collection<Class<? extends Plugin>> nodePlugins() {
39+
return Collections.singletonList(LatchAnalysisPlugin.class);
40+
}
41+
42+
public static class LatchAnalysisPlugin extends Plugin implements AnalysisPlugin {
43+
44+
@Override
45+
public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
46+
return Collections.singletonMap("latch_analyzer", (a, b, c, d) -> new LatchAnalyzerProvider());
47+
}
48+
49+
}
50+
51+
static class LatchAnalyzerProvider implements AnalyzerProvider<LatchAnalyzer> {
52+
53+
@Override
54+
public String name() {
55+
return "latch_analyzer";
56+
}
57+
58+
@Override
59+
public AnalyzerScope scope() {
60+
return AnalyzerScope.INDICES;
61+
}
62+
63+
@Override
64+
public LatchAnalyzer get() {
65+
return new LatchAnalyzer();
66+
}
67+
68+
}
69+
70+
static class LatchAnalyzer extends Analyzer {
71+
72+
@Override
73+
protected TokenStreamComponents createComponents(final String fieldName) {
74+
return new TokenStreamComponents(new LatchTokenizer());
75+
}
76+
77+
}
78+
79+
static class LatchTokenizer extends Tokenizer {
80+
81+
@Override
82+
public final boolean incrementToken() throws IOException {
83+
try {
84+
if (indexLatch.get() != null) {
85+
// latch that all exected operations are in the engine
86+
indexLatch.get().countDown();
87+
}
88+
89+
if (waitForOpsToCompleteLatch.get() != null) {
90+
// latch that waits for the replica to restart and allows recovery to proceed
91+
waitForOpsToCompleteLatch.get().await();
92+
}
93+
94+
} catch (final InterruptedException e) {
95+
throw new RuntimeException(e);
96+
}
97+
return false;
98+
}
99+
100+
}
101+
102+
@Override
103+
protected Settings nodeSettings(final int nodeOrdinal) {
104+
final Settings nodeSettings = super.nodeSettings(nodeOrdinal);
105+
final int processors = randomIntBetween(1, 4);
106+
/*
107+
* We have to do this to ensure that there are sufficiently many threads to accept the indexing requests, otherwise operations will
108+
* instead be queued and never trip the latch that all operations are inside the engine.
109+
*/
110+
return Settings.builder().put(nodeSettings).put("processors", processors).put("thread_pool.bulk.size", 1 + processors).build();
111+
}
112+
113+
/*
114+
* This tests that sequence-number-based recoveries wait for in-flight operations to complete. The trick here is simple. We latch some
115+
* in-flight operations inside the engine after sequence numbers are assigned. While these operations are latched, we restart a replica.
116+
* Sequence-number-based recovery on this replica has to wait until these in-flight operations complete to proceed. We verify at the end
117+
* of recovery that a file-based recovery was not completed, and that the expected number of operations was replayed via the translog.
118+
*/
119+
public void testRecoveryWaitsForOps() throws Exception {
120+
final int docs = randomIntBetween(1, 64);
121+
final int numberOfProcessors = EsExecutors.numberOfProcessors(nodeSettings(0));
122+
final int latchedDocs = randomIntBetween(1, 1 + numberOfProcessors);
123+
124+
try {
125+
internalCluster().startMasterOnlyNode();
126+
final String primaryNode = internalCluster().startDataOnlyNode(nodeSettings(0));
127+
128+
// prepare mapping that uses our latch analyzer
129+
final XContentBuilder mapping = jsonBuilder();
130+
mapping.startObject();
131+
{
132+
mapping.startObject("type");
133+
{
134+
mapping.startObject("properties");
135+
{
136+
mapping.startObject("foo");
137+
{
138+
mapping.field("type", "text");
139+
mapping.field("analyzer", "latch_analyzer");
140+
mapping.endObject();
141+
}
142+
mapping.endObject();
143+
}
144+
mapping.endObject();
145+
}
146+
mapping.endObject();
147+
}
148+
149+
// create the index with our mapping
150+
client()
151+
.admin()
152+
.indices()
153+
.prepareCreate("index")
154+
.addMapping("type", mapping)
155+
.setSettings(Settings.builder().put("number_of_shards", 1))
156+
.get();
157+
158+
// start the replica node; we do this after creating the index so we can control which node is holds the primary shard
159+
final String replicaNode = internalCluster().startDataOnlyNode(nodeSettings(1));
160+
ensureGreen();
161+
162+
// index some documents so that the replica will attempt a sequence-number-based recovery upon restart
163+
for (int foo = 0; foo < docs; foo++) {
164+
index(randomFrom(primaryNode, replicaNode), foo);
165+
}
166+
167+
if (randomBoolean()) {
168+
client().admin().indices().flush(new FlushRequest()).get();
169+
}
170+
171+
// start some in-flight operations that will get latched in the engine
172+
final List<Thread> threads = new ArrayList<>();
173+
indexLatch.set(new CountDownLatch(latchedDocs));
174+
waitForOpsToCompleteLatch.set(new CountDownLatch(1));
175+
for (int i = docs; i < docs + latchedDocs; i++) {
176+
final int foo = i;
177+
// we have to index through the primary since we are going to restart the replica
178+
final Thread thread = new Thread(() -> index(primaryNode, foo));
179+
threads.add(thread);
180+
thread.start();
181+
}
182+
183+
// latch until all operations are inside the engine
184+
indexLatch.get().await();
185+
186+
internalCluster().restartNode(replicaNode, new InternalTestCluster.RestartCallback());
187+
188+
final Index index = resolveIndex("index");
189+
190+
// wait until recovery starts
191+
assertBusy(() -> {
192+
final IndicesService primaryService = internalCluster().getInstance(IndicesService.class, primaryNode);
193+
assertThat(primaryService.indexServiceSafe(index).getShard(0).recoveryStats().currentAsSource(), equalTo(1));
194+
final IndicesService replicaService = internalCluster().getInstance(IndicesService.class, replicaNode);
195+
assertThat(replicaService.indexServiceSafe(index).getShard(0).recoveryStats().currentAsTarget(), equalTo(1));
196+
}
197+
);
198+
199+
// unlatch the operations that are latched inside the engine
200+
waitForOpsToCompleteLatch.get().countDown();
201+
202+
for (final Thread thread : threads) {
203+
thread.join();
204+
}
205+
206+
// recovery should complete successfully
207+
ensureGreen();
208+
209+
// verify that a sequence-number-based recovery was completed
210+
final org.elasticsearch.action.admin.indices.recovery.RecoveryResponse response =
211+
client().admin().indices().prepareRecoveries("index").get();
212+
final List<RecoveryState> states = response.shardRecoveryStates().get("index");
213+
for (final RecoveryState state : states) {
214+
if (state.getTargetNode().getName().equals(replicaNode)) {
215+
assertThat(state.getTranslog().recoveredOperations(), equalTo(latchedDocs));
216+
assertThat(state.getIndex().recoveredFilesPercent(), equalTo(0f));
217+
}
218+
}
219+
} finally {
220+
internalCluster().close();
221+
}
222+
223+
}
224+
225+
private void index(final String node, final int foo) {
226+
client(node).prepareIndex("index", "type").setSource("{\"foo\":\"" + Integer.toString(foo) + "\"}").get();
227+
}
228+
229+
}

0 commit comments

Comments
 (0)