Skip to content

Commit cbdb02f

Browse files
committed
Forbid use of Thread.sleep in :server:internalClusterTest
Signed-off-by: Andrew Ross <[email protected]>
1 parent ba16b24 commit cbdb02f

File tree

41 files changed

+163
-119
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+163
-119
lines changed

buildSrc/src/main/java/org/opensearch/gradle/precommit/ForbiddenApisPrecommitPlugin.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,22 @@ public Void call(Object... names) {
136136
return null;
137137
}
138138
});
139+
// Add a closure to allow projects to optionally call `forbidSleep()` which will add the signatures
140+
// to forbid all usages of `Thread.sleep`
141+
ext.set("forbidSleep", new Closure<Void>(t) {
142+
@Override
143+
public Void call(Object... unused) {
144+
final List<String> signatures = new ArrayList<>();
145+
signatures.addAll(t.getSignatures());
146+
signatures.add(
147+
"java.lang.Thread#sleep(**) @ Fixed sleeps lead to non-deterministic test failures."
148+
+ " Poll for whatever condition you're waiting for."
149+
+ " Use helpers like `assertBusy` or the awaitility lib."
150+
);
151+
t.setSignatures(signatures);
152+
return null;
153+
}
154+
});
139155
// Use of the deprecated security manager APIs are pervasive so set them to warn
140156
// globally for all projects. Replacements for (most of) these APIs are available
141157
// so usages can move to the non-deprecated variants to avoid the warnings.

server/build.gradle

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,8 @@ tasks.named("testingConventions").configure {
169169
}
170170
}
171171

172+
tasks.named('forbiddenApisInternalClusterTest').configure { forbidSleep() }
173+
172174
// Set to current version by default
173175
def japicmpCompareTarget = System.getProperty("japicmp.compare.version")
174176
if (japicmpCompareTarget == null) { /* use latest released version */

server/src/internalClusterTest/java/org/opensearch/action/admin/cluster/shards/TransportCatShardsActionIT.java

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -95,27 +95,13 @@ public void testCatShardsWithTimeoutException() throws IOException, AssertionErr
9595
// Dropping master node to delay in cluster state call.
9696
internalCluster().stopRandomNode(InternalTestCluster.nameFilter(masterNodes.get(0)));
9797

98-
CountDownLatch latch = new CountDownLatch(2);
99-
new Thread(() -> {
100-
try {
101-
// Ensures the cancellation timeout expires.
102-
Thread.sleep(2000);
103-
// Starting master node to proceed in cluster state call.
104-
internalCluster().startClusterManagerOnlyNode(
105-
Settings.builder().put("node.name", masterNodes.get(0)).put(clusterManagerDataPathSettings).build()
106-
);
107-
latch.countDown();
108-
} catch (InterruptedException e) {
109-
throw new RuntimeException(e);
110-
}
111-
}).start();
112-
98+
CountDownLatch latch = new CountDownLatch(1);
11399
final CatShardsRequest shardsRequest = new CatShardsRequest();
114100
TimeValue timeoutInterval = timeValueMillis(1000);
115101
shardsRequest.setCancelAfterTimeInterval(timeoutInterval);
116102
shardsRequest.clusterManagerNodeTimeout(timeValueMillis(2500));
117103
shardsRequest.setIndices(Strings.EMPTY_ARRAY);
118-
client().execute(CatShardsAction.INSTANCE, shardsRequest, new ActionListener<CatShardsResponse>() {
104+
client().execute(CatShardsAction.INSTANCE, shardsRequest, new ActionListener<>() {
119105
@Override
120106
public void onResponse(CatShardsResponse catShardsResponse) {
121107
// onResponse should not be called.
@@ -132,7 +118,13 @@ public void onFailure(Exception e) {
132118
latch.countDown();
133119
}
134120
});
121+
135122
latch.await();
123+
124+
// Restart cluster manager to restore cluster and allow test to cleanup and exit
125+
internalCluster().startClusterManagerOnlyNode(
126+
Settings.builder().put("node.name", masterNodes.get(0)).put(clusterManagerDataPathSettings).build()
127+
);
136128
}
137129

138130
public void testListShardsWithHiddenIndex() throws Exception {

server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteCloneIndexIT.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import org.opensearch.action.admin.indices.stats.IndicesStatsResponse;
5050
import org.opensearch.cluster.metadata.RepositoryMetadata;
5151
import org.opensearch.cluster.routing.allocation.decider.EnableAllocationDecider;
52+
import org.opensearch.common.SuppressForbidden;
5253
import org.opensearch.common.settings.Settings;
5354
import org.opensearch.common.unit.TimeValue;
5455
import org.opensearch.core.common.unit.ByteSizeValue;
@@ -227,6 +228,7 @@ protected void setLowPriorityUploadRate(String repoName, String value) throws Ex
227228
createRepository(repoName, rmd.type(), settings);
228229
}
229230

231+
@SuppressForbidden(reason = "Waiting longer than timeout")
230232
public void testCreateCloneIndexFailure() throws ExecutionException, InterruptedException {
231233
asyncUploadMockFsRepo = false;
232234
Version version = VersionUtils.randomIndexCompatibleVersion(random());

server/src/internalClusterTest/java/org/opensearch/action/ingest/AsyncIngestProcessorIT.java

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -136,13 +136,6 @@ public Map<String, Processor.Factory> getProcessors(Processor.Parameters paramet
136136
public void execute(IngestDocument ingestDocument, BiConsumer<IngestDocument, Exception> handler) {
137137
threadPool.generic().execute(() -> {
138138
String id = (String) ingestDocument.getSourceAndMetadata().get("_id");
139-
if (usually()) {
140-
try {
141-
Thread.sleep(10);
142-
} catch (InterruptedException e) {
143-
// ignore
144-
}
145-
}
146139
ingestDocument.setFieldValue("foo", "bar-" + id);
147140
handler.accept(ingestDocument, null);
148141
});

server/src/internalClusterTest/java/org/opensearch/cluster/ClusterInfoServiceIT.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
import org.opensearch.cluster.routing.ShardRouting;
4545
import org.opensearch.cluster.routing.allocation.decider.EnableAllocationDecider;
4646
import org.opensearch.cluster.service.ClusterService;
47+
import org.opensearch.common.SuppressForbidden;
4748
import org.opensearch.common.settings.Settings;
4849
import org.opensearch.common.unit.TimeValue;
4950
import org.opensearch.core.action.ActionListener;
@@ -149,6 +150,7 @@ private void setClusterInfoTimeout(String timeValue) {
149150
);
150151
}
151152

153+
@SuppressForbidden(reason = "Fixed sleeps are prone to flakiness but no documented failures here")
152154
public void testClusterInfoServiceCollectsInformation() throws InterruptedException {
153155
Settings.Builder settingsBuilder = Settings.builder()
154156
.put(ResourceTrackerSettings.GLOBAL_JVM_USAGE_AC_WINDOW_DURATION_SETTING.getKey(), TimeValue.timeValueSeconds(1))
@@ -250,6 +252,7 @@ public void testClusterInfoServiceCollectsFileCacheInformation() {
250252
}
251253
}
252254

255+
@SuppressForbidden(reason = "Fixed sleeps are prone to flakiness but no documented failures here")
253256
public void testClusterInfoServiceCollectsNodeResourceStatsInformation() throws InterruptedException {
254257

255258
// setting time window as ResourceUsageTracker needs atleast both of these to be ready to start collecting the
@@ -279,6 +282,7 @@ public void testClusterInfoServiceCollectsNodeResourceStatsInformation() throws
279282
assertEquals(2, nodeResourceUsageStats.size());
280283
}
281284

285+
@SuppressForbidden(reason = "Fixed sleeps are prone to flakiness but no documented failures here")
282286
public void testClusterInfoServiceInformationClearOnError() throws InterruptedException {
283287
internalCluster().startNodes(
284288
2,

server/src/internalClusterTest/java/org/opensearch/cluster/coordination/NodeJoinLeftIT.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import org.opensearch.cluster.NodeConnectionsService;
4141
import org.opensearch.cluster.metadata.IndexMetadata;
4242
import org.opensearch.cluster.service.ClusterService;
43+
import org.opensearch.common.SuppressForbidden;
4344
import org.opensearch.common.settings.Settings;
4445
import org.opensearch.index.MockEngineFactoryPlugin;
4546
import org.opensearch.indices.recovery.RecoverySettings;
@@ -75,6 +76,7 @@
7576
https://github.com/opensearch-project/OpenSearch/pull/15521 for context
7677
*/
7778
@ClusterScope(scope = Scope.TEST, numDataNodes = 0)
79+
@SuppressForbidden(reason = "Pending fix: https://github.com/opensearch-project/OpenSearch/issues/18972")
7880
public class NodeJoinLeftIT extends OpenSearchIntegTestCase {
7981

8082
private TestLogsAppender testLogsAppender;

server/src/internalClusterTest/java/org/opensearch/cluster/coordination/RareClusterStateIT.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
import org.opensearch.cluster.routing.allocation.AllocationService;
5454
import org.opensearch.cluster.routing.allocation.ExistingShardsAllocator;
5555
import org.opensearch.cluster.service.ClusterService;
56+
import org.opensearch.common.SuppressForbidden;
5657
import org.opensearch.common.action.ActionFuture;
5758
import org.opensearch.common.settings.Settings;
5859
import org.opensearch.common.unit.TimeValue;
@@ -219,6 +220,7 @@ public void testDeleteCreateInOneBulk() throws Exception {
219220
assertHitCount(client().prepareSearch("test").get(), 0);
220221
}
221222

223+
@SuppressForbidden(reason = "Best effort sleep, not critical to test logic")
222224
public void testDelayedMappingPropagationOnPrimary() throws Exception {
223225
// Here we want to test that things go well if there is a first request
224226
// that adds mappings but before mappings are propagated to all nodes
@@ -309,6 +311,7 @@ public void testDelayedMappingPropagationOnPrimary() throws Exception {
309311
assertEquals(1, docIndexResponse.get(10, TimeUnit.SECONDS).getShardInfo().getTotal());
310312
}
311313

314+
@SuppressForbidden(reason = "Best effort sleep, not critical to test logic")
312315
public void testDelayedMappingPropagationOnReplica() throws Exception {
313316
// This is essentially the same thing as testDelayedMappingPropagationOnPrimary
314317
// but for replicas

server/src/internalClusterTest/java/org/opensearch/cluster/routing/WeightedRoutingIT.java

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -335,15 +335,14 @@ public void testGetWeightedRouting_ClusterManagerNotDiscovered() throws Exceptio
335335
logger.info("--> network disruption is started");
336336
networkDisruption.startDisrupting();
337337

338-
// wait for leader checker to fail
339-
Thread.sleep(13000);
340-
341338
// get api to fetch local weighted routing for a node in zone a or b
342-
ClusterGetWeightedRoutingResponse weightedRoutingResponse = internalCluster().client(
343-
randomFrom(nodes_in_zone_a.get(0), nodes_in_zone_b.get(0))
344-
).admin().cluster().prepareGetWeightedRouting().setAwarenessAttribute("zone").setRequestLocal(true).get();
345-
assertEquals(weightedRouting, weightedRoutingResponse.weights());
346-
assertFalse(weightedRoutingResponse.getDiscoveredClusterManager());
339+
assertBusy(() -> {
340+
ClusterGetWeightedRoutingResponse weightedRoutingResponse = internalCluster().client(
341+
randomFrom(nodes_in_zone_a.get(0), nodes_in_zone_b.get(0))
342+
).admin().cluster().prepareGetWeightedRouting().setAwarenessAttribute("zone").setRequestLocal(true).get();
343+
assertEquals(weightedRouting, weightedRoutingResponse.weights());
344+
assertFalse(weightedRoutingResponse.getDiscoveredClusterManager());
345+
}, 13, TimeUnit.SECONDS);
347346
logger.info("--> network disruption is stopped");
348347
networkDisruption.stopDisrupting();
349348

server/src/internalClusterTest/java/org/opensearch/cluster/service/ClusterServiceIT.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.opensearch.cluster.ClusterStateUpdateTask;
3939
import org.opensearch.cluster.node.DiscoveryNode;
4040
import org.opensearch.common.Nullable;
41+
import org.opensearch.common.SuppressForbidden;
4142
import org.opensearch.common.settings.Settings;
4243
import org.opensearch.common.unit.TimeValue;
4344
import org.opensearch.test.OpenSearchIntegTestCase;
@@ -55,6 +56,8 @@
5556
import static org.hamcrest.Matchers.equalTo;
5657
import static org.hamcrest.Matchers.greaterThan;
5758
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
59+
import static org.hamcrest.Matchers.hasSize;
60+
import static org.awaitility.Awaitility.await;
5861

5962
@ClusterScope(scope = Scope.TEST, numDataNodes = 0)
6063
public class ClusterServiceIT extends OpenSearchIntegTestCase {
@@ -350,6 +353,7 @@ public void testPendingUpdateTask() throws Exception {
350353
final CountDownLatch invoked1 = new CountDownLatch(1);
351354
clusterService.submitStateUpdateTask("1", new ClusterStateUpdateTask() {
352355
@Override
356+
@SuppressForbidden(reason = "Sleeping to guarantee a >0 time metric calculation")
353357
public ClusterState execute(ClusterState currentState) {
354358
try {
355359
Thread.sleep(50);
@@ -458,10 +462,11 @@ public void onFailure(String source, Exception e) {
458462
}
459463
});
460464
}
461-
Thread.sleep(100);
462465

463-
pendingClusterTasks = clusterService.getClusterManagerService().pendingTasks();
464-
assertThat(pendingClusterTasks.size(), greaterThanOrEqualTo(5));
466+
pendingClusterTasks = await().until(
467+
() -> clusterService.getClusterManagerService().pendingTasks(),
468+
hasSize(greaterThanOrEqualTo(5))
469+
);
465470
controlSources = new HashSet<>(Arrays.asList("1", "2", "3", "4", "5"));
466471
for (PendingClusterTask task : pendingClusterTasks) {
467472
controlSources.remove(task.getSource().string());

0 commit comments

Comments
 (0)