Skip to content

Commit c65af26

Browse files
committed
Make all aborts explicit events
1 parent a96194e commit c65af26

File tree

3 files changed

+50
-30
lines changed

3 files changed

+50
-30
lines changed

trust-quorum/test-utils/src/nexus.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -126,13 +126,6 @@ impl NexusState {
126126
(&config.coordinator, config.to_reconfigure_msg(self.rack_id))
127127
}
128128

129-
/// Abort the latest reconfiguration attempt
130-
pub fn abort_reconfiguration(&mut self) {
131-
let config = self.configs.iter().last().expect("at least one config");
132-
// Can only abort while preparing
133-
assert_eq!(config.op, NexusOp::Preparing);
134-
}
135-
136129
pub fn latest_config(&self) -> &NexusConfig {
137130
self.configs.iter().last().expect("at least one config")
138131
}

trust-quorum/test-utils/src/state.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,7 @@ impl TqState {
9191
pub fn send_reconfigure_msg(&mut self) {
9292
let (coordinator, msg) = self.nexus.reconfigure_msg_for_latest_config();
9393
let epoch_to_config = msg.epoch;
94-
if self.faults.crashed_nodes.contains(coordinator) {
95-
// We must abort the configuration. This mimics a timeout.
96-
self.nexus.abort_reconfiguration();
97-
} else {
94+
if !self.faults.crashed_nodes.contains(coordinator) {
9895
let (node, ctx) = self
9996
.sut
10097
.nodes
@@ -369,6 +366,9 @@ impl TqState {
369366
id: PlatformId,
370367
connection_order: Vec<PlatformId>,
371368
) {
369+
// The node is no longer crashed.
370+
self.faults.crashed_nodes.remove(&id);
371+
372372
// We need to clear the mutable state of the `Node`. We do this by
373373
// creating a new `Node` and passing in the existing context which
374374
// contains the persistent state.

trust-quorum/tests/cluster.rs

Lines changed: 46 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,11 @@ impl TestState {
4242
TestState { tq_state: TqState::new(log), skipped_actions: 0 }
4343
}
4444

45-
fn initial_config_event(
45+
fn initial_config_events(
4646
&self,
4747
config: GeneratedConfiguration,
4848
down_nodes: BTreeSet<usize>,
49-
) -> Event {
49+
) -> Vec<Event> {
5050
// `tq_state` doesn't create the member universe until the first event is
5151
// applied. We duplicate it here so we can create that initial config
5252
// event.
@@ -65,22 +65,28 @@ impl TestState {
6565
let coordinator =
6666
members.first().cloned().expect("at least one member");
6767
let last_committed_epoch = None;
68+
let crashed_nodes: BTreeSet<_> = down_nodes
69+
.into_iter()
70+
.map(|index| member_universe[index].clone())
71+
.collect();
72+
let should_abort = crashed_nodes.contains(&coordinator);
6873
let config = NexusConfig::new(
6974
epoch,
7075
last_committed_epoch,
7176
coordinator,
7277
members,
7378
threshold,
7479
);
75-
let crashed_nodes = down_nodes
76-
.into_iter()
77-
.map(|index| member_universe[index].clone())
78-
.collect();
79-
Event::InitialSetup {
80+
let mut events = vec![Event::InitialSetup {
8081
member_universe_size: MEMBER_UNIVERSE_SIZE,
8182
config,
8283
crashed_nodes,
84+
}];
85+
86+
if should_abort {
87+
events.push(Event::AbortConfiguration(epoch));
8388
}
89+
events
8490
}
8591

8692
// Execute the proptest generated actions
@@ -195,7 +201,20 @@ impl TestState {
195201
}
196202

197203
let id = selector.select(faultable).clone();
198-
vec![Event::CrashNode(id)]
204+
let latest_config = self.tq_state.nexus.latest_config();
205+
if id == latest_config.coordinator
206+
&& latest_config.op == NexusOp::Preparing
207+
{
208+
// The `AbortConfiguration` simulates Nexus polling and timing
209+
// out or receiving an error response on node restart because the
210+
// configuration was lost.
211+
vec![
212+
Event::CrashNode(id.clone()),
213+
Event::AbortConfiguration(latest_config.epoch),
214+
]
215+
} else {
216+
vec![Event::CrashNode(id.clone())]
217+
}
199218
}
200219

201220
fn action_to_events_restart_node(
@@ -309,8 +328,7 @@ impl TestState {
309328
return events;
310329
}
311330

312-
// If the coordinator has crashed then Nexus should abort.
313-
// Crashing is not actually implemented yet, but it will be.
331+
// If the coordinator is currently down then Nexus should abort.
314332
if self
315333
.tq_state
316334
.faults
@@ -346,9 +364,9 @@ impl TestState {
346364
//
347365
// In a real system this request would go over the network, but would
348366
// end up at the same place.
349-
let cs = coordinator
350-
.get_coordinator_state()
351-
.expect("coordinator is coordinating");
367+
let cs = coordinator.get_coordinator_state().unwrap_or_else(|| {
368+
panic!("coordinator is coordinating: {}", ctx.platform_id())
369+
});
352370

353371
// Put the reply on the network
354372
events.push(Event::SendNexusReplyOnUnderlay(
@@ -510,11 +528,18 @@ impl TestState {
510528
let nexus_config = NexusConfig::new(
511529
epoch,
512530
last_committed_epoch,
513-
coordinator,
531+
coordinator.clone(),
514532
new_members,
515533
threshold,
516534
);
517-
vec![Event::Reconfigure(nexus_config)]
535+
let mut events = vec![Event::Reconfigure(nexus_config)];
536+
537+
if self.tq_state.faults.crashed_nodes.contains(&coordinator) {
538+
// This simulates a timeout on the reply from the coordinator which
539+
// triggers an abort.
540+
events.push(Event::AbortConfiguration(epoch));
541+
}
542+
events
518543
}
519544

520545
/// At every point during the running of the test, invariants over the system
@@ -885,10 +910,12 @@ fn test_trust_quorum_protocol(input: TestInput) {
885910
let mut state = TestState::new(log.clone());
886911

887912
// Perform the initial setup
888-
let event = state
889-
.initial_config_event(input.initial_config, input.initial_down_nodes);
890-
event_log.record(&event);
891-
state.tq_state.apply_event(event);
913+
let events = state
914+
.initial_config_events(input.initial_config, input.initial_down_nodes);
915+
for event in events {
916+
event_log.record(&event);
917+
state.tq_state.apply_event(event);
918+
}
892919

893920
// Start executing the actions
894921
state.run_actions(input.actions, &mut event_log)?;

0 commit comments

Comments
 (0)