Skip to content
This repository was archived by the owner on Jan 22, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 94 additions & 43 deletions core/src/consensus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ pub struct Tower {
// (This is a special field for slashing-free validator restart with edge cases).
// This could be emptied after some time; but left intact indefinitely for easier
// implementation
// Further, stray slot can be stale or not. `Stale` here means whether given
// bank_forks (=~ ledger) lacks the slot or not.
stray_restored_slot: Option<Slot>,
#[serde(skip)]
pub last_switch_threshold_check: Option<(Slot, SwitchForkDecision)>,
Expand Down Expand Up @@ -438,7 +440,7 @@ impl Tower {

// root may be forcibly set by arbitrary replay root slot, for example from a root
// after replaying a snapshot.
// Also, tower.root() couldn't be None; do_initialize_lockouts() ensures that.
// Also, tower.root() couldn't be None; initialize_lockouts() ensures that.
// Conceptually, every tower must have been constructed from a concrete starting point,
// which establishes the origin of trust (i.e. root) whether booting from genesis (slot 0) or
// snapshot (slot N). In other words, there should be no possibility a Tower doesn't have
Expand Down Expand Up @@ -517,7 +519,7 @@ impl Tower {
let last_vote_ancestors =
ancestors.get(&last_voted_slot).unwrap_or_else(|| {
if !self.is_stray_last_vote() {
// Unless last vote is stray, ancestors.get(last_voted_slot) must
// Unless last vote is stray and stale, ancestors.get(last_voted_slot) must
// return Some(_), justifying to panic! here.
// Also, adjust_lockouts_after_replay() correctly makes last_voted_slot None,
// if all saved votes are ancestors of replayed_root_slot. So this code shouldn't be
Expand All @@ -527,8 +529,8 @@ impl Tower {
// all of them.
panic!("no ancestors found with slot: {}", last_voted_slot);
} else {
// This condition shouldn't occur under normal validator operation, indicating
// something unusual happened.
// This condition (stale stray last vote) shouldn't occur under normal validator
// operation, indicating something unusual happened.
// Possible causes include: OS/HW crash, validator process crash, only saved tower
// is moved over to a new setup, etc...

Expand Down Expand Up @@ -829,17 +831,18 @@ impl Tower {
// The tower root can be older/newer if the validator booted from a newer/older snapshot, so
// tower lockouts may need adjustment
pub fn adjust_lockouts_after_replay(
self,
mut self,
replayed_root: Slot,
slot_history: &SlotHistory,
) -> Result<Self> {
// sanity assertions for roots
let tower_root = self.root();
info!(
"adjusting lockouts (after replay up to {}): {:?} tower root: {}",
"adjusting lockouts (after replay up to {}): {:?} tower root: {} replayed root: {}",
replayed_root,
self.voted_slots(),
tower_root,
replayed_root,
);
assert_eq!(slot_history.check(replayed_root), Check::Found);

Expand All @@ -860,30 +863,24 @@ impl Tower {
)
);

// return immediately if votes are empty...
if self.lockouts.votes.is_empty() {
return Ok(self);
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't like my previous coding (especially regarding the branched-off early bail-out code flow with Ok(_))... Well, I found out that I can steamline these a bit.

}

let last_voted_slot = self.last_voted_slot().unwrap();
if slot_history.check(last_voted_slot) == Check::TooOld {
// We could try hard to anchor with other older votes, but opt to simplify the
// following logic
return Err(TowerError::TooOldTower(
last_voted_slot,
slot_history.oldest(),
));
if let Some(last_voted_slot) = self.last_voted_slot() {
if slot_history.check(last_voted_slot) == Check::TooOld {
// We could try hard to anchor with other older votes, but opt to simplify the
// following logic
return Err(TowerError::TooOldTower(
last_voted_slot,
slot_history.oldest(),
));
}
self.adjust_lockouts_with_slot_history(slot_history)?;
self.initialize_root(replayed_root);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cool, I think this looks equivalent to me, much easier to read 😄

}

self.do_adjust_lockouts_after_replay(tower_root, replayed_root, slot_history)
Ok(self)
}

fn do_adjust_lockouts_after_replay(
mut self,
tower_root: Slot,
replayed_root: Slot,
slot_history: &SlotHistory,
) -> Result<Self> {
fn adjust_lockouts_with_slot_history(&mut self, slot_history: &SlotHistory) -> Result<()> {
let tower_root = self.root();
// retained slots will be consisted only from divergent slots
let mut retain_flags_for_each_vote_in_reverse: Vec<_> =
Vec::with_capacity(self.lockouts.votes.len());
Expand Down Expand Up @@ -931,7 +928,12 @@ impl Tower {
if !voting_from_genesis {
// Unless we're voting since genesis, slots_in_tower must always be older than last checked_slot
// including all vote slot and the root slot.
assert!(*slot_in_tower < checked_slot)
assert!(
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm adding tests before adjusting this assert!.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi! I'm catching this assertion when I'm running a new validator: #13128

*slot_in_tower < checked_slot,
"slot_in_tower({}) < checked_slot({})",
*slot_in_tower,
checked_slot
);
}
}

Expand Down Expand Up @@ -959,15 +961,10 @@ impl Tower {
retain_flags_for_each_vote_in_reverse.into_iter().rev();

let original_votes_len = self.lockouts.votes.len();
self.do_initialize_lockouts(replayed_root, move |_| {
retain_flags_for_each_vote.next().unwrap()
});
self.initialize_lockouts(move |_| retain_flags_for_each_vote.next().unwrap());

if self.lockouts.votes.is_empty() {
info!(
"All restored votes were behind replayed_root({}); resetting root_slot and last_vote in tower!",
replayed_root
);
info!("All restored votes were behind; resetting root_slot and last_vote in tower!");
// we might not have banks for those votes so just reset.
// That's because the votes may well past replayed_root
self.last_vote = Vote::default();
Expand All @@ -986,7 +983,7 @@ impl Tower {
self.stray_restored_slot = Some(self.last_vote.last_voted_slot().unwrap());
}

Ok(self)
Ok(())
}

fn initialize_lockouts_from_bank(
Expand All @@ -999,7 +996,8 @@ impl Tower {
let vote_state = VoteState::deserialize(&vote_account.data)
.expect("vote_account isn't a VoteState?");
self.lockouts = vote_state;
self.do_initialize_lockouts(root, |v| v.slot > root);
self.initialize_root(root);
self.initialize_lockouts(|v| v.slot > root);
trace!(
"Lockouts in tower for {} is initialized using bank {}",
self.node_pubkey,
Expand All @@ -1010,7 +1008,7 @@ impl Tower {
"vote account's node_pubkey doesn't match",
);
} else {
self.do_initialize_lockouts(root, |_| true);
self.initialize_root(root);
info!(
"vote account({}) not found in bank (slot={})",
vote_account_pubkey,
Expand All @@ -1019,13 +1017,16 @@ impl Tower {
}
}

fn do_initialize_lockouts<F: FnMut(&Lockout) -> bool>(&mut self, root: Slot, should_retain: F) {
// Updating root is needed to correctly restore from newly-saved tower for the next
// boot
self.lockouts.root_slot = Some(root);
fn initialize_lockouts<F: FnMut(&Lockout) -> bool>(&mut self, should_retain: F) {
self.lockouts.votes.retain(should_retain);
}

// Updating root is needed to correctly restore from newly-saved tower for the next
// boot
fn initialize_root(&mut self, root: Slot) {
self.lockouts.root_slot = Some(root);
}

pub fn get_filename(path: &Path, node_pubkey: &Pubkey) -> PathBuf {
path.join(format!("tower-{}", node_pubkey))
.with_extension("bin")
Expand Down Expand Up @@ -1176,7 +1177,7 @@ pub fn reconcile_blockstore_roots_with_tower(
"at least 1 parent slot must be found"
);

blockstore.set_roots(&new_roots)?
blockstore.set_roots(&new_roots)?;
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oddly enough, clippy and rustfmt doesn't seem to complain this. ;)

}
Ok(())
}
Expand Down Expand Up @@ -2796,7 +2797,7 @@ pub mod test {
}

#[test]
fn test_adjust_lockouts_after_relay_all_rooted_with_too_old() {
fn test_adjust_lockouts_after_replay_all_rooted_with_too_old() {
use solana_sdk::slot_history::MAX_ENTRIES;

let mut tower = Tower::new_for_tests(10, 0.9);
Expand Down Expand Up @@ -2999,4 +3000,54 @@ pub mod test {
"The tower is fatally inconsistent with blockstore: not too old once after got too old?"
);
}

#[test]
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm adding tests before adjusting this assert!.

#[should_panic(expected = "slot_in_tower(2) < checked_slot(1)")]
fn test_adjust_lockouts_after_replay_reversed_votes() {
let mut tower = Tower::new_for_tests(10, 0.9);
tower.lockouts.votes.push_back(Lockout::new(2));
tower.lockouts.votes.push_back(Lockout::new(1));
let vote = Vote::new(vec![1], Hash::default());
tower.last_vote = vote;

let mut slot_history = SlotHistory::default();
slot_history.add(0);
slot_history.add(2);

tower
.adjust_lockouts_after_replay(2, &slot_history)
.unwrap();
}

#[test]
#[should_panic(expected = "slot_in_tower(3) < checked_slot(3)")]
fn test_adjust_lockouts_after_replay_repeated_non_root_votes() {
let mut tower = Tower::new_for_tests(10, 0.9);
tower.lockouts.votes.push_back(Lockout::new(2));
tower.lockouts.votes.push_back(Lockout::new(3));
tower.lockouts.votes.push_back(Lockout::new(3));
let vote = Vote::new(vec![3], Hash::default());
tower.last_vote = vote;

let mut slot_history = SlotHistory::default();
slot_history.add(0);
slot_history.add(2);

tower
.adjust_lockouts_after_replay(2, &slot_history)
.unwrap();
}

#[test]
fn test_adjust_lockouts_after_replay_vote_on_genesis() {
let mut tower = Tower::new_for_tests(10, 0.9);
tower.lockouts.votes.push_back(Lockout::new(0));
let vote = Vote::new(vec![0], Hash::default());
tower.last_vote = vote;

let mut slot_history = SlotHistory::default();
slot_history.add(0);

assert!(tower.adjust_lockouts_after_replay(0, &slot_history).is_ok());
}
}
6 changes: 3 additions & 3 deletions core/src/heaviest_subtree_fork_choice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,7 @@ impl HeaviestSubtreeForkChoice {
let heaviest_slot_on_same_voted_fork = self.best_slot(last_voted_slot);
if heaviest_slot_on_same_voted_fork.is_none() {
if !tower.is_stray_last_vote() {
// Unless last vote is stray, self.bast_slot(last_voted_slot) must return
// Unless last vote is stray and stale, self.bast_slot(last_voted_slot) must return
// Some(_), justifying to panic! here.
// Also, adjust_lockouts_after_replay() correctly makes last_voted_slot None,
// if all saved votes are ancestors of replayed_root_slot. So this code shouldn't be
Expand All @@ -507,12 +507,12 @@ impl HeaviestSubtreeForkChoice {
// validator has been running, so we must be able to fetch best_slots for all of
// them.
panic!(
"a bank at last_voted_slot({}) is a frozen bank so must have been\
"a bank at last_voted_slot({}) is a frozen bank so must have been \
added to heaviest_subtree_fork_choice at time of freezing",
last_voted_slot,
)
} else {
// fork_infos doesn't have corresponding data for the stray restored last vote,
// fork_infos doesn't have corresponding data for the stale stray last vote,
// meaning some inconsistency between saved tower and ledger.
// (newer snapshot, or only a saved tower is moved over to new setup?)
return None;
Expand Down
2 changes: 1 addition & 1 deletion core/src/validator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -744,7 +744,7 @@ fn post_process_restored_tower(
error!("Requested mandatory tower restore failed: {}", err);
error!(
"And there is an existing vote_account containing actual votes. \
Aborting due to possible conflicting duplicate votes"
Aborting due to possible conflicting duplicate votes",
);
process::exit(1);
}
Expand Down