7 changes: 6 additions & 1 deletion core/src/consensus.rs
@@ -646,7 +646,9 @@ impl Tower {
// By the time we reach here, any ancestors of the `last_vote`,
// should have been filtered out, as they all have a descendant,
// namely the `last_vote` itself.
assert!(!last_vote_ancestors.contains(candidate_slot));
if !self.is_stray_last_vote() {
Contributor

Investigating why this is necessary, hopefully just a test issue 🙏

Contributor

@ryoqun, I commented this check out, but haven't been able to repro the failure you described. The test seems to be passing fine on several different machines.

assert!(!last_vote_ancestors.contains(candidate_slot));
}

// Evaluate which vote accounts in the bank are locked out
// in the interval candidate_slot..last_vote, which means
@@ -1182,6 +1184,9 @@ pub enum TowerError {

#[error("The tower is fatally inconsistent with blockstore: {0}")]
FatallyInconsistent(&'static str),

#[error("The tower is useless because of new hard fork: {0}")]
HardFork(Slot),
}

impl TowerError {
31 changes: 29 additions & 2 deletions core/src/validator.rs
@@ -720,11 +720,38 @@ fn post_process_restored_tower(
ledger_path: &Path,
bank_forks: &BankForks,
) -> Tower {
let mut should_require_tower = config.require_tower;

restored_tower
.and_then(|tower| {
let root_bank = bank_forks.root_bank();
let slot_history = root_bank.get_slot_history();
tower.adjust_lockouts_after_replay(root_bank.slot(), &slot_history)
let tower = tower.adjust_lockouts_after_replay(root_bank.slot(), &slot_history);

if let Some(wait_slot_for_supermajority) = config.wait_for_supermajority {
if root_bank.slot() == wait_slot_for_supermajority {
// intentionally fail to restore tower; we're supposedly in a new hard fork; past
// out-of-chain vote state doesn't make sense at all
Contributor Author

hmm. we need to check last_voted_slot?

Contributor

This is probably fine for now, but how about storing the shred version in the persistent vote file? I think that might be more robust, we can just discard it if the shred version has a mismatch.
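
A rough sketch of that suggestion, assuming a hypothetical persisted-tower type and field names (this is not the actual Solana API): keep the shred version next to the serialized tower, and drop the tower when it no longer matches the running cluster.

```rust
// Hypothetical sketch only; `Tower` and the struct below are placeholders.
struct Tower;

struct SavedTowerFile {
    shred_version: u16,
    tower: Tower,
}

fn restore_tower(saved: SavedTowerFile, expected_shred_version: u16) -> Option<Tower> {
    if saved.shred_version == expected_shred_version {
        Some(saved.tower)
    } else {
        // A mismatch usually means a hard fork happened; the old vote state
        // belongs to a different cluster incarnation, so discard it.
        None
    }
}
```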

Contributor Author

yeah. makes sense.

// what if --wait-for-supermajority is passed again when the validator restarts?
let message = format!("Hardfork is detected; discarding tower restoration result: {:?}", tower);
Contributor Author

Oops! The Debug output of the tower can be really big. It might not be wise to put it in the datapoint_error!; only add it to the error! below.

Contributor Author

nit: include wait_slot_for_supermajority instead.
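
Taking the two comments above together, the logging might be reshaped roughly as follows. This is only a sketch, and the final code in the PR may differ:

```rust
// Keep the metrics datapoint small: report only the hard fork slot, and leave
// the potentially huge Debug dump of the tower to the log file.
let message = format!(
    "Hard fork detected at slot {}; discarding tower restoration result",
    wait_slot_for_supermajority
);
datapoint_error!("tower_error", ("error", message, String),);
error!("{}: {:?}", message, tower);
```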

datapoint_error!(
"tower_error",
(
"error",
message,
String
),
);
error!("{}", message);

// unconditionally relax tower requirement so that we can always restore tower
// from root bank.
should_require_tower = false;
return Err(crate::consensus::TowerError::HardFork(wait_slot_for_supermajority));
}
}

tower
})
.unwrap_or_else(|err| {
let voting_has_been_active =
@@ -739,7 +766,7 @@
),
);
}
if config.require_tower && voting_has_been_active {
if should_require_tower && voting_has_been_active {
error!("Requested mandatory tower restore failed: {}", err);
error!(
"And there is an existing vote_account containing actual votes. \
10 changes: 10 additions & 0 deletions local-cluster/src/cluster.rs
@@ -40,5 +40,15 @@ pub trait Cluster {
fn get_contact_info(&self, pubkey: &Pubkey) -> Option<&ContactInfo>;
fn exit_node(&mut self, pubkey: &Pubkey) -> ClusterValidatorInfo;
fn restart_node(&mut self, pubkey: &Pubkey, cluster_validator_info: ClusterValidatorInfo);
fn create_restart_context(
&mut self,
pubkey: &Pubkey,
cluster_validator_info: &mut ClusterValidatorInfo,
) -> (solana_core::cluster_info::Node, Option<ContactInfo>);
fn restart_node_with_context(
cluster_validator_info: ClusterValidatorInfo,
restart_context: (solana_core::cluster_info::Node, Option<ContactInfo>),
) -> ClusterValidatorInfo;
fn add_restarted_node(&mut self, pubkey: &Pubkey, cluster_validator_info: ClusterValidatorInfo);
Comment on lines +43 to +52
Contributor Author

some ugly API is needed to work around the blocking Validator::new. ;)

fn exit_restart_node(&mut self, pubkey: &Pubkey, config: ValidatorConfig);
}
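
For reference, the intended call pattern of this split API is roughly the following sketch, mirroring how the new local-cluster test below uses it: build the restart context while holding the cluster lock, run the blocking restart on its own thread, and register the result afterwards. The identifiers `cluster`, `pubkey`, and `validator_info` are placeholders assumed to be in scope.

```rust
// Build the context while holding the cluster lock.
let restart_context = cluster
    .lock()
    .unwrap()
    .create_restart_context(&pubkey, &mut validator_info);

// Validator::new() can block in wait_for_supermajority, so keep the restart
// off the thread that holds the cluster lock.
let handle = std::thread::spawn(move || {
    LocalCluster::restart_node_with_context(validator_info, restart_context)
});

// In the hard fork test the join happens only after the other node is also
// restarted, since the first node blocks until supermajority is reached.
let restarted_info = handle.join().unwrap();
cluster
    .lock()
    .unwrap()
    .add_restarted_node(&pubkey, restarted_info);
```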
35 changes: 30 additions & 5 deletions local-cluster/src/local_cluster.rs
@@ -615,7 +615,11 @@ impl Cluster for LocalCluster {
node
}

fn restart_node(&mut self, pubkey: &Pubkey, mut cluster_validator_info: ClusterValidatorInfo) {
fn create_restart_context(
&mut self,
pubkey: &Pubkey,
cluster_validator_info: &mut ClusterValidatorInfo,
) -> (solana_core::cluster_info::Node, Option<ContactInfo>) {
// Update the stored ContactInfo for this node
let node = Node::new_localhost_with_pubkey(&pubkey);
cluster_validator_info.info.contact_info = node.info.clone();
@@ -627,10 +631,32 @@ impl Cluster for LocalCluster {
self.entry_point_info = node.info.clone();
None
} else {
Some(&self.entry_point_info)
Some(self.entry_point_info.clone())
}
};

(node, entry_point_info)
}

fn restart_node(&mut self, pubkey: &Pubkey, mut cluster_validator_info: ClusterValidatorInfo) {
let restart_context = self.create_restart_context(pubkey, &mut cluster_validator_info);
let cluster_validator_info =
Self::restart_node_with_context(cluster_validator_info, restart_context);
self.add_restarted_node(pubkey, cluster_validator_info);
}

fn add_restarted_node(
&mut self,
pubkey: &Pubkey,
cluster_validator_info: ClusterValidatorInfo,
) {
self.validators.insert(*pubkey, cluster_validator_info);
}

fn restart_node_with_context(
mut cluster_validator_info: ClusterValidatorInfo,
(node, entry_point_info): (Node, Option<ContactInfo>),
) -> ClusterValidatorInfo {
// Restart the node
let validator_info = &cluster_validator_info.info;
cluster_validator_info.config.account_paths =
@@ -641,12 +667,11 @@
&validator_info.ledger_path,
&validator_info.voting_keypair.pubkey(),
vec![validator_info.voting_keypair.clone()],
entry_point_info,
entry_point_info.as_ref(),
&cluster_validator_info.config,
);

cluster_validator_info.validator = Some(restarted_node);
self.validators.insert(*pubkey, cluster_validator_info);
cluster_validator_info
}

fn exit_restart_node(&mut self, pubkey: &Pubkey, validator_config: ValidatorConfig) {
116 changes: 116 additions & 0 deletions local-cluster/tests/local_cluster.rs
@@ -1984,6 +1984,122 @@ fn test_future_tower_master_slave() {
do_test_future_tower(ClusterMode::MasterSlave);
}

#[test]
#[serial]
fn test_hard_fork() {
Contributor Author

there was no test at all for the hard fork code, hence this generic test name can be justified.

solana_logger::setup();

// First set up the cluster with 2 nodes
let slots_per_epoch = 2048;
let node_stakes = vec![60, 40];

let validator_keys = vec![
"28bN3xyvrP4E8LwEgtLjhnkb7cY4amQb6DrYAbAYjgRV4GAGgkVM2K7wnxnAS7WDneuavza7x21MiafLu1HkwQt4",
"2saHBBoTkLMmttmPQP8KfBkcCw45S5cwtV3wTdGCscRC8uxdgvHxpHiWXKx4LvJjNJtnNcbSv5NdheokFFqnNDt8",
]
.iter()
.map(|s| (Arc::new(Keypair::from_base58_string(s)), true))
.take(node_stakes.len())
.collect::<Vec<_>>();
let validators = validator_keys
.iter()
.map(|(kp, _)| kp.pubkey())
.collect::<Vec<_>>();

let validator_a_pubkey = validators[0];
let validator_b_pubkey = validators[1];

let config = ClusterConfig {
cluster_lamports: 100_000,
node_stakes: node_stakes.clone(),
validator_configs: vec![ValidatorConfig::default(); node_stakes.len()],
validator_keys: Some(validator_keys),
slots_per_epoch,
stakers_slot_offset: slots_per_epoch,
skip_warmup_slots: true,
..ClusterConfig::default()
};
let cluster = std::sync::Arc::new(std::sync::Mutex::new(LocalCluster::new(&config)));

let val_a_ledger_path = cluster.lock().unwrap().ledger_path(&validator_a_pubkey);

loop {
sleep(Duration::from_millis(100));

if let Some(root) = root_in_tower(&val_a_ledger_path, &validator_a_pubkey) {
if root >= 15 {
Contributor

Let's call this root min_root, and then the hard_fork_slot below can be min_root - 5 to indicate the purpose is to be less than min_root
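
A sketch of the suggested naming, for illustration only:

```rust
// Wait until validator A has rooted a slot comfortably above the planned
// hard fork slot, and name the threshold to make that relationship explicit.
let min_root = 15;
loop {
    sleep(Duration::from_millis(100));
    if let Some(root) = root_in_tower(&val_a_ledger_path, &validator_a_pubkey) {
        if root >= min_root {
            break;
        }
    }
}

// ... later, place the hard fork clearly below the minimum observed root:
let hard_fork_slot = min_root - 5;
```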

break;
}
}
}

let mut validator_a_info = cluster.lock().unwrap().exit_node(&validator_a_pubkey);
let mut validator_b_info = cluster.lock().unwrap().exit_node(&validator_b_pubkey);

// setup hard fork!
let hard_fork_slot = 10;
let hard_fork_slots = Some(vec![hard_fork_slot]);
let mut hard_forks = solana_sdk::hard_forks::HardForks::default();
hard_forks.register(hard_fork_slot);

let expected_shred_version = solana_sdk::shred_version::compute_shred_version(
&cluster.lock().unwrap().genesis_config.hash(),
Some(&hard_forks),
);

validator_a_info.config.new_hard_forks = hard_fork_slots.clone();
validator_a_info.config.wait_for_supermajority = Some(hard_fork_slot);
validator_a_info.config.expected_shred_version = Some(expected_shred_version);

validator_b_info.config.new_hard_forks = hard_fork_slots;
validator_b_info.config.wait_for_supermajority = Some(hard_fork_slot);
validator_b_info.config.expected_shred_version = Some(expected_shred_version);

// restart validator A first
let cluster_for_a = cluster.clone();
// Spawn a thread because wait_for_supermajority blocks in Validator::new()!
let thread = std::thread::spawn(move || {
let restart_context = cluster_for_a
.lock()
.unwrap()
.create_restart_context(&validator_a_pubkey, &mut validator_a_info);
let restarted_validator_info =
LocalCluster::restart_node_with_context(validator_a_info, restart_context);
cluster_for_a
.lock()
.unwrap()
.add_restarted_node(&validator_a_pubkey, restarted_validator_info);
});

// test that validator A actually waits for supermajority
let mut last_vote = None;
for _ in 0..10 {
sleep(Duration::from_millis(1000));

let new_last_vote = last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey).unwrap();
if let Some(last_vote) = last_vote {
assert_eq!(last_vote, new_last_vote);
} else {
last_vote = Some(new_last_vote);
}
}

// restart validator B normally
cluster
.lock()
.unwrap()
.restart_node(&validator_b_pubkey, validator_b_info);

// validator A should now start so join its thread here
thread.join().unwrap();

// new slots should be rooted after hard-fork cluster relaunch
cluster
.lock()
.unwrap()
.check_for_new_roots(16, &"hard fork");
}

#[test]
#[serial]
fn test_no_optimistic_confirmation_violation_with_tower() {