Skip to content

Commit 9223335

Browse files
author
Mark Travis
committed
Several changes to improve Consensus stability:
* Verify accepted ledger becomes validated, and retry with a new consensus transaction set if not. * Always store proposals. * Track proposals by ledger sequence. This helps slow peers catch up with the rest of the network. * Acquire transaction sets for proposals with future ledger sequences. This also helps slow peers catch up. * Optimize timer delay for establish phase to wait based on how long validators have been sending proposals. This also helps slow peers to catch up. * Fix impasse achieving close time consensus. * Don't wait between open and establish phases.
1 parent c500396 commit 9223335

19 files changed

+811
-182
lines changed

src/ripple/app/consensus/RCLConsensus.cpp

+75-40
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ RCLConsensus::RCLConsensus(
5555
LedgerMaster& ledgerMaster,
5656
LocalTxs& localTxs,
5757
InboundTransactions& inboundTransactions,
58-
Consensus<Adaptor>::clock_type const& clock,
58+
Consensus<Adaptor>::clock_type& clock,
5959
ValidatorKeys const& validatorKeys,
6060
beast::Journal journal)
6161
: adaptor_(
@@ -171,6 +171,9 @@ RCLConsensus::Adaptor::share(RCLCxPeerPos const& peerPos)
171171
auto const sig = peerPos.signature();
172172
prop.set_signature(sig.data(), sig.size());
173173

174+
if (proposal.ledgerSeq().has_value())
175+
prop.set_ledgerseq(*proposal.ledgerSeq());
176+
174177
app_.overlay().relay(prop, peerPos.suppressionID(), peerPos.publicKey());
175178
}
176179

@@ -180,7 +183,7 @@ RCLConsensus::Adaptor::share(RCLCxTx const& tx)
180183
// If we didn't relay this transaction recently, relay it to all peers
181184
if (app_.getHashRouter().shouldRelay(tx.id()))
182185
{
183-
JLOG(j_.debug()) << "Relaying disputed tx " << tx.id();
186+
JLOG(j_.trace()) << "Relaying disputed tx " << tx.id();
184187
auto const slice = tx.tx_->slice();
185188
protocol::TMTransaction msg;
186189
msg.set_rawtransaction(slice.data(), slice.size());
@@ -192,13 +195,13 @@ RCLConsensus::Adaptor::share(RCLCxTx const& tx)
192195
}
193196
else
194197
{
195-
JLOG(j_.debug()) << "Not relaying disputed tx " << tx.id();
198+
JLOG(j_.trace()) << "Not relaying disputed tx " << tx.id();
196199
}
197200
}
198201
void
199202
RCLConsensus::Adaptor::propose(RCLCxPeerPos::Proposal const& proposal)
200203
{
201-
JLOG(j_.trace()) << (proposal.isBowOut() ? "We bow out: " : "We propose: ")
204+
JLOG(j_.debug()) << (proposal.isBowOut() ? "We bow out: " : "We propose: ")
202205
<< ripple::to_string(proposal.prevLedger()) << " -> "
203206
<< ripple::to_string(proposal.position());
204207

@@ -212,6 +215,7 @@ RCLConsensus::Adaptor::propose(RCLCxPeerPos::Proposal const& proposal)
212215
prop.set_closetime(proposal.closeTime().time_since_epoch().count());
213216
prop.set_nodepubkey(
214217
validatorKeys_.publicKey.data(), validatorKeys_.publicKey.size());
218+
prop.set_ledgerseq(*proposal.ledgerSeq());
215219

216220
auto sig = signDigest(
217221
validatorKeys_.publicKey,
@@ -297,7 +301,8 @@ auto
297301
RCLConsensus::Adaptor::onClose(
298302
RCLCxLedger const& ledger,
299303
NetClock::time_point const& closeTime,
300-
ConsensusMode mode) -> Result
304+
ConsensusMode mode,
305+
clock_type& clock) -> Result
301306
{
302307
const bool wrongLCL = mode == ConsensusMode::wrongLedger;
303308
const bool proposing = mode == ConsensusMode::proposing;
@@ -379,7 +384,7 @@ RCLConsensus::Adaptor::onClose(
379384

380385
// Needed because of the move below.
381386
auto const setHash = initialSet->getHash().as_uint256();
382-
387+
initialLedger->info().seq;
383388
return Result{
384389
std::move(initialSet),
385390
RCLCxPeerPos::Proposal{
@@ -388,7 +393,9 @@ RCLConsensus::Adaptor::onClose(
388393
setHash,
389394
closeTime,
390395
app_.timeKeeper().closeTime(),
391-
validatorKeys_.nodeID}};
396+
validatorKeys_.nodeID,
397+
initialLedger->info().seq,
398+
clock}};
392399
}
393400

394401
void
@@ -400,50 +407,43 @@ RCLConsensus::Adaptor::onForceAccept(
400407
ConsensusMode const& mode,
401408
Json::Value&& consensusJson)
402409
{
403-
doAccept(
404-
result,
405-
prevLedger,
406-
closeResolution,
407-
rawCloseTimes,
408-
mode,
409-
std::move(consensusJson));
410+
auto txsBuilt = buildAndValidate(
411+
result, prevLedger, closeResolution, mode, std::move(consensusJson));
412+
prepareOpenLedger(std::move(txsBuilt), result, rawCloseTimes, mode);
410413
}
411414

412415
void
413416
RCLConsensus::Adaptor::onAccept(
414417
Result const& result,
415-
RCLCxLedger const& prevLedger,
416-
NetClock::duration const& closeResolution,
417418
ConsensusCloseTimes const& rawCloseTimes,
418419
ConsensusMode const& mode,
419-
Json::Value&& consensusJson)
420+
Json::Value&& consensusJson,
421+
std::pair<CanonicalTxSet_t, Ledger_t>&& tb)
420422
{
421423
app_.getJobQueue().addJob(
422424
jtACCEPT,
423425
"acceptLedger",
424-
[=, this, cj = std::move(consensusJson)]() mutable {
426+
[=,
427+
this,
428+
cj = std::move(consensusJson),
429+
txsBuilt = std::move(tb)]() mutable {
425430
// Note that no lock is held or acquired during this job.
426431
// This is because generic Consensus guarantees that once a ledger
427432
// is accepted, the consensus results and capture by reference state
428433
// will not change until startRound is called (which happens via
429434
// endConsensus).
430-
this->doAccept(
431-
result,
432-
prevLedger,
433-
closeResolution,
434-
rawCloseTimes,
435-
mode,
436-
std::move(cj));
435+
prepareOpenLedger(std::move(txsBuilt), result, rawCloseTimes, mode);
437436
this->app_.getOPs().endConsensus();
438437
});
439438
}
440439

441-
void
442-
RCLConsensus::Adaptor::doAccept(
440+
std::pair<
441+
RCLConsensus::Adaptor::CanonicalTxSet_t,
442+
RCLConsensus::Adaptor::Ledger_t>
443+
RCLConsensus::Adaptor::buildAndValidate(
443444
Result const& result,
444-
RCLCxLedger const& prevLedger,
445-
NetClock::duration closeResolution,
446-
ConsensusCloseTimes const& rawCloseTimes,
445+
Ledger_t const& prevLedger,
446+
NetClock::duration const& closeResolution,
447447
ConsensusMode const& mode,
448448
Json::Value&& consensusJson)
449449
{
@@ -497,12 +497,12 @@ RCLConsensus::Adaptor::doAccept(
497497
{
498498
retriableTxs.insert(
499499
std::make_shared<STTx const>(SerialIter{item.slice()}));
500-
JLOG(j_.debug()) << " Tx: " << item.key();
500+
JLOG(j_.trace()) << " Tx: " << item.key();
501501
}
502502
catch (std::exception const& ex)
503503
{
504504
failed.insert(item.key());
505-
JLOG(j_.warn())
505+
JLOG(j_.trace())
506506
<< " Tx: " << item.key() << " throws: " << ex.what();
507507
}
508508
}
@@ -579,6 +579,19 @@ RCLConsensus::Adaptor::doAccept(
579579
ledgerMaster_.consensusBuilt(
580580
built.ledger_, result.txns.id(), std::move(consensusJson));
581581

582+
return {retriableTxs, built};
583+
}
584+
585+
void
586+
RCLConsensus::Adaptor::prepareOpenLedger(
587+
std::pair<CanonicalTxSet_t, Ledger_t>&& txsBuilt,
588+
Result const& result,
589+
ConsensusCloseTimes const& rawCloseTimes,
590+
ConsensusMode const& mode)
591+
{
592+
auto& retriableTxs = txsBuilt.first;
593+
auto const& built = txsBuilt.second;
594+
582595
//-------------------------------------------------------------------------
583596
{
584597
// Apply disputed transactions that didn't get in
@@ -601,7 +614,7 @@ RCLConsensus::Adaptor::doAccept(
601614
// we voted NO
602615
try
603616
{
604-
JLOG(j_.debug())
617+
JLOG(j_.trace())
605618
<< "Test applying disputed transaction that did"
606619
<< " not get in " << dispute.tx().id();
607620

@@ -619,7 +632,7 @@ RCLConsensus::Adaptor::doAccept(
619632
}
620633
catch (std::exception const& ex)
621634
{
622-
JLOG(j_.debug()) << "Failed to apply transaction we voted "
635+
JLOG(j_.trace()) << "Failed to apply transaction we voted "
623636
"NO on. Exception: "
624637
<< ex.what();
625638
}
@@ -669,6 +682,7 @@ RCLConsensus::Adaptor::doAccept(
669682
// we entered the round with the network,
670683
// see how close our close time is to other node's
671684
// close time reports, and update our clock.
685+
const bool consensusFail = result.state == ConsensusState::MovedOn;
672686
if ((mode == ConsensusMode::proposing ||
673687
mode == ConsensusMode::observing) &&
674688
!consensusFail)
@@ -889,12 +903,32 @@ RCLConsensus::Adaptor::onModeChange(ConsensusMode before, ConsensusMode after)
889903
mode_ = after;
890904
}
891905

906+
bool
907+
RCLConsensus::Adaptor::retryAccept(
908+
Ledger_t const& newLedger,
909+
std::optional<std::chrono::time_point<std::chrono::steady_clock>>& start)
910+
const
911+
{
912+
static bool const standalone = ledgerMaster_.standalone();
913+
auto const& validLedger = ledgerMaster_.getValidatedLedger();
914+
915+
return (app_.getOPs().isFull() && !standalone &&
916+
(validLedger && (newLedger.id() != validLedger->info().hash) &&
917+
(newLedger.seq() >= validLedger->info().seq))) &&
918+
(!start ||
919+
std::chrono::duration_cast<std::chrono::seconds>(
920+
std::chrono::steady_clock::now() - *start)
921+
.count() < 5);
922+
}
923+
924+
//-----------------------------------------------------------------------------
925+
892926
Json::Value
893927
RCLConsensus::getJson(bool full) const
894928
{
895929
Json::Value ret;
896930
{
897-
std::lock_guard _{mutex_};
931+
std::lock_guard _{adaptor_.peekMutex()};
898932
ret = consensus_.getJson(full);
899933
}
900934
ret["validating"] = adaptor_.validating();
@@ -906,7 +940,7 @@ RCLConsensus::timerEntry(NetClock::time_point const& now)
906940
{
907941
try
908942
{
909-
std::lock_guard _{mutex_};
943+
std::lock_guard _{adaptor_.peekMutex()};
910944
consensus_.timerEntry(now);
911945
}
912946
catch (SHAMapMissingNode const& mn)
@@ -922,7 +956,7 @@ RCLConsensus::gotTxSet(NetClock::time_point const& now, RCLTxSet const& txSet)
922956
{
923957
try
924958
{
925-
std::lock_guard _{mutex_};
959+
std::lock_guard _{adaptor_.peekMutex()};
926960
consensus_.gotTxSet(now, txSet);
927961
}
928962
catch (SHAMapMissingNode const& mn)
@@ -940,7 +974,7 @@ RCLConsensus::simulate(
940974
NetClock::time_point const& now,
941975
std::optional<std::chrono::milliseconds> consensusDelay)
942976
{
943-
std::lock_guard _{mutex_};
977+
std::lock_guard _{adaptor_.peekMutex()};
944978
consensus_.simulate(now, consensusDelay);
945979
}
946980

@@ -949,7 +983,7 @@ RCLConsensus::peerProposal(
949983
NetClock::time_point const& now,
950984
RCLCxPeerPos const& newProposal)
951985
{
952-
std::lock_guard _{mutex_};
986+
std::lock_guard _{adaptor_.peekMutex()};
953987
return consensus_.peerProposal(now, newProposal);
954988
}
955989

@@ -1051,12 +1085,13 @@ RCLConsensus::startRound(
10511085
hash_set<NodeID> const& nowUntrusted,
10521086
hash_set<NodeID> const& nowTrusted)
10531087
{
1054-
std::lock_guard _{mutex_};
1088+
std::lock_guard _{adaptor_.peekMutex()};
10551089
consensus_.startRound(
10561090
now,
10571091
prevLgrId,
10581092
prevLgr,
10591093
nowUntrusted,
10601094
adaptor_.preStartRound(prevLgr, nowTrusted));
10611095
}
1096+
10621097
} // namespace ripple

0 commit comments

Comments
 (0)