Skip to content

Commit e87a81b

Browse files
Mark Travisximinez
Mark Travis
authored andcommitted
Several changes to improve Consensus stability: (XRPLF#4505)
* Verify accepted ledger becomes validated, and retry with a new consensus transaction set if not. * Always store proposals. * Track proposals by ledger sequence. This helps slow peers catch up with the rest of the network. * Acquire transaction sets for proposals with future ledger sequences. This also helps slow peers catch up. * Optimize timer delay for establish phase to wait based on how long validators have been sending proposals. This also helps slow peers to catch up. * Fix impasse achieving close time consensus. * Don't wait between open and establish phases.
1 parent 319c1ee commit e87a81b

20 files changed

+910
-187
lines changed

Builds/levelization/results/ordering.txt

+3
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ ripple.consensus > ripple.basics
1414
ripple.consensus > ripple.beast
1515
ripple.consensus > ripple.json
1616
ripple.consensus > ripple.protocol
17+
ripple.consensus > ripple.shamap
1718
ripple.core > ripple.beast
1819
ripple.core > ripple.json
1920
ripple.core > ripple.protocol
@@ -125,11 +126,13 @@ test.core > ripple.server
125126
test.core > test.jtx
126127
test.core > test.toplevel
127128
test.core > test.unit_test
129+
test.csf > ripple.app
128130
test.csf > ripple.basics
129131
test.csf > ripple.beast
130132
test.csf > ripple.consensus
131133
test.csf > ripple.json
132134
test.csf > ripple.protocol
135+
test.csf > test.jtx
133136
test.json > ripple.beast
134137
test.json > ripple.json
135138
test.json > test.jtx

src/ripple/app/consensus/RCLConsensus.cpp

+78-40
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ RCLConsensus::RCLConsensus(
5555
LedgerMaster& ledgerMaster,
5656
LocalTxs& localTxs,
5757
InboundTransactions& inboundTransactions,
58-
Consensus<Adaptor>::clock_type const& clock,
58+
Consensus<Adaptor>::clock_type& clock,
5959
ValidatorKeys const& validatorKeys,
6060
beast::Journal journal)
6161
: adaptor_(
@@ -171,6 +171,9 @@ RCLConsensus::Adaptor::share(RCLCxPeerPos const& peerPos)
171171
auto const sig = peerPos.signature();
172172
prop.set_signature(sig.data(), sig.size());
173173

174+
if (proposal.ledgerSeq().has_value())
175+
prop.set_ledgerseq(*proposal.ledgerSeq());
176+
174177
app_.overlay().relay(prop, peerPos.suppressionID(), peerPos.publicKey());
175178
}
176179

@@ -180,7 +183,7 @@ RCLConsensus::Adaptor::share(RCLCxTx const& tx)
180183
// If we didn't relay this transaction recently, relay it to all peers
181184
if (app_.getHashRouter().shouldRelay(tx.id()))
182185
{
183-
JLOG(j_.debug()) << "Relaying disputed tx " << tx.id();
186+
JLOG(j_.trace()) << "Relaying disputed tx " << tx.id();
184187
auto const slice = tx.tx_->slice();
185188
protocol::TMTransaction msg;
186189
msg.set_rawtransaction(slice.data(), slice.size());
@@ -192,13 +195,13 @@ RCLConsensus::Adaptor::share(RCLCxTx const& tx)
192195
}
193196
else
194197
{
195-
JLOG(j_.debug()) << "Not relaying disputed tx " << tx.id();
198+
JLOG(j_.trace()) << "Not relaying disputed tx " << tx.id();
196199
}
197200
}
198201
void
199202
RCLConsensus::Adaptor::propose(RCLCxPeerPos::Proposal const& proposal)
200203
{
201-
JLOG(j_.trace()) << (proposal.isBowOut() ? "We bow out: " : "We propose: ")
204+
JLOG(j_.debug()) << (proposal.isBowOut() ? "We bow out: " : "We propose: ")
202205
<< ripple::to_string(proposal.prevLedger()) << " -> "
203206
<< ripple::to_string(proposal.position());
204207

@@ -212,6 +215,7 @@ RCLConsensus::Adaptor::propose(RCLCxPeerPos::Proposal const& proposal)
212215
prop.set_closetime(proposal.closeTime().time_since_epoch().count());
213216
prop.set_nodepubkey(
214217
validatorKeys_.publicKey.data(), validatorKeys_.publicKey.size());
218+
prop.set_ledgerseq(*proposal.ledgerSeq());
215219

216220
auto sig = signDigest(
217221
validatorKeys_.publicKey,
@@ -297,7 +301,8 @@ auto
297301
RCLConsensus::Adaptor::onClose(
298302
RCLCxLedger const& ledger,
299303
NetClock::time_point const& closeTime,
300-
ConsensusMode mode) -> Result
304+
ConsensusMode mode,
305+
clock_type& clock) -> Result
301306
{
302307
const bool wrongLCL = mode == ConsensusMode::wrongLedger;
303308
const bool proposing = mode == ConsensusMode::proposing;
@@ -379,7 +384,6 @@ RCLConsensus::Adaptor::onClose(
379384

380385
// Needed because of the move below.
381386
auto const setHash = initialSet->getHash().as_uint256();
382-
383387
return Result{
384388
std::move(initialSet),
385389
RCLCxPeerPos::Proposal{
@@ -388,7 +392,9 @@ RCLConsensus::Adaptor::onClose(
388392
setHash,
389393
closeTime,
390394
app_.timeKeeper().closeTime(),
391-
validatorKeys_.nodeID}};
395+
validatorKeys_.nodeID,
396+
initialLedger->info().seq,
397+
clock}};
392398
}
393399

394400
void
@@ -400,50 +406,43 @@ RCLConsensus::Adaptor::onForceAccept(
400406
ConsensusMode const& mode,
401407
Json::Value&& consensusJson)
402408
{
403-
doAccept(
404-
result,
405-
prevLedger,
406-
closeResolution,
407-
rawCloseTimes,
408-
mode,
409-
std::move(consensusJson));
409+
auto txsBuilt = buildAndValidate(
410+
result, prevLedger, closeResolution, mode, std::move(consensusJson));
411+
prepareOpenLedger(std::move(txsBuilt), result, rawCloseTimes, mode);
410412
}
411413

412414
void
413415
RCLConsensus::Adaptor::onAccept(
414416
Result const& result,
415-
RCLCxLedger const& prevLedger,
416-
NetClock::duration const& closeResolution,
417417
ConsensusCloseTimes const& rawCloseTimes,
418418
ConsensusMode const& mode,
419-
Json::Value&& consensusJson)
419+
Json::Value&& consensusJson,
420+
std::pair<CanonicalTxSet_t, Ledger_t>&& tb)
420421
{
421422
app_.getJobQueue().addJob(
422423
jtACCEPT,
423424
"acceptLedger",
424-
[=, this, cj = std::move(consensusJson)]() mutable {
425+
[=,
426+
this,
427+
cj = std::move(consensusJson),
428+
txsBuilt = std::move(tb)]() mutable {
425429
// Note that no lock is held or acquired during this job.
426430
// This is because generic Consensus guarantees that once a ledger
427431
// is accepted, the consensus results and capture by reference state
428432
// will not change until startRound is called (which happens via
429433
// endConsensus).
430-
this->doAccept(
431-
result,
432-
prevLedger,
433-
closeResolution,
434-
rawCloseTimes,
435-
mode,
436-
std::move(cj));
434+
prepareOpenLedger(std::move(txsBuilt), result, rawCloseTimes, mode);
437435
this->app_.getOPs().endConsensus();
438436
});
439437
}
440438

441-
void
442-
RCLConsensus::Adaptor::doAccept(
439+
std::pair<
440+
RCLConsensus::Adaptor::CanonicalTxSet_t,
441+
RCLConsensus::Adaptor::Ledger_t>
442+
RCLConsensus::Adaptor::buildAndValidate(
443443
Result const& result,
444-
RCLCxLedger const& prevLedger,
445-
NetClock::duration closeResolution,
446-
ConsensusCloseTimes const& rawCloseTimes,
444+
Ledger_t const& prevLedger,
445+
NetClock::duration const& closeResolution,
447446
ConsensusMode const& mode,
448447
Json::Value&& consensusJson)
449448
{
@@ -497,12 +496,12 @@ RCLConsensus::Adaptor::doAccept(
497496
{
498497
retriableTxs.insert(
499498
std::make_shared<STTx const>(SerialIter{item.slice()}));
500-
JLOG(j_.debug()) << " Tx: " << item.key();
499+
JLOG(j_.trace()) << " Tx: " << item.key();
501500
}
502501
catch (std::exception const& ex)
503502
{
504503
failed.insert(item.key());
505-
JLOG(j_.warn())
504+
JLOG(j_.trace())
506505
<< " Tx: " << item.key() << " throws: " << ex.what();
507506
}
508507
}
@@ -579,6 +578,19 @@ RCLConsensus::Adaptor::doAccept(
579578
ledgerMaster_.consensusBuilt(
580579
built.ledger_, result.txns.id(), std::move(consensusJson));
581580

581+
return {retriableTxs, built};
582+
}
583+
584+
void
585+
RCLConsensus::Adaptor::prepareOpenLedger(
586+
std::pair<CanonicalTxSet_t, Ledger_t>&& txsBuilt,
587+
Result const& result,
588+
ConsensusCloseTimes const& rawCloseTimes,
589+
ConsensusMode const& mode)
590+
{
591+
auto& retriableTxs = txsBuilt.first;
592+
auto const& built = txsBuilt.second;
593+
582594
//-------------------------------------------------------------------------
583595
{
584596
// Apply disputed transactions that didn't get in
@@ -601,7 +613,7 @@ RCLConsensus::Adaptor::doAccept(
601613
// we voted NO
602614
try
603615
{
604-
JLOG(j_.debug())
616+
JLOG(j_.trace())
605617
<< "Test applying disputed transaction that did"
606618
<< " not get in " << dispute.tx().id();
607619

@@ -619,7 +631,7 @@ RCLConsensus::Adaptor::doAccept(
619631
}
620632
catch (std::exception const& ex)
621633
{
622-
JLOG(j_.debug()) << "Failed to apply transaction we voted "
634+
JLOG(j_.trace()) << "Failed to apply transaction we voted "
623635
"NO on. Exception: "
624636
<< ex.what();
625637
}
@@ -669,6 +681,7 @@ RCLConsensus::Adaptor::doAccept(
669681
// we entered the round with the network,
670682
// see how close our close time is to other node's
671683
// close time reports, and update our clock.
684+
bool const consensusFail = result.state == ConsensusState::MovedOn;
672685
if ((mode == ConsensusMode::proposing ||
673686
mode == ConsensusMode::observing) &&
674687
!consensusFail)
@@ -889,12 +902,30 @@ RCLConsensus::Adaptor::onModeChange(ConsensusMode before, ConsensusMode after)
889902
mode_ = after;
890903
}
891904

905+
bool
906+
RCLConsensus::Adaptor::retryAccept(
907+
Ledger_t const& newLedger,
908+
std::optional<std::chrono::time_point<std::chrono::steady_clock>>& start)
909+
const
910+
{
911+
static bool const standalone = ledgerMaster_.standalone();
912+
auto const& validLedger = ledgerMaster_.getValidatedLedger();
913+
914+
return (app_.getOPs().isFull() && !standalone &&
915+
(validLedger && (newLedger.id() != validLedger->info().hash) &&
916+
(newLedger.seq() >= validLedger->info().seq))) &&
917+
(!start ||
918+
std::chrono::steady_clock::now() - *start < std::chrono::seconds{5});
919+
}
920+
921+
//-----------------------------------------------------------------------------
922+
892923
Json::Value
893924
RCLConsensus::getJson(bool full) const
894925
{
895926
Json::Value ret;
896927
{
897-
std::lock_guard _{mutex_};
928+
std::lock_guard _{adaptor_.peekMutex()};
898929
ret = consensus_.getJson(full);
899930
}
900931
ret["validating"] = adaptor_.validating();
@@ -906,7 +937,7 @@ RCLConsensus::timerEntry(NetClock::time_point const& now)
906937
{
907938
try
908939
{
909-
std::lock_guard _{mutex_};
940+
std::lock_guard _{adaptor_.peekMutex()};
910941
consensus_.timerEntry(now);
911942
}
912943
catch (SHAMapMissingNode const& mn)
@@ -922,7 +953,7 @@ RCLConsensus::gotTxSet(NetClock::time_point const& now, RCLTxSet const& txSet)
922953
{
923954
try
924955
{
925-
std::lock_guard _{mutex_};
956+
std::lock_guard _{adaptor_.peekMutex()};
926957
consensus_.gotTxSet(now, txSet);
927958
}
928959
catch (SHAMapMissingNode const& mn)
@@ -940,7 +971,7 @@ RCLConsensus::simulate(
940971
NetClock::time_point const& now,
941972
std::optional<std::chrono::milliseconds> consensusDelay)
942973
{
943-
std::lock_guard _{mutex_};
974+
std::lock_guard _{adaptor_.peekMutex()};
944975
consensus_.simulate(now, consensusDelay);
945976
}
946977

@@ -949,7 +980,7 @@ RCLConsensus::peerProposal(
949980
NetClock::time_point const& now,
950981
RCLCxPeerPos const& newProposal)
951982
{
952-
std::lock_guard _{mutex_};
983+
std::lock_guard _{adaptor_.peekMutex()};
953984
return consensus_.peerProposal(now, newProposal);
954985
}
955986

@@ -1022,6 +1053,12 @@ RCLConsensus::Adaptor::getQuorumKeys() const
10221053
return app_.validators().getQuorumKeys();
10231054
}
10241055

1056+
std::size_t
1057+
RCLConsensus::Adaptor::quorum() const
1058+
{
1059+
return app_.validators().quorum();
1060+
}
1061+
10251062
std::size_t
10261063
RCLConsensus::Adaptor::laggards(
10271064
Ledger_t::Seq const seq,
@@ -1051,12 +1088,13 @@ RCLConsensus::startRound(
10511088
hash_set<NodeID> const& nowUntrusted,
10521089
hash_set<NodeID> const& nowTrusted)
10531090
{
1054-
std::lock_guard _{mutex_};
1091+
std::lock_guard _{adaptor_.peekMutex()};
10551092
consensus_.startRound(
10561093
now,
10571094
prevLgrId,
10581095
prevLgr,
10591096
nowUntrusted,
10601097
adaptor_.preStartRound(prevLgr, nowTrusted));
10611098
}
1099+
10621100
} // namespace ripple

0 commit comments

Comments
 (0)