diff --git a/cfg/rippled-example.cfg b/cfg/rippled-example.cfg index f87ad44c7ad..83a9353aa4d 100644 --- a/cfg/rippled-example.cfg +++ b/cfg/rippled-example.cfg @@ -1140,17 +1140,10 @@ # The online delete process checks periodically # that rippled is still in sync with the network, # and that the validated ledger is less than -# 'age_threshold_seconds' old. By default, if it -# is not the online delete process aborts and -# tries again later. If 'recovery_wait_seconds' -# is set and rippled is out of sync, but likely to -# recover quickly, then online delete will wait -# this number of seconds for rippled to get back -# into sync before it aborts. -# Set this value if the node is otherwise staying -# in sync, or recovering quickly, but the online -# delete process is unable to finish. -# Default is unset. +# 'age_threshold_seconds' old. If not, then continue +# sleeping for this number of seconds and +# checking until healthy. +# Default is 5. # # Optional keys for Cassandra: # diff --git a/src/ripple/app/misc/SHAMapStoreImp.cpp b/src/ripple/app/misc/SHAMapStoreImp.cpp index 56a817934c7..62aa28c2261 100644 --- a/src/ripple/app/misc/SHAMapStoreImp.cpp +++ b/src/ripple/app/misc/SHAMapStoreImp.cpp @@ -138,7 +138,7 @@ SHAMapStoreImp::SHAMapStoreImp( if (get_if_exists(section, "age_threshold_seconds", temp)) ageThreshold_ = std::chrono::seconds{temp}; if (get_if_exists(section, "recovery_wait_seconds", temp)) - recoveryWaitTime_.emplace(std::chrono::seconds{temp}); + recoveryWaitTime_ = std::chrono::seconds{temp}; get_if_exists(section, "advisory_delete", advisoryDelete_); @@ -268,7 +268,7 @@ SHAMapStoreImp::copyNode(std::uint64_t& nodeCount, SHAMapTreeNode const& node) true); if (!(++nodeCount % checkHealthInterval_)) { - if (health()) + if (stopping()) return false; } @@ -326,7 +326,7 @@ SHAMapStoreImp::run() bool const readyToRotate = validatedSeq >= lastRotated + deleteInterval_ && - canDelete_ >= lastRotated - 1 && !health(); + canDelete_ >= lastRotated - 1 && !stopping(); // Make sure we don't delete ledgers currently being // imported into the ShardStore @@ -358,15 +358,8 @@ SHAMapStoreImp::run() << ledgerMaster_->getValidatedLedgerAge().count() << 's'; clearPrior(lastRotated); - switch (health()) - { - case Health::stopping: - return; - case Health::unhealthy: - continue; - case Health::ok: - default:; - } + if (stopping()) + return; JLOG(journal_.debug()) << "copying ledger " << validatedSeq; std::uint64_t nodeCount = 0; @@ -375,30 +368,16 @@ SHAMapStoreImp::run() this, std::ref(nodeCount), std::placeholders::_1)); - switch (health()) - { - case Health::stopping: - return; - case Health::unhealthy: - continue; - case Health::ok: - default:; - } + if (stopping()) + return; // Only log if we completed without a "health" abort JLOG(journal_.debug()) << "copied ledger " << validatedSeq << " nodecount " << nodeCount; JLOG(journal_.debug()) << "freshening caches"; freshenCaches(); - switch (health()) - { - case Health::stopping: - return; - case Health::unhealthy: - continue; - case Health::ok: - default:; - } + if (stopping()) + return; // Only log if we completed without a "health" abort JLOG(journal_.debug()) << validatedSeq << " freshened caches"; @@ -408,15 +387,8 @@ SHAMapStoreImp::run() << validatedSeq << " new backend " << newBackend->getName(); clearCaches(validatedSeq); - switch (health()) - { - case Health::stopping: - return; - case Health::unhealthy: - continue; - case Health::ok: - default:; - } + if (stopping()) + return; lastRotated = validatedSeq; @@ -580,7 +552,7 @@ SHAMapStoreImp::clearSql( min = *m; } - if (min > lastRotated || health() != Health::ok) + if (min > lastRotated || stopping()) return; if (min == lastRotated) { @@ -601,11 +573,11 @@ SHAMapStoreImp::clearSql( JLOG(journal_.trace()) << "End: Delete up to " << deleteBatch_ << " rows with LedgerSeq < " << min << " from: " << TableName; - if (health()) + if (stopping()) return; if (min < lastRotated) std::this_thread::sleep_for(backOff_); - if (health()) + if (stopping()) return; } JLOG(journal_.debug()) << "finished deleting from: " << TableName; @@ -645,7 +617,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated) ledgerMaster_->clearPriorLedgers(lastRotated); JLOG(journal_.trace()) << "End: Clear internal ledgers up to " << lastRotated; - if (health()) + if (stopping()) return; RelationalDBInterfaceSqlite* iface = @@ -661,7 +633,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated) [&iface](LedgerIndex min) -> void { iface->deleteBeforeLedgerSeq(min); }); - if (health()) + if (stopping()) return; if (!app_.config().useTxTables()) @@ -676,7 +648,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated) [&iface](LedgerIndex min) -> void { iface->deleteTransactionsBeforeLedgerSeq(min); }); - if (health()) + if (stopping()) return; clearSql( @@ -688,52 +660,33 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated) [&iface](LedgerIndex min) -> void { iface->deleteAccountTransactionsBeforeLedgerSeq(min); }); - if (health()) + if (stopping()) return; } -SHAMapStoreImp::Health -SHAMapStoreImp::health() +bool +SHAMapStoreImp::stopping() { + auto age = ledgerMaster_->getValidatedLedgerAge(); + OperatingMode mode = netOPs_->getOperatingMode(); + std::unique_lock lock(mutex_); + while (!stop_ && + (mode != OperatingMode::FULL || + age > ageThreshold_)) { - std::lock_guard lock(mutex_); - if (stop_) - return Health::stopping; - } - if (!netOPs_) - return Health::ok; - assert(deleteInterval_); - - if (healthy_) - { - auto age = ledgerMaster_->getValidatedLedgerAge(); - OperatingMode mode = netOPs_->getOperatingMode(); - if (recoveryWaitTime_ && mode == OperatingMode::SYNCING && - age < ageThreshold_) - { - JLOG(journal_.warn()) - << "Waiting " << recoveryWaitTime_->count() - << "s for node to get back into sync with network. state: " - << app_.getOPs().strOperatingMode(mode, false) << ". age " - << age.count() << 's'; - std::this_thread::sleep_for(*recoveryWaitTime_); - - age = ledgerMaster_->getValidatedLedgerAge(); - mode = netOPs_->getOperatingMode(); - } - if (mode != OperatingMode::FULL || age > ageThreshold_) - { - JLOG(journal_.warn()) << "Not deleting. state: " - << app_.getOPs().strOperatingMode(mode, false) - << ". age " << age.count() << 's'; - healthy_ = false; - } + lock.unlock(); + JLOG(journal_.warn()) + << "Waiting " << recoveryWaitTime_.count() + << "s for node to stabilize. state: " + << app_.getOPs().strOperatingMode(mode, false) << ". age " + << age.count() << 's'; + std::this_thread::sleep_for(recoveryWaitTime_); + age = ledgerMaster_->getValidatedLedgerAge(); + mode = netOPs_->getOperatingMode(); + lock.lock(); } - if (healthy_) - return Health::ok; - else - return Health::unhealthy; + return stop_; } void diff --git a/src/ripple/app/misc/SHAMapStoreImp.h b/src/ripple/app/misc/SHAMapStoreImp.h index e3528faaada..c2b4476589e 100644 --- a/src/ripple/app/misc/SHAMapStoreImp.h +++ b/src/ripple/app/misc/SHAMapStoreImp.h @@ -40,8 +40,6 @@ class NetworkOPs; class SHAMapStoreImp : public SHAMapStore { private: - enum Health : std::uint8_t { ok = 0, stopping, unhealthy }; - class SavedStateDB { public: @@ -106,12 +104,12 @@ class SHAMapStoreImp : public SHAMapStore std::uint32_t deleteBatch_ = 100; std::chrono::milliseconds backOff_{100}; std::chrono::seconds ageThreshold_{60}; - /// If set, and the node is out of sync during an + /// If the node is out of sync during an /// online_delete health check, sleep the thread - /// for this time and check again so the node can - /// recover. + /// for this time, and continue checking until + /// recovery. /// See also: "recovery_wait_seconds" in rippled-example.cfg - std::optional recoveryWaitTime_; + std::chrono::seconds recoveryWaitTime_{5}; // these do not exist upon SHAMapStore creation, but do exist // as of run() or before @@ -201,7 +199,7 @@ class SHAMapStoreImp : public SHAMapStore { dbRotating_->fetchNodeObject( key, 0, NodeStore::FetchType::synchronous, true); - if (!(++check % checkHealthInterval_) && health()) + if (!(++check % checkHealthInterval_) && stopping()) return true; } @@ -225,16 +223,15 @@ class SHAMapStoreImp : public SHAMapStore void clearPrior(LedgerIndex lastRotated); - // If rippled is not healthy, defer rotate-delete. - // If already unhealthy, do not change state on further check. - // Assume that, once unhealthy, a necessary step has been - // aborted, so the online-delete process needs to restart - // at next ledger. - // If recoveryWaitTime_ is set, this may sleep to give rippled - // time to recover, so never call it from any thread other than - // the main "run()". - Health - health(); + /** + * This is a health check for online deletion that waits until rippled is + * stable until returning. If the server is stopping, then it returns + * "true" to inform the caller to allow the server to stop. + * + * @return Whether the server is stopping. + */ + bool + stopping(); public: void