
Commit

Instead of abending, wait until server stabilizes after failing online delete health check.
mtrippled committed May 2, 2022
1 parent 7c66747 commit cbdfe9e
Showing 3 changed files with 52 additions and 112 deletions.
15 changes: 4 additions & 11 deletions cfg/rippled-example.cfg
@@ -1140,17 +1140,10 @@
# The online delete process checks periodically
# that rippled is still in sync with the network,
# and that the validated ledger is less than
# 'age_threshold_seconds' old. By default, if it
# is not the online delete process aborts and
# tries again later. If 'recovery_wait_seconds'
# is set and rippled is out of sync, but likely to
# recover quickly, then online delete will wait
# this number of seconds for rippled to get back
# into sync before it aborts.
# Set this value if the node is otherwise staying
# in sync, or recovering quickly, but the online
# delete process is unable to finish.
# Default is unset.
# 'age_threshold_seconds' old. If not, then continue
# sleeping for this number of seconds and
# checking until healthy.
# Default is 5.
#
# Optional keys for Cassandra:
#
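#
# For context, a [node_db] stanza exercising the online-delete settings
# documented above might look like the sketch below. The key names are the
# ones this file documents; the backend type, path, and numeric values are
# illustrative only.

```
[node_db]
type=NuDB
path=/var/lib/rippled/db/nudb
online_delete=2000
advisory_delete=0
age_threshold_seconds=60
recovery_wait_seconds=5
```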
118 changes: 34 additions & 84 deletions src/ripple/app/misc/SHAMapStoreImp.cpp
@@ -138,7 +138,7 @@ SHAMapStoreImp::SHAMapStoreImp(
if (get_if_exists(section, "age_threshold_seconds", temp))
ageThreshold_ = std::chrono::seconds{temp};
if (get_if_exists(section, "recovery_wait_seconds", temp))
recoveryWaitTime_.emplace(std::chrono::seconds{temp});
recoveryWaitTime_ = std::chrono::seconds{temp};

get_if_exists(section, "advisory_delete", advisoryDelete_);

@@ -268,7 +268,7 @@ SHAMapStoreImp::copyNode(std::uint64_t& nodeCount, SHAMapTreeNode const& node)
true);
if (!(++nodeCount % checkHealthInterval_))
{
if (health())
if (stopping())
return false;
}

@@ -326,7 +326,7 @@ SHAMapStoreImp::run()

bool const readyToRotate =
validatedSeq >= lastRotated + deleteInterval_ &&
canDelete_ >= lastRotated - 1 && !health();
canDelete_ >= lastRotated - 1 && !stopping();

// Make sure we don't delete ledgers currently being
// imported into the ShardStore
@@ -358,15 +358,8 @@ SHAMapStoreImp::run()
<< ledgerMaster_->getValidatedLedgerAge().count() << 's';

clearPrior(lastRotated);
switch (health())
{
case Health::stopping:
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
if (stopping())
return;

JLOG(journal_.debug()) << "copying ledger " << validatedSeq;
std::uint64_t nodeCount = 0;
@@ -375,30 +368,16 @@
this,
std::ref(nodeCount),
std::placeholders::_1));
switch (health())
{
case Health::stopping:
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
if (stopping())
return;
// Only log if we completed without a "health" abort
JLOG(journal_.debug()) << "copied ledger " << validatedSeq
<< " nodecount " << nodeCount;

JLOG(journal_.debug()) << "freshening caches";
freshenCaches();
switch (health())
{
case Health::stopping:
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
if (stopping())
return;
// Only log if we completed without a "health" abort
JLOG(journal_.debug()) << validatedSeq << " freshened caches";

@@ -408,15 +387,8 @@
<< validatedSeq << " new backend " << newBackend->getName();

clearCaches(validatedSeq);
switch (health())
{
case Health::stopping:
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
if (stopping())
return;

lastRotated = validatedSeq;

@@ -580,7 +552,7 @@ SHAMapStoreImp::clearSql(
min = *m;
}

if (min > lastRotated || health() != Health::ok)
if (min > lastRotated || stopping())
return;
if (min == lastRotated)
{
@@ -601,11 +573,11 @@
JLOG(journal_.trace())
<< "End: Delete up to " << deleteBatch_ << " rows with LedgerSeq < "
<< min << " from: " << TableName;
if (health())
if (stopping())
return;
if (min < lastRotated)
std::this_thread::sleep_for(backOff_);
if (health())
if (stopping())
return;
}
JLOG(journal_.debug()) << "finished deleting from: " << TableName;
@@ -645,7 +617,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
ledgerMaster_->clearPriorLedgers(lastRotated);
JLOG(journal_.trace()) << "End: Clear internal ledgers up to "
<< lastRotated;
if (health())
if (stopping())
return;

RelationalDBInterfaceSqlite* iface =
@@ -661,7 +633,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
[&iface](LedgerIndex min) -> void {
iface->deleteBeforeLedgerSeq(min);
});
if (health())
if (stopping())
return;

if (!app_.config().useTxTables())
@@ -676,7 +648,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
[&iface](LedgerIndex min) -> void {
iface->deleteTransactionsBeforeLedgerSeq(min);
});
if (health())
if (stopping())
return;

clearSql(
@@ -688,52 +660,30 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
[&iface](LedgerIndex min) -> void {
iface->deleteAccountTransactionsBeforeLedgerSeq(min);
});
if (health())
if (stopping())
return;
}

SHAMapStoreImp::Health
SHAMapStoreImp::health()
bool
SHAMapStoreImp::stopping()
{
auto age = ledgerMaster_->getValidatedLedgerAge();
OperatingMode mode = netOPs_->getOperatingMode();
std::unique_lock lock(mutex_);
while (!stop_ && (mode != OperatingMode::FULL || age > ageThreshold_))
{
std::lock_guard lock(mutex_);
if (stop_)
return Health::stopping;
}
if (!netOPs_)
return Health::ok;
assert(deleteInterval_);

if (healthy_)
{
auto age = ledgerMaster_->getValidatedLedgerAge();
OperatingMode mode = netOPs_->getOperatingMode();
if (recoveryWaitTime_ && mode == OperatingMode::SYNCING &&
age < ageThreshold_)
{
JLOG(journal_.warn())
<< "Waiting " << recoveryWaitTime_->count()
<< "s for node to get back into sync with network. state: "
<< app_.getOPs().strOperatingMode(mode, false) << ". age "
<< age.count() << 's';
std::this_thread::sleep_for(*recoveryWaitTime_);

age = ledgerMaster_->getValidatedLedgerAge();
mode = netOPs_->getOperatingMode();
}
if (mode != OperatingMode::FULL || age > ageThreshold_)
{
JLOG(journal_.warn()) << "Not deleting. state: "
<< app_.getOPs().strOperatingMode(mode, false)
<< ". age " << age.count() << 's';
healthy_ = false;
}
lock.unlock();
JLOG(journal_.warn()) << "Waiting " << recoveryWaitTime_.count()
<< "s for node to stabilize. state: "
<< app_.getOPs().strOperatingMode(mode, false)
<< ". age " << age.count() << 's';
std::this_thread::sleep_for(recoveryWaitTime_);
age = ledgerMaster_->getValidatedLedgerAge();
mode = netOPs_->getOperatingMode();
lock.lock();
}

if (healthy_)
return Health::ok;
else
return Health::unhealthy;
return stop_;
}

void
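To make the new control flow easier to see outside the diff, here is a minimal, self-contained sketch of the wait-until-stable pattern that stopping() now follows: instead of classifying the server as ok/unhealthy/stopping and aborting the current step on "unhealthy", the check sleeps for the recovery wait and re-polls until the node is stable or a stop is requested. Every name below (ServerMonitor, isStable, waitUntilStableOrStop, requestStop) is a hypothetical stand-in, not a rippled API.

```cpp
#include <atomic>
#include <chrono>
#include <iostream>
#include <mutex>
#include <thread>

// Hypothetical stand-in for the state that the online-delete thread polls.
class ServerMonitor
{
public:
    // Analogue of "mode == OperatingMode::FULL && age <= ageThreshold_".
    bool
    isStable() const
    {
        return stable_.load();
    }

    void
    setStable(bool v)
    {
        stable_.store(v);
    }

    // Blocks until the server is stable or a stop has been requested.
    // Returns true if the caller should stop -- the analogue of stopping().
    bool
    waitUntilStableOrStop(std::chrono::seconds recoveryWait)
    {
        std::unique_lock lock(mutex_);
        while (!stop_ && !isStable())
        {
            lock.unlock();
            std::cerr << "waiting " << recoveryWait.count()
                      << "s for node to stabilize\n";
            std::this_thread::sleep_for(recoveryWait);
            lock.lock();
        }
        return stop_;
    }

    void
    requestStop()
    {
        std::lock_guard lock(mutex_);
        stop_ = true;
    }

private:
    std::atomic<bool> stable_{false};
    bool stop_ = false;
    std::mutex mutex_;
};

int
main()
{
    ServerMonitor monitor;

    // Simulate the node regaining sync a couple of seconds from now.
    std::thread recoverer([&monitor] {
        std::this_thread::sleep_for(std::chrono::seconds(2));
        monitor.setStable(true);
    });

    // The online-delete side: no longer aborts, just waits it out.
    bool const shouldStop =
        monitor.waitUntilStableOrStop(std::chrono::seconds{1});
    std::cout << (shouldStop ? "stopping" : "stable, continue deleting")
              << '\n';

    recoverer.join();
    return 0;
}
```

The property mirrored here is the one the commit relies on: the only way the wait ends early is a stop request, which is exactly the boolean the caller needs in order to return.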
31 changes: 14 additions & 17 deletions src/ripple/app/misc/SHAMapStoreImp.h
@@ -40,8 +40,6 @@ class NetworkOPs;
class SHAMapStoreImp : public SHAMapStore
{
private:
enum Health : std::uint8_t { ok = 0, stopping, unhealthy };

class SavedStateDB
{
public:
@@ -106,12 +104,12 @@ class SHAMapStoreImp : public SHAMapStore
std::uint32_t deleteBatch_ = 100;
std::chrono::milliseconds backOff_{100};
std::chrono::seconds ageThreshold_{60};
/// If set, and the node is out of sync during an
/// If the node is out of sync during an
/// online_delete health check, sleep the thread
/// for this time and check again so the node can
/// recover.
/// for this time, and continue checking until
/// recovery.
/// See also: "recovery_wait_seconds" in rippled-example.cfg
std::optional<std::chrono::seconds> recoveryWaitTime_;
std::chrono::seconds recoveryWaitTime_{5};

// these do not exist upon SHAMapStore creation, but do exist
// as of run() or before
@@ -201,7 +199,7 @@ class SHAMapStoreImp : public SHAMapStore
{
dbRotating_->fetchNodeObject(
key, 0, NodeStore::FetchType::synchronous, true);
if (!(++check % checkHealthInterval_) && health())
if (!(++check % checkHealthInterval_) && stopping())
return true;
}

@@ -225,16 +223,15 @@ class SHAMapStoreImp : public SHAMapStore
void
clearPrior(LedgerIndex lastRotated);

// If rippled is not healthy, defer rotate-delete.
// If already unhealthy, do not change state on further check.
// Assume that, once unhealthy, a necessary step has been
// aborted, so the online-delete process needs to restart
// at next ledger.
// If recoveryWaitTime_ is set, this may sleep to give rippled
// time to recover, so never call it from any thread other than
// the main "run()".
Health
health();
/**
* This is a health check for online deletion that waits until rippled is
* stable until returning. If the server is stopping, then it returns
* "true" to inform the caller to allow the server to stop.
*
* @return Whether the server is stopping.
*/
bool
stopping();

public:
void
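The comment above describes the contract; the call pattern used throughout the .cpp changes is a periodic check inside long-running work that bails out only when the server is shutting down. A hedged sketch of that pattern follows; the function names and counts are illustrative, not rippled's.

```cpp
#include <cstdint>
#include <functional>
#include <iostream>

// Illustrative only: gate a long-running batch job on a periodic
// "stopping()"-style check, mirroring the checkHealthInterval_ pattern
// in copyNode(). Returns true if all items were processed, false if
// interrupted by a stop request.
bool
processAll(
    std::uint64_t totalItems,
    std::uint64_t checkInterval,
    std::function<bool()> const& stopping,
    std::function<void(std::uint64_t)> const& processOne)
{
    for (std::uint64_t i = 0; i < totalItems; ++i)
    {
        processOne(i);
        // Like `if (!(++check % checkHealthInterval_) && stopping())`:
        // only pay for the check once every checkInterval items.
        if (!((i + 1) % checkInterval) && stopping())
            return false;  // shutting down; abandon the remaining work
    }
    return true;
}

int
main()
{
    std::uint64_t processed = 0;
    bool const completed = processAll(
        1000,
        100,
        [] { return false; },                           // never stopping
        [&processed](std::uint64_t) { ++processed; });  // do the work
    std::cout << "completed=" << completed << " processed=" << processed
              << '\n';
    return 0;
}
```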
