Skip to content

Commit 7aa9482

Browse files
committed
Process shutdown, chronos version
Rebase of #7404 following the adoption of #7418. When using chronos to terminate the process, we run into various threading and cross-platform issues captured in status-im/nim-chronos#581, specially in `nimbus` which has a more complex threading picture.
1 parent 2a57434 commit 7aa9482

File tree

2 files changed

+103
-29
lines changed

2 files changed

+103
-29
lines changed

beacon_chain/nimbus_beacon_node.nim

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2506,8 +2506,7 @@ proc run(node: BeaconNode) {.raises: [CatchableError].} =
25062506
asyncSpawn runQueueProcessingLoop(node.blockProcessor)
25072507
asyncSpawn runKeystoreCachePruningLoop(node.keystoreCache)
25082508

2509-
while not ProcessState.stopIt(notice("Shutting down", reason = it)):
2510-
poll()
2509+
waitFor ProcessState.waitStopSignals()
25112510

25122511
# time to say goodbye
25132512
node.stop()

beacon_chain/process_state.nim

Lines changed: 102 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -22,20 +22,14 @@
2222
## * The main thread wakes up any threads it started and notifies them of the
2323
## imminent shutdown then waits for them to terminate
2424
##
25-
## `chronos` has a `waitSignal` function that could be use to wake it when a
26-
## signal arrives - at the time of writing, it only works in a single-threaded
27-
## application when chronos is the only signal handler and requires using
28-
## its own raising mechanism instead of the standard `raise`/`pthread_kill`
29-
## functions which makes it difficult to use:
30-
## https://github.com/status-im/nim-chronos/issues/581
31-
##
32-
## As such, polling `ProcessState.stopping` ends up being the more reliable
33-
## cross-platform solution in spite of its downsides.
25+
## In this way, the main thread is notified that _some_ thread or the user wants
26+
## the process to shut down. The main thread stops whatever it's doing and
27+
## notifies all threads it started that shutdown is imminent and then proceeds
28+
## with the shutdown.
3429

3530
{.push raises: [].}
3631

37-
import std/atomics, results
38-
32+
import std/atomics, results, chronos, chronos/threadsync, chronicles
3933
export results
4034

4135
type ProcessState* {.pure.} = enum
@@ -48,29 +42,69 @@ var shutdownSource: Atomic[pointer]
4842

4943
import system/ansi_c
5044

45+
when defined(posix):
46+
import posix
47+
48+
when defined(linux):
49+
var signalTarget = pthread_self()
50+
51+
proc ignoreStopSignalsInThread*(_: type ProcessState): bool =
52+
# Block signals in the current thread and all threads created from it
53+
# (hopefully)
54+
var signalMask, oldSignalMask: Sigset
55+
56+
sigemptyset(signalMask) == 0 and sigaddset(signalMask, posix.SIGINT) == 0 and
57+
sigaddset(signalMask, posix.SIGTERM) == 0 and
58+
pthread_sigmask(SIG_BLOCK, signalMask, oldSignalMask) == 0
59+
60+
proc raiseStopSignal() =
61+
# If the default signal handler is blocked and the app is polling, we still
62+
# want the state updated - blocking signals on all threads is necessary for
63+
# waitSignal to work, but the application might want to check _before_ the
64+
# signal handler is invoked.
65+
processState.store(ProcessState.Stopping)
66+
67+
when defined(linux):
68+
# On linux, we want to direct the signal to the thread that is currently
69+
# listening on `waitSignal` - when there's no such thread, it doesn't
70+
# really matter which thread it goes to
71+
discard pthread_kill(signalTarget, posix.SIGTERM)
72+
else:
73+
# kqueue listens only to process-directed signals - for waitSignal to
74+
# work as expected, we use kill
75+
discard kill(getpid(), posix.SIGTERM)
76+
77+
else:
78+
proc ignoreStopSignalsInThread*(_: type ProcessState): bool =
79+
true
80+
81+
import chronos/osutils
82+
83+
proc raiseStopSignal() =
84+
discard c_raise(ansi_c.SIGINT)
85+
# Chronos installs its own handlers that are incompatible with `raise` -
86+
# when waitSignal is running we must also notify chronos
87+
discard osutils.raiseSignal(chronos.SIGINT)
88+
5189
proc scheduleStop*(_: type ProcessState, source: cstring) =
5290
## Schedule that the process should stop in a thread-safe way. This function
5391
## can be used from non-nim threads as well.
54-
##
55-
# TODO in theory, we could use `raise`/`kill`/`etc` depending on the platform
56-
# to set `processState` from within the signal handler - if we were
57-
# a kqueue/epoll-based signal handler, this would be the way to go so
58-
# as to provide a wakeup notification - there are platform-based
59-
# differences to take into account however, ie on kqueue, only process-
60-
# directed signals are woken up whereas on linux, the signal has to
61-
# reach the correct thread that is doing the waiting which requires
62-
# special care.
6392
var nilptr: pointer
6493
discard shutdownSource.compareExchange(nilptr, source, moRelaxed)
65-
processState.store(ProcessState.Stopping)
94+
raiseStopSignal()
6695

6796
proc notifyRunning*(_: type ProcessState) =
6897
processState.store(ProcessState.Running, moRelaxed)
6998

7099
proc setupStopHandlers*(_: type ProcessState) =
71100
## Install signal handlers for SIGINT/SIGTERM such that the application
72101
## updates `processState` on CTRL-C and similar, allowing it to gracefully
73-
## shut down by monitoring `ProcessState.stopping` at regular intervals.
102+
## shut down by monitoring `ProcessState.running` at regular intervals.
103+
##
104+
## `async` applications should prefer to use
105+
## `await ProcessState.waitStopsignals()` since the CTRL-C handling provided
106+
## by `signal` does not wake the async polling loop and can therefore get
107+
## stuck if no events are happening.
74108
##
75109
## This function should be called early on from the main thread to avoid the
76110
## default Nim signal handlers from being used as these will crash or close
@@ -104,6 +138,46 @@ proc setupStopHandlers*(_: type ProcessState) =
104138
when defined(posix):
105139
c_signal(ansi_c.SIGTERM, controlCHandler)
106140

141+
proc waitStopSignals*(_: type ProcessState) {.async: (raises: [CancelledError]).} =
142+
## Monitor stop signals via chronos' event loop, masking other handlers.
143+
##
144+
## This approach ensures that the event loop wakes up on signal delivery
145+
## unlike `setupStopHandlers` which merely sets a flag that must be polled.
146+
##
147+
## Only one thread should ever listen for stop signals this way.
148+
149+
# Ensure other threads don't cause a crash, in case the application did not
150+
# already call it
151+
ProcessState.setupStopHandlers()
152+
153+
let
154+
sigint = waitSignal(chronos.SIGINT)
155+
sigterm = waitSignal(chronos.SIGTERM)
156+
157+
debug "Waiting for signal", chroniclesThreadIds = true
158+
when defined(linux):
159+
signalTarget = pthread_self()
160+
161+
try:
162+
discard await race(sigint, sigterm)
163+
164+
var source = cast[cstring](shutdownSource.load())
165+
if source == nil:
166+
source = "Unknown"
167+
168+
notice "Shutting down", chroniclesThreadIds = true, source
169+
170+
processState.store(ProcessState.Stopping, moRelaxed)
171+
finally:
172+
# waitSignal sometimes overwrites signal handlers:
173+
# https://github.com/status-im/nim-chronos/issues/581
174+
ProcessState.setupStopHandlers()
175+
176+
# Might be finished already, which is fine..
177+
await noCancel sigint.cancelAndWait()
178+
await noCancel sigterm.cancelAndWait()
179+
180+
107181
proc running*(_: type ProcessState): bool =
108182
processState.load(moRelaxed) == ProcessState.Running
109183

@@ -126,7 +200,7 @@ template stopIt*(_: type ProcessState, body: untyped): bool =
126200
false
127201

128202
when isMainModule: # Test case
129-
import os, chronos, chronos/threadsync
203+
import os
130204

131205
proc threadWork() {.async.} =
132206
var todo = 2
@@ -167,14 +241,15 @@ when isMainModule: # Test case
167241
# set the same flag as `waitStopSignals` does.
168242
ProcessState.setupStopHandlers()
169243

244+
# Wait for a stop signal - this can be either the user pressing ctrl-c or
245+
# an out-of-band notification via kill/windows service command / some rest
246+
# API etc
170247
echo "main thread waiting"
171-
while ProcessState.stopping.isNone:
172-
os.sleep(100)
173-
248+
waitFor ProcessState.waitStopSignals()
174249
echo "main thread firing stopper"
175250

176251
# Notify the thread should stop itself as well using a ThreadSignalPtr
177-
# rather than an OS signal - this is more portable
252+
# rather than an OS signal
178253
waitFor stopper.fire()
179254

180255
workerThread.joinThread()

0 commit comments

Comments
 (0)