Skip to content

Commit 0c4e41f

Browse files
committed
Rework process shutdown, simplified (fixes #3235)
#7404 attempted to use `waitSignal` to handle shutdown notifications but this turned out to be unreliable, in particular due to multi-threading and cross-platform differences. This PR fixes the crash in signal handlers by removing logging but otherwise retains the polling nature of the shutdown initiation - it also allows nimbus eth1/2 to share the same cross-thread mechanism for initiating a shutdown.
1 parent cf32a6c commit 0c4e41f

File tree

6 files changed

+232
-63
lines changed

6 files changed

+232
-63
lines changed

Makefile

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,8 @@ XML_TEST_BINARIES := \
299299
# test suite
300300
TEST_BINARIES := \
301301
block_sim \
302-
test_libnimbus_lc
302+
test_libnimbus_lc \
303+
process_state
303304
.PHONY: $(TEST_BINARIES) $(XML_TEST_BINARIES) force_build_alone_all_tests
304305

305306
# Preset-dependent tests
@@ -384,6 +385,14 @@ block_sim: | build deps
384385
$(NIM_PARAMS) && \
385386
echo -e $(BUILD_END_MSG) "build/$@"
386387

388+
process_state: | build deps
389+
+ echo -e $(BUILD_MSG) "build/$@" && \
390+
MAKE="$(MAKE)" V="$(V)" $(ENV_SCRIPT) scripts/compile_nim_program.sh \
391+
$@ \
392+
"beacon_chain/$@.nim" \
393+
$(NIM_PARAMS) && \
394+
echo -e $(BUILD_END_MSG) "build/$@"
395+
387396
DISABLE_TEST_FIXTURES_SCRIPT := 0
388397
# This parameter passing scheme is ugly, but short.
389398
test: | $(XML_TEST_BINARIES) $(TEST_BINARIES)

beacon_chain/beacon_node_status.nim

Lines changed: 0 additions & 18 deletions
This file was deleted.

beacon_chain/nimbus_beacon_node.nim

Lines changed: 26 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ import
2323
engine_authentication, weak_subjectivity, peerdas_helpers],
2424
./sync/[sync_protocol, light_client_protocol, sync_overseer],
2525
./validators/[keystore_management, beacon_validators],
26-
"."/[
26+
./[
2727
beacon_node, beacon_node_light_client, deposits,
28-
nimbus_binary_common, statusbar, trusted_node_sync, wallets]
28+
nimbus_binary_common, process_state, statusbar, trusted_node_sync, wallets]
2929

3030
when defined(posix):
3131
import system/ansi_c
@@ -395,7 +395,7 @@ proc initFullNode(
395395

396396
proc eventWaiter(): Future[void] {.async: (raises: [CancelledError]).} =
397397
await node.shutdownEvent.wait()
398-
bnStatus = BeaconNodeStatus.Stopping
398+
ProcessState.scheduleStop("shutdownEvent")
399399

400400
asyncSpawn eventWaiter()
401401

@@ -741,14 +741,14 @@ proc init*(T: type BeaconNode,
741741
try:
742742
if config.numThreads < 0:
743743
fatal "The number of threads --num-threads cannot be negative."
744-
quit 1
744+
quit QuitFailure
745745
elif config.numThreads == 0:
746746
Taskpool.new(numThreads = min(countProcessors(), 16))
747747
else:
748748
Taskpool.new(numThreads = config.numThreads)
749749
except CatchableError as e:
750750
fatal "Cannot start taskpool", err = e.msg
751-
quit 1
751+
quit QuitFailure
752752

753753
info "Threadpool started", numThreads = taskpool.numThreads
754754

@@ -1931,7 +1931,7 @@ proc onSecond(node: BeaconNode, time: Moment) =
19311931
if node.config.stopAtSyncedEpoch != 0 and
19321932
node.dag.head.slot.epoch >= node.config.stopAtSyncedEpoch:
19331933
notice "Shutting down after having reached the target synced epoch"
1934-
bnStatus = BeaconNodeStatus.Stopping
1934+
ProcessState.scheduleStop("stopAtSyncedEpoch")
19351935

19361936
proc runOnSecondLoop(node: BeaconNode) {.async.} =
19371937
const
@@ -2161,8 +2161,6 @@ proc installMessageValidators(node: BeaconNode) =
21612161
node.installLightClientMessageValidators()
21622162

21632163
proc stop(node: BeaconNode) =
2164-
bnStatus = BeaconNodeStatus.Stopping
2165-
notice "Graceful shutdown"
21662164
if not node.config.inProcessValidators:
21672165
try:
21682166
node.vcProcess.close()
@@ -2181,7 +2179,7 @@ proc stop(node: BeaconNode) =
21812179
notice "Databases closed"
21822180

21832181
proc run(node: BeaconNode) {.raises: [CatchableError].} =
2184-
bnStatus = BeaconNodeStatus.Running
2182+
ProcessState.notifyRunning()
21852183

21862184
if not isNil(node.restServer):
21872185
node.restServer.installRestHandlers(node)
@@ -2219,9 +2217,13 @@ proc run(node: BeaconNode) {.raises: [CatchableError].} =
22192217
asyncSpawn runQueueProcessingLoop(node.blockProcessor)
22202218
asyncSpawn runKeystoreCachePruningLoop(node.keystoreCache)
22212219

2222-
# main event loop
2223-
while bnStatus == BeaconNodeStatus.Running:
2224-
poll() # if poll fails, the network is broken
2220+
block mainLoop:
2221+
while true:
2222+
ProcessState.stopping.isErrOr:
2223+
notice "Shutting down", reason = value
2224+
break mainLoop
2225+
2226+
poll()
22252227

22262228
# time to say goodbye
22272229
node.stop()
@@ -2439,8 +2441,6 @@ proc doRunBeaconNode(config: var BeaconNodeConf, rng: ref HmacDrbgContext) {.rai
24392441
ignoreDeprecatedOption web3ForcePolling
24402442
ignoreDeprecatedOption finalizedDepositTreeSnapshot
24412443

2442-
createPidFile(config.dataDir.string / "beacon_node.pid")
2443-
24442444
config.createDumpDirs()
24452445

24462446
# There are no managed event loops in here, to do a graceful shutdown, but
@@ -2453,27 +2453,6 @@ proc doRunBeaconNode(config: var BeaconNodeConf, rng: ref HmacDrbgContext) {.rai
24532453
for node in metadata.bootstrapNodes:
24542454
config.bootstrapNodes.add node
24552455

2456-
## Ctrl+C handling
2457-
proc controlCHandler() {.noconv.} =
2458-
when defined(windows):
2459-
# workaround for https://github.com/nim-lang/Nim/issues/4057
2460-
try:
2461-
setupForeignThreadGc()
2462-
except Exception as exc: raiseAssert exc.msg # shouldn't happen
2463-
notice "Shutting down after having received SIGINT"
2464-
bnStatus = BeaconNodeStatus.Stopping
2465-
try:
2466-
setControlCHook(controlCHandler)
2467-
except Exception as exc: # TODO Exception
2468-
warn "Cannot set ctrl-c handler", msg = exc.msg
2469-
2470-
# equivalent SIGTERM handler
2471-
when defined(posix):
2472-
proc SIGTERMHandler(signal: cint) {.noconv.} =
2473-
notice "Shutting down after having received SIGTERM"
2474-
bnStatus = BeaconNodeStatus.Stopping
2475-
c_signal(ansi_c.SIGTERM, SIGTERMHandler)
2476-
24772456
block:
24782457
let res =
24792458
if config.trustedSetupFile.isNone:
@@ -2483,6 +2462,10 @@ proc doRunBeaconNode(config: var BeaconNodeConf, rng: ref HmacDrbgContext) {.rai
24832462
if res.isErr():
24842463
raiseAssert res.error()
24852464

2465+
ProcessState.stopping.isErrOr:
2466+
notice "Shutting down", reason = value
2467+
return
2468+
24862469
let node = waitFor BeaconNode.init(rng, config, metadata)
24872470

24882471
let metricsServer = (waitFor config.initMetricsServer()).valueOr:
@@ -2494,7 +2477,8 @@ proc doRunBeaconNode(config: var BeaconNodeConf, rng: ref HmacDrbgContext) {.rai
24942477

24952478
node.metricsServer = metricsServer
24962479

2497-
if bnStatus == BeaconNodeStatus.Stopping:
2480+
ProcessState.stopping.isErrOr:
2481+
notice "Shutting down", reason = value
24982482
return
24992483

25002484
when not defined(windows):
@@ -2601,7 +2585,11 @@ proc handleStartUpCmd(config: var BeaconNodeConf) {.raises: [CatchableError].} =
26012585
let rng = HmacDrbgContext.new()
26022586

26032587
case config.cmd
2604-
of BNStartUpCmd.noCommand: doRunBeaconNode(config, rng)
2588+
of BNStartUpCmd.noCommand:
2589+
createPidFile(config.dataDir.string / "beacon_node.pid")
2590+
ProcessState.setupStopHandlers()
2591+
2592+
doRunBeaconNode(config, rng)
26052593
of BNStartUpCmd.deposits: doDeposits(config, rng[])
26062594
of BNStartUpCmd.wallets: doWallets(config, rng[])
26072595
of BNStartUpCmd.record: doRecord(config, rng[])
@@ -2667,7 +2655,7 @@ proc main() {.noinline, raises: [CatchableError].} =
26672655
when defined(windows):
26682656
if config.runAsService:
26692657
proc exitService() =
2670-
bnStatus = BeaconNodeStatus.Stopping
2658+
ProcessState.scheduleStop("exitService")
26712659
establishWindowsService(clientId, copyrights, nimBanner, SPEC_VERSION,
26722660
"nimbus_beacon_node", BeaconNodeConf,
26732661
handleStartUpCmd, exitService)

beacon_chain/nimbus_binary_common.nim

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import
2121
# Local modules
2222
./spec/[helpers, keystore],
2323
./spec/datatypes/base,
24-
./[beacon_clock, beacon_node_status, conf, version]
24+
./[beacon_clock, conf, process_state, version]
2525

2626
when defaultChroniclesStream.outputs.type.arity == 2:
2727
from std/os import commandLineParams, getEnv, splitFile
@@ -41,7 +41,7 @@ declareGauge nimVersionGauge, "Nim version info", ["version", "nim_commit"], nam
4141
nimVersionGauge.set(1, labelValues=[NimVersion, getNimGitHash()])
4242

4343
export
44-
confutils, toml_serialization, beacon_clock, beacon_node_status, conf
44+
confutils, toml_serialization, beacon_clock, conf
4545

4646

4747
type
@@ -296,7 +296,7 @@ proc runSlotLoop*[T](node: T, startTime: BeaconTime,
296296
fatal "System time adjusted backwards significantly - clock may be inaccurate - shutting down",
297297
nextSlot = shortLog(nextSlot),
298298
wallSlot = shortLog(wallSlot)
299-
bnStatus = BeaconNodeStatus.Stopping
299+
ProcessState.scheduleStop("clock skew")
300300
return
301301

302302
# Time moved back by a single slot - this could be a minor adjustment,

0 commit comments

Comments
 (0)