Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
62dc57c
feat(daemon): add monitoring layer with idle detection, stuck-state r…
andreabadesso Feb 23, 2026
1cd9e56
refactor(daemon): move stuck-processing timeout logic into Monitoring…
andreabadesso Feb 24, 2026
ee88ca9
fix(daemon): mock addAlert in integration tests to prevent real SQS c…
andreabadesso Feb 24, 2026
6a96883
fix(daemon): add missing monitoring config keys to integration test mock
andreabadesso Feb 24, 2026
0cb4b61
fix(daemon): adjust alert severities — idle=MINOR, stuck/storm=MAJOR
andreabadesso Feb 24, 2026
fd50522
feat(daemon): terminate process on idle timeout to allow kubernetes r…
andreabadesso Feb 24, 2026
78ba9c1
fix(daemon): apply defaults for monitoring config values to guard aga…
andreabadesso Feb 24, 2026
289eed9
refactor(daemon): stuck-processing only alerts; default 1 hour
andreabadesso Feb 24, 2026
2c2cc5c
Merge branch 'master' into feat/daemon-monitoring-layer
andreabadesso Feb 25, 2026
53aba8e
fix(daemon): address monitoring layer review feedback
andreabadesso Feb 26, 2026
7044b7a
fix: missing type
andreabadesso Feb 27, 2026
b020cae
refactor(daemon): address monitoring layer nitpicks
andreabadesso Feb 27, 2026
de03512
Merge branch 'master' into feat/daemon-monitoring-layer
andreabadesso Mar 4, 2026
3c6fbaa
refactor(daemon): address PR feedback on monitoring layer
andreabadesso Mar 4, 2026
17af02f
Merge branch 'master' into feat/daemon-monitoring-layer
andreabadesso Mar 5, 2026
2582051
Merge branch 'master' into feat/daemon-monitoring-layer
andreabadesso Mar 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
333 changes: 333 additions & 0 deletions packages/daemon/__tests__/actors/MonitoringActor.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,333 @@
/**
* Copyright (c) Hathor Labs and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

import MonitoringActor from '../../src/actors/MonitoringActor';
import logger from '../../src/logger';
import { EventTypes } from '../../src/types/event';
import getConfig from '../../src/config';
import { addAlert, Severity } from '@wallet-service/common';

const MONITORING_IDLE_TIMEOUT_EVENT = { type: EventTypes.MONITORING_IDLE_TIMEOUT };

jest.useFakeTimers();
jest.spyOn(global, 'setInterval');
jest.spyOn(global, 'clearInterval');
jest.spyOn(global, 'setTimeout');
jest.spyOn(global, 'clearTimeout');

jest.mock('@wallet-service/common', () => ({
...jest.requireActual('@wallet-service/common'),
addAlert: jest.fn().mockResolvedValue(undefined),
}));

const mockAddAlert = addAlert as jest.Mock;

describe('MonitoringActor', () => {
let mockCallback: jest.Mock;
let mockReceive: jest.Mock;
let receiveCallback: (event: any) => void;
let config: ReturnType<typeof getConfig>;
let processExitSpy: jest.SpyInstance;

const sendEvent = (monitoringEventType: string) => {
receiveCallback({
type: EventTypes.MONITORING_EVENT,
event: { type: monitoringEventType },
});
};

beforeEach(() => {
jest.clearAllMocks();
jest.clearAllTimers();
processExitSpy = jest.spyOn(process, 'exit').mockImplementation(() => undefined as never);
config = getConfig();
config['IDLE_EVENT_TIMEOUT_MS'] = 5 * 60 * 1000; // 5 min
config['STUCK_PROCESSING_TIMEOUT_MS'] = 5 * 60 * 1000; // 5 min
config['RECONNECTION_STORM_THRESHOLD'] = 3; // low threshold for tests
config['RECONNECTION_STORM_WINDOW_MS'] = 5 * 60 * 1000; // 5 min

mockCallback = jest.fn();
mockReceive = jest.fn().mockImplementation((cb: any) => {
receiveCallback = cb;
});
});

afterEach(() => {
processExitSpy.mockRestore();
});

afterAll(() => {
jest.clearAllMocks();
jest.useRealTimers();
});

// ── Idle detection ───────────────────────────────────────────────────────────

it('should not start the idle timer on initialization', () => {
MonitoringActor(mockCallback, mockReceive, config);
expect(setInterval).not.toHaveBeenCalled();
});

it('should start the idle timer when receiving a CONNECTED event', () => {
MonitoringActor(mockCallback, mockReceive, config);
sendEvent('CONNECTED');
expect(setInterval).toHaveBeenCalledTimes(1);
});

it('should stop the idle timer when receiving a DISCONNECTED event', () => {
MonitoringActor(mockCallback, mockReceive, config);
sendEvent('CONNECTED');
sendEvent('DISCONNECTED');
expect(clearInterval).toHaveBeenCalledTimes(1);
});

it('should stop the idle timer when the actor is stopped', () => {
const stopActor = MonitoringActor(mockCallback, mockReceive, config);
sendEvent('CONNECTED');
stopActor();
expect(clearInterval).toHaveBeenCalledTimes(1);
});

it('should fire an idle alert and send MONITORING_IDLE_TIMEOUT after IDLE_EVENT_TIMEOUT_MS with no events', async () => {
MonitoringActor(mockCallback, mockReceive, config);
sendEvent('CONNECTED');

jest.advanceTimersByTime(config['IDLE_EVENT_TIMEOUT_MS'] + 1);
await Promise.resolve();
await Promise.resolve(); // flush the .finally() microtask

expect(mockAddAlert).toHaveBeenCalledTimes(1);
expect(mockAddAlert.mock.calls[0][0]).toBe('Daemon Idle — No Events Received');
expect(mockAddAlert.mock.calls[0][2]).toBe(Severity.MAJOR);
expect(mockCallback).toHaveBeenCalledWith(MONITORING_IDLE_TIMEOUT_EVENT);
expect(processExitSpy).not.toHaveBeenCalled();
});

it('should NOT fire an idle alert when events keep arriving', async () => {
MonitoringActor(mockCallback, mockReceive, config);
sendEvent('CONNECTED');

// Stay below the threshold each time
jest.advanceTimersByTime(config['IDLE_EVENT_TIMEOUT_MS'] - 1000);
sendEvent('EVENT_RECEIVED');
jest.advanceTimersByTime(config['IDLE_EVENT_TIMEOUT_MS'] - 1000);

await Promise.resolve();
expect(mockAddAlert).not.toHaveBeenCalled();
});

it('should fire only one idle alert and send MONITORING_IDLE_TIMEOUT once per idle period', async () => {
MonitoringActor(mockCallback, mockReceive, config);
sendEvent('CONNECTED');

jest.advanceTimersByTime(config['IDLE_EVENT_TIMEOUT_MS'] * 3);
await Promise.resolve();
await Promise.resolve();

expect(mockAddAlert).toHaveBeenCalledTimes(1);
expect(mockCallback).toHaveBeenCalledTimes(1);
expect(mockCallback).toHaveBeenCalledWith(MONITORING_IDLE_TIMEOUT_EVENT);
expect(processExitSpy).not.toHaveBeenCalled();
});

it('should reset the idle alert flag when an event is received, allowing a second MONITORING_IDLE_TIMEOUT', async () => {
MonitoringActor(mockCallback, mockReceive, config);
sendEvent('CONNECTED');

// Trigger first alert (interval = T/2, fires at T/2 then T — alert at T)
jest.advanceTimersByTime(config['IDLE_EVENT_TIMEOUT_MS'] + 1);
await Promise.resolve();
await Promise.resolve();
expect(mockAddAlert).toHaveBeenCalledTimes(1);
expect(mockCallback).toHaveBeenCalledTimes(1);

// Receive an event — resets idleAlertFired and lastEventReceivedAt (~T from start)
sendEvent('EVENT_RECEIVED');

// With interval=T/2, interval fires at 3T/2, 2T, 5T/2, … from start.
// The first fire where idleMs >= T after EVENT_RECEIVED is at 5T/2 (idleMs = 3T/2 - ε).
// Advancing 2T from here (total ~3T from start) covers 5T/2, so the second alert fires.
jest.advanceTimersByTime(2 * config['IDLE_EVENT_TIMEOUT_MS']);
await Promise.resolve();
await Promise.resolve();

expect(mockAddAlert).toHaveBeenCalledTimes(2);
expect(mockCallback).toHaveBeenCalledTimes(2);
expect(processExitSpy).not.toHaveBeenCalled();
});

it('should restart the idle timer when CONNECTED is sent while already running', () => {
MonitoringActor(mockCallback, mockReceive, config);
sendEvent('CONNECTED');
sendEvent('CONNECTED'); // second connect clears old and starts new
expect(clearInterval).toHaveBeenCalledTimes(1);
expect(setInterval).toHaveBeenCalledTimes(2);
});

// ── Stuck-processing detection ───────────────────────────────────────────────

it('should start a stuck timer on PROCESSING_STARTED', () => {
MonitoringActor(mockCallback, mockReceive, config);
sendEvent('PROCESSING_STARTED');
expect(setTimeout).toHaveBeenCalledTimes(1);
});

it('should cancel the stuck timer on PROCESSING_COMPLETED', () => {
MonitoringActor(mockCallback, mockReceive, config);
sendEvent('PROCESSING_STARTED');
sendEvent('PROCESSING_COMPLETED');
expect(clearTimeout).toHaveBeenCalledTimes(1);
});

it('should fire a MAJOR alert when stuck and NOT send MONITORING_IDLE_TIMEOUT', async () => {
MonitoringActor(mockCallback, mockReceive, config);
sendEvent('PROCESSING_STARTED');

jest.advanceTimersByTime(config['STUCK_PROCESSING_TIMEOUT_MS'] + 1);
await Promise.resolve();
await Promise.resolve();

expect(mockAddAlert).toHaveBeenCalledTimes(1);
expect(mockAddAlert.mock.calls[0][0]).toBe('Daemon Stuck In Processing State');
expect(mockAddAlert.mock.calls[0][2]).toBe(Severity.MAJOR);
// Stuck detection intentionally does not notify the machine — machine keeps running
expect(mockCallback).not.toHaveBeenCalled();
});

it('should NOT fire the stuck alert when PROCESSING_COMPLETED arrives in time', async () => {
MonitoringActor(mockCallback, mockReceive, config);
sendEvent('PROCESSING_STARTED');

jest.advanceTimersByTime(config['STUCK_PROCESSING_TIMEOUT_MS'] - 1000);
sendEvent('PROCESSING_COMPLETED');

jest.advanceTimersByTime(2000); // advance past original timeout
await Promise.resolve();

expect(mockAddAlert).not.toHaveBeenCalled();
expect(mockCallback).not.toHaveBeenCalled();
});

it('should reset the stuck timer on consecutive PROCESSING_STARTED events', () => {
MonitoringActor(mockCallback, mockReceive, config);
sendEvent('PROCESSING_STARTED');
sendEvent('PROCESSING_STARTED'); // second one clears the first
expect(clearTimeout).toHaveBeenCalledTimes(1);
expect(setTimeout).toHaveBeenCalledTimes(2);
});

it('should stop the stuck timer when the actor is stopped', () => {
const stopActor = MonitoringActor(mockCallback, mockReceive, config);
sendEvent('PROCESSING_STARTED');
stopActor();
expect(clearTimeout).toHaveBeenCalledTimes(1);
});

it('should also clear the stuck timer on DISCONNECTED', () => {
MonitoringActor(mockCallback, mockReceive, config);
sendEvent('CONNECTED');
sendEvent('PROCESSING_STARTED');
sendEvent('DISCONNECTED');
// clearTimeout for stuck timer + clearInterval for idle timer
expect(clearTimeout).toHaveBeenCalledTimes(1);
expect(clearInterval).toHaveBeenCalledTimes(1);
});

// ── Reconnection storm detection ─────────────────────────────────────────────

it('should fire a reconnection storm alert when the threshold is reached', async () => {
MonitoringActor(mockCallback, mockReceive, config);

sendEvent('RECONNECTING');
sendEvent('RECONNECTING');
sendEvent('RECONNECTING'); // threshold is 3 in test config

await Promise.resolve();
expect(mockAddAlert).toHaveBeenCalledTimes(1);
expect(mockAddAlert.mock.calls[0][0]).toBe('Daemon Reconnection Storm');
expect(mockAddAlert.mock.calls[0][2]).toBe(Severity.MAJOR);
});

it('should NOT fire a reconnection storm alert below the threshold', async () => {
MonitoringActor(mockCallback, mockReceive, config);

sendEvent('RECONNECTING');
sendEvent('RECONNECTING');

await Promise.resolve();
expect(mockAddAlert).not.toHaveBeenCalled();
});

it('should not fire more than one storm alert within the 1-minute cooldown window', async () => {
MonitoringActor(mockCallback, mockReceive, config);

// Trigger threshold
sendEvent('RECONNECTING');
sendEvent('RECONNECTING');
sendEvent('RECONNECTING'); // threshold = 3 → first alert

await Promise.resolve();
expect(mockAddAlert).toHaveBeenCalledTimes(1);

// Additional reconnections within cooldown (no time advanced)
sendEvent('RECONNECTING');
sendEvent('RECONNECTING');

await Promise.resolve();
// Cooldown prevents a second alert
expect(mockAddAlert).toHaveBeenCalledTimes(1);
});

it('should fire another storm alert after the 1-minute cooldown expires', async () => {
MonitoringActor(mockCallback, mockReceive, config);

sendEvent('RECONNECTING');
sendEvent('RECONNECTING');
sendEvent('RECONNECTING'); // first alert

await Promise.resolve();
expect(mockAddAlert).toHaveBeenCalledTimes(1);

// Advance past the 1-minute cooldown
jest.advanceTimersByTime(61 * 1000);

sendEvent('RECONNECTING'); // still >= threshold in window, cooldown expired

await Promise.resolve();
expect(mockAddAlert).toHaveBeenCalledTimes(2);
expect(mockAddAlert.mock.calls[1][0]).toBe('Daemon Reconnection Storm');
});

it('should evict old reconnections outside the storm window', async () => {
MonitoringActor(mockCallback, mockReceive, config);

sendEvent('RECONNECTING');
sendEvent('RECONNECTING');

jest.advanceTimersByTime(config['RECONNECTION_STORM_WINDOW_MS'] + 1000);

// Only 1 new reconnection — below threshold after eviction
sendEvent('RECONNECTING');

await Promise.resolve();
expect(mockAddAlert).not.toHaveBeenCalled();
});

// ── Misc ─────────────────────────────────────────────────────────────────────

it('should ignore events of other types and log a warning', () => {
const warnSpy = jest.spyOn(logger, 'warn');
MonitoringActor(mockCallback, mockReceive, config);

receiveCallback({ type: 'SOME_OTHER_EVENT', event: { type: 'WHATEVER' } });

expect(warnSpy).toHaveBeenCalledWith(
'[monitoring] Unexpected event type received by MonitoringActor',
);
expect(setInterval).not.toHaveBeenCalled();
});
});
4 changes: 4 additions & 0 deletions packages/daemon/__tests__/integration/balances.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ getConfig.mockReturnValue({
DB_PASS,
DB_PORT,
ACK_TIMEOUT_MS: 20000,
IDLE_EVENT_TIMEOUT_MS: 5 * 60 * 1000,
STUCK_PROCESSING_TIMEOUT_MS: 5 * 60 * 1000,
RECONNECTION_STORM_THRESHOLD: 10,
RECONNECTION_STORM_WINDOW_MS: 5 * 60 * 1000,
});

let mysql: Connection;
Expand Down
4 changes: 4 additions & 0 deletions packages/daemon/__tests__/integration/token_creation.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ getConfig.mockReturnValue({
DB_PASS,
DB_PORT,
ACK_TIMEOUT_MS: 20000,
IDLE_EVENT_TIMEOUT_MS: 5 * 60 * 1000,
STUCK_PROCESSING_TIMEOUT_MS: 5 * 60 * 1000,
RECONNECTION_STORM_THRESHOLD: 10,
RECONNECTION_STORM_WINDOW_MS: 5 * 60 * 1000,
});

let mysql: Connection;
Expand Down
16 changes: 16 additions & 0 deletions packages/daemon/jestIntegrationSetup.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/**
* Copyright (c) Hathor Labs and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

/**
* Integration test environment setup.
* Mocks addAlert so MonitoringActor does not attempt real SQS/SNS connections
* in environments where AWS credentials / region are not configured.
*/
jest.mock('@wallet-service/common', () => ({
...jest.requireActual('@wallet-service/common'),
addAlert: jest.fn().mockResolvedValue(undefined),
}));
1 change: 1 addition & 0 deletions packages/daemon/jest_integration.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const mainTestMatch = process.env.SPECIFIC_INTEGRATION_TEST_FILE
module.exports = {
roots: ["<rootDir>/__tests__"],
setupFiles: ['./jestSetup.ts'],
setupFilesAfterEnv: ['./jestIntegrationSetup.ts'],
transform: {
"^.+\\.ts$": ["ts-jest", {
tsconfig: "./tsconfig.json",
Expand Down
Loading