From b2a88d913219940913c62f824daf1cad67e2e624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Abadesso?= Date: Tue, 14 Apr 2026 11:36:36 -0300 Subject: [PATCH 1/5] feat(daemon): scheduled address balance validation in MonitoringActor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an optional periodic safety-net check that compares every row in address_balance against the sum of its non-voided address_tx_history rows, firing a MAJOR alert on any mismatch. This complements #376, which made per-void validation skippable for performance — the scheduled check closes the loop by guaranteeing full-database consistency is still verified on an operator-tunable cadence. The check is a single SQL LEFT JOIN query bounded by LIMIT 100, running on a configurable interval while CONNECTED: BALANCE_VALIDATION_ENABLED default: false BALANCE_VALIDATION_INTERVAL_MS default: 600000 (10 min) Letting the database do the pairing (JOIN), the math (native BIGINT, no JS Number coercion), and the consistency snapshot (single statement = one InnoDB read view) keeps the actor logic to ~30 lines and avoids a family of correctness traps that come with paginated pairing in application code — index desync when filters diverge, precision loss on large token supplies, and races between separate balance/history queries. An in-flight guard skips overlapping ticks so a slow run never stacks connections. Errors log and move on — the daemon never crashes on validation failure. DISCONNECTED clears the interval; any in-flight SELECT completes and releases its connection harmlessly. If mismatches are found, the alert payload carries up to 100 sample rows and a `truncated` flag so operators know whether to run their own unbounded query to assess full extent. Tests: 7 new cases covering enable/disable, start/stop lifecycle, mismatch detection, no-mismatch path, truncation, DB error handling, and idle/stuck/storm regression (22 existing cases still pass). Depends on #369. Relates to #380. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../__tests__/actors/MonitoringActor.test.ts | 146 ++++++++++++++++++ packages/daemon/src/actors/MonitoringActor.ts | 106 +++++++++++++ packages/daemon/src/config.ts | 6 + 3 files changed, 258 insertions(+) diff --git a/packages/daemon/__tests__/actors/MonitoringActor.test.ts b/packages/daemon/__tests__/actors/MonitoringActor.test.ts index 07ce898d..b8411030 100644 --- a/packages/daemon/__tests__/actors/MonitoringActor.test.ts +++ b/packages/daemon/__tests__/actors/MonitoringActor.test.ts @@ -10,6 +10,7 @@ import logger from '../../src/logger'; import { EventTypes } from '../../src/types/event'; import getConfig from '../../src/config'; import { addAlert, Severity } from '@wallet-service/common'; +import * as db from '../../src/db'; const MONITORING_IDLE_TIMEOUT_EVENT = { type: EventTypes.MONITORING_IDLE_TIMEOUT }; @@ -24,6 +25,10 @@ jest.mock('@wallet-service/common', () => ({ addAlert: jest.fn().mockResolvedValue(undefined), })); +jest.mock('../../src/db', () => ({ + getDbConnection: jest.fn(), +})); + const mockAddAlert = addAlert as jest.Mock; describe('MonitoringActor', () => { @@ -49,6 +54,8 @@ describe('MonitoringActor', () => { config['STUCK_PROCESSING_TIMEOUT_MS'] = 5 * 60 * 1000; // 5 min config['RECONNECTION_STORM_THRESHOLD'] = 3; // low threshold for tests config['RECONNECTION_STORM_WINDOW_MS'] = 5 * 60 * 1000; // 5 min + config['BALANCE_VALIDATION_ENABLED'] = false; + config['BALANCE_VALIDATION_INTERVAL_MS'] = 5000; mockCallback = jest.fn(); mockReceive = jest.fn().mockImplementation((cb: any) => { @@ -330,4 +337,143 @@ describe('MonitoringActor', () => { ); expect(setInterval).not.toHaveBeenCalled(); }); + + // ── Balance validation ──────────────────────────────────────────────────── + + const flushPromises = () => new Promise(jest.requireActual('timers').setImmediate); + + describe('balance validation', () => { + let mockMysql: any; + + beforeEach(() => { + mockMysql = { + release: jest.fn(), + query: jest.fn().mockResolvedValue([[], []]), + }; + (db.getDbConnection as jest.Mock).mockResolvedValue(mockMysql); + }); + + it('should not start balance validation when disabled', () => { + config['BALANCE_VALIDATION_ENABLED'] = false; + MonitoringActor(mockCallback, mockReceive, config); + sendEvent('CONNECTED'); + + // Only the idle-check interval should fire; no validation interval. + expect(setInterval).toHaveBeenCalledTimes(1); + }); + + it('should start the validation interval on CONNECTED when enabled', () => { + config['BALANCE_VALIDATION_ENABLED'] = true; + MonitoringActor(mockCallback, mockReceive, config); + sendEvent('CONNECTED'); + + // Idle check + balance validation = 2 intervals. + expect(setInterval).toHaveBeenCalledTimes(2); + expect(setInterval).toHaveBeenCalledWith( + expect.any(Function), + config['BALANCE_VALIDATION_INTERVAL_MS'], + ); + }); + + it('should clear the validation interval on DISCONNECTED', () => { + config['BALANCE_VALIDATION_ENABLED'] = true; + MonitoringActor(mockCallback, mockReceive, config); + sendEvent('CONNECTED'); + sendEvent('DISCONNECTED'); + + // Idle check + balance validation = 2 cleared intervals. + expect(clearInterval).toHaveBeenCalledTimes(2); + }); + + it('should alert when the validation query returns mismatch rows', async () => { + config['BALANCE_VALIDATION_ENABLED'] = true; + + const mismatchRow = { + address: 'addr1', + tokenId: 'token1', + balanceSum: '100', + historySum: '200', + }; + mockMysql.query.mockResolvedValueOnce([[mismatchRow], []]); + + MonitoringActor(mockCallback, mockReceive, config); + sendEvent('CONNECTED'); + + jest.advanceTimersByTime(config['BALANCE_VALIDATION_INTERVAL_MS']); + await flushPromises(); + + expect(mockMysql.query).toHaveBeenCalledWith(expect.stringContaining('LEFT JOIN')); + expect(mockAddAlert).toHaveBeenCalledWith( + 'Balance validation found mismatches', + expect.stringContaining('1 balance mismatch'), + Severity.MAJOR, + expect.objectContaining({ + truncated: false, + samples: [mismatchRow], + }), + expect.anything(), + ); + expect(mockMysql.release).toHaveBeenCalled(); + }); + + it('should log info when no mismatches found', async () => { + config['BALANCE_VALIDATION_ENABLED'] = true; + const mockLoggerInfo = jest.spyOn(logger, 'info'); + + mockMysql.query.mockResolvedValueOnce([[], []]); + + MonitoringActor(mockCallback, mockReceive, config); + sendEvent('CONNECTED'); + + jest.advanceTimersByTime(config['BALANCE_VALIDATION_INTERVAL_MS']); + await flushPromises(); + + expect(mockAddAlert).not.toHaveBeenCalled(); + expect(mockLoggerInfo).toHaveBeenCalledWith( + expect.stringContaining('no mismatches found'), + ); + }); + + it('should mark the alert as truncated when the row count hits the LIMIT', async () => { + config['BALANCE_VALIDATION_ENABLED'] = true; + + // The actor's SAMPLE_LIMIT is 100; if exactly that many come back we + // assume more exist and surface "100+" + truncated:true. + const rows = Array.from({ length: 100 }, (_, i) => ({ + address: `addr${i}`, tokenId: 'tok', balanceSum: '1', historySum: '0', + })); + mockMysql.query.mockResolvedValueOnce([rows, []]); + + MonitoringActor(mockCallback, mockReceive, config); + sendEvent('CONNECTED'); + + jest.advanceTimersByTime(config['BALANCE_VALIDATION_INTERVAL_MS']); + await flushPromises(); + + expect(mockAddAlert).toHaveBeenCalledWith( + 'Balance validation found mismatches', + expect.stringContaining('100+'), + Severity.MAJOR, + expect.objectContaining({ truncated: true }), + expect.anything(), + ); + }); + + it('should handle DB errors without crashing', async () => { + config['BALANCE_VALIDATION_ENABLED'] = true; + const mockLoggerError = jest.spyOn(logger, 'error'); + + (db.getDbConnection as jest.Mock).mockRejectedValueOnce(new Error('DB connection failed')); + + MonitoringActor(mockCallback, mockReceive, config); + sendEvent('CONNECTED'); + + jest.advanceTimersByTime(config['BALANCE_VALIDATION_INTERVAL_MS']); + await flushPromises(); + + expect(mockLoggerError).toHaveBeenCalledWith( + expect.stringContaining('Balance validation error'), + ); + }); + }); }); diff --git a/packages/daemon/src/actors/MonitoringActor.ts b/packages/daemon/src/actors/MonitoringActor.ts index 89e36012..948d00b4 100644 --- a/packages/daemon/src/actors/MonitoringActor.ts +++ b/packages/daemon/src/actors/MonitoringActor.ts @@ -9,6 +9,7 @@ import logger from '../logger'; import getConfig from '../config'; import { addAlert, Severity } from '@wallet-service/common'; import { Event, EventTypes } from '../types'; +import { getDbConnection } from '../db'; /** * MonitoringActor @@ -33,6 +34,12 @@ import { Event, EventTypes } from '../types'; * reconnects more than RECONNECTION_STORM_THRESHOLD times within * RECONNECTION_STORM_WINDOW_MS. Duplicate alerts are suppressed for * STORM_ALERT_COOLDOWN_MS (1 min) to avoid spamming the alerting system. + * + * 4. Scheduled balance validation — when BALANCE_VALIDATION_ENABLED is true, + * periodically runs a single SQL query that joins address_balance against + * SUM(address_tx_history.balance) and reports rows where the two disagree. + * Bounded by LIMIT, so a catastrophic mismatch produces a sample, not a + * flood. Errors never crash the daemon. */ export default (callback: any, receive: any, config = getConfig()) => { logger.info('Starting monitoring actor'); @@ -150,6 +157,102 @@ export default (callback: any, receive: any, config = getConfig()) => { } }; + // ── Scheduled balance validation ────────────────────────────────────────────── + // + // One SQL query per tick. The DB does the pairing (LEFT JOIN), the math + // (native BIGINT, no precision loss), and the consistency snapshot (single + // statement = one read view). The query is bounded by LIMIT so a catastrophic + // mismatch produces a sample, not a megabyte of payload. + // + // If a run exceeds the interval the in-flight guard skips the next tick + // rather than overlapping. DISCONNECTED clears the timer; an in-flight + // SELECT runs to completion and releases its connection — harmless. + // + // Sample size note: 100 is intentional. Any non-zero count means we'll dive + // into the data with our own queries anyway; the alert just needs to prove + // something is wrong and give a representative starting point. + const BALANCE_VALIDATION_SAMPLE_LIMIT = 100; + const BALANCE_VALIDATION_SQL = ` + SELECT + ab.address, + ab.token_id AS tokenId, + CAST(ab.unlocked_balance + ab.locked_balance AS SIGNED) AS balanceSum, + CAST(COALESCE(SUM(h.balance), 0) AS SIGNED) AS historySum + FROM \`address_balance\` ab + LEFT JOIN \`address_tx_history\` h + ON h.address = ab.address + AND h.token_id = ab.token_id + AND h.voided = FALSE + WHERE ab.transactions > 0 + GROUP BY ab.address, ab.token_id + HAVING balanceSum != historySum + LIMIT ${BALANCE_VALIDATION_SAMPLE_LIMIT} + `; + + let balanceValidationTimer: ReturnType | null = null; + let isValidating = false; + + const runBalanceValidation = async () => { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + let mysql: any; + try { + mysql = await getDbConnection(); + const [rows] = await mysql.query(BALANCE_VALIDATION_SQL); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const samples = rows as any[]; + + if (samples.length > 0) { + const truncated = samples.length === BALANCE_VALIDATION_SAMPLE_LIMIT; + const countLabel = truncated ? `${BALANCE_VALIDATION_SAMPLE_LIMIT}+` : String(samples.length); + logger.error(`[monitoring] Balance validation found ${countLabel} mismatch(es)`, { samples }); + await addAlert( + 'Balance validation found mismatches', + `Found ${countLabel} balance mismatch(es)${truncated ? ' (sample capped)' : ''}`, + Severity.MAJOR, + { samples, truncated }, + logger, + ); + } else { + logger.info('[monitoring] Balance validation complete, no mismatches found'); + } + } catch (err) { + logger.error(`[monitoring] Balance validation error: ${err}`); + } finally { + if (mysql) { + try { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (mysql as any).release(); + } catch (releaseErr) { + logger.warn(`[monitoring] Balance validation: connection release failed: ${releaseErr}`); + } + } + } + }; + + const startBalanceValidation = () => { + if (!config.BALANCE_VALIDATION_ENABLED) return; + stopBalanceValidation(); + + logger.info('[monitoring] Starting scheduled balance validation'); + balanceValidationTimer = setInterval(async () => { + if (isValidating) return; // prior run still going — skip this tick + isValidating = true; + try { + await runBalanceValidation(); + } finally { + isValidating = false; + } + }, config.BALANCE_VALIDATION_INTERVAL_MS); + }; + + const stopBalanceValidation = () => { + if (balanceValidationTimer) { + clearInterval(balanceValidationTimer); + balanceValidationTimer = null; + } + }; + + // ── Event handling ──────────────────────────────────────────────────────────── receive((event: Event) => { if (event.type !== EventTypes.MONITORING_EVENT) { @@ -162,6 +265,7 @@ export default (callback: any, receive: any, config = getConfig()) => { logger.info('[monitoring] WebSocket connected — starting idle-event timer'); isConnected = true; startIdleCheck(); + startBalanceValidation(); break; case 'DISCONNECTED': @@ -169,6 +273,7 @@ export default (callback: any, receive: any, config = getConfig()) => { isConnected = false; stopIdleCheck(); clearStuckTimer(); + stopBalanceValidation(); break; case 'EVENT_RECEIVED': @@ -194,5 +299,6 @@ export default (callback: any, receive: any, config = getConfig()) => { logger.info('Stopping monitoring actor'); stopIdleCheck(); clearStuckTimer(); + stopBalanceValidation(); }; }; diff --git a/packages/daemon/src/config.ts b/packages/daemon/src/config.ts index 8a5d683a..6f32f557 100644 --- a/packages/daemon/src/config.ts +++ b/packages/daemon/src/config.ts @@ -100,6 +100,10 @@ export const RECONNECTION_STORM_WINDOW_MS = parseInt(process.env.RECONNECTION_ST // Other export const USE_SSL = process.env.USE_SSL === 'true'; +// Scheduled balance validation configuration +export const BALANCE_VALIDATION_ENABLED = process.env.BALANCE_VALIDATION_ENABLED === 'true'; +export const BALANCE_VALIDATION_INTERVAL_MS = parseInt(process.env.BALANCE_VALIDATION_INTERVAL_MS ?? '600000', 10); // 10 minutes + // Reorg size thresholds for different alert levels export const REORG_SIZE_INFO = parseInt(process.env.REORG_SIZE_INFO ?? '1', 10); export const REORG_SIZE_MINOR = parseInt(process.env.REORG_SIZE_MINOR ?? '3', 10); @@ -141,6 +145,8 @@ export default () => ({ STUCK_PROCESSING_TIMEOUT_MS, RECONNECTION_STORM_THRESHOLD, RECONNECTION_STORM_WINDOW_MS, + BALANCE_VALIDATION_ENABLED, + BALANCE_VALIDATION_INTERVAL_MS, REORG_SIZE_INFO, REORG_SIZE_MINOR, REORG_SIZE_MAJOR, From 0358af91da39789a34766337c74b0f5ebea3d48b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Abadesso?= Date: Wed, 22 Apr 2026 13:59:45 -0300 Subject: [PATCH 2/5] fix(daemon): tighten scheduled balance validation per review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four related changes addressing @luislhl + Copilot comments: - Guard BALANCE_VALIDATION_INTERVAL_MS against NaN / below-minimum values. `parseInt('abc', 10)` yields NaN, and `setInterval(fn, NaN)` behaves like delay=0 — a tight loop hammering the DB. On invalid input, log a loud error and leave the scheduler disabled for the session rather than silently substituting a default (operators should see and fix the misconfig). - Log errors with `err instanceof Error ? (err.stack ?? err.message) : String(err)` instead of template-interpolating `${err}`. The old form collapsed non-Error throws to `[object Object]` and dropped stack traces entirely. - Expose the sample cap as `BALANCE_VALIDATION_SAMPLE_LIMIT` (default 100). Naming reflects what it actually is — a result-row cap for the alert payload, not a batch size. The query is still a full-table scan per tick; a follow-up for cursor-based batching is out of scope. - Drop `WHERE ab.transactions > 0` from the validation SQL. The LEFT JOIN + COALESCE(SUM, 0) handles empty history correctly, and the filter was hiding a real bug class: rows with `transactions = 0` AND non-zero balance (void cleanup should have deleted them). Genuinely empty rows still match `historySum = 0` and get filtered by HAVING, so no false positives. Code comment above the SQL also warns that LIMIT bounds the result payload, not execution cost — operators should EXPLAIN this against production-sized data before enabling BALANCE_VALIDATION_ENABLED=true. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../__tests__/actors/MonitoringActor.test.ts | 31 ++++++++++ packages/daemon/src/actors/MonitoringActor.ts | 57 +++++++++++++++---- packages/daemon/src/config.ts | 5 ++ 3 files changed, 82 insertions(+), 11 deletions(-) diff --git a/packages/daemon/__tests__/actors/MonitoringActor.test.ts b/packages/daemon/__tests__/actors/MonitoringActor.test.ts index b8411030..539e9b12 100644 --- a/packages/daemon/__tests__/actors/MonitoringActor.test.ts +++ b/packages/daemon/__tests__/actors/MonitoringActor.test.ts @@ -56,6 +56,7 @@ describe('MonitoringActor', () => { config['RECONNECTION_STORM_WINDOW_MS'] = 5 * 60 * 1000; // 5 min config['BALANCE_VALIDATION_ENABLED'] = false; config['BALANCE_VALIDATION_INTERVAL_MS'] = 5000; + config['BALANCE_VALIDATION_SAMPLE_LIMIT'] = 100; mockCallback = jest.fn(); mockReceive = jest.fn().mockImplementation((cb: any) => { @@ -475,5 +476,35 @@ describe('MonitoringActor', () => { expect.stringContaining('Balance validation error'), ); }); + + it('should refuse to schedule validation when interval is NaN', () => { + config['BALANCE_VALIDATION_ENABLED'] = true; + config['BALANCE_VALIDATION_INTERVAL_MS'] = NaN; + const mockLoggerError = jest.spyOn(logger, 'error'); + + MonitoringActor(mockCallback, mockReceive, config); + sendEvent('CONNECTED'); + + // Only the idle-check interval should fire; the validation interval + // must NOT be scheduled because the config is invalid. + expect(setInterval).toHaveBeenCalledTimes(1); + expect(mockLoggerError).toHaveBeenCalledWith( + expect.stringContaining('BALANCE_VALIDATION_INTERVAL_MS=NaN is invalid'), + ); + }); + + it('should refuse to schedule validation when interval is below the minimum', () => { + config['BALANCE_VALIDATION_ENABLED'] = true; + config['BALANCE_VALIDATION_INTERVAL_MS'] = 10; // below the 1000ms floor + const mockLoggerError = jest.spyOn(logger, 'error'); + + MonitoringActor(mockCallback, mockReceive, config); + sendEvent('CONNECTED'); + + expect(setInterval).toHaveBeenCalledTimes(1); + expect(mockLoggerError).toHaveBeenCalledWith( + expect.stringContaining('is invalid'), + ); + }); }); }); diff --git a/packages/daemon/src/actors/MonitoringActor.ts b/packages/daemon/src/actors/MonitoringActor.ts index 948d00b4..a5405e5e 100644 --- a/packages/daemon/src/actors/MonitoringActor.ts +++ b/packages/daemon/src/actors/MonitoringActor.ts @@ -164,14 +164,27 @@ export default (callback: any, receive: any, config = getConfig()) => { // statement = one read view). The query is bounded by LIMIT so a catastrophic // mismatch produces a sample, not a megabyte of payload. // + // Cost caveat: LIMIT bounds *result rows*, not execution cost. The LEFT JOIN + // + GROUP BY still scans every `address_balance` row and joins against + // `address_tx_history` on each tick. Operators should EXPLAIN this query + // against production-sized data before enabling — see the follow-up issue + // tracked for cursor-based batching if the scan cost is prohibitive. + // + // The `transactions > 0` filter is intentionally omitted: a row with + // `transactions = 0` AND non-zero balance is itself a bug (the void cleanup + // should have deleted it), and we want the validator to surface that. + // Genuinely-empty rows (transactions=0, balance=0) match `historySum=0` + // via the COALESCE and are filtered out by HAVING. + // // If a run exceeds the interval the in-flight guard skips the next tick // rather than overlapping. DISCONNECTED clears the timer; an in-flight // SELECT runs to completion and releases its connection — harmless. // - // Sample size note: 100 is intentional. Any non-zero count means we'll dive - // into the data with our own queries anyway; the alert just needs to prove - // something is wrong and give a representative starting point. - const BALANCE_VALIDATION_SAMPLE_LIMIT = 100; + // Sample limit is configurable via BALANCE_VALIDATION_SAMPLE_LIMIT (default + // 100). Any non-zero count means operators will dive into the data with + // their own queries anyway; the alert just needs to prove something is + // wrong and give a representative starting point. + const sampleLimit = config.BALANCE_VALIDATION_SAMPLE_LIMIT; const BALANCE_VALIDATION_SQL = ` SELECT ab.address, @@ -183,10 +196,9 @@ export default (callback: any, receive: any, config = getConfig()) => { ON h.address = ab.address AND h.token_id = ab.token_id AND h.voided = FALSE - WHERE ab.transactions > 0 GROUP BY ab.address, ab.token_id HAVING balanceSum != historySum - LIMIT ${BALANCE_VALIDATION_SAMPLE_LIMIT} + LIMIT ${sampleLimit} `; let balanceValidationTimer: ReturnType | null = null; @@ -202,8 +214,8 @@ export default (callback: any, receive: any, config = getConfig()) => { const samples = rows as any[]; if (samples.length > 0) { - const truncated = samples.length === BALANCE_VALIDATION_SAMPLE_LIMIT; - const countLabel = truncated ? `${BALANCE_VALIDATION_SAMPLE_LIMIT}+` : String(samples.length); + const truncated = samples.length === sampleLimit; + const countLabel = truncated ? `${sampleLimit}+` : String(samples.length); logger.error(`[monitoring] Balance validation found ${countLabel} mismatch(es)`, { samples }); await addAlert( 'Balance validation found mismatches', @@ -216,21 +228,44 @@ export default (callback: any, receive: any, config = getConfig()) => { logger.info('[monitoring] Balance validation complete, no mismatches found'); } } catch (err) { - logger.error(`[monitoring] Balance validation error: ${err}`); + const detail = err instanceof Error ? (err.stack ?? err.message) : String(err); + logger.error(`[monitoring] Balance validation error: ${detail}`); } finally { if (mysql) { try { // eslint-disable-next-line @typescript-eslint/no-explicit-any (mysql as any).release(); } catch (releaseErr) { - logger.warn(`[monitoring] Balance validation: connection release failed: ${releaseErr}`); + const detail = releaseErr instanceof Error + ? (releaseErr.stack ?? releaseErr.message) + : String(releaseErr); + logger.warn(`[monitoring] Balance validation: connection release failed: ${detail}`); } } } }; + // Minimum tick interval. Below this, we'd hammer the DB faster than a + // validation run can reasonably complete and risk cascading overruns. + const MIN_BALANCE_VALIDATION_INTERVAL_MS = 1000; + const startBalanceValidation = () => { if (!config.BALANCE_VALIDATION_ENABLED) return; + + const intervalMs = config.BALANCE_VALIDATION_INTERVAL_MS; + // Guard against misconfig: parseInt('abc') yields NaN, and setInterval(fn, NaN) + // behaves like delay=0 — a tight loop hammering the DB. Fail loud and stay + // disabled rather than silently substitute a default; operators should see + // this and fix the env var. + if (!Number.isFinite(intervalMs) || intervalMs < MIN_BALANCE_VALIDATION_INTERVAL_MS) { + logger.error( + `[monitoring] BALANCE_VALIDATION_INTERVAL_MS=${intervalMs} is invalid ` + + `(must be a finite number >= ${MIN_BALANCE_VALIDATION_INTERVAL_MS}). ` + + 'Scheduled balance validation will NOT run this session.', + ); + return; + } + stopBalanceValidation(); logger.info('[monitoring] Starting scheduled balance validation'); @@ -242,7 +277,7 @@ export default (callback: any, receive: any, config = getConfig()) => { } finally { isValidating = false; } - }, config.BALANCE_VALIDATION_INTERVAL_MS); + }, intervalMs); }; const stopBalanceValidation = () => { diff --git a/packages/daemon/src/config.ts b/packages/daemon/src/config.ts index 6f32f557..85a6b6ee 100644 --- a/packages/daemon/src/config.ts +++ b/packages/daemon/src/config.ts @@ -103,6 +103,10 @@ export const USE_SSL = process.env.USE_SSL === 'true'; // Scheduled balance validation configuration export const BALANCE_VALIDATION_ENABLED = process.env.BALANCE_VALIDATION_ENABLED === 'true'; export const BALANCE_VALIDATION_INTERVAL_MS = parseInt(process.env.BALANCE_VALIDATION_INTERVAL_MS ?? '600000', 10); // 10 minutes +// Max mismatch rows surfaced per tick. LIMIT bounds the alert payload size; +// it does NOT reduce query execution cost (the LEFT JOIN + GROUP BY still +// scans address_balance + address_tx_history per tick). +export const BALANCE_VALIDATION_SAMPLE_LIMIT = parseInt(process.env.BALANCE_VALIDATION_SAMPLE_LIMIT ?? '100', 10); // Reorg size thresholds for different alert levels export const REORG_SIZE_INFO = parseInt(process.env.REORG_SIZE_INFO ?? '1', 10); @@ -147,6 +151,7 @@ export default () => ({ RECONNECTION_STORM_WINDOW_MS, BALANCE_VALIDATION_ENABLED, BALANCE_VALIDATION_INTERVAL_MS, + BALANCE_VALIDATION_SAMPLE_LIMIT, REORG_SIZE_INFO, REORG_SIZE_MINOR, REORG_SIZE_MAJOR, From 9df383bc33f6385d286803c7db8807a74999e902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Abadesso?= Date: Wed, 22 Apr 2026 21:33:16 -0300 Subject: [PATCH 3/5] perf(daemon): scope balance validation query by updated_at MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Benchmarked the full-table query on production data (≈1.5M address_balance rows, ≈8.3M address_tx_history rows) and confirmed what @luislhl suspected: LIMIT bounds the result payload, not the execution cost. A full-table tick takes tens of seconds because the LEFT JOIN + GROUP BY touches every row. Scoping the outer set by `ab.updated_at > NOW() - INTERVAL :window SECOND` uses the existing `address_balance_updated_at_idx` to restrict the tick to recently changed rows. updated_at is `ON UPDATE CURRENT_TIMESTAMP` at the schema level, so every row write bumps it — correctness of the scope is structural, not app-managed. Added `BALANCE_VALIDATION_WINDOW_MS` config (default 15 min) with 50% slack over the default 10 min tick so a late tick can't miss a row. Trade-off this does NOT solve — hot addresses: Scoping limits WHICH addresses we check per tick; it does NOT limit HOW MUCH history we sum per address. address_tx_history has no covering index that includes `balance`, so MySQL fetches every non-voided history row for each recently-changed address via a PK scan on `address`. For whale addresses with hundreds of thousands of history rows, a single tick where that address shows up pays a multi-second per-address cost. EXPLAIN ANALYZE confirmed: optimizer estimated 5 rows per address, reality was ~800K rows per address (the long tail). Mitigation is a covering index (address, voided, token_id, balance) on address_tx_history — tracked as follow-up #404 (retargeted from cursor-batching to covering-index). Until that index ships, keep BALANCE_VALIDATION_ENABLED=false in prod. Long-tail drift on cold rows (never updated since the bug introduced it) is out of scope for the scheduled tick — a separate full-table sweep at a lower cadence is the right mechanism for that. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../__tests__/actors/MonitoringActor.test.ts | 4 ++ packages/daemon/src/actors/MonitoringActor.ts | 52 +++++++++++++------ packages/daemon/src/config.ts | 8 ++- 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/packages/daemon/__tests__/actors/MonitoringActor.test.ts b/packages/daemon/__tests__/actors/MonitoringActor.test.ts index 539e9b12..83bf1a42 100644 --- a/packages/daemon/__tests__/actors/MonitoringActor.test.ts +++ b/packages/daemon/__tests__/actors/MonitoringActor.test.ts @@ -56,6 +56,7 @@ describe('MonitoringActor', () => { config['RECONNECTION_STORM_WINDOW_MS'] = 5 * 60 * 1000; // 5 min config['BALANCE_VALIDATION_ENABLED'] = false; config['BALANCE_VALIDATION_INTERVAL_MS'] = 5000; + config['BALANCE_VALIDATION_WINDOW_MS'] = 900000; config['BALANCE_VALIDATION_SAMPLE_LIMIT'] = 100; mockCallback = jest.fn(); @@ -404,6 +405,9 @@ describe('MonitoringActor', () => { await flushPromises(); expect(mockMysql.query).toHaveBeenCalledWith(expect.stringContaining('LEFT JOIN')); + // Scope-by-updated_at is load-bearing for perf (see follow-up #404); + // pin it so a future refactor doesn't silently drop the filter. + expect(mockMysql.query).toHaveBeenCalledWith(expect.stringContaining('ab.updated_at > NOW() - INTERVAL')); expect(mockAddAlert).toHaveBeenCalledWith( 'Balance validation found mismatches', expect.stringContaining('1 balance mismatch'), diff --git a/packages/daemon/src/actors/MonitoringActor.ts b/packages/daemon/src/actors/MonitoringActor.ts index a5405e5e..b1ba9e92 100644 --- a/packages/daemon/src/actors/MonitoringActor.ts +++ b/packages/daemon/src/actors/MonitoringActor.ts @@ -159,32 +159,51 @@ export default (callback: any, receive: any, config = getConfig()) => { // ── Scheduled balance validation ────────────────────────────────────────────── // - // One SQL query per tick. The DB does the pairing (LEFT JOIN), the math - // (native BIGINT, no precision loss), and the consistency snapshot (single - // statement = one read view). The query is bounded by LIMIT so a catastrophic - // mismatch produces a sample, not a megabyte of payload. + // Each tick runs a single SQL query that compares address_balance against the + // sum of non-voided address_tx_history for rows whose `updated_at` falls + // within the configured lookback window. The DB does the pairing (LEFT JOIN), + // the math (native BIGINT, no precision loss), and the consistency snapshot + // (single statement = one read view). // - // Cost caveat: LIMIT bounds *result rows*, not execution cost. The LEFT JOIN - // + GROUP BY still scans every `address_balance` row and joins against - // `address_tx_history` on each tick. Operators should EXPLAIN this query - // against production-sized data before enabling — see the follow-up issue - // tracked for cursor-based batching if the scan cost is prohibitive. + // Why `updated_at > NOW() - INTERVAL :window SECOND`: + // A full-table pass was benchmarked on production data (≈1.5M + // address_balance rows, ≈8.3M address_tx_history rows) and took tens of + // seconds per tick. The `updated_at` index scopes the outer set to recently + // changed rows, which is what a scheduled monitor actually needs — drift + // introduced by a bad write will be caught within one tick of the offending + // change. updated_at is `ON UPDATE CURRENT_TIMESTAMP` in the schema, so any + // write to the row bumps it; correctness of the scope is structural. + // + // Trade-off — hot addresses are still expensive: + // Scoping limits WHICH addresses we check per tick; it does NOT limit HOW + // MUCH history we sum per address. address_tx_history has no covering + // index that includes `balance`, so MySQL fetches every non-voided history + // row for each recently-changed address via a PK scan on `address`. For + // whale addresses with hundreds of thousands of history rows this is a + // multi-second per-address cost even when only a handful of addresses + // updated in the window. + // + // Mitigation (separate migration): add a covering index + // `(address, voided, token_id, balance)` to address_tx_history so the + // aggregate becomes index-only. Until that ships, keep + // BALANCE_VALIDATION_ENABLED=false in prod. + // + // Long tail — slow drift on cold rows (balance changed long ago and the + // row never touched since) goes undetected by this scheduled validator. + // A separate once-a-week/month full-table sweep is the right mechanism + // for that; out of scope for the scheduled job. // // The `transactions > 0` filter is intentionally omitted: a row with // `transactions = 0` AND non-zero balance is itself a bug (the void cleanup // should have deleted it), and we want the validator to surface that. - // Genuinely-empty rows (transactions=0, balance=0) match `historySum=0` - // via the COALESCE and are filtered out by HAVING. + // Genuinely-empty rows match `historySum=0` via COALESCE and HAVING drops + // them. // // If a run exceeds the interval the in-flight guard skips the next tick // rather than overlapping. DISCONNECTED clears the timer; an in-flight // SELECT runs to completion and releases its connection — harmless. - // - // Sample limit is configurable via BALANCE_VALIDATION_SAMPLE_LIMIT (default - // 100). Any non-zero count means operators will dive into the data with - // their own queries anyway; the alert just needs to prove something is - // wrong and give a representative starting point. const sampleLimit = config.BALANCE_VALIDATION_SAMPLE_LIMIT; + const windowSeconds = Math.floor(config.BALANCE_VALIDATION_WINDOW_MS / 1000); const BALANCE_VALIDATION_SQL = ` SELECT ab.address, @@ -196,6 +215,7 @@ export default (callback: any, receive: any, config = getConfig()) => { ON h.address = ab.address AND h.token_id = ab.token_id AND h.voided = FALSE + WHERE ab.updated_at > NOW() - INTERVAL ${windowSeconds} SECOND GROUP BY ab.address, ab.token_id HAVING balanceSum != historySum LIMIT ${sampleLimit} diff --git a/packages/daemon/src/config.ts b/packages/daemon/src/config.ts index 85a6b6ee..953bf1a7 100644 --- a/packages/daemon/src/config.ts +++ b/packages/daemon/src/config.ts @@ -103,9 +103,12 @@ export const USE_SSL = process.env.USE_SSL === 'true'; // Scheduled balance validation configuration export const BALANCE_VALIDATION_ENABLED = process.env.BALANCE_VALIDATION_ENABLED === 'true'; export const BALANCE_VALIDATION_INTERVAL_MS = parseInt(process.env.BALANCE_VALIDATION_INTERVAL_MS ?? '600000', 10); // 10 minutes +// Lookback window for recently-changed address_balance rows. Should be >= the +// tick interval so no row slips between ticks if one runs late. Default is +// interval + 50% slack. +export const BALANCE_VALIDATION_WINDOW_MS = parseInt(process.env.BALANCE_VALIDATION_WINDOW_MS ?? '900000', 10); // 15 minutes // Max mismatch rows surfaced per tick. LIMIT bounds the alert payload size; -// it does NOT reduce query execution cost (the LEFT JOIN + GROUP BY still -// scans address_balance + address_tx_history per tick). +// does NOT reduce query execution cost. export const BALANCE_VALIDATION_SAMPLE_LIMIT = parseInt(process.env.BALANCE_VALIDATION_SAMPLE_LIMIT ?? '100', 10); // Reorg size thresholds for different alert levels @@ -151,6 +154,7 @@ export default () => ({ RECONNECTION_STORM_WINDOW_MS, BALANCE_VALIDATION_ENABLED, BALANCE_VALIDATION_INTERVAL_MS, + BALANCE_VALIDATION_WINDOW_MS, BALANCE_VALIDATION_SAMPLE_LIMIT, REORG_SIZE_INFO, REORG_SIZE_MINOR, From 5a15e25dff9c07c6426527c50b9b2de4f873f08c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Abadesso?= Date: Wed, 22 Apr 2026 22:05:56 -0300 Subject: [PATCH 4/5] docs(daemon): clarify that balance validator is ad-hoc, not scheduled in prod MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing comment implied that flipping BALANCE_VALIDATION_ENABLED=true was a goal (just pending #404). That's not the plan — the validator is intended for ad-hoc / on-demand use (local, testnet, manual) and the feature flag is designed to stay false in production. Rephrase the comment so the next reader doesn't assume there's a path to "turn this on" once #404 lands. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/daemon/src/actors/MonitoringActor.ts | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/packages/daemon/src/actors/MonitoringActor.ts b/packages/daemon/src/actors/MonitoringActor.ts index b1ba9e92..066f2a3d 100644 --- a/packages/daemon/src/actors/MonitoringActor.ts +++ b/packages/daemon/src/actors/MonitoringActor.ts @@ -183,15 +183,16 @@ export default (callback: any, receive: any, config = getConfig()) => { // multi-second per-address cost even when only a handful of addresses // updated in the window. // - // Mitigation (separate migration): add a covering index - // `(address, voided, token_id, balance)` to address_tx_history so the - // aggregate becomes index-only. Until that ships, keep - // BALANCE_VALIDATION_ENABLED=false in prod. + // Because of this, `BALANCE_VALIDATION_ENABLED` is intended to stay + // `false` in production. The actor + query are here for ad-hoc / on-demand + // runs (local, testnet, or triggered manually), not for a scheduled + // in-production job. See #404 for the covering-index perf improvement that + // makes ad-hoc runs faster; it is not a prerequisite for "enabling" this + // feature, because enabling isn't planned. // // Long tail — slow drift on cold rows (balance changed long ago and the - // row never touched since) goes undetected by this scheduled validator. - // A separate once-a-week/month full-table sweep is the right mechanism - // for that; out of scope for the scheduled job. + // row never touched since) goes undetected by this validator. A separate + // full-table sweep is the right mechanism for that; out of scope. // // The `transactions > 0` filter is intentionally omitted: a row with // `transactions = 0` AND non-zero balance is itself a bug (the void cleanup From f4ed7bed0a3375604ce1cf7721ba008ffc1d4891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Abadesso?= Date: Wed, 22 Apr 2026 22:13:03 -0300 Subject: [PATCH 5/5] fix(daemon): precision-safe validation payload + SAMPLE_LIMIT guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two small correctness items from the PR review: - Wrap the BIGINT sums in CAST(... AS CHAR) so `balanceSum` and `historySum` transport to the client as decimal strings instead of JS Numbers. mysql2 returns BIGINT as Number by default, which loses precision above 2^53. HTR values are well below that, but the alert payload is read by humans and tooling — keeping the decimal string form avoids any silent rounding if the validator is ever run against a non-HTR token. HAVING still compares the same CHAR expressions, which for canonical CAST(SIGNED AS CHAR) output is equivalent to numeric equality. - Add the same "fail loud, stay disabled" guard on BALANCE_VALIDATION_SAMPLE_LIMIT that INTERVAL_MS already has. sampleLimit is interpolated directly into `LIMIT ${sampleLimit}`: NaN produces a SQL syntax error, 0/negative returns no rows and silently claims "no mismatches" without actually checking. The guard logs a loud error and leaves the scheduler disabled rather than pretending everything is fine. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../__tests__/actors/MonitoringActor.test.ts | 14 ++++++++++ packages/daemon/src/actors/MonitoringActor.ts | 26 ++++++++++++++++--- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/packages/daemon/__tests__/actors/MonitoringActor.test.ts b/packages/daemon/__tests__/actors/MonitoringActor.test.ts index 83bf1a42..71a35a2d 100644 --- a/packages/daemon/__tests__/actors/MonitoringActor.test.ts +++ b/packages/daemon/__tests__/actors/MonitoringActor.test.ts @@ -510,5 +510,19 @@ describe('MonitoringActor', () => { expect.stringContaining('is invalid'), ); }); + + it('should refuse to schedule validation when sample limit is invalid', () => { + config['BALANCE_VALIDATION_ENABLED'] = true; + config['BALANCE_VALIDATION_SAMPLE_LIMIT'] = 0; // 0 would silently skip every row + const mockLoggerError = jest.spyOn(logger, 'error'); + + MonitoringActor(mockCallback, mockReceive, config); + sendEvent('CONNECTED'); + + expect(setInterval).toHaveBeenCalledTimes(1); + expect(mockLoggerError).toHaveBeenCalledWith( + expect.stringContaining('BALANCE_VALIDATION_SAMPLE_LIMIT=0 is invalid'), + ); + }); }); }); diff --git a/packages/daemon/src/actors/MonitoringActor.ts b/packages/daemon/src/actors/MonitoringActor.ts index 066f2a3d..ec1df685 100644 --- a/packages/daemon/src/actors/MonitoringActor.ts +++ b/packages/daemon/src/actors/MonitoringActor.ts @@ -205,12 +205,20 @@ export default (callback: any, receive: any, config = getConfig()) => { // SELECT runs to completion and releases its connection — harmless. const sampleLimit = config.BALANCE_VALIDATION_SAMPLE_LIMIT; const windowSeconds = Math.floor(config.BALANCE_VALIDATION_WINDOW_MS / 1000); + // Wrap the SIGNED BIGINT in CAST(... AS CHAR) so values transport to the + // client as strings. mysql2 returns BIGINT as JS Number by default, which + // loses precision above 2^53. HTR max supply is well below that, but we + // log this payload in alerts that humans and tools read — keeping the + // decimal string form avoids any silent rounding if the validator ever + // runs against a non-HTR token. HAVING compares the same CHAR expressions + // via `!=`, which is equivalent to numeric equality for canonical decimal + // strings produced by CAST(SIGNED AS CHAR). const BALANCE_VALIDATION_SQL = ` SELECT ab.address, - ab.token_id AS tokenId, - CAST(ab.unlocked_balance + ab.locked_balance AS SIGNED) AS balanceSum, - CAST(COALESCE(SUM(h.balance), 0) AS SIGNED) AS historySum + ab.token_id AS tokenId, + CAST(CAST(ab.unlocked_balance + ab.locked_balance AS SIGNED) AS CHAR) AS balanceSum, + CAST(CAST(COALESCE(SUM(h.balance), 0) AS SIGNED) AS CHAR) AS historySum FROM \`address_balance\` ab LEFT JOIN \`address_tx_history\` h ON h.address = ab.address @@ -287,6 +295,18 @@ export default (callback: any, receive: any, config = getConfig()) => { return; } + // Same guard for SAMPLE_LIMIT: it's interpolated directly into + // `LIMIT ${sampleLimit}`. NaN would produce a SQL syntax error; 0 or + // negative would return no rows and silently claim "no mismatches" + // without actually checking. Fail loud and stay disabled. + if (!Number.isFinite(sampleLimit) || sampleLimit < 1) { + logger.error( + `[monitoring] BALANCE_VALIDATION_SAMPLE_LIMIT=${sampleLimit} is invalid ` + + '(must be a finite number >= 1). Scheduled balance validation will NOT run this session.', + ); + return; + } + stopBalanceValidation(); logger.info('[monitoring] Starting scheduled balance validation');