diff --git a/.changeset/chilly-stingrays-decide.md b/.changeset/chilly-stingrays-decide.md new file mode 100644 index 0000000000000..1c236eedd589d --- /dev/null +++ b/.changeset/chilly-stingrays-decide.md @@ -0,0 +1,5 @@ +--- +'@eth-optimism/replica-healthcheck': patch +--- + +Fixes a bug in the replica-healthcheck docker file diff --git a/.changeset/orange-oranges-bow.md b/.changeset/orange-oranges-bow.md new file mode 100644 index 0000000000000..42fc1e11dcb1c --- /dev/null +++ b/.changeset/orange-oranges-bow.md @@ -0,0 +1,5 @@ +--- +'@eth-optimism/common-ts': patch +--- + +Properly exposes metrics as part of a metrics server at port 7300 diff --git a/.changeset/plenty-dancers-eat.md b/.changeset/plenty-dancers-eat.md new file mode 100644 index 0000000000000..c3b8b0af1bf3a --- /dev/null +++ b/.changeset/plenty-dancers-eat.md @@ -0,0 +1,5 @@ +--- +'@eth-optimism/integration-tests': patch +--- + +Add integration test for healthcheck server diff --git a/.changeset/thirty-hairs-remain.md b/.changeset/thirty-hairs-remain.md new file mode 100644 index 0000000000000..93b187fdb3bc6 --- /dev/null +++ b/.changeset/thirty-hairs-remain.md @@ -0,0 +1,5 @@ +--- +'@eth-optimism/replica-healthcheck': patch +--- + +Add checks and metrics for dead networks diff --git a/.circleci/config.yml b/.circleci/config.yml index 7f91c78040f97..5677bc05d147b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -154,6 +154,7 @@ jobs: --env L2_GAS_PRICE=onchain \ --env RUN_DEBUG_TRACE_TESTS=false \ --env RUN_REPLICA_TESTS=false \ + --env RUN_HEALTHCHECK_TESTS=false \ --env RUN_STRESS_TESTS=false \ --env OVMCONTEXT_SPEC_NUM_TXS=1 \ --env DTL_ENQUEUE_CONFIRMATIONS=12 \ @@ -252,7 +253,7 @@ workflows: - develop jobs: - build-dtl: - context: + context: - optimism - slack <<: *slack-nightly-build-fail-post-step @@ -331,4 +332,4 @@ workflows: } ] } - event: always \ No newline at end of file + event: always diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index d72fe09ada9f3..5a6b4b8b6ec59 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -50,7 +50,7 @@ jobs: working-directory: ./ops run: | ./scripts/stats.sh & - docker-compose -f docker-compose.yml up -d + docker-compose -f docker-compose.yml up -d --scale replica-healthcheck=1 - name: Wait for the Sequencer node working-directory: ./ops @@ -64,7 +64,6 @@ jobs: if: failure() uses: jwalton/gh-docker-logs@v1 with: - images: 'ethereumoptimism/hardhat,ops_deployer,ops_dtl,ops_l2geth,ethereumoptimism/message-relayer,ops_batch_submitter,ops_replica,ops_integration_tests' dest: '/home/runner/logs' - name: Tar logs diff --git a/integration-tests/.env.example b/integration-tests/.env.example index 335710ab12b1c..e643c71d30814 100644 --- a/integration-tests/.env.example +++ b/integration-tests/.env.example @@ -10,8 +10,9 @@ OVMCONTEXT_SPEC_NUM_TXS=1 RUN_WITHDRAWAL_TESTS=false RUN_DEBUG_TRACE_TESTS=false RUN_REPLICA_TESTS=false +RUN_HEALTHCHECK_TESTS=false RUN_STRESS_TESTS=false # Can be configured up or down as necessary MOCHA_TIMEOUT=300000 # Set to true to make Mocha stop after the first failed test. -MOCHA_BAIL=false \ No newline at end of file +MOCHA_BAIL=false diff --git a/integration-tests/package.json b/integration-tests/package.json index aff8c1e8ea89c..9dcd0a103338d 100644 --- a/integration-tests/package.json +++ b/integration-tests/package.json @@ -67,6 +67,7 @@ "hardhat-gas-reporter": "^1.0.4", "lint-staged": "11.0.0", "mocha": "^8.4.0", + "node-fetch": "^2.6.7", "prom-client": "^14.0.1", "rimraf": "^3.0.2", "typescript": "^4.3.5", diff --git a/integration-tests/test/healthcheck.spec.ts b/integration-tests/test/healthcheck.spec.ts new file mode 100644 index 0000000000000..aa268c79883b4 --- /dev/null +++ b/integration-tests/test/healthcheck.spec.ts @@ -0,0 +1,18 @@ +import fetch from 'node-fetch' + +import { expect } from './shared/setup' +import { envConfig } from './shared/utils' + +describe('Healthcheck Tests', () => { + before(async function () { + if (!envConfig.RUN_HEALTHCHECK_TESTS) { + this.skip() + } + }) + + // Super simple test, is the metric server up? + it('should have metrics exposed', async () => { + const response = await fetch(envConfig.HEALTHCHECK_URL) + expect(response.status).to.equal(200) + }) +}) diff --git a/integration-tests/test/shared/utils.ts b/integration-tests/test/shared/utils.ts index 31e505b54fdbe..c7fe77acf7fda 100644 --- a/integration-tests/test/shared/utils.ts +++ b/integration-tests/test/shared/utils.ts @@ -56,6 +56,8 @@ const procEnv = cleanEnv(process.env, { VERIFIER_URL: str({ default: 'http://localhost:8547' }), + HEALTHCHECK_URL: str({ default: 'http://localhost:7300/metrics' }), + PRIVATE_KEY: str({ default: '0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80', @@ -78,6 +80,9 @@ const procEnv = cleanEnv(process.env, { RUN_REPLICA_TESTS: bool({ default: true, }), + RUN_HEALTHCHECK_TESTS: bool({ + default: true, + }), RUN_DEBUG_TRACE_TESTS: bool({ default: true, }), diff --git a/ops/docker-compose.yml b/ops/docker-compose.yml index d31f95b59943e..4c26cc68b75c0 100644 --- a/ops/docker-compose.yml +++ b/ops/docker-compose.yml @@ -197,6 +197,23 @@ services: - ${REPLICA_HTTP_PORT:-8549}:8545 - ${REPLICA_WS_PORT:-8550}:8546 + replica-healthcheck: + depends_on: + - l2geth + - replica + deploy: + replicas: 0 + build: + context: .. + dockerfile: ./ops/docker/Dockerfile.packages + target: replica-healthcheck + image: ethereumoptimism/replica-healthcheck:${DOCKER_TAG_REPLICA_HEALTHCHECK:-latest} + environment: + HEALTHCHECK__REFERENCE_RPC_PROVIDER: http://l2geth:8545 + HEALTHCHECK__TARGET_RPC_PROVIDER: http://replica:8545 + ports: + - ${HEALTHCHECK_HTTP_PORT:-7300}:7300 + integration_tests: deploy: replicas: 0 @@ -209,6 +226,7 @@ services: environment: L1_URL: http://l1_chain:8545 L2_URL: http://l2geth:8545 + HEALTHCHECK_URL: http://replica-healthcheck:7300/metrics REPLICA_URL: http://replica:8545 VERIFIER_URL: http://verifier:8545 URL: http://deployer:8081/addresses.json diff --git a/ops/docker/Dockerfile.packages b/ops/docker/Dockerfile.packages index 60e7e15f4e5e8..509ff8a10085c 100644 --- a/ops/docker/Dockerfile.packages +++ b/ops/docker/Dockerfile.packages @@ -61,5 +61,5 @@ CMD ["npm", "run", "start"] FROM base as replica-healthcheck -WORKDIR /opts/optimism/packages/replica-healthcheck +WORKDIR /opt/optimism/packages/replica-healthcheck ENTRYPOINT ["npm", "run", "start"] diff --git a/packages/common-ts/src/base-service/base-service-v2.ts b/packages/common-ts/src/base-service/base-service-v2.ts index 6ec723a24494d..17b7b1cad7288 100644 --- a/packages/common-ts/src/base-service/base-service-v2.ts +++ b/packages/common-ts/src/base-service/base-service-v2.ts @@ -1,12 +1,14 @@ -/* Imports: External */ +import { Server } from 'net' + import Config from 'bcfg' import * as dotenv from 'dotenv' import { Command, Option } from 'commander' import { ValidatorSpec, Spec, cleanEnv } from 'envalid' import { sleep } from '@eth-optimism/core-utils' import snakeCase from 'lodash/snakeCase' +import express from 'express' +import prometheus, { Registry } from 'prom-client' -/* Imports: Internal */ import { Logger } from '../common/logger' import { Metric } from './metrics' @@ -82,6 +84,26 @@ export abstract class BaseServiceV2< */ protected readonly metrics: TMetrics + /** + * Registry for prometheus metrics. + */ + protected readonly metricsRegistry: Registry + + /** + * Metrics server. + */ + protected metricsServer: Server + + /** + * Port for the metrics server. + */ + protected readonly metricsServerPort: number + + /** + * Hostname for the metrics server. + */ + protected readonly metricsServerHostname: string + /** * @param params Options for the construction of the service. * @param params.name Name for the service. This name will determine the prefix used for logging, @@ -93,6 +115,8 @@ export abstract class BaseServiceV2< * @param params.options Options to pass to the service. * @param params.loops Whether or not the service should loop. Defaults to true. * @param params.loopIntervalMs Loop interval in milliseconds. Defaults to zero. + * @param params.metricsServerPort Port for the metrics server. Defaults to 7300. + * @param params.metricsServerHostname Hostname for the metrics server. Defaults to 0.0.0.0. */ constructor(params: { name: string @@ -101,6 +125,8 @@ export abstract class BaseServiceV2< options?: Partial loop?: boolean loopIntervalMs?: number + metricsServerPort?: number + metricsServerHostname?: string }) { this.loop = params.loop !== undefined ? params.loop : true this.loopIntervalMs = @@ -203,6 +229,11 @@ export abstract class BaseServiceV2< return acc }, {}) as TMetrics + // Create the metrics server. + this.metricsRegistry = prometheus.register + this.metricsServerPort = params.metricsServerPort || 7300 + this.metricsServerHostname = params.metricsServerHostname || '0.0.0.0' + this.logger = new Logger({ name: params.name }) // Gracefully handle stop signals. @@ -222,6 +253,33 @@ export abstract class BaseServiceV2< public async run(): Promise { this.done = false + // Start the metrics server if not yet running. + if (!this.metricsServer) { + this.logger.info('starting metrics server') + + await new Promise((resolve) => { + const app = express() + + app.get('/metrics', async (_, res) => { + res.status(200).send(await this.metricsRegistry.metrics()) + }) + + this.metricsServer = app.listen( + this.metricsServerPort, + this.metricsServerHostname, + () => { + resolve(null) + } + ) + }) + + this.logger.info(`metrics started`, { + port: this.metricsServerPort, + hostname: this.metricsServerHostname, + route: '/metrics', + }) + } + if (this.init) { this.logger.info('initializing service') await this.init() @@ -267,7 +325,18 @@ export abstract class BaseServiceV2< while (!this.done) { await sleep(1000) } - this.logger.info('main loop finished, goodbye!') + + // Shut down the metrics server if it's running. + if (this.metricsServer) { + this.logger.info('stopping metrics server') + await new Promise((resolve) => { + this.metricsServer.close(() => { + resolve(null) + }) + }) + this.logger.info('metrics server stopped') + this.metricsServer = undefined + } } /** diff --git a/packages/replica-healthcheck/src/service.ts b/packages/replica-healthcheck/src/service.ts index aaf8e05387b6e..98885a40231a4 100644 --- a/packages/replica-healthcheck/src/service.ts +++ b/packages/replica-healthcheck/src/service.ts @@ -1,5 +1,10 @@ -import { Provider } from '@ethersproject/abstract-provider' -import { BaseServiceV2, Gauge, validators } from '@eth-optimism/common-ts' +import { Provider, Block } from '@ethersproject/abstract-provider' +import { + BaseServiceV2, + Counter, + Gauge, + validators, +} from '@eth-optimism/common-ts' import { sleep } from '@eth-optimism/core-utils' type HealthcheckOptions = { @@ -13,6 +18,8 @@ type HealthcheckMetrics = { isCurrentlyDiverged: Gauge referenceHeight: Gauge targetHeight: Gauge + targetConnectionFailures: Counter + referenceConnectionFailures: Counter } type HealthcheckState = {} @@ -59,15 +66,48 @@ export class HealthcheckService extends BaseServiceV2< type: Gauge, desc: 'Block height of the target client', }, + targetConnectionFailures: { + type: Counter, + desc: 'Number of connection failures to the target client', + }, + referenceConnectionFailures: { + type: Counter, + desc: 'Number of connection failures to the reference client', + }, }, }) } async main() { - const targetLatest = await this.options.targetRpcProvider.getBlock('latest') - const referenceLatest = await this.options.referenceRpcProvider.getBlock( - 'latest' - ) + // Get the latest block from the target client and check for connection failures. + let targetLatest: Block + try { + targetLatest = await this.options.targetRpcProvider.getBlock('latest') + } catch (err) { + if (err.message.includes('could not detect network')) { + this.logger.error('target client not connected') + this.metrics.targetConnectionFailures.inc() + return + } else { + throw err + } + } + + // Get the latest block from the reference client and check for connection failures. + let referenceLatest: Block + try { + referenceLatest = await this.options.referenceRpcProvider.getBlock( + 'latest' + ) + } catch (err) { + if (err.message.includes('could not detect network')) { + this.logger.error('reference client not connected') + this.metrics.referenceConnectionFailures.inc() + return + } else { + throw err + } + } // Update these metrics first so they'll refresh no matter what. this.metrics.targetHeight.set(targetLatest.number) diff --git a/yarn.lock b/yarn.lock index d181bb2d06c74..30e0a9ca02441 100644 --- a/yarn.lock +++ b/yarn.lock @@ -11520,7 +11520,7 @@ node-fetch@2.6.1: resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052" integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw== -node-fetch@^2.6.0, node-fetch@^2.6.1: +node-fetch@^2.6.0, node-fetch@^2.6.1, node-fetch@^2.6.7: version "2.6.7" resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.7.tgz#24de9fba827e3b4ae44dc8b20256a379160052ad" integrity sha512-ZjMPFEfVx5j+y2yF35Kzx5sF7kDzxuDj6ziH4FFbOp87zKDZNx8yExJIb05OGF4Nlt9IHFIMBkRl41VdvcNdbQ==