Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/chilly-stingrays-decide.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@eth-optimism/replica-healthcheck': patch
---

Fixes an incorrect `WORKDIR` path (`/opts` instead of `/opt`) for the replica-healthcheck target in the packages Dockerfile
5 changes: 5 additions & 0 deletions .changeset/orange-oranges-bow.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@eth-optimism/common-ts': patch
---

Properly exposes metrics via a metrics server at the `/metrics` route (port 7300 by default)
5 changes: 5 additions & 0 deletions .changeset/plenty-dancers-eat.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@eth-optimism/integration-tests': patch
---

Add integration test for healthcheck server
5 changes: 5 additions & 0 deletions .changeset/thirty-hairs-remain.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@eth-optimism/replica-healthcheck': patch
---

Add checks and metrics for connection failures to the target and reference clients (dead networks)
5 changes: 3 additions & 2 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ jobs:
--env L2_GAS_PRICE=onchain \
--env RUN_DEBUG_TRACE_TESTS=false \
--env RUN_REPLICA_TESTS=false \
--env RUN_HEALTHCHECK_TESTS=false \
--env RUN_STRESS_TESTS=false \
--env OVMCONTEXT_SPEC_NUM_TXS=1 \
--env DTL_ENQUEUE_CONFIRMATIONS=12 \
Expand Down Expand Up @@ -252,7 +253,7 @@ workflows:
- develop
jobs:
- build-dtl:
context:
context:
- optimism
- slack
<<: *slack-nightly-build-fail-post-step
Expand Down Expand Up @@ -331,4 +332,4 @@ workflows:
}
]
}
event: always
event: always
3 changes: 1 addition & 2 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
working-directory: ./ops
run: |
./scripts/stats.sh &
docker-compose -f docker-compose.yml up -d
docker-compose -f docker-compose.yml up -d --scale replica-healthcheck=1

- name: Wait for the Sequencer node
working-directory: ./ops
Expand All @@ -64,7 +64,6 @@ jobs:
if: failure()
uses: jwalton/gh-docker-logs@v1
with:
images: 'ethereumoptimism/hardhat,ops_deployer,ops_dtl,ops_l2geth,ethereumoptimism/message-relayer,ops_batch_submitter,ops_replica,ops_integration_tests'
dest: '/home/runner/logs'

- name: Tar logs
Expand Down
3 changes: 2 additions & 1 deletion integration-tests/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ OVMCONTEXT_SPEC_NUM_TXS=1
RUN_WITHDRAWAL_TESTS=false
RUN_DEBUG_TRACE_TESTS=false
RUN_REPLICA_TESTS=false
RUN_HEALTHCHECK_TESTS=false
RUN_STRESS_TESTS=false
# Can be configured up or down as necessary
MOCHA_TIMEOUT=300000
# Set to true to make Mocha stop after the first failed test.
MOCHA_BAIL=false
MOCHA_BAIL=false
1 change: 1 addition & 0 deletions integration-tests/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
"hardhat-gas-reporter": "^1.0.4",
"lint-staged": "11.0.0",
"mocha": "^8.4.0",
"node-fetch": "^2.6.7",
"prom-client": "^14.0.1",
"rimraf": "^3.0.2",
"typescript": "^4.3.5",
Expand Down
18 changes: 18 additions & 0 deletions integration-tests/test/healthcheck.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import fetch from 'node-fetch'

import { expect } from './shared/setup'
import { envConfig } from './shared/utils'

/**
 * Integration tests for the healthcheck metrics endpoint.
 *
 * The suite is gated behind the RUN_HEALTHCHECK_TESTS environment flag and
 * queries whatever URL is configured as HEALTHCHECK_URL.
 */
describe('Healthcheck Tests', () => {
  // Uses a regular function (not an arrow) so that `this` is the Mocha
  // context, which is what provides `skip()`.
  before(async function () {
    const healthcheckTestsEnabled = envConfig.RUN_HEALTHCHECK_TESTS
    if (healthcheckTestsEnabled) {
      return
    }

    // Flag is off: mark the suite as skipped instead of failing it.
    this.skip()
  })

  // Smoke test only: a 200 from the configured URL means the metrics server
  // is up and answering requests.
  it('should have metrics exposed', async () => {
    const { status } = await fetch(envConfig.HEALTHCHECK_URL)
    expect(status).to.equal(200)
  })
})
5 changes: 5 additions & 0 deletions integration-tests/test/shared/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ const procEnv = cleanEnv(process.env, {

VERIFIER_URL: str({ default: 'http://localhost:8547' }),

HEALTHCHECK_URL: str({ default: 'http://localhost:7300/metrics' }),

PRIVATE_KEY: str({
default:
'0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80',
Expand All @@ -78,6 +80,9 @@ const procEnv = cleanEnv(process.env, {
RUN_REPLICA_TESTS: bool({
default: true,
}),
RUN_HEALTHCHECK_TESTS: bool({
default: true,
}),
RUN_DEBUG_TRACE_TESTS: bool({
default: true,
}),
Expand Down
18 changes: 18 additions & 0 deletions ops/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,23 @@ services:
- ${REPLICA_HTTP_PORT:-8549}:8545
- ${REPLICA_WS_PORT:-8550}:8546

replica-healthcheck:
depends_on:
- l2geth
- replica
deploy:
replicas: 0
build:
context: ..
dockerfile: ./ops/docker/Dockerfile.packages
target: replica-healthcheck
image: ethereumoptimism/replica-healthcheck:${DOCKER_TAG_REPLICA_HEALTHCHECK:-latest}
environment:
HEALTHCHECK__REFERENCE_RPC_PROVIDER: http://l2geth:8545
HEALTHCHECK__TARGET_RPC_PROVIDER: http://replica:8545
ports:
- ${HEALTHCHECK_HTTP_PORT:-7300}:7300

integration_tests:
deploy:
replicas: 0
Expand All @@ -209,6 +226,7 @@ services:
environment:
L1_URL: http://l1_chain:8545
L2_URL: http://l2geth:8545
HEALTHCHECK_URL: http://replica-healthcheck:7300/metrics
REPLICA_URL: http://replica:8545
VERIFIER_URL: http://verifier:8545
URL: http://deployer:8081/addresses.json
Expand Down
2 changes: 1 addition & 1 deletion ops/docker/Dockerfile.packages
Original file line number Diff line number Diff line change
Expand Up @@ -61,5 +61,5 @@ CMD ["npm", "run", "start"]


FROM base as replica-healthcheck
WORKDIR /opts/optimism/packages/replica-healthcheck
WORKDIR /opt/optimism/packages/replica-healthcheck
ENTRYPOINT ["npm", "run", "start"]
75 changes: 72 additions & 3 deletions packages/common-ts/src/base-service/base-service-v2.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
/* Imports: External */
import { Server } from 'net'

import Config from 'bcfg'
import * as dotenv from 'dotenv'
import { Command, Option } from 'commander'
import { ValidatorSpec, Spec, cleanEnv } from 'envalid'
import { sleep } from '@eth-optimism/core-utils'
import snakeCase from 'lodash/snakeCase'
import express from 'express'
import prometheus, { Registry } from 'prom-client'

/* Imports: Internal */
import { Logger } from '../common/logger'
import { Metric } from './metrics'

Expand Down Expand Up @@ -82,6 +84,26 @@ export abstract class BaseServiceV2<
*/
protected readonly metrics: TMetrics

/**
* Registry for prometheus metrics.
*/
protected readonly metricsRegistry: Registry

/**
* Metrics server.
*/
protected metricsServer: Server

/**
* Port for the metrics server.
*/
protected readonly metricsServerPort: number

/**
* Hostname for the metrics server.
*/
protected readonly metricsServerHostname: string

/**
* @param params Options for the construction of the service.
* @param params.name Name for the service. This name will determine the prefix used for logging,
Expand All @@ -93,6 +115,8 @@ export abstract class BaseServiceV2<
* @param params.options Options to pass to the service.
* @param params.loops Whether or not the service should loop. Defaults to true.
* @param params.loopIntervalMs Loop interval in milliseconds. Defaults to zero.
* @param params.metricsServerPort Port for the metrics server. Defaults to 7300.
* @param params.metricsServerHostname Hostname for the metrics server. Defaults to 0.0.0.0.
*/
constructor(params: {
name: string
Expand All @@ -101,6 +125,8 @@ export abstract class BaseServiceV2<
options?: Partial<TOptions>
loop?: boolean
loopIntervalMs?: number
metricsServerPort?: number
metricsServerHostname?: string
}) {
this.loop = params.loop !== undefined ? params.loop : true
this.loopIntervalMs =
Expand Down Expand Up @@ -203,6 +229,11 @@ export abstract class BaseServiceV2<
return acc
}, {}) as TMetrics

// Create the metrics server.
this.metricsRegistry = prometheus.register
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoops. I forgot to expose the metrics at all.

this.metricsServerPort = params.metricsServerPort || 7300
this.metricsServerHostname = params.metricsServerHostname || '0.0.0.0'

this.logger = new Logger({ name: params.name })

// Gracefully handle stop signals.
Expand All @@ -222,6 +253,33 @@ export abstract class BaseServiceV2<
public async run(): Promise<void> {
this.done = false

// Start the metrics server if not yet running.
if (!this.metricsServer) {
this.logger.info('starting metrics server')

await new Promise((resolve) => {
const app = express()

app.get('/metrics', async (_, res) => {
res.status(200).send(await this.metricsRegistry.metrics())
})

this.metricsServer = app.listen(
this.metricsServerPort,
this.metricsServerHostname,
() => {
resolve(null)
}
)
})

this.logger.info(`metrics started`, {
port: this.metricsServerPort,
hostname: this.metricsServerHostname,
route: '/metrics',
})
}

if (this.init) {
this.logger.info('initializing service')
await this.init()
Expand Down Expand Up @@ -267,7 +325,18 @@ export abstract class BaseServiceV2<
while (!this.done) {
await sleep(1000)
}
this.logger.info('main loop finished, goodbye!')

// Shut down the metrics server if it's running.
if (this.metricsServer) {
this.logger.info('stopping metrics server')
await new Promise((resolve) => {
this.metricsServer.close(() => {
resolve(null)
})
})
this.logger.info('metrics server stopped')
this.metricsServer = undefined
}
}

/**
Expand Down
52 changes: 46 additions & 6 deletions packages/replica-healthcheck/src/service.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import { Provider } from '@ethersproject/abstract-provider'
import { BaseServiceV2, Gauge, validators } from '@eth-optimism/common-ts'
import { Provider, Block } from '@ethersproject/abstract-provider'
import {
BaseServiceV2,
Counter,
Gauge,
validators,
} from '@eth-optimism/common-ts'
import { sleep } from '@eth-optimism/core-utils'

type HealthcheckOptions = {
Expand All @@ -13,6 +18,8 @@ type HealthcheckMetrics = {
isCurrentlyDiverged: Gauge
referenceHeight: Gauge
targetHeight: Gauge
targetConnectionFailures: Counter
referenceConnectionFailures: Counter
}

type HealthcheckState = {}
Expand Down Expand Up @@ -59,15 +66,48 @@ export class HealthcheckService extends BaseServiceV2<
type: Gauge,
desc: 'Block height of the target client',
},
targetConnectionFailures: {
type: Counter,
desc: 'Number of connection failures to the target client',
},
referenceConnectionFailures: {
type: Counter,
desc: 'Number of connection failures to the reference client',
},
},
})
}

async main() {
const targetLatest = await this.options.targetRpcProvider.getBlock('latest')
const referenceLatest = await this.options.referenceRpcProvider.getBlock(
'latest'
)
// Get the latest block from the target client and check for connection failures.
let targetLatest: Block
try {
targetLatest = await this.options.targetRpcProvider.getBlock('latest')
} catch (err) {
if (err.message.includes('could not detect network')) {
this.logger.error('target client not connected')
this.metrics.targetConnectionFailures.inc()
return
} else {
throw err
}
}

// Get the latest block from the reference client and check for connection failures.
let referenceLatest: Block
try {
referenceLatest = await this.options.referenceRpcProvider.getBlock(
'latest'
)
} catch (err) {
if (err.message.includes('could not detect network')) {
this.logger.error('reference client not connected')
this.metrics.referenceConnectionFailures.inc()
return
} else {
throw err
}
}

// Update these metrics first so they'll refresh no matter what.
this.metrics.targetHeight.set(targetLatest.number)
Expand Down
2 changes: 1 addition & 1 deletion yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -11520,7 +11520,7 @@ node-fetch@2.6.1:
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052"
integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==

node-fetch@^2.6.0, node-fetch@^2.6.1:
node-fetch@^2.6.0, node-fetch@^2.6.1, node-fetch@^2.6.7:
version "2.6.7"
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.7.tgz#24de9fba827e3b4ae44dc8b20256a379160052ad"
integrity sha512-ZjMPFEfVx5j+y2yF35Kzx5sF7kDzxuDj6ziH4FFbOp87zKDZNx8yExJIb05OGF4Nlt9IHFIMBkRl41VdvcNdbQ==
Expand Down