Add a prover-broker-kill chaos scenario and a broker-crash recovery test.

What this patch does:
  - adds a PodChaos template and values file (`proverBrokerKill`) that pod-kills pods labelled
    `app: prover-broker` in the target namespace;
  - adds the `applyProverBrokerKill` helper, which installs that scenario through `installChaosMeshChart`;
  - adds the test 'should recover after a broker crash', and switches the existing recovery test to the
    `aztec_proving_queue_enqueued_jobs_count` alert expression;
  - renames the proving-queue metric names to `*_count` (PROVING_QUEUE_TOTAL_JOBS now maps to
    `enqueued_jobs_count`);
  - removes the `test-kind-chaos-prover` target from spartan/bootstrap.sh.

Review amendments relative to the original patch:
  - the first wait in the broker-crash test only treats `AlertTriggeredError` as "proving started";
    previously any error thrown by `runAlertCheck` caused the broker to be killed immediately;
  - `applyProverBrokerKill` is documented.
The `index` lines below are carried over from the original patch and do not reflect the amended content.

diff --git a/spartan/aztec-chaos-scenarios/templates/prover-broker-kill.yaml b/spartan/aztec-chaos-scenarios/templates/prover-broker-kill.yaml
new file mode 100644
index 000000000000..5f46d51ae6cd
--- /dev/null
+++ b/spartan/aztec-chaos-scenarios/templates/prover-broker-kill.yaml
@@ -0,0 +1,21 @@
+{{- if .Values.proverBrokerKill.enabled }}
+---
+apiVersion: chaos-mesh.org/v1alpha1
+kind: PodChaos
+metadata:
+  name: {{ .Values.global.targetNamespace }}-prover-broker-kill
+  namespace: {{ .Values.global.chaosMeshNamespace }}
+  labels:
+    {{- include "aztec-chaos-scenarios.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/resource-policy": keep
+spec:
+  action: pod-kill
+  mode: fixed-percent
+  value: {{ .Values.proverBrokerKill.percent | quote }}
+  selector:
+    namespaces:
+      - {{ .Values.global.targetNamespace }}
+    labelSelectors:
+      app: prover-broker
+{{- end }}
diff --git a/spartan/aztec-chaos-scenarios/values.yaml b/spartan/aztec-chaos-scenarios/values.yaml
index afe4344d8600..815df03893f2 100644
--- a/spartan/aztec-chaos-scenarios/values.yaml
+++ b/spartan/aztec-chaos-scenarios/values.yaml
@@ -73,6 +73,10 @@ proverKill:
   enabled: false
   percent: 100
 
+proverBrokerKill:
+  enabled: false
+  percent: 100
+
 validatorKill:
   enabled: false
   percent: 30
diff --git a/spartan/aztec-chaos-scenarios/values/prover-broker-kill.yaml b/spartan/aztec-chaos-scenarios/values/prover-broker-kill.yaml
new file mode 100644
index 000000000000..91733d4025a9
--- /dev/null
+++ b/spartan/aztec-chaos-scenarios/values/prover-broker-kill.yaml
@@ -0,0 +1,6 @@
+global:
+  namespace: "smoke"
+
+proverBrokerKill:
+  enabled: true
+  percent: 100
diff --git a/spartan/bootstrap.sh b/spartan/bootstrap.sh
index c6b7851aabab..1d29700c381b 100755
--- a/spartan/bootstrap.sh
+++ b/spartan/bootstrap.sh
@@ -122,10 +122,6 @@ case "$cmd" in
   "test-kind-transfer-blob-with-sink")
     OVERRIDES="blobSink.enabled=true" ./bootstrap.sh test-kind-transfer
     ;;
-  "test-kind-chaos-prover")
-    chaos-mesh/install.sh
-    OVERRIDES="proverAgent.testDelayMs=1000" NAMESPACE=chaos-prover FRESH_INSTALL=${FRESH_INSTALL:-true} INSTALL_METRICS=true ./scripts/test_kind.sh src/spartan/prover-node.test.ts ci.yaml
-    ;;
   "test-local")
     # Isolate network stack in docker.
     docker_isolate ../scripts/run_native_testnet.sh -i -val 3
diff --git a/yarn-project/end-to-end/src/spartan/prover-node.test.ts b/yarn-project/end-to-end/src/spartan/prover-node.test.ts
index e39921dda513..1cd1bfffb84a 100644
--- a/yarn-project/end-to-end/src/spartan/prover-node.test.ts
+++ b/yarn-project/end-to-end/src/spartan/prover-node.test.ts
@@ -1,8 +1,15 @@
 import { retryUntil } from '@aztec/aztec.js';
 import { createLogger } from '@aztec/foundation/log';
 
-import { type AlertConfig, AlertTriggeredError } from '../quality_of_service/alert_checker.js';
-import { applyProverKill, isK8sConfig, runAlertCheck, setupEnvironment, startPortForward } from './utils.js';
+import { AlertTriggeredError } from '../quality_of_service/alert_checker.js';
+import {
+  applyProverBrokerKill,
+  applyProverKill,
+  isK8sConfig,
+  runAlertCheck,
+  setupEnvironment,
+  startPortForward,
+} from './utils.js';
 
 const config = setupEnvironment(process.env);
 if (!isK8sConfig(config)) {
@@ -23,15 +30,23 @@ const logger = createLogger('e2e:spartan-test:prover-node');
 const interval = '1m';
 const cachedProvingJobs = {
   alert: 'CachedProvingJobRate',
-  expr: `increase(sum(last_over_time(aztec_proving_queue_cached_jobs[${interval}]) or vector(0))[${interval}:])`,
+  expr: `increase(sum(last_over_time(aztec_proving_queue_cached_jobs_count[${interval}]) or vector(0))[${interval}:])`,
   labels: { severity: 'error' },
   for: interval,
   annotations: {},
 };
 
-const completedProvingJobs: AlertConfig = {
-  alert: 'ResolvedProvingJobRate',
-  expr: `rate(aztec_proving_queue_total_jobs{aztec_proving_job_type=~"BLOCK_ROOT_ROLLUP|SINGLE_TX_BLOCK_ROOT_ROLLUP"}[${interval}])>0`,
+const enqueuedBlockRollupJobs = {
+  alert: 'EnqueuedBlockRootRollup',
+  expr: `rate(aztec_proving_queue_enqueued_jobs_count{aztec_proving_job_type=~"BLOCK_ROOT_ROLLUP|SINGLE_TX_BLOCK_ROOT_ROLLUP"}[${interval}])>0`,
+  labels: { severity: 'error' },
+  for: interval,
+  annotations: {},
+};
+
+const enqueuedRootRollupJobs = {
+  alert: 'EnqueuedRootRollup',
+  expr: `rate(aztec_proving_queue_enqueued_jobs_count{aztec_proving_job_type="ROOT_ROLLUP"}[${interval}])>0`,
   labels: { severity: 'error' },
   for: interval,
   annotations: {},
@@ -39,13 +54,6 @@ const completedProvingJobs: AlertConfig = {
 
 describe('prover node recovery', () => {
   beforeAll(async () => {
-    await startPortForward({
-      resource: `svc/${config.INSTANCE_NAME}-aztec-network-prover-node`,
-      namespace: config.NAMESPACE,
-      containerPort: config.CONTAINER_PROVER_NODE_PORT,
-      hostPort: config.HOST_PROVER_NODE_PORT,
-    });
-
     await startPortForward({
       resource: `svc/metrics-grafana`,
       namespace: 'metrics',
@@ -54,14 +62,14 @@ describe('prover node recovery', () => {
     });
   });
 
-  it('should start proving', async () => {
+  it('should recover after a crash', async () => {
     logger.info(`Waiting for epoch to be partially proven`);
 
     // use the alert checker to wait until grafana picks up a proof has started
     await retryUntil(
       async () => {
         try {
-          await runAlertCheck(config, [completedProvingJobs], logger);
+          await runAlertCheck(config, [enqueuedBlockRollupJobs], logger);
         } catch (err) {
           return err && err instanceof AlertTriggeredError;
         }
@@ -99,4 +107,50 @@ describe('prover node recovery', () => {
 
     expect(result).toBeTrue();
   }, 1_800_000);
+
+  it('should recover after a broker crash', async () => {
+    logger.info(`Waiting for epoch proving job to start`);
+
+    // use the alert checker to wait until grafana picks up a proof has started
+    await retryUntil(
+      async () => {
+        try {
+          await runAlertCheck(config, [enqueuedBlockRollupJobs], logger);
+        } catch (err) {
+          // only a fired alert counts as "proving started"; any other failure keeps the retry loop going
+          return err instanceof AlertTriggeredError;
+        }
+      },
+      'wait for epoch',
+      600,
+      5,
+    );
+
+    logger.info(`Detected epoch proving job. Killing the broker`);
+
+    await applyProverBrokerKill({
+      namespace: config.NAMESPACE,
+      spartanDir: config.SPARTAN_DIR,
+      logger,
+    });
+
+    // wait for the broker to come back online and for proving to continue
+    const result = await retryUntil(
+      async () => {
+        try {
+          await runAlertCheck(config, [enqueuedRootRollupJobs], logger);
+        } catch (err) {
+          if (err && err instanceof AlertTriggeredError) {
+            return true;
+          }
+        }
+        return false;
+      },
+      'wait for root rollup',
+      600,
+      5,
+    );
+
+    expect(result).toBeTrue();
+  }, 1_800_000);
 });
diff --git a/yarn-project/end-to-end/src/spartan/utils.ts b/yarn-project/end-to-end/src/spartan/utils.ts
index 18f442b8071d..f3159cb4e01b 100644
--- a/yarn-project/end-to-end/src/spartan/utils.ts
+++ b/yarn-project/end-to-end/src/spartan/utils.ts
@@ -331,6 +331,29 @@ export function applyProverKill({
   });
 }
 
+/**
+ * Installs the `aztec-chaos-scenarios` chart as `prover-broker-kill`, using the `prover-broker-kill.yaml`
+ * values file and targeting `namespace`. Returns whatever `installChaosMeshChart` returns.
+ */
+export function applyProverBrokerKill({
+  namespace,
+  spartanDir,
+  logger,
+}: {
+  namespace: string;
+  spartanDir: string;
+  logger: Logger;
+}) {
+  return installChaosMeshChart({
+    instanceName: 'prover-broker-kill',
+    targetNamespace: namespace,
+    valuesFile: 'prover-broker-kill.yaml',
+    helmChartDir: getChartDir(spartanDir, 'aztec-chaos-scenarios'),
+    clean: true,
+    logger,
+  });
+}
+
 export function applyBootNodeFailure({
   namespace,
   spartanDir,
diff --git a/yarn-project/telemetry-client/src/metrics.ts b/yarn-project/telemetry-client/src/metrics.ts
index 802507f5da0c..f39792de44cd 100644
--- a/yarn-project/telemetry-client/src/metrics.ts
+++ b/yarn-project/telemetry-client/src/metrics.ts
@@ -100,13 +100,13 @@ export const PROVING_ORCHESTRATOR_BASE_ROLLUP_INPUTS_DURATION =
 
 export const PROVING_QUEUE_JOB_SIZE = 'aztec.proving_queue.job_size';
 export const PROVING_QUEUE_SIZE = 'aztec.proving_queue.size';
-export const PROVING_QUEUE_TOTAL_JOBS = 'aztec.proving_queue.total_jobs';
-export const PROVING_QUEUE_CACHED_JOBS = 'aztec.proving_queue.cached_jobs';
-export const PROVING_QUEUE_ACTIVE_JOBS = 'aztec.proving_queue.active_jobs';
-export const PROVING_QUEUE_RESOLVED_JOBS = 'aztec.proving_queue.resolved_jobs';
-export const PROVING_QUEUE_REJECTED_JOBS = 'aztec.proving_queue.rejected_jobs';
-export const PROVING_QUEUE_RETRIED_JOBS = 'aztec.proving_queue.retried_jobs';
-export const PROVING_QUEUE_TIMED_OUT_JOBS = 'aztec.proving_queue.timed_out_jobs';
+export const PROVING_QUEUE_TOTAL_JOBS = 'aztec.proving_queue.enqueued_jobs_count';
+export const PROVING_QUEUE_CACHED_JOBS = 'aztec.proving_queue.cached_jobs_count';
+export const PROVING_QUEUE_ACTIVE_JOBS = 'aztec.proving_queue.active_jobs_count';
+export const PROVING_QUEUE_RESOLVED_JOBS = 'aztec.proving_queue.resolved_jobs_count';
+export const PROVING_QUEUE_REJECTED_JOBS = 'aztec.proving_queue.rejected_jobs_count';
+export const PROVING_QUEUE_RETRIED_JOBS = 'aztec.proving_queue.retried_jobs_count';
+export const PROVING_QUEUE_TIMED_OUT_JOBS = 'aztec.proving_queue.timed_out_jobs_count';
 export const PROVING_QUEUE_JOB_WAIT = 'aztec.proving_queue.job_wait';
 export const PROVING_QUEUE_JOB_DURATION = 'aztec.proving_queue.job_duration';
 export const PROVING_QUEUE_DB_NUM_ITEMS = 'aztec.proving_queue.db.num_items';