Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions spartan/aztec-chaos-scenarios/templates/prover-broker-kill.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{{- if .Values.proverBrokerKill.enabled }}
---
# Chaos Mesh PodChaos experiment that kills prover-broker pods.
# Rendered only when proverBrokerKill.enabled is set in the chart values.
apiVersion: chaos-mesh.org/v1alpha1
kind: PodChaos
metadata:
# The experiment name is prefixed with the target namespace, while the object itself
# is created in the Chaos Mesh namespace (presumably so experiments aimed at different
# target namespaces do not collide there — confirm).
name: {{ .Values.global.targetNamespace }}-prover-broker-kill
namespace: {{ .Values.global.chaosMeshNamespace }}
labels:
{{- include "aztec-chaos-scenarios.labels" . | nindent 4 }}
annotations:
# Helm resource policy "keep": see the Helm docs for the exact uninstall/upgrade semantics.
"helm.sh/resource-policy": keep
spec:
action: pod-kill
# fixed-percent mode: `value` is the percentage of matching pods to kill.
# The percentage is rendered as a quoted string via `| quote`.
mode: fixed-percent
value: {{ .Values.proverBrokerKill.percent | quote }}
# Only pods in the target namespace carrying the label app=prover-broker are selected.
selector:
namespaces:
- {{ .Values.global.targetNamespace }}
labelSelectors:
app: prover-broker
{{- end }}
4 changes: 4 additions & 0 deletions spartan/aztec-chaos-scenarios/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ proverKill:
enabled: false
percent: 100

proverBrokerKill:
enabled: false
percent: 100

validatorKill:
enabled: false
percent: 30
Expand Down
6 changes: 6 additions & 0 deletions spartan/aztec-chaos-scenarios/values/prover-broker-kill.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Values override that switches on the prover-broker kill scenario for 100% of the matching pods.
global:
# NOTE(review): the prover-broker-kill template reads global.targetNamespace and
# global.chaosMeshNamespace, not global.namespace — confirm this key is consumed anywhere.
namespace: "smoke"

proverBrokerKill:
enabled: true
percent: 100
4 changes: 0 additions & 4 deletions spartan/bootstrap.sh
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,6 @@ case "$cmd" in
"test-kind-transfer-blob-with-sink")
OVERRIDES="blobSink.enabled=true" ./bootstrap.sh test-kind-transfer
;;
"test-kind-chaos-prover")
chaos-mesh/install.sh
OVERRIDES="proverAgent.testDelayMs=1000" NAMESPACE=chaos-prover FRESH_INSTALL=${FRESH_INSTALL:-true} INSTALL_METRICS=true ./scripts/test_kind.sh src/spartan/prover-node.test.ts ci.yaml
;;
"test-local")
# Isolate network stack in docker.
docker_isolate ../scripts/run_native_testnet.sh -i -val 3
Expand Down
83 changes: 68 additions & 15 deletions yarn-project/end-to-end/src/spartan/prover-node.test.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
import { retryUntil } from '@aztec/aztec.js';
import { createLogger } from '@aztec/foundation/log';

import { type AlertConfig, AlertTriggeredError } from '../quality_of_service/alert_checker.js';
import { applyProverKill, isK8sConfig, runAlertCheck, setupEnvironment, startPortForward } from './utils.js';
import { AlertTriggeredError } from '../quality_of_service/alert_checker.js';
import {
applyProverBrokerKill,
applyProverKill,
isK8sConfig,
runAlertCheck,
setupEnvironment,
startPortForward,
} from './utils.js';

const config = setupEnvironment(process.env);
if (!isK8sConfig(config)) {
Expand All @@ -23,29 +30,30 @@ const logger = createLogger('e2e:spartan-test:prover-node');
const interval = '1m';
const cachedProvingJobs = {
alert: 'CachedProvingJobRate',
expr: `increase(sum(last_over_time(aztec_proving_queue_cached_jobs[${interval}]) or vector(0))[${interval}:])`,
expr: `increase(sum(last_over_time(aztec_proving_queue_cached_jobs_count[${interval}]) or vector(0))[${interval}:])`,
labels: { severity: 'error' },
for: interval,
annotations: {},
};

const completedProvingJobs: AlertConfig = {
alert: 'ResolvedProvingJobRate',
expr: `rate(aztec_proving_queue_total_jobs{aztec_proving_job_type=~"BLOCK_ROOT_ROLLUP|SINGLE_TX_BLOCK_ROOT_ROLLUP"}[${interval}])>0`,
/**
 * Alert rule whose expression holds while the rate of enqueued BLOCK_ROOT_ROLLUP or
 * SINGLE_TX_BLOCK_ROOT_ROLLUP proving jobs, measured over `interval`, is greater than zero.
 * The tests in this file pass it to `runAlertCheck` and use it as the signal that epoch
 * proving has started.
 */
const enqueuedBlockRollupJobs = {
alert: 'EnqueuedBlockRootRollup',
expr: `rate(aztec_proving_queue_enqueued_jobs_count{aztec_proving_job_type=~"BLOCK_ROOT_ROLLUP|SINGLE_TX_BLOCK_ROOT_ROLLUP"}[${interval}])>0`,
labels: { severity: 'error' },
for: interval,
annotations: {},
};

/**
 * Alert rule whose expression holds while the rate of enqueued ROOT_ROLLUP proving jobs,
 * measured over `interval`, is greater than zero. The broker-crash test polls it after the
 * broker has been killed to detect that proving went on to reach the root rollup.
 */
const enqueuedRootRollupJobs = {
alert: 'EnqueuedRootRollup',
expr: `rate(aztec_proving_queue_enqueued_jobs_count{aztec_proving_job_type="ROOT_ROLLUP"}[${interval}])>0`,
labels: { severity: 'error' },
for: interval,
annotations: {},
};

describe('prover node recovery', () => {
beforeAll(async () => {
await startPortForward({
resource: `svc/${config.INSTANCE_NAME}-aztec-network-prover-node`,
namespace: config.NAMESPACE,
containerPort: config.CONTAINER_PROVER_NODE_PORT,
hostPort: config.HOST_PROVER_NODE_PORT,
});

await startPortForward({
resource: `svc/metrics-grafana`,
namespace: 'metrics',
Expand All @@ -54,14 +62,14 @@ describe('prover node recovery', () => {
});
});

it('should start proving', async () => {
it('should recover after a crash', async () => {
logger.info(`Waiting for epoch to be partially proven`);

// use the alert checker to wait until Grafana reports that proving has started
await retryUntil(
async () => {
try {
await runAlertCheck(config, [completedProvingJobs], logger);
await runAlertCheck(config, [enqueuedBlockRollupJobs], logger);
} catch (err) {
return err && err instanceof AlertTriggeredError;
}
Expand Down Expand Up @@ -99,4 +107,49 @@ describe('prover node recovery', () => {

expect(result).toBeTrue();
}, 1_800_000);

// Scenario: wait until block-root-rollup jobs are being enqueued, kill the prover broker via
// the chaos chart, then verify that proving still progresses to a ROOT_ROLLUP job afterwards.
it('should recover after a broker crash', async () => {
  logger.info(`Waiting for epoch proving job to start`);

  // Use the alert checker to wait until Grafana reports that proving has started.
  // Timeout/interval arguments are 600 and 5 (presumably seconds — confirm against retryUntil).
  await retryUntil(
    async () => {
      try {
        await runAlertCheck(config, [enqueuedBlockRollupJobs], logger);
      } catch (err) {
        // Only a triggered alert means proving has started. Any other failure (for example
        // the metrics endpoint being unreachable) must not be mistaken for that signal,
        // otherwise the broker would be killed before there is any work to recover.
        return err instanceof AlertTriggeredError;
      }
      // The alert did not fire yet: keep polling.
      return false;
    },
    'wait for epoch',
    600,
    5,
  );

  logger.info(`Detected epoch proving job. Killing the broker`);

  await applyProverBrokerKill({
    namespace: config.NAMESPACE,
    spartanDir: config.SPARTAN_DIR,
    logger,
  });

  // Wait for the broker to come back online and for proving to continue: a ROOT_ROLLUP job
  // being enqueued after the kill shows that the proving pipeline made progress again.
  const result = await retryUntil(
    async () => {
      try {
        await runAlertCheck(config, [enqueuedRootRollupJobs], logger);
      } catch (err) {
        if (err instanceof AlertTriggeredError) {
          return true;
        }
      }
      return false;
    },
    'wait for root rollup',
    600,
    5,
  );

  expect(result).toBeTrue();
}, 1_800_000);
});
19 changes: 19 additions & 0 deletions yarn-project/end-to-end/src/spartan/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,25 @@ export function applyProverKill({
});
}

/**
 * Installs the `prover-broker-kill` chaos scenario from the `aztec-chaos-scenarios` chart.
 *
 * @param namespace - Namespace handed to the chart as its target namespace.
 * @param spartanDir - Directory from which the chart directory is resolved via `getChartDir`.
 * @param logger - Logger forwarded to the chart installation.
 * @returns Whatever `installChaosMeshChart` returns for this installation.
 */
export function applyProverBrokerKill({
  namespace,
  spartanDir,
  logger,
}: {
  namespace: string;
  spartanDir: string;
  logger: Logger;
}) {
  const helmChartDir = getChartDir(spartanDir, 'aztec-chaos-scenarios');

  const chartOptions = {
    instanceName: 'prover-broker-kill',
    targetNamespace: namespace,
    valuesFile: 'prover-broker-kill.yaml',
    helmChartDir,
    // NOTE(review): presumably asks for any previous release to be removed first — confirm.
    clean: true,
    logger,
  };

  return installChaosMeshChart(chartOptions);
}

export function applyBootNodeFailure({
namespace,
spartanDir,
Expand Down
14 changes: 7 additions & 7 deletions yarn-project/telemetry-client/src/metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,13 +100,13 @@ export const PROVING_ORCHESTRATOR_BASE_ROLLUP_INPUTS_DURATION =

export const PROVING_QUEUE_JOB_SIZE = 'aztec.proving_queue.job_size';
export const PROVING_QUEUE_SIZE = 'aztec.proving_queue.size';
export const PROVING_QUEUE_TOTAL_JOBS = 'aztec.proving_queue.total_jobs';
export const PROVING_QUEUE_CACHED_JOBS = 'aztec.proving_queue.cached_jobs';
export const PROVING_QUEUE_ACTIVE_JOBS = 'aztec.proving_queue.active_jobs';
export const PROVING_QUEUE_RESOLVED_JOBS = 'aztec.proving_queue.resolved_jobs';
export const PROVING_QUEUE_REJECTED_JOBS = 'aztec.proving_queue.rejected_jobs';
export const PROVING_QUEUE_RETRIED_JOBS = 'aztec.proving_queue.retried_jobs';
export const PROVING_QUEUE_TIMED_OUT_JOBS = 'aztec.proving_queue.timed_out_jobs';
export const PROVING_QUEUE_TOTAL_JOBS = 'aztec.proving_queue.enqueued_jobs_count';
export const PROVING_QUEUE_CACHED_JOBS = 'aztec.proving_queue.cached_jobs_count';
export const PROVING_QUEUE_ACTIVE_JOBS = 'aztec.proving_queue.active_jobs_count';
export const PROVING_QUEUE_RESOLVED_JOBS = 'aztec.proving_queue.resolved_jobs_count';
export const PROVING_QUEUE_REJECTED_JOBS = 'aztec.proving_queue.rejected_jobs_count';
export const PROVING_QUEUE_RETRIED_JOBS = 'aztec.proving_queue.retried_jobs_count';
export const PROVING_QUEUE_TIMED_OUT_JOBS = 'aztec.proving_queue.timed_out_jobs_count';
export const PROVING_QUEUE_JOB_WAIT = 'aztec.proving_queue.job_wait';
export const PROVING_QUEUE_JOB_DURATION = 'aztec.proving_queue.job_duration';
export const PROVING_QUEUE_DB_NUM_ITEMS = 'aztec.proving_queue.db.num_items';
Expand Down