diff --git a/spartan/aztec-chaos-scenarios/templates/prover-kill.yaml b/spartan/aztec-chaos-scenarios/templates/prover-kill.yaml new file mode 100644 index 000000000000..322b52d9884b --- /dev/null +++ b/spartan/aztec-chaos-scenarios/templates/prover-kill.yaml @@ -0,0 +1,21 @@ +{{- if .Values.proverKill.enabled }} +--- +apiVersion: chaos-mesh.org/v1alpha1 +kind: PodChaos +metadata: + name: {{ .Values.global.targetNamespace }}-prover-kill + namespace: {{ .Values.global.chaosMeshNamespace }} + labels: + {{- include "aztec-chaos-scenarios.labels" . | nindent 4 }} + annotations: + "helm.sh/resource-policy": keep +spec: + action: pod-kill + mode: fixed-percent + value: {{ .Values.proverKill.percent | quote }} + selector: + namespaces: + - {{ .Values.global.targetNamespace }} + labelSelectors: + app: prover-node +{{- end }} diff --git a/spartan/aztec-chaos-scenarios/values.yaml b/spartan/aztec-chaos-scenarios/values.yaml index cb85d9e008ff..afe4344d8600 100644 --- a/spartan/aztec-chaos-scenarios/values.yaml +++ b/spartan/aztec-chaos-scenarios/values.yaml @@ -69,6 +69,10 @@ proverFailure: enabled: false duration: 13m +proverKill: + enabled: false + percent: 100 + validatorKill: enabled: false percent: 30 diff --git a/spartan/aztec-chaos-scenarios/values/prover-kill.yaml b/spartan/aztec-chaos-scenarios/values/prover-kill.yaml new file mode 100644 index 000000000000..7a1f132c5e0b --- /dev/null +++ b/spartan/aztec-chaos-scenarios/values/prover-kill.yaml @@ -0,0 +1,6 @@ +global: + namespace: "smoke" + +proverKill: + enabled: true + percent: 100 diff --git a/spartan/aztec-network/templates/prover-agent.yaml b/spartan/aztec-network/templates/prover-agent.yaml index af376f03ef25..502fb4709b33 100644 --- a/spartan/aztec-network/templates/prover-agent.yaml +++ b/spartan/aztec-network/templates/prover-agent.yaml @@ -100,6 +100,12 @@ spec: value: "1" - name: PROVER_AGENT_POLL_INTERVAL_MS value: "{{ .Values.proverAgent.pollIntervalMs }}" + - name: PROVER_TEST_DELAY_TYPE + value: "{{ 
.Values.proverAgent.testDelayType }}" + - name: PROVER_TEST_DELAY_MS + value: "{{ .Values.proverAgent.testDelayMs }}" + - name: PROVER_TEST_DELAY_FACTOR + value: "{{ .Values.proverAgent.testDelayFactor }}" - name: PROVER_AGENT_PROOF_TYPES value: {{ join "," .Values.proverAgent.proofTypes | quote }} - name: OTEL_RESOURCE_ATTRIBUTES diff --git a/spartan/aztec-network/values.yaml b/spartan/aztec-network/values.yaml index 790562bcf75b..d6b73401910d 100644 --- a/spartan/aztec-network/values.yaml +++ b/spartan/aztec-network/values.yaml @@ -279,7 +279,10 @@ proverAgent: enabled: true replicas: 1 pollIntervalMs: 1000 - proofTypes: ["foo", "bar", "baz"] + proofTypes: [] + testDelayType: "fixed" + testDelayMs: 100 # each fake proof takes 100ms + testDelayFactor: 1 gke: spotEnabled: true logLevel: "debug; info: aztec:simulator, json-rpc" diff --git a/spartan/aztec-network/values/prover-node-chaos.yaml b/spartan/aztec-network/values/prover-node-chaos.yaml new file mode 100644 index 000000000000..75ffa8c34e80 --- /dev/null +++ b/spartan/aztec-network/values/prover-node-chaos.yaml @@ -0,0 +1,20 @@ +validator: + replicas: 1 + validatorKeys: + - 0x59c6995e998f97a5a0044966f0945389dc9e86dae88c7a8412f4603b6b78690d + validatorAddresses: + - 0x70997970C51812dc3A010C7d01b50e0d17dc79C8 + validator: + disabled: false + +bootNode: + validator: + disabled: true + +telemetry: + enabled: true + +proverAgent: + testDelayType: "fixed" + testDelayMs: 2000 + diff --git a/spartan/bootstrap.sh b/spartan/bootstrap.sh index 1d29700c381b..c6b7851aabab 100755 --- a/spartan/bootstrap.sh +++ b/spartan/bootstrap.sh @@ -122,6 +122,10 @@ case "$cmd" in "test-kind-transfer-blob-with-sink") OVERRIDES="blobSink.enabled=true" ./bootstrap.sh test-kind-transfer ;; + "test-kind-chaos-prover") + chaos-mesh/install.sh + OVERRIDES="proverAgent.testDelayMs=1000" NAMESPACE=chaos-prover FRESH_INSTALL=${FRESH_INSTALL:-true} INSTALL_METRICS=true ./scripts/test_kind.sh src/spartan/prover-node.test.ts ci.yaml + ;; 
"test-local") # Isolate network stack in docker. docker_isolate ../scripts/run_native_testnet.sh -i -val 3 diff --git a/spartan/scripts/test_kind.sh b/spartan/scripts/test_kind.sh index aeada169180c..3284d28003ad 100755 --- a/spartan/scripts/test_kind.sh +++ b/spartan/scripts/test_kind.sh @@ -86,8 +86,8 @@ if [ "$fresh_install" != "no-deploy" ]; then OVERRIDES="$OVERRIDES" ./deploy_kind.sh $namespace $values_file $sepolia_run fi -# Find 4 free ports between 9000 and 10000 -free_ports="$(find_ports 5)" +# Find 6 free ports between 9000 and 10000 +free_ports="$(find_ports 6)" # Extract the free ports from the list forwarded_pxe_port=$(echo $free_ports | awk '{print $1}') @@ -95,6 +95,7 @@ forwarded_anvil_port=$(echo $free_ports | awk '{print $2}') forwarded_metrics_port=$(echo $free_ports | awk '{print $3}') forwarded_node_port=$(echo $free_ports | awk '{print $4}') forwarded_sequencer_port=$(echo $free_ports | awk '{print $5}') +forwarded_prover_node_port=$(echo $free_ports | awk '{print $6}') if [ "$install_metrics" = "true" ]; then grafana_password=$(kubectl get secrets -n metrics metrics-grafana -o jsonpath='{.data.admin-password}' | base64 --decode) @@ -125,6 +126,8 @@ export HOST_NODE_PORT="$forwarded_node_port" export CONTAINER_NODE_PORT="8080" export HOST_SEQUENCER_PORT=$forwarded_sequencer_port export CONTAINER_SEQUENCER_PORT="8080" +export HOST_PROVER_NODE_PORT=$forwarded_prover_node_port +export CONTAINER_PROVER_NODE_PORT="8080" export HOST_METRICS_PORT="$forwarded_metrics_port" export CONTAINER_METRICS_PORT="80" export GRAFANA_PASSWORD="$grafana_password" diff --git a/yarn-project/bb-prover/src/test/delay_values.ts b/yarn-project/bb-prover/src/test/delay_values.ts new file mode 100644 index 000000000000..0d77e6447ad3 --- /dev/null +++ b/yarn-project/bb-prover/src/test/delay_values.ts @@ -0,0 +1,31 @@ +import { ProvingRequestType } from '@aztec/circuit-types/interfaces/server'; + +export const WITGEN_DELAY_MS: Record = { + [ProvingRequestType.BASE_PARITY]: 
60, + [ProvingRequestType.BLOCK_MERGE_ROLLUP]: 650, + [ProvingRequestType.BLOCK_ROOT_ROLLUP]: 60_000, + [ProvingRequestType.EMPTY_BLOCK_ROOT_ROLLUP]: 0, + [ProvingRequestType.MERGE_ROLLUP]: 0, + [ProvingRequestType.PRIVATE_BASE_ROLLUP]: 400_000, + [ProvingRequestType.SINGLE_TX_BLOCK_ROOT_ROLLUP]: 0, // TBD + [ProvingRequestType.PUBLIC_BASE_ROLLUP]: 470_000, + [ProvingRequestType.ROOT_PARITY]: 100, + [ProvingRequestType.ROOT_ROLLUP]: 650, + [ProvingRequestType.TUBE_PROOF]: 0, + [ProvingRequestType.PUBLIC_VM]: 0, +}; + +export const PROOF_DELAY_MS: Record = { + [ProvingRequestType.BASE_PARITY]: 3_000, + [ProvingRequestType.BLOCK_MERGE_ROLLUP]: 15_000, + [ProvingRequestType.BLOCK_ROOT_ROLLUP]: 55_000, + [ProvingRequestType.EMPTY_BLOCK_ROOT_ROLLUP]: 0, + [ProvingRequestType.MERGE_ROLLUP]: 0, + [ProvingRequestType.PRIVATE_BASE_ROLLUP]: 145_000, + [ProvingRequestType.SINGLE_TX_BLOCK_ROOT_ROLLUP]: 0, // TBD + [ProvingRequestType.PUBLIC_BASE_ROLLUP]: 160_000, + [ProvingRequestType.ROOT_PARITY]: 30_000, + [ProvingRequestType.ROOT_ROLLUP]: 15_000, + [ProvingRequestType.TUBE_PROOF]: 30_000, + [ProvingRequestType.PUBLIC_VM]: 0, +}; diff --git a/yarn-project/bb-prover/src/test/test_circuit_prover.ts b/yarn-project/bb-prover/src/test/test_circuit_prover.ts index ab898ab650a7..d428021dfaa9 100644 --- a/yarn-project/bb-prover/src/test/test_circuit_prover.ts +++ b/yarn-project/bb-prover/src/test/test_circuit_prover.ts @@ -1,5 +1,6 @@ import { type ProofAndVerificationKey, + ProvingRequestType, type PublicInputsAndRecursiveProof, type ServerCircuitProver, makeProofAndVerificationKey, @@ -72,6 +73,17 @@ import { type WitnessMap } from '@noir-lang/types'; import { ProverInstrumentation } from '../instrumentation.js'; import { mapProtocolArtifactNameToCircuitName } from '../stats.js'; +import { PROOF_DELAY_MS, WITGEN_DELAY_MS } from './delay_values.js'; + +type TestDelay = + | { + proverTestDelayType: 'fixed'; + proverTestDelayMs?: number; + } + | { + proverTestDelayType: 'realistic'; 
+ proverTestDelayFactor?: number; + }; /** * A class for use in testing situations (e2e, unit test, etc) and temporarily for assembling a block in the sequencer. @@ -84,7 +96,7 @@ export class TestCircuitProver implements ServerCircuitProver { constructor( private simulationProvider?: SimulationProvider, - private opts: { proverTestDelayMs: number } = { proverTestDelayMs: 0 }, + private opts: TestDelay = { proverTestDelayType: 'fixed', proverTestDelayMs: 0 }, telemetry: TelemetryClient = getTelemetryClient(), ) { this.instrumentation = new ProverInstrumentation(telemetry, 'TestCircuitProver'); @@ -100,15 +112,17 @@ export class TestCircuitProver implements ServerCircuitProver { * @returns The public inputs of the parity circuit. */ @trackSpan('TestCircuitProver.getBaseParityProof') - public async getBaseParityProof( + public getBaseParityProof( inputs: BaseParityInputs, ): Promise> { - return await this.simulate( - inputs, - 'BaseParityArtifact', - RECURSIVE_PROOF_LENGTH, - convertBaseParityInputsToWitnessMap, - convertBaseParityOutputsFromWitnessMap, + return this.applyDelay(ProvingRequestType.BASE_PARITY, () => + this.simulate( + inputs, + 'BaseParityArtifact', + RECURSIVE_PROOF_LENGTH, + convertBaseParityInputsToWitnessMap, + convertBaseParityOutputsFromWitnessMap, + ), ); } @@ -118,53 +132,57 @@ export class TestCircuitProver implements ServerCircuitProver { * @returns The public inputs of the parity circuit. 
*/ @trackSpan('TestCircuitProver.getRootParityProof') - public async getRootParityProof( + public getRootParityProof( inputs: RootParityInputs, ): Promise> { - return await this.simulate( - inputs, - 'RootParityArtifact', - NESTED_RECURSIVE_PROOF_LENGTH, - convertRootParityInputsToWitnessMap, - convertRootParityOutputsFromWitnessMap, + return this.applyDelay(ProvingRequestType.ROOT_PARITY, () => + this.simulate( + inputs, + 'RootParityArtifact', + NESTED_RECURSIVE_PROOF_LENGTH, + convertRootParityInputsToWitnessMap, + convertRootParityOutputsFromWitnessMap, + ), ); } - public async getTubeProof(_tubeInput: TubeInputs): Promise> { - await this.delay(); - return makeProofAndVerificationKey( - makeEmptyRecursiveProof(TUBE_PROOF_LENGTH), - VerificationKeyData.makeFakeRollupHonk(), + public getTubeProof(_tubeInput: TubeInputs): Promise> { + return this.applyDelay(ProvingRequestType.TUBE_PROOF, () => + makeProofAndVerificationKey(makeEmptyRecursiveProof(TUBE_PROOF_LENGTH), VerificationKeyData.makeFakeRollupHonk()), ); } @trackSpan('TestCircuitProver.getPrivateBaseRollupProof') - public async getPrivateBaseRollupProof( + public getPrivateBaseRollupProof( inputs: PrivateBaseRollupInputs, ): Promise< PublicInputsAndRecursiveProof > { - return await this.simulate( - inputs, - 'PrivateBaseRollupArtifact', - NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, - convertSimulatedPrivateBaseRollupInputsToWitnessMap, - convertSimulatedPrivateBaseRollupOutputsFromWitnessMap, + return this.applyDelay(ProvingRequestType.PRIVATE_BASE_ROLLUP, () => + this.simulate( + inputs, + 'PrivateBaseRollupArtifact', + NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, + convertSimulatedPrivateBaseRollupInputsToWitnessMap, + convertSimulatedPrivateBaseRollupOutputsFromWitnessMap, + ), ); } @trackSpan('TestCircuitProver.getPublicBaseRollupProof') - public async getPublicBaseRollupProof( + public getPublicBaseRollupProof( inputs: PublicBaseRollupInputs, ): Promise< PublicInputsAndRecursiveProof > { - return await 
this.simulate( - inputs, - 'PublicBaseRollupArtifact', - NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, - convertSimulatedPublicBaseRollupInputsToWitnessMap, - convertSimulatedPublicBaseRollupOutputsFromWitnessMap, + return this.applyDelay(ProvingRequestType.PUBLIC_BASE_ROLLUP, () => + this.simulate( + inputs, + 'PublicBaseRollupArtifact', + NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, + convertSimulatedPublicBaseRollupInputsToWitnessMap, + convertSimulatedPublicBaseRollupOutputsFromWitnessMap, + ), ); } @@ -174,17 +192,19 @@ export class TestCircuitProver implements ServerCircuitProver { * @returns The public inputs as outputs of the simulation. */ @trackSpan('TestCircuitProver.getMergeRollupProof') - public async getMergeRollupProof( + public getMergeRollupProof( input: MergeRollupInputs, ): Promise< PublicInputsAndRecursiveProof > { - return await this.simulate( - input, - 'MergeRollupArtifact', - NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, - convertMergeRollupInputsToWitnessMap, - convertMergeRollupOutputsFromWitnessMap, + return this.applyDelay(ProvingRequestType.MERGE_ROLLUP, () => + this.simulate( + input, + 'MergeRollupArtifact', + NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, + convertMergeRollupInputsToWitnessMap, + convertMergeRollupOutputsFromWitnessMap, + ), ); } @@ -194,17 +214,19 @@ export class TestCircuitProver implements ServerCircuitProver { * @returns The public inputs as outputs of the simulation. 
*/ @trackSpan('TestCircuitProver.getBlockRootRollupProof') - public async getBlockRootRollupProof( + public getBlockRootRollupProof( input: BlockRootRollupInputs, ): Promise< PublicInputsAndRecursiveProof > { - return await this.simulate( - input, - 'BlockRootRollupArtifact', - NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, - convertSimulatedBlockRootRollupInputsToWitnessMap, - convertSimulatedBlockRootRollupOutputsFromWitnessMap, + return this.applyDelay(ProvingRequestType.BLOCK_ROOT_ROLLUP, () => + this.simulate( + input, + 'BlockRootRollupArtifact', + NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, + convertSimulatedBlockRootRollupInputsToWitnessMap, + convertSimulatedBlockRootRollupOutputsFromWitnessMap, + ), ); } @@ -214,12 +236,14 @@ export class TestCircuitProver implements ServerCircuitProver { ): Promise< PublicInputsAndRecursiveProof > { - return await this.simulate( - input, - 'SingleTxBlockRootRollupArtifact', - NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, - convertSimulatedSingleTxBlockRootRollupInputsToWitnessMap, - convertSimulatedSingleTxBlockRootRollupOutputsFromWitnessMap, + return await this.applyDelay(ProvingRequestType.SINGLE_TX_BLOCK_ROOT_ROLLUP, () => + this.simulate( + input, + 'SingleTxBlockRootRollupArtifact', + NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, + convertSimulatedSingleTxBlockRootRollupInputsToWitnessMap, + convertSimulatedSingleTxBlockRootRollupOutputsFromWitnessMap, + ), ); } @@ -229,17 +253,19 @@ export class TestCircuitProver implements ServerCircuitProver { * @returns The public inputs as outputs of the simulation. 
*/ @trackSpan('TestCircuitProver.getEmptyBlockRootRollupProof') - public async getEmptyBlockRootRollupProof( + public getEmptyBlockRootRollupProof( input: EmptyBlockRootRollupInputs, ): Promise< PublicInputsAndRecursiveProof > { - return await this.simulate( - input, - 'EmptyBlockRootRollupArtifact', - NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, - convertEmptyBlockRootRollupInputsToWitnessMap, - convertEmptyBlockRootRollupOutputsFromWitnessMap, + return this.applyDelay(ProvingRequestType.EMPTY_BLOCK_ROOT_ROLLUP, () => + this.simulate( + input, + 'EmptyBlockRootRollupArtifact', + NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, + convertEmptyBlockRootRollupInputsToWitnessMap, + convertEmptyBlockRootRollupOutputsFromWitnessMap, + ), ); } @@ -249,17 +275,19 @@ export class TestCircuitProver implements ServerCircuitProver { * @returns The public inputs as outputs of the simulation. */ @trackSpan('TestCircuitProver.getBlockMergeRollupProof') - public async getBlockMergeRollupProof( + public getBlockMergeRollupProof( input: BlockMergeRollupInputs, ): Promise< PublicInputsAndRecursiveProof > { - return await this.simulate( - input, - 'BlockMergeRollupArtifact', - NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, - convertBlockMergeRollupInputsToWitnessMap, - convertBlockMergeRollupOutputsFromWitnessMap, + return this.applyDelay(ProvingRequestType.BLOCK_MERGE_ROLLUP, () => + this.simulate( + input, + 'BlockMergeRollupArtifact', + NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, + convertBlockMergeRollupInputsToWitnessMap, + convertBlockMergeRollupOutputsFromWitnessMap, + ), ); } @@ -269,35 +297,42 @@ export class TestCircuitProver implements ServerCircuitProver { * @returns The public inputs as outputs of the simulation. 
*/ @trackSpan('TestCircuitProver.getRootRollupProof') - public async getRootRollupProof( - input: RootRollupInputs, - ): Promise> { - return await this.simulate( - input, - 'RootRollupArtifact', - NESTED_RECURSIVE_PROOF_LENGTH, - convertRootRollupInputsToWitnessMap, - convertRootRollupOutputsFromWitnessMap, + public getRootRollupProof(input: RootRollupInputs): Promise> { + return this.applyDelay(ProvingRequestType.ROOT_ROLLUP, () => + this.simulate( + input, + 'RootRollupArtifact', + NESTED_RECURSIVE_PROOF_LENGTH, + convertRootRollupInputsToWitnessMap, + convertRootRollupOutputsFromWitnessMap, + ), ); } - public async getAvmProof( - _inputs: AvmCircuitInputs, - ): Promise> { + public getAvmProof(_inputs: AvmCircuitInputs): Promise> { // We can't simulate the AVM because we don't have enough context to do so (e.g., DBs). // We just return an empty proof and VK data. this.logger.debug('Skipping AVM simulation in TestCircuitProver.'); - await this.delay(); - return makeProofAndVerificationKey( - makeEmptyRecursiveProof(AVM_PROOF_LENGTH_IN_FIELDS), - VerificationKeyData.makeFake(AVM_VERIFICATION_KEY_LENGTH_IN_FIELDS), + return this.applyDelay(ProvingRequestType.PUBLIC_VM, () => + makeProofAndVerificationKey( + makeEmptyRecursiveProof(AVM_PROOF_LENGTH_IN_FIELDS), + VerificationKeyData.makeFake(AVM_VERIFICATION_KEY_LENGTH_IN_FIELDS), + ), ); } - private async delay(): Promise { - if (this.opts.proverTestDelayMs > 0) { - await sleep(this.opts.proverTestDelayMs); + private async applyDelay any>(type: ProvingRequestType, fn: F): Promise>> { + const timer = new Timer(); + const res = await fn(); + const duration = timer.ms(); + if (this.opts.proverTestDelayType === 'fixed') { + await sleep(Math.max(0, (this.opts.proverTestDelayMs ?? 0) - duration)); + } else if (this.opts.proverTestDelayType === 'realistic') { + const delay = WITGEN_DELAY_MS[type] + PROOF_DELAY_MS[type]; + await sleep(Math.max(0, delay * (this.opts.proverTestDelayFactor ?? 
1) - duration)); } + + return res; } // Not implemented for test circuits @@ -335,7 +370,6 @@ export class TestCircuitProver implements ServerCircuitProver { this.instrumentation.recordDuration('simulationDuration', circuitName, timer); emitCircuitSimulationStats(circuitName, timer.ms(), input.toBuffer().length, result.toBuffer().length, this.logger); - await this.delay(); return makePublicInputsAndRecursiveProof(result, makeRecursiveProof(proofLength), ProtocolCircuitVks[artifactName]); } } diff --git a/yarn-project/circuit-types/src/interfaces/prover-client.ts b/yarn-project/circuit-types/src/interfaces/prover-client.ts index 04492e953f48..4f57912091a9 100644 --- a/yarn-project/circuit-types/src/interfaces/prover-client.ts +++ b/yarn-project/circuit-types/src/interfaces/prover-client.ts @@ -11,8 +11,12 @@ import { type ProvingJobConsumer } from './prover-broker.js'; export type ActualProverConfig = { /** Whether to construct real proofs */ realProofs: boolean; - /** Artificial delay to introduce to all operations to the test prover. */ + /** The type of artificial delay to introduce */ + proverTestDelayType: 'fixed' | 'realistic'; + /** If using fixed delay, the time each operation takes. */ proverTestDelayMs: number; + /** If using realistic delays, what percentage of realistic times to apply. 
*/ + proverTestDelayFactor: number; }; /** @@ -33,7 +37,9 @@ export const ProverConfigSchema = z.object({ nodeUrl: z.string().optional(), realProofs: z.boolean(), proverId: schemas.Fr, + proverTestDelayType: z.enum(['fixed', 'realistic']), proverTestDelayMs: z.number(), + proverTestDelayFactor: z.number(), proverAgentCount: z.number(), }) satisfies ZodFor; @@ -53,11 +59,20 @@ export const proverConfigMappings: ConfigMappingsType = { description: 'Identifier of the prover', defaultValue: Fr.ZERO, }, + proverTestDelayType: { + env: 'PROVER_TEST_DELAY_TYPE', + description: 'The type of artificial delay to introduce', + }, proverTestDelayMs: { env: 'PROVER_TEST_DELAY_MS', description: 'Artificial delay to introduce to all operations to the test prover.', ...numberConfigHelper(0), }, + proverTestDelayFactor: { + env: 'PROVER_TEST_DELAY_FACTOR', + description: 'If using realistic delays, what percentage of realistic times to apply.', + ...numberConfigHelper(1), + }, proverAgentCount: { env: 'PROVER_AGENT_COUNT', description: 'The number of prover agents to start', diff --git a/yarn-project/end-to-end/src/quality_of_service/alert_checker.ts b/yarn-project/end-to-end/src/quality_of_service/alert_checker.ts index 2a6ced89ac65..884681414e58 100644 --- a/yarn-project/end-to-end/src/quality_of_service/alert_checker.ts +++ b/yarn-project/end-to-end/src/quality_of_service/alert_checker.ts @@ -98,7 +98,7 @@ export class AlertChecker { } if (alertTriggered) { - throw new Error('Test failed due to triggered alert'); + throw new AlertTriggeredError('Test failed due to triggered alert'); } } @@ -125,3 +125,7 @@ export class AlertChecker { await this.checkAlerts(alerts); } } + +export class AlertTriggeredError extends Error { + override name = 'AlertTriggeredError'; +} diff --git a/yarn-project/end-to-end/src/spartan/prover-node.test.ts b/yarn-project/end-to-end/src/spartan/prover-node.test.ts new file mode 100644 index 000000000000..e39921dda513 --- /dev/null +++ 
b/yarn-project/end-to-end/src/spartan/prover-node.test.ts @@ -0,0 +1,102 @@ +import { retryUntil } from '@aztec/aztec.js'; +import { createLogger } from '@aztec/foundation/log'; + +import { type AlertConfig, AlertTriggeredError } from '../quality_of_service/alert_checker.js'; +import { applyProverKill, isK8sConfig, runAlertCheck, setupEnvironment, startPortForward } from './utils.js'; + +const config = setupEnvironment(process.env); +if (!isK8sConfig(config)) { + throw new Error('This test requires running in K8s'); +} + +const logger = createLogger('e2e:spartan-test:prover-node'); + +/** + * This test aims to check that a prover node is able to recover after a crash. + * How do we do that? We check what proofs get submitted to the broker when the node comes back online. + * If everything works as expected, the broker should report a bunch of 'cached' proving jobs. + * This would be the prover node coming back online and starting the proving process over. + * Because the proving jobs are cached their results will be available immediately. + * + * We'll wait for an epoch to be partially proven (at least one BLOCK_ROOT_ROLLUP has been submitted) so that the next time the prover starts it'll hit the cache. 
+ */ +const interval = '1m'; +const cachedProvingJobs = { + alert: 'CachedProvingJobRate', + expr: `increase(sum(last_over_time(aztec_proving_queue_cached_jobs[${interval}]) or vector(0))[${interval}:])`, + labels: { severity: 'error' }, + for: interval, + annotations: {}, +}; + +const completedProvingJobs: AlertConfig = { + alert: 'ResolvedProvingJobRate', + expr: `rate(aztec_proving_queue_total_jobs{aztec_proving_job_type=~"BLOCK_ROOT_ROLLUP|SINGLE_TX_BLOCK_ROOT_ROLLUP"}[${interval}])>0`, + labels: { severity: 'error' }, + for: interval, + annotations: {}, +}; + +describe('prover node recovery', () => { + beforeAll(async () => { + await startPortForward({ + resource: `svc/${config.INSTANCE_NAME}-aztec-network-prover-node`, + namespace: config.NAMESPACE, + containerPort: config.CONTAINER_PROVER_NODE_PORT, + hostPort: config.HOST_PROVER_NODE_PORT, + }); + + await startPortForward({ + resource: `svc/metrics-grafana`, + namespace: 'metrics', + containerPort: config.CONTAINER_METRICS_PORT, + hostPort: config.HOST_METRICS_PORT, + }); + }); + + it('should start proving', async () => { + logger.info(`Waiting for epoch to be partially proven`); + + // use the alert checker to wait until grafana picks up a proof has started + await retryUntil( + async () => { + try { + await runAlertCheck(config, [completedProvingJobs], logger); + } catch (err) { + return err && err instanceof AlertTriggeredError; + } + }, + 'wait for proofs', + 600, + 5, + ); + + logger.info(`Detected partial epoch proven. 
Killing the prover node`); + + await applyProverKill({ + namespace: config.NAMESPACE, + spartanDir: config.SPARTAN_DIR, + logger, + }); + + // wait for the node to start proving again and + // validate it hits the cache + const result = await retryUntil( + async () => { + try { + await runAlertCheck(config, [cachedProvingJobs], logger); + } catch (err) { + if (err && err instanceof AlertTriggeredError) { + return true; + } + } + return false; + }, + 'wait for cached proving jobs', + 600, + 5, + ); + + expect(result).toBeTrue(); + }, 1_800_000); +}); diff --git a/yarn-project/end-to-end/src/spartan/utils.ts b/yarn-project/end-to-end/src/spartan/utils.ts index 1c49588802ae..18f442b8071d 100644 --- a/yarn-project/end-to-end/src/spartan/utils.ts +++ b/yarn-project/end-to-end/src/spartan/utils.ts @@ -21,6 +21,8 @@ const k8sLocalConfigSchema = z.object({ CONTAINER_NODE_PORT: z.coerce.number().default(8080), HOST_SEQUENCER_PORT: z.coerce.number().min(1, 'HOST_SEQUENCER_PORT env variable must be set'), CONTAINER_SEQUENCER_PORT: z.coerce.number().default(8080), + HOST_PROVER_NODE_PORT: z.coerce.number().min(1, 'HOST_PROVER_NODE_PORT env variable must be set'), + CONTAINER_PROVER_NODE_PORT: z.coerce.number().default(8080), HOST_PXE_PORT: z.coerce.number().min(1, 'HOST_PXE_PORT env variable must be set'), CONTAINER_PXE_PORT: z.coerce.number().default(8080), HOST_ETHEREUM_PORT: z.coerce.number().min(1, 'HOST_ETHEREUM_PORT env variable must be set'), @@ -310,6 +312,25 @@ export function applyProverFailure({ }); } +export function applyProverKill({ + namespace, + spartanDir, + logger, +}: { + namespace: string; + spartanDir: string; + logger: Logger; +}) { + return installChaosMeshChart({ + instanceName: 'prover-kill', + targetNamespace: namespace, + valuesFile: 'prover-kill.yaml', + helmChartDir: getChartDir(spartanDir, 'aztec-chaos-scenarios'), + clean: true, + logger, + }); +} + export function applyBootNodeFailure({ namespace, spartanDir, diff --git 
a/yarn-project/foundation/src/config/env_var.ts b/yarn-project/foundation/src/config/env_var.ts index 15cdbbeb4240..6ed39980bb47 100644 --- a/yarn-project/foundation/src/config/env_var.ts +++ b/yarn-project/foundation/src/config/env_var.ts @@ -143,7 +143,9 @@ export type EnvVar = | 'PROVER_PUBLISHER_PRIVATE_KEY' | 'PROVER_REAL_PROOFS' | 'PROVER_REQUIRED_CONFIRMATIONS' + | 'PROVER_TEST_DELAY_FACTOR' | 'PROVER_TEST_DELAY_MS' + | 'PROVER_TEST_DELAY_TYPE' | 'PXE_L2_STARTING_BLOCK' | 'PXE_PROVER_ENABLED' | 'PROVER_TARGET_ESCROW_AMOUNT' diff --git a/yarn-project/prover-client/src/proving_broker/config.ts b/yarn-project/prover-client/src/proving_broker/config.ts index 7424b716afa0..2131067a1252 100644 --- a/yarn-project/prover-client/src/proving_broker/config.ts +++ b/yarn-project/prover-client/src/proving_broker/config.ts @@ -64,8 +64,12 @@ export const ProverAgentConfig = z.object({ proverBrokerUrl: z.string().optional(), /** Whether to construct real proofs */ realProofs: z.boolean(), - /** Artificial delay to introduce to all operations to the test prover. */ + /** The type of artificial delay to introduce */ + proverTestDelayType: z.enum(['fixed', 'realistic']), + /** If using fixed delay, the time each operation takes. */ proverTestDelayMs: z.number(), + /** If using realistic delays, what percentage of realistic times to apply. 
*/ + proverTestDelayFactor: z.number(), }); export type ProverAgentConfig = z.infer; @@ -99,9 +103,19 @@ export const proverAgentConfigMappings: ConfigMappingsType = description: 'Whether to construct real proofs', ...booleanConfigHelper(false), }, + proverTestDelayType: { + env: 'PROVER_TEST_DELAY_TYPE', + description: 'The type of artificial delay to introduce', + defaultValue: 'fixed', + }, proverTestDelayMs: { env: 'PROVER_TEST_DELAY_MS', description: 'Artificial delay to introduce to all operations to the test prover.', ...numberConfigHelper(0), }, + proverTestDelayFactor: { + env: 'PROVER_TEST_DELAY_FACTOR', + description: 'If using realistic delays, what percentage of realistic times to apply.', + ...numberConfigHelper(1), + }, }; diff --git a/yarn-project/prover-client/src/proving_broker/proving_broker.ts b/yarn-project/prover-client/src/proving_broker/proving_broker.ts index 8f8172ad41a3..3c210fa722f0 100644 --- a/yarn-project/prover-client/src/proving_broker/proving_broker.ts +++ b/yarn-project/prover-client/src/proving_broker/proving_broker.ts @@ -237,10 +237,11 @@ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer, Tr if (this.jobsCache.has(job.id)) { const existing = this.jobsCache.get(job.id); assert.deepStrictEqual(job, existing, 'Duplicate proving job ID'); - this.logger.debug(`Duplicate proving job id=${job.id} epochNumber=${job.epochNumber}. Ignoring`, { + this.logger.warn(`Cached proving job id=${job.id} epochNumber=${job.epochNumber}. 
Not enqueuing again`, { provingJobId: job.id, }); - return Promise.resolve(jobStatus); + this.instrumentation.incCachedJobs(job.type); + return jobStatus; } if (this.isJobStale(job)) { @@ -256,6 +257,7 @@ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer, Tr this.jobsCache.set(job.id, job); await this.database.addProvingJob(job); this.enqueueJobInternal(job); + this.instrumentation.incTotalJobs(job.type); } catch (err) { this.logger.error(`Failed to save proving job id=${job.id}: ${err}`, err, { provingJobId: job.id }); this.jobsCache.delete(job.id); diff --git a/yarn-project/prover-client/src/proving_broker/proving_broker_instrumentation.ts b/yarn-project/prover-client/src/proving_broker/proving_broker_instrumentation.ts index 2dc9ac270236..891e84978609 100644 --- a/yarn-project/prover-client/src/proving_broker/proving_broker_instrumentation.ts +++ b/yarn-project/prover-client/src/proving_broker/proving_broker_instrumentation.ts @@ -19,6 +19,8 @@ export class ProvingBrokerInstrumentation { private resolvedJobs: UpDownCounter; private rejectedJobs: UpDownCounter; private timedOutJobs: UpDownCounter; + private cachedJobs: UpDownCounter; + private totalJobs: UpDownCounter; private jobWait: Histogram; private jobDuration: Histogram; private retriedJobs: UpDownCounter; @@ -50,6 +52,14 @@ export class ProvingBrokerInstrumentation { valueType: ValueType.INT, }); + this.cachedJobs = meter.createUpDownCounter(Metrics.PROVING_QUEUE_CACHED_JOBS, { + valueType: ValueType.INT, + }); + + this.totalJobs = meter.createUpDownCounter(Metrics.PROVING_QUEUE_TOTAL_JOBS, { + valueType: ValueType.INT, + }); + this.jobWait = meter.createHistogram(Metrics.PROVING_QUEUE_JOB_WAIT, { description: 'Records how long a job sits in the queue', unit: 'ms', @@ -95,6 +105,18 @@ export class ProvingBrokerInstrumentation { }); } + incCachedJobs(proofType: ProvingRequestType) { + this.cachedJobs.add(1, { + [Attributes.PROVING_JOB_TYPE]: ProvingRequestType[proofType], + }); + 
} + + incTotalJobs(proofType: ProvingRequestType) { + this.totalJobs.add(1, { + [Attributes.PROVING_JOB_TYPE]: ProvingRequestType[proofType], + }); + } + recordJobWait(proofType: ProvingRequestType, msOrTimer: Timer | number) { const duration = typeof msOrTimer === 'number' ? msOrTimer : Math.floor(msOrTimer.ms()); this.jobWait.record(duration, { diff --git a/yarn-project/telemetry-client/src/metrics.ts b/yarn-project/telemetry-client/src/metrics.ts index d60da7575c92..802507f5da0c 100644 --- a/yarn-project/telemetry-client/src/metrics.ts +++ b/yarn-project/telemetry-client/src/metrics.ts @@ -100,6 +100,8 @@ export const PROVING_ORCHESTRATOR_BASE_ROLLUP_INPUTS_DURATION = export const PROVING_QUEUE_JOB_SIZE = 'aztec.proving_queue.job_size'; export const PROVING_QUEUE_SIZE = 'aztec.proving_queue.size'; +export const PROVING_QUEUE_TOTAL_JOBS = 'aztec.proving_queue.total_jobs'; +export const PROVING_QUEUE_CACHED_JOBS = 'aztec.proving_queue.cached_jobs'; export const PROVING_QUEUE_ACTIVE_JOBS = 'aztec.proving_queue.active_jobs'; export const PROVING_QUEUE_RESOLVED_JOBS = 'aztec.proving_queue.resolved_jobs'; export const PROVING_QUEUE_REJECTED_JOBS = 'aztec.proving_queue.rejected_jobs';