diff --git a/spartan/.gitignore b/spartan/.gitignore index 792fa0ebb8b7..6fbca9faac92 100644 --- a/spartan/.gitignore +++ b/spartan/.gitignore @@ -29,4 +29,5 @@ environments/* !environments/tps-scenario.env !environments/kind-minimal.env !environments/kind-provers.env +!environments/alpha-net.env *.tfvars diff --git a/spartan/environments/alpha-net.env b/spartan/environments/alpha-net.env new file mode 100644 index 000000000000..979d88d588cb --- /dev/null +++ b/spartan/environments/alpha-net.env @@ -0,0 +1,89 @@ +NAMESPACE=${NAMESPACE:-alpha-net} +CLUSTER=aztec-gke-private +GCP_REGION=us-west1-a +DESTROY_NAMESPACE=true +DESTROY_ETH_DEVNET=true +CREATE_ETH_DEVNET=${CREATE_ETH_DEVNET:-true} +AZTEC_EPOCH_DURATION=8 +AZTEC_SLOT_DURATION=72 +AZTEC_PROOF_SUBMISSION_EPOCHS=2 +ETHEREUM_CHAIN_ID=1337 +LABS_INFRA_MNEMONIC="test test test test test test test test test test test junk" +FUNDING_PRIVATE_KEY="0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80" +# CREATE_CHAOS_MESH=true + +# Install chaos mesh peer isolation after Aztec infra deploys. Validators, +# RPC nodes, and prover nodes can only peer with full-nodes, not each other. +# Requires P2P_PUBLIC_IP=false so P2P uses pod IPs that iptables rules can match. 
+P2P_PUBLIC_IP=false +CHAOS_MESH_SCENARIOS_FILE=network-requirements.yaml + +AZTEC_MANA_TARGET=2147483647 + +P2P_TX_POOL_DELETE_TXS_AFTER_REORG=true + +# For mbps +SEQ_BUILD_CHECKPOINT_IF_EMPTY=true +SEQ_BLOCK_DURATION_MS=6000 +SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT=5 + +CREATE_ROLLUP_CONTRACTS=true +REDEPLOY_ROLLUP_CONTRACTS=true +VERIFY_CONTRACTS=false +DESTROY_AZTEC_INFRA=true + +AZTEC_LAG_IN_EPOCHS_FOR_VALIDATOR_SET=1 +AZTEC_LAG_IN_EPOCHS_FOR_RANDAO=1 + +OTEL_COLLECTOR_ENDPOINT=REPLACE_WITH_GCP_SECRET + +VALIDATOR_REPLICAS=12 +VALIDATORS_PER_NODE=4 +PUBLISHERS_PER_VALIDATOR_KEY=2 +VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX=5000 +VALIDATOR_RESOURCE_PROFILE="2-core-dedicated" + +REAL_VERIFIER=false + +RPC_REPLICAS=12 +RPC_INGRESS_ENABLED=false + +FULL_NODE_REPLICAS=500 +FULL_NODE_RESOURCE_PROFILE="2-core-spot" + +PUBLISHERS_PER_PROVER=2 +PROVER_PUBLISHER_MNEMONIC_START_INDEX=8000 +PROVER_REPLICAS=128 +PROVER_RESOURCE_PROFILE="hi-tps" +PROVER_AGENT_POLL_INTERVAL_MS=10000 + +RUN_TESTS=false + +PROVER_TEST_DELAY_TYPE=fixed + +AZTEC_SLASHING_ROUND_SIZE_IN_EPOCHS=1 +AZTEC_SLASHING_QUORUM=5 +AZTEC_SLASHING_EXECUTION_DELAY_IN_ROUNDS=0 +AZTEC_SLASHING_OFFSET_IN_ROUNDS=1 +AZTEC_LOCAL_EJECTION_THRESHOLD=90000000000000000000 + +SEQ_MAX_TX_PER_CHECKPOINT=72 +SEQ_MIN_TX_PER_BLOCK=0 + +# Override L1 tx utils bump percentages for scenario tests +VALIDATOR_L1_PRIORITY_FEE_BUMP_PERCENTAGE=0 +VALIDATOR_L1_PRIORITY_FEE_RETRY_BUMP_PERCENTAGE=0 +PROVER_L1_PRIORITY_FEE_BUMP_PERCENTAGE=0 +PROVER_L1_PRIORITY_FEE_RETRY_BUMP_PERCENTAGE=0 + +# Enable latency measurement for p2p messages +DEBUG_P2P_INSTRUMENT_MESSAGES=true + +# Inject artificial delay of proof verification for all nodes +PROVER_TEST_VERIFICATION_DELAY_MS=250 + +# Reduce the amount of metrics produced by prover agents and full nodes +PROVER_AGENT_INCLUDE_METRICS="aztec.circuit" +FULL_NODE_INCLUDE_METRICS="aztec.p2p.gossip.agg_" +LOG_LEVEL=info + diff --git a/spartan/scripts/deploy_network.sh b/spartan/scripts/deploy_network.sh 
index 223cef36b8f7..705403f47596 100755 --- a/spartan/scripts/deploy_network.sh +++ b/spartan/scripts/deploy_network.sh @@ -67,7 +67,6 @@ LABS_INFRA_INDICES=${LABS_INFRA_INDICES:-0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,1 ######################## # ROLLUP VARIABLES ######################## -REDEPLOY_ROLLUP_CONTRACTS=${REDEPLOY_ROLLUP_CONTRACTS:-false} CREATE_ROLLUP_CONTRACTS=${CREATE_ROLLUP_CONTRACTS:-true} SPONSORED_FPC=${SPONSORED_FPC:-true} TEST_ACCOUNTS=${TEST_ACCOUNTS:-false} @@ -200,6 +199,12 @@ P2P_GOSSIPSUB_DHI=${P2P_GOSSIPSUB_DHI:-12} P2P_DROP_TX=${P2P_DROP_TX:-false} P2P_DROP_TX_CHANCE=${P2P_DROP_TX_CHANCE:-0} +# Chaos mesh scenarios values file (e.g., "network-requirements.yaml") +# If set, the experiment is installed after Aztec infra and the script waits for the +# rules to be injected. Pods are NOT restarted afterwards (recreated pods would be unpartitioned). +# Requires the chaos mesh operator to already be running (see deploy_chaos_mesh.sh). +CHAOS_MESH_SCENARIOS_FILE=${CHAOS_MESH_SCENARIOS_FILE:-} + # Compute validator addresses (skip if no validators) if [[ $VALIDATOR_REPLICAS -gt 0 ]]; then VALIDATOR_ADDRESSES=$(echo "$VALIDATOR_INDICES" | tr ',' '\n' | xargs -I{} cast wallet address --mnemonic "$LABS_INFRA_MNEMONIC" --mnemonic-index {} | tr '\n' ',' | sed 's/,$//') @@ -623,6 +628,47 @@ k8s_denoise "tf_run "${DEPLOY_AZTEC_INFRA_DIR}" "${DESTROY_AZTEC_INFRA}" "${CREA STAGE_TIMINGS[aztec_infra]=$(($(date +%s) - AZTEC_INFRA_START)) log "Deployed aztec infra" +# ------------------------------------------------------- +# Optionally install chaos mesh scenarios after Aztec infra +# ------------------------------------------------------- +# Chaos Mesh resolves pod selectors at experiment creation time, so the target +# pods must already exist. The chaos-daemon injects iptables DROP rules into +# each matched pod's network namespace. 
For partition experiments, this +# immediately blocks packets between the partitioned pods, causing existing +# TCP connections to timeout and preventing new ones from forming. +# +# IMPORTANT: Do NOT restart pods after chaos injection. Chaos Mesh does not +# automatically re-inject rules into recreated pods, leaving them unpartitioned. +if [[ -n "${CHAOS_MESH_SCENARIOS_FILE}" ]]; then + CHAOS_SCENARIOS_DIR="${SCRIPT_DIR}/../aztec-chaos-scenarios" + log "Installing chaos mesh scenarios from ${CHAOS_MESH_SCENARIOS_FILE}" + helm upgrade --install network-shaping "${CHAOS_SCENARIOS_DIR}" \ + --namespace "${NAMESPACE}" \ + --values "${CHAOS_SCENARIOS_DIR}/values/${CHAOS_MESH_SCENARIOS_FILE}" \ + --set "global.targetNamespace=${NAMESPACE}" \ + --wait --timeout=5m + log "Chaos mesh scenarios installed, waiting for rules to be injected..." + + # Wait for all NetworkChaos experiments to have their rules injected. + # The AllInjected condition confirms iptables rules are active on every matched pod. 
+ CHAOS_WAIT_TIMEOUT=120 + CHAOS_WAITED=0 + while true; do + NOT_INJECTED=$(kubectl get networkchaos -n "${NAMESPACE}" -o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="AllInjected")].status}{"\n"}{end}' 2>/dev/null | grep -vc "True" || true) + if [[ "${NOT_INJECTED}" -eq 0 ]]; then + log "All chaos mesh rules injected" + break + fi + if [[ "${CHAOS_WAITED}" -ge "${CHAOS_WAIT_TIMEOUT}" ]]; then + log "WARNING: Timed out waiting for chaos mesh injection after ${CHAOS_WAIT_TIMEOUT}s (${NOT_INJECTED} experiments not yet injected)" + break + fi + sleep 5 + CHAOS_WAITED=$((CHAOS_WAITED + 5)) + done + log "Chaos mesh partition active — existing connections will break as packets are dropped" +fi + # Calculate total deployment time DEPLOY_END_TIME=$(date +%s) TOTAL_DEPLOY_TIME=$((DEPLOY_END_TIME - DEPLOY_START_TIME)) diff --git a/yarn-project/aztec-node/src/aztec-node/server.ts b/yarn-project/aztec-node/src/aztec-node/server.ts index 4ea7c746cb74..0f831bf4b767 100644 --- a/yarn-project/aztec-node/src/aztec-node/server.ts +++ b/yarn-project/aztec-node/src/aztec-node/server.ts @@ -866,8 +866,9 @@ export class AztecNodeService implements AztecNode, AztecNodeAdmin, Traceable { } await this.p2pClient!.sendTx(tx); - this.metrics.receivedTx(timer.ms(), true); - this.log.info(`Received tx ${txHash}`, { txHash }); + const duration = timer.ms(); + this.metrics.receivedTx(duration, true); + this.log.info(`Received tx ${txHash} in ${duration}ms`, { txHash }); } public async getTxReceipt(txHash: TxHash): Promise { diff --git a/yarn-project/end-to-end/src/spartan/n_tps.test.ts b/yarn-project/end-to-end/src/spartan/n_tps.test.ts index 5bbdb2a94136..27aeba4b54d6 100644 --- a/yarn-project/end-to-end/src/spartan/n_tps.test.ts +++ b/yarn-project/end-to-end/src/spartan/n_tps.test.ts @@ -33,6 +33,7 @@ import { getChartDir, getGitProjectRoot, getRPCEndpoint, + hasDeployedHelmRelease, installChaosMeshChart, setupEnvironment, startPortForwardForPrometeheus, @@ -226,6 +227,32 @@ 
describe('sustained N TPS test', () => { }); const spartanDir = `${getGitProjectRoot()}/spartan`; + // Skip chaos mesh installation if it was already deployed by deploy_network.sh + // (via CHAOS_MESH_SCENARIOS_FILE). That script installs the scenarios right after the Aztec + // infra is deployed, so the partition rules should already be in place before this test runs. + const alreadyDeployed = await hasDeployedHelmRelease(CHAOS_MESH_NAME, config.NAMESPACE); + if (alreadyDeployed) { + logger.info('Chaos mesh chart already deployed, skipping installation'); + } else { + logger.info('Installing chaos mesh chart', { + name: CHAOS_MESH_NAME, + namespace: config.NAMESPACE, + valuesFile: 'network-requirements.yaml', + }); + await installChaosMeshChart({ + logger, + targetNamespace: config.NAMESPACE, + instanceName: CHAOS_MESH_NAME, + valuesFile: 'network-requirements.yaml', + helmChartDir: getChartDir(spartanDir, 'aztec-chaos-scenarios'), + }); + logger.info('Chaos mesh installation complete'); + + logger.info('Waiting for network to stabilize after chaos mesh installation...'); + await sleep(30 * 1000); + logger.info('Network stabilization wait complete'); + } + const rpcEndpoint = await getRPCEndpoint(config.NAMESPACE); endpoints.push(rpcEndpoint); const rpcUrl = rpcEndpoint.url; @@ -285,24 +312,6 @@ describe('sustained N TPS test', () => { })); logger.info('Benchmark contract deployed', { address: benchmarkContract.address.toString() }); - logger.info('Installing chaos mesh chart', { - name: CHAOS_MESH_NAME, - namespace: config.NAMESPACE, - valuesFile: 'network-requirements.yaml', - }); - await installChaosMeshChart({ - logger, - targetNamespace: config.NAMESPACE, - instanceName: CHAOS_MESH_NAME, - valuesFile: 'network-requirements.yaml', - helmChartDir: getChartDir(spartanDir, 'aztec-chaos-scenarios'), - }); - logger.info('Chaos mesh installation complete'); - - logger.info('Waiting for network to stabilize after chaos mesh installation...'); - await sleep(30 * 1000); - logger.info('Network 
stabilization wait complete'); - logger.info(`Test setup complete`); }); @@ -328,7 +337,7 @@ describe('sustained N TPS test', () => { prototypeTxs.set(from.toString(), prototypeTx); } - const tx = await cloneTx(prototypeTx, priorytFee); + const tx = await cloneTx(prototypeTx, priorytFee, logger); return tx; }; @@ -345,15 +354,23 @@ describe('sustained N TPS test', () => { let lowValueTxs = 0; const lowValueSendTx = async (wallet: TestWallet) => { lowValueTxs++; - //const feeAmount = Number(randomBigInt(100n)) + 1; - //const feeAmount = 1; const feeAmount = Math.floor(lowValueTxs / 1000) + 1; const fee = new GasFees(0, feeAmount); - logger.info('Sending low value tx ' + lowValueTxs + ' with fee ' + feeAmount); + const t0 = performance.now(); const tx = await (config.REAL_VERIFIER ? submitProven(wallet, fee) : submitUnproven(wallet, fee)); + const t1 = performance.now(); const txHash = await tx.send({ wait: NO_WAIT }); + const t2 = performance.now(); + + logger.info('Low value tx sent', { + txNum: lowValueTxs, + feeAmount, + cloneMs: Math.round(t1 - t0), + sendMs: Math.round(t2 - t1), + totalMs: Math.round(t2 - t0), + }); return txHash.toString(); }; @@ -362,13 +379,23 @@ describe('sustained N TPS test', () => { highValueTxs++; const feeAmount = Number(randomBigInt(10n)) + 1000; const fee = new GasFees(0, feeAmount); - logger.info('Sending high value tx ' + highValueTxs + ' with fee ' + feeAmount); + const t0 = performance.now(); const tx = await (config.REAL_VERIFIER ? 
submitProven(wallet, fee) : submitUnproven(wallet, fee)); + const t1 = performance.now(); metrics.recordSentTx(tx, `high_value_${highValueTps}tps`); const txHash = await tx.send({ wait: NO_WAIT }); + const t2 = performance.now(); + + logger.info('High value tx sent', { + txNum: highValueTxs, + feeAmount, + cloneMs: Math.round(t1 - t0), + sendMs: Math.round(t2 - t1), + totalMs: Math.round(t2 - t0), + }); return txHash.toString(); }; @@ -514,9 +541,11 @@ function sendTxsAtTps( return txHashes; } -async function cloneTx(tx: ProvenTx, priorityFee: GasFees): Promise { - // Clone the transaction +async function cloneTx(tx: ProvenTx, priorityFee: GasFees, logger: Logger): Promise { + const t0 = performance.now(); const clonedTxData = Tx.clone(tx, false); + const t1 = performance.now(); + (clonedTxData.data.constants.txContext.gasSettings as any).maxPriorityFeesPerGas = priorityFee; if (clonedTxData.data.forRollup) { @@ -534,7 +563,17 @@ async function cloneTx(tx: ProvenTx, priorityFee: GasFees): Promise { clonedTxData.data.forPublic.nonRevertibleAccumulatedData.nullifiers[i] = Fr.random(); } } + const t2 = performance.now(); + const clonedTx = new ProvenTx((tx as any).node, clonedTxData, tx.offchainEffects, tx.stats); await clonedTx.recomputeHash(); + const t3 = performance.now(); + + logger.debug('cloneTx timing', { + cloneMs: Math.round(t1 - t0), + mutateMs: Math.round(t2 - t1), + rehashMs: Math.round(t3 - t2), + totalMs: Math.round(t3 - t0), + }); return clonedTx; } diff --git a/yarn-project/end-to-end/src/spartan/utils/index.ts b/yarn-project/end-to-end/src/spartan/utils/index.ts index b4ecc612825f..8f917cecdcdc 100644 --- a/yarn-project/end-to-end/src/spartan/utils/index.ts +++ b/yarn-project/end-to-end/src/spartan/utils/index.ts @@ -41,6 +41,9 @@ export { applyNetworkShaping, } from './chaos.js'; +// Helm +export { hasDeployedHelmRelease } from './helm.js'; + // Bot management export { restartBot, installTransferBot, uninstallTransferBot } from './bot.js';