From e426aa13e134779b377632752d3a1c89ee7784ed Mon Sep 17 00:00:00 2001 From: Pablo Deymonnaz Date: Fri, 6 Feb 2026 15:46:19 -0300 Subject: [PATCH 01/10] Add restart stall reproduction test using eth-docker Add a Python script and Makefile targets to reproduce the snap sync restart stall bug reported on Discord, where ethrex stalls downloading headers after a restart. The test uses eth-docker (with ethrex + Prysm) and runs in two phases: - Phase 1: Fresh snap sync from scratch, wait for completion and block progress - Phase 2: Stop only the execution client (keeping consensus + volumes), restart it, and monitor for header download stall The restart phase is repeated multiple times (default 3) since the stall is intermittent. Slack notifications are sent at each phase transition using the same webhook pattern as the existing multisync monitoring. The --configure flag auto-writes eth-docker's .env for ethrex + Prysm with the specified network and fee recipient address. --- tooling/sync/Makefile | 32 +- tooling/sync/restart_stall_test.py | 597 +++++++++++++++++++++++++++++ 2 files changed, 628 insertions(+), 1 deletion(-) create mode 100644 tooling/sync/restart_stall_test.py diff --git a/tooling/sync/Makefile b/tooling/sync/Makefile index 0343373b30d..9d9e400df4f 100644 --- a/tooling/sync/Makefile +++ b/tooling/sync/Makefile @@ -4,7 +4,8 @@ flamegraph-hoodi start-lighthouse start-ethrex backup-db start-mainnet-metrics-d start-sepolia-metrics-docker start-holesky-metrics-docker start-hoodi-metrics-docker \ start-metrics-docker tail-syncing-logs tail-metrics-logs copy_flamegraph import-with-metrics \ multisync-up multisync-down multisync-clean multisync-logs multisync-status \ -multisync-restart multisync-monitor multisync-run multisync-loop multisync-loop-auto +multisync-restart multisync-monitor multisync-run multisync-loop multisync-loop-auto \ +restart-stall-test restart-stall-test-skip-sync ETHREX_DIR ?= "../.." @@ -342,3 +343,32 @@ multisync-loop-auto: ## Continuous loop with auto-update: pull latest, build, an --build-profile "$(MULTISYNC_BUILD_PROFILE)" \ --image-tag "$(MULTISYNC_LOCAL_IMAGE)" \ --ethrex-dir "$(ETHREX_DIR)" + +# ============================================================================== +# Restart Stall Test (eth-docker) +# ============================================================================== +# Tests whether ethrex stalls on header download after restart. +# Uses eth-docker (must be cloned already) with ethrex + Prysm. +# Phase 1: Fresh snap sync (terminate + start, wait for completion) +# Phase 2: Stop only execution client, restart it, check for stall +# ============================================================================== + +RESTART_TEST_NETWORK ?= hoodi +RESTART_TEST_COUNT ?= 3 +ETH_DOCKER_DIR ?= $(HOME)/eth-docker + +restart-stall-test: ## Run restart stall test via eth-docker: configure, sync, then restart cycles. + python3 restart_stall_test.py \ + --eth-docker-dir $(ETH_DOCKER_DIR) \ + --network $(RESTART_TEST_NETWORK) \ + --restart-count $(RESTART_TEST_COUNT) \ + --configure \ + --ethrex-dir $(ETHREX_DIR) + +restart-stall-test-skip-sync: ## Run restart stall test, skip initial sync (node must be synced). + python3 restart_stall_test.py \ + --eth-docker-dir $(ETH_DOCKER_DIR) \ + --network $(RESTART_TEST_NETWORK) \ + --restart-count $(RESTART_TEST_COUNT) \ + --skip-phase1 \ + --ethrex-dir $(ETHREX_DIR) diff --git a/tooling/sync/restart_stall_test.py b/tooling/sync/restart_stall_test.py new file mode 100644 index 00000000000..e19c52aeff1 --- /dev/null +++ b/tooling/sync/restart_stall_test.py @@ -0,0 +1,597 @@ +#!/usr/bin/env python3 +"""Test for snap sync restart stall bug using eth-docker. + +Reproduces the issue where ethrex stalls downloading headers after a restart. +Assumes eth-docker is already cloned and configured with ethrex as the EL client. + +Flow: + Phase 1: Fresh snap sync (terminate + start, wait for completion) + Phase 2: Stop only execution client, restart it, monitor for stall + +Prerequisites: + - eth-docker cloned (default: ~/eth-docker) + - .env configured with ethrex (COMPOSE_FILE=lighthouse.yml:ethrex.yml, NETWORK=hoodi, etc.) + - Slack webhooks in eth-docker's .env or exported as env vars (optional) + +Usage: + python3 restart_stall_test.py --eth-docker-dir ~/eth-docker + python3 restart_stall_test.py --eth-docker-dir ~/eth-docker --restart-count 5 + python3 restart_stall_test.py --eth-docker-dir ~/eth-docker --skip-phase1 + python3 restart_stall_test.py --eth-docker-dir ~/eth-docker --no-slack +""" + +import argparse +import os +import socket +import subprocess +import sys +import time +from datetime import datetime +from pathlib import Path + +import requests + +# Timeouts (in seconds), configurable via env vars +SYNC_TIMEOUT = int(os.environ.get("SYNC_TIMEOUT", 8 * 60)) * 60 # default 8h +BLOCK_PROCESSING_DURATION = int(os.environ.get("BLOCK_PROCESSING_DURATION", 22 * 60)) # default 22m +RESTART_STALL_TIMEOUT = int(os.environ.get("RESTART_STALL_TIMEOUT", 15 * 60)) # default 15m +NODE_STARTUP_TIMEOUT = int(os.environ.get("NODE_STARTUP_TIMEOUT", 5 * 60)) # default 5m +CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 10)) + +LOGS_DIR = Path("./restart_stall_logs") + + +def configure_eth_docker(eth_docker_dir: str, network: str, fee_recipient: str = "", slack_success: str = "", slack_failed: str = ""): + """Write eth-docker .env configured for ethrex + Prysm. + + Copies default.env as base and overrides the key settings. + """ + default_env = os.path.join(eth_docker_dir, "default.env") + env_file = os.path.join(eth_docker_dir, ".env") + + if not os.path.isfile(default_env): + print(f"Error: default.env not found at {default_env}") + sys.exit(1) + + # Read default.env as base + with open(default_env) as f: + lines = f.readlines() + + # Settings to override + overrides = { + "COMPOSE_FILE": "prysm.yml:ethrex.yml", + "NETWORK": network, + "ETHREX_DOCKERFILE": "Dockerfile.binary", + "ETHREX_DOCKER_REPO": "ghcr.io/lambdaclass/ethrex", + "ETHREX_DOCKER_TAG": "latest", + } + if fee_recipient: + overrides["FEE_RECIPIENT"] = fee_recipient + if slack_success: + overrides["SLACK_WEBHOOK_URL_SUCCESS"] = slack_success + if slack_failed: + overrides["SLACK_WEBHOOK_URL_FAILED"] = slack_failed + + applied = set() + new_lines = [] + for line in lines: + stripped = line.strip() + # Match lines like KEY=value or #KEY=value + for key, value in overrides.items(): + if stripped.startswith(f"{key}=") or stripped.startswith(f"#{key}="): + line = f"{key}={value}\n" + applied.add(key) + break + new_lines.append(line) + + # Append any overrides that weren't found in default.env + for key, value in overrides.items(): + if key not in applied: + new_lines.append(f"{key}={value}\n") + + with open(env_file, "w") as f: + f.writelines(new_lines) + + print(f" Wrote {env_file}") + print(f" COMPOSE_FILE=prysm.yml:ethrex.yml") + print(f" NETWORK={network}") + if fee_recipient: + print(f" FEE_RECIPIENT={fee_recipient}") + print(f" ETHREX_DOCKER_REPO=ghcr.io/lambdaclass/ethrex") + print(f" ETHREX_DOCKER_TAG=latest") + + +def load_env_file(env_path: str): + """Load variables from an .env file into os.environ (without overriding existing).""" + if not os.path.exists(env_path): + return + with open(env_path) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + key, _, value = line.partition("=") + key, value = key.strip(), value.strip() + if key and key not in os.environ: + os.environ[key] = value + + +def fmt_time(secs: float) -> str: + secs = int(abs(secs)) + h, m, s = secs // 3600, (secs % 3600) // 60, secs % 60 + return " ".join(f"{v}{u}" for v, u in [(h, "h"), (m, "m"), (s, "s")] if v or (not h and not m)) + + +def git_info(cwd: str = None) -> tuple[str, str]: + try: + commit = subprocess.check_output( + ["git", "rev-parse", "--short", "HEAD"], stderr=subprocess.DEVNULL, cwd=cwd + ).decode().strip() + except Exception: + commit = "unknown" + try: + branch = subprocess.check_output( + ["git", "rev-parse", "--abbrev-ref", "HEAD"], stderr=subprocess.DEVNULL, cwd=cwd + ).decode().strip() + except Exception: + branch = "unknown" + return branch, commit + + +def rpc_call(url: str, method: str, params=None): + try: + payload = {"jsonrpc": "2.0", "method": method, "params": params or [], "id": 1} + resp = requests.post(url, json=payload, timeout=5) + return resp.json().get("result") + except Exception: + return None + + +def rpc_block_number(url: str): + result = rpc_call(url, "eth_blockNumber") + if result: + return int(result, 16) + return None + + +def ethd(eth_docker_dir: str, *args) -> subprocess.CompletedProcess: + """Run an ./ethd command in the eth-docker directory.""" + cmd = ["./ethd"] + list(args) + print(f" $ {' '.join(cmd)}") + return subprocess.run(cmd, cwd=eth_docker_dir, capture_output=True, text=True) + + +def docker_compose_in_ethd(eth_docker_dir: str, *args) -> subprocess.CompletedProcess: + """Run a docker compose command in the eth-docker directory. + + Uses eth-docker's .env for COMPOSE_FILE so the right yml files are picked up. + """ + cmd = ["docker", "compose"] + list(args) + print(f" $ {' '.join(cmd)}") + return subprocess.run(cmd, cwd=eth_docker_dir, capture_output=True, text=True) + + +def slack_notify(message: str, success: bool, details: str = "", ethrex_dir: str = None): + """Send a Slack notification using the configured webhooks.""" + url = os.environ.get("SLACK_WEBHOOK_URL_SUCCESS" if success else "SLACK_WEBHOOK_URL_FAILED") + if not url: + print(" [no slack webhook configured, skipping notification]") + return + + branch, commit = git_info(cwd=ethrex_dir) + hostname = socket.gethostname() + + blocks = [ + {"type": "header", "text": {"type": "plain_text", "text": message}}, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ( + f"*Host:* `{hostname}`\n" + f"*Branch:* `{branch}`\n" + f"*Commit:* \n" + f"*Test:* Restart stall reproduction (eth-docker)" + ), + }, + }, + ] + if details: + blocks.append({"type": "section", "text": {"type": "mrkdwn", "text": details}}) + + try: + requests.post(url, json={"blocks": blocks}, timeout=10) + except Exception: + pass + + +def save_ethd_logs(eth_docker_dir: str, run_dir: Path, suffix: str = ""): + """Save execution and consensus logs from eth-docker.""" + for service in ["execution", "consensus"]: + log_file = run_dir / f"{service}{suffix}.log" + try: + result = subprocess.run( + ["docker", "compose", "logs", "--no-color", service], + cwd=eth_docker_dir, capture_output=True, text=True, timeout=60, + ) + log_file.write_text(result.stdout + result.stderr) + print(f" Saved logs: {log_file}") + except Exception as e: + print(f" Failed to save {service} logs: {e}") + + +def wait_for_node(rpc_url: str, timeout: int) -> bool: + """Wait for the node to respond to RPC calls.""" + print(f" Waiting for node to respond at {rpc_url}...") + start = time.time() + while time.time() - start < timeout: + if rpc_call(rpc_url, "eth_blockNumber") is not None: + print(f" Node is up ({fmt_time(time.time() - start)} elapsed)") + return True + time.sleep(CHECK_INTERVAL) + print(f" Node did not respond within {fmt_time(timeout)}") + return False + + +def wait_for_sync(rpc_url: str, timeout: int) -> tuple[bool, float]: + """Wait for snap sync to complete. + + Returns (success, sync_time_seconds). + """ + print(f" Waiting for sync to complete (timeout: {fmt_time(timeout)})...") + start = time.time() + last_status_print = 0 + + while time.time() - start < timeout: + syncing = rpc_call(rpc_url, "eth_syncing") + elapsed = time.time() - start + + if syncing is False: + print(f" Sync completed in {fmt_time(elapsed)}") + return True, elapsed + + if syncing is None: + if time.time() - last_status_print > 60: + print(f" [{fmt_time(elapsed)}] Node not responding...") + last_status_print = time.time() + else: + if time.time() - last_status_print > 60: + block = rpc_block_number(rpc_url) + block_str = f"block {block}" if block else "unknown block" + print(f" [{fmt_time(elapsed)}] Still syncing... {block_str}") + last_status_print = time.time() + + time.sleep(CHECK_INTERVAL) + + return False, time.time() - start + + +def wait_for_block_progress(rpc_url: str, duration: int, stall_timeout: int) -> tuple[bool, int]: + """Wait for block progress after sync. + + Returns (success, blocks_processed). + """ + print(f" Monitoring block progress for {fmt_time(duration)} (stall timeout: {fmt_time(stall_timeout)})...") + start = time.time() + initial_block = rpc_block_number(rpc_url) or 0 + last_block = initial_block + last_block_time = time.time() + last_status_print = 0 + + while time.time() - start < duration: + block = rpc_block_number(rpc_url) + + if block is None: + if time.time() - last_block_time > stall_timeout: + print(f" Node stopped responding for {fmt_time(stall_timeout)}") + return False, last_block - initial_block + elif block > last_block: + last_block = block + last_block_time = time.time() + elif time.time() - last_block_time > stall_timeout: + print(f" Block stalled at {last_block} for {fmt_time(stall_timeout)}") + return False, last_block - initial_block + + if time.time() - last_status_print > 60: + elapsed = time.time() - start + blocks_done = last_block - initial_block + print(f" [{fmt_time(elapsed)}] Block {last_block} (+{blocks_done} since sync)") + last_status_print = time.time() + + time.sleep(CHECK_INTERVAL) + + blocks_processed = last_block - initial_block + if blocks_processed > 0: + return True, blocks_processed + return False, 0 + + +def monitor_restart_for_stall(rpc_url: str, timeout: int) -> tuple[str, str]: + """Monitor a restarted node for header download stall. + + Returns (result, details) where result is one of: + - "ok": Node synced/caught up within timeout + - "stall": Node appears stalled (not progressing) + - "unresponsive": Node never came back up + """ + print(f"\n Monitoring restart for stall (timeout: {fmt_time(timeout)})...") + start = time.time() + + # Wait for node to come back up + if not wait_for_node(rpc_url, NODE_STARTUP_TIMEOUT): + elapsed = time.time() - start + return "unresponsive", f"Node never responded after {fmt_time(elapsed)}" + + # Monitor: is it syncing? Is it making progress? + last_block = rpc_block_number(rpc_url) or 0 + last_progress_time = time.time() + last_status_print = 0 + syncing_reported = False + + while time.time() - start < timeout: + syncing = rpc_call(rpc_url, "eth_syncing") + block = rpc_block_number(rpc_url) + elapsed = time.time() - start + + if syncing is False: + print(f" Node caught up in {fmt_time(elapsed)} (block {block})") + return "ok", f"Caught up in {fmt_time(elapsed)}, block {block}" + + if syncing is not None and not syncing_reported: + print(f" Node is syncing (expected after restart)") + syncing_reported = True + + if block is not None: + if block > last_block: + last_block = block + last_progress_time = time.time() + elif time.time() - last_progress_time > RESTART_STALL_TIMEOUT: + stall_duration = fmt_time(time.time() - last_progress_time) + return "stall", f"Stalled at block {last_block} for {stall_duration}" + elif block is None and syncing is None: + if time.time() - last_progress_time > NODE_STARTUP_TIMEOUT: + return "unresponsive", f"Node stopped responding after {fmt_time(elapsed)}" + + if time.time() - last_status_print > 60: + stall_elapsed = fmt_time(time.time() - last_progress_time) + print(f" [{fmt_time(elapsed)}] Block {last_block}, last progress {stall_elapsed} ago, syncing={syncing is not False}") + last_status_print = time.time() + + time.sleep(CHECK_INTERVAL) + + return "stall", f"Still syncing after {fmt_time(timeout)}, stuck at block {last_block}" + + +def phase1_fresh_sync(eth_docker_dir: str, rpc_url: str) -> bool: + """Phase 1: Clean start via eth-docker and wait for sync completion.""" + print(f"\n{'='*60}") + print(f"PHASE 1: Fresh snap sync") + print(f"{'='*60}\n") + + # Terminate (removes volumes) and start fresh + print("Terminating existing containers and volumes...") + ethd(eth_docker_dir, "terminate") + time.sleep(5) + + print("Starting eth-docker...") + ethd(eth_docker_dir, "up") + time.sleep(30) + + # Wait for node to come up + if not wait_for_node(rpc_url, NODE_STARTUP_TIMEOUT): + print("FAILED: Node never came up") + return False + + # Wait for sync + synced, sync_time = wait_for_sync(rpc_url, SYNC_TIMEOUT) + if not synced: + print(f"FAILED: Sync timed out after {fmt_time(sync_time)}") + return False + + # Verify block progress + print(f"\n Sync complete. Verifying block progress...") + progress_ok, blocks = wait_for_block_progress(rpc_url, BLOCK_PROCESSING_DURATION, 10 * 60) + if not progress_ok: + print(f"FAILED: No block progress after sync (processed {blocks} blocks)") + return False + + print(f"\n Phase 1 SUCCESS: synced in {fmt_time(sync_time)}, processed +{blocks} blocks") + return True + + +def phase2_restart_test(eth_docker_dir: str, rpc_url: str, restart_num: int) -> tuple[str, str]: + """Phase 2: Stop only execution client, restart it, monitor for stall.""" + print(f"\n{'='*60}") + print(f"PHASE 2: Restart test #{restart_num}") + print(f"{'='*60}\n") + + # Stop only the execution client (keep consensus + volumes) + print("Stopping execution client (keeping consensus + volumes)...") + docker_compose_in_ethd(eth_docker_dir, "stop", "execution") + time.sleep(10) + + # Restart execution client + print("Restarting execution client...") + docker_compose_in_ethd(eth_docker_dir, "start", "execution") + time.sleep(5) + + # Monitor for stall + result, details = monitor_restart_for_stall(rpc_url, RESTART_STALL_TIMEOUT * 2) + + status_str = { + "ok": "PASS", + "stall": "STALL DETECTED", + "unresponsive": "NODE UNRESPONSIVE", + }.get(result, result.upper()) + + print(f"\n Restart #{restart_num} result: {status_str} - {details}") + return result, details + + +def main(): + parser = argparse.ArgumentParser(description="Test snap sync restart stall bug (eth-docker)") + parser.add_argument("--eth-docker-dir", default=os.path.expanduser("~/eth-docker"), + help="Path to eth-docker clone (default: ~/eth-docker)") + parser.add_argument("--network", default="hoodi", + help="Ethereum network (default: hoodi)") + parser.add_argument("--configure", action="store_true", + help="Write eth-docker .env for ethrex+Prysm before starting") + parser.add_argument("--fee-recipient", default="", + help="Ethereum address for EL rewards (FEE_RECIPIENT in eth-docker)") + parser.add_argument("--rpc-port", type=int, default=8545, + help="RPC port for ethrex (default: 8545)") + parser.add_argument("--restart-count", type=int, default=3, + help="Number of restart cycles to test (default: 3)") + parser.add_argument("--no-slack", action="store_true", + help="Disable Slack notifications") + parser.add_argument("--skip-phase1", action="store_true", + help="Skip fresh sync (assume node is already synced)") + parser.add_argument("--ethrex-dir", default=None, + help="Path to ethrex repo (for git info in Slack). Auto-detected if not set.") + args = parser.parse_args() + + eth_docker_dir = os.path.abspath(args.eth_docker_dir) + rpc_url = f"http://localhost:{args.rpc_port}" + run_id = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Validate eth-docker directory + if not os.path.isfile(os.path.join(eth_docker_dir, "ethd")): + print(f"Error: eth-docker not found at {eth_docker_dir}") + print("Clone it with: git clone https://github.com/ethstaker/eth-docker.git ~/eth-docker") + sys.exit(1) + + # Load our local .env first (for Slack webhooks) + load_env_file(".env") + + # Configure eth-docker .env if requested + if args.configure: + print("Configuring eth-docker for ethrex + Prysm...") + configure_eth_docker( + eth_docker_dir, + network=args.network, + fee_recipient=args.fee_recipient, + slack_success=os.environ.get("SLACK_WEBHOOK_URL_SUCCESS", ""), + slack_failed=os.environ.get("SLACK_WEBHOOK_URL_FAILED", ""), + ) + + env_file = os.path.join(eth_docker_dir, ".env") + if not os.path.isfile(env_file): + print(f"Error: .env not found at {env_file}") + print("Run with --configure, or configure manually: cd ~/eth-docker && ./ethd config") + sys.exit(1) + + # Load eth-docker .env for network info and any extra vars + load_env_file(env_file) + + network = os.environ.get("NETWORK", args.network) + ethrex_dir = args.ethrex_dir or os.environ.get("ETHREX_DIR") + branch, commit = git_info(cwd=ethrex_dir) + + # Create logs directory + run_dir = LOGS_DIR / f"run_{run_id}" + run_dir.mkdir(parents=True, exist_ok=True) + + print(f"Restart Stall Test (eth-docker)") + print(f" eth-docker: {eth_docker_dir}") + print(f" Network: {network}") + print(f" RPC: {rpc_url}") + print(f" Branch: {branch}") + print(f" Commit: {commit}") + print(f" Restarts: {args.restart_count}") + print(f" Logs: {run_dir}") + print() + + # Phase 1: Fresh sync + if not args.skip_phase1: + sync_ok = phase1_fresh_sync(eth_docker_dir, rpc_url) + save_ethd_logs(eth_docker_dir, run_dir, suffix="_phase1") + + if not sync_ok: + if not args.no_slack: + slack_notify( + "Restart Stall Test - Phase 1 FAILED", + success=False, + details=f"*Network:* `{network}`\nFresh sync failed. Cannot proceed to restart test.", + ethrex_dir=ethrex_dir, + ) + sys.exit(1) + + if not args.no_slack: + slack_notify( + "Restart Stall Test - Phase 1 Complete", + success=True, + details=f"*Network:* `{network}`\nFresh sync completed. Starting restart tests...", + ethrex_dir=ethrex_dir, + ) + + # Phase 2: Restart cycles + results = [] + for i in range(1, args.restart_count + 1): + result, details = phase2_restart_test(eth_docker_dir, rpc_url, i) + results.append((i, result, details)) + + save_ethd_logs(eth_docker_dir, run_dir, suffix=f"_restart{i}") + + if result != "ok" and not args.no_slack: + slack_notify( + f"Restart Stall Test - STALL on restart #{i}", + success=False, + details=( + f"*Network:* `{network}`\n" + f"*Restart:* #{i} of {args.restart_count}\n" + f"*Result:* {details}\n" + f"*Logs:* `{run_dir}`\n\n" + "Containers are still running for inspection." + ), + ethrex_dir=ethrex_dir, + ) + + # Final summary + stalls = [(i, r, d) for i, r, d in results if r != "ok"] + all_ok = len(stalls) == 0 + + print(f"\n{'='*60}") + print(f"FINAL RESULTS") + print(f"{'='*60}") + for i, result, details in results: + status = "PASS" if result == "ok" else "FAIL" + print(f" Restart #{i}: {status} - {details}") + print(f"\n Overall: {'ALL PASSED' if all_ok else f'{len(stalls)}/{len(results)} STALLED'}") + + # Save summary + summary_lines = [ + f"Restart Stall Test - {run_id}", + f"Network: {network}", + f"Branch: {branch}", + f"Commit: {commit}", + f"Host: {socket.gethostname()}", + f"eth-docker: {eth_docker_dir}", + "", + ] + for i, result, details in results: + summary_lines.append(f"Restart #{i}: {result} - {details}") + summary_lines.append(f"\nOverall: {'ALL PASSED' if all_ok else f'{len(stalls)}/{len(results)} STALLED'}") + (run_dir / "summary.txt").write_text("\n".join(summary_lines)) + + # Final Slack notification + if not args.no_slack: + result_lines = "\n".join( + f"{'PASS' if r == 'ok' else 'FAIL'} Restart #{i}: {d}" for i, r, d in results + ) + slack_notify( + f"Restart Stall Test - {'ALL PASSED' if all_ok else 'STALL DETECTED'}", + success=all_ok, + details=( + f"*Network:* `{network}`\n" + f"*Restarts:* {args.restart_count}\n" + f"*Stalls:* {len(stalls)}/{len(results)}\n\n" + f"```\n{result_lines}\n```\n" + f"*Logs:* `{run_dir}`" + ), + ethrex_dir=ethrex_dir, + ) + + sys.exit(0 if all_ok else 1) + + +if __name__ == "__main__": + main() From ccf3005c98c34c085fa5adcd35730b5ea0d22588 Mon Sep 17 00:00:00 2001 From: Pablo Deymonnaz Date: Fri, 6 Feb 2026 15:55:00 -0300 Subject: [PATCH 02/10] Use docker compose directly instead of ./ethd to avoid interactive prompts The ./ethd terminate command has an interactive Yes/No confirmation that blocks the script when running non-interactively in tmux. Replace it with docker compose down -v and docker compose up -d which work without prompts. --- tooling/sync/restart_stall_test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tooling/sync/restart_stall_test.py b/tooling/sync/restart_stall_test.py index e19c52aeff1..1bd2ff598e8 100644 --- a/tooling/sync/restart_stall_test.py +++ b/tooling/sync/restart_stall_test.py @@ -368,12 +368,13 @@ def phase1_fresh_sync(eth_docker_dir: str, rpc_url: str) -> bool: print(f"{'='*60}\n") # Terminate (removes volumes) and start fresh - print("Terminating existing containers and volumes...") - ethd(eth_docker_dir, "terminate") + # Use docker compose directly to avoid interactive prompts from ./ethd + print("Stopping and removing containers + volumes...") + docker_compose_in_ethd(eth_docker_dir, "down", "-v") time.sleep(5) print("Starting eth-docker...") - ethd(eth_docker_dir, "up") + docker_compose_in_ethd(eth_docker_dir, "up", "-d") time.sleep(30) # Wait for node to come up From c1317fbf10cd69dce51fe047a1639f470f4a1058 Mon Sep 17 00:00:00 2001 From: Pablo Deymonnaz Date: Fri, 6 Feb 2026 15:58:55 -0300 Subject: [PATCH 03/10] Add el-shared.yml to COMPOSE_FILE to expose RPC port to host eth-docker doesn't publish the EL RPC port to the host by default. Adding el-shared.yml maps port 8545 to localhost so the monitoring script can poll eth_syncing and eth_blockNumber via RPC. --- tooling/sync/restart_stall_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tooling/sync/restart_stall_test.py b/tooling/sync/restart_stall_test.py index 1bd2ff598e8..88332cabc9c 100644 --- a/tooling/sync/restart_stall_test.py +++ b/tooling/sync/restart_stall_test.py @@ -59,7 +59,7 @@ def configure_eth_docker(eth_docker_dir: str, network: str, fee_recipient: str = # Settings to override overrides = { - "COMPOSE_FILE": "prysm.yml:ethrex.yml", + "COMPOSE_FILE": "prysm.yml:ethrex.yml:el-shared.yml", "NETWORK": network, "ETHREX_DOCKERFILE": "Dockerfile.binary", "ETHREX_DOCKER_REPO": "ghcr.io/lambdaclass/ethrex", @@ -93,7 +93,7 @@ def configure_eth_docker(eth_docker_dir: str, network: str, fee_recipient: str = f.writelines(new_lines) print(f" Wrote {env_file}") - print(f" COMPOSE_FILE=prysm.yml:ethrex.yml") + print(f" COMPOSE_FILE=prysm.yml:ethrex.yml:el-shared.yml") print(f" NETWORK={network}") if fee_recipient: print(f" FEE_RECIPIENT={fee_recipient}") From 0e839929d49bb4ac184cb3654a66682f1badd779 Mon Sep 17 00:00:00 2001 From: Pablo Deymonnaz Date: Fri, 6 Feb 2026 16:01:51 -0300 Subject: [PATCH 04/10] Add fee-recipient and PYTHONUNBUFFERED to Makefile restart-stall targets Pass PYTHONUNBUFFERED=1 so output appears immediately when piped to tee in tmux. Add RESTART_TEST_FEE_RECIPIENT variable for the Ethereum address. --- tooling/sync/Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tooling/sync/Makefile b/tooling/sync/Makefile index 9d9e400df4f..4fd38a79c50 100644 --- a/tooling/sync/Makefile +++ b/tooling/sync/Makefile @@ -355,20 +355,23 @@ multisync-loop-auto: ## Continuous loop with auto-update: pull latest, build, an RESTART_TEST_NETWORK ?= hoodi RESTART_TEST_COUNT ?= 3 +RESTART_TEST_FEE_RECIPIENT ?= ETH_DOCKER_DIR ?= $(HOME)/eth-docker restart-stall-test: ## Run restart stall test via eth-docker: configure, sync, then restart cycles. - python3 restart_stall_test.py \ + PYTHONUNBUFFERED=1 python3 restart_stall_test.py \ --eth-docker-dir $(ETH_DOCKER_DIR) \ --network $(RESTART_TEST_NETWORK) \ --restart-count $(RESTART_TEST_COUNT) \ --configure \ + $(if $(RESTART_TEST_FEE_RECIPIENT),--fee-recipient $(RESTART_TEST_FEE_RECIPIENT)) \ --ethrex-dir $(ETHREX_DIR) restart-stall-test-skip-sync: ## Run restart stall test, skip initial sync (node must be synced). - python3 restart_stall_test.py \ + PYTHONUNBUFFERED=1 python3 restart_stall_test.py \ --eth-docker-dir $(ETH_DOCKER_DIR) \ --network $(RESTART_TEST_NETWORK) \ --restart-count $(RESTART_TEST_COUNT) \ --skip-phase1 \ + $(if $(RESTART_TEST_FEE_RECIPIENT),--fee-recipient $(RESTART_TEST_FEE_RECIPIENT)) \ --ethrex-dir $(ETHREX_DIR) From 8a12e3e030bddc777a3ad751aca2912ddedc249f Mon Sep 17 00:00:00 2001 From: Pablo Deymonnaz Date: Fri, 6 Feb 2026 16:12:20 -0300 Subject: [PATCH 05/10] Address Copilot review feedback on restart stall test - Fix docstring to say prysm.yml (not lighthouse.yml) matching actual config - Anchor LOGS_DIR to script directory instead of cwd - Standardize SYNC_TIMEOUT to seconds (was minutes * 60, now direct seconds) - Extract BLOCK_STALL_TIMEOUT constant (was hard-coded 10*60) - Add check parameter to docker_compose_in_ethd, fail fast on non-zero exit - Retry initial block number fetch in wait_for_block_progress instead of or 0 - Pass stall_timeout as parameter to monitor_restart_for_stall instead of using global RESTART_STALL_TIMEOUT, fixing the mismatch between function timeout and stall detection threshold - Distinguish "timeout" (still progressing) from "stall" (no progress) in monitor_restart_for_stall return values - Log Slack notification failures instead of bare except pass - Use docker_compose_in_ethd for save_ethd_logs to ensure correct cwd/context --- tooling/sync/restart_stall_test.py | 91 +++++++++++++++++++----------- 1 file changed, 57 insertions(+), 34 deletions(-) diff --git a/tooling/sync/restart_stall_test.py b/tooling/sync/restart_stall_test.py index 88332cabc9c..3d71cf32730 100644 --- a/tooling/sync/restart_stall_test.py +++ b/tooling/sync/restart_stall_test.py @@ -10,7 +10,7 @@ Prerequisites: - eth-docker cloned (default: ~/eth-docker) - - .env configured with ethrex (COMPOSE_FILE=lighthouse.yml:ethrex.yml, NETWORK=hoodi, etc.) + - .env configured with ethrex (COMPOSE_FILE=prysm.yml:ethrex.yml:el-shared.yml, NETWORK=hoodi, etc.) - Slack webhooks in eth-docker's .env or exported as env vars (optional) Usage: @@ -31,14 +31,15 @@ import requests -# Timeouts (in seconds), configurable via env vars -SYNC_TIMEOUT = int(os.environ.get("SYNC_TIMEOUT", 8 * 60)) * 60 # default 8h +# Timeouts (all in seconds), configurable via env vars +SYNC_TIMEOUT = int(os.environ.get("SYNC_TIMEOUT", 8 * 60 * 60)) # default 8h BLOCK_PROCESSING_DURATION = int(os.environ.get("BLOCK_PROCESSING_DURATION", 22 * 60)) # default 22m +BLOCK_STALL_TIMEOUT = int(os.environ.get("BLOCK_STALL_TIMEOUT", 10 * 60)) # default 10m RESTART_STALL_TIMEOUT = int(os.environ.get("RESTART_STALL_TIMEOUT", 15 * 60)) # default 15m NODE_STARTUP_TIMEOUT = int(os.environ.get("NODE_STARTUP_TIMEOUT", 5 * 60)) # default 5m CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 10)) -LOGS_DIR = Path("./restart_stall_logs") +LOGS_DIR = Path(__file__).resolve().parent / "restart_stall_logs" def configure_eth_docker(eth_docker_dir: str, network: str, fee_recipient: str = "", slack_success: str = "", slack_failed: str = ""): @@ -153,21 +154,21 @@ def rpc_block_number(url: str): return None -def ethd(eth_docker_dir: str, *args) -> subprocess.CompletedProcess: - """Run an ./ethd command in the eth-docker directory.""" - cmd = ["./ethd"] + list(args) - print(f" $ {' '.join(cmd)}") - return subprocess.run(cmd, cwd=eth_docker_dir, capture_output=True, text=True) - - -def docker_compose_in_ethd(eth_docker_dir: str, *args) -> subprocess.CompletedProcess: +def docker_compose_in_ethd(eth_docker_dir: str, *args, check: bool = False) -> subprocess.CompletedProcess: """Run a docker compose command in the eth-docker directory. Uses eth-docker's .env for COMPOSE_FILE so the right yml files are picked up. + If check=True, raises on non-zero exit code. """ cmd = ["docker", "compose"] + list(args) print(f" $ {' '.join(cmd)}") - return subprocess.run(cmd, cwd=eth_docker_dir, capture_output=True, text=True) + result = subprocess.run(cmd, cwd=eth_docker_dir, capture_output=True, text=True) + if check and result.returncode != 0: + print(f" FAILED (exit code {result.returncode})") + if result.stderr: + print(f" stderr: {result.stderr.strip()}") + raise RuntimeError(f"docker compose {' '.join(args)} failed with exit code {result.returncode}") + return result def slack_notify(message: str, success: bool, details: str = "", ethrex_dir: str = None): @@ -200,8 +201,8 @@ def slack_notify(message: str, success: bool, details: str = "", ethrex_dir: str try: requests.post(url, json={"blocks": blocks}, timeout=10) - except Exception: - pass + except Exception as e: + print(f" Failed to send Slack notification: {e}") def save_ethd_logs(eth_docker_dir: str, run_dir: Path, suffix: str = ""): @@ -209,10 +210,7 @@ def save_ethd_logs(eth_docker_dir: str, run_dir: Path, suffix: str = ""): for service in ["execution", "consensus"]: log_file = run_dir / f"{service}{suffix}.log" try: - result = subprocess.run( - ["docker", "compose", "logs", "--no-color", service], - cwd=eth_docker_dir, capture_output=True, text=True, timeout=60, - ) + result = docker_compose_in_ethd(eth_docker_dir, "logs", "--no-color", service) log_file.write_text(result.stdout + result.stderr) print(f" Saved logs: {log_file}") except Exception as e: @@ -272,7 +270,17 @@ def wait_for_block_progress(rpc_url: str, duration: int, stall_timeout: int) -> """ print(f" Monitoring block progress for {fmt_time(duration)} (stall timeout: {fmt_time(stall_timeout)})...") start = time.time() - initial_block = rpc_block_number(rpc_url) or 0 + + # Retry to get a valid initial block number + initial_block = rpc_block_number(rpc_url) + retry_start = time.time() + while initial_block is None and time.time() - retry_start < stall_timeout: + time.sleep(CHECK_INTERVAL) + initial_block = rpc_block_number(rpc_url) + if initial_block is None: + print(f" Failed to fetch initial block number; node not responding for {fmt_time(stall_timeout)}") + return False, 0 + last_block = initial_block last_block_time = time.time() last_status_print = 0 @@ -305,15 +313,21 @@ def wait_for_block_progress(rpc_url: str, duration: int, stall_timeout: int) -> return False, 0 -def monitor_restart_for_stall(rpc_url: str, timeout: int) -> tuple[str, str]: +def monitor_restart_for_stall(rpc_url: str, timeout: int, stall_timeout: int) -> tuple[str, str]: """Monitor a restarted node for header download stall. + Args: + rpc_url: RPC endpoint to poll. + timeout: Overall time limit for monitoring. + stall_timeout: Time without block progress to declare a stall. + Returns (result, details) where result is one of: - "ok": Node synced/caught up within timeout - - "stall": Node appears stalled (not progressing) + - "stall": Node appears stalled (no block progress for stall_timeout) + - "timeout": Overall timeout reached but node was still making progress - "unresponsive": Node never came back up """ - print(f"\n Monitoring restart for stall (timeout: {fmt_time(timeout)})...") + print(f"\n Monitoring restart for stall (timeout: {fmt_time(timeout)}, stall: {fmt_time(stall_timeout)})...") start = time.time() # Wait for node to come back up @@ -344,7 +358,7 @@ def monitor_restart_for_stall(rpc_url: str, timeout: int) -> tuple[str, str]: if block > last_block: last_block = block last_progress_time = time.time() - elif time.time() - last_progress_time > RESTART_STALL_TIMEOUT: + elif time.time() - last_progress_time > stall_timeout: stall_duration = fmt_time(time.time() - last_progress_time) return "stall", f"Stalled at block {last_block} for {stall_duration}" elif block is None and syncing is None: @@ -358,7 +372,11 @@ def monitor_restart_for_stall(rpc_url: str, timeout: int) -> tuple[str, str]: time.sleep(CHECK_INTERVAL) - return "stall", f"Still syncing after {fmt_time(timeout)}, stuck at block {last_block}" + # Distinguish timeout-while-progressing from true stall + stall_elapsed = time.time() - last_progress_time + if stall_elapsed <= CHECK_INTERVAL * 2: + return "timeout", f"Timed out after {fmt_time(timeout)} while still making progress (block {last_block})" + return "stall", f"No progress for {fmt_time(stall_elapsed)} (stuck at block {last_block})" def phase1_fresh_sync(eth_docker_dir: str, rpc_url: str) -> bool: @@ -374,7 +392,7 @@ def phase1_fresh_sync(eth_docker_dir: str, rpc_url: str) -> bool: time.sleep(5) print("Starting eth-docker...") - docker_compose_in_ethd(eth_docker_dir, "up", "-d") + docker_compose_in_ethd(eth_docker_dir, "up", "-d", check=True) time.sleep(30) # Wait for node to come up @@ -390,7 +408,7 @@ def phase1_fresh_sync(eth_docker_dir: str, rpc_url: str) -> bool: # Verify block progress print(f"\n Sync complete. Verifying block progress...") - progress_ok, blocks = wait_for_block_progress(rpc_url, BLOCK_PROCESSING_DURATION, 10 * 60) + progress_ok, blocks = wait_for_block_progress(rpc_url, BLOCK_PROCESSING_DURATION, BLOCK_STALL_TIMEOUT) if not progress_ok: print(f"FAILED: No block progress after sync (processed {blocks} blocks)") return False @@ -407,20 +425,25 @@ def phase2_restart_test(eth_docker_dir: str, rpc_url: str, restart_num: int) -> # Stop only the execution client (keep consensus + volumes) print("Stopping execution client (keeping consensus + volumes)...") - docker_compose_in_ethd(eth_docker_dir, "stop", "execution") + docker_compose_in_ethd(eth_docker_dir, "stop", "execution", check=True) time.sleep(10) # Restart execution client print("Restarting execution client...") - docker_compose_in_ethd(eth_docker_dir, "start", "execution") + docker_compose_in_ethd(eth_docker_dir, "start", "execution", check=True) time.sleep(5) # Monitor for stall - result, details = monitor_restart_for_stall(rpc_url, RESTART_STALL_TIMEOUT * 2) + result, details = monitor_restart_for_stall( + rpc_url, + timeout=RESTART_STALL_TIMEOUT * 2, + stall_timeout=RESTART_STALL_TIMEOUT, + ) status_str = { "ok": "PASS", "stall": "STALL DETECTED", + "timeout": "TIMEOUT (still progressing)", "unresponsive": "NODE UNRESPONSIVE", }.get(result, result.upper()) @@ -532,7 +555,7 @@ def main(): save_ethd_logs(eth_docker_dir, run_dir, suffix=f"_restart{i}") - if result != "ok" and not args.no_slack: + if result not in ("ok", "timeout") and not args.no_slack: slack_notify( f"Restart Stall Test - STALL on restart #{i}", success=False, @@ -547,14 +570,14 @@ def main(): ) # Final summary - stalls = [(i, r, d) for i, r, d in results if r != "ok"] + stalls = [(i, r, d) for i, r, d in results if r not in ("ok", "timeout")] all_ok = len(stalls) == 0 print(f"\n{'='*60}") print(f"FINAL RESULTS") print(f"{'='*60}") for i, result, details in results: - status = "PASS" if result == "ok" else "FAIL" + status = "PASS" if result == "ok" else ("TIMEOUT" if result == "timeout" else "FAIL") print(f" Restart #{i}: {status} - {details}") print(f"\n Overall: {'ALL PASSED' if all_ok else f'{len(stalls)}/{len(results)} STALLED'}") @@ -576,7 +599,7 @@ def main(): # Final Slack notification if not args.no_slack: result_lines = "\n".join( - f"{'PASS' if r == 'ok' else 'FAIL'} Restart #{i}: {d}" for i, r, d in results + f"{'PASS' if r == 'ok' else ('TIMEOUT' if r == 'timeout' else 'FAIL')} Restart #{i}: {d}" for i, r, d in results ) slack_notify( f"Restart Stall Test - {'ALL PASSED' if all_ok else 'STALL DETECTED'}", From 29c2880bbfd25c6d1c7e6c33562af2a43a60baeb Mon Sep 17 00:00:00 2001 From: Pablo Deymonnaz Date: Fri, 6 Feb 2026 17:31:50 -0300 Subject: [PATCH 06/10] Add CHECKPOINT_SYNC_URL to eth-docker configuration based on network. The configure_eth_docker function was not setting CHECKPOINT_SYNC_URL, causing Prysm to use the default.env value (hoodi) even when NETWORK=mainnet, resulting in a fatal fork mismatch and crash loop. --- tooling/sync/restart_stall_test.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tooling/sync/restart_stall_test.py b/tooling/sync/restart_stall_test.py index 3d71cf32730..5ba61a4d833 100644 --- a/tooling/sync/restart_stall_test.py +++ b/tooling/sync/restart_stall_test.py @@ -58,6 +58,15 @@ def configure_eth_docker(eth_docker_dir: str, network: str, fee_recipient: str = with open(default_env) as f: lines = f.readlines() + # Map network to checkpoint sync URL + checkpoint_urls = { + "mainnet": "https://mainnet.checkpoint.sigp.io", + "hoodi": "https://hoodi.checkpoint.sigp.io", + "holesky": "https://holesky.checkpoint.sigp.io", + "sepolia": "https://sepolia.checkpoint.sigp.io", + } + checkpoint_url = checkpoint_urls.get(network, "") + # Settings to override overrides = { "COMPOSE_FILE": "prysm.yml:ethrex.yml:el-shared.yml", @@ -66,6 +75,8 @@ def configure_eth_docker(eth_docker_dir: str, network: str, fee_recipient: str = "ETHREX_DOCKER_REPO": "ghcr.io/lambdaclass/ethrex", "ETHREX_DOCKER_TAG": "latest", } + if checkpoint_url: + overrides["CHECKPOINT_SYNC_URL"] = checkpoint_url if fee_recipient: overrides["FEE_RECIPIENT"] = fee_recipient if slack_success: @@ -96,6 +107,8 @@ def configure_eth_docker(eth_docker_dir: str, network: str, fee_recipient: str = print(f" Wrote {env_file}") print(f" COMPOSE_FILE=prysm.yml:ethrex.yml:el-shared.yml") print(f" NETWORK={network}") + if checkpoint_url: + print(f" CHECKPOINT_SYNC_URL={checkpoint_url}") if fee_recipient: print(f" FEE_RECIPIENT={fee_recipient}") print(f" ETHREX_DOCKER_REPO=ghcr.io/lambdaclass/ethrex") From 192ee9e4a2a16c8c51e8fd94b3b5344ea7438321 Mon Sep 17 00:00:00 2001 From: Pablo Deymonnaz Date: Mon, 9 Feb 2026 16:16:24 -0300 Subject: [PATCH 07/10] Resolve .env path relative to script directory instead of CWD so Slack webhooks are loaded regardless of where the script is launched from. --- tooling/sync/restart_stall_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tooling/sync/restart_stall_test.py b/tooling/sync/restart_stall_test.py index 5ba61a4d833..02bdc0b653e 100644 --- a/tooling/sync/restart_stall_test.py +++ b/tooling/sync/restart_stall_test.py @@ -39,7 +39,8 @@ NODE_STARTUP_TIMEOUT = int(os.environ.get("NODE_STARTUP_TIMEOUT", 5 * 60)) # default 5m CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 10)) -LOGS_DIR = Path(__file__).resolve().parent / "restart_stall_logs" +SCRIPT_DIR = Path(__file__).resolve().parent +LOGS_DIR = SCRIPT_DIR / "restart_stall_logs" def configure_eth_docker(eth_docker_dir: str, network: str, fee_recipient: str = "", slack_success: str = "", slack_failed: str = ""): @@ -497,7 +498,7 @@ def main(): sys.exit(1) # Load our local .env first (for Slack webhooks) - load_env_file(".env") + load_env_file(str(SCRIPT_DIR / ".env")) # Configure eth-docker .env if requested if args.configure: From d06fbbb348e240885877dcbbc6423d5c5ef64058 Mon Sep 17 00:00:00 2001 From: Pablo Deymonnaz Date: Mon, 9 Feb 2026 16:28:41 -0300 Subject: [PATCH 08/10] Add --wipe-el-data flag to restart stall test so each Phase 2 cycle can optionally wipe all data volumes (EL, consensus, validator) and force a fresh snap sync from scratch. Includes wipe_data_volumes() helper that removes containers and volumes while preserving JWT, and a restart-stall-test-wipe Makefile target. --- tooling/sync/Makefile | 12 ++- tooling/sync/restart_stall_test.py | 121 ++++++++++++++++++++++------- 2 files changed, 104 insertions(+), 29 deletions(-) diff --git a/tooling/sync/Makefile b/tooling/sync/Makefile index 4fd38a79c50..144f46fff31 100644 --- a/tooling/sync/Makefile +++ b/tooling/sync/Makefile @@ -5,7 +5,7 @@ start-sepolia-metrics-docker start-holesky-metrics-docker start-hoodi-metrics-do start-metrics-docker tail-syncing-logs tail-metrics-logs copy_flamegraph import-with-metrics \ multisync-up multisync-down multisync-clean multisync-logs multisync-status \ multisync-restart multisync-monitor multisync-run multisync-loop multisync-loop-auto \ -restart-stall-test restart-stall-test-skip-sync +restart-stall-test restart-stall-test-skip-sync restart-stall-test-wipe ETHREX_DIR ?= "../.." @@ -375,3 +375,13 @@ restart-stall-test-skip-sync: ## Run restart stall test, skip initial sync (node --skip-phase1 \ $(if $(RESTART_TEST_FEE_RECIPIENT),--fee-recipient $(RESTART_TEST_FEE_RECIPIENT)) \ --ethrex-dir $(ETHREX_DIR) + +restart-stall-test-wipe: ## Run restart stall test with data wipe (full snap sync each cycle). + PYTHONUNBUFFERED=1 python3 restart_stall_test.py \ + --eth-docker-dir $(ETH_DOCKER_DIR) \ + --network $(RESTART_TEST_NETWORK) \ + --restart-count $(RESTART_TEST_COUNT) \ + --configure \ + --wipe-el-data \ + $(if $(RESTART_TEST_FEE_RECIPIENT),--fee-recipient $(RESTART_TEST_FEE_RECIPIENT)) \ + --ethrex-dir $(ETHREX_DIR) diff --git a/tooling/sync/restart_stall_test.py b/tooling/sync/restart_stall_test.py index 02bdc0b653e..23888646d4f 100644 --- a/tooling/sync/restart_stall_test.py +++ b/tooling/sync/restart_stall_test.py @@ -185,6 +185,31 @@ def docker_compose_in_ethd(eth_docker_dir: str, *args, check: bool = False) -> s return result +def wipe_data_volumes(eth_docker_dir: str): + """Remove and recreate containers and their data volumes for a fresh start. + + Stops and removes the execution and consensus containers, then removes all + data volumes (EL, consensus, validator) while preserving the JWT secret. + """ + docker_compose_in_ethd(eth_docker_dir, "rm", "-f", "-s", "execution", "consensus", check=True) + + project = os.path.basename(eth_docker_dir) + volumes = [ + f"{project}_ethrex-el-data", + f"{project}_prysmconsensus-data", + f"{project}_prysmvalidator-data", + ] + for volume in volumes: + print(f" Removing volume {volume}...") + result = subprocess.run( + ["docker", "volume", "rm", volume], + capture_output=True, text=True, + ) + if result.returncode != 0: + # Volume may not exist yet on first run; warn but don't fail + print(f" Warning: could not remove {volume}: {result.stderr.strip()}") + + def slack_notify(message: str, success: bool, details: str = "", ethrex_dir: str = None): """Send a Slack notification using the configured webhooks.""" url = os.environ.get("SLACK_WEBHOOK_URL_SUCCESS" if success else "SLACK_WEBHOOK_URL_FAILED") @@ -431,38 +456,75 @@ def phase1_fresh_sync(eth_docker_dir: str, rpc_url: str) -> bool: return True -def phase2_restart_test(eth_docker_dir: str, rpc_url: str, restart_num: int) -> tuple[str, str]: - """Phase 2: Stop only execution client, restart it, monitor for stall.""" +def phase2_restart_test(eth_docker_dir: str, rpc_url: str, restart_num: int, wipe_data: bool = False) -> tuple[str, str]: + """Phase 2: Stop execution client, restart it, monitor for stall. + + When wipe_data=True, removes containers and all data volumes (EL + + consensus + validator) before restarting, forcing a full snap sync from + scratch. + """ + mode = "wipe + fresh sync" if wipe_data else "restart" print(f"\n{'='*60}") - print(f"PHASE 2: Restart test #{restart_num}") + print(f"PHASE 2: Restart test #{restart_num} ({mode})") print(f"{'='*60}\n") - # Stop only the execution client (keep consensus + volumes) - print("Stopping execution client (keeping consensus + volumes)...") - docker_compose_in_ethd(eth_docker_dir, "stop", "execution", check=True) - time.sleep(10) - - # Restart execution client - print("Restarting execution client...") - docker_compose_in_ethd(eth_docker_dir, "start", "execution", check=True) - time.sleep(5) - - # Monitor for stall - result, details = monitor_restart_for_stall( - rpc_url, - timeout=RESTART_STALL_TIMEOUT * 2, - stall_timeout=RESTART_STALL_TIMEOUT, - ) + if wipe_data: + print("Wiping data volumes (EL + consensus + validator)...") + wipe_data_volumes(eth_docker_dir) + time.sleep(5) + + print("Starting containers with fresh volumes...") + docker_compose_in_ethd(eth_docker_dir, "up", "-d", "execution", "consensus", check=True) + time.sleep(30) + + # Wait for node to come up + if not wait_for_node(rpc_url, NODE_STARTUP_TIMEOUT): + print(f"\n Restart #{restart_num} result: NODE UNRESPONSIVE") + return "unresponsive", "Node never responded after wipe" + + # Wait for full snap sync + synced, sync_time = wait_for_sync(rpc_url, SYNC_TIMEOUT) + if not synced: + details = f"Sync timed out after {fmt_time(sync_time)}" + print(f"\n Restart #{restart_num} result: TIMEOUT - {details}") + return "timeout", details + + # Verify block progress + print(f"\n Sync complete. Verifying block progress...") + progress_ok, blocks = wait_for_block_progress(rpc_url, BLOCK_PROCESSING_DURATION, BLOCK_STALL_TIMEOUT) + if not progress_ok: + details = f"No block progress after sync (processed {blocks} blocks, sync took {fmt_time(sync_time)})" + print(f"\n Restart #{restart_num} result: FAIL - {details}") + return "stall", details + + details = f"Synced in {fmt_time(sync_time)}, processed +{blocks} blocks" + print(f"\n Restart #{restart_num} result: PASS - {details}") + return "ok", details + else: + # Original behavior: stop/start, monitor for stall + print("Stopping execution client (keeping consensus + volumes)...") + docker_compose_in_ethd(eth_docker_dir, "stop", "execution", check=True) + time.sleep(10) + + print("Restarting execution client...") + docker_compose_in_ethd(eth_docker_dir, "start", "execution", check=True) + time.sleep(5) + + result, details = monitor_restart_for_stall( + rpc_url, + timeout=RESTART_STALL_TIMEOUT * 2, + stall_timeout=RESTART_STALL_TIMEOUT, + ) - status_str = { - "ok": "PASS", - "stall": "STALL DETECTED", - "timeout": "TIMEOUT (still progressing)", - "unresponsive": "NODE UNRESPONSIVE", - }.get(result, result.upper()) + status_str = { + "ok": "PASS", + "stall": "STALL DETECTED", + "timeout": "TIMEOUT (still progressing)", + "unresponsive": "NODE UNRESPONSIVE", + }.get(result, result.upper()) - print(f"\n Restart #{restart_num} result: {status_str} - {details}") - return result, details + print(f"\n Restart #{restart_num} result: {status_str} - {details}") + return result, details def main(): @@ -483,6 +545,8 @@ def main(): help="Disable Slack notifications") parser.add_argument("--skip-phase1", action="store_true", help="Skip fresh sync (assume node is already synced)") + parser.add_argument("--wipe-el-data", action="store_true", + help="Wipe all data volumes on each restart (forces fresh snap sync)") parser.add_argument("--ethrex-dir", default=None, help="Path to ethrex repo (for git info in Slack). Auto-detected if not set.") args = parser.parse_args() @@ -535,6 +599,7 @@ def main(): print(f" Branch: {branch}") print(f" Commit: {commit}") print(f" Restarts: {args.restart_count}") + print(f" Wipe data: {args.wipe_el_data}") print(f" Logs: {run_dir}") print() @@ -564,7 +629,7 @@ def main(): # Phase 2: Restart cycles results = [] for i in range(1, args.restart_count + 1): - result, details = phase2_restart_test(eth_docker_dir, rpc_url, i) + result, details = phase2_restart_test(eth_docker_dir, rpc_url, i, wipe_data=args.wipe_el_data) results.append((i, result, details)) save_ethd_logs(eth_docker_dir, run_dir, suffix=f"_restart{i}") From 7edb2a8a88b5889540d11e00a7ec1552ff86195e Mon Sep 17 00:00:00 2001 From: Pablo Deymonnaz Date: Mon, 9 Feb 2026 16:36:36 -0300 Subject: [PATCH 09/10] Make data wipe the default behavior in restart stall test. Each Phase 2 cycle now wipes all volumes and forces a fresh snap sync by default. Add --keep-data flag for the old stop/start behavior without wipe. --- tooling/sync/Makefile | 6 +++--- tooling/sync/restart_stall_test.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tooling/sync/Makefile b/tooling/sync/Makefile index 144f46fff31..584d4e418fd 100644 --- a/tooling/sync/Makefile +++ b/tooling/sync/Makefile @@ -5,7 +5,7 @@ start-sepolia-metrics-docker start-holesky-metrics-docker start-hoodi-metrics-do start-metrics-docker tail-syncing-logs tail-metrics-logs copy_flamegraph import-with-metrics \ multisync-up multisync-down multisync-clean multisync-logs multisync-status \ multisync-restart multisync-monitor multisync-run multisync-loop multisync-loop-auto \ -restart-stall-test restart-stall-test-skip-sync restart-stall-test-wipe +restart-stall-test restart-stall-test-skip-sync restart-stall-test-keep-data ETHREX_DIR ?= "../.." @@ -376,12 +376,12 @@ restart-stall-test-skip-sync: ## Run restart stall test, skip initial sync (node $(if $(RESTART_TEST_FEE_RECIPIENT),--fee-recipient $(RESTART_TEST_FEE_RECIPIENT)) \ --ethrex-dir $(ETHREX_DIR) -restart-stall-test-wipe: ## Run restart stall test with data wipe (full snap sync each cycle). +restart-stall-test-keep-data: ## Run restart stall test keeping data (stop/start only, no wipe). PYTHONUNBUFFERED=1 python3 restart_stall_test.py \ --eth-docker-dir $(ETH_DOCKER_DIR) \ --network $(RESTART_TEST_NETWORK) \ --restart-count $(RESTART_TEST_COUNT) \ --configure \ - --wipe-el-data \ + --keep-data \ $(if $(RESTART_TEST_FEE_RECIPIENT),--fee-recipient $(RESTART_TEST_FEE_RECIPIENT)) \ --ethrex-dir $(ETHREX_DIR) diff --git a/tooling/sync/restart_stall_test.py b/tooling/sync/restart_stall_test.py index 23888646d4f..9b9dafffc86 100644 --- a/tooling/sync/restart_stall_test.py +++ b/tooling/sync/restart_stall_test.py @@ -545,8 +545,8 @@ def main(): help="Disable Slack notifications") parser.add_argument("--skip-phase1", action="store_true", help="Skip fresh sync (assume node is already synced)") - parser.add_argument("--wipe-el-data", action="store_true", - help="Wipe all data volumes on each restart (forces fresh snap sync)") + parser.add_argument("--keep-data", action="store_true", + help="Keep data volumes on restart (skip wipe, just stop/start)") parser.add_argument("--ethrex-dir", default=None, help="Path to ethrex repo (for git info in Slack). Auto-detected if not set.") args = parser.parse_args() @@ -599,7 +599,7 @@ def main(): print(f" Branch: {branch}") print(f" Commit: {commit}") print(f" Restarts: {args.restart_count}") - print(f" Wipe data: {args.wipe_el_data}") + print(f" Wipe data: {not args.keep_data}") print(f" Logs: {run_dir}") print() @@ -629,7 +629,7 @@ def main(): # Phase 2: Restart cycles results = [] for i in range(1, args.restart_count + 1): - result, details = phase2_restart_test(eth_docker_dir, rpc_url, i, wipe_data=args.wipe_el_data) + result, details = phase2_restart_test(eth_docker_dir, rpc_url, i, wipe_data=not args.keep_data) results.append((i, result, details)) save_ethd_logs(eth_docker_dir, run_dir, suffix=f"_restart{i}") From b7dabc94747ab85e0b276a3c0e02b1a5149877df Mon Sep 17 00:00:00 2001 From: Pablo Deymonnaz Date: Mon, 9 Feb 2026 16:40:37 -0300 Subject: [PATCH 10/10] Loop restart cycles indefinitely by default (Ctrl+C to stop). Each cycle wipes volumes, snap syncs from scratch, and verifies block progress. Use --restart-count N to limit cycles. Summary is printed and saved on exit. --- tooling/sync/Makefile | 2 +- tooling/sync/restart_stall_test.py | 117 +++++++++++++++++------------ 2 files changed, 68 insertions(+), 51 deletions(-) diff --git a/tooling/sync/Makefile b/tooling/sync/Makefile index 584d4e418fd..07632c68cdc 100644 --- a/tooling/sync/Makefile +++ b/tooling/sync/Makefile @@ -354,7 +354,7 @@ multisync-loop-auto: ## Continuous loop with auto-update: pull latest, build, an # ============================================================================== RESTART_TEST_NETWORK ?= hoodi -RESTART_TEST_COUNT ?= 3 +RESTART_TEST_COUNT ?= 0 RESTART_TEST_FEE_RECIPIENT ?= ETH_DOCKER_DIR ?= $(HOME)/eth-docker diff --git a/tooling/sync/restart_stall_test.py b/tooling/sync/restart_stall_test.py index 9b9dafffc86..af03588f062 100644 --- a/tooling/sync/restart_stall_test.py +++ b/tooling/sync/restart_stall_test.py @@ -539,8 +539,8 @@ def main(): help="Ethereum address for EL rewards (FEE_RECIPIENT in eth-docker)") parser.add_argument("--rpc-port", type=int, default=8545, help="RPC port for ethrex (default: 8545)") - parser.add_argument("--restart-count", type=int, default=3, - help="Number of restart cycles to test (default: 3)") + parser.add_argument("--restart-count", type=int, default=0, + help="Number of restart cycles (0 = infinite, stop with Ctrl+C)") parser.add_argument("--no-slack", action="store_true", help="Disable Slack notifications") parser.add_argument("--skip-phase1", action="store_true", @@ -598,7 +598,7 @@ def main(): print(f" RPC: {rpc_url}") print(f" Branch: {branch}") print(f" Commit: {commit}") - print(f" Restarts: {args.restart_count}") + print(f" Restarts: {'infinite (Ctrl+C to stop)' if args.restart_count == 0 else args.restart_count}") print(f" Wipe data: {not args.keep_data}") print(f" Logs: {run_dir}") print() @@ -626,67 +626,84 @@ def main(): ethrex_dir=ethrex_dir, ) - # Phase 2: Restart cycles + # Phase 2: Restart cycles (infinite by default, Ctrl+C to stop) results = [] - for i in range(1, args.restart_count + 1): - result, details = phase2_restart_test(eth_docker_dir, rpc_url, i, wipe_data=not args.keep_data) - results.append((i, result, details)) + i = 0 + max_restarts = args.restart_count # 0 = infinite + + def print_summary(): + stalls = [(n, r, d) for n, r, d in results if r not in ("ok", "timeout")] + all_ok = len(stalls) == 0 + print(f"\n{'='*60}") + print(f"RESULTS ({len(results)} cycles)") + print(f"{'='*60}") + for n, result, details in results: + status = "PASS" if result == "ok" else ("TIMEOUT" if result == "timeout" else "FAIL") + print(f" Cycle #{n}: {status} - {details}") + print(f"\n Overall: {'ALL PASSED' if all_ok else f'{len(stalls)}/{len(results)} FAILED'}") + + # Save summary + summary_lines = [ + f"Restart Stall Test - {run_id}", + f"Network: {network}", + f"Branch: {branch}", + f"Commit: {commit}", + f"Host: {socket.gethostname()}", + f"eth-docker: {eth_docker_dir}", + "", + ] + for n, r, d in results: + summary_lines.append(f"Cycle #{n}: {r} - {d}") + summary_lines.append(f"\nOverall: {'ALL PASSED' if all_ok else f'{len(stalls)}/{len(results)} FAILED'}") + (run_dir / "summary.txt").write_text("\n".join(summary_lines)) + return all_ok, stalls - save_ethd_logs(eth_docker_dir, run_dir, suffix=f"_restart{i}") + try: + while True: + i += 1 + if max_restarts > 0 and i > max_restarts: + break - if result not in ("ok", "timeout") and not args.no_slack: - slack_notify( - f"Restart Stall Test - STALL on restart #{i}", - success=False, - details=( - f"*Network:* `{network}`\n" - f"*Restart:* #{i} of {args.restart_count}\n" - f"*Result:* {details}\n" - f"*Logs:* `{run_dir}`\n\n" - "Containers are still running for inspection." - ), - ethrex_dir=ethrex_dir, - ) + result, details = phase2_restart_test(eth_docker_dir, rpc_url, i, wipe_data=not args.keep_data) + results.append((i, result, details)) - # Final summary - stalls = [(i, r, d) for i, r, d in results if r not in ("ok", "timeout")] - all_ok = len(stalls) == 0 + save_ethd_logs(eth_docker_dir, run_dir, suffix=f"_restart{i}") - print(f"\n{'='*60}") - print(f"FINAL RESULTS") - print(f"{'='*60}") - for i, result, details in results: - status = "PASS" if result == "ok" else ("TIMEOUT" if result == "timeout" else "FAIL") - print(f" Restart #{i}: {status} - {details}") - print(f"\n Overall: {'ALL PASSED' if all_ok else f'{len(stalls)}/{len(results)} STALLED'}") - - # Save summary - summary_lines = [ - f"Restart Stall Test - {run_id}", - f"Network: {network}", - f"Branch: {branch}", - f"Commit: {commit}", - f"Host: {socket.gethostname()}", - f"eth-docker: {eth_docker_dir}", - "", - ] - for i, result, details in results: - summary_lines.append(f"Restart #{i}: {result} - {details}") - summary_lines.append(f"\nOverall: {'ALL PASSED' if all_ok else f'{len(stalls)}/{len(results)} STALLED'}") - (run_dir / "summary.txt").write_text("\n".join(summary_lines)) + if result not in ("ok", "timeout") and not args.no_slack: + slack_notify( + f"Restart Stall Test - FAIL on cycle #{i}", + success=False, + details=( + f"*Network:* `{network}`\n" + f"*Cycle:* #{i}\n" + f"*Result:* {details}\n" + f"*Logs:* `{run_dir}`\n\n" + "Containers are still running for inspection." + ), + ethrex_dir=ethrex_dir, + ) + + except KeyboardInterrupt: + print("\n\nInterrupted by user (Ctrl+C)") + + if not results: + print("No cycles completed.") + sys.exit(0) + + all_ok, stalls = print_summary() # Final Slack notification if not args.no_slack: result_lines = "\n".join( - f"{'PASS' if r == 'ok' else ('TIMEOUT' if r == 'timeout' else 'FAIL')} Restart #{i}: {d}" for i, r, d in results + f"{'PASS' if r == 'ok' else ('TIMEOUT' if r == 'timeout' else 'FAIL')} Cycle #{n}: {d}" for n, r, d in results ) slack_notify( - f"Restart Stall Test - {'ALL PASSED' if all_ok else 'STALL DETECTED'}", + f"Restart Stall Test - {'ALL PASSED' if all_ok else 'FAILURE DETECTED'} ({len(results)} cycles)", success=all_ok, details=( f"*Network:* `{network}`\n" - f"*Restarts:* {args.restart_count}\n" - f"*Stalls:* {len(stalls)}/{len(results)}\n\n" + f"*Cycles completed:* {len(results)}\n" + f"*Failures:* {len(stalls)}/{len(results)}\n\n" f"```\n{result_lines}\n```\n" f"*Logs:* `{run_dir}`" ),