diff --git a/.github/workflows/local-testnet.yml b/.github/workflows/local-testnet.yml index 455931aa1e9..7bd8b40d76f 100644 --- a/.github/workflows/local-testnet.yml +++ b/.github/workflows/local-testnet.yml @@ -67,6 +67,7 @@ jobs: working-directory: scripts/local_testnet - name: Upload logs artifact + if: always() uses: actions/upload-artifact@v4 with: name: logs-local-testnet @@ -125,6 +126,7 @@ jobs: working-directory: scripts/tests - name: Upload logs artifact + if: always() uses: actions/upload-artifact@v4 with: name: logs-doppelganger-protection-success @@ -160,6 +162,7 @@ jobs: working-directory: scripts/tests - name: Upload logs artifact + if: always() uses: actions/upload-artifact@v4 with: name: logs-doppelganger-protection-failure @@ -167,6 +170,48 @@ jobs: scripts/local_testnet/logs retention-days: 3 + # Tests checkpoint syncing to a live network (current fork) and a running devnet (usually next scheduled fork) + checkpoint-sync-test: + name: checkpoint-sync-test-${{ matrix.network }} + runs-on: ubuntu-latest + needs: dockerfile-ubuntu + if: contains(github.event.pull_request.labels.*.name, 'syncing') + continue-on-error: true + strategy: + matrix: + network: [sepolia, devnet] + steps: + - uses: actions/checkout@v4 + + - name: Install Kurtosis + run: | + echo "deb [trusted=yes] https://apt.fury.io/kurtosis-tech/ /" | sudo tee /etc/apt/sources.list.d/kurtosis.list + sudo apt update + sudo apt install -y kurtosis-cli + kurtosis analytics disable + + - name: Download Docker image artifact + uses: actions/download-artifact@v4 + with: + name: lighthouse-docker + path: . 
+ + - name: Load Docker image + run: docker load -i lighthouse-docker.tar + + - name: Run the checkpoint sync test script + run: | + ./checkpoint-sync.sh "sync-${{ matrix.network }}" "checkpoint-sync-config-${{ matrix.network }}.yaml" + working-directory: scripts/tests + + - name: Upload logs artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: logs-checkpoint-sync-${{ matrix.network }} + path: | + scripts/local_testnet/logs + retention-days: 3 # This job succeeds ONLY IF all others succeed. It is used by the merge queue to determine whether # a PR is safe to merge. New jobs should be added here. @@ -182,4 +227,6 @@ jobs: steps: - uses: actions/checkout@v4 - name: Check that success job is dependent on all others - run: ./scripts/ci/check-success-job.sh ./.github/workflows/local-testnet.yml local-testnet-success + run: | + exclude_jobs='checkpoint-sync-test' + ./scripts/ci/check-success-job.sh ./.github/workflows/local-testnet.yml local-testnet-success "$exclude_jobs" diff --git a/scripts/ci/check-success-job.sh b/scripts/ci/check-success-job.sh index dfa5c03257c..2eee35f69e0 100755 --- a/scripts/ci/check-success-job.sh +++ b/scripts/ci/check-success-job.sh @@ -5,8 +5,13 @@ set -euf -o pipefail YAML=$1 SUCCESS_JOB=$2 +EXCLUDE_JOBS_REGEX=${3:-} + +yq '... comments="" | .jobs | map(. | key) | .[]' < "$YAML" | + grep -v "$SUCCESS_JOB" | + { [ -n "$EXCLUDE_JOBS_REGEX" ] && grep -Ev "$EXCLUDE_JOBS_REGEX" || cat; } | + sort > all_jobs.txt -yq '... comments="" | .jobs | map(. | key) | .[]' < "$YAML" | grep -v "$SUCCESS_JOB" | sort > all_jobs.txt yq "... 
comments=\"\" | .jobs.$SUCCESS_JOB.needs[]" < "$YAML" | grep -v "$SUCCESS_JOB" | sort > dep_jobs.txt diff all_jobs.txt dep_jobs.txt || (echo "COMPLETENESS CHECK FAILED" && exit 1) rm all_jobs.txt dep_jobs.txt diff --git a/scripts/tests/checkpoint-sync-config-devnet.yaml b/scripts/tests/checkpoint-sync-config-devnet.yaml new file mode 100644 index 00000000000..e81e5d44010 --- /dev/null +++ b/scripts/tests/checkpoint-sync-config-devnet.yaml @@ -0,0 +1,16 @@ +# Kurtosis config file to checkpoint sync to a running devnet supported by ethPandaOps and `ethereum-package`. +participants: + - cl_type: lighthouse + cl_image: lighthouse:local + supernode: true + - cl_type: lighthouse + cl_image: lighthouse:local + supernode: false + +checkpoint_sync_enabled: true +checkpoint_sync_url: "https://checkpoint-sync.fusaka-devnet-0.ethpandaops.io" + +global_log_level: debug + +network_params: + network: fusaka-devnet-0 diff --git a/scripts/tests/checkpoint-sync-config-sepolia.yaml b/scripts/tests/checkpoint-sync-config-sepolia.yaml new file mode 100644 index 00000000000..2adf9c22b84 --- /dev/null +++ b/scripts/tests/checkpoint-sync-config-sepolia.yaml @@ -0,0 +1,16 @@ +# Kurtosis config file to checkpoint sync to a live network (Sepolia). +participants: + - cl_type: lighthouse + cl_image: lighthouse:local + supernode: true + - cl_type: lighthouse + cl_image: lighthouse:local + supernode: false + +checkpoint_sync_enabled: true +checkpoint_sync_url: "https://checkpoint-sync.sepolia.ethpandaops.io" + +global_log_level: debug + +network_params: + network: sepolia diff --git a/scripts/tests/checkpoint-sync.sh b/scripts/tests/checkpoint-sync.sh new file mode 100755 index 00000000000..a170d1e94dc --- /dev/null +++ b/scripts/tests/checkpoint-sync.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# +# Checkpoint sync to a live network. +# +# Start with checkpoint sync and let the node(s) sync to head and perform backfill for a specified number of slots. 
+# This test ensures we cover all sync components (range, lookup, backfill) and measures sync speed +# to detect any performance regressions. +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +ENCLAVE_NAME=${1:-sync-testnet} +CONFIG=${2:-$SCRIPT_DIR/checkpoint-sync-config-sepolia.yaml} + +# Test configuration +# ------------------------------------------------------ +# Interval for polling the /lighthouse/syncing endpoint for sync status (also used as the per-request curl timeout) +POLL_INTERVAL_SECS=5 +# Target number of slots to backfill to complete this test. +TARGET_BACKFILL_SLOTS=1024 +# Timeout for this test: if the node(s) fail to backfill `TARGET_BACKFILL_SLOTS` slots within this time, the test fails. +TIMEOUT_MINS=10 +TIMEOUT_SECS=$((TIMEOUT_MINS * 60)) +# ------------------------------------------------------ + +# Polls a single node's sync status and marks it complete once it has backfilled `TARGET_BACKFILL_SLOTS` slots +poll_node() { + local node_type=$1 + local url=${node_urls[$node_type]} + + response=$(curl -s --max-time "$POLL_INTERVAL_SECS" "${url}/lighthouse/syncing") + + if [ -z "$response" ] || [ "$response" = "null" ]; then + echo "${node_type} status: No response or null response" + return + fi + + # Print syncing status + sync_state=$(echo "$response" | jq -r 'if (.data | type) == "object" then "object" else "string" end' 2>/dev/null) + + if [ "$sync_state" = "object" ]; then + status=$(echo "$response" | jq -r '.data | keys[0] // "Unknown"') + fields=$(echo "$response" | jq -r ".data.${status} | to_entries | map(\"\(.key): \(.value)\") | join(\", \")") + echo "${node_type} status: ${status}, ${fields}" + else + status=$(echo "$response" | jq -r '.data' 2>/dev/null) + echo "${node_type} status: ${status:-Unknown}" + fi + + # Check for completion criteria + if [ "$status" = "BackFillSyncing" ]; then + completed=$(echo "$response" | jq -r ".data.${status}.completed // 0") + if [ "$completed" -ge "$TARGET_BACKFILL_SLOTS" ]; 
then + mark_node_complete "$node_type" + fi + fi + # For other states (Synced, SyncingFinalized, SyncingHead, SyncTransition, Stalled, Unknown), + # we 
continue polling + # NOTE: there is a bug where Lighthouse briefly switches to "Synced" before completing backfilling. We ignore this state + # as it's unlikely a node is fully synced without going through backfilling `TARGET_BACKFILL_SLOTS` slots (only + # possible on a new network). +} + +# Marks a node as complete and records its completion time (only the first call for a node has any effect) +mark_node_complete() { + local node_type=$1 + if [ "${node_completed[$node_type]}" = false ]; then + node_completed[$node_type]=true + node_complete_time[$node_type]=$(date +%s) + echo "${node_type} completed backfill in $((node_complete_time[$node_type] - start_time)) seconds" + fi +} + +exit_and_dump_logs() { + local exit_code=$1 + echo "Shutting down..." + "$SCRIPT_DIR/../local_testnet/stop_local_testnet.sh" "$ENCLAVE_NAME" + echo "Test completed with exit code $exit_code." + exit "$exit_code" +} + +# Start the nodes +"$SCRIPT_DIR/../local_testnet/start_local_testnet.sh" -e "$ENCLAVE_NAME" -b false -n "$CONFIG" +if [ $? -ne 0 ]; then + echo "Failed to start local testnet" + exit_and_dump_logs 1 +fi + +start_time=$(date +%s) + +# Get all beacon API URLs +supernode_url=$(kurtosis port print "$ENCLAVE_NAME" cl-1-lighthouse-geth http) +fullnode_url=$(kurtosis port print "$ENCLAVE_NAME" cl-2-lighthouse-geth http) + +# Initialize per-node state: completion flag, completion time and beacon API URL +declare -A node_completed +declare -A node_complete_time +declare -A node_urls + +node_urls["supernode"]="$supernode_url" +node_urls["fullnode"]="$fullnode_url" +node_completed["supernode"]=false +node_completed["fullnode"]=false + +echo "Polling sync status until backfill reaches ${TARGET_BACKFILL_SLOTS} slots or timeout of ${TIMEOUT_MINS} mins" + +while [ "${node_completed[supernode]}" = false ] || [ "${node_completed[fullnode]}" = false ]; do + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + + if [ "$elapsed" -ge "$TIMEOUT_SECS" ]; then + echo "ERROR: Nodes 
timed out syncing after ${TIMEOUT_MINS} minutes. Exiting." 
+ exit_and_dump_logs 1 + fi + + # Poll each node that hasn't completed yet + for node in "supernode" "fullnode"; do + if [ "${node_completed[$node]}" = false ]; then + poll_node "$node" + fi + done + + sleep "$POLL_INTERVAL_SECS" +done + +echo "Sync test complete! Both supernode and fullnode have synced to HEAD and backfilled ${TARGET_BACKFILL_SLOTS} slots." +echo "Supernode time: $((node_complete_time[supernode] - start_time)) seconds" +echo "Fullnode time: $((node_complete_time[fullnode] - start_time)) seconds" +exit_and_dump_logs 0