Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 48 additions & 1 deletion .github/workflows/local-testnet.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ jobs:
working-directory: scripts/local_testnet

- name: Upload logs artifact
if: always()
uses: actions/upload-artifact@v4
with:
name: logs-local-testnet
Expand Down Expand Up @@ -125,6 +126,7 @@ jobs:
working-directory: scripts/tests

- name: Upload logs artifact
if: always()
uses: actions/upload-artifact@v4
with:
name: logs-doppelganger-protection-success
Expand Down Expand Up @@ -160,13 +162,56 @@ jobs:
working-directory: scripts/tests

- name: Upload logs artifact
if: always()
uses: actions/upload-artifact@v4
with:
name: logs-doppelganger-protection-failure
path: |
scripts/local_testnet/logs
retention-days: 3

# Tests checkpoint syncing to a live network (current fork) and a running devnet (usually next scheduled fork)
checkpoint-sync-test:
name: checkpoint-sync-test-${{ matrix.network }}
runs-on: ubuntu-latest
needs: dockerfile-ubuntu
if: contains(github.event.pull_request.labels.*.name, 'syncing')
continue-on-error: true
strategy:
matrix:
network: [sepolia, devnet]
steps:
- uses: actions/checkout@v4

- name: Install Kurtosis
run: |
echo "deb [trusted=yes] https://apt.fury.io/kurtosis-tech/ /" | sudo tee /etc/apt/sources.list.d/kurtosis.list
sudo apt update
sudo apt install -y kurtosis-cli
kurtosis analytics disable

- name: Download Docker image artifact
uses: actions/download-artifact@v4
with:
name: lighthouse-docker
path: .

- name: Load Docker image
run: docker load -i lighthouse-docker.tar

- name: Run the checkpoint sync test script
run: |
./checkpoint-sync.sh "sync-${{ matrix.network }}" "checkpoint-sync-config-${{ matrix.network }}.yaml"
working-directory: scripts/tests

- name: Upload logs artifact
if: always()
uses: actions/upload-artifact@v4
with:
name: logs-checkpoint-sync-${{ matrix.network }}
path: |
scripts/local_testnet/logs
retention-days: 3

# This job succeeds ONLY IF all others succeed. It is used by the merge queue to determine whether
# a PR is safe to merge. New jobs should be added here.
Expand All @@ -182,4 +227,6 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Check that success job is dependent on all others
run: ./scripts/ci/check-success-job.sh ./.github/workflows/local-testnet.yml local-testnet-success
run: |
exclude_jobs='checkpoint-sync-test'
./scripts/ci/check-success-job.sh ./.github/workflows/local-testnet.yml local-testnet-success "$exclude_jobs"
7 changes: 6 additions & 1 deletion scripts/ci/check-success-job.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@ set -euf -o pipefail

YAML=$1
SUCCESS_JOB=$2
EXCLUDE_JOBS_REGEX=${3:-}

yq '... comments="" | .jobs | map(. | key) | .[]' < "$YAML" |
grep -v "$SUCCESS_JOB" |
{ [ -n "$EXCLUDE_JOBS_REGEX" ] && grep -Ev "$EXCLUDE_JOBS_REGEX" || cat; } |
sort > all_jobs.txt

yq '... comments="" | .jobs | map(. | key) | .[]' < "$YAML" | grep -v "$SUCCESS_JOB" | sort > all_jobs.txt
yq "... comments=\"\" | .jobs.$SUCCESS_JOB.needs[]" < "$YAML" | grep -v "$SUCCESS_JOB" | sort > dep_jobs.txt
diff all_jobs.txt dep_jobs.txt || (echo "COMPLETENESS CHECK FAILED" && exit 1)
rm all_jobs.txt dep_jobs.txt
Expand Down
16 changes: 16 additions & 0 deletions scripts/tests/checkpoint-sync-config-devnet.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Kurtosis config file to checkpoint sync to a running devnet supported by ethPandaOps and `ethereum-package`.
participants:
- cl_type: lighthouse
cl_image: lighthouse:local
supernode: true
- cl_type: lighthouse
cl_image: lighthouse:local
supernode: false

checkpoint_sync_enabled: true
checkpoint_sync_url: "https://checkpoint-sync.fusaka-devnet-0.ethpandaops.io"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't want to run the devnet tests, we can just leave off the syncing label altogether right? I'm just thinking of the case where we want to merge devnet-1 changes prior to devnet-1 existing

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, these tests don't run by default and are not required for merge — although we could make the Sepolia one mandatory.

Yeah, I think in that case it's probably OK for it to fail, and I think Mergify won't block it (I've updated the local-testnet-success job).

I've made a feature request (ethpandaops/kurtosis-sync-test#4) to test checkpoint sync with a local testnet here, so longer term we could potentially get rid of the devnet test here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok yeah sounds good


global_log_level: debug

network_params:
network: fusaka-devnet-0
16 changes: 16 additions & 0 deletions scripts/tests/checkpoint-sync-config-sepolia.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Kurtosis config file to checkpoint sync to a live network (Sepolia).
participants:
  # Two Lighthouse CL nodes built from the locally-loaded Docker image:
  # one with supernode enabled and one without (see the `ethereum-package`
  # docs for exact `supernode` semantics — TODO confirm).
  - cl_type: lighthouse
    cl_image: lighthouse:local
    supernode: true
  - cl_type: lighthouse
    cl_image: lighthouse:local
    supernode: false

# Start from a recent checkpoint (served by the URL below) rather than syncing from genesis.
checkpoint_sync_enabled: true
checkpoint_sync_url: "https://checkpoint-sync.sepolia.ethpandaops.io"

global_log_level: debug

network_params:
  network: sepolia
127 changes: 127 additions & 0 deletions scripts/tests/checkpoint-sync.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#!/usr/bin/env bash
#
# Checkpoint sync to a live network.
#
# Start with checkpoint sync and let the node(s) sync to head and perform backfill for a specified number of slots.
# This test ensures we cover all sync components (range, lookup, backfill) and measures sync speed
# to detect any performance regressions.
# Resolve the directory containing this script so sibling scripts can be
# invoked regardless of the caller's working directory.
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

# Arg 1: Kurtosis enclave name; Arg 2: Kurtosis network config file.
ENCLAVE_NAME=${1:-sync-testnet}
CONFIG=${2:-$SCRIPT_DIR/checkpoint-sync-config-sepolia.yaml}

# Test configuration
# ------------------------------------------------------
# Interval for polling the /lighthouse/syncing endpoint for sync status
POLL_INTERVAL_SECS=5
# Target number of slots to backfill to complete this test.
TARGET_BACKFILL_SLOTS=1024
# Timeout for this test, if the node(s) fail to backfill `TARGET_BACKFILL_SLOTS` slots, fail the test.
TIMEOUT_MINS=10
TIMEOUT_SECS=$((TIMEOUT_MINS * 60))
# ------------------------------------------------------

# Polls a single node's sync status via its /lighthouse/syncing endpoint,
# prints a one-line summary, and marks the node complete once its backfill
# has covered at least TARGET_BACKFILL_SLOTS slots.
# $1 - node label ("supernode" or "fullnode"), used to index the global
#      node_urls / node_completed arrays.
poll_node() {
    local node_type=$1
    local url=${node_urls[$node_type]}
    # Declared local so these working variables don't leak into the global
    # shell namespace between polls of different nodes.
    local response sync_state status fields completed

    response=$(curl -s "${url}/lighthouse/syncing")

    if [ -z "$response" ] || [ "$response" = "null" ]; then
        echo "${node_type} status: No response or null response"
        return
    fi

    # The endpoint returns .data either as a bare string state or as an
    # object keyed by the state name with extra detail fields.
    sync_state=$(echo "$response" | jq -r 'if (.data | type) == "object" then "object" else "string" end' 2>/dev/null)

    if [ "$sync_state" = "object" ]; then
        status=$(echo "$response" | jq -r '.data | keys[0] // "Unknown"')
        fields=$(echo "$response" | jq -r ".data.${status} | to_entries | map(\"\(.key): \(.value)\") | join(\", \")")
        echo "${node_type} status: ${status}, ${fields}"
    else
        status=$(echo "$response" | jq -r '.data' 2>/dev/null)
        echo "${node_type} status: ${status:-Unknown}"
    fi

    # Check for completion criteria
    if [ "$status" = "BackFillSyncing" ]; then
        completed=$(echo "$response" | jq -r ".data.${status}.completed // 0")
        if [ "$completed" -ge "$TARGET_BACKFILL_SLOTS" ]; then
            mark_node_complete "$node_type"
        fi
    fi
    # For other states (Synced, SyncingFinalized, SyncingHead, SyncTransition, Stalled, Unknown),
    # we continue polling.
    # NOTE: there is a bug where Lighthouse briefly switches to "Synced" before completing
    # backfilling. We ignore this state as it's unlikely a node is fully synced without going
    # through backfilling `TARGET_BACKFILL_SLOTS` slots (only possible on a new network).
}

# Flags a node as having finished backfill, recording the completion time
# and reporting the elapsed seconds since start. Idempotent: repeated calls
# for an already-complete node are no-ops.
# $1 - node label used to index the global node_completed / node_complete_time arrays.
mark_node_complete() {
    local node_type=$1
    # Guard: nothing to do if this node was already marked complete.
    if [ "${node_completed[$node_type]}" != false ]; then
        return
    fi
    node_completed[$node_type]=true
    node_complete_time[$node_type]=$(date +%s)
    echo "${node_type} completed backfill in $((node_complete_time[$node_type] - start_time)) seconds"
}

# Shuts down the local testnet (the stop script dumps logs) and terminates
# the test with the given exit code.
# $1 - exit code to terminate with.
exit_and_dump_logs() {
    local exit_code=$1
    echo "Shutting down..."
    # Quote expansions to survive enclave names / paths containing spaces or glob chars.
    "$SCRIPT_DIR/../local_testnet/stop_local_testnet.sh" "$ENCLAVE_NAME"
    echo "Test completed with exit code $exit_code."
    exit "$exit_code"
}

# Start the nodes; bail out (dumping logs) if the testnet fails to come up.
# `if ! cmd` replaces the `cmd; [ $? -ne 0 ]` anti-pattern, and expansions
# are quoted against word splitting.
if ! "$SCRIPT_DIR/../local_testnet/start_local_testnet.sh" -e "$ENCLAVE_NAME" -b false -n "$CONFIG"; then
    echo "Failed to start local testnet"
    exit_and_dump_logs 1
fi

start_time=$(date +%s)

# Get all beacon API URLs
supernode_url=$(kurtosis port print "$ENCLAVE_NAME" cl-1-lighthouse-geth http)
fullnode_url=$(kurtosis port print "$ENCLAVE_NAME" cl-2-lighthouse-geth http)

# Initialize per-node state, keyed by node label.
declare -A node_completed
declare -A node_complete_time
declare -A node_urls

node_urls["supernode"]="$supernode_url"
node_urls["fullnode"]="$fullnode_url"
node_completed["supernode"]=false
node_completed["fullnode"]=false

echo "Polling sync status until backfill reaches ${TARGET_BACKFILL_SLOTS} slots or timeout of ${TIMEOUT_MINS} mins"

# Poll until both nodes have completed backfill, or the timeout elapses.
while [ "${node_completed[supernode]}" = false ] || [ "${node_completed[fullnode]}" = false ]; do
    current_time=$(date +%s)
    elapsed=$((current_time - start_time))

    if [ "$elapsed" -ge "$TIMEOUT_SECS" ]; then
        echo "ERROR: Nodes timed out syncing after ${TIMEOUT_MINS} minutes. Exiting."
        exit_and_dump_logs 1
    fi

    # Poll each node that hasn't completed yet
    for node in "supernode" "fullnode"; do
        if [ "${node_completed[$node]}" = false ]; then
            poll_node "$node"
        fi
    done

    sleep "$POLL_INTERVAL_SECS"
done

echo "Sync test complete! Both supernode and fullnode have synced to HEAD and backfilled ${TARGET_BACKFILL_SLOTS} slots."
echo "Supernode time: $((node_complete_time[supernode] - start_time)) seconds"
echo "Fullnode time: $((node_complete_time[fullnode] - start_time)) seconds"
exit_and_dump_logs 0
Loading