Skip to content

Commit

Permalink
save progress on disk creation
Browse files Browse the repository at this point in the history
  • Loading branch information
areshand committed Nov 13, 2024
1 parent 995f85d commit c2a48a9
Show file tree
Hide file tree
Showing 6 changed files with 364 additions and 9 deletions.
33 changes: 33 additions & 0 deletions .github/workflows/provision-replay-verify-archive-disks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# This defines a workflow that provisions the replay-verify archive storage for testnet and mainnet
# by calling the reusable workflow-run-replay-verify-archive-storage-provision.yaml workflow.
# To trigger it manually, go to the Actions tab of the repo, select
# "provision-replay-verify-archive-disks" and then "Run workflow".
#
# On pull_request (only when one of the workflow files listed below changes) and on schedule, both
# networks are provisioned. On workflow_dispatch, NETWORK selects which network to provision.

name: "provision-replay-verify-archive-disks"
on:
  # Allow triggering manually
  workflow_dispatch:
    inputs:
      NETWORK:
        required: true
        type: string
        description: The network to provision storage for.
  pull_request:
    paths:
      - '.github/workflows/provision-replay-verify-archive-disks.yaml'
      - '.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml'
  schedule:
    - cron: "0 22 * * 0,2,4" # The main branch cadence. This runs every Sun,Tues,Thurs

jobs:
  replay-testnet:
    # inputs.NETWORK is empty on pull_request/schedule, so this job always runs there.
    # On a manual run it is skipped only when the other network was explicitly requested;
    # any unrecognised value keeps the previous behaviour of provisioning both networks.
    if: ${{ inputs.NETWORK != 'mainnet' }}
    uses: ./.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml
    secrets: inherit
    with:
      NETWORK: testnet

  replay-mainnet:
    # Mirror of the guard above: skipped only when a manual run explicitly asks for testnet.
    if: ${{ inputs.NETWORK != 'testnet' }}
    uses: ./.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml
    secrets: inherit
    with:
      NETWORK: mainnet
2 changes: 1 addition & 1 deletion .github/workflows/workflow-run-module-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ on:
type: string
required: true
default: "high-perf-docker-with-local-ssd"
TIMEOUT_MINUTES:
TIMEOUT_S:
description: "Github job timeout in minutes"
type: number
required: true
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Reusable workflow that provisions archive storage for a single network by running
# testsuite/replay-verify/archive_disk_utils.py with the network name as its only argument.
name: "*run replay-verify reusable workflow"

on:
  # This allows the workflow to be triggered from another workflow
  workflow_call:
    inputs:
      NETWORK:
        required: true
        type: string
        description: The network to provision storage for.
  # This allows the workflow to be triggered manually from the Github UI or CLI
  workflow_dispatch:
    inputs:
      NETWORK:
        description: The network to provision storage for.
        type: string
        required: true
jobs:
  provision:
    runs-on: runs-on,cpu=4,ram=16,family=m7a+m7i-flex,image=aptos-ubuntu-x64,run-id=${{ github.run_id }},spot=co
    steps:
      # The script is always taken from main, regardless of the ref that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: main
      - name: Setup Python
        uses: ./.github/actions/python-setup
        with:
          pyproject_directory: ./testsuite/replay-verify

      - uses: aptos-labs/aptos-core/.github/actions/docker-setup@main
        id: docker-setup
        with:
          GCP_WORKLOAD_IDENTITY_PROVIDER: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          GCP_SERVICE_ACCOUNT_EMAIL: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }}
          EXPORT_GCP_PROJECT_VARIABLES: "false"
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DOCKER_ARTIFACT_REPO: ${{ secrets.AWS_DOCKER_ARTIFACT_REPO }}
          GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}
          GCP_AUTH_DURATION: 3600

      # Make the token produced by docker-setup visible to the gcloud calls in later steps.
      - name: "Export GCloud auth token"
        id: gcloud-auth
        run: echo "CLOUDSDK_AUTH_ACCESS_TOKEN=${{ steps.docker-setup.outputs.CLOUDSDK_AUTH_ACCESS_TOKEN }}" >> $GITHUB_ENV
        shell: bash

      - name: "Setup GCloud project"
        shell: bash
        run: gcloud config set project aptos-devinfra-0

      - name: "Provision storage"
        # NETWORK can be typed in by hand via workflow_dispatch, so it is handed to the shell
        # through an environment variable and quoted instead of being expanded into the script.
        env:
          NETWORK: ${{ inputs.NETWORK }}
        run: poetry run python archive_disk_utils.py "$NETWORK"
        working-directory: ./testsuite/replay-verify



253 changes: 253 additions & 0 deletions .github/workflows/workflow-run-replay-verify-on-archive copy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
name: "*run replay-verify reusable workflow"

# Two jobs: `prepare` builds (or restores from cache) the aptos-debugger binary and splits the
# work into jobs.json; `replay-verify` fans out over the generated job ids and replays each
# job's ranges in parallel.
#
# NOTE(review): the jobs below read inputs.BUCKET, inputs.SUB_DIR, inputs.HISTORY_START,
# inputs.BACKUP_CONFIG_TEMPLATE_PATH, inputs.MAX_VERSIONS_PER_RANGE and inputs.TIMEOUT_MINUTES,
# none of which are declared under `on:`, so they evaluate to empty values. START_VERSION and
# END_VERSION are declared but never read. Confirm the intended wiring before relying on this file.
on:
  # This allows the workflow to be triggered from another workflow
  workflow_call:
    inputs:
      GIT_SHA:
        required: true
        type: string
        description: The git SHA1 to test.
      # replay-verify config
      START_VERSION:
        required: false
        type: string
        description: The history start to use for the backup. If not specified, it will use the default history start.
      END_VERSION:
        required: false
        type: string
        description: The end version to use for the backup. If not specified, it will use the latest version.
      RANGES_TO_SKIP:
        required: false
        type: string
        description: The optional list of transaction ranges to skip.
      RUNS_ON:
        description: "The runner to use for the job."
        type: string
        required: true
        default: "medium-perf-local-ssd"
  # This allows the workflow to be triggered manually from the Github UI or CLI
  # NOTE: no timeout input is declared, so the replay-verify job uses its 180 minute fallback
  workflow_dispatch:
    inputs:
      GIT_SHA:
        required: true
        type: string
        description: The git SHA1 to test.
      # replay-verify config
      START_VERSION:
        required: false
        type: string
        description: The history start to use for the backup. If not specified, it will use the default history start.
      END_VERSION:
        required: false
        type: string
        description: The end version to use for the backup. If not specified, it will use the latest version.
      RANGES_TO_SKIP:
        required: false
        type: string
        description: The optional list of transaction ranges to skip.
      RUNS_ON:
        description: "The runner to use for the job."
        type: string
        required: true
        # NOTE(review): differs from the workflow_call default above; confirm which is intended.
        default: "high-perf-docker-with-local-ssd"
jobs:
  prepare:
    runs-on: ${{ inputs.RUNS_ON }}
    outputs:
      job_ids: ${{ steps.gen-jobs.outputs.job_ids }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.GIT_SHA }}

      - name: Load cached aptos-debugger binary
        id: cache-aptos-debugger-binary
        uses: actions/cache@v4
        with:
          # copy the binary to the root of the repo and cache it there, because rust-setup calls a cache-rust action
          # which cleans up the target directory in its post action
          path: |
            aptos-debugger
          key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}

      - name: Prepare for build if not cached
        if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
        uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
        with:
          # GIT_CREDENTIALS is a secret, not a workflow input; callers provide it via `secrets: inherit`.
          GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}

      - name: Build and strip aptos-debugger binary if not cached
        if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
        shell: bash
        run: |
          cargo build --release -p aptos-debugger
          strip -s target/release/aptos-debugger
          cp target/release/aptos-debugger .
      - name: Install GCloud SDK
        uses: "google-github-actions/setup-gcloud@v2"
        with:
          version: ">= 418.0.0"
          install_components: "kubectl,gke-gcloud-auth-plugin"

      - name: get timestamp to use in cache key
        id: get-timestamp
        run: echo "ts=$(date +%s)" >> $GITHUB_OUTPUT

      # The timestamp makes the primary key unique per run, so the restore-keys prefix is what
      # matches an earlier entry and the refreshed directory is saved back under the new key.
      - name: Load cached backup storage metadata cache dir (and save back afterwards)
        uses: actions/cache@v4
        with:
          path: metadata_cache
          key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ steps.get-timestamp.outputs.ts }}
          restore-keys: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-

      - name: Generate job ranges
        id: gen-jobs
        env:
          BUCKET: ${{ inputs.BUCKET }}
          SUB_DIR: ${{ inputs.SUB_DIR }}
        run: |
          # The last argument must not end with a line continuation, otherwise the jq command
          # below is swallowed into the aptos-debugger argument list.
          ./aptos-debugger aptos-db gen-replay-verify-jobs \
            --metadata-cache-dir ./metadata_cache \
            --command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
            --start-version ${{ inputs.HISTORY_START }} \
            --ranges-to-skip "${{ inputs.RANGES_TO_SKIP }}" \
            --max-versions-per-range ${{ inputs.MAX_VERSIONS_PER_RANGE }} \
            --max-ranges-per-job 16 \
            --output-json-file jobs.json

          # One matrix entry per element of jobs.json: [0, 1, ..., N-1]
          jq -c 'length as $N | [range(0; $N)]' jobs.json > job_ids.json
          cat job_ids.json
          jq . jobs.json
          echo "job_ids=$(cat job_ids.json)" >> $GITHUB_OUTPUT
      - name: Cache backup storage config and job definition
        uses: actions/cache/save@v4
        with:
          path: |
            ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
            jobs.json
          key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}

  replay-verify:
    needs: prepare
    timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 180 }}
    runs-on: ${{ inputs.RUNS_ON }}
    strategy:
      fail-fast: false
      matrix:
        job_id: ${{ fromJson(needs.prepare.outputs.job_ids) }}
    steps:
      - name: Load cached aptos-debugger binary
        uses: actions/cache/restore@v4
        with:
          path: |
            aptos-debugger
          key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
          fail-on-cache-miss: true

      - name: Load cached backup storage metadata cache dir
        uses: actions/cache/restore@v4
        with:
          path: metadata_cache
          key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
          fail-on-cache-miss: true

      - name: Load cached backup storage config and job definitions
        uses: actions/cache/restore@v4
        with:
          path: |
            ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
            jobs.json
          key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}
          fail-on-cache-miss: true

      - name: Install GCloud SDK
        uses: "google-github-actions/setup-gcloud@v2"
        with:
          version: ">= 418.0.0"
          install_components: "kubectl,gke-gcloud-auth-plugin"

      - name: Run replay-verify in parallel
        env:
          BUCKET: ${{ inputs.BUCKET }}
          SUB_DIR: ${{ inputs.SUB_DIR }}
        shell: bash
        run: |
          set -o nounset -o errexit -o pipefail
          # Replay one range with its own copy of the metadata cache and its own target DB.
          # Makes up to 7 attempts, sleeping 10s * attempt number before each retry.
          # Exit codes 0 and 2 are returned immediately without retrying; otherwise returns 1
          # once all attempts are used up.
          replay() {
            idx=$1
            id=$2
            begin=$3
            end=$4
            desc=$5
            echo ---------
            echo Job start. $id: $desc
            echo ---------
            MC=metadata_cache_$idx
            cp -r metadata_cache $MC
            DB=db_$idx
            for try in {0..6}
            do
              if [ $try -gt 0 ]; then
                SLEEP=$((10 * $try))
                echo "sleeping for $SLEEP seconds before retry #$try" >&2
                sleep $SLEEP
              fi
              res=0
              ./aptos-debugger aptos-db replay-verify \
                --metadata-cache-dir $MC \
                --command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
                --start-version $begin \
                --end-version $end \
                --lazy-quit \
                --enable-storage-sharding \
                --target-db-dir $DB \
                --concurrent-downloads 8 \
                --replay-concurrency-level 4 \
                || res=$?
              if [[ $res == 0 || $res == 2 ]]
              then
                return $res
              fi
            done
            return 1
          }
          # Launch one background replay per range of this matrix job, prefixing its output
          # with the partition index so interleaved logs stay readable.
          pids=()
          idx=0
          while read id begin end desc; do
            replay $idx $id $begin $end "$desc" 2>&1 | sed "s/^/[partition $idx]: /" &
            pids[$idx]=$!
            idx=$((idx+1))
          done < <(jq '.[${{ matrix.job_id }}][]' jobs.json)
          # Wait for every partition; the step exits with the last non-zero status seen.
          res=0
          for idx in `seq 0 $((idx-1))`
          do
            range_res=0
            wait ${pids[$idx]} || range_res=$?
            echo partition $idx returned $range_res
            if [[ $range_res != 0 ]]
            then
              res=$range_res
            fi
          done
          echo All partitions done, returning $res
          exit $res
22 changes: 18 additions & 4 deletions testsuite/replay-verify/archive_disk_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import concurrent.futures
import time
import yaml
import sys

# Constants
DISK_COPIES = 4
Expand Down Expand Up @@ -307,16 +308,29 @@ def create_disk_pv_pvc(project, zone, cluster_name, snapshot_name, prefix, names


if __name__ == "__main__":
# check input arg network
if len(sys.argv) != 2:
print("Usage: python archive_disk_utils.py <network>")
sys.exit(1)
network = sys.argv[1]
source_project_id = "aptos-platform-compute-0"
region = "us-central1"
source_cluster_id = "general-usce1-0"
source_namespace = "testnet-pfn-usce1-backup"
project_id = "aptos-devinfra-0"
snapshot_name = "testnet-archive"
new_pv_prefix = "testnet-archive"
target_namespace = "default"
zone = "us-central1-a"
cluster_name = "devinfra-usce1-0"

if network == "testnet":
source_cluster_id = "general-usce1-0"
source_namespace = "testnet-pfn-usce1-backup"
snapshot_name = "testnet-archive"
new_pv_prefix = "testnet-archive"
else:
source_cluster_id = "mainnet-usce1-0"
source_namespace = "mainnet-pfn-usce1-backup"
snapshot_name = "mainnet-archive"
new_pv_prefix = "mainnet-archive"

create_snapshot_with_gcloud(
snapshot_name,
source_project_id,
Expand Down
Loading

0 comments on commit c2a48a9

Please sign in to comment.