From 061fc1a7b16b934e90d3664231f6b6726a7de795 Mon Sep 17 00:00:00 2001
From: Bo Wu
Date: Wed, 13 Nov 2024 15:31:51 -0800
Subject: [PATCH] save progress on disk creation

---
Review notes (text between "---" and the diffstat is ignored by git-am):
- workflow-run-module-verify.yaml: the input is renamed to TIMEOUT_S while its
  description still says minutes; confirm every inputs.TIMEOUT_MINUTES
  reference in that file was updated before merging.
- workflow-run-replay-verify-on-archive.yaml still reads inputs it does not
  declare (BUCKET, SUB_DIR, BACKUP_CONFIG_TEMPLATE_PATH,
  MAX_VERSIONS_PER_RANGE, TIMEOUT_MINUTES); declare them or drop the uses.
- provision-replay-verify-archive-disks.yaml declares a NETWORK dispatch
  input, but both jobs hard-code their network, so the input is unused.

 ...provision-replay-verify-archive-disks.yaml |  33 +++
 .../workflows/workflow-run-module-verify.yaml |   2 +-
 ...play-verify-archive-storage-provision.yaml |  57 +++++
 ...workflow-run-replay-verify-on-archive.yaml | 253 ++++++++++++++++++
 testsuite/replay-verify/archive_disk_utils.py |  25 +-
 testsuite/replay-verify/poetry.lock           |  30 +--
 testsuite/replay-verify/pyproject.toml        |   5 -
 7 files changed, 380 insertions(+), 25 deletions(-)
 create mode 100644 .github/workflows/provision-replay-verify-archive-disks.yaml
 create mode 100644 .github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml
 create mode 100644 .github/workflows/workflow-run-replay-verify-on-archive.yaml

diff --git a/.github/workflows/provision-replay-verify-archive-disks.yaml b/.github/workflows/provision-replay-verify-archive-disks.yaml
new file mode 100644
index 00000000000000..da5842cb787051
--- /dev/null
+++ b/.github/workflows/provision-replay-verify-archive-disks.yaml
@@ -0,0 +1,33 @@
+# This defines a workflow to provision the archive disks used by replay-verify.
+# In order to trigger it go to the Actions Tab of the Repo, click "provision-replay-verify-archive-disks" and then "Run Workflow".
+#
+# Every run (PR, schedule or manual) provisions both testnet and mainnet; see the jobs below.
+
+name: "provision-replay-verify-archive-disks"
+on:
+  # Allow triggering manually
+  workflow_dispatch:
+    inputs:
+      NETWORK:
+        required: true
+        type: string
+        description: The network to provision storage for.
+  pull_request:
+    paths:
+      - '.github/workflows/provision-replay-verify-archive-disks.yaml'
+      - '.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml'
+  schedule:
+    - cron: "0 22 * * 0,2,4" # The main branch cadence. This runs every Sun,Tues,Thurs
+
+jobs:
+  replay-testnet:
+    uses: ./.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml
+    secrets: inherit
+    with:
+      NETWORK: testnet
+
+  replay-mainnet:
+    uses: ./.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml
+    secrets: inherit
+    with:
+      NETWORK: mainnet
diff --git a/.github/workflows/workflow-run-module-verify.yaml b/.github/workflows/workflow-run-module-verify.yaml
index de16bbade2ffe5..165653d8f5ac76 100644
--- a/.github/workflows/workflow-run-module-verify.yaml
+++ b/.github/workflows/workflow-run-module-verify.yaml
@@ -26,7 +26,7 @@ on:
         type: string
         required: true
         default: "high-perf-docker-with-local-ssd"
-      TIMEOUT_MINUTES:
+      TIMEOUT_S:
         description: "Github job timeout in minutes"
         type: number
         required: true
diff --git a/.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml b/.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml
new file mode 100644
index 00000000000000..d6ff3d8826a3a1
--- /dev/null
+++ b/.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml
@@ -0,0 +1,57 @@
+name: "*run replay-verify archive storage provision reusable workflow"
+
+on:
+  # This allows the workflow to be triggered from another workflow
+  workflow_call:
+    inputs:
+      NETWORK:
+        required: true
+        type: string
+        description: The network to provision storage for.
+  # This allows the workflow to be triggered manually from the Github UI or CLI
+  workflow_dispatch:
+    inputs:
+      NETWORK:
+        description: The network to provision storage for.
+        type: string
+        required: true
+jobs:
+  provision:
+    runs-on: runs-on,cpu=4,ram=16,family=m7a+m7i-flex,image=aptos-ubuntu-x64,run-id=${{ github.run_id }},spot=co
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: main
+      - name: Setup Python
+        uses: ./.github/actions/python-setup
+        with:
+          pyproject_directory: testsuite/replay-verify
+          python_version: "3.10"
+
+      - uses: aptos-labs/aptos-core/.github/actions/docker-setup@main
+        id: docker-setup
+        with:
+          GCP_WORKLOAD_IDENTITY_PROVIDER: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
+          GCP_SERVICE_ACCOUNT_EMAIL: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }}
+          EXPORT_GCP_PROJECT_VARIABLES: "false"
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_DOCKER_ARTIFACT_REPO: ${{ secrets.AWS_DOCKER_ARTIFACT_REPO }}
+          GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}
+          GCP_AUTH_DURATION: 3600
+
+      - name: "Export GCloud auth token"
+        id: gcloud-auth
+        run: echo "CLOUDSDK_AUTH_ACCESS_TOKEN=${{ steps.docker-setup.outputs.CLOUDSDK_AUTH_ACCESS_TOKEN }}" >> $GITHUB_ENV
+        shell: bash
+
+      - name: "Setup GCloud project"
+        shell: bash
+        run: gcloud config set project aptos-devinfra-0
+
+      - name: "Provision storage"
+        env:
+          NETWORK: ${{ inputs.NETWORK }}
+        run: poetry run python archive_disk_utils.py "$NETWORK"
+        working-directory: ./testsuite/replay-verify
diff --git a/.github/workflows/workflow-run-replay-verify-on-archive.yaml b/.github/workflows/workflow-run-replay-verify-on-archive.yaml
new file mode 100644
index 00000000000000..0e1e9d37037f9f
--- /dev/null
+++ b/.github/workflows/workflow-run-replay-verify-on-archive.yaml
@@ -0,0 +1,253 @@
+name: "*run replay-verify on archive reusable workflow"
+
+on:
+  # This allows the workflow to be triggered from another workflow
+  workflow_call:
+    inputs:
+      GIT_SHA:
+        required: true
+        type: string
+        description: The git SHA1 to test.
+      # replay-verify config
+      START_VERSION:
+        required: false
+        type: string
+        description: The history start to use for the backup. If not specified, it will use the default history start.
+      END_VERSION:
+        required: false
+        type: string
+        description: The end version to use for the backup. If not specified, it will use the latest version.
+      RANGES_TO_SKIP:
+        required: false
+        type: string
+        description: The optional list of transaction ranges to skip.
+      RUNS_ON:
+        description: "The runner to use for the job."
+        type: string
+        required: true
+        default: "medium-perf-local-ssd"
+  # This allows the workflow to be triggered manually from the Github UI or CLI
+  # NOTE: no timeout input is declared here; the replay-verify job falls back to a 180 minute timeout
+  workflow_dispatch:
+    inputs:
+      GIT_SHA:
+        required: true
+        type: string
+        description: The git SHA1 to test.
+      # replay-verify config
+      START_VERSION:
+        required: false
+        type: string
+        description: The history start to use for the backup. If not specified, it will use the default history start.
+      END_VERSION:
+        required: false
+        type: string
+        description: The end version to use for the backup. If not specified, it will use the latest version.
+      RANGES_TO_SKIP:
+        required: false
+        type: string
+        description: The optional list of transaction ranges to skip.
+      RUNS_ON:
+        description: "The runner to use for the job."
+        type: string
+        required: true
+        default: "high-perf-docker-with-local-ssd"
+jobs:
+  prepare:
+    runs-on: ${{ inputs.RUNS_ON }}
+    outputs:
+      job_ids: ${{ steps.gen-jobs.outputs.job_ids }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.GIT_SHA }}
+
+      - name: Load cached aptos-debugger binary
+        id: cache-aptos-debugger-binary
+        uses: actions/cache@v4
+        with:
+          # copy the binary to the root of the repo and cache it there, because rust-setup calls a cache-rust action
+          # which cleans up the target directory in its post action
+          path: |
+            aptos-debugger
+          key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
+
+      - name: Prepare for build if not cached
+        if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
+        uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
+        with:
+          GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}
+
+      - name: Build and strip aptos-debugger binary if not cached
+        if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
+        shell: bash
+        run: |
+          cargo build --release -p aptos-debugger
+          strip -s target/release/aptos-debugger
+          cp target/release/aptos-debugger .
+
+      - name: Install GCloud SDK
+        uses: "google-github-actions/setup-gcloud@v2"
+        with:
+          version: ">= 418.0.0"
+          install_components: "kubectl,gke-gcloud-auth-plugin"
+
+      - name: get timestamp to use in cache key
+        id: get-timestamp
+        run: echo "ts=$(date +%s)" >> $GITHUB_OUTPUT
+
+      - name: Load cached backup storage metadata cache dir (and save back afterwards)
+        uses: actions/cache@v4
+        with:
+          path: metadata_cache
+          key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ steps.get-timestamp.outputs.ts }}
+          restore-keys: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
+
+      - name: Generate job ranges
+        id: gen-jobs
+        env:
+          BUCKET: ${{ inputs.BUCKET }}
+          SUB_DIR: ${{ inputs.SUB_DIR }}
+        run: |
+          ./aptos-debugger aptos-db gen-replay-verify-jobs \
+            --metadata-cache-dir ./metadata_cache \
+            --command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
+            --start-version ${{ inputs.START_VERSION }} \
+            --ranges-to-skip "${{ inputs.RANGES_TO_SKIP }}" \
+            --max-versions-per-range ${{ inputs.MAX_VERSIONS_PER_RANGE }} \
+            \
+            --max-ranges-per-job 16 \
+            --output-json-file jobs.json \
+
+
+          jq -c 'length as $N | [range(0; $N)]' jobs.json > job_ids.json
+
+          cat job_ids.json
+          jq . jobs.json
+
+          echo "job_ids=$(cat job_ids.json)" >> $GITHUB_OUTPUT
+
+      - name: Cache backup storage config and job definition
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
+            jobs.json
+          key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}
+
+  replay-verify:
+    needs: prepare
+    timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 180 }}
+    runs-on: ${{ inputs.RUNS_ON }}
+    strategy:
+      fail-fast: false
+      matrix:
+        job_id: ${{ fromJson(needs.prepare.outputs.job_ids) }}
+    steps:
+      - name: Load cached aptos-debugger binary
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            aptos-debugger
+          key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
+          fail-on-cache-miss: true
+
+      - name: Load cached backup storage metadata cache dir
+        uses: actions/cache/restore@v4
+        with:
+          path: metadata_cache
+          key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
+          fail-on-cache-miss: true
+
+      - name: Load cached backup storage config and job definitions
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
+            jobs.json
+          key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}
+          fail-on-cache-miss: true
+
+      - name: Install GCloud SDK
+        uses: "google-github-actions/setup-gcloud@v2"
+        with:
+          version: ">= 418.0.0"
+          install_components: "kubectl,gke-gcloud-auth-plugin"
+
+      - name: Run replay-verify in parallel
+        env:
+          BUCKET: ${{ inputs.BUCKET }}
+          SUB_DIR: ${{ inputs.SUB_DIR }}
+        shell: bash
+        run: |
+          set -o nounset -o errexit -o pipefail
+          replay() {
+            idx=$1
+            id=$2
+            begin=$3
+            end=$4
+            desc=$5
+
+            echo ---------
+            echo Job start. $id: $desc
+            echo ---------
+
+            MC=metadata_cache_$idx
+            cp -r metadata_cache $MC
+            DB=db_$idx
+
+            for try in {0..6}
+            do
+              if [ $try -gt 0 ]; then
+                SLEEP=$((10 * $try))
+                echo "sleeping for $SLEEP seconds before retry #$try" >&2
+                sleep $SLEEP
+              fi
+
+              res=0
+              ./aptos-debugger aptos-db replay-verify \
+                --metadata-cache-dir $MC \
+                --command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
+                --start-version $begin \
+                --end-version $end \
+                \
+                --lazy-quit \
+                --enable-storage-sharding \
+                --target-db-dir $DB \
+                --concurrent-downloads 8 \
+                --replay-concurrency-level 4 \
+                || res=$?
+
+              if [[ $res == 0 || $res == 2 ]]
+              then
+                return $res
+              fi
+            done
+            return 1
+          }
+
+          pids=()
+          idx=0
+          while read id begin end desc; do
+
+            replay $idx $id $begin $end "$desc" 2>&1 | sed "s/^/[partition $idx]: /" &
+
+            pids[$idx]=$!
+            idx=$((idx+1))
+          done < <(jq '.[${{ matrix.job_id }}][]' jobs.json)
+
+          res=0
+          for idx in `seq 0 $((idx-1))`
+          do
+            range_res=0
+            wait ${pids[$idx]} || range_res=$?
+            echo partition $idx returned $range_res
+            if [[ $range_res != 0 ]]
+            then
+              res=$range_res
+            fi
+          done
+
+          echo All partitions done, returning $res
+          exit $res
diff --git a/testsuite/replay-verify/archive_disk_utils.py b/testsuite/replay-verify/archive_disk_utils.py
index 59f9d0f0ee2331..22c5c073f6a127 100644
--- a/testsuite/replay-verify/archive_disk_utils.py
+++ b/testsuite/replay-verify/archive_disk_utils.py
@@ -5,6 +5,7 @@
 import concurrent.futures
 import time
 import yaml
+import sys
 
 # Constants
 DISK_COPIES = 4
@@ -307,16 +308,32 @@ def create_disk_pv_pvc(project, zone, cluster_name, snapshot_name, prefix, names
 
 
 if __name__ == "__main__":
+    # check input arg network
+    if len(sys.argv) != 2:
+        print("Usage: python archive_disk_utils.py <network>")
+        sys.exit(1)
+    network = sys.argv[1]
     source_project_id = "aptos-platform-compute-0"
     region = "us-central1"
-    source_cluster_id = "general-usce1-0"
-    source_namespace = "testnet-pfn-usce1-backup"
     project_id = "aptos-devinfra-0"
-    snapshot_name = "testnet-archive"
-    new_pv_prefix = "testnet-archive"
     target_namespace = "default"
     zone = "us-central1-a"
     cluster_name = "devinfra-usce1-0"
+
+    if network == "testnet":
+        source_cluster_id = "general-usce1-0"
+        source_namespace = "testnet-pfn-usce1-backup"
+        snapshot_name = "testnet-archive"
+        new_pv_prefix = "testnet-archive"
+    elif network == "mainnet":
+        source_cluster_id = "mainnet-usce1-0"
+        source_namespace = "mainnet-pfn-usce1-backup"
+        snapshot_name = "mainnet-archive"
+        new_pv_prefix = "mainnet-archive"
+    else:
+        print(f"Unsupported network: {network} (expected testnet or mainnet)")
+        sys.exit(1)
+
     create_snapshot_with_gcloud(
         snapshot_name,
         source_project_id,
diff --git a/testsuite/replay-verify/poetry.lock b/testsuite/replay-verify/poetry.lock
index f0eca350aa08f2..68f4d281db0544 100644
--- a/testsuite/replay-verify/poetry.lock
+++ b/testsuite/replay-verify/poetry.lock
@@ -227,14 +227,14 @@ files = [
 
 [[package]]
 name = "google-api-core"
-version = "2.22.0"
+version = "2.23.0"
 description = "Google API client core library"
 category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "google_api_core-2.22.0-py3-none-any.whl", hash = "sha256:a6652b6bd51303902494998626653671703c420f6f4c88cfd3f50ed723e9d021"},
-    {file = "google_api_core-2.22.0.tar.gz", hash = "sha256:26f8d76b96477db42b55fd02a33aae4a42ec8b86b98b94969b7333a2c828bf35"},
+    {file = "google_api_core-2.23.0-py3-none-any.whl", hash = "sha256:c20100d4c4c41070cf365f1d8ddf5365915291b5eb11b83829fbd1c999b5122f"},
+    {file = "google_api_core-2.23.0.tar.gz", hash = "sha256:2ceb087315e6af43f256704b871d99326b1f12a9d6ce99beaedec99ba26a0ace"},
 ]
 
 [package.dependencies]
@@ -263,14 +263,14 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
 
 [[package]]
 name = "google-auth"
-version = "2.35.0"
+version = "2.36.0"
 description = "Google Authentication Library"
 category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "google_auth-2.35.0-py2.py3-none-any.whl", hash = "sha256:25df55f327ef021de8be50bad0dfd4a916ad0de96da86cd05661c9297723ad3f"},
-    {file = "google_auth-2.35.0.tar.gz", hash = "sha256:f4c64ed4e01e8e8b646ef34c018f8bf3338df0c8e37d8b3bba40e7f574a3278a"},
+    {file = "google_auth-2.36.0-py2.py3-none-any.whl", hash = "sha256:51a15d47028b66fd36e5c64a82d2d57480075bccc7da37cde257fc94177a61fb"},
+    {file = "google_auth-2.36.0.tar.gz", hash = "sha256:545e9618f2df0bcbb7dcbc45a546485b1212624716975a1ea5ae8149ce769ab1"},
 ]
 
 [package.dependencies]
@@ -329,14 +329,14 @@ protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4
 
 [[package]]
 name = "googleapis-common-protos"
-version = "1.65.0"
+version = "1.66.0"
 description = "Common protobufs used in Google APIs"
 category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "googleapis_common_protos-1.65.0-py2.py3-none-any.whl", hash = "sha256:2972e6c496f435b92590fd54045060867f3fe9be2c82ab148fc8885035479a63"},
-    {file = "googleapis_common_protos-1.65.0.tar.gz", hash = "sha256:334a29d07cddc3aa01dee4988f9afd9b2916ee2ff49d6b757155dc0d197852c0"},
+    {file = "googleapis_common_protos-1.66.0-py2.py3-none-any.whl", hash = "sha256:d7abcd75fabb2e0ec9f74466401f6c119a0b498e27370e9be4c94cb7e382b8ed"},
+    {file = "googleapis_common_protos-1.66.0.tar.gz", hash = "sha256:c3e7b33d15fdca5374cc0a7346dd92ffa847425cc4ea941d970f13680052ec8c"},
 ]
 
 [package.dependencies]
@@ -504,14 +504,14 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
 
 [[package]]
 name = "packaging"
-version = "24.1"
+version = "24.2"
 description = "Core utilities for Python packages"
 category = "dev"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"},
-    {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"},
+    {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"},
+    {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"},
 ]
 
 [[package]]
@@ -757,14 +757,14 @@ files = [
 
 [[package]]
 name = "tomli"
-version = "2.0.2"
+version = "2.1.0"
 description = "A lil' TOML parser"
 category = "dev"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38"},
-    {file = "tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed"},
+    {file = "tomli-2.1.0-py3-none-any.whl", hash = "sha256:a5c57c3d1c56f5ccdf89f6523458f60ef716e210fc47c4cfb188c5ba473e0391"},
+    {file = "tomli-2.1.0.tar.gz", hash = "sha256:3f646cae2aec94e17d04973e4249548320197cfabdf130015d023de4b74d8ab8"},
 ]
 
 [[package]]
diff --git a/testsuite/replay-verify/pyproject.toml b/testsuite/replay-verify/pyproject.toml
index 565365ca6b021b..b61a72ccef8dc0 100644
--- a/testsuite/replay-verify/pyproject.toml
+++ b/testsuite/replay-verify/pyproject.toml
@@ -4,7 +4,6 @@ version = "0.1.0"
 description = ""
 authors = ["Bo Wu "]
 readme = "README.md"
-packages = [{include = "replay_verify"}]
 
 [tool.poetry.dependencies]
 python = "^3.10"
@@ -19,7 +18,3 @@ black = "^24.10.0"
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
-
-
-[tool.poetry.scripts]
-replay-verify = "main:main"
\ No newline at end of file