diff --git a/.github/workflows/chaos-test.json b/.github/workflows/chaos-test.json new file mode 100644 index 000000000000..c85e1edb257e --- /dev/null +++ b/.github/workflows/chaos-test.json @@ -0,0 +1,30 @@ +{ + "pull_request": { + "include": [ + { + "start-block": "101", + "end-block": "200000", + "block-dir-src": "cchain-mainnet-blocks-1m-ldb", + "current-state-dir-src": "cchain-current-state-firewood-100", + "min-wait-time": "120s", + "max-wait-time": "150s", + "runner": "ubuntu-latest", + "timeout-minutes": 60 + } + ] + }, + "schedule": { + "include": [ + { + "start-block": "101", + "end-block": "200000", + "block-dir-src": "cchain-mainnet-blocks-1m-ldb", + "current-state-dir-src": "cchain-current-state-firewood-100", + "min-wait-time": "120s", + "max-wait-time": "150s", + "runner": "ubuntu-latest", + "timeout-minutes": 60 + } + ] + } +} diff --git a/.github/workflows/chaos-test.yml b/.github/workflows/chaos-test.yml new file mode 100644 index 000000000000..b10ad9ca15d8 --- /dev/null +++ b/.github/workflows/chaos-test.yml @@ -0,0 +1,108 @@ +name: Firewood Chaos Test + +on: + workflow_dispatch: + inputs: + start-block: + description: 'The start block for the chaos test.' + default: '' + end-block: + description: 'The end block for the chaos test.' + default: '' + block-dir-src: + description: 'The source block directory. Supports S3 directory/zip and local directories.' + default: '' + current-state-dir-src: + description: 'The current state directory. Supports S3 directory/zip and local directories.' + default: '' + min-wait-time: + description: 'Minimum wait time before killing the process (e.g., 120s, 2m).' + default: '120s' + max-wait-time: + description: 'Maximum wait time before killing the process (e.g., 150s, 3m).' + default: '150s' + runner: + description: 'Runner to execute the chaos test. Input to the runs-on field of the job.' + required: true + timeout-minutes: + description: 'Timeout in minutes for the job.' 
+ default: '60' + # XXX: remove this before merging + pull_request: + schedule: + - cron: '0 9 * * *' # Runs every day at 09:00 UTC (04:00 EST) + +jobs: + define-matrix: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.define-matrix.outputs.matrix }} + steps: + - uses: actions/checkout@v4 + - name: Define Matrix + id: define-matrix + shell: bash -x {0} + run: | + if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + { + echo "matrix<<EOF" + printf '{ "include": [{ "start-block": "%s", "end-block": "%s", "block-dir-src": "%s", "current-state-dir-src": "%s", "min-wait-time": "%s", "max-wait-time": "%s", "runner": "%s", "timeout-minutes": %s }] }\n' \ + "${{ github.event.inputs.start-block }}" \ + "${{ github.event.inputs.end-block }}" \ + "${{ github.event.inputs.block-dir-src }}" \ + "${{ github.event.inputs.current-state-dir-src }}" \ + "${{ github.event.inputs.min-wait-time }}" \ + "${{ github.event.inputs.max-wait-time }}" \ + "${{ github.event.inputs.runner }}" \ + "${{ github.event.inputs.timeout-minutes }}" + echo EOF + } >> "$GITHUB_OUTPUT" + else + json_string=$(jq -r ".\"${{ github.event_name }}\"" .github/workflows/chaos-test.json) + { + echo "matrix<<EOF" + echo "$json_string" + echo EOF + } >> "$GITHUB_OUTPUT" + fi + + firewood-chaos-test: + needs: define-matrix + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.define-matrix.outputs.matrix) }} + timeout-minutes: ${{ matrix.timeout-minutes }} + runs-on: ${{ matrix.runner }} + permissions: + id-token: write + contents: read + steps: + - uses: cachix/install-nix-action@02a151ada4993995686f9ed4f1be7cfbb229e56f #v31 + with: + github_access_token: ${{ secrets.GITHUB_TOKEN }} + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_S3_READ_ONLY_ROLE }} + aws-region: 'us-east-2' + role-duration-seconds: '43200' + - uses: actions/checkout@v4 + - name: Set task env + shell: bash + run: | + TIMESTAMP=$(date '+%Y%m%d-%H%M%S') + echo "EXECUTION_DATA_DIR=/tmp/reexecution-data-${TIMESTAMP}" >> "$GITHUB_ENV" + - name: Run chaos test with Firewood + shell: nix develop --impure --command bash -x {0} + run: | + TIMESTAMP=$(date +%s) + EXECUTION_DATA_DIR="/tmp/reexecution-data-${TIMESTAMP}" + ./scripts/run_task.sh test-firewood-chaos-with-copied-data \ + START_BLOCK=${{ matrix.start-block }} \ + END_BLOCK=${{ matrix.end-block }} \ + BLOCK_DIR_SRC=${{ matrix.block-dir-src }} \ + EXECUTION_DATA_DIR=$EXECUTION_DATA_DIR \ + CURRENT_STATE_DIR_SRC=${{ matrix.current-state-dir-src }} \ + MIN_WAIT_TIME=${{ matrix.min-wait-time }} \ + MAX_WAIT_TIME=${{ matrix.max-wait-time 
}} + diff --git a/Taskfile.yml b/Taskfile.yml index 230daca54dd0..42a7fc0cab16 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -394,6 +394,45 @@ tasks: E2E_SERIAL: 1 cmds: - cmd: bash -x ./scripts/tests.e2e.kube.sh --ginkgo.focus-file=xsvm.go {{.CLI_ARGS}} + + test-firewood-chaos: + desc: Chaos test during reexecution test with Firewood + vars: + START_BLOCK: '{{.START_BLOCK}}' + END_BLOCK: '{{.END_BLOCK}}' + CURRENT_STATE_DIR: '{{.CURRENT_STATE_DIR}}' + BLOCK_DIR: '{{.BLOCK_DIR}}' + MIN_WAIT_TIME: '{{.MIN_WAIT_TIME}}' + MAX_WAIT_TIME: '{{.MAX_WAIT_TIME}}' + cmd: go run ./tests/reexecute/chaos --start-block={{.START_BLOCK}} + --end-block={{.END_BLOCK}} --current-state-dir={{.CURRENT_STATE_DIR}} + --block-dir={{.BLOCK_DIR}} --min-wait-time={{.MIN_WAIT_TIME}} + --max-wait-time={{.MAX_WAIT_TIME}} + + test-firewood-chaos-with-copied-data: + desc: Combines import-cchain-reexecute-range and test-firewood-chaos + vars: + START_BLOCK: '{{.START_BLOCK}}' + END_BLOCK: '{{.END_BLOCK}}' + CURRENT_STATE_DIR_SRC: '{{.CURRENT_STATE_DIR_SRC}}' + BLOCK_DIR_SRC: '{{.BLOCK_DIR_SRC}}' + EXECUTION_DATA_DIR: '{{.EXECUTION_DATA_DIR}}' + MIN_WAIT_TIME: '{{.MIN_WAIT_TIME}}' + MAX_WAIT_TIME: '{{.MAX_WAIT_TIME}}' + cmds: + - task: import-cchain-reexecute-range + vars: + BLOCK_DIR_SRC: '{{.S3_BOOTSTRAP_BUCKET}}/{{.BLOCK_DIR_SRC}}/**' + CURRENT_STATE_DIR_SRC: '{{.S3_BOOTSTRAP_BUCKET}}/{{.CURRENT_STATE_DIR_SRC}}/**' + EXECUTION_DATA_DIR: '{{.EXECUTION_DATA_DIR}}' + - task: test-firewood-chaos + vars: + START_BLOCK: '{{.START_BLOCK}}' + END_BLOCK: '{{.END_BLOCK}}' + CURRENT_STATE_DIR: '{{.EXECUTION_DATA_DIR}}/current-state' + BLOCK_DIR: '{{.EXECUTION_DATA_DIR}}/blocks' + MIN_WAIT_TIME: '{{.MIN_WAIT_TIME}}' + MAX_WAIT_TIME: '{{.MAX_WAIT_TIME}}' # To use a different fuzz time, run `task test-fuzz FUZZTIME=[value in seconds]`. # A value of `-1` will run until it encounters a failing output. 
diff --git a/tests/reexecute/chaos/main.go b/tests/reexecute/chaos/main.go new file mode 100644 index 000000000000..3edaf6c53433 --- /dev/null +++ b/tests/reexecute/chaos/main.go @@ -0,0 +1,227 @@ +// Copyright (C) 2019-2025, Ava Labs, Inc. All rights reserved. +// See the file LICENSE for licensing terms. + +package main + +import ( + "context" + "flag" + "fmt" + "math/rand" + "os" + "os/exec" + "path/filepath" + "strconv" + "syscall" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/ava-labs/avalanchego/api/metrics" + "github.com/ava-labs/avalanchego/database" + "github.com/ava-labs/avalanchego/database/leveldb" + "github.com/ava-labs/avalanchego/graft/coreth/plugin/evm" + "github.com/ava-labs/avalanchego/tests" + "github.com/ava-labs/avalanchego/tests/reexecute" + "github.com/ava-labs/avalanchego/utils/logging" +) + +var ( + blockDirArg string + currentStateDirArg string + startBlockArg uint64 + endBlockArg uint64 + minWaitTimeArg time.Duration + maxWaitTimeArg time.Duration + + firewoodConfig = `{ + "state-scheme": "firewood", + "snapshot-cache": 0, + "pruning-enabled": true, + "state-sync-enabled": false + }` +) + +func init() { + evm.RegisterAllLibEVMExtras() + + flag.StringVar(&blockDirArg, "block-dir", blockDirArg, "Block DB directory to read from during re-execution.") + flag.StringVar(¤tStateDirArg, "current-state-dir", currentStateDirArg, "Current state directory including VM DB and Chain Data Directory for re-execution.") + flag.Uint64Var(&startBlockArg, "start-block", 101, "Start block to begin execution (exclusive).") + flag.Uint64Var(&endBlockArg, "end-block", 200, "End block to end execution (inclusive).") + flag.DurationVar(&minWaitTimeArg, "min-wait-time", 20*time.Second, "Minimum amount of time to wait before crashing.") + flag.DurationVar(&maxWaitTimeArg, "max-wait-time", 30*time.Second, "Maximum amount of time to wait before crashing.") + + flag.Parse() +} + 
+func main() { + tc := tests.NewTestContext(tests.NewDefaultLogger("chaos-test")) + tc.SetDefaultContextParent(context.Background()) + tc.RecoverAndExit() + + run( + tc, + minWaitTimeArg, + maxWaitTimeArg, + blockDirArg, + currentStateDirArg, + startBlockArg, + endBlockArg, + ) +} + +// run executes a chaos test that simulates an application crash during C-Chain +// block reexecution that uses Firewood. It verifies that the VM can recover from +// an unexpected termination and resume processing from the correct block height +// using persisted state. +// +// Running the chaos test involves a few steps: +// 1. Start a reexecution test process using the Firewood state scheme +// 2. Allow the reexecution test to run for the specified wait duration +// 3. Forcefully terminate the process with SIGKILL to simulate a crash +// 4. Open the VM database to read the last accepted block height from persisted state +// 5. Restart the reexecution test from the recovered height to verify state consistency +func run( + tc tests.TestContext, + minWaitTime time.Duration, + maxWaitTime time.Duration, + blockDir string, + currentStateDir string, + startBlock uint64, + endBlock uint64, +) { + r := require.New(tc) + log := tc.Log() + + cmd := createReexecutionCmd(blockDir, currentStateDir, startBlock, endBlock) + // Set process group ID so we can kill all child processes + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + // 1. Start a reexecution test process using the Firewood state scheme + r.NoError(cmd.Start()) + + done := make(chan error, 1) + go func() { + done <- cmd.Wait() + }() + + // 2. Allow the reexecution test to run for the specified wait duration + waitTime := time.Duration(rand.Int63n(int64(maxWaitTime-minWaitTime)+1)) + minWaitTime + log.Debug("started reexecution test", zap.Duration("wait time", waitTime)) + + time.Sleep(waitTime) + + // 3. 
Forcefully terminate the process with SIGKILL to simulate a crash + select { + case waitErr := <-done: + r.FailNow("reexecution test terminated prior to crash test", zap.Error(waitErr)) + default: + pgid, err := syscall.Getpgid(cmd.Process.Pid) + r.NoError(err) + + log.Debug("killing reexecution test") + + r.NoError(syscall.Kill(-pgid, syscall.SIGKILL)) + + waitCtx := tc.DefaultContext() + + var waitErr error + select { + case err := <-done: + waitErr = err + case <-waitCtx.Done(): + r.FailNow("timed out waiting for killed process to terminate") + } + + exitErr, ok := waitErr.(*exec.ExitError) + r.True(ok) + + // ExitCode() returns -1 when killed by signal + r.Equal(-1, exitErr.ProcessState.ExitCode(), "unexpected exit code after kill") + } + + var ( + vmDBDir = filepath.Join(currentStateDir, "db") + chainDataDir = filepath.Join(currentStateDir, "chain-data-dir") + ) + + // 4. Open the VM database to read the last accepted block height from persisted state + db, err := openDB(vmDBDir, 10) + r.NoError(err) + + ctx := tc.GetDefaultContextParent() + vm, err := reexecute.NewMainnetCChainVM( + ctx, + db, + chainDataDir, + []byte(firewoodConfig), + metrics.NewPrefixGatherer(), + prometheus.NewRegistry(), + ) + r.NoError(err) + + lastAcceptedID, err := vm.LastAccepted(ctx) + r.NoError(err) + + lastAcceptedBlock, err := vm.GetBlock(ctx, lastAcceptedID) + r.NoError(err) + + r.NoError(vm.Shutdown(ctx)) + r.NoError(db.Close()) + + log.Debug("read VM", zap.Uint64("latest height", lastAcceptedBlock.Height())) + + cmd = createReexecutionCmd(blockDir, currentStateDir, lastAcceptedBlock.Height()+1, endBlock) + + // 5. Restart the reexecution test from the recovered height to verify state consistency + r.NoError(cmd.Run()) +} + +// openDB attempts to open a LevelDB database with retry logic and linear backoff. +// This is necessary after killing a process that held the database open, as the OS may +// need time to release file locks even after the process terminates. 
+// +// The backoff strategy increases by 500ms per attempt (500ms, 1s, 1.5s, 2s, ...). +func openDB(dbDir string, maxAttempts int) (database.Database, error) { + attempt := 0 + for { + db, err := leveldb.New(dbDir, nil, logging.NoLog{}, prometheus.NewRegistry()) + if err == nil { + return db, nil + } + + attempt += 1 + if attempt == maxAttempts { + return nil, fmt.Errorf("failed to reopen db after %d attempts: %w", maxAttempts, err) + } + + backoff := time.Duration(attempt) * 500 * time.Millisecond + time.Sleep(backoff) + } +} + +// createReexecutionCmd constructs a command to run the C-Chain reexecution test. +func createReexecutionCmd( + blockDir string, + currentStateDir string, + startBlock uint64, + endBlock uint64, +) *exec.Cmd { + cmd := exec.Command("go", + "run", + "github.com/ava-labs/avalanchego/tests/reexecute/c", + "--config=firewood", + "--block-dir="+blockDir, + "--current-state-dir="+currentStateDir, + "--start-block="+strconv.Itoa(int(startBlock)), + "--end-block="+strconv.Itoa(int(endBlock)), + ) + + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + return cmd +}