diff --git a/.github/workflows/ci3.yml b/.github/workflows/ci3.yml
index e8eef4a05774..901dab2ec488 100644
--- a/.github/workflows/ci3.yml
+++ b/.github/workflows/ci3.yml
@@ -36,7 +36,7 @@ jobs:
     if: github.event.pull_request.head.repo.fork != true
     environment: ${{ startsWith(github.ref, 'refs/tags/v') && 'master' || '' }}
     strategy:
-      fail-fast: false
+      fail-fast: true
       matrix:
         # Only run arm64 build with arm64-ci label or on master or on tagged releases.
         # The way to do conditions here is to parse full strings as JSON.
@@ -104,8 +104,7 @@ jobs:
           EXTERNAL_ETHEREUM_CONSENSUS_HOST: "https://beacon.${{ secrets.GCP_SEPOLIA_URL }}"
           EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY: ${{ secrets.GCP_SEPOLIA_API_KEY }}
           EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY_HEADER: "X-goog-api-key"
-        run: |
-          ./ci.sh ec2
+        run: exec ./ci.sh ec2
 
       - name: Download benchmarks
         if: matrix.settings.arch == 'amd64' && github.event_name == 'push' && github.ref_name == 'master'
@@ -184,7 +183,7 @@ jobs:
     strategy:
       matrix:
         number: [1, 2]
-      fail-fast: false
+      fail-fast: true
     steps:
       #############
       # Prepare Env
@@ -217,8 +216,7 @@ jobs:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
           CI: 1
           CI_FULL: 1
-        run: |
-          ./ci.sh ec2-test
+        run: exec ./ci.sh ec2-test
 
   # Necessary as github actions won't allow checks on matrix builds.
   merge-check:
diff --git a/.test_patterns.yml b/.test_patterns.yml
index c1563e5bc8a2..d0c21d936fc1 100644
--- a/.test_patterns.yml
+++ b/.test_patterns.yml
@@ -53,6 +53,10 @@ tests:
     skip: true
     owners:
       - *tom
+  - regex: "tests::execution_success::test_ram_blowup_regression"
+    error_regex: "assertion `left == right` failed"
+    owners:
+      - *tom
 
   # noir
   # Something to do with how I run the tests now. Think these are fine in nextest.
@@ -124,6 +128,10 @@ tests:
     error_regex: "BlockOutOfRangeError"
     owners:
       - *palla
+  - regex: "simple e2e_sequencer_config"
+    error_regex: "Anvil failed to stop in time"
+    owners:
+      - *palla
 
   # yarn-project tests
   - regex: "p2p/src/services/discv5/discv5_service.test.ts"
@@ -134,6 +142,10 @@ tests:
     error_regex: "Exceeded timeout of 120000"
     owners:
       - *sean
+  - regex: "p2p/src/services/reqresp/reqresp.test.ts"
+    error_regex: "✕ should stop after max retry attempts"
+    owners:
+      - *sean
   - regex: "p2p/src/testbench/port_change.test.ts"
     error_regex: "Timeout waiting for worker"
     owners:
diff --git a/ci.sh b/ci.sh
index 0bee34465d40..43608550e96b 100755
--- a/ci.sh
+++ b/ci.sh
@@ -72,23 +72,23 @@ case "$cmd" in
   "ec2")
     # Spin up ec2 instance and ci bootstrap with shell on failure.
     export USE_TEST_CACHE=1
-    bootstrap_ec2
+    exec bootstrap_ec2
     ;;
   "ec2-no-cache")
     # Disable the build and test cache.
     export NO_CACHE=1
     export USE_TEST_CACHE=0
-    bootstrap_ec2
+    exec bootstrap_ec2
     ;;
   "ec2-test")
     # Can use the build cache, but don't use the test cache.
     export USE_TEST_CACHE=0
-    bootstrap_ec2
+    exec bootstrap_ec2
     ;;
   "ec2-shell")
     # Spin up ec2 instance, clone, and drop into shell.
     # False triggers the shell on fail.
-    bootstrap_ec2 "false"
+    exec bootstrap_ec2 "false"
     ;;
   "ec2-grind")
     # Same as ec2-test but repeat it over arg1 instances.
diff --git a/ci3/bootstrap_ec2 b/ci3/bootstrap_ec2
index f1471a718434..b46e6e0ce7bf 100755
--- a/ci3/bootstrap_ec2
+++ b/ci3/bootstrap_ec2
@@ -15,7 +15,8 @@ if [ "$arch" == "arm64" ]; then
   export AWS_SHUTDOWN_TIME=90
 else
   if [ "$CI_FULL" -eq 1 ]; then
-    cores=192,128,64
+    # cores=192,128,64
+    cores=128,64
   else
     cores=128,64
   fi
@@ -27,12 +28,17 @@ cores=${CPUS:-$cores}
 # Trap function to terminate our running instance when the script exits.
 function on_exit {
   set +e
-  if [ "$NO_TERMINATE" -eq 0 ]; then
-    aws_terminate_instance $iid $sir
-  else
-    echo "Remote machine not terminated, connect with: ARCH=$arch ./ci.sh shell"
+  # The trap is installed before iid is assigned, so expand it with a default.
+  if [ -n "${iid:-}" ]; then
+    if [ "$NO_TERMINATE" -eq 0 ]; then
+      aws_terminate_instance $iid $sir
+    else
+      echo "Remote machine not terminated, connect with: ARCH=$arch ./ci.sh shell"
+    fi
+    iid=""
   fi
 }
+trap on_exit SIGINT SIGTERM EXIT
 
 # Verify that the commit exists on the remote. It will be the remote tip of itself if so.
 current_commit=$(git rev-parse HEAD)
@@ -76,7 +82,6 @@ IFS=':' read -r -a parts <<< "$ip_sir"
 ip="${parts[0]}"
 sir="${parts[1]}"
 iid="${parts[2]}"
-trap on_exit EXIT
 
 # If AWS credentials are not set, try to load them from ~/.aws/build_instance_credentials.
 if [ -z "${AWS_ACCESS_KEY_ID:-}" ] || [ -z "${AWS_SECRET_ACCESS_KEY:-}" ]; then
@@ -181,72 +186,85 @@ EOF
 # We set SSH_CONNECTION to something to ensure the hostname is shown in the lean prompt.
 # We provide the host user and group ids to the entrypoint script to ensure alignment.
 # We raise the default pid limit to 32k.
-set +e
-ssh ${ssh_args:-} -F $ci3/aws/build_instance_ssh_config ubuntu@$ip "
-  # TODO: This should *not* be needed in a CI run. Remove "watching" code, e.g. in boxes.
-  sudo sysctl fs.inotify.max_user_watches=1048576 &>/dev/null
-  sudo sysctl fs.inotify.max_user_instances=1048576 &>/dev/null
+function run {
+  ssh ${ssh_args:-} -F $ci3/aws/build_instance_ssh_config ubuntu@$ip "
+    # TODO: This should *not* be needed in a CI run. Remove "watching" code, e.g. in boxes.
+    sudo sysctl fs.inotify.max_user_watches=1048576 &>/dev/null
+    sudo sysctl fs.inotify.max_user_instances=1048576 &>/dev/null
+
+    echo Loading CRS into tmpfs...
+    sudo mkdir /mnt/bb-crs
+    sudo mount -t tmpfs -o size=3G tmpfs /mnt/bb-crs
+    sudo cp -r \$HOME/.bb-crs/* /mnt/bb-crs
+    echo Done in \$SECONDS seconds.
 
-  echo Loading CRS into tmpfs...
-  sudo mkdir /mnt/bb-crs
-  sudo mount -t tmpfs -o size=3G tmpfs /mnt/bb-crs
-  sudo cp -r \$HOME/.bb-crs/* /mnt/bb-crs
-  echo Done in \$SECONDS seconds.
+    echo Installing and starting sysdig...
+    sudo apt-get update &>/dev/null
+    sudo apt-get install -y sysdig &>/dev/null
+    sudo sysdig -p '%evt.time (cid: %container.id) (event: %evt.dir %evt.type) (args: %evt.args): %proc.cmdline' 'evt.type=bind or evt.type=listen' > /tmp/netfile &
+    netfile_pid=\$!
+    # Capture cpu load.
+    mpstat 2 &> /tmp/cpufile &
+    cpufile_pid=\$!
+    # Capture mem load.
+    vmstat -w -S M 2 &> /tmp/memfile &
+    memfile_pid=\$!
+    trap 'sudo kill \$netfile_pid \$cpufile_pid \$memfile_pid' EXIT
 
-  echo Installing and starting sysdig...
-  sudo apt-get update &>/dev/null
-  sudo apt-get install -y sysdig &>/dev/null
-  sudo sysdig -p '%evt.time (cid: %container.id) (event: %evt.dir %evt.type) (args: %evt.args): %proc.cmdline' 'evt.type=bind or evt.type=listen' > /tmp/netfile &
-  netfile_pid=\$!
-  # Capture cpu load.
-  mpstat 2 &> /tmp/cpufile &
-  cpufile_pid=\$!
-  # Capture mem load.
-  vmstat -w -S M 2 &> /tmp/memfile &
-  memfile_pid=\$!
-  trap 'sudo kill \$netfile_pid \$cpufile_pid \$memfile_pid' EXIT
+    echo Starting devbox...
+    docker run --privileged ${docker_args:-} \
+      --name aztec_build \
+      --hostname $instance_name \
+      -v bootstrap_ci_local_docker:/var/lib/docker \
+      -v bootstrap_ci_repo:/home/aztec-dev/aztec-packages \
+      -v \$HOME/.aws:/home/aztec-dev/.aws:ro \
+      -v /mnt/bb-crs:/home/aztec-dev/.bb-crs:ro \
+      -v /tmp:/tmp \
+      -v /dev/kmsg:/dev/kmsg \
+      -e RUN_ID=${RUN_ID:-} \
+      -e JOB_ID=${JOB_ID:-} \
+      -e NO_CACHE=${NO_CACHE:-} \
+      -e USE_TEST_CACHE=${USE_TEST_CACHE:-1} \
+      -e CI_REDIS='ci-redis-tiered.lzka0i.ng.0001.use2.cache.amazonaws.com' \
+      -e SSH_CONNECTION=' ' \
+      -e LOCAL_USER_ID=\$(id -u) \
+      -e LOCAL_GROUP_ID=\$(id -g) \
+      -e CI=$CI \
+      -e CI_FULL=$CI_FULL \
+      -e CI_NIGHTLY=${CI_NIGHTLY:-0} \
+      -e EXTERNAL_ETHEREUM_HOSTS=${EXTERNAL_ETHEREUM_HOSTS:-} \
+      -e EXTERNAL_ETHEREUM_CONSENSUS_HOST=${EXTERNAL_ETHEREUM_CONSENSUS_HOST:-} \
+      -e EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY=${EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY:-} \
+      -e EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY_HEADER=${EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY_HEADER:-} \
+      -e L1_DEPLOYMENT_PRIVATE_KEY=${L1_DEPLOYMENT_PRIVATE_KEY:-} \
+      -e DRY_RUN=${DRY_RUN:-0} \
+      -e DOCKERHUB_PASSWORD=${DOCKERHUB_PASSWORD:-} \
+      -e AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-} \
+      -e AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-} \
+      -e BUILD_SYSTEM_DEBUG=${BUILD_SYSTEM_DEBUG:-} \
+      -e GITHUB_TOKEN=${GITHUB_TOKEN:-} \
+      -e NETLIFY_SITE_ID=${NETLIFY_SITE_ID:-} \
+      -e NETLIFY_AUTH_TOKEN=${NETLIFY_AUTH_TOKEN:-} \
+      -e NPM_TOKEN=${NPM_TOKEN:-} \
+      -e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN:-} \
+      -e REF_NAME=${REF_NAME:-} \
+      -e AWS_TOKEN=\$(curl -sX PUT http://169.254.169.254/latest/api/token -H 'X-aws-ec2-metadata-token-ttl-seconds: 21600') \
+      --pids-limit=32768 \
+      aztecprotocol/devbox:3.0 bash -c $(printf '%q' "$container_script")
+  "
+}
+
+set +e
+# If in terminal run in foreground.
+# If not, run in background so we can handle the signals in a timely fashion, and wait for it to finish.
+if [ -t 0 ]; then
+  run
+else
+  echo "No tty, running in background..."
+  run &
+  wait $!
+fi
 
-  echo Starting devbox...
-  docker run --privileged ${docker_args:-} \
-    --name aztec_build \
-    --hostname $instance_name \
-    -v bootstrap_ci_local_docker:/var/lib/docker \
-    -v bootstrap_ci_repo:/home/aztec-dev/aztec-packages \
-    -v \$HOME/.aws:/home/aztec-dev/.aws:ro \
-    -v /mnt/bb-crs:/home/aztec-dev/.bb-crs:ro \
-    -v /tmp:/tmp \
-    -v /dev/kmsg:/dev/kmsg \
-    -e RUN_ID=${RUN_ID:-} \
-    -e JOB_ID=${JOB_ID:-} \
-    -e NO_CACHE=${NO_CACHE:-} \
-    -e USE_TEST_CACHE=${USE_TEST_CACHE:-1} \
-    -e CI_REDIS='ci-redis-tiered.lzka0i.ng.0001.use2.cache.amazonaws.com' \
-    -e SSH_CONNECTION=' ' \
-    -e LOCAL_USER_ID=\$(id -u) \
-    -e LOCAL_GROUP_ID=\$(id -g) \
-    -e CI=$CI \
-    -e CI_FULL=$CI_FULL \
-    -e CI_NIGHTLY=${CI_NIGHTLY:-0} \
-    -e EXTERNAL_ETHEREUM_HOSTS=${EXTERNAL_ETHEREUM_HOSTS:-} \
-    -e EXTERNAL_ETHEREUM_CONSENSUS_HOST=${EXTERNAL_ETHEREUM_CONSENSUS_HOST:-} \
-    -e EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY=${EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY:-} \
-    -e EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY_HEADER=${EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY_HEADER:-} \
-    -e L1_DEPLOYMENT_PRIVATE_KEY=${L1_DEPLOYMENT_PRIVATE_KEY:-} \
-    -e DRY_RUN=${DRY_RUN:-0} \
-    -e DOCKERHUB_PASSWORD=${DOCKERHUB_PASSWORD:-} \
-    -e AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-} \
-    -e AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-} \
-    -e BUILD_SYSTEM_DEBUG=${BUILD_SYSTEM_DEBUG:-} \
-    -e GITHUB_TOKEN=${GITHUB_TOKEN:-} \
-    -e NETLIFY_SITE_ID=${NETLIFY_SITE_ID:-} \
-    -e NETLIFY_AUTH_TOKEN=${NETLIFY_AUTH_TOKEN:-} \
-    -e NPM_TOKEN=${NPM_TOKEN:-} \
-    -e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN:-} \
-    -e REF_NAME=${REF_NAME:-} \
-    -e AWS_TOKEN=\$(curl -sX PUT http://169.254.169.254/latest/api/token -H 'X-aws-ec2-metadata-token-ttl-seconds: 21600') \
-    --pids-limit=32768 \
-    aztecprotocol/devbox:3.0 bash -c $(printf '%q' "$container_script")
-"
 code=$?
 set -e
 echo "SSH exited with code: $code"
diff --git a/ci3/parallelise b/ci3/parallelise
index 8554aff54d1a..33a2ed5185d5 100755
--- a/ci3/parallelise
+++ b/ci3/parallelise
@@ -7,8 +7,7 @@ cd $root
 jobs=$(get_num_cpus_max ${1:-})
 parallel_args="-j$jobs --memsuspend $(memsuspend_limit) --line-buffer --joblog joblog.txt"
 
-# If not in CI, fail fast.
-if [ "$CI" -eq 0 ]; then
+if [ "${NO_FAIL_FAST:-0}" -eq 0 ]; then
   parallel_args+=" --halt now,fail=1"
 fi
 
diff --git a/ci3/run_test_cmd b/ci3/run_test_cmd
index 9103e9188fab..f2d739c860ea 100755
--- a/ci3/run_test_cmd
+++ b/ci3/run_test_cmd
@@ -85,6 +85,11 @@ trap "kill -- -$timeout_pid &>/dev/null; exit" SIGTERM SIGINT
 wait $timeout_pid
 code=$?
 
+# If the test received a SIGTERM or SIGINT, we don't want to track or print anything.
+if [ "$code" -eq 143 ] || [ "$code" -eq 130 ]; then
+  exit $code
+fi
+
 if [ "$CI_REDIS_AVAILABLE" -eq 1 ]; then
   # If the test succeeded and we're using the test cache, set success flag for test. This key is unique to the test.
   # If the test succeeded and we're in CI, save the test log.