diff --git a/.github/workflows/main.yml b/.github/workflows/build_test.yml similarity index 80% rename from .github/workflows/main.yml rename to .github/workflows/build_test.yml index 4afd19b6e6..5130be682a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/build_test.yml @@ -1,17 +1,16 @@ -name: Pull Request Tests +name: Pull request tests on: push: - branches: - - develop - pull_request: - branches: - - develop + branches: ['develop'] + pull_request_review: + types: [submitted] + branches: ['develop'] jobs: setup: name: Set up - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 outputs: tn: ${{ steps.parse.outputs.tn }} @@ -46,7 +45,8 @@ jobs: build: name: Build (${{ matrix.bld_set }}) needs: setup - runs-on: ubuntu-latest + if: github.event_name == 'push' || (github.event.review.state == 'approved' && toJson(github.event.pull_request.requested_reviewers) == '[]') + runs-on: ubuntu-20.04 strategy: fail-fast: false @@ -85,8 +85,7 @@ jobs: utest: name: Unit test (${{ needs.setup.outputs.tn }}, ${{ matrix.test_set }}) needs: [setup,build] - runs-on: ubuntu-latest - #runs-on: self-hosted + runs-on: self-hosted strategy: fail-fast: false @@ -101,7 +100,7 @@ jobs: - name: Prepare artifacts run: | tar xvjf artifact.tar.bz2 && rm -f artifact.tar.bz2 - sudo docker load --input ${{ needs.setup.outputs.img }}.tar.gz && rm -f ${{ needs.setup.outputs.img }}.tar.gz + docker load --input ${{ needs.setup.outputs.img }}.tar.gz && rm -f ${{ needs.setup.outputs.img }}.tar.gz - name: Run utest run: ./ci.sh -n ${{ needs.setup.outputs.tn }} -r ${{ matrix.test_set }} @@ -117,5 +116,7 @@ jobs: if: ${{ always() }} run: | rm -f ci.sh ci.test - sudo docker rm my-container && sudo docker rmi ${{ needs.setup.outputs.img }}:latest - sudo docker volume rm DataVolume + docker stop my-container && docker rm my-container && docker rmi ${{ needs.setup.outputs.img }}:latest + docker volume rm DataVolume + #docker rmi minsukjinoaa/fv3-input-data:input-data-20210115 + rm -f memory_stat diff --git a/.github/workflows/manage.yml b/.github/workflows/manage_workflows.yml similarity index 94% rename from .github/workflows/manage.yml rename to .github/workflows/manage_workflows.yml index c933b36e64..57a1552ddd 100644 --- a/.github/workflows/manage.yml +++ b/.github/workflows/manage_workflows.yml @@ -2,14 +2,14 @@ name: Manage workflows on: workflow_run: - workflows: ["Pull Request Tests"] + workflows: ["Pull request tests"] types: - requested jobs: job1: - name: Job 1 - runs-on: ubuntu-latest + name: Cancel workflows + runs-on: ubuntu-20.04 steps: - name: Checkout codes @@ -17,6 +17,7 @@ jobs: - name: Check if skip-ci is requested run: | + sleep 40 cd ${GITHUB_WORKSPACE}/tests/ci repo="${GITHUB_API_URL}/repos/${GITHUB_REPOSITORY}/actions/runs" tr_id=$(cat ${GITHUB_EVENT_PATH} | ./json_helper.py get_trigger_id) diff --git a/.github/workflows/start_runners.yml b/.github/workflows/start_runners.yml new file mode 100644 index 0000000000..423315de58 --- /dev/null +++ b/.github/workflows/start_runners.yml @@ -0,0 +1,33 @@ +name: Start runners + +on: + workflow_run: + workflows: ["Pull request tests"] + types: + - requested + +jobs: + job1: + name: Start AWS runners + runs-on: ubuntu-20.04 + + steps: + - name: Check out codes + uses: actions/checkout@v2 + + - name: Configure AWS + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Check the status of build and start self-hosted runners + env: + aws_instance_id: ${{ secrets.AWS_INSTANCE_ID }} + run: | + cd ${GITHUB_WORKSPACE}/tests/ci + conclusion=$(cat ${GITHUB_EVENT_PATH} | ./build_status_check.py) + if [[ $conclusion == "success" ]]; then + aws ec2 start-instances --instance-ids $aws_instance_id + fi diff --git a/.github/workflows/stop_runners.yml b/.github/workflows/stop_runners.yml new file mode 100644 index 0000000000..f3497318c3 --- /dev/null +++ b/.github/workflows/stop_runners.yml @@ -0,0 +1,25 @@ +name: Stop runners + +on: + workflow_run: + workflows: ["Pull request tests"] + types: + - completed + +jobs: + job1: + name: Stop AWS runners + runs-on: ubuntu-20.04 + + steps: + - name: Configure AWS + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Stop self-hosted runners + env: + aws_instance_id: ${{ secrets.AWS_INSTANCE_ID }} + run: aws ec2 stop-instances --instance-ids $aws_instance_id diff --git a/tests/ci/build_status_check.py b/tests/ci/build_status_check.py new file mode 100755 index 0000000000..0f84cc38f1 --- /dev/null +++ b/tests/ci/build_status_check.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 + +import re +import sys +import json +import time +from urllib.request import urlopen + +def update_url_data(response): + data = json.loads(response.read().decode()) + indices=[] + for n in range(data["total_count"]): + if re.search("Build", data["jobs"][n]["name"]): + indices.append(n) + + if len(indices) == 0: + raise ValueError("No build job exists.") + + return data, indices + +def main(): + + time.sleep(40) + url = json.load(sys.stdin)["workflow_run"]["jobs_url"] + + status="not-completed" + no_completed_jobs = 0 + + while status != "completed": + response = urlopen(url) + data, indices = update_url_data(response) + + for i in indices: + if data["jobs"][i]["status"] == "completed": + no_completed_jobs += 1 + + if no_completed_jobs == len(indices): + status = "completed" + else: + no_completed_jobs = 0 + time.sleep(40) + + time.sleep(40) + conclusion="failure" + no_successful_jobs = 0 + for i in indices: + if data["jobs"][i]["conclusion"] == "success": + no_successful_jobs += 1 + + if no_successful_jobs == len(indices): + conclusion = "success" + + print(conclusion) + +if __name__ == "__main__": main() diff --git a/tests/ci/ci.sh b/tests/ci/ci.sh index f9af49440c..bcb90f2aec 100755 --- a/tests/ci/ci.sh +++ b/tests/ci/ci.sh @@ -72,15 +72,15 @@ if [ $BUILD = "true" ]; then elif [ $RUN == "true" ]; then - sudo docker run -d --rm -v DataVolume:/tmp minsukjinoaa/fv3-input-data:input-data-20210115 - sudo docker run -d -e test_case=${TEST_CASE} -v DataVolume:/home/builder/data/NEMSfv3gfs/input-data-20210115 --name my-container ${IMG_NAME} + docker run -d --rm -v DataVolume:/tmp minsukjinoaa/fv3-input-data:input-data-20210115 + docker run -d -e test_case=${TEST_CASE} --shm-size=512m -v DataVolume:/home/builder/data/NEMSfv3gfs/input-data-20210115 --name my-container ${IMG_NAME} echo 'cache,rss,shmem' >memory_stat sleep 3 - containerID=$(sudo docker ps -q --no-trunc) + containerID=$(docker ps -q --no-trunc) check_memory_usage $containerID >>memory_stat & - sudo docker logs -f $containerID - exit $(sudo docker inspect $containerID --format='{{.State.ExitCode}}') + docker logs -f $containerID + exit $(docker inspect $containerID --format='{{.State.ExitCode}}') fi diff --git a/tests/ci/ci.test b/tests/ci/ci.test index c414f778c2..c1ebb3278b 100644 --- a/tests/ci/ci.test +++ b/tests/ci/ci.test @@ -1,3 +1,3 @@ fv3_ccpp_control -rst bit +thr mpi dcp rst bit dbg ci-test-weather diff --git a/tests/ci/json_helper.py b/tests/ci/json_helper.py index cc4fd4f32c..c27c6df7f5 100755 --- a/tests/ci/json_helper.py +++ b/tests/ci/json_helper.py @@ -19,6 +19,7 @@ def cancel_workflow(data): x["id"]!=int(os.environ["GITHUB_RUN_ID"]) and x["id"]!=int(os.environ["TRIGGER_ID"]) and x["head_branch"]==os.environ["TRIGGER_BR"] and + x["event"]!="workflow_run" and (x["status"]=="queued" or x["status"]=="in_progress")] return wfs diff --git a/tests/default_vars.sh b/tests/default_vars.sh index 3820f68695..715dc63b59 100755 --- a/tests/default_vars.sh +++ b/tests/default_vars.sh @@ -146,7 +146,7 @@ elif [[ $MACHINE_ID = hera.* ]]; then elif [[ $MACHINE_ID = linux.* ]]; then if [[ $CI_TEST = true ]]; then - TASKS_dflt=12 ; TPN_dflt=16 ; INPES_dflt=1 ; JNPES_dflt=1 + TASKS_dflt=42 ; TPN_dflt=48 ; INPES_dflt=3 ; JNPES_dflt=2 else TASKS_dflt=150 ; TPN_dflt=40 ; INPES_dflt=3 ; JNPES_dflt=8 fi diff --git a/tests/run_test.sh b/tests/run_test.sh index f26377efb8..7b6fcea244 100755 --- a/tests/run_test.sh +++ b/tests/run_test.sh @@ -150,7 +150,11 @@ atparse < ${PATHRT}/parm/${NEMS_CONFIGURE:-nems.configure} > nems.configure if [[ $SCHEDULER = 'none' ]]; then ulimit -s unlimited - mpiexec -n ${TASKS} ./fv3.exe >out 2> >(tee err >&3) + if [[ $CI_TEST = 'true' ]]; then + eval mpiexec -n ${TASKS} ${MPI_PROC_BIND} ./fv3.exe >out 2> >(tee err >&3) + else + mpiexec -n ${TASKS} ./fv3.exe >out 2> >(tee err >&3) + fi else diff --git a/tests/utest b/tests/utest index e9dcc83413..6dd1394290 100755 --- a/tests/utest +++ b/tests/utest @@ -138,17 +138,21 @@ run_utests() { CREATE_BASELINE=true BL_SUFFIX=_std_base cat <<-EOF > ${RUNDIR_ROOT}/unit_test${RT_SUFFIX}.env - export UNIT_TEST=true + export UNIT_TEST=${UNIT_TEST} + export CI_TEST=${CI_TEST} export RT_COMPILER=${RT_COMPILER} export RESTART_INTERVAL=$(( FHMAX/2 )) + export MPI_PROC_BIND="-bind-to user:0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41" EOF ;; std) CREATE_BASELINE=false BL_SUFFIX=_std_base cat <<-EOF > ${RUNDIR_ROOT}/unit_test${RT_SUFFIX}.env - export UNIT_TEST=true + export UNIT_TEST=${UNIT_TEST} + export CI_TEST=${CI_TEST} export RT_COMPILER=${RT_COMPILER} + export MPI_PROC_BIND="-bind-to user:0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41" EOF ;; thr) @@ -161,31 +165,41 @@ run_utests() { TPN=$(( TPN/THRD )) NODES=$(( TASKS/TPN + 1 )) cat <<-EOF > ${RUNDIR_ROOT}/unit_test${RT_SUFFIX}.env - export UNIT_TEST=true + export UNIT_TEST=${UNIT_TEST} + export CI_TEST=${CI_TEST} export RT_COMPILER=${RT_COMPILER} export THRD=${THRD} export JNPES=${JNPES} export TASKS=${TASKS} export TPN=${TPN} export NODES=${NODES} + export MPI_PROC_BIND="-bind-to user:0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,37,38,39,40,41" EOF ;; mpi) CREATE_BASELINE=false BL_SUFFIX=_std_base - JNPES=$(( JNPES/2 )) + if [[ ${CI_TEST} == true ]]; then + INPES=2 + JNPES=2 + else + JNPES=$(( JNPES/2 )) + fi WRITE_GROUP=2 WRTTASK_PER_GROUP=12 TASKS=$(( INPES*JNPES*6 + WRITE_GROUP*WRTTASK_PER_GROUP )) NODES=$(( TASKS/TPN + 1 )) cat <<-EOF > ${RUNDIR_ROOT}/unit_test${RT_SUFFIX}.env - export UNIT_TEST=true + export UNIT_TEST=${UNIT_TEST} + export CI_TEST=${CI_TEST} export RT_COMPILER=${RT_COMPILER} + export INPES=${INPES} export JNPES=${JNPES} export WRITE_GROUP=${WRITE_GROUP} export WRTTASK_PER_GROUP=${WRTTASK_PER_GROUP} export TASKS=${TASKS} export NODES=${NODES} + export MPI_PROC_BIND="-bind-to user:0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47" EOF ;; dcp) @@ -195,10 +209,12 @@ run_utests() { INPES=$JNPES JNPES=$temp cat <<-EOF >${RUNDIR_ROOT}/unit_test${RT_SUFFIX}.env - export UNIT_TEST=true + export UNIT_TEST=${UNIT_TEST} + export CI_TEST=${CI_TEST} export RT_COMPILER=${RT_COMPILER} export INPES=${INPES} export JNPES=${JNPES} + export MPI_PROC_BIND="-bind-to user:0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41" EOF ;; rst) # this is not going to work for regional model @@ -250,7 +266,8 @@ run_utests() { fi cat <<-EOF >${RUNDIR_ROOT}/unit_test${RT_SUFFIX}.env - export UNIT_TEST=true + export UNIT_TEST=${UNIT_TEST} + export CI_TEST=${CI_TEST} export RT_COMPILER=${RT_COMPILER} export RESTART_FILE_PREFIX=${RESTART_FILE_PREFIX} export WARM_START=${WARM_START} @@ -261,6 +278,7 @@ run_utests() { export NA_INIT=${NA_INIT} export NSTF_NAME=${NSTF_NAME} export LIST_FILES="${LIST_FILES}" + export MPI_PROC_BIND="-bind-to user:0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41" EOF ;; bit_base) @@ -268,8 +286,10 @@ run_utests() { BL_SUFFIX=_bit_base comp_nm=bit cat <<-EOF >${RUNDIR_ROOT}/unit_test${RT_SUFFIX}.env - export UNIT_TEST=true + export UNIT_TEST=${UNIT_TEST} + export CI_TEST=${CI_TEST} export RT_COMPILER=${RT_COMPILER} + export MPI_PROC_BIND="-bind-to user:0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41" EOF ;; bit) @@ -277,8 +297,10 @@ run_utests() { BL_SUFFIX=_bit_base comp_nm=bit cat <<-EOF >${RUNDIR_ROOT}/unit_test${RT_SUFFIX}.env - export UNIT_TEST=true + export UNIT_TEST=${UNIT_TEST} + export CI_TEST=${CI_TEST} export RT_COMPILER=${RT_COMPILER} + export MPI_PROC_BIND="-bind-to user:0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41" EOF ;; dbg_base) @@ -287,9 +309,11 @@ run_utests() { comp_nm=dbg WLCLK=30 cat <<-EOF >${RUNDIR_ROOT}/unit_test${RT_SUFFIX}.env - export UNIT_TEST=true + export UNIT_TEST=${UNIT_TEST} + export CI_TEST=${CI_TEST} export RT_COMPILER=${RT_COMPILER} export WLCLK=${WLCLK} + export MPI_PROC_BIND="-bind-to user:0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41" EOF ;; dbg) @@ -298,9 +322,11 @@ run_utests() { comp_nm=dbg WLCLK=30 cat <<-EOF >${RUNDIR_ROOT}/unit_test${RT_SUFFIX}.env - export UNIT_TEST=true + export UNIT_TEST=${UNIT_TEST} + export CI_TEST=${CI_TEST} export RT_COMPILER=${RT_COMPILER} export WLCLK=${WLCLK} + export MPI_PROC_BIND="-bind-to user:0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41" EOF ;; esac