diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py
index 845286b87c..dc31293603 100644
--- a/.github/scripts/eval_base_config.py
+++ b/.github/scripts/eval_base_config.py
@@ -66,3 +66,9 @@
 turbomind_qwen1_5_7b = deepcopy(*lmdeploy_qwen1_5_7b)
 turbomind_qwen2_7b = deepcopy(*lmdeploy_qwen2_7b)
 turbomind_internlm2_5_7b = deepcopy(*lmdeploy_internlm2_5_7b)
+turbomind_internlm2_5_7b_batch1 = deepcopy(*lmdeploy_internlm2_5_7b)
+
+turbomind_internlm2_5_7b_batch1[
+    'abbr'] = turbomind_internlm2_5_7b_batch1['abbr'] + '_batch1'
+turbomind_internlm2_5_7b_batch1['engine_config']['max_batch_size'] = 1
+turbomind_internlm2_5_7b_batch1['batch_size'] = 1
diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml
index 229fdd6ca6..84fcaf5034 100644
--- a/.github/workflows/daily_ete_test.yml
+++ b/.github/workflows/daily_ete_test.yml
@@ -530,7 +530,11 @@ jobs:
     if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}}
     runs-on: [self-hosted, linux-a100]
     needs: test_quantization
-    timeout-minutes: 120 # 5hours
+    timeout-minutes: 120 # 2hours
+    strategy:
+      fail-fast: false
+      matrix:
+        evaluate_type: ['chat', 'base']
     container:
       image: openmmlab/lmdeploy:latest-cu11
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
@@ -581,10 +585,17 @@ jobs:
         ln -s /root/opencompass-data ./data
         python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models .
     - name: Evaluate models
+      if: matrix.evaluate_type == 'chat'
       run: |
         export LMDEPLOY_DIR=$(pwd)
 
         python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, pt_internlm2_5_7b_chat, turbomind_internlm2_5_20b_chat, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, pt_internlm2_5_20b_chat, turbomind_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, turbomind_llama_3d1_8b_instruct_4bits, turbomind_llama_3d1_8b_instruct_kvint4, turbomind_qwen2_7b_instruct, turbomind_qwen2_7b_instruct_4bits, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]" "[*race_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true
+    - name: Evaluate base models
+      if: matrix.evaluate_type == 'base'
+      run: |
+        export LMDEPLOY_DIR=$(pwd)
+
+        python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]" "[*mmlu_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} base true
     - name: Clear workspace
       if: always()
       run: |
diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml
index 1e8d78d143..6b91cd2746 100644
--- a/.github/workflows/evaluate.yml
+++ b/.github/workflows/evaluate.yml
@@ -25,19 +25,24 @@ on:
         default: '[*mmlu_datasets, *gsm8k_datasets]'
       base_models:
         required: true
-        description: 'Tested TurboMind models list. eg. [turbomind_internlm2_5_7b, turbomind_qwen2_7b]'
+        description: 'Tested TurboMind models list. eg. [turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]'
         type: string
-        default: '[turbomind_internlm2_5_7b, turbomind_qwen2_7b]'
+        default: '[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]'
       baes_datasets:
         required: true
         description: 'Tested datasets list. eg. [*mmlu_datasets, *gsm8k_datasets]'
         type: string
         default: '[*mmlu_datasets, *gsm8k_datasets]'
-      local_config:
-        required: true
-        description: 'Whether use local eval config'
-        type: boolean
-        default: false
+      oc_repo_org:
+        required: false
+        description: 'Tested repository organization name. Default is open-compass/opencompass'
+        type: string
+        default: 'open-compass/opencompass'
+      oc_repo_ref:
+        required: false
+        description: 'Set branch or tag or commit id. Default is "main"'
+        type: string
+        default: 'main'
       offline_mode:
         required: true
         description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself'
@@ -143,8 +148,9 @@ jobs:
         python3 -m pip install -r requirements/test.txt
     - name: Install opencompass
       run: |
-        git clone --depth=1 https://github.com/open-compass/opencompass.git
+        git clone https://github.com/${{ github.event.inputs.oc_repo_org}}.git
         cd opencompass
+        git checkout ${{ github.event.inputs.oc_repo_ref}}
         python3 -m pip install -e .
         echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV
     - name: Check env
@@ -156,9 +162,6 @@ jobs:
       run: |
         ln -s /root/opencompass-data ./data
         python3 .github/scripts/action_tools.py create_model_links /root/models .
-    - name: Use local config
-      if: ${{inputs.local_config}}
-      run: cp /root/models/offline_pkg/eval_config.py .github/scripts/eval_opencompass_config.py
     - name: Evaluate chat models
      if: matrix.evaluate_type == 'chat'
       run: |
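For context, a minimal Python sketch of the copy-and-override pattern the eval_base_config.py hunk uses. The lmdeploy_internlm2_5_7b stand-in below is hypothetical: in the real config it is a one-element list wrapping a much larger OpenCompass model dict, and every field value shown is a placeholder, so this illustrates only the mechanics of deriving the batch1 variant, not the actual configuration.

from copy import deepcopy

# Hypothetical stand-in for the imported OpenCompass model config; values are
# placeholders and do not reflect the real eval_base_config.py contents.
lmdeploy_internlm2_5_7b = [
    dict(
        abbr='internlm2_5-7b-turbomind',         # placeholder abbreviation
        engine_config=dict(max_batch_size=128),  # placeholder engine settings
        batch_size=128,                          # placeholder eval batch size
    )
]

# Same pattern as the patch: copy the single base entry, suffix its
# abbreviation, and pin both the engine and evaluator batch size to 1 so the
# CI matrix can run a max_batch_size=1 regression case.
turbomind_internlm2_5_7b_batch1 = deepcopy(*lmdeploy_internlm2_5_7b)
turbomind_internlm2_5_7b_batch1['abbr'] += '_batch1'
turbomind_internlm2_5_7b_batch1['engine_config']['max_batch_size'] = 1
turbomind_internlm2_5_7b_batch1['batch_size'] = 1

print(turbomind_internlm2_5_7b_batch1)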