[ci] add internlm2_5_7b_batch_1 into evaluation testcase #2631

Merged 3 commits on Oct 23, 2024
6 changes: 6 additions & 0 deletions .github/scripts/eval_base_config.py
@@ -66,3 +66,9 @@
turbomind_qwen1_5_7b = deepcopy(lmdeploy_qwen1_5_7b)
turbomind_qwen2_7b = deepcopy(lmdeploy_qwen2_7b)
turbomind_internlm2_5_7b = deepcopy(lmdeploy_internlm2_5_7b)
turbomind_internlm2_5_7b_batch1 = deepcopy(lmdeploy_internlm2_5_7b)

turbomind_internlm2_5_7b_batch1['abbr'] = turbomind_internlm2_5_7b_batch1['abbr'] + '_batch1'
turbomind_internlm2_5_7b_batch1['engine_config']['max_batch_size'] = 1
turbomind_internlm2_5_7b_batch1['batch_size'] = 1
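The hunk above deep-copies a shared base config and then overrides nested fields. `deepcopy` matters here: a plain assignment or shallow copy would alias the nested `engine_config` dict, so setting `max_batch_size = 1` on the batch-1 variant would silently mutate the original config too. A minimal sketch of why (the dict shape below is a hypothetical stand-in for the repo's real config entries):

```python
from copy import copy, deepcopy

# Hypothetical config shaped like the lmdeploy eval entries.
lmdeploy_internlm2_5_7b = {
    'abbr': 'turbomind-internlm2_5-7b',
    'engine_config': {'max_batch_size': 256},
    'batch_size': 256,
}

# A shallow copy shares the nested engine_config dict...
shallow = copy(lmdeploy_internlm2_5_7b)
shallow['engine_config']['max_batch_size'] = 1
# ...so the mutation leaks back into the original:
assert lmdeploy_internlm2_5_7b['engine_config']['max_batch_size'] == 1

# deepcopy clones nested structures too, which is what the PR relies on.
lmdeploy_internlm2_5_7b['engine_config']['max_batch_size'] = 256  # reset
batch1 = deepcopy(lmdeploy_internlm2_5_7b)
batch1['abbr'] = batch1['abbr'] + '_batch1'
batch1['engine_config']['max_batch_size'] = 1
batch1['batch_size'] = 1

# The base config is untouched this time.
assert lmdeploy_internlm2_5_7b['engine_config']['max_batch_size'] == 256
print(batch1['abbr'])  # turbomind-internlm2_5-7b_batch1
```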
13 changes: 12 additions & 1 deletion .github/workflows/daily_ete_test.yml
@@ -530,7 +530,11 @@ jobs:
if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}}
runs-on: [self-hosted, linux-a100]
needs: test_quantization
-    timeout-minutes: 120 # 5hours
+    timeout-minutes: 120 # 2hours
strategy:
fail-fast: false
matrix:
evaluate_type: ['chat', 'base']
container:
image: openmmlab/lmdeploy:latest-cu11
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
@@ -581,10 +585,17 @@ jobs:
ln -s /root/opencompass-data ./data
python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models .
- name: Evaluate models
if: matrix.evaluate_type == 'chat'
run: |
export LMDEPLOY_DIR=$(pwd)

python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, pt_internlm2_5_7b_chat, turbomind_internlm2_5_20b_chat, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, pt_internlm2_5_20b_chat, turbomind_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, turbomind_llama_3d1_8b_instruct_4bits, turbomind_llama_3d1_8b_instruct_kvint4, turbomind_qwen2_7b_instruct, turbomind_qwen2_7b_instruct_4bits, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]" "[*race_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true
- name: Evaluate base models
if: matrix.evaluate_type == 'base'
run: |
export LMDEPLOY_DIR=$(pwd)

python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]" "[*mmlu_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} base true
- name: Clear workspace
if: always()
run: |
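The `strategy`/`matrix` lines added in this hunk make GitHub Actions expand the single evaluation job into two parallel jobs, one per `evaluate_type`, and each evaluate step then runs only when its `if: matrix.evaluate_type == ...` guard matches that job's matrix value. A stripped-down sketch of the pattern (step bodies here are illustrative placeholders, not the workflow's real commands):

```yaml
jobs:
  evaluation:
    runs-on: [self-hosted, linux-a100]
    strategy:
      fail-fast: false                   # one variant failing does not cancel the other
      matrix:
        evaluate_type: ['chat', 'base']  # expands into two parallel jobs
    steps:
      - name: Evaluate chat models
        if: matrix.evaluate_type == 'chat'
        run: echo "runs only in the chat job"
      - name: Evaluate base models
        if: matrix.evaluate_type == 'base'
        run: echo "runs only in the base job"
```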
25 changes: 14 additions & 11 deletions .github/workflows/evaluate.yml
@@ -25,19 +25,24 @@ on:
default: '[*mmlu_datasets, *gsm8k_datasets]'
base_models:
required: true
-      description: 'Tested TurboMind models list. eg. [turbomind_internlm2_5_7b, turbomind_qwen2_7b]'
+      description: 'Tested TurboMind models list. eg. [turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]'
       type: string
-      default: '[turbomind_internlm2_5_7b, turbomind_qwen2_7b]'
+      default: '[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]'
baes_datasets:
required: true
description: 'Tested datasets list. eg. [*mmlu_datasets, *gsm8k_datasets]'
type: string
default: '[*mmlu_datasets, *gsm8k_datasets]'
-    local_config:
-      required: true
-      description: 'Whether use local eval config'
-      type: boolean
-      default: false
+    oc_repo_org:
+      required: false
+      description: 'Tested repository organization name. Default is open-compass/opencompass'
+      type: string
+      default: 'open-compass/opencompass'
+    oc_repo_ref:
+      required: false
+      description: 'Set branch or tag or commit id. Default is "main"'
+      type: string
+      default: 'main'
offline_mode:
required: true
description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself'
@@ -143,8 +148,9 @@
python3 -m pip install -r requirements/test.txt
- name: Install opencompass
run: |
-        git clone --depth=1 https://github.com/open-compass/opencompass.git
+        git clone https://github.com/${{ github.event.inputs.oc_repo_org}}.git
         cd opencompass
+        git checkout ${{ github.event.inputs.oc_repo_ref}}
python3 -m pip install -e .
echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV
- name: Check env
@@ -156,9 +162,6 @@
run: |
ln -s /root/opencompass-data ./data
python3 .github/scripts/action_tools.py create_model_links /root/models .
-      - name: Use local config
-        if: ${{inputs.local_config}}
-        run: cp /root/models/offline_pkg/eval_config.py .github/scripts/eval_opencompass_config.py
- name: Evaluate chat models
if: matrix.evaluate_type == 'chat'
run: |