31 changes: 26 additions & 5 deletions .github/workflows/api_eval.yml
@@ -18,6 +18,20 @@ on:
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
execution_mode:
required: false
description: 'Select execution mode: infer, eval, or both. Default is "both"'
type: choice
options:
- both
- infer
- eval
default: 'both'
run_id:
required: false
description: 'Set custom run ID. If not provided, github.run_id will be used'
type: string
default: ''


env:
@@ -107,7 +121,6 @@ jobs:
name: my-artifact-${{ github.run_id }}-py310
- name: Install lmdeploy - dependency
run: |
python3 -m pip install -r requirements_cuda.txt
python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
- name: Install lmdeploy
run: |
@@ -116,6 +129,7 @@
- name: Install opencompass
run: |
python3 -m pip install opencompass
python3 -m pip install langdetect
- name: Check env
run: |
python3 -m pip list
@@ -128,10 +142,17 @@
run: |
overall_exit=0
ln -s /mnt/187/opencompass-data/data ./data
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
ln -s /nvme/qa_test_models/resource/nltk_data /usr/share/nltk_data
execution_mode="${{ github.event.inputs.execution_mode || 'both' }}"
if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and ${{matrix.backend}} and infer" -n 8 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and ${{matrix.backend}} and infer" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and ${{matrix.backend}} and infer" -n 2 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and ${{matrix.backend}} and infer" -n 1 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
fi
if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then
pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.backend}} and eval" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
fi
exit $overall_exit
- name: Clear workspace
if: always()
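Note on the api_eval.yml changes above: the new `-m` expressions rely on `infer` and `eval` pytest markers, and every pytest call now forwards `--run_id ${{ github.event.inputs.run_id || github.run_id }}`. The sketch below shows, under stated assumptions, how such an option and the markers could be registered on the test side; only the option name and the marker names come from the workflow, the rest is hypothetical.

```python
# Hypothetical conftest.py sketch. Only the --run_id option name and the
# infer/eval marker names are taken from the workflow above; everything else
# is illustrative, not the repository's actual implementation.
import pytest


def pytest_addoption(parser):
    # Receives the run id forwarded by the workflow
    # (github.event.inputs.run_id when provided, otherwise github.run_id).
    parser.addoption('--run_id', action='store', default=None,
                     help='Identifier used to group inference and evaluation artifacts.')


def pytest_configure(config):
    # Register the stage markers so `-m "... and infer"` / `-m "... and eval"`
    # select tests without unknown-marker warnings.
    config.addinivalue_line('markers', 'infer: inference-stage test cases')
    config.addinivalue_line('markers', 'eval: evaluation-stage test cases')


@pytest.fixture
def run_id(request):
    # Expose the forwarded run id to test cases.
    return request.config.getoption('--run_id')
```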
39 changes: 30 additions & 9 deletions .github/workflows/api_eval_h800.yml
@@ -18,6 +18,20 @@ on:
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
execution_mode:
required: false
description: 'Select execution mode: infer, eval, or both. Default is "both"'
type: choice
options:
- both
- infer
- eval
default: 'both'
run_id:
required: false
description: 'Set custom run ID. If not provided, github.run_id will be used'
type: string
default: ''


env:
@@ -91,9 +105,9 @@
- /nvme/qa_test_models:/nvme/qa_test_models
- /nvme1/qa_test_models:/nvme1/qa_test_models
- /nvme2/share:/nvme2/share
- /mnt/137_nvme2:/mnt/137_nvme2
- /mnt/137_nvme3:/mnt/137_nvme3
- /mnt/137_nvme4:/mnt/137_nvme4
- /mnt/158_nvme2:/mnt/158_nvme2
- /mnt/158_nvme3:/mnt/158_nvme3
- /mnt/158_nvme4:/mnt/158_nvme4
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Create and change to _wk directory
@@ -110,7 +124,6 @@
name: my-artifact-${{ github.run_id }}-py310
- name: Install lmdeploy - dependency
run: |
python3 -m pip install -r requirements_cuda.txt
python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
- name: Install lmdeploy
run: |
@@ -120,6 +133,7 @@
- name: Install opencompass
run: |
python3 -m pip install opencompass
python3 -m pip install langdetect
- name: Check env
run: |
python3 -m pip list
@@ -131,11 +145,18 @@
if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind')
run: |
overall_exit=0
ln -s /mnt/187/opencompass-data/data ./data
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
ln -s /nvme/qa_test_models/resource/opencompass-data/data ./data
ln -s /nvme/qa_test_models/resource/nltk_data /usr/share/nltk_data
execution_mode="${{ github.event.inputs.execution_mode || 'both' }}"
if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and ${{matrix.backend}} and infer" -n 8 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and ${{matrix.backend}} and infer" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and ${{matrix.backend}} and infer" -n 2 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and ${{matrix.backend}} and infer" -n 1 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
fi
if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then
pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.backend}} and eval" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
fi
exit $overall_exit
- name: Clear workspace
if: always()
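The api_eval_h800.yml job applies the same execution_mode/run_id split on the H800 runner, with updated /mnt/158_* mounts and local opencompass-data/nltk_data links. As a reminder of how the marker expressions compose, the hypothetical test below would only be collected by `-m "gpu_num_2 and turbomind and infer"` because it carries all three markers; the test name and body are made up for illustration.

```python
# Illustrative only: a test must carry every marker named in the -m expression
# to be selected, e.g. `-m "gpu_num_2 and turbomind and infer"`.
# The test name and body are hypothetical.
import pytest


@pytest.mark.gpu_num_2
@pytest.mark.turbomind
@pytest.mark.infer
def test_api_infer_example():
    # Placeholder body; the real suite lives in
    # autotest/evaluate/test_api_evaluate.py.
    assert True
```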
1 change: 0 additions & 1 deletion .github/workflows/daily_ete_test.yml
@@ -538,7 +538,6 @@ jobs:
cp -r ${{env.TEST_CODE_PATH}}/. .
- name: Install lmdeploy - dependency
run: |
python3 -m pip install sentence_transformers==2.2.2 --no-deps
python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
- name: Install lmdeploy
run: |
3 changes: 3 additions & 0 deletions .github/workflows/daily_ete_test_5080.yml
@@ -186,6 +186,7 @@ jobs:
rm -rf $workdir
mkdir $workdir
chmod -R 777 $workdir

test_tools:
if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}}
runs-on: [self-hosted, 5080-r1]
@@ -263,6 +264,7 @@
rm -rf $workdir
mkdir $workdir
chmod -R 777 $workdir

test_restful:
if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}}
runs-on: [self-hosted, 5080-r1]
@@ -351,6 +353,7 @@
rm -rf $workdir
mkdir $workdir
chmod -R 777 $workdir

get_coverage_report:
if: ${{!cancelled() && success()}}
runs-on: [self-hosted, 5080-r1]
1 change: 1 addition & 0 deletions autotest/config.yaml
@@ -289,6 +289,7 @@ turbomind_quatization:
- deepseek-ai/DeepSeek-V2-Lite-Chat
no_kvint8:
- deepseek-ai/DeepSeek-V2-Chat
- Qwen/Qwen2.5-7B-Instruct

pytorch_quatization:
awq:
143 changes: 122 additions & 21 deletions autotest/evaluate/eval_config_chat.py
@@ -1,53 +1,154 @@
# flake8: noqa

from mmengine.config import read_base
from opencompass.models import OpenAISDK
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.utils.text_postprocessors import extract_non_reasoning_content

#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups # noqa: F401, E501

mmlu_datasets = [
x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management',
'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting',
'professional_medicine', 'virology'
]
]
# Datasets
from opencompass.configs.datasets.aime2025.aime2025_llmjudge_academic import aime2025_datasets
from opencompass.configs.datasets.gpqa.gpqa_cascade_eval_academic import gpqa_datasets
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import ifeval_datasets
# LiveCodeBench dataset commented out to avoid version errors
# from opencompass.configs.datasets.livecodebench.livecodebench_v6_academic import \
# LCBCodeGeneration_dataset
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets
# HLE dataset commented out to avoid connection errors
# from opencompass.configs.datasets.HLE.hle_llmverify_academic import \
# hle_datasets
# Summary Groups
from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups

datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
#######################################################################
# Model Configuration #
#######################################################################

MODEL_NAME = ''
MODEL_PATH = ''
API_BASE = ''
JUDGE_MODEL_PATH = ''
JUDGE_API_BASE = ''

api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])

# Use OpenAISDK to configure LMDeploy OpenAI interface
models = [
dict(type=OpenAISDK,
abbr=f'{MODEL_NAME}-lmdeploy-api',
openai_api_base=API_BASE,
key='EMPTY',
path=MODEL_PATH,
key='EMPTY',
openai_api_base=API_BASE,
retry=3,
run_cfg=dict(num_gpus=0),
meta_template=api_meta_template,
max_out_len=32768,
batch_size=500,
temperature=0.1,
pred_postprocessor=dict(type=extract_non_reasoning_content))
]

#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
# Remove LCBCodeGeneration_dataset due to version errors

mmlu_pro_datasets = [x for x in mmlu_pro_datasets if 'math' in x['abbr'] or 'other' in x['abbr']]

# Modify datasets list to exclude hle_datasets and LCBCodeGeneration_dataset
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets') and k != 'hle_datasets'), [])

# LLM judge config: using LLM to evaluate predictions
judge_cfg = dict(
type=OpenAISDK,
path=JUDGE_MODEL_PATH,
key='EMPTY',
openai_api_base=JUDGE_API_BASE,
meta_template=dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]),
query_per_second=16,
batch_size=1024,
temperature=0.001,
tokenizer_path=JUDGE_MODEL_PATH,
verbose=True,
max_out_len=16384,
max_seq_len=49152,
)

for item in datasets:
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
if 'llm_evaluator' in item['eval_cfg']['evaluator'].keys(
) and 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg

#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################

core_summary_groups = [
{
'name':
'core_average',
'subsets': [
['IFEval', 'Prompt-level-strict-accuracy'],
# Remove hle_llmjudge due to unavailable dataset
# ['hle_llmjudge', 'accuracy'],
['aime2025_repeat_32', 'accuracy (32 runs average)'],
['GPQA_diamond_repeat_4', 'accuracy (4 runs average)'],
['mmlu_pro', 'naive_average'],
'mmlu_pro_math',
'mmlu_pro_other',
# Remove lcb_code_generation_repeat_6 due to version errors
# ['lcb_code_generation_repeat_6', 'pass@1 (6 runs average)'],
],
},
]

summarizer = dict(
dataset_abbrs=[
['mmlu', 'naive_average'],
['gsm8k', 'accuracy'],
'mmlu-other',
['core_average', 'naive_average'],
['IFEval', 'Prompt-level-strict-accuracy'],
# Remove hle_llmjudge due to unavailable dataset
# ['hle_llmjudge', 'accuracy'],
['GPQA_diamond_repeat_4', 'accuracy (4 runs average)'],
['aime2025_repeat_32', 'accuracy (32 runs average)'],
['mmlu_pro', 'naive_average'],
'mmlu_pro_math',
'mmlu_pro_other',
],
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []) + core_summary_groups,
)

for item in datasets:
if 'max_out_len' in item['infer_cfg']['inferencer']:
del item['infer_cfg']['inferencer']['max_out_len']

#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################

# infer with local runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0,
task=dict(type=OpenICLInferTask),
),
)

# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLEvalTask)),
)