From 6d87ba94dba5440782fb4b0d5edca9cd666e75e2 Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Tue, 14 Oct 2025 17:33:58 +0800
Subject: [PATCH 1/7] TEST: add judge evaluate

---
 .github/workflows/api_eval.yml         |  30 +++-
 autotest/evaluate/eval_config_chat.py  | 143 ++++++++++++++---
 autotest/evaluate/test_api_evaluate.py | 205 ++++++++++++++++++++++---
 autotest/utils/evaluate_utils.py       | 100 ++++++++----
 autotest/utils/run_restful_chat.py     |  92 ++++++++++-
 5 files changed, 490 insertions(+), 80 deletions(-)

diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml
index 5e5d49be36..d72d5569f7 100644
--- a/.github/workflows/api_eval.yml
+++ b/.github/workflows/api_eval.yml
@@ -18,6 +18,20 @@ on:
         description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
         type: string
         default: "['turbomind', 'pytorch']"
+      execution_mode:
+        required: false
+        description: 'Select execution mode: infer, eval, or both. Default is "both"'
+        type: choice
+        options:
+          - both
+          - infer
+          - eval
+        default: 'both'
+      run_id:
+        required: false
+        description: 'Set custom run ID. If not provided, github.run_id will be used'
+        type: string
+        default: ''
 
 
 env:
@@ -116,6 +130,7 @@ jobs:
       - name: Install opencompass
         run: |
           python3 -m pip install opencompass
+          python3 -m pip install langdetect
       - name: Check env
         run: |
           python3 -m pip list
@@ -128,10 +143,17 @@ jobs:
         run: |
           overall_exit=0
           ln -s /mnt/187/opencompass-data/data ./data
-          pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
-          pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
-          pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
-          pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+          ln -s /nvme/qa_test_models/resource/nltk_data /usr/share/nltk_data
+          execution_mode="${{ github.event.inputs.execution_mode || 'both' }}"
+          if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then
+            pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and ${{matrix.backend}} and infer" -n 8 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+            pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and ${{matrix.backend}} and infer" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+            pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and ${{matrix.backend}} and infer" -n 2 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+            pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and ${{matrix.backend}} and infer" -n 1 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+          fi
+          if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then
+            pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.backend}} and eval" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+          fi
           exit $overall_exit
       - name: Clear workspace
         if: always()
diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py
index 34bd4300c1..e7337b73a7 100644
--- a/autotest/evaluate/eval_config_chat.py
+++ b/autotest/evaluate/eval_config_chat.py
@@ -1,53 +1,154 @@
+# flake8: noqa
+
 from mmengine.config import read_base
 from opencompass.models import OpenAISDK
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.runners import LocalRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
 from opencompass.utils.text_postprocessors import extract_non_reasoning_content
 
+#######################################################################
+#                          PART 0  Essential Configs                  #
+#######################################################################
 with read_base():
-    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets  # noqa: F401, E501
-    from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups  # noqa: F401, E501
-
-mmlu_datasets = [
-    x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
-        'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management',
-        'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting',
-        'professional_medicine', 'virology'
-    ]
-]
+    # Datasets
+    from opencompass.configs.datasets.aime2025.aime2025_llmjudge_academic import aime2025_datasets
+    from opencompass.configs.datasets.gpqa.gpqa_cascade_eval_academic import gpqa_datasets
+    from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import ifeval_datasets
+    # LiveCodeBench dataset commented out to avoid version errors
+    # from opencompass.configs.datasets.livecodebench.livecodebench_v6_academic import \
+    #     LCBCodeGeneration_dataset
+    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets
+    # HLE dataset commented out to avoid connection errors
+    # from opencompass.configs.datasets.HLE.hle_llmverify_academic import \
+    #     hle_datasets
+    # Summary Groups
+    from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups
 
-datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
+#######################################################################
+#                         Model Configuration                         #
+#######################################################################
 
 MODEL_NAME = ''
 MODEL_PATH = ''
 API_BASE = ''
+JUDGE_MODEL_PATH = ''
+JUDGE_API_BASE = ''
 
 api_meta_template = dict(round=[
     dict(role='HUMAN', api_role='HUMAN'),
     dict(role='BOT', api_role='BOT', generate=True),
 ])
 
+# Use OpenAISDK to configure LMDeploy OpenAI interface
 models = [
     dict(type=OpenAISDK,
          abbr=f'{MODEL_NAME}-lmdeploy-api',
-         openai_api_base=API_BASE,
-         key='EMPTY',
          path=MODEL_PATH,
+         key='EMPTY',
+         openai_api_base=API_BASE,
+         retry=3,
+         run_cfg=dict(num_gpus=0),
          meta_template=api_meta_template,
-         max_out_len=32768,
-         batch_size=500,
-         temperature=0.1,
          pred_postprocessor=dict(type=extract_non_reasoning_content))
 ]
 
+#######################################################################
+#                          PART 1  Datasets List                      #
+#######################################################################
+# datasets list for evaluation
+# Remove LCBCodeGeneration_dataset due to version errors
+
+mmlu_pro_datasets = [x for x in mmlu_pro_datasets if 'math' in x['abbr'] or 'other' in x['abbr']]
+
+# Modify datasets list to exclude hle_datasets and LCBCodeGeneration_dataset
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets') and k != 'hle_datasets'), [])
+
+# LLM judge config: using LLM to evaluate predictions
+judge_cfg = dict(
+    type=OpenAISDK,
+    path=JUDGE_MODEL_PATH,
+    key='EMPTY',
+    openai_api_base=JUDGE_API_BASE,
+    meta_template=dict(round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ]),
+    query_per_second=16,
+    batch_size=1024,
+    temperature=0.001,
+    tokenizer_path=JUDGE_MODEL_PATH,
+    verbose=True,
+    max_out_len=16384,
+    max_seq_len=49152,
+)
+
+for item in datasets:
+    if 'judge_cfg' in item['eval_cfg']['evaluator']:
+        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
+    if 'llm_evaluator' in item['eval_cfg']['evaluator'].keys(
+    ) and 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
+        item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
+
+#######################################################################
+#                       PART 2  Dataset Summarizer                    #
+#######################################################################
+
+core_summary_groups = [
+    {
+        'name':
+        'core_average',
+        'subsets': [
+            ['IFEval', 'Prompt-level-strict-accuracy'],
+            # Remove hle_llmjudge due to unavailable dataset
+            # ['hle_llmjudge', 'accuracy'],
+            ['aime2025_repeat_32', 'accuracy (32 runs average)'],
+            ['GPQA_diamond_repeat_4', 'accuracy (4 runs average)'],
+            ['mmlu_pro', 'naive_average'],
+            'mmlu_pro_math',
+            'mmlu_pro_other',
+            # Remove lcb_code_generation_repeat_6 due to version errors
+            # ['lcb_code_generation_repeat_6', 'pass@1 (6 runs average)'],
+        ],
+    },
+]
+
 summarizer = dict(
     dataset_abbrs=[
-        ['mmlu', 'naive_average'],
-        ['gsm8k', 'accuracy'],
-        'mmlu-other',
+        ['core_average', 'naive_average'],
+        ['IFEval', 'Prompt-level-strict-accuracy'],
+        # Remove hle_llmjudge due to unavailable dataset
+        # ['hle_llmjudge', 'accuracy'],
+        ['GPQA_diamond_repeat_4', 'accuracy (4 runs average)'],
+        ['aime2025_repeat_32', 'accuracy (32 runs average)'],
+        ['mmlu_pro', 'naive_average'],
+        'mmlu_pro_math',
+        'mmlu_pro_other',
     ],
-    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []) + core_summary_groups,
 )
 
 for item in datasets:
     if 'max_out_len' in item['infer_cfg']['inferencer']:
         del item['infer_cfg']['inferencer']['max_out_len']
+
+#######################################################################
+#                 PART 3  Inference/Evaluation Configuration          #
+#######################################################################
+
+# infer with local runner
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,
+        retry=0,
+        task=dict(type=OpenICLInferTask),
+    ),
+)
+
+# eval with local runner
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=10),
+    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLEvalTask)),
+)
diff --git a/autotest/evaluate/test_api_evaluate.py b/autotest/evaluate/test_api_evaluate.py
index ffd3edc97e..c88e51e796 100644
--- a/autotest/evaluate/test_api_evaluate.py
+++ b/autotest/evaluate/test_api_evaluate.py
@@ -1,12 +1,22 @@
 import pytest
 from utils.config_utils import get_evaluate_pytorch_model_list, get_evaluate_turbomind_model_list, get_workerid
 from utils.evaluate_utils import restful_test
-from utils.run_restful_chat import start_restful_api, stop_restful_api
+from utils.run_restful_chat import start_proxy_server, start_restful_api, stop_restful_api
 
 DEFAULT_PORT = 23333
+PROXY_PORT = 8000
 
+EVAL_CONFIGS = {
+    'default': {
+        'query_per_second': 1,
+        'max_out_len': 32768,
+        'batch_size': 500,
+        'temperature': 0.1,
+    }
+}
 
-@pytest.fixture(scope='function', autouse=True)
+
+@pytest.fixture(scope='function')
 def prepare_environment(request, config, worker_id):
     param = request.param
     model = param['model']
@@ -17,6 +27,39 @@ def prepare_environment(request, config, worker_id):
     stop_restful_api(pid, startRes, param)
 
 
+@pytest.fixture(scope='function')
+def prepare_environment_judge_evaluate(request, config, worker_id):
+    """Start a proxy server and a judge model server around one eval test.
+
+    Yields ``request.param`` unchanged and, once the test finishes, stops the
+    judge server first and the proxy server last.
+    """
+    worker_num = get_workerid(worker_id)
+    # Offset the proxy port by the worker number (when there is one).
+    port = PROXY_PORT if worker_num is None else PROXY_PORT + worker_num
+
+    judge_model = 'Qwen/Qwen2.5-32B-Instruct'
+    # Judge server settings; --proxy-url points it at the proxy on ``port``.
+    judge_param = {
+        'tp_num': 2,
+        'extra': f'--proxy-url http://127.0.0.1:{port} --session-len 46000 --cache-max-entry-count 0.7 ',
+        'cuda_prefix': None
+    }
+    judge_model_path = config.get('model_path') + '/' + judge_model
+
+    proxy_pid, proxy_process = start_proxy_server(config, worker_id)
+    try:
+        # Launch the judge inside the try so a failed start still stops the proxy.
+        judge_pid, judge_start_res = start_restful_api(config, judge_param, judge_model, judge_model_path,
+                                                       'turbomind', worker_id)
+        try:
+            yield request.param
+        finally:
+            stop_restful_api(judge_pid, judge_start_res, request.param)
+    finally:
+        stop_restful_api(proxy_pid, proxy_process, request.param)
+
+
 def get_turbomind_model_list(tp_num):
     model_list = get_evaluate_turbomind_model_list(tp_num, kvint_list=[4, 8])
     new_model_list = []
@@ -35,85 +78,205 @@ def get_pytorch_model_list(tp_num):
     return new_model_list
 
 
-def run_test(config, run_id, prepare_environment, worker_id):
+def run_test(config, run_id, prepare_environment, worker_id, test_type='infer', eval_config_name='default'):
+    """Run test with specified evaluation configuration."""
+    preset_config = EVAL_CONFIGS.get(eval_config_name, {})
+
+    if test_type == 'infer':
+        port = DEFAULT_PORT
+    else:  # eval
+        port = PROXY_PORT
+
     if get_workerid(worker_id) is None:
-        result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id)
+        result, msg = restful_test(config,
+                                   run_id,
+                                   prepare_environment,
+                                   worker_id=worker_id,
+                                   port=port,
+                                   test_type=test_type,
+                                   **preset_config)
     else:
         result, msg = restful_test(config,
                                    run_id,
                                    prepare_environment,
                                    worker_id=worker_id,
-                                   port=DEFAULT_PORT + get_workerid(worker_id))
+                                   port=port + get_workerid(worker_id),
+                                   test_type=test_type,
+                                   **preset_config)
     return result, msg
 
 
+@pytest.mark.infer
 @pytest.mark.turbomind
 @pytest.mark.gpu_num_1
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=1), indirect=True)
-def test_turbomind_restful_tp1(config, run_id, prepare_environment, worker_id):
-    result, msg = run_test(config, run_id, prepare_environment, worker_id)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_turbomind_restful_tp1(config, run_id, prepare_environment, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config)
     assert result, msg
 
 
+@pytest.mark.infer
 @pytest.mark.turbomind
 @pytest.mark.gpu_num_2
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=2), indirect=True)
-def test_turbomind_restful_tp2(config, run_id, prepare_environment, worker_id):
-    result, msg = run_test(config, run_id, prepare_environment, worker_id)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_turbomind_restful_tp2(config, run_id, prepare_environment, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config)
     assert result, msg
 
 
+@pytest.mark.infer
 @pytest.mark.turbomind
 @pytest.mark.gpu_num_4
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=4), indirect=True)
-def test_turbomind_restful_tp4(config, run_id, prepare_environment, worker_id):
-    result, msg = run_test(config, run_id, prepare_environment, worker_id)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_turbomind_restful_tp4(config, run_id, prepare_environment, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config)
     assert result, msg
 
 
+@pytest.mark.infer
 @pytest.mark.turbomind
 @pytest.mark.gpu_num_8
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=8), indirect=True)
-def test_turbomind_restful_tp8(config, run_id, prepare_environment, worker_id):
-    result, msg = run_test(config, run_id, prepare_environment, worker_id)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_turbomind_restful_tp8(config, run_id, prepare_environment, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config)
     assert result, msg
 
 
+@pytest.mark.infer
 @pytest.mark.pytorch
 @pytest.mark.gpu_num_1
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=1), indirect=True)
-def test_pytorch_restful_tp1(config, run_id, prepare_environment, worker_id):
-    result, msg = run_test(config, run_id, prepare_environment, worker_id)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_pytorch_restful_tp1(config, run_id, prepare_environment, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config)
     assert result, msg
 
 
+@pytest.mark.infer
 @pytest.mark.pytorch
 @pytest.mark.gpu_num_2
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=2), indirect=True)
-def test_pytorch_restful_tp2(config, run_id, prepare_environment, worker_id):
-    result, msg = run_test(config, run_id, prepare_environment, worker_id)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_pytorch_restful_tp2(config, run_id, prepare_environment, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config)
     assert result, msg
 
 
+@pytest.mark.infer
 @pytest.mark.pytorch
 @pytest.mark.gpu_num_4
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=4), indirect=True)
-def test_pytorch_restful_tp4(config, run_id, prepare_environment, worker_id):
-    result, msg = run_test(config, run_id, prepare_environment, worker_id)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_pytorch_restful_tp4(config, run_id, prepare_environment, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config)
     assert result, msg
 
 
+@pytest.mark.infer
 @pytest.mark.pytorch
 @pytest.mark.gpu_num_8
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=8), indirect=True)
-def test_pytorch_restful_tp8(config, run_id, prepare_environment, worker_id):
-    result, msg = run_test(config, run_id, prepare_environment, worker_id)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_pytorch_restful_tp8(config, run_id, prepare_environment, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config)
+    assert result, msg
+
+
+@pytest.mark.eval
+@pytest.mark.pytorch
+@pytest.mark.gpu_num_1
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_model_list(tp_num=1), indirect=True)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_pytorch_judgeeval_tp1(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config)
+    assert result, msg
+
+
+@pytest.mark.eval
+@pytest.mark.pytorch
+@pytest.mark.gpu_num_2
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_model_list(tp_num=2), indirect=True)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_pytorch_judgeeval_tp2(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config)
+    assert result, msg
+
+
+@pytest.mark.eval
+@pytest.mark.pytorch
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.gpu_num_4
+@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_model_list(tp_num=4), indirect=True)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_pytorch_judgeeval_tp4(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config)
+    assert result, msg
+
+
+@pytest.mark.eval
+@pytest.mark.pytorch
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.gpu_num_8
+@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_model_list(tp_num=8), indirect=True)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_pytorch_judgeeval_tp8(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config)
+    assert result, msg
+
+
+@pytest.mark.eval
+@pytest.mark.turbomind
+@pytest.mark.gpu_num_1
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_model_list(tp_num=1), indirect=True)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_turbomind_judgeeval_tp1(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config)
+    assert result, msg
+
+
+@pytest.mark.eval
+@pytest.mark.turbomind
+@pytest.mark.gpu_num_2
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_model_list(tp_num=2), indirect=True)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_turbomind_judgeeval_tp2(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config)
+    assert result, msg
+
+
+@pytest.mark.eval
+@pytest.mark.turbomind
+@pytest.mark.gpu_num_4
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_model_list(tp_num=4), indirect=True)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_turbomind_judgeeval_tp4(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config)
+    assert result, msg
+
+
+@pytest.mark.eval
+@pytest.mark.turbomind
+@pytest.mark.gpu_num_8
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_model_list(tp_num=8), indirect=True)
+@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys()))
+def test_turbomind_judgeeval_tp8(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config):
+    result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config)
     assert result, msg
diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py
index 527bb64994..09c33dea00 100644
--- a/autotest/utils/evaluate_utils.py
+++ b/autotest/utils/evaluate_utils.py
@@ -72,7 +72,7 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, c
         )
 
 
-def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT):
+def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT, test_type='infer', **kwargs):
     work_dir = None
     try:
         model_name = prepare_environment['model']
@@ -108,34 +108,69 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
 
         try:
 
-            if not os.path.exists(config_file):
-                return False, f'Config file {config_file} not found'
-
-            cfg = Config.fromfile(config_file)
-
-            cfg.MODEL_NAME = summary_model_name
-            cfg.MODEL_PATH = model_path
-            cfg.API_BASE = f'http://127.0.0.1:{port}/v1'  # noqa: E231
-
-            if cfg.models and len(cfg.models) > 0:
-                model_cfg = cfg.models[0]
-                model_cfg['abbr'] = f'{summary_model_name}-lmdeploy-api'
-                model_cfg['openai_api_base'] = f'http://127.0.0.1:{port}/v1'  # noqa: E231
-                model_cfg['path'] = model_path
-                if 'backend' in model_cfg:
-                    model_cfg['backend'] = backend_type
-
-                if 'engine_config' in model_cfg and 'communicator' in model_cfg['engine_config']:
-                    model_cfg['engine_config']['communicator'] = communicator
-
-            simple_model_name = model_name.replace('/', '_')
-            temp_config_file = f'temp_{simple_model_name}_{os.getpid()}.py'
+            temp_config_file = f"temp_{backend_type}_{summary_model_name.replace('/', '_')}_{communicator}.py"
             temp_config_path = os.path.join(log_path, temp_config_file)
 
-            cfg.dump(temp_config_path)
-            print(f'Modified config saved to: {temp_config_path}')
-
-            cmd = ['opencompass', temp_config_path, '--reuse', '--max-num-workers', '16', '-w', work_dir]
+            if test_type == 'infer':
+                if not os.path.exists(config_file):
+                    return False, f'Config file {config_file} not found'
+
+                cfg = Config.fromfile(config_file)
+
+                cfg.MODEL_NAME = summary_model_name
+                cfg.MODEL_PATH = model_path
+                cfg.API_BASE = f'http://127.0.0.1:{port}/v1'  # noqa: E231
+
+                if cfg.models and len(cfg.models) > 0:
+                    model_cfg = cfg.models[0]
+                    model_cfg['abbr'] = f'{summary_model_name}-lmdeploy-api'
+                    model_cfg['openai_api_base'] = f'http://127.0.0.1:{port}/v1'  # noqa: E231
+                    model_cfg['path'] = model_path
+
+                    for key, value in kwargs.items():
+                        model_cfg[key] = value
+
+                cfg.dump(temp_config_path)
+                print(f'Modified config saved to: {temp_config_path}')
+            elif test_type == 'eval':
+                if not os.path.exists(temp_config_path):
+                    error_msg = f'Temp config file {temp_config_path} not found for eval stage'
+                    write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type,
+                                     communicator, work_dir)
+                    return False, error_msg
+
+                cfg = Config.fromfile(temp_config_path)
+                print(f'Using existing temp config file: {temp_config_path}')
+
+                cfg.JUDGE_API_BASE = f'http://127.0.0.1:{port}/v1'
+                cfg.JUDGE_MODEL_PATH = os.path.join(model_base_path, 'Qwen/Qwen2.5-32B-Instruct')
+
+                if hasattr(cfg, 'judge_cfg'):
+                    cfg.judge_cfg['path'] = cfg.JUDGE_MODEL_PATH
+                    cfg.judge_cfg['openai_api_base'] = cfg.JUDGE_API_BASE
+                    cfg.judge_cfg['tokenizer_path'] = cfg.JUDGE_MODEL_PATH
+
+                if hasattr(cfg, 'datasets') and cfg.datasets:
+                    for dataset in cfg.datasets:
+                        if 'eval_cfg' in dataset and 'evaluator' in dataset['eval_cfg']:
+                            evaluator = dataset['eval_cfg']['evaluator']
+
+                            if 'judge_cfg' in evaluator:
+                                evaluator['judge_cfg']['path'] = cfg.JUDGE_MODEL_PATH
+                                evaluator['judge_cfg']['openai_api_base'] = cfg.JUDGE_API_BASE
+                                evaluator['judge_cfg']['tokenizer_path'] = cfg.JUDGE_MODEL_PATH
+
+                            if 'llm_evaluator' in evaluator and 'judge_cfg' in evaluator['llm_evaluator']:
+                                evaluator['llm_evaluator']['judge_cfg']['path'] = cfg.JUDGE_MODEL_PATH
+                                evaluator['llm_evaluator']['judge_cfg']['openai_api_base'] = cfg.JUDGE_API_BASE
+                                evaluator['llm_evaluator']['judge_cfg']['tokenizer_path'] = cfg.JUDGE_MODEL_PATH
+
+                cfg.dump(temp_config_path)
+                print(f'Modified config for eval stage saved to: {temp_config_path}')
+
+            cmd = [
+                'opencompass', temp_config_path, '--reuse', '--max-num-workers', '16', '-w', work_dir, '-m', test_type
+            ]
             print(f"Running command: {' '.join(cmd)}")
             print(f'Work directory: {work_dir}')
 
@@ -144,7 +179,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
             stdout_output = result.stdout
             stderr_output = result.stderr
 
-            log_filename = (f'eval_{backend_type}_'
+            log_filename = (f'{test_type}_{backend_type}_'
                             f"{model_name.replace('/', '_')}_"
                             f'{communicator}_'
                             f'{worker_id}_'
@@ -198,8 +233,9 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
                         error_lines = ' | '.join(error_lines[:3])
                         final_msg += f'\nLog errors: {error_lines}'
 
-            write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type, communicator,
-                             work_dir)
+            if test_type == 'eval':
+                write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type,
+                                 communicator, work_dir)
 
             return final_result, final_msg
 
@@ -210,13 +246,13 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
     except subprocess.TimeoutExpired:
         timeout_msg = (f'Evaluation timed out for {model_name} '
                        f'after 7200 seconds')
-        if work_dir:
+        if work_dir and test_type == 'eval':
             write_to_summary(summary_model_name, tp_num, False, timeout_msg, worker_id, backend_type, communicator,
                              work_dir)
         return False, timeout_msg
     except Exception as e:
         error_msg = f'Error during evaluation for {model_name}: {str(e)}'
-        if work_dir:
+        if work_dir and test_type == 'eval':
             write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type, communicator,
                              work_dir)
         return False, error_msg
diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py
index 876fd295e2..d6a02b170b 100644
--- a/autotest/utils/run_restful_chat.py
+++ b/autotest/utils/run_restful_chat.py
@@ -6,6 +6,7 @@
 
 import allure
 import psutil
+import requests
 from openai import OpenAI
 from pytest_assume.plugin import assume
 from utils.config_utils import _is_bf16_supported_by_device, get_cuda_prefix_by_workerid, get_workerid
@@ -17,6 +18,7 @@
 
 BASE_HTTP_URL = 'http://localhost'
 DEFAULT_PORT = 23333
+PROXY_PORT = 8000
 
 
 def start_restful_api(config, param, model, model_path, backend_type, worker_id):
@@ -53,8 +55,7 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id)
     else:
         port = DEFAULT_PORT + worker_num
 
-    cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path + ' --session-len 8096 --server-port ' +
-                                 str(port),
+    cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path + ' --server-port ' + str(port),
                                  config,
                                  model,
                                  need_tp=True,
@@ -681,3 +682,90 @@ def run_tools_case(config, port: int = DEFAULT_PORT):
 
     file.close()
     allure.attach.file(restful_log, attachment_type=allure.attachment_type.TEXT)
+
+
+def proxy_health_check(url):
+    """Return True when the proxy at ``url`` answers ``GET /v1/models`` with 200.
+
+    Any request failure (refused connection, timeout, ...) counts as unhealthy.
+    """
+    try:
+        # Uses the module-level ``requests`` import; the short timeout keeps polling cheap.
+        response = requests.get(f'{url}/v1/models', timeout=5)
+    except requests.exceptions.RequestException:
+        return False
+    return response.status_code == 200
+
+
+def start_proxy_server(config, worker_id):
+    """Launch ``lmdeploy serve proxy`` for one test worker and wait up to 300 s for it to serve.
+
+    Args:
+        config: Test config mapping; ``eval_log_path`` selects the log directory.
+        worker_id: Worker id (e.g. ``'gw0'``) used for the port offset and the log name.
+
+    Returns:
+        ``(pid, process)``; ``pid`` is 0 if the proxy exited before becoming healthy.
+    """
+    log_path = config.get('eval_log_path')
+    if log_path is None:
+        log_path = '/nvme/qa_test_models/evaluation_report'
+    os.makedirs(log_path, exist_ok=True)
+
+    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+    proxy_log = os.path.join(log_path, f'proxy_server_{worker_id}_{timestamp}.log')
+
+    worker_num = get_workerid(worker_id)
+    port = PROXY_PORT if worker_num is None else PROXY_PORT + worker_num
+
+    proxy_url = f'http://127.0.0.1:{port}'
+    try:
+        # A proxy already answering on this port may still hold nodes; drop them first.
+        response = requests.get(f'{proxy_url}/nodes/status', timeout=5)
+        if response.status_code == 200:
+            print(f'Terminating existing nodes on proxy {proxy_url}')
+            requests.get(f'{proxy_url}/nodes/terminate_all', timeout=10)
+            sleep(5)
+    except requests.exceptions.RequestException:
+        pass  # best effort: an unreachable proxy simply means there is nothing to clean up
+
+    cmd = (f'lmdeploy serve proxy --server-name 127.0.0.1 --server-port {port} '
+           f'--routing-strategy min_expected_latency --serving-strategy Hybrid')
+
+    print(f'Starting proxy server with command: {cmd}')
+    print(f'Proxy log will be saved to: {proxy_log}')
+
+    timeout = 300
+    healthy = False
+    proxy_file = open(proxy_log, 'w')
+    try:
+        proxy_process = subprocess.Popen(cmd,
+                                         stdout=proxy_file,
+                                         stderr=proxy_file,
+                                         shell=True,
+                                         text=True,
+                                         encoding='utf-8')
+        pid = proxy_process.pid
+        deadline = time() + timeout
+
+        sleep(5)
+        while time() < deadline:
+            sleep(1)
+            if proxy_health_check(proxy_url):
+                healthy = True
+                break
+            # poll() never blocks; any exit code here means the proxy died before serving.
+            if proxy_process.poll() is not None:
+                with open(proxy_log, 'r') as f:
+                    print(f.read())
+                return 0, proxy_process
+    finally:
+        # The child holds its own descriptor; close ours on every exit path.
+        proxy_file.close()
+
+    allure.attach.file(proxy_log, attachment_type=allure.attachment_type.TEXT)
+    if healthy:
+        print(f'Proxy server started successfully with PID: {pid}')
+    else:
+        print(f'Proxy server (PID: {pid}) not healthy after {timeout} seconds')
+    return pid, proxy_process

From 58b5c46a70d86bb840ae7cbcb88de49c061d3d3f Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Tue, 14 Oct 2025 17:53:52 +0800
Subject: [PATCH 2/7] TEST: update yaml

---
 autotest/config.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/autotest/config.yaml b/autotest/config.yaml
index 87c428fb18..b4acb02fb3 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -289,6 +289,7 @@ turbomind_quatization:
         - deepseek-ai/DeepSeek-V2-Lite-Chat
     no_kvint8:
         - deepseek-ai/DeepSeek-V2-Chat
+        - Qwen/Qwen2.5-7B-Instruct
 
 pytorch_quatization:
     awq:

From 3701067f466879380b666b3a5645edb1da6c9057 Mon Sep 17 00:00:00 2001
From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
Date: Wed, 15 Oct 2025 10:54:28 +0800
Subject: [PATCH 3/7] Update api_eval_h800.yml

---
 .github/workflows/api_eval_h800.yml | 37 ++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml
index 37b7832fc5..37640624be 100644
--- a/.github/workflows/api_eval_h800.yml
+++ b/.github/workflows/api_eval_h800.yml
@@ -18,6 +18,20 @@ on:
         description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
         type: string
         default: "['turbomind', 'pytorch']"
+      execution_mode:
+        required: false
+        description: 'Select execution mode: infer, eval, or both. Default is "both"'
+        type: choice
+        options:
+          - both
+          - infer
+          - eval
+        default: 'both'
+      run_id:
+        required: false
+        description: 'Set custom run ID. If not provided, github.run_id will be used'
+        type: string
+        default: ''
 
 
 env:
@@ -91,9 +105,9 @@ jobs:
         - /nvme/qa_test_models:/nvme/qa_test_models
         - /nvme1/qa_test_models:/nvme1/qa_test_models
         - /nvme2/share:/nvme2/share
-        - /mnt/137_nvme2:/mnt/137_nvme2
-        - /mnt/137_nvme3:/mnt/137_nvme3
-        - /mnt/137_nvme4:/mnt/137_nvme4
+        - /mnt/158_nvme2:/mnt/158_nvme2
+        - /mnt/158_nvme3:/mnt/158_nvme3
+        - /mnt/158_nvme4:/mnt/158_nvme4
         - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
     steps:
       - name: Create and change to _wk directory
@@ -131,11 +145,18 @@ jobs:
         if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind')
         run: |
           overall_exit=0
-          ln -s /mnt/187/opencompass-data/data ./data
-          pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
-          pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
-          pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
-          pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+          ln -s /nvme/qa_test_models/resource/opencompass-data/data ./data
+          ln -s /nvme/qa_test_models/resource/nltk_data /usr/share/nltk_data
+          execution_mode="${{ github.event.inputs.execution_mode || 'both' }}"
+          if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then
+            pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and ${{matrix.backend}} and infer" -n 8 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+            pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and ${{matrix.backend}} and infer" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+            pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and ${{matrix.backend}} and infer" -n 2 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+            pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and ${{matrix.backend}} and infer" -n 1 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+          fi
+          if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then
+            pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.backend}} and eval" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+          fi
           exit $overall_exit
       - name: Clear workspace
         if: always()

From dd9dd3604cb8c140dfe8e51223e7cef728ed6ede Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Thu, 16 Oct 2025 14:34:34 +0800
Subject: [PATCH 4/7] TEST: fix port

---
 autotest/evaluate/test_api_evaluate.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/autotest/evaluate/test_api_evaluate.py b/autotest/evaluate/test_api_evaluate.py
index c88e51e796..5887736595 100644
--- a/autotest/evaluate/test_api_evaluate.py
+++ b/autotest/evaluate/test_api_evaluate.py
@@ -37,9 +37,13 @@ def prepare_environment_judge_evaluate(request, config, worker_id):
         'model': 'Qwen/Qwen2.5-32B-Instruct',
         'backend': 'turbomind',
         'param': {
-            'tp_num': 2,
-            'extra': f'--proxy-url http://127.0.0.1:{port} --session-len 46000 --cache-max-entry-count 0.7 ',
-            'cuda_prefix': None
+            'tp_num':
+            2,
+            'extra':
+            '--server-name 127.0.0.1 --proxy-url http://127.0.0.1:{} --session-len 46000 '
+            '--cache-max-entry-count 0.7 '.format(port),
+            'cuda_prefix':
+            None
         },
         'log_path': config.get('log_path'),
     }

From 81f4c10e31b238db3dfa52f401bb71a51bfe9555 Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulinJulia24@163.com>
Date: Fri, 17 Oct 2025 10:26:56 +0800
Subject: [PATCH 5/7] update

---
 .github/workflows/daily_ete_test.yml      | 6 +++++-
 .github/workflows/daily_ete_test_3090.yml | 2 ++
 .github/workflows/daily_ete_test_5080.yml | 5 +++++
 .github/workflows/daily_ete_test_h800.yml | 2 ++
 .github/workflows/evaluate.yml            | 1 +
 .github/workflows/evaluate_h800.yml       | 1 +
 6 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml
index 1c2f0b549d..bfe61bbbd7 100644
--- a/.github/workflows/daily_ete_test.yml
+++ b/.github/workflows/daily_ete_test.yml
@@ -236,6 +236,7 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
+          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -338,6 +339,7 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
+          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -422,6 +424,7 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
+          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -479,6 +482,7 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
+          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -538,7 +542,7 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install sentence_transformers==2.2.2 --no-deps
+          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml
index 9243887ecf..9878c7672f 100644
--- a/.github/workflows/daily_ete_test_3090.yml
+++ b/.github/workflows/daily_ete_test_3090.yml
@@ -223,6 +223,7 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
+          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -287,6 +288,7 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
+          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml
index 3a080f2615..53a8f99dd8 100644
--- a/.github/workflows/daily_ete_test_5080.yml
+++ b/.github/workflows/daily_ete_test_5080.yml
@@ -186,6 +186,7 @@ jobs:
           rm -rf $workdir
           mkdir $workdir
           chmod -R 777 $workdir
+
   test_tools:
     if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}}
     runs-on: [self-hosted, 5080-r1]
@@ -222,6 +223,7 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
+          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -263,6 +265,7 @@ jobs:
           rm -rf $workdir
           mkdir $workdir
           chmod -R 777 $workdir
+
   test_restful:
     if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}}
     runs-on: [self-hosted, 5080-r1]
@@ -286,6 +289,7 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
+          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -351,6 +355,7 @@ jobs:
           rm -rf $workdir
           mkdir $workdir
           chmod -R 777 $workdir
+
   get_coverage_report:
     if: ${{!cancelled() && success()}}
     runs-on: [self-hosted, 5080-r1]
diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml
index 9f1db0dce8..4850ac2033 100644
--- a/.github/workflows/daily_ete_test_h800.yml
+++ b/.github/workflows/daily_ete_test_h800.yml
@@ -168,6 +168,7 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
+          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -258,6 +259,7 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
+          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml
index 9079d1fa68..c8258c35ed 100644
--- a/.github/workflows/evaluate.yml
+++ b/.github/workflows/evaluate.yml
@@ -123,6 +123,7 @@ jobs:
           name: my-artifact-${{ github.run_id }}-py310
       - name: Install lmdeploy - dependency
         run: |
+          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r /root/models/offline_pkg/requirements.txt
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml
index 07e66f9dc3..5d4f9768a5 100644
--- a/.github/workflows/evaluate_h800.yml
+++ b/.github/workflows/evaluate_h800.yml
@@ -123,6 +123,7 @@ jobs:
           name: my-artifact-${{ github.run_id }}-py310
       - name: Install lmdeploy - dependency
         run: |
+          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}

From d7c5d45f26e75266a28fa18fdaf430c45d1df9d8 Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulinJulia24@163.com>
Date: Fri, 17 Oct 2025 10:29:13 +0800
Subject: [PATCH 6/7] update

---
 .github/workflows/api_eval_h800.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml
index 37640624be..473a7e35cd 100644
--- a/.github/workflows/api_eval_h800.yml
+++ b/.github/workflows/api_eval_h800.yml
@@ -134,6 +134,7 @@ jobs:
       - name: Install opencompass
         run: |
           python3 -m pip install opencompass
+          python3 -m pip install langdetect
       - name: Check env
         run: |
           python3 -m pip list

From 5d5419d3a2829f8b81c98e780b4c8f78318d4dff Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulinJulia24@163.com>
Date: Fri, 17 Oct 2025 11:33:12 +0800
Subject: [PATCH 7/7] update

---
 .github/workflows/api_eval.yml            | 1 -
 .github/workflows/api_eval_h800.yml       | 1 -
 .github/workflows/daily_ete_test.yml      | 5 -----
 .github/workflows/daily_ete_test_3090.yml | 2 --
 .github/workflows/daily_ete_test_5080.yml | 2 --
 .github/workflows/daily_ete_test_h800.yml | 2 --
 .github/workflows/evaluate.yml            | 1 -
 .github/workflows/evaluate_h800.yml       | 1 -
 8 files changed, 15 deletions(-)

diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml
index d72d5569f7..2d43eb0e05 100644
--- a/.github/workflows/api_eval.yml
+++ b/.github/workflows/api_eval.yml
@@ -121,7 +121,6 @@ jobs:
           name: my-artifact-${{ github.run_id }}-py310
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml
index 473a7e35cd..feb363bad5 100644
--- a/.github/workflows/api_eval_h800.yml
+++ b/.github/workflows/api_eval_h800.yml
@@ -124,7 +124,6 @@ jobs:
           name: my-artifact-${{ github.run_id }}-py310
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml
index bfe61bbbd7..ecfb9ae54e 100644
--- a/.github/workflows/daily_ete_test.yml
+++ b/.github/workflows/daily_ete_test.yml
@@ -236,7 +236,6 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -339,7 +338,6 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -424,7 +422,6 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -482,7 +479,6 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -542,7 +538,6 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml
index 9878c7672f..9243887ecf 100644
--- a/.github/workflows/daily_ete_test_3090.yml
+++ b/.github/workflows/daily_ete_test_3090.yml
@@ -223,7 +223,6 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -288,7 +287,6 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml
index 53a8f99dd8..89487d7d94 100644
--- a/.github/workflows/daily_ete_test_5080.yml
+++ b/.github/workflows/daily_ete_test_5080.yml
@@ -223,7 +223,6 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -289,7 +288,6 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml
index 4850ac2033..9f1db0dce8 100644
--- a/.github/workflows/daily_ete_test_h800.yml
+++ b/.github/workflows/daily_ete_test_h800.yml
@@ -168,7 +168,6 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
@@ -259,7 +258,6 @@ jobs:
           cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
       - name: Install lmdeploy
         run: |
diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml
index c8258c35ed..9079d1fa68 100644
--- a/.github/workflows/evaluate.yml
+++ b/.github/workflows/evaluate.yml
@@ -123,7 +123,6 @@ jobs:
           name: my-artifact-${{ github.run_id }}-py310
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r /root/models/offline_pkg/requirements.txt
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml
index 5d4f9768a5..07e66f9dc3 100644
--- a/.github/workflows/evaluate_h800.yml
+++ b/.github/workflows/evaluate_h800.yml
@@ -123,7 +123,6 @@ jobs:
           name: my-artifact-${{ github.run_id }}-py310
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install -r requirements_cuda.txt
           python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}