From 6d87ba94dba5440782fb4b0d5edca9cd666e75e2 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Tue, 14 Oct 2025 17:33:58 +0800 Subject: [PATCH 1/7] TEST: add judge evaluate --- .github/workflows/api_eval.yml | 30 +++- autotest/evaluate/eval_config_chat.py | 143 ++++++++++++++--- autotest/evaluate/test_api_evaluate.py | 205 ++++++++++++++++++++++--- autotest/utils/evaluate_utils.py | 100 ++++++++---- autotest/utils/run_restful_chat.py | 92 ++++++++++- 5 files changed, 490 insertions(+), 80 deletions(-) diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml index 5e5d49be36..d72d5569f7 100644 --- a/.github/workflows/api_eval.yml +++ b/.github/workflows/api_eval.yml @@ -18,6 +18,20 @@ on: description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' type: string default: "['turbomind', 'pytorch']" + execution_mode: + required: false + description: 'Select execution mode: infer, eval, or both. Default is "both"' + type: choice + options: + - both + - infer + - eval + default: 'both' + run_id: + required: false + description: 'Set custom run ID. If not provided, github.run_id will be used' + type: string + default: '' env: @@ -116,6 +130,7 @@ jobs: - name: Install opencompass run: | python3 -m pip install opencompass + python3 -m pip install langdetect - name: Check env run: | python3 -m pip list @@ -128,10 +143,17 @@ jobs: run: | overall_exit=0 ln -s /mnt/187/opencompass-data/data ./data - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + ln -s /nvme/qa_test_models/resource/nltk_data /usr/share/nltk_data + execution_mode="${{ github.event.inputs.execution_mode || 'both' }}" + if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and ${{matrix.backend}} and infer" -n 8 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and ${{matrix.backend}} and infer" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and ${{matrix.backend}} and infer" -n 2 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and ${{matrix.backend}} and infer" -n 1 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + fi + if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then + pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.backend}} and eval" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + fi exit $overall_exit - name: Clear workspace if: always() diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py index 34bd4300c1..e7337b73a7 100644 --- a/autotest/evaluate/eval_config_chat.py +++ b/autotest/evaluate/eval_config_chat.py @@ -1,53 +1,154 @@ +# flake8: noqa + from mmengine.config import read_base from opencompass.models import OpenAISDK +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask from opencompass.utils.text_postprocessors import extract_non_reasoning_content +####################################################################### +# PART 0 Essential Configs # +####################################################################### with read_base(): - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets # noqa: F401, E501 - from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets # noqa: F401, E501 - from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups # noqa: F401, E501 - -mmlu_datasets = [ - x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [ - 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', - 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', - 'professional_medicine', 'virology' - ] -] + # Datasets + from opencompass.configs.datasets.aime2025.aime2025_llmjudge_academic import aime2025_datasets + from opencompass.configs.datasets.gpqa.gpqa_cascade_eval_academic import gpqa_datasets + from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import ifeval_datasets + # LiveCodeBench dataset commented out to avoid version errors + # from opencompass.configs.datasets.livecodebench.livecodebench_v6_academic import \ + # LCBCodeGeneration_dataset + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets + # HLE dataset commented out to avoid connection errors + # from opencompass.configs.datasets.HLE.hle_llmverify_academic import \ + # hle_datasets + # Summary Groups + from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups -datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) +####################################################################### +# Model Configuration # +####################################################################### MODEL_NAME = '' MODEL_PATH = '' API_BASE = '' +JUDGE_MODEL_PATH = '' +JUDGE_API_BASE = '' api_meta_template = dict(round=[ dict(role='HUMAN', api_role='HUMAN'), dict(role='BOT', api_role='BOT', generate=True), ]) +# Use OpenAISDK to configure LMDeploy OpenAI interface models = [ dict(type=OpenAISDK, abbr=f'{MODEL_NAME}-lmdeploy-api', - openai_api_base=API_BASE, - key='EMPTY', path=MODEL_PATH, + key='EMPTY', + openai_api_base=API_BASE, + retry=3, + run_cfg=dict(num_gpus=0), meta_template=api_meta_template, - max_out_len=32768, - batch_size=500, - temperature=0.1, pred_postprocessor=dict(type=extract_non_reasoning_content)) ] +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation +# Remove LCBCodeGeneration_dataset due to version errors + +mmlu_pro_datasets = [x for x in mmlu_pro_datasets if 'math' in x['abbr'] or 'other' in x['abbr']] + +# Modify datasets list to exclude hle_datasets and LCBCodeGeneration_dataset +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets') and k != 'hle_datasets'), []) + +# LLM judge config: using LLM to evaluate predictions +judge_cfg = dict( + type=OpenAISDK, + path=JUDGE_MODEL_PATH, + key='EMPTY', + openai_api_base=JUDGE_API_BASE, + meta_template=dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ]), + query_per_second=16, + batch_size=1024, + temperature=0.001, + tokenizer_path=JUDGE_MODEL_PATH, + verbose=True, + max_out_len=16384, + max_seq_len=49152, +) + +for item in datasets: + if 'judge_cfg' in item['eval_cfg']['evaluator']: + item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg + if 'llm_evaluator' in item['eval_cfg']['evaluator'].keys( + ) and 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']: + item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg + +####################################################################### +# PART 2 Dataset Summarizer # +####################################################################### + +core_summary_groups = [ + { + 'name': + 'core_average', + 'subsets': [ + ['IFEval', 'Prompt-level-strict-accuracy'], + # Remove hle_llmjudge due to unavailable dataset + # ['hle_llmjudge', 'accuracy'], + ['aime2025_repeat_32', 'accuracy (32 runs average)'], + ['GPQA_diamond_repeat_4', 'accuracy (4 runs average)'], + ['mmlu_pro', 'naive_average'], + 'mmlu_pro_math', + 'mmlu_pro_other', + # Remove lcb_code_generation_repeat_6 due to version errors + # ['lcb_code_generation_repeat_6', 'pass@1 (6 runs average)'], + ], + }, +] + summarizer = dict( dataset_abbrs=[ - ['mmlu', 'naive_average'], - ['gsm8k', 'accuracy'], - 'mmlu-other', + ['core_average', 'naive_average'], + ['IFEval', 'Prompt-level-strict-accuracy'], + # Remove hle_llmjudge due to unavailable dataset + # ['hle_llmjudge', 'accuracy'], + ['GPQA_diamond_repeat_4', 'accuracy (4 runs average)'], + ['aime2025_repeat_32', 'accuracy (32 runs average)'], + ['mmlu_pro', 'naive_average'], + 'mmlu_pro_math', + 'mmlu_pro_other', ], - summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), + summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []) + core_summary_groups, ) for item in datasets: if 'max_out_len' in item['infer_cfg']['inferencer']: del item['infer_cfg']['inferencer']['max_out_len'] + +####################################################################### +# PART 4 Inference/Evaluation Configuration # +####################################################################### + +# infer with local runner +infer = dict( + partitioner=dict(type=NumWorkerPartitioner, num_worker=8), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, + task=dict(type=OpenICLInferTask), + ), +) + +# eval with local runner +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLEvalTask)), +) diff --git a/autotest/evaluate/test_api_evaluate.py b/autotest/evaluate/test_api_evaluate.py index ffd3edc97e..c88e51e796 100644 --- a/autotest/evaluate/test_api_evaluate.py +++ b/autotest/evaluate/test_api_evaluate.py @@ -1,12 +1,22 @@ import pytest from utils.config_utils import get_evaluate_pytorch_model_list, get_evaluate_turbomind_model_list, get_workerid from utils.evaluate_utils import restful_test -from utils.run_restful_chat import start_restful_api, stop_restful_api +from utils.run_restful_chat import start_proxy_server, start_restful_api, stop_restful_api DEFAULT_PORT = 23333 +PROXY_PORT = 8000 +EVAL_CONFIGS = { + 'default': { + 'query_per_second': 1, + 'max_out_len': 32768, + 'batch_size': 500, + 'temperature': 0.1, + } +} -@pytest.fixture(scope='function', autouse=True) + +@pytest.fixture(scope='function') def prepare_environment(request, config, worker_id): param = request.param model = param['model'] @@ -17,6 +27,39 @@ def prepare_environment(request, config, worker_id): stop_restful_api(pid, startRes, param) +@pytest.fixture(scope='function') +def prepare_environment_judge_evaluate(request, config, worker_id): + if get_workerid(worker_id) is None: + port = PROXY_PORT + else: + port = PROXY_PORT + get_workerid(worker_id) + judge_config = { + 'model': 'Qwen/Qwen2.5-32B-Instruct', + 'backend': 'turbomind', + 'param': { + 'tp_num': 2, + 'extra': f'--proxy-url http://127.0.0.1:{port} --session-len 46000 --cache-max-entry-count 0.7 ', + 'cuda_prefix': None + }, + 'log_path': config.get('log_path'), + } + + param = judge_config['param'] + model = judge_config['model'] + backend = judge_config['backend'] + model_path = config.get('model_path') + '/' + model + + proxy_pid, proxy_process = start_proxy_server(config, worker_id) + + judge_pid, judge_start_res = start_restful_api(config, param, model, model_path, backend, worker_id) + + try: + yield request.param + finally: + stop_restful_api(judge_pid, judge_start_res, request.param) + stop_restful_api(proxy_pid, proxy_process, request.param) + + def get_turbomind_model_list(tp_num): model_list = get_evaluate_turbomind_model_list(tp_num, kvint_list=[4, 8]) new_model_list = [] @@ -35,85 +78,205 @@ def get_pytorch_model_list(tp_num): return new_model_list -def run_test(config, run_id, prepare_environment, worker_id): +def run_test(config, run_id, prepare_environment, worker_id, test_type='infer', eval_config_name='default'): + """Run test with specified evaluation configuration.""" + preset_config = EVAL_CONFIGS.get(eval_config_name, {}) + + if test_type == 'infer': + port = DEFAULT_PORT + else: # eval + port = PROXY_PORT + if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=port, + test_type=test_type, + **preset_config) else: result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) + port=port + get_workerid(worker_id), + test_type=test_type, + **preset_config) return result, msg +@pytest.mark.infer @pytest.mark.turbomind @pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=1), indirect=True) -def test_turbomind_restful_tp1(config, run_id, prepare_environment, worker_id): - result, msg = run_test(config, run_id, prepare_environment, worker_id) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_turbomind_restful_tp1(config, run_id, prepare_environment, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config) assert result, msg +@pytest.mark.infer @pytest.mark.turbomind @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=2), indirect=True) -def test_turbomind_restful_tp2(config, run_id, prepare_environment, worker_id): - result, msg = run_test(config, run_id, prepare_environment, worker_id) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_turbomind_restful_tp2(config, run_id, prepare_environment, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config) assert result, msg +@pytest.mark.infer @pytest.mark.turbomind @pytest.mark.gpu_num_4 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=4), indirect=True) -def test_turbomind_restful_tp4(config, run_id, prepare_environment, worker_id): - result, msg = run_test(config, run_id, prepare_environment, worker_id) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_turbomind_restful_tp4(config, run_id, prepare_environment, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config) assert result, msg +@pytest.mark.infer @pytest.mark.turbomind @pytest.mark.gpu_num_8 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=8), indirect=True) -def test_turbomind_restful_tp8(config, run_id, prepare_environment, worker_id): - result, msg = run_test(config, run_id, prepare_environment, worker_id) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_turbomind_restful_tp8(config, run_id, prepare_environment, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config) assert result, msg +@pytest.mark.infer @pytest.mark.pytorch @pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=1), indirect=True) -def test_pytorch_restful_tp1(config, run_id, prepare_environment, worker_id): - result, msg = run_test(config, run_id, prepare_environment, worker_id) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_pytorch_restful_tp1(config, run_id, prepare_environment, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config) assert result, msg +@pytest.mark.infer @pytest.mark.pytorch @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=2), indirect=True) -def test_pytorch_restful_tp2(config, run_id, prepare_environment, worker_id): - result, msg = run_test(config, run_id, prepare_environment, worker_id) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_pytorch_restful_tp2(config, run_id, prepare_environment, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config) assert result, msg +@pytest.mark.infer @pytest.mark.pytorch @pytest.mark.gpu_num_4 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=4), indirect=True) -def test_pytorch_restful_tp4(config, run_id, prepare_environment, worker_id): - result, msg = run_test(config, run_id, prepare_environment, worker_id) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_pytorch_restful_tp4(config, run_id, prepare_environment, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config) assert result, msg +@pytest.mark.infer @pytest.mark.pytorch @pytest.mark.gpu_num_8 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=8), indirect=True) -def test_pytorch_restful_tp8(config, run_id, prepare_environment, worker_id): - result, msg = run_test(config, run_id, prepare_environment, worker_id) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_pytorch_restful_tp8(config, run_id, prepare_environment, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment, worker_id, 'infer', eval_config) + assert result, msg + + +@pytest.mark.eval +@pytest.mark.pytorch +@pytest.mark.gpu_num_1 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_model_list(tp_num=1), indirect=True) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_pytorch_judgeeval_tp1(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config) + assert result, msg + + +@pytest.mark.eval +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_model_list(tp_num=2), indirect=True) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_pytorch_judgeeval_tp2(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config) + assert result, msg + + +@pytest.mark.eval +@pytest.mark.pytorch +@pytest.mark.flaky(reruns=0) +@pytest.mark.gpu_num_4 +@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_model_list(tp_num=4), indirect=True) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_pytorch_judgeeval_tp4(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config) + assert result, msg + + +@pytest.mark.eval +@pytest.mark.pytorch +@pytest.mark.flaky(reruns=0) +@pytest.mark.gpu_num_8 +@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_model_list(tp_num=8), indirect=True) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_pytorch_judgeeval_tp8(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config) + assert result, msg + + +@pytest.mark.eval +@pytest.mark.turbomind +@pytest.mark.gpu_num_1 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_model_list(tp_num=1), indirect=True) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_turbomind_judgeeval_tp1(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config) + assert result, msg + + +@pytest.mark.eval +@pytest.mark.turbomind +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_model_list(tp_num=2), indirect=True) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_turbomind_judgeeval_tp2(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config) + assert result, msg + + +@pytest.mark.eval +@pytest.mark.turbomind +@pytest.mark.gpu_num_4 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_model_list(tp_num=4), indirect=True) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_turbomind_judgeeval_tp4(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config) + assert result, msg + + +@pytest.mark.eval +@pytest.mark.turbomind +@pytest.mark.gpu_num_8 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_model_list(tp_num=8), indirect=True) +@pytest.mark.parametrize('eval_config', list(EVAL_CONFIGS.keys())) +def test_turbomind_judgeeval_tp8(config, run_id, prepare_environment_judge_evaluate, worker_id, eval_config): + result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval', eval_config) assert result, msg diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 527bb64994..09c33dea00 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -72,7 +72,7 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, c ) -def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT): +def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT, test_type='infer', **kwargs): work_dir = None try: model_name = prepare_environment['model'] @@ -108,34 +108,69 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA try: - if not os.path.exists(config_file): - return False, f'Config file {config_file} not found' - - cfg = Config.fromfile(config_file) - - cfg.MODEL_NAME = summary_model_name - cfg.MODEL_PATH = model_path - cfg.API_BASE = f'http://127.0.0.1:{port}/v1' # noqa: E231 - - if cfg.models and len(cfg.models) > 0: - model_cfg = cfg.models[0] - model_cfg['abbr'] = f'{summary_model_name}-lmdeploy-api' - model_cfg['openai_api_base'] = f'http://127.0.0.1:{port}/v1' # noqa: E231 - model_cfg['path'] = model_path - if 'backend' in model_cfg: - model_cfg['backend'] = backend_type - - if 'engine_config' in model_cfg and 'communicator' in model_cfg['engine_config']: - model_cfg['engine_config']['communicator'] = communicator - - simple_model_name = model_name.replace('/', '_') - temp_config_file = f'temp_{simple_model_name}_{os.getpid()}.py' + temp_config_file = f"temp_{backend_type}_{summary_model_name.replace('/', '_')}_{communicator}.py" temp_config_path = os.path.join(log_path, temp_config_file) - cfg.dump(temp_config_path) - print(f'Modified config saved to: {temp_config_path}') - - cmd = ['opencompass', temp_config_path, '--reuse', '--max-num-workers', '16', '-w', work_dir] + if test_type == 'infer': + if not os.path.exists(config_file): + return False, f'Config file {config_file} not found' + + cfg = Config.fromfile(config_file) + + cfg.MODEL_NAME = summary_model_name + cfg.MODEL_PATH = model_path + cfg.API_BASE = f'http://127.0.0.1:{port}/v1' # noqa: E231 + + if cfg.models and len(cfg.models) > 0: + model_cfg = cfg.models[0] + model_cfg['abbr'] = f'{summary_model_name}-lmdeploy-api' + model_cfg['openai_api_base'] = f'http://127.0.0.1:{port}/v1' # noqa: E231 + model_cfg['path'] = model_path + + for key, value in kwargs.items(): + model_cfg[key] = value + + cfg.dump(temp_config_path) + print(f'Modified config saved to: {temp_config_path}') + elif test_type == 'eval': + if not os.path.exists(temp_config_path): + error_msg = f'Temp config file {temp_config_path} not found for eval stage' + write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type, + communicator, work_dir) + return False, error_msg + + cfg = Config.fromfile(temp_config_path) + print(f'Using existing temp config file: {temp_config_path}') + + cfg.JUDGE_API_BASE = f'http://127.0.0.1:{port}/v1' + cfg.JUDGE_MODEL_PATH = os.path.join(model_base_path, 'Qwen/Qwen2.5-32B-Instruct') + + if hasattr(cfg, 'judge_cfg'): + cfg.judge_cfg['path'] = cfg.JUDGE_MODEL_PATH + cfg.judge_cfg['openai_api_base'] = cfg.JUDGE_API_BASE + cfg.judge_cfg['tokenizer_path'] = cfg.JUDGE_MODEL_PATH + + if hasattr(cfg, 'datasets') and cfg.datasets: + for dataset in cfg.datasets: + if 'eval_cfg' in dataset and 'evaluator' in dataset['eval_cfg']: + evaluator = dataset['eval_cfg']['evaluator'] + + if 'judge_cfg' in evaluator: + evaluator['judge_cfg']['path'] = cfg.JUDGE_MODEL_PATH + evaluator['judge_cfg']['openai_api_base'] = cfg.JUDGE_API_BASE + evaluator['judge_cfg']['tokenizer_path'] = cfg.JUDGE_MODEL_PATH + + if 'llm_evaluator' in evaluator and 'judge_cfg' in evaluator['llm_evaluator']: + evaluator['llm_evaluator']['judge_cfg']['path'] = cfg.JUDGE_MODEL_PATH + evaluator['llm_evaluator']['judge_cfg']['openai_api_base'] = cfg.JUDGE_API_BASE + evaluator['llm_evaluator']['judge_cfg']['tokenizer_path'] = cfg.JUDGE_MODEL_PATH + + cfg.dump(temp_config_path) + print(f'Modified config for eval stage saved to: {temp_config_path}') + + cmd = [ + 'opencompass', temp_config_path, '--reuse', '--max-num-workers', '16', '-w', work_dir, '-m', test_type + ] print(f"Running command: {' '.join(cmd)}") print(f'Work directory: {work_dir}') @@ -144,7 +179,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA stdout_output = result.stdout stderr_output = result.stderr - log_filename = (f'eval_{backend_type}_' + log_filename = (f'{test_type}_{backend_type}_' f"{model_name.replace('/', '_')}_" f'{communicator}_' f'{worker_id}_' @@ -198,8 +233,9 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA error_lines = ' | '.join(error_lines[:3]) final_msg += f'\nLog errors: {error_lines}' - write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type, communicator, - work_dir) + if test_type == 'eval': + write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type, + communicator, work_dir) return final_result, final_msg @@ -210,13 +246,13 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA except subprocess.TimeoutExpired: timeout_msg = (f'Evaluation timed out for {model_name} ' f'after 7200 seconds') - if work_dir: + if work_dir and test_type == 'eval': write_to_summary(summary_model_name, tp_num, False, timeout_msg, worker_id, backend_type, communicator, work_dir) return False, timeout_msg except Exception as e: error_msg = f'Error during evaluation for {model_name}: {str(e)}' - if work_dir: + if work_dir and test_type == 'eval': write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type, communicator, work_dir) return False, error_msg diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 876fd295e2..d6a02b170b 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -6,6 +6,7 @@ import allure import psutil +import requests from openai import OpenAI from pytest_assume.plugin import assume from utils.config_utils import _is_bf16_supported_by_device, get_cuda_prefix_by_workerid, get_workerid @@ -17,6 +18,7 @@ BASE_HTTP_URL = 'http://localhost' DEFAULT_PORT = 23333 +PROXY_PORT = 8000 def start_restful_api(config, param, model, model_path, backend_type, worker_id): @@ -53,8 +55,7 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) else: port = DEFAULT_PORT + worker_num - cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path + ' --session-len 8096 --server-port ' + - str(port), + cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path + ' --server-port ' + str(port), config, model, need_tp=True, @@ -681,3 +682,90 @@ def run_tools_case(config, port: int = DEFAULT_PORT): file.close() allure.attach.file(restful_log, attachment_type=allure.attachment_type.TEXT) + + +def proxy_health_check(url): + """Check if proxy server is healthy.""" + try: + # For proxy server, we check if it responds to the /v1/models endpoint + import requests + response = requests.get(f'{url}/v1/models', timeout=5) + if response.status_code == 200: + return True + return False + except Exception: + return False + + +def start_proxy_server(config, worker_id): + """Start the proxy server for testing with enhanced error handling and + logging.""" + log_path = config.get('eval_log_path') + if log_path is None: + log_path = '/nvme/qa_test_models/evaluation_report' + os.makedirs(log_path, exist_ok=True) + + timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + proxy_log = os.path.join(log_path, f'proxy_server_{worker_id}_{timestamp}.log') + + worker_num = get_workerid(worker_id) + if worker_num is None: + port = PROXY_PORT + else: + port = PROXY_PORT + worker_num + + proxy_url = f'http://127.0.0.1:{port}' + try: + response = requests.get(f'{proxy_url}/nodes/status', timeout=5) + if response.status_code == 200: + print(f'Terminating existing nodes on proxy {proxy_url}') + requests.get(f'{proxy_url}/nodes/terminate_all', timeout=10) + sleep(5) + except requests.exceptions.RequestException: + pass + + cmd = (f'lmdeploy serve proxy --server-name 127.0.0.1 --server-port {port} ' + f'--routing-strategy min_expected_latency --serving-strategy Hybrid') + + print(f'Starting proxy server with command: {cmd}') + print(f'Proxy log will be saved to: {proxy_log}') + + proxy_file = open(proxy_log, 'w') + proxy_process = subprocess.Popen([cmd], + stdout=proxy_file, + stderr=proxy_file, + shell=True, + text=True, + encoding='utf-8') + pid = proxy_process.pid + + start_time = int(time()) + timeout = 300 + + sleep(5) + for i in range(timeout): + sleep(1) + if proxy_health_check(f'http://127.0.0.1:{port}'): + break + + try: + # Check if process is still running + return_code = proxy_process.wait(timeout=1) # Small timeout to check status + if return_code != 0: + with open(proxy_log, 'r') as f: + content = f.read() + print(content) + return 0, proxy_process + except subprocess.TimeoutExpired: + continue + + end_time = int(time()) + total_time = end_time - start_time + if total_time >= timeout: + break + + proxy_file.close() + allure.attach.file(proxy_log, attachment_type=allure.attachment_type.TEXT) + + print(f'Proxy server started successfully with PID: {pid}') + return pid, proxy_process From 58b5c46a70d86bb840ae7cbcb88de49c061d3d3f Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Tue, 14 Oct 2025 17:53:52 +0800 Subject: [PATCH 2/7] TEST: update yaml --- autotest/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/autotest/config.yaml b/autotest/config.yaml index 87c428fb18..b4acb02fb3 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -289,6 +289,7 @@ turbomind_quatization: - deepseek-ai/DeepSeek-V2-Lite-Chat no_kvint8: - deepseek-ai/DeepSeek-V2-Chat + - Qwen/Qwen2.5-7B-Instruct pytorch_quatization: awq: From 3701067f466879380b666b3a5645edb1da6c9057 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 15 Oct 2025 10:54:28 +0800 Subject: [PATCH 3/7] Update api_eval_h800.yml --- .github/workflows/api_eval_h800.yml | 37 ++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index 37b7832fc5..37640624be 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -18,6 +18,20 @@ on: description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' type: string default: "['turbomind', 'pytorch']" + execution_mode: + required: false + description: 'Select execution mode: infer, eval, or both. Default is "both"' + type: choice + options: + - both + - infer + - eval + default: 'both' + run_id: + required: false + description: 'Set custom run ID. If not provided, github.run_id will be used' + type: string + default: '' env: @@ -91,9 +105,9 @@ jobs: - /nvme/qa_test_models:/nvme/qa_test_models - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share - - /mnt/137_nvme2:/mnt/137_nvme2 - - /mnt/137_nvme3:/mnt/137_nvme3 - - /mnt/137_nvme4:/mnt/137_nvme4 + - /mnt/158_nvme2:/mnt/158_nvme2 + - /mnt/158_nvme3:/mnt/158_nvme3 + - /mnt/158_nvme4:/mnt/158_nvme4 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Create and change to _wk directory @@ -131,11 +145,18 @@ jobs: if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') run: | overall_exit=0 - ln -s /mnt/187/opencompass-data/data ./data - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + ln -s /nvme/qa_test_models/resource/opencompass-data/data ./data + ln -s /nvme/qa_test_models/resource/nltk_data /usr/share/nltk_data + execution_mode="${{ github.event.inputs.execution_mode || 'both' }}" + if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and ${{matrix.backend}} and infer" -n 8 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and ${{matrix.backend}} and infer" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and ${{matrix.backend}} and infer" -n 2 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and ${{matrix.backend}} and infer" -n 1 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + fi + if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then + pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.backend}} and eval" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + fi exit $overall_exit - name: Clear workspace if: always() From dd9dd3604cb8c140dfe8e51223e7cef728ed6ede Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 16 Oct 2025 14:34:34 +0800 Subject: [PATCH 4/7] TEST: fix port --- autotest/evaluate/test_api_evaluate.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/autotest/evaluate/test_api_evaluate.py b/autotest/evaluate/test_api_evaluate.py index c88e51e796..5887736595 100644 --- a/autotest/evaluate/test_api_evaluate.py +++ b/autotest/evaluate/test_api_evaluate.py @@ -37,9 +37,13 @@ def prepare_environment_judge_evaluate(request, config, worker_id): 'model': 'Qwen/Qwen2.5-32B-Instruct', 'backend': 'turbomind', 'param': { - 'tp_num': 2, - 'extra': f'--proxy-url http://127.0.0.1:{port} --session-len 46000 --cache-max-entry-count 0.7 ', - 'cuda_prefix': None + 'tp_num': + 2, + 'extra': + '--server-name 127.0.0.1 --proxy-url http://127.0.0.1:{} --session-len 46000 ' + '--cache-max-entry-count 0.7 '.format(port), + 'cuda_prefix': + None }, 'log_path': config.get('log_path'), } From 81f4c10e31b238db3dfa52f401bb71a51bfe9555 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 17 Oct 2025 10:26:56 +0800 Subject: [PATCH 5/7] update --- .github/workflows/daily_ete_test.yml | 6 +++++- .github/workflows/daily_ete_test_3090.yml | 2 ++ .github/workflows/daily_ete_test_5080.yml | 5 +++++ .github/workflows/daily_ete_test_h800.yml | 2 ++ .github/workflows/evaluate.yml | 1 + .github/workflows/evaluate_h800.yml | 1 + 6 files changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 1c2f0b549d..bfe61bbbd7 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -236,6 +236,7 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | + python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -338,6 +339,7 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | + python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -422,6 +424,7 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | + python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -479,6 +482,7 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | + python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -538,7 +542,7 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install sentence_transformers==2.2.2 --no-deps + python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 9243887ecf..9878c7672f 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -223,6 +223,7 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | + python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -287,6 +288,7 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | + python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml index 3a080f2615..53a8f99dd8 100644 --- a/.github/workflows/daily_ete_test_5080.yml +++ b/.github/workflows/daily_ete_test_5080.yml @@ -186,6 +186,7 @@ jobs: rm -rf $workdir mkdir $workdir chmod -R 777 $workdir + test_tools: if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, 5080-r1] @@ -222,6 +223,7 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | + python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -263,6 +265,7 @@ jobs: rm -rf $workdir mkdir $workdir chmod -R 777 $workdir + test_restful: if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} runs-on: [self-hosted, 5080-r1] @@ -286,6 +289,7 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | + python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -351,6 +355,7 @@ jobs: rm -rf $workdir mkdir $workdir chmod -R 777 $workdir + get_coverage_report: if: ${{!cancelled() && success()}} runs-on: [self-hosted, 5080-r1] diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index 9f1db0dce8..4850ac2033 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -168,6 +168,7 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | + python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -258,6 +259,7 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | + python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 9079d1fa68..c8258c35ed 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -123,6 +123,7 @@ jobs: name: my-artifact-${{ github.run_id }}-py310 - name: Install lmdeploy - dependency run: | + python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r /root/models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml index 07e66f9dc3..5d4f9768a5 100644 --- a/.github/workflows/evaluate_h800.yml +++ b/.github/workflows/evaluate_h800.yml @@ -123,6 +123,7 @@ jobs: name: my-artifact-${{ github.run_id }}-py310 - name: Install lmdeploy - dependency run: | + python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} From d7c5d45f26e75266a28fa18fdaf430c45d1df9d8 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 17 Oct 2025 10:29:13 +0800 Subject: [PATCH 6/7] update --- .github/workflows/api_eval_h800.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index 37640624be..473a7e35cd 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -134,6 +134,7 @@ jobs: - name: Install opencompass run: | python3 -m pip install opencompass + python3 -m pip install langdetect - name: Check env run: | python3 -m pip list From 5d5419d3a2829f8b81c98e780b4c8f78318d4dff Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 17 Oct 2025 11:33:12 +0800 Subject: [PATCH 7/7] update --- .github/workflows/api_eval.yml | 1 - .github/workflows/api_eval_h800.yml | 1 - .github/workflows/daily_ete_test.yml | 5 ----- .github/workflows/daily_ete_test_3090.yml | 2 -- .github/workflows/daily_ete_test_5080.yml | 2 -- .github/workflows/daily_ete_test_h800.yml | 2 -- .github/workflows/evaluate.yml | 1 - .github/workflows/evaluate_h800.yml | 1 - 8 files changed, 15 deletions(-) diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml index d72d5569f7..2d43eb0e05 100644 --- a/.github/workflows/api_eval.yml +++ b/.github/workflows/api_eval.yml @@ -121,7 +121,6 @@ jobs: name: my-artifact-${{ github.run_id }}-py310 - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index 473a7e35cd..feb363bad5 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -124,7 +124,6 @@ jobs: name: my-artifact-${{ github.run_id }}-py310 - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index bfe61bbbd7..ecfb9ae54e 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -236,7 +236,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -339,7 +338,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -424,7 +422,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -482,7 +479,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -542,7 +538,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 9878c7672f..9243887ecf 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -223,7 +223,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -288,7 +287,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml index 53a8f99dd8..89487d7d94 100644 --- a/.github/workflows/daily_ete_test_5080.yml +++ b/.github/workflows/daily_ete_test_5080.yml @@ -223,7 +223,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -289,7 +288,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index 4850ac2033..9f1db0dce8 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -168,7 +168,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -259,7 +258,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index c8258c35ed..9079d1fa68 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -123,7 +123,6 @@ jobs: name: my-artifact-${{ github.run_id }}-py310 - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r /root/models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml index 5d4f9768a5..07e66f9dc3 100644 --- a/.github/workflows/evaluate_h800.yml +++ b/.github/workflows/evaluate_h800.yml @@ -123,7 +123,6 @@ jobs: name: my-artifact-${{ github.run_id }}-py310 - name: Install lmdeploy - dependency run: | - python3 -m pip install -r requirements_cuda.txt python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}