31 changes: 26 additions & 5 deletions .github/workflows/api_eval.yml
@@ -18,6 +18,20 @@ on:
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
execution_mode:
required: false
description: 'Select execution mode: infer, eval, or both. Default is "both"'
type: choice
options:
- both
- infer
- eval
default: 'both'
run_id:
required: false
description: 'Set custom run ID. If not provided, github.run_id will be used'
type: string
default: ''


env:
@@ -107,7 +121,6 @@ jobs:
name: my-artifact-${{ github.run_id }}-py310
- name: Install lmdeploy - dependency
run: |
python3 -m pip install -r requirements_cuda.txt
python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
- name: Install lmdeploy
run: |
@@ -116,6 +129,7 @@
- name: Install opencompass
run: |
python3 -m pip install opencompass
python3 -m pip install langdetect
- name: Check env
run: |
python3 -m pip list
@@ -128,10 +142,17 @@
run: |
overall_exit=0
ln -s /mnt/187/opencompass-data/data ./data
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
ln -s /nvme/qa_test_models/resource/nltk_data /usr/share/nltk_data
execution_mode="${{ github.event.inputs.execution_mode || 'both' }}"
if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and ${{matrix.backend}} and infer" -n 8 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and ${{matrix.backend}} and infer" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and ${{matrix.backend}} and infer" -n 2 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and ${{matrix.backend}} and infer" -n 1 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
fi
if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then
pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.backend}} and eval" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
fi
exit $overall_exit
- name: Clear workspace
if: always()
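Note on the api_eval.yml changes above: the new `-m` expressions rely on `infer` and `eval` pytest markers, and every pytest call now forwards `--run_id ${{ github.event.inputs.run_id || github.run_id }}`. The sketch below shows, under stated assumptions, how such an option and the markers could be registered on the test side; only the option name and the marker names come from the workflow, the rest is hypothetical.

```python
# Hypothetical conftest.py sketch. Only the --run_id option name and the
# infer/eval marker names are taken from the workflow above; everything else
# is illustrative, not the repository's actual implementation.
import pytest


def pytest_addoption(parser):
    # Receives the run id forwarded by the workflow
    # (github.event.inputs.run_id when provided, otherwise github.run_id).
    parser.addoption('--run_id', action='store', default=None,
                     help='Identifier used to group inference and evaluation artifacts.')


def pytest_configure(config):
    # Register the stage markers so `-m "... and infer"` / `-m "... and eval"`
    # select tests without unknown-marker warnings.
    config.addinivalue_line('markers', 'infer: inference-stage test cases')
    config.addinivalue_line('markers', 'eval: evaluation-stage test cases')


@pytest.fixture
def run_id(request):
    # Expose the forwarded run id to test cases.
    return request.config.getoption('--run_id')
```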
39 changes: 30 additions & 9 deletions .github/workflows/api_eval_h800.yml
@@ -18,6 +18,20 @@ on:
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
execution_mode:
required: false
description: 'Select execution mode: infer, eval, or both. Default is "both"'
type: choice
options:
- both
- infer
- eval
default: 'both'
run_id:
required: false
description: 'Set custom run ID. If not provided, github.run_id will be used'
type: string
default: ''


env:
@@ -91,9 +105,9 @@
- /nvme/qa_test_models:/nvme/qa_test_models
- /nvme1/qa_test_models:/nvme1/qa_test_models
- /nvme2/share:/nvme2/share
- /mnt/137_nvme2:/mnt/137_nvme2
- /mnt/137_nvme3:/mnt/137_nvme3
- /mnt/137_nvme4:/mnt/137_nvme4
- /mnt/158_nvme2:/mnt/158_nvme2
- /mnt/158_nvme3:/mnt/158_nvme3
- /mnt/158_nvme4:/mnt/158_nvme4
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Create and change to _wk directory
@@ -110,7 +124,6 @@
name: my-artifact-${{ github.run_id }}-py310
- name: Install lmdeploy - dependency
run: |
python3 -m pip install -r requirements_cuda.txt
python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
- name: Install lmdeploy
run: |
@@ -120,6 +133,7 @@
- name: Install opencompass
run: |
python3 -m pip install opencompass
python3 -m pip install langdetect
- name: Check env
run: |
python3 -m pip list
@@ -131,11 +145,18 @@
if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind')
run: |
overall_exit=0
ln -s /mnt/187/opencompass-data/data ./data
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
ln -s /nvme/qa_test_models/resource/opencompass-data/data ./data
ln -s /nvme/qa_test_models/resource/nltk_data /usr/share/nltk_data
execution_mode="${{ github.event.inputs.execution_mode || 'both' }}"
if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and ${{matrix.backend}} and infer" -n 8 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and ${{matrix.backend}} and infer" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and ${{matrix.backend}} and infer" -n 2 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and ${{matrix.backend}} and infer" -n 1 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
fi
if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then
pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.backend}} and eval" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
fi
exit $overall_exit
- name: Clear workspace
if: always()
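The api_eval_h800.yml job applies the same execution_mode/run_id split on the H800 runner, with updated /mnt/158_* mounts and local opencompass-data/nltk_data links. As a reminder of how the marker expressions compose, the hypothetical test below would only be collected by `-m "gpu_num_2 and turbomind and infer"` because it carries all three markers; the test name and body are made up for illustration.

```python
# Illustrative only: a test must carry every marker named in the -m expression
# to be selected, e.g. `-m "gpu_num_2 and turbomind and infer"`.
# The test name and body are hypothetical.
import pytest


@pytest.mark.gpu_num_2
@pytest.mark.turbomind
@pytest.mark.infer
def test_api_infer_example():
    # Placeholder body; the real suite lives in
    # autotest/evaluate/test_api_evaluate.py.
    assert True
```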
1 change: 0 additions & 1 deletion .github/workflows/daily_ete_test.yml
@@ -538,7 +538,6 @@ jobs:
cp -r ${{env.TEST_CODE_PATH}}/. .
- name: Install lmdeploy - dependency
run: |
python3 -m pip install sentence_transformers==2.2.2 --no-deps
python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
- name: Install lmdeploy
run: |
3 changes: 3 additions & 0 deletions .github/workflows/daily_ete_test_5080.yml
@@ -186,6 +186,7 @@ jobs:
rm -rf $workdir
mkdir $workdir
chmod -R 777 $workdir

test_tools:
if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}}
runs-on: [self-hosted, 5080-r1]
@@ -263,6 +264,7 @@
rm -rf $workdir
mkdir $workdir
chmod -R 777 $workdir

test_restful:
if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}}
runs-on: [self-hosted, 5080-r1]
@@ -351,6 +353,7 @@
rm -rf $workdir
mkdir $workdir
chmod -R 777 $workdir

get_coverage_report:
if: ${{!cancelled() && success()}}
runs-on: [self-hosted, 5080-r1]
1 change: 1 addition & 0 deletions autotest/config.yaml
@@ -289,6 +289,7 @@ turbomind_quatization:
- deepseek-ai/DeepSeek-V2-Lite-Chat
no_kvint8:
- deepseek-ai/DeepSeek-V2-Chat
- Qwen/Qwen2.5-7B-Instruct

pytorch_quatization:
awq:
143 changes: 122 additions & 21 deletions autotest/evaluate/eval_config_chat.py
@@ -1,53 +1,154 @@
# flake8: noqa

from mmengine.config import read_base
from opencompass.models import OpenAISDK
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.utils.text_postprocessors import extract_non_reasoning_content

#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups # noqa: F401, E501

mmlu_datasets = [
x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management',
'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting',
'professional_medicine', 'virology'
]
]
# Datasets
from opencompass.configs.datasets.aime2025.aime2025_llmjudge_academic import aime2025_datasets
from opencompass.configs.datasets.gpqa.gpqa_cascade_eval_academic import gpqa_datasets
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import ifeval_datasets
# LiveCodeBench dataset commented out to avoid version errors
# from opencompass.configs.datasets.livecodebench.livecodebench_v6_academic import \
# LCBCodeGeneration_dataset
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets
# HLE dataset commented out to avoid connection errors
# from opencompass.configs.datasets.HLE.hle_llmverify_academic import \
# hle_datasets
# Summary Groups
from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups

datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
#######################################################################
# Model Configuration #
#######################################################################

MODEL_NAME = ''
MODEL_PATH = ''
API_BASE = ''
JUDGE_MODEL_PATH = ''
JUDGE_API_BASE = ''

api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])

# Use OpenAISDK to configure LMDeploy OpenAI interface
models = [
dict(type=OpenAISDK,
abbr=f'{MODEL_NAME}-lmdeploy-api',
openai_api_base=API_BASE,
key='EMPTY',
path=MODEL_PATH,
key='EMPTY',
openai_api_base=API_BASE,
retry=3,
run_cfg=dict(num_gpus=0),
meta_template=api_meta_template,
max_out_len=32768,
batch_size=500,
temperature=0.1,
pred_postprocessor=dict(type=extract_non_reasoning_content))
]

#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
# Remove LCBCodeGeneration_dataset due to version errors

mmlu_pro_datasets = [x for x in mmlu_pro_datasets if 'math' in x['abbr'] or 'other' in x['abbr']]

# Modify datasets list to exclude hle_datasets and LCBCodeGeneration_dataset
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets') and k != 'hle_datasets'), [])

# LLM judge config: using LLM to evaluate predictions
judge_cfg = dict(
type=OpenAISDK,
path=JUDGE_MODEL_PATH,
key='EMPTY',
openai_api_base=JUDGE_API_BASE,
meta_template=dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]),
query_per_second=16,
batch_size=1024,
temperature=0.001,
tokenizer_path=JUDGE_MODEL_PATH,
verbose=True,
max_out_len=16384,
max_seq_len=49152,
)

for item in datasets:
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
if 'llm_evaluator' in item['eval_cfg']['evaluator'].keys(
) and 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg

#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################

core_summary_groups = [
{
'name':
'core_average',
'subsets': [
['IFEval', 'Prompt-level-strict-accuracy'],
# Remove hle_llmjudge due to unavailable dataset
# ['hle_llmjudge', 'accuracy'],
['aime2025_repeat_32', 'accuracy (32 runs average)'],
['GPQA_diamond_repeat_4', 'accuracy (4 runs average)'],
['mmlu_pro', 'naive_average'],
'mmlu_pro_math',
'mmlu_pro_other',
# Remove lcb_code_generation_repeat_6 due to version errors
# ['lcb_code_generation_repeat_6', 'pass@1 (6 runs average)'],
],
},
]

summarizer = dict(
dataset_abbrs=[
['mmlu', 'naive_average'],
['gsm8k', 'accuracy'],
'mmlu-other',
['core_average', 'naive_average'],
['IFEval', 'Prompt-level-strict-accuracy'],
# Remove hle_llmjudge due to unavailable dataset
# ['hle_llmjudge', 'accuracy'],
['GPQA_diamond_repeat_4', 'accuracy (4 runs average)'],
['aime2025_repeat_32', 'accuracy (32 runs average)'],
['mmlu_pro', 'naive_average'],
'mmlu_pro_math',
'mmlu_pro_other',
],
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []) + core_summary_groups,
)

for item in datasets:
if 'max_out_len' in item['infer_cfg']['inferencer']:
del item['infer_cfg']['inferencer']['max_out_len']

#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################

# infer with local runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0,
task=dict(type=OpenICLInferTask),
),
)

# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLEvalTask)),
)