Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CI] add base model evaluation #2490

Merged
merged 19 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 0 additions & 137 deletions .github/resources/opencompass-hf-results.json

This file was deleted.

102 changes: 26 additions & 76 deletions .github/scripts/action_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,31 +79,10 @@ def add_summary(csv_path: str):
_append_summary('\n')


def _load_hf_results(test_results: dict, model_name: str):
"""Read opencompass eval results."""
lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])
hf_res_path = os.path.join(
lmdeploy_dir, '.github/resources/opencompass-hf-results.json')
out = OrderedDict()
if os.path.exists(hf_res_path):
with open(hf_res_path, 'r') as f:
data = json.load(f)
if model_name in data:
res = data[model_name]
for dataset in test_results:
value = '-'
if dataset in res:
value = res[dataset]
out[dataset] = value
else:
logging.warning(
f'No opencompass results found for model {model_name}')
return out


def evaluate(models: List[str],
datasets: List[str],
workspace: str,
evaluate_type: str,
is_smoke: bool = False):
"""Evaluate models from lmdeploy using opencompass.

Expand All @@ -112,66 +91,48 @@ def evaluate(models: List[str],
workspace: Working directory.
"""
os.makedirs(workspace, exist_ok=True)
output_csv = os.path.join(workspace, 'results.csv')
output_csv = os.path.join(workspace, f'results_{evaluate_type}.csv')
if os.path.exists(output_csv):
os.remove(output_csv)
num_model = len(models)
test_model_names = set()
for idx, ori_model in enumerate(models):
print()
print(50 * '==')
print(f'Start evaluating {idx+1}/{num_model} {ori_model} ...')
model = ori_model.lower()
model_, precision = model.rsplit('_', 1)
do_lite = precision in ['4bits', 'kvint4', 'kvint8']
if do_lite:
model = model_
engine_type, model_ = model.split('_', 1)
if engine_type not in ['tb', 'pt', 'hf']:
engine_type = 'tb'
else:
model = model_

opencompass_dir = os.path.abspath(os.environ['OPENCOMPASS_DIR'])
lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])
config_path = os.path.join(
lmdeploy_dir, '.github/scripts/eval_opencompass_config.py')
lmdeploy_dir, f'.github/scripts/eval_{evaluate_type}_config.py')
config_path_new = os.path.join(opencompass_dir, 'configs',
'eval_lmdeploy.py')
if os.path.exists(config_path_new):
os.remove(config_path_new)
shutil.copy(config_path, config_path_new)
target_model = f'{engine_type}_{model}'
if do_lite:
target_model = target_model + f'_{precision}'

cfg = Config.fromfile(config_path_new)
if not hasattr(cfg, target_model):
logging.error(
f'Model {target_model} not found in configuration file')
if not hasattr(cfg, model):
logging.error(f'Model {model} not found in configuration file')
continue
if engine_type != 'hf':
model_cfg = cfg[target_model]
hf_model_path = model_cfg['path']
if not os.path.exists(hf_model_path):
logging.error(f'Model path not exists: {hf_model_path}')
continue
logging.info(
f'Start evaluating {target_model} ...\\nn{model_cfg}\n\n')
else:
hf_model_path = target_model

model_cfg = cfg[model]
logging.info(f'Start evaluating {model} ...\\nn{model_cfg}\n\n')

with open(config_path_new, 'a') as f:
f.write(f'\ndatasets = {datasets}\n')
if is_smoke:
f.write('\nfor d in datasets:\n')
f.write(" if d['reader_cfg'] is not None:\n")
f.write(" d['reader_cfg']['test_range'] = '[0:50]'\n")
if engine_type == 'hf':
f.write(f'\nmodels = [ *{target_model} ]\n')
if model.startswith('hf'):
f.write(f'\nmodels = [ *{model} ]\n')
else:
f.write(f'\nmodels = [ {target_model} ]\n')
f.write(f'\nmodels = [ {model} ]\n')

work_dir = os.path.join(workspace, target_model)
work_dir = os.path.join(workspace, model)
cmd_eval = [
f'python3 {opencompass_dir}/run.py {config_path_new} -w {work_dir} --reuse --max-num-workers 8' # noqa: E501
f'python3 {opencompass_dir}/run.py {config_path_new} -w {work_dir} --reuse --max-num-workers 8 --dump-eval-details' # noqa: E501
]
eval_log = os.path.join(workspace, f'eval.{ori_model}.txt')
start_time = time.time()
Expand Down Expand Up @@ -211,34 +172,20 @@ def evaluate(models: List[str],
acc = json.load(f)['accuracy']
acc = f'{float(acc):.2f}'
model_results['crows_pairs'] = acc
logging.info(f'\n{hf_model_path}\n{model_results}')
logging.info(f'\n{model}\n{model_results}')
dataset_names = list(model_results.keys())
prec = precision if do_lite else '-'

row = ','.join([model, engine_type, prec] +
[str(task_duration_seconds)] +
row = ','.join([model, str(task_duration_seconds)] +
[model_results[_] for _ in dataset_names])
hf_res_row = None
if hf_model_path not in test_model_names:
test_model_names.add(hf_model_path)
hf_res = _load_hf_results(model_results, hf_model_path)
if hf_res:
hf_metrics = [
hf_res[d] if d in hf_res else '-' for d in dataset_names
]
hf_res_row = ','.join([model, 'hf', '-', '-'] + hf_metrics)

if not os.path.exists(output_csv):
with open(output_csv, 'w') as f:
header = ','.join(['Model', 'Engine', 'Precision'] +
['task_duration_secs'] + dataset_names)
header = ','.join(['Model', 'task_duration_secs'] +
dataset_names)
f.write(header + '\n')
if hf_res_row:
f.write(hf_res_row + '\n')
f.write(row + '\n')
else:
with open(output_csv, 'a') as f:
if hf_res_row:
f.write(hf_res_row + '\n')
f.write(row + '\n')

# write to github action summary
Expand Down Expand Up @@ -267,15 +214,18 @@ def generate_benchmark_report(report_path: str):
subfolders = [f.path for f in os.scandir(report_path) if f.is_dir()]
for dir_path in subfolders:
second_subfolders = [
f.path for f in os.scandir(dir_path) if f.is_dir()
f.path for f in sorted(os.scandir(dir_path), key=lambda x: x.name)
if f.is_dir()
]
for sec_dir_path in second_subfolders:
model = sec_dir_path.replace(report_path + '/', '')
print('-' * 25, model, '-' * 25)
_append_summary('-' * 25 + model + '-' * 25 + '\n')

benchmark_subfolders = [
f.path for f in os.scandir(sec_dir_path) if f.is_dir()
f.path
for f in sorted(os.scandir(sec_dir_path), key=lambda x: x.name)
if f.is_dir()
]
for backend_subfolder in benchmark_subfolders:
benchmark_type = backend_subfolder.replace(
Expand Down
68 changes: 68 additions & 0 deletions .github/scripts/eval_base_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""OpenCompass configuration for base (pretrained) model evaluation.

Aggregates dataset and model configs through mmengine's ``read_base`` so
the CI driver can append ``datasets``/``models`` selections to a copy of
this file and launch an opencompass run.
"""
from copy import deepcopy

from mmengine.config import read_base

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \
        bbh_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.ceval.ceval_ppl import \
        ceval_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
        cmmlu_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.crowspairs.crowspairs_ppl import \
        crowspairs_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.drop.drop_gen_a2697c import \
        drop_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.gpqa.gpqa_ppl_6bf57a import \
        gpqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.hellaswag.hellaswag_ppl import \
        hellaswag_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \
        mmlu_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.race.race_ppl_a138cd import \
        race_datasets  # noqa: F401, E501
    # read models (imported names are referenced by the CI driver when it
    # writes the ``models = [...]`` line, hence the F401 suppressions)
    from opencompass.configs.models.baichuan.hf_baichuan_7b import \
        models as hf_baichuan_7b  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_7b import \
        models as hf_gemma_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
        models as hf_internlm2_5_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
        models as hf_internlm2_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
        models as hf_internlm2_20b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm_7b import \
        models as hf_internlm_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm_20b import \
        models as hf_internlm_20b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
        models as lmdeploy_internlm2_5_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama2_7b import \
        models as hf_llama2_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_8b import \
        models as hf_llama3_8b  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_7b_v0_1 import \
        models as hf_mistral_7b_v0_1  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mixtral_8x7b_v0_1 import \
        models as hf_mixtral_8x7b_v0_1  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen1_5_7b import \
        models as hf_qwen1_5_7b  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_7b import \
        models as hf_qwen2_7b  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen_7b import \
        models as hf_qwen_7b  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen1_5_7b import \
        models as lmdeploy_qwen1_5_7b  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \
        models as lmdeploy_qwen2_7b  # noqa: F401, E501
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import \
        summarizer  # noqa: F401, E501
lvhan028 marked this conversation as resolved.
Show resolved Hide resolved

# Turbomind-engine aliases of the lmdeploy model configs.
# Each imported ``models`` list appears to hold exactly one model dict
# (TODO confirm against the opencompass config files).  Index it
# explicitly rather than unpacking into ``deepcopy``: with more than one
# entry, ``deepcopy(*models)`` would silently bind the second config to
# deepcopy's ``memo`` parameter and drop it from the result.
turbomind_qwen1_5_7b = deepcopy(lmdeploy_qwen1_5_7b[0])
turbomind_qwen2_7b = deepcopy(lmdeploy_qwen2_7b[0])
turbomind_internlm2_5_7b = deepcopy(lmdeploy_internlm2_5_7b[0])
Loading
Loading