Skip to content

Commit

Permalink
[CI] add base model evaluation (#2490)
Browse files Browse the repository at this point in the history
* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update
  • Loading branch information
zhulinJulia24 authored Sep 25, 2024
1 parent 4bcfc18 commit 2c71f27
Show file tree
Hide file tree
Showing 7 changed files with 406 additions and 891 deletions.
137 changes: 0 additions & 137 deletions .github/resources/opencompass-hf-results.json

This file was deleted.

102 changes: 26 additions & 76 deletions .github/scripts/action_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,31 +79,10 @@ def add_summary(csv_path: str):
_append_summary('\n')


def _load_hf_results(test_results: dict, model_name: str):
"""Read opencompass eval results."""
lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])
hf_res_path = os.path.join(
lmdeploy_dir, '.github/resources/opencompass-hf-results.json')
out = OrderedDict()
if os.path.exists(hf_res_path):
with open(hf_res_path, 'r') as f:
data = json.load(f)
if model_name in data:
res = data[model_name]
for dataset in test_results:
value = '-'
if dataset in res:
value = res[dataset]
out[dataset] = value
else:
logging.warning(
f'No opencompass results found for model {model_name}')
return out


def evaluate(models: List[str],
datasets: List[str],
workspace: str,
evaluate_type: str,
is_smoke: bool = False):
"""Evaluate models from lmdeploy using opencompass.
Expand All @@ -112,66 +91,48 @@ def evaluate(models: List[str],
workspace: Working directory.
"""
os.makedirs(workspace, exist_ok=True)
output_csv = os.path.join(workspace, 'results.csv')
output_csv = os.path.join(workspace, f'results_{evaluate_type}.csv')
if os.path.exists(output_csv):
os.remove(output_csv)
num_model = len(models)
test_model_names = set()
for idx, ori_model in enumerate(models):
print()
print(50 * '==')
print(f'Start evaluating {idx+1}/{num_model} {ori_model} ...')
model = ori_model.lower()
model_, precision = model.rsplit('_', 1)
do_lite = precision in ['4bits', 'kvint4', 'kvint8']
if do_lite:
model = model_
engine_type, model_ = model.split('_', 1)
if engine_type not in ['tb', 'pt', 'hf']:
engine_type = 'tb'
else:
model = model_

opencompass_dir = os.path.abspath(os.environ['OPENCOMPASS_DIR'])
lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])
config_path = os.path.join(
lmdeploy_dir, '.github/scripts/eval_opencompass_config.py')
lmdeploy_dir, f'.github/scripts/eval_{evaluate_type}_config.py')
config_path_new = os.path.join(opencompass_dir, 'configs',
'eval_lmdeploy.py')
if os.path.exists(config_path_new):
os.remove(config_path_new)
shutil.copy(config_path, config_path_new)
target_model = f'{engine_type}_{model}'
if do_lite:
target_model = target_model + f'_{precision}'

cfg = Config.fromfile(config_path_new)
if not hasattr(cfg, target_model):
logging.error(
f'Model {target_model} not found in configuration file')
if not hasattr(cfg, model):
logging.error(f'Model {model} not found in configuration file')
continue
if engine_type != 'hf':
model_cfg = cfg[target_model]
hf_model_path = model_cfg['path']
if not os.path.exists(hf_model_path):
logging.error(f'Model path not exists: {hf_model_path}')
continue
logging.info(
f'Start evaluating {target_model} ...\\nn{model_cfg}\n\n')
else:
hf_model_path = target_model

model_cfg = cfg[model]
logging.info(f'Start evaluating {model} ...\\nn{model_cfg}\n\n')

with open(config_path_new, 'a') as f:
f.write(f'\ndatasets = {datasets}\n')
if is_smoke:
f.write('\nfor d in datasets:\n')
f.write(" if d['reader_cfg'] is not None:\n")
f.write(" d['reader_cfg']['test_range'] = '[0:50]'\n")
if engine_type == 'hf':
f.write(f'\nmodels = [ *{target_model} ]\n')
if model.startswith('hf'):
f.write(f'\nmodels = [ *{model} ]\n')
else:
f.write(f'\nmodels = [ {target_model} ]\n')
f.write(f'\nmodels = [ {model} ]\n')

work_dir = os.path.join(workspace, target_model)
work_dir = os.path.join(workspace, model)
cmd_eval = [
f'python3 {opencompass_dir}/run.py {config_path_new} -w {work_dir} --reuse --max-num-workers 8' # noqa: E501
f'python3 {opencompass_dir}/run.py {config_path_new} -w {work_dir} --reuse --max-num-workers 8 --dump-eval-details' # noqa: E501
]
eval_log = os.path.join(workspace, f'eval.{ori_model}.txt')
start_time = time.time()
Expand Down Expand Up @@ -211,34 +172,20 @@ def evaluate(models: List[str],
acc = json.load(f)['accuracy']
acc = f'{float(acc):.2f}'
model_results['crows_pairs'] = acc
logging.info(f'\n{hf_model_path}\n{model_results}')
logging.info(f'\n{model}\n{model_results}')
dataset_names = list(model_results.keys())
prec = precision if do_lite else '-'

row = ','.join([model, engine_type, prec] +
[str(task_duration_seconds)] +
row = ','.join([model, str(task_duration_seconds)] +
[model_results[_] for _ in dataset_names])
hf_res_row = None
if hf_model_path not in test_model_names:
test_model_names.add(hf_model_path)
hf_res = _load_hf_results(model_results, hf_model_path)
if hf_res:
hf_metrics = [
hf_res[d] if d in hf_res else '-' for d in dataset_names
]
hf_res_row = ','.join([model, 'hf', '-', '-'] + hf_metrics)

if not os.path.exists(output_csv):
with open(output_csv, 'w') as f:
header = ','.join(['Model', 'Engine', 'Precision'] +
['task_duration_secs'] + dataset_names)
header = ','.join(['Model', 'task_duration_secs'] +
dataset_names)
f.write(header + '\n')
if hf_res_row:
f.write(hf_res_row + '\n')
f.write(row + '\n')
else:
with open(output_csv, 'a') as f:
if hf_res_row:
f.write(hf_res_row + '\n')
f.write(row + '\n')

# write to github action summary
Expand Down Expand Up @@ -267,15 +214,18 @@ def generate_benchmark_report(report_path: str):
subfolders = [f.path for f in os.scandir(report_path) if f.is_dir()]
for dir_path in subfolders:
second_subfolders = [
f.path for f in os.scandir(dir_path) if f.is_dir()
f.path for f in sorted(os.scandir(dir_path), key=lambda x: x.name)
if f.is_dir()
]
for sec_dir_path in second_subfolders:
model = sec_dir_path.replace(report_path + '/', '')
print('-' * 25, model, '-' * 25)
_append_summary('-' * 25 + model + '-' * 25 + '\n')

benchmark_subfolders = [
f.path for f in os.scandir(sec_dir_path) if f.is_dir()
f.path
for f in sorted(os.scandir(sec_dir_path), key=lambda x: x.name)
if f.is_dir()
]
for backend_subfolder in benchmark_subfolders:
benchmark_type = backend_subfolder.replace(
Expand Down
68 changes: 68 additions & 0 deletions .github/scripts/eval_base_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Evaluation config for *base* (pretrained, non-chat) models, consumed by
# OpenCompass (selected via the `eval_{evaluate_type}_config.py` naming
# convention in .github/scripts/action_tools.py).
from copy import deepcopy

from mmengine.config import read_base

# Inside `read_base()` plain imports pull other config files into this one;
# every imported name becomes part of the resulting config, hence the
# `noqa: F401` (imported-but-unused) markers throughout.
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \
        bbh_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.ceval.ceval_ppl import \
        ceval_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
        cmmlu_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.crowspairs.crowspairs_ppl import \
        crowspairs_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.drop.drop_gen_a2697c import \
        drop_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.gpqa.gpqa_ppl_6bf57a import \
        gpqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.hellaswag.hellaswag_ppl import \
        hellaswag_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \
        mmlu_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.race.race_ppl_a138cd import \
        race_datasets  # noqa: F401, E501
    # read models (HF-backed configs plus LMDeploy-backed ones)
    from opencompass.configs.models.baichuan.hf_baichuan_7b import \
        models as hf_baichuan_7b  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_7b import \
        models as hf_gemma_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
        models as hf_internlm2_5_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
        models as hf_internlm2_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
        models as hf_internlm2_20b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm_7b import \
        models as hf_internlm_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm_20b import \
        models as hf_internlm_20b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
        models as lmdeploy_internlm2_5_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama2_7b import \
        models as hf_llama2_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_8b import \
        models as hf_llama3_8b  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_7b_v0_1 import \
        models as hf_mistral_7b_v0_1  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mixtral_8x7b_v0_1 import \
        models as hf_mixtral_8x7b_v0_1  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen1_5_7b import \
        models as hf_qwen1_5_7b  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_7b import \
        models as hf_qwen2_7b  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen_7b import \
        models as hf_qwen_7b  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen1_5_7b import \
        models as lmdeploy_qwen1_5_7b  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \
        models as lmdeploy_qwen2_7b  # noqa: F401, E501
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import \
        summarizer  # noqa: F401, E501

# Derive TurboMind-engine variants from the LMDeploy model configs.
# NOTE(review): `deepcopy(*cfg_list)` unpacks the list, so this assumes each
# `lmdeploy_*` list holds exactly one model config (and the result is a single
# config dict, not a list) — confirm against the opencompass model configs.
turbomind_qwen1_5_7b = deepcopy(*lmdeploy_qwen1_5_7b)
turbomind_qwen2_7b = deepcopy(*lmdeploy_qwen2_7b)
turbomind_internlm2_5_7b = deepcopy(*lmdeploy_internlm2_5_7b)
Loading

0 comments on commit 2c71f27

Please sign in to comment.