Skip to content

Commit

Permalink
[CI] add base model evaluation (#2490)
Browse files Browse the repository at this point in the history
* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update
  • Loading branch information
zhulinJulia24 authored Sep 25, 2024
1 parent 4bcfc18 commit 2c71f27
Show file tree
Hide file tree
Showing 7 changed files with 406 additions and 891 deletions.
137 changes: 0 additions & 137 deletions .github/resources/opencompass-hf-results.json

This file was deleted.

102 changes: 26 additions & 76 deletions .github/scripts/action_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,31 +79,10 @@ def add_summary(csv_path: str):
_append_summary('\n')


def _load_hf_results(test_results: dict, model_name: str):
"""Read opencompass eval results."""
lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])
hf_res_path = os.path.join(
lmdeploy_dir, '.github/resources/opencompass-hf-results.json')
out = OrderedDict()
if os.path.exists(hf_res_path):
with open(hf_res_path, 'r') as f:
data = json.load(f)
if model_name in data:
res = data[model_name]
for dataset in test_results:
value = '-'
if dataset in res:
value = res[dataset]
out[dataset] = value
else:
logging.warning(
f'No opencompass results found for model {model_name}')
return out


def evaluate(models: List[str],
datasets: List[str],
workspace: str,
evaluate_type: str,
is_smoke: bool = False):
"""Evaluate models from lmdeploy using opencompass.
Expand All @@ -112,66 +91,48 @@ def evaluate(models: List[str],
workspace: Working directory.
"""
os.makedirs(workspace, exist_ok=True)
output_csv = os.path.join(workspace, 'results.csv')
output_csv = os.path.join(workspace, f'results_{evaluate_type}.csv')
if os.path.exists(output_csv):
os.remove(output_csv)
num_model = len(models)
test_model_names = set()
for idx, ori_model in enumerate(models):
print()
print(50 * '==')
print(f'Start evaluating {idx+1}/{num_model} {ori_model} ...')
model = ori_model.lower()
model_, precision = model.rsplit('_', 1)
do_lite = precision in ['4bits', 'kvint4', 'kvint8']
if do_lite:
model = model_
engine_type, model_ = model.split('_', 1)
if engine_type not in ['tb', 'pt', 'hf']:
engine_type = 'tb'
else:
model = model_

opencompass_dir = os.path.abspath(os.environ['OPENCOMPASS_DIR'])
lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])
config_path = os.path.join(
lmdeploy_dir, '.github/scripts/eval_opencompass_config.py')
lmdeploy_dir, f'.github/scripts/eval_{evaluate_type}_config.py')
config_path_new = os.path.join(opencompass_dir, 'configs',
'eval_lmdeploy.py')
if os.path.exists(config_path_new):
os.remove(config_path_new)
shutil.copy(config_path, config_path_new)
target_model = f'{engine_type}_{model}'
if do_lite:
target_model = target_model + f'_{precision}'

cfg = Config.fromfile(config_path_new)
if not hasattr(cfg, target_model):
logging.error(
f'Model {target_model} not found in configuration file')
if not hasattr(cfg, model):
logging.error(f'Model {model} not found in configuration file')
continue
if engine_type != 'hf':
model_cfg = cfg[target_model]
hf_model_path = model_cfg['path']
if not os.path.exists(hf_model_path):
logging.error(f'Model path not exists: {hf_model_path}')
continue
logging.info(
f'Start evaluating {target_model} ...\\nn{model_cfg}\n\n')
else:
hf_model_path = target_model

model_cfg = cfg[model]
logging.info(f'Start evaluating {model} ...\\nn{model_cfg}\n\n')

with open(config_path_new, 'a') as f:
f.write(f'\ndatasets = {datasets}\n')
if is_smoke:
f.write('\nfor d in datasets:\n')
f.write(" if d['reader_cfg'] is not None:\n")
f.write(" d['reader_cfg']['test_range'] = '[0:50]'\n")
if engine_type == 'hf':
f.write(f'\nmodels = [ *{target_model} ]\n')
if model.startswith('hf'):
f.write(f'\nmodels = [ *{model} ]\n')
else:
f.write(f'\nmodels = [ {target_model} ]\n')
f.write(f'\nmodels = [ {model} ]\n')

work_dir = os.path.join(workspace, target_model)
work_dir = os.path.join(workspace, model)
cmd_eval = [
f'python3 {opencompass_dir}/run.py {config_path_new} -w {work_dir} --reuse --max-num-workers 8' # noqa: E501
f'python3 {opencompass_dir}/run.py {config_path_new} -w {work_dir} --reuse --max-num-workers 8 --dump-eval-details' # noqa: E501
]
eval_log = os.path.join(workspace, f'eval.{ori_model}.txt')
start_time = time.time()
Expand Down Expand Up @@ -211,34 +172,20 @@ def evaluate(models: List[str],
acc = json.load(f)['accuracy']
acc = f'{float(acc):.2f}'
model_results['crows_pairs'] = acc
logging.info(f'\n{hf_model_path}\n{model_results}')
logging.info(f'\n{model}\n{model_results}')
dataset_names = list(model_results.keys())
prec = precision if do_lite else '-'

row = ','.join([model, engine_type, prec] +
[str(task_duration_seconds)] +
row = ','.join([model, str(task_duration_seconds)] +
[model_results[_] for _ in dataset_names])
hf_res_row = None
if hf_model_path not in test_model_names:
test_model_names.add(hf_model_path)
hf_res = _load_hf_results(model_results, hf_model_path)
if hf_res:
hf_metrics = [
hf_res[d] if d in hf_res else '-' for d in dataset_names
]
hf_res_row = ','.join([model, 'hf', '-', '-'] + hf_metrics)

if not os.path.exists(output_csv):
with open(output_csv, 'w') as f:
header = ','.join(['Model', 'Engine', 'Precision'] +
['task_duration_secs'] + dataset_names)
header = ','.join(['Model', 'task_duration_secs'] +
dataset_names)
f.write(header + '\n')
if hf_res_row:
f.write(hf_res_row + '\n')
f.write(row + '\n')
else:
with open(output_csv, 'a') as f:
if hf_res_row:
f.write(hf_res_row + '\n')
f.write(row + '\n')

# write to github action summary
Expand Down Expand Up @@ -267,15 +214,18 @@ def generate_benchmark_report(report_path: str):
subfolders = [f.path for f in os.scandir(report_path) if f.is_dir()]
for dir_path in subfolders:
second_subfolders = [
f.path for f in os.scandir(dir_path) if f.is_dir()
f.path for f in sorted(os.scandir(dir_path), key=lambda x: x.name)
if f.is_dir()
]
for sec_dir_path in second_subfolders:
model = sec_dir_path.replace(report_path + '/', '')
print('-' * 25, model, '-' * 25)
_append_summary('-' * 25 + model + '-' * 25 + '\n')

benchmark_subfolders = [
f.path for f in os.scandir(sec_dir_path) if f.is_dir()
f.path
for f in sorted(os.scandir(sec_dir_path), key=lambda x: x.name)
if f.is_dir()
]
for backend_subfolder in benchmark_subfolders:
benchmark_type = backend_subfolder.replace(
Expand Down
68 changes: 68 additions & 0 deletions .github/scripts/eval_base_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Evaluation config for *base* (pretrained, non-chat) models, consumed by
# OpenCompass (selected via the `eval_{evaluate_type}_config.py` naming
# convention in .github/scripts/action_tools.py).
from copy import deepcopy

from mmengine.config import read_base

# Inside `read_base()` plain imports pull other config files into this one;
# every imported name becomes part of the resulting config, hence the
# `noqa: F401` (imported-but-unused) markers throughout.
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \
        bbh_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.ceval.ceval_ppl import \
        ceval_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
        cmmlu_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.crowspairs.crowspairs_ppl import \
        crowspairs_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.drop.drop_gen_a2697c import \
        drop_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.gpqa.gpqa_ppl_6bf57a import \
        gpqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.hellaswag.hellaswag_ppl import \
        hellaswag_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \
        mmlu_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.race.race_ppl_a138cd import \
        race_datasets  # noqa: F401, E501
    # read models (HF-backed configs plus LMDeploy-backed ones)
    from opencompass.configs.models.baichuan.hf_baichuan_7b import \
        models as hf_baichuan_7b  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_7b import \
        models as hf_gemma_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
        models as hf_internlm2_5_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
        models as hf_internlm2_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
        models as hf_internlm2_20b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm_7b import \
        models as hf_internlm_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm_20b import \
        models as hf_internlm_20b  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
        models as lmdeploy_internlm2_5_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama2_7b import \
        models as hf_llama2_7b  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_8b import \
        models as hf_llama3_8b  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_7b_v0_1 import \
        models as hf_mistral_7b_v0_1  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mixtral_8x7b_v0_1 import \
        models as hf_mixtral_8x7b_v0_1  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen1_5_7b import \
        models as hf_qwen1_5_7b  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_7b import \
        models as hf_qwen2_7b  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen_7b import \
        models as hf_qwen_7b  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen1_5_7b import \
        models as lmdeploy_qwen1_5_7b  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \
        models as lmdeploy_qwen2_7b  # noqa: F401, E501
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import \
        summarizer  # noqa: F401, E501

# Derive TurboMind-engine variants from the LMDeploy model configs.
# NOTE(review): `deepcopy(*cfg_list)` unpacks the list, so this assumes each
# `lmdeploy_*` list holds exactly one model config (and the result is a single
# config dict, not a list) — confirm against the opencompass model configs.
turbomind_qwen1_5_7b = deepcopy(*lmdeploy_qwen1_5_7b)
turbomind_qwen2_7b = deepcopy(*lmdeploy_qwen2_7b)
turbomind_internlm2_5_7b = deepcopy(*lmdeploy_internlm2_5_7b)
Loading

0 comments on commit 2c71f27

Please sign in to comment.