InternLM · lvhan028 · Sep 18, 2024 · Sep 2, 2024 · Sep 2, 2024 · Sep 2, 2024
diff --git a/.github/scripts/action_tools.py b/.github/scripts/action_tools.py
@@ -101,7 +101,10 @@ def _load_hf_results(test_results: dict, model_name: str):
     return out
 
 
-def evaluate(models: List[str], datasets: List[str], workspace: str):
+def evaluate(models: List[str],
+             datasets: List[str],
+             workspace: str,
+             is_smoke: bool = False):
     """Evaluate models from lmdeploy using opencompass.
 
     Args:
@@ -157,6 +160,10 @@ def evaluate(models: List[str], datasets: List[str], workspace: str):
 
         with open(config_path_new, 'a') as f:
             f.write(f'\ndatasets = {datasets}\n')
+            if is_smoke:
+                f.write('\nfor d in datasets:\n')
+                f.write("    if d['reader_cfg'] is not None:\n")
+                f.write("        d['reader_cfg']['test_range'] = '[0:50]'\n")
             if engine_type == 'hf':
                 f.write(f'\nmodels = [ *{target_model} ]\n')
             else:

diff --git a/.github/scripts/eval_opencompass_config.py b/.github/scripts/eval_opencompass_config.py
@@ -6,68 +6,72 @@
 
 with read_base():
     # choose a list of datasets
-    from .datasets.bbh.bbh_gen_5b92b0 import bbh_datasets  # noqa: F401, E501
-    from .datasets.ceval.ceval_gen_2daf24 import \
+    from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
+        bbh_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.ceval.ceval_gen_2daf24 import \
         ceval_datasets  # noqa: F401, E501
-    from .datasets.cmmlu.cmmlu_gen_c13365 import \
+    from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import \
         cmmlu_datasets  # noqa: F401, E501
-    from .datasets.crowspairs.crowspairs_gen_381af0 import \
+    from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import \
         crowspairs_datasets  # noqa: F401, E501
-    from .datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
+    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
         GaokaoBench_datasets  # noqa: F401, E501
-    from .datasets.gpqa.gpqa_gen_4baadb import \
+    from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import \
         gpqa_datasets  # noqa: F401, E501
-    from .datasets.gsm8k.gsm8k_gen_1d7fe4 import \
+    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
         gsm8k_datasets  # noqa: F401, E501
-    from .datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
+    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
         hellaswag_datasets  # noqa: F401, E501
-    from .datasets.humaneval.humaneval_gen_8e312c import \
+    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
         humaneval_datasets  # noqa: F401, E501
-    from .datasets.IFEval.IFEval_gen_3321a3 import \
+    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
         ifeval_datasets  # noqa: F401, E501
-    from .datasets.math.math_0shot_gen_393424 import \
+    from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
         math_datasets  # noqa: F401, E501
-    from .datasets.mbpp.sanitized_mbpp_gen_a0fc46 import \
+    from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_a0fc46 import \
         sanitized_mbpp_datasets  # noqa: F401, E501
-    from .datasets.mmlu.mmlu_gen_4d595a import \
+    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \
         mmlu_datasets  # noqa: F401, E501
-    from .datasets.nq.nq_open_1shot_gen_01cf41 import \
+    from opencompass.configs.datasets.nq.nq_open_1shot_gen_01cf41 import \
         nq_datasets  # noqa: F401, E501
-    from .datasets.race.race_gen_69ee4f import \
+    from opencompass.configs.datasets.race.race_gen_69ee4f import \
         race_datasets  # noqa: F401, E501
-    from .datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
+    from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
         TheoremQA_datasets  # noqa: F401, E501
-    from .datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import \
+    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import \
         triviaqa_datasets  # noqa: F401, E501
-    from .datasets.winogrande.winogrande_5shot_gen_b36770 import \
+    from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \
         winogrande_datasets  # noqa: F401, E501
     # read hf models
-    from .models.baichuan.hf_baichuan2_7b_chat import \
+    from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \
         models as hf_baichuan2_chat_7b  # noqa: F401, E501
-    from .models.gemma.hf_gemma_7b_it import \
+    from opencompass.configs.models.gemma.hf_gemma_7b_it import \
         models as hf_gemma_chat_7b  # noqa: F401, E501
-    from .models.hf_internlm.hf_internlm2_chat_7b import \
+    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
         models as hf_internlm2_chat_7b  # noqa: F401, E501
-    from .models.hf_internlm.hf_internlm2_chat_20b import \
+    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import \
         models as hf_internlm2_chat_20b  # noqa: F401, E501
-    from .models.hf_internlm.hf_internlm_chat_7b import \
+    from opencompass.configs.models.hf_internlm.hf_internlm_chat_7b import \
         models as hf_internlm_chat_7b  # noqa: F401, E501
-    from .models.hf_internlm.hf_internlm_chat_20b import \
+    from opencompass.configs.models.hf_internlm.hf_internlm_chat_20b import \
         models as hf_internlm_chat_20b  # noqa: F401, E501
-    from .models.hf_llama.hf_llama2_7b_chat import \
+    from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import \
         models as hf_llama2_chat_7b  # noqa: F401, E501
-    from .models.hf_llama.hf_llama3_8b_instruct import \
+    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
         models as hf_llama_3_8b_instruct  # noqa: F401, E501
-    from .models.mistral.hf_mistral_7b_instruct_v0_1 import \
+    from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_1 import \
         models as hf_mistral_chat_7b  # noqa: F401, E501
-    from .models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
+    from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
         models as hf_mixtral_chat_8x7b  # noqa: F401, E501
-    from .models.qwen.hf_qwen1_5_7b_chat import \
+    from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import \
         models as hf_qwen1_5_chat_7b  # noqa: F401, E501
-    from .models.qwen.hf_qwen_7b_chat import \
+    from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import \
+        models as hf_qwen2_7b_instruct  # noqa: F401, E501
+    from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
         models as hf_qwen_chat_7b  # noqa: F401, E501
     # and output the results in a chosen format
-    from .summarizers.medium import summarizer  # noqa: F401, E501
+    from opencompass.configs.summarizers.medium import \
+        summarizer  # noqa: F401, E501
 
 internlm_meta_template = dict(round=[
     dict(role='HUMAN', begin='<|User|>:', end='\n'),
@@ -117,7 +121,7 @@
              end='<|im_end|>\n',
              generate=True),
     ],
-    eos_token_id=151645,
+    eos_token_id=[151645, 151643],
 )
 
 baichuan2_meta_template = dict(round=[
@@ -202,7 +206,7 @@
 qwen_gen_config_template = dict(top_k=1,
                                 top_p=0.8,
                                 temperature=1.0,
-                                stop_words=[151645],
+                                stop_words=[151645, 151643],
                                 max_new_tokens=MAX_NEW_TOKENS)
 
 tokenizer_kwargs_template = dict(padding_side='left',
@@ -546,6 +550,19 @@
     run_cfg=dict(num_gpus=1),
 )
 
+pt_qwen2_7b_instruct = dict(type=LmdeployPytorchModel,
+                            abbr='pt_qwen2_7b_instruct',
+                            path='Qwen/Qwen2-7B-Instruct',
+                            engine_config=pt_engine_config_template_max_bs_128,
+                            gen_config=gen_config_template,
+                            max_out_len=MAX_NEW_TOKENS,
+                            max_seq_len=MAX_SESSION_LEN,
+                            batch_size=128,
+                            concurrency=128,
+                            meta_template=qwen1_5_meta_template,
+                            run_cfg=run_cfg_tp1_template,
+                            end_str='<|im_end|>')
+
 tb_qwen2_7b_instruct_4bits = deepcopy(tb_qwen2_7b_instruct)
 tb_qwen2_7b_instruct_kvint4 = deepcopy(tb_qwen2_7b_instruct)
 tb_qwen2_7b_instruct_kvint8 = deepcopy(tb_qwen2_7b_instruct)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -36,6 +36,7 @@ env:
   REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }}
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
   dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'}}
+  FAIL_CONFIG: ${{ github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}}
 
 jobs:
   linux-build:
@@ -111,29 +112,32 @@ jobs:
           # manually install flash attn
           # the package is installed from https://github.com/Dao-AILab/flash-attention/releases
           python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+          python3 -m pip install -e /root/packages/AutoAWQ_kernels
+          python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps
           python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps
           python3 -m pip install ${{env.dependency_pkgs}}
       - name: Install lmdeploy
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         run: |
-          python3 -m pip install lmdeploy-*.whl
-          python3 -m pip install triton==2.1.0
+          python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Install lmdeploy - offline
         if: ${{inputs.offline_mode}}
         run: |
-          python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl
-          python3 -m pip install triton==2.1.0
+          python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Check env
         run: |
+          pip uninstall -y nvidia-nccl-cu11
           python3 -m pip list
           lmdeploy check_env
+          mkdir ${{env.REPORT_DIR}}/allure-results/.pytest_cache -p
+          ln -s ${{env.REPORT_DIR}}/allure-results/.pytest_cache autotest
       - name: Run other benchmark
         run: |
-            pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 8 --run_id ${{ github.run_id }} -m gpu_num_1 --lf --alluredir=${{env.REPORT_DIR}}/allure-results || true
-            pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 4 --run_id ${{ github.run_id }} -m gpu_num_2 --lf --alluredir=${{env.REPORT_DIR}}/allure-results || true
-            pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 2 --run_id ${{ github.run_id }} -m gpu_num_4 --lf --alluredir=${{env.REPORT_DIR}}/allure-results
+            pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 8 --run_id ${{ github.run_id }} -m gpu_num_1 ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}}/allure-results || true
+            pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 4 --run_id ${{ github.run_id }} -m gpu_num_2 ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}}/allure-results || true
+            pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 2 --run_id ${{ github.run_id }} -m gpu_num_4 ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}}/allure-results
       - name: Clear workfile
         if: always()
         run: |