From dd4987717d10588c1f8845a9137e0ecded1b59ec Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 18 Sep 2024 19:42:59 +0800 Subject: [PATCH] [ci] regular update (#2431) * update * update * update * update * update * update * update * update * update * updaet * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * updaste * update --- .github/scripts/action_tools.py | 9 +- .github/scripts/eval_opencompass_config.py | 83 +++--- .github/workflows/benchmark.yml | 18 +- .github/workflows/daily_ete_test.yml | 242 ++++++++++++------ .github/workflows/evaluate.yml | 16 +- .github/workflows/stable.yml | 14 +- .../benchmark/test_apiserver_performance.py | 61 ++--- .../benchmark/test_generation_performance.py | 128 ++++----- .../benchmark/test_throughput_performance.py | 63 ++--- autotest/config.yaml | 31 ++- .../interface/pipeline/test_pipeline_func.py | 67 ++++- .../pipeline/test_pipeline_longtext_func.py | 7 +- autotest/utils/benchmark_utils.py | 35 ++- autotest/utils/config_utils.py | 6 +- autotest/utils/pipeline_chat.py | 16 +- autotest/utils/restful_return_check.py | 2 + autotest/utils/run_restful_chat.py | 2 +- 17 files changed, 459 insertions(+), 341 deletions(-) diff --git a/.github/scripts/action_tools.py b/.github/scripts/action_tools.py index 126147c43b..84f401af17 100644 --- a/.github/scripts/action_tools.py +++ b/.github/scripts/action_tools.py @@ -101,7 +101,10 @@ def _load_hf_results(test_results: dict, model_name: str): return out -def evaluate(models: List[str], datasets: List[str], workspace: str): +def evaluate(models: List[str], + datasets: List[str], + workspace: str, + is_smoke: bool = False): """Evaluate models from lmdeploy using opencompass. 
Args: @@ -157,6 +160,10 @@ def evaluate(models: List[str], datasets: List[str], workspace: str): with open(config_path_new, 'a') as f: f.write(f'\ndatasets = {datasets}\n') + if is_smoke: + f.write('\nfor d in datasets:\n') + f.write(" if d['reader_cfg'] is not None:\n") + f.write(" d['reader_cfg']['test_range'] = '[0:50]'\n") if engine_type == 'hf': f.write(f'\nmodels = [ *{target_model} ]\n') else: diff --git a/.github/scripts/eval_opencompass_config.py b/.github/scripts/eval_opencompass_config.py index 95baf04b7f..8dc2bb0d5d 100644 --- a/.github/scripts/eval_opencompass_config.py +++ b/.github/scripts/eval_opencompass_config.py @@ -6,68 +6,72 @@ with read_base(): # choose a list of datasets - from .datasets.bbh.bbh_gen_5b92b0 import bbh_datasets # noqa: F401, E501 - from .datasets.ceval.ceval_gen_2daf24 import \ + from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \ + bbh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.ceval.ceval_gen_2daf24 import \ ceval_datasets # noqa: F401, E501 - from .datasets.cmmlu.cmmlu_gen_c13365 import \ + from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import \ cmmlu_datasets # noqa: F401, E501 - from .datasets.crowspairs.crowspairs_gen_381af0 import \ + from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import \ crowspairs_datasets # noqa: F401, E501 - from .datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \ + from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \ GaokaoBench_datasets # noqa: F401, E501 - from .datasets.gpqa.gpqa_gen_4baadb import \ + from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import \ gpqa_datasets # noqa: F401, E501 - from .datasets.gsm8k.gsm8k_gen_1d7fe4 import \ + from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \ gsm8k_datasets # noqa: F401, E501 - from .datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ hellaswag_datasets # noqa: F401, E501 - from .datasets.humaneval.humaneval_gen_8e312c import \ + from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \ humaneval_datasets # noqa: F401, E501 - from .datasets.IFEval.IFEval_gen_3321a3 import \ + from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \ ifeval_datasets # noqa: F401, E501 - from .datasets.math.math_0shot_gen_393424 import \ + from opencompass.configs.datasets.math.math_0shot_gen_393424 import \ math_datasets # noqa: F401, E501 - from .datasets.mbpp.sanitized_mbpp_gen_a0fc46 import \ + from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_a0fc46 import \ sanitized_mbpp_datasets # noqa: F401, E501 - from .datasets.mmlu.mmlu_gen_4d595a import \ + from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \ mmlu_datasets # noqa: F401, E501 - from .datasets.nq.nq_open_1shot_gen_01cf41 import \ + from opencompass.configs.datasets.nq.nq_open_1shot_gen_01cf41 import \ nq_datasets # noqa: F401, E501 - from .datasets.race.race_gen_69ee4f import \ + from opencompass.configs.datasets.race.race_gen_69ee4f import \ race_datasets # noqa: F401, E501 - from .datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ + from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ TheoremQA_datasets # noqa: F401, E501 - from .datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import \ + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import \ triviaqa_datasets # noqa: F401, E501 - from 
.datasets.winogrande.winogrande_5shot_gen_b36770 import \ + from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \ winogrande_datasets # noqa: F401, E501 # read hf models - from .models.baichuan.hf_baichuan2_7b_chat import \ + from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \ models as hf_baichuan2_chat_7b # noqa: F401, E501 - from .models.gemma.hf_gemma_7b_it import \ + from opencompass.configs.models.gemma.hf_gemma_7b_it import \ models as hf_gemma_chat_7b # noqa: F401, E501 - from .models.hf_internlm.hf_internlm2_chat_7b import \ + from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \ models as hf_internlm2_chat_7b # noqa: F401, E501 - from .models.hf_internlm.hf_internlm2_chat_20b import \ + from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import \ models as hf_internlm2_chat_20b # noqa: F401, E501 - from .models.hf_internlm.hf_internlm_chat_7b import \ + from opencompass.configs.models.hf_internlm.hf_internlm_chat_7b import \ models as hf_internlm_chat_7b # noqa: F401, E501 - from .models.hf_internlm.hf_internlm_chat_20b import \ + from opencompass.configs.models.hf_internlm.hf_internlm_chat_20b import \ models as hf_internlm_chat_20b # noqa: F401, E501 - from .models.hf_llama.hf_llama2_7b_chat import \ + from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import \ models as hf_llama2_chat_7b # noqa: F401, E501 - from .models.hf_llama.hf_llama3_8b_instruct import \ + from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \ models as hf_llama_3_8b_instruct # noqa: F401, E501 - from .models.mistral.hf_mistral_7b_instruct_v0_1 import \ + from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_1 import \ models as hf_mistral_chat_7b # noqa: F401, E501 - from .models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \ + from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \ models as hf_mixtral_chat_8x7b # noqa: F401, E501 - from .models.qwen.hf_qwen1_5_7b_chat import \ + from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import \ models as hf_qwen1_5_chat_7b # noqa: F401, E501 - from .models.qwen.hf_qwen_7b_chat import \ + from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import \ + models as hf_qwen2_7b_instruct # noqa: F401, E501 + from opencompass.configs.models.qwen.hf_qwen_7b_chat import \ models as hf_qwen_chat_7b # noqa: F401, E501 # and output the results in a chosen format - from .summarizers.medium import summarizer # noqa: F401, E501 + from opencompass.configs.summarizers.medium import \ + summarizer # noqa: F401, E501 internlm_meta_template = dict(round=[ dict(role='HUMAN', begin='<|User|>:', end='\n'), @@ -117,7 +121,7 @@ end='<|im_end|>\n', generate=True), ], - eos_token_id=151645, + eos_token_id=[151645, 151643], ) baichuan2_meta_template = dict(round=[ @@ -202,7 +206,7 @@ qwen_gen_config_template = dict(top_k=1, top_p=0.8, temperature=1.0, - stop_words=[151645], + stop_words=[151645, 151643], max_new_tokens=MAX_NEW_TOKENS) tokenizer_kwargs_template = dict(padding_side='left', @@ -546,6 +550,19 @@ run_cfg=dict(num_gpus=1), ) +pt_qwen2_7b_instruct = dict(type=LmdeployPytorchModel, + abbr='pt_qwen2_7b_instruct', + path='Qwen/Qwen2-7B-Instruct', + engine_config=pt_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=qwen1_5_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + 
tb_qwen2_7b_instruct_4bits = deepcopy(tb_qwen2_7b_instruct) tb_qwen2_7b_instruct_kvint4 = deepcopy(tb_qwen2_7b_instruct) tb_qwen2_7b_instruct_kvint8 = deepcopy(tb_qwen2_7b_instruct) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 6e026f3307..cf8283bbda 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -36,6 +36,7 @@ env: REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'}} + FAIL_CONFIG: ${{ github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} jobs: linux-build: @@ -111,29 +112,32 @@ jobs: # manually install flash attn # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + python3 -m pip install -e /root/packages/AutoAWQ_kernels + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install ${{env.dependency_pkgs}} - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl - python3 -m pip install triton==2.1.0 + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl - python3 -m pip install triton==2.1.0 + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | + pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env + mkdir ${{env.REPORT_DIR}}/allure-results/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/allure-results/.pytest_cache autotest - name: Run other benchmark run: | - pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 8 --run_id ${{ github.run_id }} -m gpu_num_1 --lf --alluredir=${{env.REPORT_DIR}}/allure-results || true - pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 4 --run_id ${{ github.run_id }} -m gpu_num_2 --lf --alluredir=${{env.REPORT_DIR}}/allure-results || true - pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 2 --run_id ${{ github.run_id }} -m gpu_num_4 --lf --alluredir=${{env.REPORT_DIR}}/allure-results + pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 8 --run_id ${{ github.run_id }} -m gpu_num_1 ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}}/allure-results || true + pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 4 --run_id ${{ github.run_id }} -m gpu_num_2 ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}}/allure-results || true + pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 2 --run_id ${{ github.run_id }} -m gpu_num_4 ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}}/allure-results - name: Clear workfile if: always() run: | diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index bd7d6c259f..28a27cc60a 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml 
@@ -33,28 +33,13 @@ on: description: 'Dependency packages, you can also set a specific version' type: string default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq' - tools_regression: + regression_func: required: true - description: 'Whether start a tool regression' - type: boolean - default: true - restful_regression: - required: true - description: 'Whether start a restful api regression' - type: boolean - default: true - pipeline_regression: - required: true - description: 'Whether start an interface pipeline regression' - type: boolean - default: true - benchmark_regression: - required: true - description: 'Whether start a benchmark script regression' - type: boolean - default: true + description: 'regression functions' + type: string + default: "['tools','restful','pipeline','benchmark','evaluation']" schedule: - - cron: '00 18 * * 0-4' + - cron: '00 16 * * 0-4' env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -63,7 +48,8 @@ env: OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} - COV_PATH: /opt/py3/lib/python3.10/site-packages/lmdeploy + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} jobs: linux-build: @@ -103,9 +89,9 @@ jobs: test_tools: needs: linux-build - if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.tools_regression)}} + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, linux-a100] - timeout-minutes: 420 + timeout-minutes: 450 env: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /root/modelscope_hub @@ -143,132 +129,135 @@ jobs: # manually install flash attn # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - python3 -m pip install /root/packages/autoawq_kernels-0.0.6+cu118-cp310-cp310-linux_x86_64.whl + python3 -m pip install -e /root/packages/AutoAWQ_kernels python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install ${{env.dependency_pkgs}} - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt pip install /nvme/qa_test_models/offline_pkg/DeepSeek-VL --no-deps - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt pip install /nvme/qa_test_models/offline_pkg/DeepSeek-VL --no-deps - name: Check env run: | + pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env cp -r /root/lora . 
rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir --cov ${{env.COV_PATH}} || true + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - convert continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'convert')) run: | - pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat workspace continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) run: | - pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat hf turbomind continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) run: | - pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - 
pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat hf torch continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'chat')) run: | - pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline turbomind continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'pipeline')) run: | - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - pipeline torch + - name: Test lmdeploy - pipeline turbomind vl continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'pipeline')) + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind-vl') && contains(fromJSON(github.event.inputs.model), 'pipeline')) run: | - pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 
--alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - pipeline turbomind vl + - name: Test lmdeploy - restful turbomind continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind-vl') && contains(fromJSON(github.event.inputs.model), 'pipeline')) + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - restful turbomind + - name: Test lmdeploy - restful workspace continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful turbomind vl continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind-vl') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - restful workspace + - name: 
Test lmdeploy - pipeline torch continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'pipeline')) run: | - pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful torch continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - local testcase if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.model), 'local_case') run: | - pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}}|| true + pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() @@ -281,7 +270,7 @@ jobs: chmod -R 777 $workdir test_restful: - if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.restful_regression)}} + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} runs-on: [self-hosted, linux-a100] needs: test_tools strategy: @@ -321,18 +310,21 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: 
| + pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env rm -rf allure-results + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api turbomind if: matrix.backend == 'turbomind' run: | @@ -348,7 +340,7 @@ jobs: - name: Test lmdeploy - restful api timeout-minutes: 75 run: | - pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Kill api server if: always() @@ -369,7 +361,7 @@ jobs: - name: Test lmdeploy - restful api - base timeout-minutes: 40 run: | - pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Kill api server if: always() @@ -386,7 +378,7 @@ jobs: chmod -R 777 $workdir test_pipeline: - if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.pipeline_regression)}} + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'pipeline'))}} runs-on: [self-hosted, linux-a100] needs: test_tools timeout-minutes: 300 @@ -422,27 +414,30 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | + pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env rm -rf allure-results + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - interface pipeline case run: | - pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() @@ -456,7 +451,7 @@ jobs: test_benchmark: - if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.benchmark_regression)}} + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} runs-on: [self-hosted, linux-a100] needs: test_tools timeout-minutes: 300 @@ -492,22 +487,23 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl - python3 -m pip install triton==2.1.0 + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl - python3 -m pip install triton==2.1.0 + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | + pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test benchmark script run: | - pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function --lf --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() @@ -520,9 +516,97 @@ jobs: mkdir $workdir chmod -R 777 $workdir + test_evaluation: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} + runs-on: [self-hosted, linux-a100] + needs: test_tools + timeout-minutes: 300 # 5hours + container: + image: openmmlab/lmdeploy:latest-cu11 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/root/models + - /nvme/qa_test_models/offline_pkg:/nvme/qa_test_models/offline_pkg + - /mnt/shared:/mnt/shared + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Setup systems + run: | + export TIME_STAMP="$(date +'%Y%m%d-%H%M%S')" + echo "TIME_STAMP=$TIME_STAMP" >> $GITHUB_ENV + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: cp -r 
/nvme/qa_test_models/offline_pkg/lmdeploy/. . + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install ${{env.dependency_pkgs}} + - name: Install lmdeploy + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install lmdeploy - offline + if: ${{inputs.offline_mode}} + run: | + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install opencompass + run: | + git clone --depth=1 https://github.com/open-compass/opencompass.git + cd opencompass + python3 -m pip install -e . + echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Setup paths for evaluation + run: | + ln -s /root/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /root/models . + - name: Evaluate models + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate \ + --models "[tb_internlm2_5_7b_chat, tb_internlm2_5_7b_chat_4bits, tb_internlm2_5_7b_chat_kvint4, pt_internlm2_5_7b_chat, tb_internlm2_5_20b_chat, tb_internlm2_5_20b_chat_4bits, tb_internlm2_5_20b_chat_kvint4, pt_internlm2_5_20b_chat, tb_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, tb_llama_3d1_8b_instruct_4bits, tb_llama_3d1_8b_instruct_kvint4, tb_qwen2_7b_instruct, tb_qwen2_7b_instruct_4bits, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]" \ + --datasets "[*race_datasets, *gsm8k_datasets]" \ + --workspace /root/evaluation-reports/${{ github.run_id }} --is_smoke true + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + get_benchmark_result: - if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.benchmark_regression)}} + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} needs: [test_benchmark] timeout-minutes: 5 runs-on: [self-hosted, linux-a100] @@ -571,12 +655,12 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Get coverage report run: | @@ -597,7 +681,7 @@ jobs: notify_to_feishu: if: always() && !cancelled() && (github.ref_name == 'develop' || github.ref_name == 'main') - needs: [get_benchmark_result, get_coverage_report] + needs: [get_benchmark_result, get_coverage_report, test_evaluation] timeout-minutes: 5 runs-on: [self-hosted, linux-a100] steps: diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 5b17fccb22..bbecedd9c5 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -17,7 +17,7 @@ on: required: true description: 'Tested TurboMind models list. eg. [internlm_chat_7b,internlm_chat_7b_w8a16]' type: string - default: '[tb_internlm2_chat_7b, tb_internlm2_chat_7b_4bits, tb_internlm2_chat_7b_kvint4, tb_internlm2_chat_7b_kvint8, pt_internlm2_chat_7b, tb_internlm2_5_7b_chat, tb_internlm2_5_7b_chat_4bits, tb_internlm2_5_7b_chat_kvint4, tb_internlm2_5_7b_chat_kvint8, pt_internlm2_5_7b_chat, tb_internlm2_5_20b_chat, tb_internlm2_5_20b_chat_4bits, tb_internlm2_5_20b_chat_kvint4, tb_internlm2_5_20b_chat_kvint8, pt_internlm2_5_20b_chat, tb_qwen1_5_7b_chat, tb_qwen1_5_7b_chat_4bits, tb_qwen1_5_7b_chat_kvint4, tb_qwen1_5_7b_chat_kvint8, pt_qwen1_5_7b_chat, tb_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, tb_llama_3d1_8b_instruct_4bits, tb_llama_3d1_8b_instruct_kvint4, tb_llama_3d1_8b_instruct_kvint8, tb_qwen2_7b_instruct, tb_qwen2_7b_instruct_4bits, tb_qwen2_7b_instruct_kvint8, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]' + default: '[tb_internlm2_chat_7b, pt_internlm2_chat_7b, tb_internlm2_5_7b_chat, pt_internlm2_5_7b_chat, tb_internlm2_5_20b_chat, pt_internlm2_5_20b_chat, tb_qwen1_5_7b_chat, pt_qwen1_5_7b_chat, tb_llama_3_8b_instruct, pt_llama_3_8b_instruct, tb_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, tb_qwen2_7b_instruct, pt_qwen2_7b_instruct, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it, tb_internlm2_chat_7b_4bits, tb_internlm2_chat_7b_kvint4, tb_internlm2_chat_7b_kvint8, tb_internlm2_5_7b_chat_4bits, tb_internlm2_5_7b_chat_kvint4, tb_internlm2_5_7b_chat_kvint8, tb_internlm2_5_20b_chat_4bits, tb_internlm2_5_20b_chat_kvint4, tb_internlm2_5_20b_chat_kvint8, tb_qwen1_5_7b_chat_4bits, tb_qwen1_5_7b_chat_kvint4, tb_qwen1_5_7b_chat_kvint8, tb_llama_3_8b_instruct_4bits, tb_llama_3_8b_instruct_kvint4, tb_llama_3_8b_instruct_kvint8, tb_llama_3d1_8b_instruct_4bits, tb_llama_3d1_8b_instruct_kvint4, tb_llama_3d1_8b_instruct_kvint8, tb_qwen2_7b_instruct_4bits, tb_qwen2_7b_instruct_kvint8]' datasets: required: true description: 'Tested datasets list. eg. 
[*bbh_datasets,*ceval_datasets,*cmmlu_datasets,*GaokaoBench_datasets,*gpqa_datasets,*gsm8k_datasets,*hellaswag_datasets,*humaneval_datasets,*ifeval_datasets,*math_datasets,*sanitized_mbpp_datasets,*mmlu_datasets,*nq_datasets,*race_datasets,*TheoremQA_datasets,*triviaqa_datasets,*winogrande_datasets,*crowspairs_datasets]' @@ -85,8 +85,8 @@ jobs: runs-on: [self-hosted, linux-a100] timeout-minutes: 4320 # 72hours container: - image: nvcr.io/nvidia/tritonserver:24.03-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" + image: openmmlab/lmdeploy:latest-cu11 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages @@ -110,30 +110,28 @@ jobs: uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }}-py310 - - name: Install pytorch - run: | - python3 -m pip cache dir - python3 -m pip install torch==2.3.0 torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cu118 - name: Install lmdeploy - dependency run: | # manually install flash attn # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + python3 -m pip install -e /root/packages/AutoAWQ_kernels + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install ${{env.dependency_pkgs}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install opencompass run: | git clone --depth=1 https://github.com/open-compass/opencompass.git cd opencompass python3 -m pip install -e . 
- python3 -m pip install triton==2.1.0 echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - name: Check env run: | + pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env - name: Setup paths for evaluation diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index c946177c0e..0bfd32d4ee 100644 --- a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -113,14 +113,12 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl - python3 -m pip install triton==2.1.0 + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl - python3 -m pip install triton==2.1.0 + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -134,10 +132,10 @@ jobs: sleep 120s - name: Test lmdeploy - restful api run: | - python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv &> ${{env.REPORT_DIR}}/stable.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv &> ${{env.REPORT_DIR}}/stable-internal-1.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv &> ${{env.REPORT_DIR}}/stable-internal-2.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv &> ${{env.REPORT_DIR}}/stable-internal-3.log + python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv > ${{env.REPORT_DIR}}/stable.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv > ${{env.REPORT_DIR}}/stable-internal-1.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-2.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv > ${{env.REPORT_DIR}}/stable-internal-3.log - name: Kill api server 
if: always() run: | diff --git a/autotest/benchmark/test_apiserver_performance.py b/autotest/benchmark/test_apiserver_performance.py index 761cf0302b..b8842a4c74 100644 --- a/autotest/benchmark/test_apiserver_performance.py +++ b/autotest/benchmark/test_apiserver_performance.py @@ -1,4 +1,3 @@ -import allure import pytest from utils.benchmark_utils import restful_test from utils.config_utils import get_benchmark_model_list @@ -22,10 +21,12 @@ def getModelList(tp_num): model_list = get_benchmark_model_list(tp_num, kvint_list=[4, 8]) new_model_list = [] for model in model_list: - if 'Llama-2' in model: + if model['backend'] == 'pytorch': + model['extra'] = '--max-batch-size 256 --cache-max-entry-count 0.8' + elif 'Llama-2' in model['model']: model[ 'extra'] = '--max-batch-size 256 --cache-max-entry-count 0.95' - elif 'internlm2' in model: + elif 'internlm2' in model['model']: model['extra'] = '--max-batch-size 256 --cache-max-entry-count 0.9' else: model['extra'] = '--max-batch-size 256' @@ -40,14 +41,11 @@ def getModelList(tp_num): getModelList(tp_num=1), indirect=True) def test_restful_tp1(config, run_id, prepare_environment, worker_id): - result, restful_log, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id) - - if restful_log is not None: - allure.attach.file(restful_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id) + assert result, msg @@ -57,14 +55,11 @@ def test_restful_tp1(config, run_id, prepare_environment, worker_id): getModelList(tp_num=2), indirect=True) def test_restful_tp2(config, run_id, prepare_environment, worker_id): - result, restful_log, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id) - - if restful_log is not None: - allure.attach.file(restful_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id) + assert result, msg @@ -74,14 +69,11 @@ def test_restful_tp2(config, run_id, prepare_environment, worker_id): getModelList(tp_num=4), indirect=True) def test_restful_tp4(config, run_id, prepare_environment, worker_id): - result, restful_log, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id) - - if restful_log is not None: - allure.attach.file(restful_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id) + assert result, msg @@ -103,13 +95,10 @@ def test_restful_tp4(config, run_id, prepare_environment, worker_id): }], indirect=True) def test_restful_func_tp2(config, run_id, prepare_environment, worker_id): - result, restful_log, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - is_smoke=True) - - if restful_log is not None: - allure.attach.file(restful_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + is_smoke=True) + assert result, msg diff --git a/autotest/benchmark/test_generation_performance.py b/autotest/benchmark/test_generation_performance.py index cffdc53270..7f2e84c9a5 100644 --- a/autotest/benchmark/test_generation_performance.py +++ b/autotest/benchmark/test_generation_performance.py @@ -1,4 +1,3 @@ -import allure import pytest from utils.benchmark_utils import generation_test from utils.config_utils import (get_benchmark_model_list, @@ -9,16 +8,13 @@ 
@pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1)) def test_generation_tp1(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1), - worker_id=worker_id) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=1), + worker_id=worker_id) + assert result, msg @@ -27,17 +23,14 @@ def test_generation_tp1(config, run_id, run_config, worker_id): @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1, is_longtext=True)) def test_generation_longtext_tp1(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - is_longtext=True, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1), - worker_id=worker_id) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + is_longtext=True, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=1), + worker_id=worker_id) + assert result, msg @@ -45,16 +38,13 @@ def test_generation_longtext_tp1(config, run_id, run_config, worker_id): @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2)) def test_generation_tp2(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2), - worker_id=worker_id) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=2), + worker_id=worker_id) + assert result, msg @@ -64,17 +54,14 @@ def test_generation_tp2(config, run_id, run_config, worker_id): @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2, is_longtext=True)) def test_generation_longtext_tp2(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - is_longtext=True, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2), - worker_id=worker_id) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + is_longtext=True, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=2), + worker_id=worker_id) + assert result, msg @@ -82,16 +69,13 @@ def test_generation_longtext_tp2(config, run_id, run_config, worker_id): @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4)) def test_generation_tp4(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4), - worker_id=worker_id) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=4), + 
worker_id=worker_id) + assert result, msg @@ -100,17 +84,14 @@ def test_generation_tp4(config, run_id, run_config, worker_id): @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, is_longtext=True)) def test_generation_longtext_tp4(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - is_longtext=True, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4), - worker_id=worker_id) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + is_longtext=True, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=4), + worker_id=worker_id) + assert result, msg @@ -127,15 +108,12 @@ def test_generation_longtext_tp4(config, run_id, run_config, worker_id): 'tp_num': 2 }]) def test_generation_fun_tp2(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2), - worker_id=worker_id, - is_smoke=True) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=2), + worker_id=worker_id, + is_smoke=True) + assert result, msg diff --git a/autotest/benchmark/test_throughput_performance.py b/autotest/benchmark/test_throughput_performance.py index ad44b22b43..0ec9d31863 100644 --- a/autotest/benchmark/test_throughput_performance.py +++ b/autotest/benchmark/test_throughput_performance.py @@ -1,4 +1,3 @@ -import allure import pytest from utils.benchmark_utils import throughput_test from utils.config_utils import (get_benchmark_model_list, @@ -10,16 +9,13 @@ @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1, kvint_list=[4, 8])) def test_throughput_tp1(config, run_id, run_config, worker_id): - result, throughput_log, msg = throughput_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1), - worker_id=worker_id) + result, msg = throughput_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=1), + worker_id=worker_id) - if throughput_log is not None: - allure.attach.file(throughput_log, - attachment_type=allure.attachment_type.TEXT) assert result, msg @@ -28,16 +24,13 @@ def test_throughput_tp1(config, run_id, run_config, worker_id): @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2, kvint_list=[4, 8])) def test_throughput_tp2(config, run_id, run_config, worker_id): - result, throughput_log, msg = throughput_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2), - worker_id=worker_id) + result, msg = throughput_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=2), + worker_id=worker_id) - if throughput_log is not None: - allure.attach.file(throughput_log, - attachment_type=allure.attachment_type.TEXT) assert result, msg @@ -46,16 +39,13 @@ def test_throughput_tp2(config, run_id, run_config, worker_id): @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, kvint_list=[4, 8])) def test_throughput_tp4(config, run_id, run_config, worker_id): - result, throughput_log, msg = throughput_test( - config, - run_id, - run_config, 
- cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4), - worker_id=worker_id) + result, msg = throughput_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=4), + worker_id=worker_id) - if throughput_log is not None: - allure.attach.file(throughput_log, - attachment_type=allure.attachment_type.TEXT) assert result, msg @@ -72,15 +62,12 @@ def test_throughput_tp4(config, run_id, run_config, worker_id): 'tp_num': 2 }]) def test_throughput_func_tp2(config, run_id, run_config, worker_id): - result, throughput_log, msg = throughput_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2), - worker_id=worker_id, - is_smoke=True) + result, msg = throughput_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=2), + worker_id=worker_id, + is_smoke=True) - if throughput_log is not None: - allure.attach.file(throughput_log, - attachment_type=allure.attachment_type.TEXT) assert result, msg diff --git a/autotest/config.yaml b/autotest/config.yaml index b7c928909a..152bfdeca5 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -9,7 +9,6 @@ tp_config: internlm2-chat-20b: 2 Baichuan2-13B-Chat: 2 Mixtral-8x7B-Instruct-v0.1: 2 - internlm2-20b: 2 Qwen-VL-Chat: 2 llava-v1.5-13b: 2 internlm2_5-20b-chat: 2 @@ -17,6 +16,7 @@ tp_config: internlm2_5-7b-chat-1m: 4 Qwen2-7B-Instruct-GPTQ-Int4: 2 InternVL2-40B: 2 + MiniCPM-V-2_6: 2 turbomind_chat_model: - meta-llama/Meta-Llama-3-1-8B-Instruct @@ -25,9 +25,7 @@ turbomind_chat_model: - meta-llama/Llama-2-7b-chat-hf - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat - - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b - - internlm/internlm2-chat-7b-4bits - internlm/internlm2-chat-20b-4bits - internlm/internlm-chat-20b - internlm/internlm-xcomposer2-4khd-7b @@ -61,10 +59,10 @@ turbomind_chat_model: pytorch_chat_model: - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Llama-2-7b-chat-hf - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat - - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b - internlm/internlm-chat-20b - OpenGVLab/InternVL2-2B @@ -88,6 +86,7 @@ pytorch_chat_model: - google/gemma-2-9b-it - deepseek-ai/deepseek-moe-16b-chat - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - THUDM/chatglm2-6b - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b @@ -99,14 +98,14 @@ pytorch_chat_model: turbomind_base_model: - internlm/internlm2_5-7b - internlm/internlm2_5-1_8b - - internlm/internlm2-20b + - internlm/internlm2_5-20b - codellama/CodeLlama-7b-hf pytorch_base_model: - tiiuae/falcon-7b - internlm/internlm2_5-7b - internlm/internlm2_5-1_8b - - internlm/internlm2-20b + - internlm/internlm2_5-20b vl_model: - Qwen/Qwen-VL-Chat @@ -129,6 +128,7 @@ vl_model: - THUDM/glm-4v-9b - microsoft/Phi-3-vision-128k-instruct - openbmb/MiniCPM-Llama3-V-2_5 + - openbmb/MiniCPM-V-2_6 turbomind_quatization: awq: @@ -139,7 +139,7 @@ turbomind_quatization: - internlm/internlm2_5-7b - internlm/internlm2_5-20b-chat - internlm/internlm2-chat-20b - - internlm/internlm2-20b + - internlm/internlm2_5-20b - internlm/internlm-chat-20b - internlm/internlm-xcomposer2-4khd-7b - internlm/internlm-xcomposer2d5-7b @@ -170,13 +170,9 @@ turbomind_quatization: - meta-llama/Llama-2-7b-chat-hf - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat - - internlm/internlm2-chat-1_8b - - internlm/internlm2-chat-7b - 
internlm/internlm2-chat-20b - - internlm/internlm2-chat-7b-4bits - internlm/internlm2-chat-20b-4bits - internlm/internlm-chat-20b - - internlm/internlm-xcomposer2-7b - internlm/internlm-xcomposer2-4khd-7b - internlm/internlm-xcomposer2d5-7b - OpenGVLab/InternVL-Chat-V1-5 @@ -202,10 +198,10 @@ turbomind_quatization: pytorch_quatization: awq: - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Llama-2-7b-chat-hf - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat - - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b - OpenGVLab/InternVL-Chat-V1-5 - 01-ai/Yi-6B-Chat @@ -216,9 +212,11 @@ pytorch_quatization: - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Llama-2-7b-chat-hf - internlm/internlm2-chat-20b - - internlm/internlm2-chat-7b + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat - 01-ai/Yi-6B-Chat - - internlm/internlm2-20b + - internlm/internlm2_5-20b + - internlm/internlm2_5-7b longtext_model: @@ -235,3 +233,8 @@ benchmark_model: - meta-llama/Meta-Llama-3-1-70B-Instruct - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat + - THUDM/glm-4-9b-chat + - Qwen/Qwen2-7B-Instruct + - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mixtral-8x7B-Instruct-v0.1 + - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 9ee793a895..096918b6b1 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -288,7 +288,10 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) - gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40) + gen_config = GenerationConfig(logprobs=10, + max_new_tokens=5, + top_k=40, + do_sample=True) response = pipe('Hi, pls intro yourself', gen_config=gen_config) result, msg = assert_pipeline_single_return(response, logprobs_num=10) save_pipeline_common_log(config, file_name, result, response, msg) @@ -318,7 +321,10 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) - gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40) + gen_config = GenerationConfig(logprobs=10, + max_new_tokens=5, + top_k=40, + do_sample=True) response = [] for item in pipe.stream_infer('Hi, pls intro yourself', gen_config=gen_config): @@ -424,9 +430,7 @@ def run_pipeline_testcase_stop_words(config, model, backend, file_name): backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) # test stop_words - gen_config = GenerationConfig(stop_words=[' and', '浦', ' to'], - random_seed=1, - temperature=0.01) + gen_config = GenerationConfig(stop_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], gen_config=gen_config) result = True @@ -465,9 +469,7 @@ def run_pipeline_testcase_bad_words(config, model, backend, file_name): backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) # test bad_words - gen_config = GenerationConfig(bad_words=[' and', '浦', ' to'], - temperature=0.01, - random_seed=1) + gen_config = GenerationConfig(bad_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], 
gen_config=gen_config) result = '蒲' in response[0].text @@ -587,7 +589,9 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) # test repetition_penalty - gen_config = GenerationConfig(repetition_penalty=0.01, random_seed=1) + gen_config = GenerationConfig(repetition_penalty=0.01, + random_seed=1, + do_sample=True) response = pipe('Shanghai is', gen_config=gen_config) result = get_repeat_times(response.text, @@ -688,7 +692,9 @@ def run_pipeline_testcase(config, model, backend, file_name): backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) # test repetition_penalty - gen_config = GenerationConfig(top_k=1, max_new_tokens=20) + gen_config = GenerationConfig(top_k=1, + max_new_tokens=20, + do_sample=True) response_list = [] for i in range(3): response_list.append(pipe('Shanghai is', gen_config=gen_config)) @@ -726,7 +732,8 @@ def run_pipeline_testcase(config, model, backend, file_name): for i in range(3): gen_config = GenerationConfig(random_seed=i, temperature=1.0, - top_k=40) + top_k=40, + do_sample=True) response_list.append(pipe('Shanghai is', gen_config=gen_config)) result = response_list[0].text != response_list[ 1].text and response_list[1].text != response_list[2].text @@ -758,7 +765,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) - gen_config = GenerationConfig(random_seed=1, top_k=40) + gen_config = GenerationConfig(random_seed=1, top_k=40, do_sample=True) response_list = [] for i in range(3): response_list.append(pipe('Shanghai is', gen_config=gen_config)) @@ -782,6 +789,40 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) +@pytest.mark.parametrize('backend', + [TurbomindEngineConfig, PytorchEngineConfig]) +def test_gen_config_do_sample_batch(config, model, backend, worker_id): + + def run_pipeline_testcase(config, model, backend, file_name): + + model_path = '/'.join([config.get('model_path'), model]) + backend_config = backend(tp=2) + pipe = pipeline(model_path, backend_config=backend_config) + gen_config = GenerationConfig(temperature=1.0, + top_k=40, + do_sample=True) + response = pipe(['Shanghai is'] * 3, gen_config=gen_config) + result = response[0].text != response[1].text and response[ + 1].text != response[2].text + save_pipeline_common_log(config, file_name, result, response) + del pipe + torch.cuda.empty_cache() + + file_name = f'pipeline_log_{worker_id}.txt' + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + p = Process(target=run_pipeline_testcase, + args=(config, model, backend, file_name)) + + p.start() + p.join() + assert_pipeline_common_log(config, file_name) + if 'gw' in worker_id: + del os.environ['CUDA_VISIBLE_DEVICES'] + + @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) @@ -971,7 +1012,7 @@ def test_backend_config_validate_pytorch(config, model, backend, worker_id): @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_backend_config_tp(config, model, backend, worker_id): - with 
pytest.raises(AssertionError, match='tp should be 2\\^n'): + with pytest.raises(AssertionError): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid( worker_id, tp_num=2) diff --git a/autotest/interface/pipeline/test_pipeline_longtext_func.py b/autotest/interface/pipeline/test_pipeline_longtext_func.py index 88b8a2847e..76625f5de6 100644 --- a/autotest/interface/pipeline/test_pipeline_longtext_func.py +++ b/autotest/interface/pipeline/test_pipeline_longtext_func.py @@ -33,10 +33,9 @@ def test_history_issue_tp1(config, model, worker_id): @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', [ - 'internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-4bits', - 'internlm/internlm2-20b', 'internlm/internlm2-20b-inner-4bits' -]) +@pytest.mark.parametrize( + 'model', + ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-4bits']) def test_history_issue_tp2(config, model, worker_id): log_name = ''.join(['pipeline_longtext_issue_', worker_id, '.log']) if 'gw' in worker_id: diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index 9356f40a3b..3da375ccb5 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -2,6 +2,7 @@ import subprocess from subprocess import PIPE, Popen +import allure import psutil from utils.config_utils import get_workerid from utils.run_restful_chat import health_check @@ -59,10 +60,11 @@ def generation_test(config, ]) returncode, stderr = run_testcase(cmd, benchmark_log) - + allure.attach.file(benchmark_log, + attachment_type=allure.attachment_type.TEXT) if returncode == 0 and not os.path.isfile(csv_path): - return False, benchmark_log, 'result is empty' - return returncode == 0, benchmark_log, stderr + return False, 'result is empty' + return returncode == 0, stderr def throughput_test(config, @@ -118,12 +120,15 @@ def throughput_test(config, ]) returncode, stderr = run_testcase(cmd, benchmark_log) + allure.attach.file(benchmark_log, + attachment_type=allure.attachment_type.TEXT) if returncode == 0 and not os.path.isfile(csv_path): - return False, benchmark_log, 'result is empty' + return False, 'result is empty' if returncode != 0: - return returncode == 0, benchmark_log, stderr - return returncode == 0, benchmark_log, stderr + return returncode == 0, stderr + + return returncode == 0, stderr def restful_test(config, @@ -161,7 +166,7 @@ def restful_test(config, http_url = f'http://localhost:{port}' if not health_check(http_url): - return False, None, 'server not start' + return False, 'server not start' command = f'python3 benchmark/profile_restful_api.py localhost:{port} {model_path} {dataset_path} --stream-output True ' # noqa: F401, E501 if is_smoke: @@ -186,13 +191,19 @@ def restful_test(config, text=True, encoding='utf-8') f.writelines(benchmark_res.stderr) + allure.attach.file(benchmark_log, + attachment_type=allure.attachment_type.TEXT) if benchmark_res.returncode == 0 and not os.path.isfile(csv_path): - return False, benchmark_log, 'result is empty' - return benchmark_res.returncode == 0, benchmark_log, benchmark_res.stderr + return False, 'result is empty' + return benchmark_res.returncode == 0, benchmark_res.stderr def run_testcase(cmd, benchmark_log): - with open(benchmark_log, 'w') as f: + if os.path.isfile(benchmark_log): + write_type = 'a' + else: + write_type = 'w' + with open(benchmark_log, write_type) as f: f.writelines('reproduce command: ' + cmd + '\n') print('reproduce command: ' + cmd) with Popen([cmd], @@ -236,8 +247,8 @@ def 
create_multi_level_directory(path): def get_max_cache_entry(model, backend): - if backend != 'turbomind': - return '' + if backend == 'pytorch': + return '--cache-max-entry-count 0.8' if 'Llama-2' in model: return '--cache-max-entry-count 0.95' elif 'internlm2' in model: diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 1cc556748e..c8ff08ad91 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -188,12 +188,14 @@ def get_benchmark_model_list(tp_num, 'backend': 'turbomind', 'quant_policy': 0, 'tp_num': tp_num - } for item in model_list] + } for item in model_list if item.replace('-inner-4bits', '') in + config.get('turbomind_chat_model') or tp_num == 4] result += [{ 'model': item, 'backend': 'pytorch', 'tp_num': tp_num - } for item in model_list if '4bits' not in item] + } for item in model_list if '4bits' not in item and ( + item in config.get('pytorch_chat_model') or tp_num == 4)] for kvint in kvint_list: result += [{ 'model': item, diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index cef5d76a2b..e94b331881 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -9,8 +9,7 @@ from utils.rule_condition_assert import assert_result from lmdeploy import pipeline -from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, - TurbomindEngineConfig) +from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig from lmdeploy.vl import load_image from lmdeploy.vl.constants import IMAGE_TOKEN @@ -53,9 +52,6 @@ def run_pipeline_chat_test(config, pipe = pipeline(hf_path, backend_config=backend_config) - # run testcases - gen_config = GenerationConfig(top_k=1) - config_log = os.path.join( log_path, '_'.join([ 'pipeline', 'config', type, worker_id, @@ -63,10 +59,12 @@ def run_pipeline_chat_test(config, ])) file = open(config_log, 'w') log_string = '\n'.join([ - 'reproduce config info:', 'engine_config = ' + str(backend_config), - 'gen_config = ' + str(gen_config), + 'reproduce config info:', + 'from lmdeploy.messages import PytorchEngineConfig', + 'from lmdeploy.messages import TurbomindEngineConfig', + 'engine_config = ' + str(backend_config), 'pipe = pipeline("' + hf_path + '", backend_config=engine_config)', - 'res = pipe("Hi, pls introduce shanghai", gen_config=gen_config)' + 'res = pipe("Hi, pls introduce shanghai")' ]) file.writelines(log_string) print(log_string) @@ -91,7 +89,7 @@ def run_pipeline_chat_test(config, prompts.append({'role': 'user', 'content': prompt}) file.writelines('prompt:' + prompt + '\n') - response = pipe([prompts], gen_config=gen_config)[0].text + response = pipe([prompts])[0].text case_result, reason = assert_result(response, prompt_detail.values(), diff --git a/autotest/utils/restful_return_check.py b/autotest/utils/restful_return_check.py index 386a077b0c..9de308bf6e 100644 --- a/autotest/utils/restful_return_check.py +++ b/autotest/utils/restful_return_check.py @@ -69,6 +69,7 @@ def assert_chat_completions_stream_return(output, is_last: bool = False, check_logprobs: bool = False, logprobs_num: int = 5): + print(output) assert output.get('id') is not None assert output.get('object') == 'chat.completion.chunk' assert output.get('model') == model_name @@ -97,6 +98,7 @@ def assert_completions_stream_return(output, is_last: bool = False, check_logprobs: bool = False, logprobs_num: int = 5): + print(output) assert output.get('id') is not None assert output.get('object') == 'text_completion' assert output.get('model') == model_name 
diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 2af65ad41b..6e60c53833 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -81,7 +81,6 @@ def start_restful_api(config, param, model, model_path, backend_type, text=True, encoding='utf-8') pid = startRes.pid - allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) http_url = BASE_HTTP_URL + ':' + str(port) with open(start_log, 'r') as file: @@ -96,6 +95,7 @@ def start_restful_api(config, param, model, model_path, backend_type, result = health_check(http_url) if result or total_time >= 300: break + allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) return pid, startRes
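
Note (illustrative sketch, not part of the patch): after this change the benchmark helpers (throughput_test, generation_test, restful_test) attach their own log file through allure and return only a (success, message) pair, so the test functions reduce to a bare assert. A sketch of a test written against that contract; the test name and the tp_num=1 parametrization are assumptions for illustration only.

    import pytest
    from utils.benchmark_utils import throughput_test
    from utils.config_utils import (get_benchmark_model_list,
                                    get_cuda_prefix_by_workerid)


    @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1))
    def test_throughput_sketch(config, run_id, run_config, worker_id):
        # The helper attaches its benchmark log via allure internally and
        # returns (success, message); no allure handling is needed here.
        result, msg = throughput_test(
            config,
            run_id,
            run_config,
            cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1),
            worker_id=worker_id)
        assert result, msg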