From dd4987717d10588c1f8845a9137e0ecded1b59ec Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 18 Sep 2024 19:42:59 +0800 Subject: [PATCH] [ci] regular update (#2431) * update * update * update * update * update * update * update * update * update * updaet * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * updaste * update --- .github/scripts/action_tools.py | 9 +- .github/scripts/eval_opencompass_config.py | 83 +++--- .github/workflows/benchmark.yml | 18 +- .github/workflows/daily_ete_test.yml | 242 ++++++++++++------ .github/workflows/evaluate.yml | 16 +- .github/workflows/stable.yml | 14 +- .../benchmark/test_apiserver_performance.py | 61 ++--- .../benchmark/test_generation_performance.py | 128 ++++----- .../benchmark/test_throughput_performance.py | 63 ++--- autotest/config.yaml | 31 ++- .../interface/pipeline/test_pipeline_func.py | 67 ++++- .../pipeline/test_pipeline_longtext_func.py | 7 +- autotest/utils/benchmark_utils.py | 35 ++- autotest/utils/config_utils.py | 6 +- autotest/utils/pipeline_chat.py | 16 +- autotest/utils/restful_return_check.py | 2 + autotest/utils/run_restful_chat.py | 2 +- 17 files changed, 459 insertions(+), 341 deletions(-) diff --git a/.github/scripts/action_tools.py b/.github/scripts/action_tools.py index 126147c43b..84f401af17 100644 --- a/.github/scripts/action_tools.py +++ b/.github/scripts/action_tools.py @@ -101,7 +101,10 @@ def _load_hf_results(test_results: dict, model_name: str): return out -def evaluate(models: List[str], datasets: List[str], workspace: str): +def evaluate(models: List[str], + datasets: List[str], + workspace: str, + is_smoke: bool = False): """Evaluate models from lmdeploy using opencompass. 
Args: @@ -157,6 +160,10 @@ def evaluate(models: List[str], datasets: List[str], workspace: str): with open(config_path_new, 'a') as f: f.write(f'\ndatasets = {datasets}\n') + if is_smoke: + f.write('\nfor d in datasets:\n') + f.write(" if d['reader_cfg'] is not None:\n") + f.write(" d['reader_cfg']['test_range'] = '[0:50]'\n") if engine_type == 'hf': f.write(f'\nmodels = [ *{target_model} ]\n') else: diff --git a/.github/scripts/eval_opencompass_config.py b/.github/scripts/eval_opencompass_config.py index 95baf04b7f..8dc2bb0d5d 100644 --- a/.github/scripts/eval_opencompass_config.py +++ b/.github/scripts/eval_opencompass_config.py @@ -6,68 +6,72 @@ with read_base(): # choose a list of datasets - from .datasets.bbh.bbh_gen_5b92b0 import bbh_datasets # noqa: F401, E501 - from .datasets.ceval.ceval_gen_2daf24 import \ + from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \ + bbh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.ceval.ceval_gen_2daf24 import \ ceval_datasets # noqa: F401, E501 - from .datasets.cmmlu.cmmlu_gen_c13365 import \ + from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import \ cmmlu_datasets # noqa: F401, E501 - from .datasets.crowspairs.crowspairs_gen_381af0 import \ + from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import \ crowspairs_datasets # noqa: F401, E501 - from .datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \ + from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \ GaokaoBench_datasets # noqa: F401, E501 - from .datasets.gpqa.gpqa_gen_4baadb import \ + from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import \ gpqa_datasets # noqa: F401, E501 - from .datasets.gsm8k.gsm8k_gen_1d7fe4 import \ + from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \ gsm8k_datasets # noqa: F401, E501 - from .datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ hellaswag_datasets # noqa: F401, E501 - from .datasets.humaneval.humaneval_gen_8e312c import \ + from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \ humaneval_datasets # noqa: F401, E501 - from .datasets.IFEval.IFEval_gen_3321a3 import \ + from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \ ifeval_datasets # noqa: F401, E501 - from .datasets.math.math_0shot_gen_393424 import \ + from opencompass.configs.datasets.math.math_0shot_gen_393424 import \ math_datasets # noqa: F401, E501 - from .datasets.mbpp.sanitized_mbpp_gen_a0fc46 import \ + from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_a0fc46 import \ sanitized_mbpp_datasets # noqa: F401, E501 - from .datasets.mmlu.mmlu_gen_4d595a import \ + from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \ mmlu_datasets # noqa: F401, E501 - from .datasets.nq.nq_open_1shot_gen_01cf41 import \ + from opencompass.configs.datasets.nq.nq_open_1shot_gen_01cf41 import \ nq_datasets # noqa: F401, E501 - from .datasets.race.race_gen_69ee4f import \ + from opencompass.configs.datasets.race.race_gen_69ee4f import \ race_datasets # noqa: F401, E501 - from .datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ + from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ TheoremQA_datasets # noqa: F401, E501 - from .datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import \ + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import \ triviaqa_datasets # noqa: F401, E501 - from 
.datasets.winogrande.winogrande_5shot_gen_b36770 import \ + from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \ winogrande_datasets # noqa: F401, E501 # read hf models - from .models.baichuan.hf_baichuan2_7b_chat import \ + from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \ models as hf_baichuan2_chat_7b # noqa: F401, E501 - from .models.gemma.hf_gemma_7b_it import \ + from opencompass.configs.models.gemma.hf_gemma_7b_it import \ models as hf_gemma_chat_7b # noqa: F401, E501 - from .models.hf_internlm.hf_internlm2_chat_7b import \ + from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \ models as hf_internlm2_chat_7b # noqa: F401, E501 - from .models.hf_internlm.hf_internlm2_chat_20b import \ + from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import \ models as hf_internlm2_chat_20b # noqa: F401, E501 - from .models.hf_internlm.hf_internlm_chat_7b import \ + from opencompass.configs.models.hf_internlm.hf_internlm_chat_7b import \ models as hf_internlm_chat_7b # noqa: F401, E501 - from .models.hf_internlm.hf_internlm_chat_20b import \ + from opencompass.configs.models.hf_internlm.hf_internlm_chat_20b import \ models as hf_internlm_chat_20b # noqa: F401, E501 - from .models.hf_llama.hf_llama2_7b_chat import \ + from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import \ models as hf_llama2_chat_7b # noqa: F401, E501 - from .models.hf_llama.hf_llama3_8b_instruct import \ + from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \ models as hf_llama_3_8b_instruct # noqa: F401, E501 - from .models.mistral.hf_mistral_7b_instruct_v0_1 import \ + from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_1 import \ models as hf_mistral_chat_7b # noqa: F401, E501 - from .models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \ + from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \ models as hf_mixtral_chat_8x7b # noqa: F401, E501 - from .models.qwen.hf_qwen1_5_7b_chat import \ + from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import \ models as hf_qwen1_5_chat_7b # noqa: F401, E501 - from .models.qwen.hf_qwen_7b_chat import \ + from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import \ + models as hf_qwen2_7b_instruct # noqa: F401, E501 + from opencompass.configs.models.qwen.hf_qwen_7b_chat import \ models as hf_qwen_chat_7b # noqa: F401, E501 # and output the results in a chosen format - from .summarizers.medium import summarizer # noqa: F401, E501 + from opencompass.configs.summarizers.medium import \ + summarizer # noqa: F401, E501 internlm_meta_template = dict(round=[ dict(role='HUMAN', begin='<|User|>:', end='\n'), @@ -117,7 +121,7 @@ end='<|im_end|>\n', generate=True), ], - eos_token_id=151645, + eos_token_id=[151645, 151643], ) baichuan2_meta_template = dict(round=[ @@ -202,7 +206,7 @@ qwen_gen_config_template = dict(top_k=1, top_p=0.8, temperature=1.0, - stop_words=[151645], + stop_words=[151645, 151643], max_new_tokens=MAX_NEW_TOKENS) tokenizer_kwargs_template = dict(padding_side='left', @@ -546,6 +550,19 @@ run_cfg=dict(num_gpus=1), ) +pt_qwen2_7b_instruct = dict(type=LmdeployPytorchModel, + abbr='pt_qwen2_7b_instruct', + path='Qwen/Qwen2-7B-Instruct', + engine_config=pt_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=qwen1_5_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + 
tb_qwen2_7b_instruct_4bits = deepcopy(tb_qwen2_7b_instruct) tb_qwen2_7b_instruct_kvint4 = deepcopy(tb_qwen2_7b_instruct) tb_qwen2_7b_instruct_kvint8 = deepcopy(tb_qwen2_7b_instruct) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 6e026f3307..cf8283bbda 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -36,6 +36,7 @@ env: REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'}} + FAIL_CONFIG: ${{ github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} jobs: linux-build: @@ -111,29 +112,32 @@ jobs: # manually install flash attn # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + python3 -m pip install -e /root/packages/AutoAWQ_kernels + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install ${{env.dependency_pkgs}} - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl - python3 -m pip install triton==2.1.0 + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl - python3 -m pip install triton==2.1.0 + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | + pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env + mkdir ${{env.REPORT_DIR}}/allure-results/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/allure-results/.pytest_cache autotest - name: Run other benchmark run: | - pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 8 --run_id ${{ github.run_id }} -m gpu_num_1 --lf --alluredir=${{env.REPORT_DIR}}/allure-results || true - pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 4 --run_id ${{ github.run_id }} -m gpu_num_2 --lf --alluredir=${{env.REPORT_DIR}}/allure-results || true - pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 2 --run_id ${{ github.run_id }} -m gpu_num_4 --lf --alluredir=${{env.REPORT_DIR}}/allure-results + pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 8 --run_id ${{ github.run_id }} -m gpu_num_1 ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}}/allure-results || true + pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 4 --run_id ${{ github.run_id }} -m gpu_num_2 ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}}/allure-results || true + pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 2 --run_id ${{ github.run_id }} -m gpu_num_4 ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}}/allure-results - name: Clear workfile if: always() run: | diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index bd7d6c259f..28a27cc60a 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml 
@@ -33,28 +33,13 @@ on: description: 'Dependency packages, you can also set a specific version' type: string default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq' - tools_regression: + regression_func: required: true - description: 'Whether start a tool regression' - type: boolean - default: true - restful_regression: - required: true - description: 'Whether start a restful api regression' - type: boolean - default: true - pipeline_regression: - required: true - description: 'Whether start an interface pipeline regression' - type: boolean - default: true - benchmark_regression: - required: true - description: 'Whether start a benchmark script regression' - type: boolean - default: true + description: 'regression functions' + type: string + default: "['tools','restful','pipeline','benchmark','evaluation']" schedule: - - cron: '00 18 * * 0-4' + - cron: '00 16 * * 0-4' env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -63,7 +48,8 @@ env: OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} - COV_PATH: /opt/py3/lib/python3.10/site-packages/lmdeploy + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} jobs: linux-build: @@ -103,9 +89,9 @@ jobs: test_tools: needs: linux-build - if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.tools_regression)}} + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, linux-a100] - timeout-minutes: 420 + timeout-minutes: 450 env: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /root/modelscope_hub @@ -143,132 +129,135 @@ jobs: # manually install flash attn # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - python3 -m pip install /root/packages/autoawq_kernels-0.0.6+cu118-cp310-cp310-linux_x86_64.whl + python3 -m pip install -e /root/packages/AutoAWQ_kernels python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install ${{env.dependency_pkgs}} - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt pip install /nvme/qa_test_models/offline_pkg/DeepSeek-VL --no-deps - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt pip install /nvme/qa_test_models/offline_pkg/DeepSeek-VL --no-deps - name: Check env run: | + pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env cp -r /root/lora . 
rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir --cov ${{env.COV_PATH}} || true + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - convert continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'convert')) run: | - pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat workspace continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) run: | - pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat hf turbomind continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) run: | - pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - 
pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat hf torch continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'chat')) run: | - pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline turbomind continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'pipeline')) run: | - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - pipeline torch + - name: Test lmdeploy - pipeline turbomind vl continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'pipeline')) + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind-vl') && contains(fromJSON(github.event.inputs.model), 'pipeline')) run: | - pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 
--alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - pipeline turbomind vl + - name: Test lmdeploy - restful turbomind continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind-vl') && contains(fromJSON(github.event.inputs.model), 'pipeline')) + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - restful turbomind + - name: Test lmdeploy - restful workspace continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful turbomind vl continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind-vl') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - restful workspace + - name: 
Test lmdeploy - pipeline torch continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'pipeline')) run: | - pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful torch continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - local testcase if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.model), 'local_case') run: | - pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}}|| true + pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() @@ -281,7 +270,7 @@ jobs: chmod -R 777 $workdir test_restful: - if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.restful_regression)}} + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} runs-on: [self-hosted, linux-a100] needs: test_tools strategy: @@ -321,18 +310,21 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: 
| + pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env rm -rf allure-results + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api turbomind if: matrix.backend == 'turbomind' run: | @@ -348,7 +340,7 @@ jobs: - name: Test lmdeploy - restful api timeout-minutes: 75 run: | - pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Kill api server if: always() @@ -369,7 +361,7 @@ jobs: - name: Test lmdeploy - restful api - base timeout-minutes: 40 run: | - pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Kill api server if: always() @@ -386,7 +378,7 @@ jobs: chmod -R 777 $workdir test_pipeline: - if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.pipeline_regression)}} + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'pipeline'))}} runs-on: [self-hosted, linux-a100] needs: test_tools timeout-minutes: 300 @@ -422,27 +414,30 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | + pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env rm -rf allure-results + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - interface pipeline case run: | - pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() @@ -456,7 +451,7 @@ jobs: test_benchmark: - if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.benchmark_regression)}} + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} runs-on: [self-hosted, linux-a100] needs: test_tools timeout-minutes: 300 @@ -492,22 +487,23 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl - python3 -m pip install triton==2.1.0 + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl - python3 -m pip install triton==2.1.0 + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | + pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test benchmark script run: | - pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function --lf --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() @@ -520,9 +516,97 @@ jobs: mkdir $workdir chmod -R 777 $workdir + test_evaluation: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} + runs-on: [self-hosted, linux-a100] + needs: test_tools + timeout-minutes: 300 # 5hours + container: + image: openmmlab/lmdeploy:latest-cu11 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/root/models + - /nvme/qa_test_models/offline_pkg:/nvme/qa_test_models/offline_pkg + - /mnt/shared:/mnt/shared + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Setup systems + run: | + export TIME_STAMP="$(date +'%Y%m%d-%H%M%S')" + echo "TIME_STAMP=$TIME_STAMP" >> $GITHUB_ENV + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: cp -r 
/nvme/qa_test_models/offline_pkg/lmdeploy/. . + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install ${{env.dependency_pkgs}} + - name: Install lmdeploy + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install lmdeploy - offline + if: ${{inputs.offline_mode}} + run: | + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install opencompass + run: | + git clone --depth=1 https://github.com/open-compass/opencompass.git + cd opencompass + python3 -m pip install -e . + echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Setup paths for evaluation + run: | + ln -s /root/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /root/models . + - name: Evaluate models + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate \ + --models "[tb_internlm2_5_7b_chat, tb_internlm2_5_7b_chat_4bits, tb_internlm2_5_7b_chat_kvint4, pt_internlm2_5_7b_chat, tb_internlm2_5_20b_chat, tb_internlm2_5_20b_chat_4bits, tb_internlm2_5_20b_chat_kvint4, pt_internlm2_5_20b_chat, tb_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, tb_llama_3d1_8b_instruct_4bits, tb_llama_3d1_8b_instruct_kvint4, tb_qwen2_7b_instruct, tb_qwen2_7b_instruct_4bits, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]" \ + --datasets "[*race_datasets, *gsm8k_datasets]" \ + --workspace /root/evaluation-reports/${{ github.run_id }} --is_smoke true + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + get_benchmark_result: - if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.benchmark_regression)}} + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} needs: [test_benchmark] timeout-minutes: 5 runs-on: [self-hosted, linux-a100] @@ -571,12 +655,12 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Get coverage report run: | @@ -597,7 +681,7 @@ jobs: notify_to_feishu: if: always() && !cancelled() && (github.ref_name == 'develop' || github.ref_name == 'main') - needs: [get_benchmark_result, get_coverage_report] + needs: [get_benchmark_result, get_coverage_report, test_evaluation] timeout-minutes: 5 runs-on: [self-hosted, linux-a100] steps: diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 5b17fccb22..bbecedd9c5 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -17,7 +17,7 @@ on: required: true description: 'Tested TurboMind models list. eg. [internlm_chat_7b,internlm_chat_7b_w8a16]' type: string - default: '[tb_internlm2_chat_7b, tb_internlm2_chat_7b_4bits, tb_internlm2_chat_7b_kvint4, tb_internlm2_chat_7b_kvint8, pt_internlm2_chat_7b, tb_internlm2_5_7b_chat, tb_internlm2_5_7b_chat_4bits, tb_internlm2_5_7b_chat_kvint4, tb_internlm2_5_7b_chat_kvint8, pt_internlm2_5_7b_chat, tb_internlm2_5_20b_chat, tb_internlm2_5_20b_chat_4bits, tb_internlm2_5_20b_chat_kvint4, tb_internlm2_5_20b_chat_kvint8, pt_internlm2_5_20b_chat, tb_qwen1_5_7b_chat, tb_qwen1_5_7b_chat_4bits, tb_qwen1_5_7b_chat_kvint4, tb_qwen1_5_7b_chat_kvint8, pt_qwen1_5_7b_chat, tb_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, tb_llama_3d1_8b_instruct_4bits, tb_llama_3d1_8b_instruct_kvint4, tb_llama_3d1_8b_instruct_kvint8, tb_qwen2_7b_instruct, tb_qwen2_7b_instruct_4bits, tb_qwen2_7b_instruct_kvint8, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]' + default: '[tb_internlm2_chat_7b, pt_internlm2_chat_7b, tb_internlm2_5_7b_chat, pt_internlm2_5_7b_chat, tb_internlm2_5_20b_chat, pt_internlm2_5_20b_chat, tb_qwen1_5_7b_chat, pt_qwen1_5_7b_chat, tb_llama_3_8b_instruct, pt_llama_3_8b_instruct, tb_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, tb_qwen2_7b_instruct, pt_qwen2_7b_instruct, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it, tb_internlm2_chat_7b_4bits, tb_internlm2_chat_7b_kvint4, tb_internlm2_chat_7b_kvint8, tb_internlm2_5_7b_chat_4bits, tb_internlm2_5_7b_chat_kvint4, tb_internlm2_5_7b_chat_kvint8, tb_internlm2_5_20b_chat_4bits, tb_internlm2_5_20b_chat_kvint4, tb_internlm2_5_20b_chat_kvint8, tb_qwen1_5_7b_chat_4bits, tb_qwen1_5_7b_chat_kvint4, tb_qwen1_5_7b_chat_kvint8, tb_llama_3_8b_instruct_4bits, tb_llama_3_8b_instruct_kvint4, tb_llama_3_8b_instruct_kvint8, tb_llama_3d1_8b_instruct_4bits, tb_llama_3d1_8b_instruct_kvint4, tb_llama_3d1_8b_instruct_kvint8, tb_qwen2_7b_instruct_4bits, tb_qwen2_7b_instruct_kvint8]' datasets: required: true description: 'Tested datasets list. eg. 
[*bbh_datasets,*ceval_datasets,*cmmlu_datasets,*GaokaoBench_datasets,*gpqa_datasets,*gsm8k_datasets,*hellaswag_datasets,*humaneval_datasets,*ifeval_datasets,*math_datasets,*sanitized_mbpp_datasets,*mmlu_datasets,*nq_datasets,*race_datasets,*TheoremQA_datasets,*triviaqa_datasets,*winogrande_datasets,*crowspairs_datasets]' @@ -85,8 +85,8 @@ jobs: runs-on: [self-hosted, linux-a100] timeout-minutes: 4320 # 72hours container: - image: nvcr.io/nvidia/tritonserver:24.03-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" + image: openmmlab/lmdeploy:latest-cu11 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages @@ -110,30 +110,28 @@ jobs: uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }}-py310 - - name: Install pytorch - run: | - python3 -m pip cache dir - python3 -m pip install torch==2.3.0 torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cu118 - name: Install lmdeploy - dependency run: | # manually install flash attn # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + python3 -m pip install -e /root/packages/AutoAWQ_kernels + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install ${{env.dependency_pkgs}} - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install opencompass run: | git clone --depth=1 https://github.com/open-compass/opencompass.git cd opencompass python3 -m pip install -e . 
- python3 -m pip install triton==2.1.0 echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - name: Check env run: | + pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env - name: Setup paths for evaluation diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index c946177c0e..0bfd32d4ee 100644 --- a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -113,14 +113,12 @@ jobs: - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - python3 -m pip install lmdeploy-*.whl - python3 -m pip install triton==2.1.0 + python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl - python3 -m pip install triton==2.1.0 + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Check env run: | @@ -134,10 +132,10 @@ jobs: sleep 120s - name: Test lmdeploy - restful api run: | - python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv &> ${{env.REPORT_DIR}}/stable.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv &> ${{env.REPORT_DIR}}/stable-internal-1.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv &> ${{env.REPORT_DIR}}/stable-internal-2.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv &> ${{env.REPORT_DIR}}/stable-internal-3.log + python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv > ${{env.REPORT_DIR}}/stable.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv > ${{env.REPORT_DIR}}/stable-internal-1.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-2.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv > ${{env.REPORT_DIR}}/stable-internal-3.log - name: Kill api server 
if: always() run: | diff --git a/autotest/benchmark/test_apiserver_performance.py b/autotest/benchmark/test_apiserver_performance.py index 761cf0302b..b8842a4c74 100644 --- a/autotest/benchmark/test_apiserver_performance.py +++ b/autotest/benchmark/test_apiserver_performance.py @@ -1,4 +1,3 @@ -import allure import pytest from utils.benchmark_utils import restful_test from utils.config_utils import get_benchmark_model_list @@ -22,10 +21,12 @@ def getModelList(tp_num): model_list = get_benchmark_model_list(tp_num, kvint_list=[4, 8]) new_model_list = [] for model in model_list: - if 'Llama-2' in model: + if model['backend'] == 'pytorch': + model['extra'] = '--max-batch-size 256 --cache-max-entry-count 0.8' + elif 'Llama-2' in model['model']: model[ 'extra'] = '--max-batch-size 256 --cache-max-entry-count 0.95' - elif 'internlm2' in model: + elif 'internlm2' in model['model']: model['extra'] = '--max-batch-size 256 --cache-max-entry-count 0.9' else: model['extra'] = '--max-batch-size 256' @@ -40,14 +41,11 @@ def getModelList(tp_num): getModelList(tp_num=1), indirect=True) def test_restful_tp1(config, run_id, prepare_environment, worker_id): - result, restful_log, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id) - - if restful_log is not None: - allure.attach.file(restful_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id) + assert result, msg @@ -57,14 +55,11 @@ def test_restful_tp1(config, run_id, prepare_environment, worker_id): getModelList(tp_num=2), indirect=True) def test_restful_tp2(config, run_id, prepare_environment, worker_id): - result, restful_log, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id) - - if restful_log is not None: - allure.attach.file(restful_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id) + assert result, msg @@ -74,14 +69,11 @@ def test_restful_tp2(config, run_id, prepare_environment, worker_id): getModelList(tp_num=4), indirect=True) def test_restful_tp4(config, run_id, prepare_environment, worker_id): - result, restful_log, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id) - - if restful_log is not None: - allure.attach.file(restful_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id) + assert result, msg @@ -103,13 +95,10 @@ def test_restful_tp4(config, run_id, prepare_environment, worker_id): }], indirect=True) def test_restful_func_tp2(config, run_id, prepare_environment, worker_id): - result, restful_log, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - is_smoke=True) - - if restful_log is not None: - allure.attach.file(restful_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + is_smoke=True) + assert result, msg diff --git a/autotest/benchmark/test_generation_performance.py b/autotest/benchmark/test_generation_performance.py index cffdc53270..7f2e84c9a5 100644 --- a/autotest/benchmark/test_generation_performance.py +++ b/autotest/benchmark/test_generation_performance.py @@ -1,4 +1,3 @@ -import allure import pytest from utils.benchmark_utils import generation_test from utils.config_utils import (get_benchmark_model_list, @@ -9,16 +8,13 @@ 
@pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1)) def test_generation_tp1(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1), - worker_id=worker_id) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=1), + worker_id=worker_id) + assert result, msg @@ -27,17 +23,14 @@ def test_generation_tp1(config, run_id, run_config, worker_id): @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1, is_longtext=True)) def test_generation_longtext_tp1(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - is_longtext=True, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1), - worker_id=worker_id) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + is_longtext=True, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=1), + worker_id=worker_id) + assert result, msg @@ -45,16 +38,13 @@ def test_generation_longtext_tp1(config, run_id, run_config, worker_id): @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2)) def test_generation_tp2(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2), - worker_id=worker_id) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=2), + worker_id=worker_id) + assert result, msg @@ -64,17 +54,14 @@ def test_generation_tp2(config, run_id, run_config, worker_id): @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2, is_longtext=True)) def test_generation_longtext_tp2(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - is_longtext=True, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2), - worker_id=worker_id) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + is_longtext=True, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=2), + worker_id=worker_id) + assert result, msg @@ -82,16 +69,13 @@ def test_generation_longtext_tp2(config, run_id, run_config, worker_id): @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4)) def test_generation_tp4(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4), - worker_id=worker_id) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=4), + 
worker_id=worker_id) + assert result, msg @@ -100,17 +84,14 @@ def test_generation_tp4(config, run_id, run_config, worker_id): @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, is_longtext=True)) def test_generation_longtext_tp4(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - is_longtext=True, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4), - worker_id=worker_id) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + is_longtext=True, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=4), + worker_id=worker_id) + assert result, msg @@ -127,15 +108,12 @@ def test_generation_longtext_tp4(config, run_id, run_config, worker_id): 'tp_num': 2 }]) def test_generation_fun_tp2(config, run_id, run_config, worker_id): - result, generation_log, msg = generation_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2), - worker_id=worker_id, - is_smoke=True) - - if generation_log is not None: - allure.attach.file(generation_log, - attachment_type=allure.attachment_type.TEXT) + result, msg = generation_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=2), + worker_id=worker_id, + is_smoke=True) + assert result, msg diff --git a/autotest/benchmark/test_throughput_performance.py b/autotest/benchmark/test_throughput_performance.py index ad44b22b43..0ec9d31863 100644 --- a/autotest/benchmark/test_throughput_performance.py +++ b/autotest/benchmark/test_throughput_performance.py @@ -1,4 +1,3 @@ -import allure import pytest from utils.benchmark_utils import throughput_test from utils.config_utils import (get_benchmark_model_list, @@ -10,16 +9,13 @@ @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1, kvint_list=[4, 8])) def test_throughput_tp1(config, run_id, run_config, worker_id): - result, throughput_log, msg = throughput_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1), - worker_id=worker_id) + result, msg = throughput_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=1), + worker_id=worker_id) - if throughput_log is not None: - allure.attach.file(throughput_log, - attachment_type=allure.attachment_type.TEXT) assert result, msg @@ -28,16 +24,13 @@ def test_throughput_tp1(config, run_id, run_config, worker_id): @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2, kvint_list=[4, 8])) def test_throughput_tp2(config, run_id, run_config, worker_id): - result, throughput_log, msg = throughput_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2), - worker_id=worker_id) + result, msg = throughput_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=2), + worker_id=worker_id) - if throughput_log is not None: - allure.attach.file(throughput_log, - attachment_type=allure.attachment_type.TEXT) assert result, msg @@ -46,16 +39,13 @@ def test_throughput_tp2(config, run_id, run_config, worker_id): @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, kvint_list=[4, 8])) def test_throughput_tp4(config, run_id, run_config, worker_id): - result, throughput_log, msg = throughput_test( - config, - run_id, - run_config, 
- cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4), - worker_id=worker_id) + result, msg = throughput_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=4), + worker_id=worker_id) - if throughput_log is not None: - allure.attach.file(throughput_log, - attachment_type=allure.attachment_type.TEXT) assert result, msg @@ -72,15 +62,12 @@ def test_throughput_tp4(config, run_id, run_config, worker_id): 'tp_num': 2 }]) def test_throughput_func_tp2(config, run_id, run_config, worker_id): - result, throughput_log, msg = throughput_test( - config, - run_id, - run_config, - cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2), - worker_id=worker_id, - is_smoke=True) + result, msg = throughput_test(config, + run_id, + run_config, + cuda_prefix=get_cuda_prefix_by_workerid( + worker_id, tp_num=2), + worker_id=worker_id, + is_smoke=True) - if throughput_log is not None: - allure.attach.file(throughput_log, - attachment_type=allure.attachment_type.TEXT) assert result, msg diff --git a/autotest/config.yaml b/autotest/config.yaml index b7c928909a..152bfdeca5 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -9,7 +9,6 @@ tp_config: internlm2-chat-20b: 2 Baichuan2-13B-Chat: 2 Mixtral-8x7B-Instruct-v0.1: 2 - internlm2-20b: 2 Qwen-VL-Chat: 2 llava-v1.5-13b: 2 internlm2_5-20b-chat: 2 @@ -17,6 +16,7 @@ tp_config: internlm2_5-7b-chat-1m: 4 Qwen2-7B-Instruct-GPTQ-Int4: 2 InternVL2-40B: 2 + MiniCPM-V-2_6: 2 turbomind_chat_model: - meta-llama/Meta-Llama-3-1-8B-Instruct @@ -25,9 +25,7 @@ turbomind_chat_model: - meta-llama/Llama-2-7b-chat-hf - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat - - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b - - internlm/internlm2-chat-7b-4bits - internlm/internlm2-chat-20b-4bits - internlm/internlm-chat-20b - internlm/internlm-xcomposer2-4khd-7b @@ -61,10 +59,10 @@ turbomind_chat_model: pytorch_chat_model: - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Llama-2-7b-chat-hf - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat - - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b - internlm/internlm-chat-20b - OpenGVLab/InternVL2-2B @@ -88,6 +86,7 @@ pytorch_chat_model: - google/gemma-2-9b-it - deepseek-ai/deepseek-moe-16b-chat - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - THUDM/chatglm2-6b - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b @@ -99,14 +98,14 @@ pytorch_chat_model: turbomind_base_model: - internlm/internlm2_5-7b - internlm/internlm2_5-1_8b - - internlm/internlm2-20b + - internlm/internlm2_5-20b - codellama/CodeLlama-7b-hf pytorch_base_model: - tiiuae/falcon-7b - internlm/internlm2_5-7b - internlm/internlm2_5-1_8b - - internlm/internlm2-20b + - internlm/internlm2_5-20b vl_model: - Qwen/Qwen-VL-Chat @@ -129,6 +128,7 @@ vl_model: - THUDM/glm-4v-9b - microsoft/Phi-3-vision-128k-instruct - openbmb/MiniCPM-Llama3-V-2_5 + - openbmb/MiniCPM-V-2_6 turbomind_quatization: awq: @@ -139,7 +139,7 @@ turbomind_quatization: - internlm/internlm2_5-7b - internlm/internlm2_5-20b-chat - internlm/internlm2-chat-20b - - internlm/internlm2-20b + - internlm/internlm2_5-20b - internlm/internlm-chat-20b - internlm/internlm-xcomposer2-4khd-7b - internlm/internlm-xcomposer2d5-7b @@ -170,13 +170,9 @@ turbomind_quatization: - meta-llama/Llama-2-7b-chat-hf - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat - - internlm/internlm2-chat-1_8b - - internlm/internlm2-chat-7b - 
internlm/internlm2-chat-20b - - internlm/internlm2-chat-7b-4bits - internlm/internlm2-chat-20b-4bits - internlm/internlm-chat-20b - - internlm/internlm-xcomposer2-7b - internlm/internlm-xcomposer2-4khd-7b - internlm/internlm-xcomposer2d5-7b - OpenGVLab/InternVL-Chat-V1-5 @@ -202,10 +198,10 @@ turbomind_quatization: pytorch_quatization: awq: - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Llama-2-7b-chat-hf - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat - - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b - OpenGVLab/InternVL-Chat-V1-5 - 01-ai/Yi-6B-Chat @@ -216,9 +212,11 @@ pytorch_quatization: - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Llama-2-7b-chat-hf - internlm/internlm2-chat-20b - - internlm/internlm2-chat-7b + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat - 01-ai/Yi-6B-Chat - - internlm/internlm2-20b + - internlm/internlm2_5-20b + - internlm/internlm2_5-7b longtext_model: @@ -235,3 +233,8 @@ benchmark_model: - meta-llama/Meta-Llama-3-1-70B-Instruct - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat + - THUDM/glm-4-9b-chat + - Qwen/Qwen2-7B-Instruct + - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mixtral-8x7B-Instruct-v0.1 + - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 9ee793a895..096918b6b1 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -288,7 +288,10 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) - gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40) + gen_config = GenerationConfig(logprobs=10, + max_new_tokens=5, + top_k=40, + do_sample=True) response = pipe('Hi, pls intro yourself', gen_config=gen_config) result, msg = assert_pipeline_single_return(response, logprobs_num=10) save_pipeline_common_log(config, file_name, result, response, msg) @@ -318,7 +321,10 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) - gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40) + gen_config = GenerationConfig(logprobs=10, + max_new_tokens=5, + top_k=40, + do_sample=True) response = [] for item in pipe.stream_infer('Hi, pls intro yourself', gen_config=gen_config): @@ -424,9 +430,7 @@ def run_pipeline_testcase_stop_words(config, model, backend, file_name): backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) # test stop_words - gen_config = GenerationConfig(stop_words=[' and', '浦', ' to'], - random_seed=1, - temperature=0.01) + gen_config = GenerationConfig(stop_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], gen_config=gen_config) result = True @@ -465,9 +469,7 @@ def run_pipeline_testcase_bad_words(config, model, backend, file_name): backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) # test bad_words - gen_config = GenerationConfig(bad_words=[' and', '浦', ' to'], - temperature=0.01, - random_seed=1) + gen_config = GenerationConfig(bad_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], 
gen_config=gen_config) result = '蒲' in response[0].text @@ -587,7 +589,9 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) # test repetition_penalty - gen_config = GenerationConfig(repetition_penalty=0.01, random_seed=1) + gen_config = GenerationConfig(repetition_penalty=0.01, + random_seed=1, + do_sample=True) response = pipe('Shanghai is', gen_config=gen_config) result = get_repeat_times(response.text, @@ -688,7 +692,9 @@ def run_pipeline_testcase(config, model, backend, file_name): backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) # test repetition_penalty - gen_config = GenerationConfig(top_k=1, max_new_tokens=20) + gen_config = GenerationConfig(top_k=1, + max_new_tokens=20, + do_sample=True) response_list = [] for i in range(3): response_list.append(pipe('Shanghai is', gen_config=gen_config)) @@ -726,7 +732,8 @@ def run_pipeline_testcase(config, model, backend, file_name): for i in range(3): gen_config = GenerationConfig(random_seed=i, temperature=1.0, - top_k=40) + top_k=40, + do_sample=True) response_list.append(pipe('Shanghai is', gen_config=gen_config)) result = response_list[0].text != response_list[ 1].text and response_list[1].text != response_list[2].text @@ -758,7 +765,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) pipe = pipeline(model_path, backend_config=backend_config) - gen_config = GenerationConfig(random_seed=1, top_k=40) + gen_config = GenerationConfig(random_seed=1, top_k=40, do_sample=True) response_list = [] for i in range(3): response_list.append(pipe('Shanghai is', gen_config=gen_config)) @@ -782,6 +789,40 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) +@pytest.mark.parametrize('backend', + [TurbomindEngineConfig, PytorchEngineConfig]) +def test_gen_config_do_sample_batch(config, model, backend, worker_id): + + def run_pipeline_testcase(config, model, backend, file_name): + + model_path = '/'.join([config.get('model_path'), model]) + backend_config = backend(tp=2) + pipe = pipeline(model_path, backend_config=backend_config) + gen_config = GenerationConfig(temperature=1.0, + top_k=40, + do_sample=True) + response = pipe(['Shanghai is'] * 3, gen_config=gen_config) + result = response[0].text != response[1].text and response[ + 1].text != response[2].text + save_pipeline_common_log(config, file_name, result, response) + del pipe + torch.cuda.empty_cache() + + file_name = f'pipeline_log_{worker_id}.txt' + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + p = Process(target=run_pipeline_testcase, + args=(config, model, backend, file_name)) + + p.start() + p.join() + assert_pipeline_common_log(config, file_name) + if 'gw' in worker_id: + del os.environ['CUDA_VISIBLE_DEVICES'] + + @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) @@ -971,7 +1012,7 @@ def test_backend_config_validate_pytorch(config, model, backend, worker_id): @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_backend_config_tp(config, model, backend, worker_id): - with 
pytest.raises(AssertionError, match='tp should be 2\\^n'): + with pytest.raises(AssertionError): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid( worker_id, tp_num=2) diff --git a/autotest/interface/pipeline/test_pipeline_longtext_func.py b/autotest/interface/pipeline/test_pipeline_longtext_func.py index 88b8a2847e..76625f5de6 100644 --- a/autotest/interface/pipeline/test_pipeline_longtext_func.py +++ b/autotest/interface/pipeline/test_pipeline_longtext_func.py @@ -33,10 +33,9 @@ def test_history_issue_tp1(config, model, worker_id): @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', [ - 'internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-4bits', - 'internlm/internlm2-20b', 'internlm/internlm2-20b-inner-4bits' -]) +@pytest.mark.parametrize( + 'model', + ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-4bits']) def test_history_issue_tp2(config, model, worker_id): log_name = ''.join(['pipeline_longtext_issue_', worker_id, '.log']) if 'gw' in worker_id: diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index 9356f40a3b..3da375ccb5 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -2,6 +2,7 @@ import subprocess from subprocess import PIPE, Popen +import allure import psutil from utils.config_utils import get_workerid from utils.run_restful_chat import health_check @@ -59,10 +60,11 @@ def generation_test(config, ]) returncode, stderr = run_testcase(cmd, benchmark_log) - + allure.attach.file(benchmark_log, + attachment_type=allure.attachment_type.TEXT) if returncode == 0 and not os.path.isfile(csv_path): - return False, benchmark_log, 'result is empty' - return returncode == 0, benchmark_log, stderr + return False, 'result is empty' + return returncode == 0, stderr def throughput_test(config, @@ -118,12 +120,15 @@ def throughput_test(config, ]) returncode, stderr = run_testcase(cmd, benchmark_log) + allure.attach.file(benchmark_log, + attachment_type=allure.attachment_type.TEXT) if returncode == 0 and not os.path.isfile(csv_path): - return False, benchmark_log, 'result is empty' + return False, 'result is empty' if returncode != 0: - return returncode == 0, benchmark_log, stderr - return returncode == 0, benchmark_log, stderr + return returncode == 0, stderr + + return returncode == 0, stderr def restful_test(config, @@ -161,7 +166,7 @@ def restful_test(config, http_url = f'http://localhost:{port}' if not health_check(http_url): - return False, None, 'server not start' + return False, 'server not start' command = f'python3 benchmark/profile_restful_api.py localhost:{port} {model_path} {dataset_path} --stream-output True ' # noqa: F401, E501 if is_smoke: @@ -186,13 +191,19 @@ def restful_test(config, text=True, encoding='utf-8') f.writelines(benchmark_res.stderr) + allure.attach.file(benchmark_log, + attachment_type=allure.attachment_type.TEXT) if benchmark_res.returncode == 0 and not os.path.isfile(csv_path): - return False, benchmark_log, 'result is empty' - return benchmark_res.returncode == 0, benchmark_log, benchmark_res.stderr + return False, 'result is empty' + return benchmark_res.returncode == 0, benchmark_res.stderr def run_testcase(cmd, benchmark_log): - with open(benchmark_log, 'w') as f: + if os.path.isfile(benchmark_log): + write_type = 'a' + else: + write_type = 'w' + with open(benchmark_log, write_type) as f: f.writelines('reproduce command: ' + cmd + '\n') print('reproduce command: ' + cmd) with Popen([cmd], @@ -236,8 +247,8 @@ def 
create_multi_level_directory(path): def get_max_cache_entry(model, backend): - if backend != 'turbomind': - return '' + if backend == 'pytorch': + return '--cache-max-entry-count 0.8' if 'Llama-2' in model: return '--cache-max-entry-count 0.95' elif 'internlm2' in model: diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 1cc556748e..c8ff08ad91 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -188,12 +188,14 @@ def get_benchmark_model_list(tp_num, 'backend': 'turbomind', 'quant_policy': 0, 'tp_num': tp_num - } for item in model_list] + } for item in model_list if item.replace('-inner-4bits', '') in + config.get('turbomind_chat_model') or tp_num == 4] result += [{ 'model': item, 'backend': 'pytorch', 'tp_num': tp_num - } for item in model_list if '4bits' not in item] + } for item in model_list if '4bits' not in item and ( + item in config.get('pytorch_chat_model') or tp_num == 4)] for kvint in kvint_list: result += [{ 'model': item, diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index cef5d76a2b..e94b331881 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -9,8 +9,7 @@ from utils.rule_condition_assert import assert_result from lmdeploy import pipeline -from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, - TurbomindEngineConfig) +from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig from lmdeploy.vl import load_image from lmdeploy.vl.constants import IMAGE_TOKEN @@ -53,9 +52,6 @@ def run_pipeline_chat_test(config, pipe = pipeline(hf_path, backend_config=backend_config) - # run testcases - gen_config = GenerationConfig(top_k=1) - config_log = os.path.join( log_path, '_'.join([ 'pipeline', 'config', type, worker_id, @@ -63,10 +59,12 @@ def run_pipeline_chat_test(config, ])) file = open(config_log, 'w') log_string = '\n'.join([ - 'reproduce config info:', 'engine_config = ' + str(backend_config), - 'gen_config = ' + str(gen_config), + 'reproduce config info:', + 'from lmdeploy.messages import PytorchEngineConfig', + 'from lmdeploy.messages import TurbomindEngineConfig', + 'engine_config = ' + str(backend_config), 'pipe = pipeline("' + hf_path + '", backend_config=engine_config)', - 'res = pipe("Hi, pls introduce shanghai", gen_config=gen_config)' + 'res = pipe("Hi, pls introduce shanghai")' ]) file.writelines(log_string) print(log_string) @@ -91,7 +89,7 @@ def run_pipeline_chat_test(config, prompts.append({'role': 'user', 'content': prompt}) file.writelines('prompt:' + prompt + '\n') - response = pipe([prompts], gen_config=gen_config)[0].text + response = pipe([prompts])[0].text case_result, reason = assert_result(response, prompt_detail.values(), diff --git a/autotest/utils/restful_return_check.py b/autotest/utils/restful_return_check.py index 386a077b0c..9de308bf6e 100644 --- a/autotest/utils/restful_return_check.py +++ b/autotest/utils/restful_return_check.py @@ -69,6 +69,7 @@ def assert_chat_completions_stream_return(output, is_last: bool = False, check_logprobs: bool = False, logprobs_num: int = 5): + print(output) assert output.get('id') is not None assert output.get('object') == 'chat.completion.chunk' assert output.get('model') == model_name @@ -97,6 +98,7 @@ def assert_completions_stream_return(output, is_last: bool = False, check_logprobs: bool = False, logprobs_num: int = 5): + print(output) assert output.get('id') is not None assert output.get('object') == 'text_completion' assert output.get('model') == model_name 
diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 2af65ad41b..6e60c53833 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -81,7 +81,6 @@ def start_restful_api(config, param, model, model_path, backend_type, text=True, encoding='utf-8') pid = startRes.pid - allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) http_url = BASE_HTTP_URL + ':' + str(port) with open(start_log, 'r') as file: @@ -96,6 +95,7 @@ def start_restful_api(config, param, model, model_path, backend_type, result = health_check(http_url) if result or total_time >= 300: break + allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) return pid, startRes
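
Note (illustrative sketch, not part of the patch): after this change the benchmark helpers (throughput_test, generation_test, restful_test) attach their own log file through allure and return only a (success, message) pair, so the test functions reduce to a bare assert. A sketch of a test written against that contract; the test name and the tp_num=1 parametrization are assumptions for illustration only.

    import pytest
    from utils.benchmark_utils import throughput_test
    from utils.config_utils import (get_benchmark_model_list,
                                    get_cuda_prefix_by_workerid)


    @pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1))
    def test_throughput_sketch(config, run_id, run_config, worker_id):
        # The helper attaches its benchmark log via allure internally and
        # returns (success, message); no allure handling is needed here.
        result, msg = throughput_test(
            config,
            run_id,
            run_config,
            cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1),
            worker_id=worker_id)
        assert result, msg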