[ci] add internlm3 into testcase #3038

Merged: 9 commits merged on Jan 16, 2025.
Changes from 8 commits
2 changes: 1 addition & 1 deletion .github/scripts/eval_base_config.py
@@ -191,4 +191,4 @@
for model in [v for k, v in locals().items() if '_batch1' in k]:
model['abbr'] = model['abbr'] + '_batch1'
model['engine_config']['max_batch_size'] = 1
model['batch_size'] = 1
model['batch_size'] = 100
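
For readers unfamiliar with the pattern this hunk edits, here is a minimal, self-contained sketch (using a hypothetical base config, not the repository's exact one) of how the `_batch1` variants are derived by scanning module-level names with `locals()` and overriding their batch settings in place:

```python
from copy import deepcopy

# Hypothetical stand-in for a config imported from OpenCompass; the real base
# configs come from opencompass.configs.models.* as in eval_chat_config.py.
lmdeploy_internlm2_5_7b = dict(
    abbr='internlm2_5-7b-turbomind',
    engine_config=dict(max_batch_size=512),
    batch_size=1000,
)

turbomind_internlm2_5_7b_batch1 = deepcopy(lmdeploy_internlm2_5_7b)

# Same loop as the diff: every module-level variable whose name contains
# '_batch1' gets max_batch_size 1 in the engine and batch_size 100 on the runner.
for model in [v for k, v in locals().items() if '_batch1' in k]:
    model['abbr'] = model['abbr'] + '_batch1'
    model['engine_config']['max_batch_size'] = 1
    model['batch_size'] = 100

print(turbomind_internlm2_5_7b_batch1['abbr'])  # internlm2_5-7b-turbomind_batch1
```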
39 changes: 25 additions & 14 deletions .github/scripts/eval_chat_config.py
@@ -54,10 +54,6 @@
models as hf_internlm2_chat_7b # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import \
models as hf_internlm2_chat_20b # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm_chat_7b import \
models as hf_internlm_chat_7b # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm_chat_20b import \
models as hf_internlm_chat_20b # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
@@ -131,11 +127,6 @@
MAX_SESSION_LEN = 2048
MAX_NEW_TOKENS = 1024

# ===== Configs for internlm/internlm-chat-7b =====
turbomind_internlm_chat_7b = deepcopy(*lmdeploy_internlm_chat_7b)
turbomind_internlm_chat_7b_4bits = deepcopy(*lmdeploy_internlm_chat_7b)
pytorch_internlm_chat_7b = deepcopy(*lmdeploy_internlm_chat_7b)

# ===== Configs for internlm/internlm2-chat-7b =====
turbomind_internlm2_chat_7b = deepcopy(*lmdeploy_internlm2_chat_7b)
turbomind_internlm2_chat_7b_4bits = deepcopy(*lmdeploy_internlm2_chat_7b)
@@ -150,6 +141,21 @@
turbomind_internlm2_5_7b_chat_kvint8 = deepcopy(*lmdeploy_internlm2_5_7b_chat)
pytorch_internlm2_5_7b_chat = deepcopy(*lmdeploy_internlm2_5_7b_chat)
pytorch_internlm2_5_7b_chat_w8a8 = deepcopy(*lmdeploy_internlm2_5_7b_chat)
turbomind_internlm2_5_7b_chat_batch1 = deepcopy(*lmdeploy_internlm2_5_7b_chat)
turbomind_internlm2_5_7b_chat_batch1_4bits = deepcopy(
*lmdeploy_internlm2_5_7b_chat)

turbomind_internlm3_8b_instruct = deepcopy(*lmdeploy_internlm2_5_7b_chat)
Collaborator:

> Shouldn't the internlm3-8b-instruct model be used here?

Collaborator (author):

> Shouldn't the internlm3-8b-instruct model be used here?

OpenCompass has no internlm3 chat template yet. To get the evaluation running first, the internlm2.5 config is reused here; it will be replaced once OpenCompass adds one.

turbomind_internlm3_8b_instruct_4bits = deepcopy(*lmdeploy_internlm2_5_7b_chat)
turbomind_internlm3_8b_instruct_kvint4 = deepcopy(
*lmdeploy_internlm2_5_7b_chat)
turbomind_internlm3_8b_instruct_kvint8 = deepcopy(
*lmdeploy_internlm2_5_7b_chat)
pytorch_internlm3_8b_instruct = deepcopy(*lmdeploy_internlm2_5_7b_chat)
pytorch_internlm3_8b_instruct_w8a8 = deepcopy(*lmdeploy_internlm2_5_7b_chat)
for model in [v for k, v in locals().items() if 'internlm3_8b_instruct' in k]:
model['abbr'] = 'turbomind-internlm3-8b-instruct'
model['path'] = 'internlm/internlm3-8b-instruct'
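
The exchange above amounts to a small piece of config plumbing. A minimal sketch of it, using a hypothetical stand-in for the OpenCompass config (the real `lmdeploy_internlm2_5_7b_chat` is a one-element list imported at the top of this file): the internlm2.5 entry is deep-copied, then every internlm3 copy has its `abbr` and `path` pointed at internlm3-8b-instruct so evaluation can run before OpenCompass ships an internlm3 template.

```python
from copy import deepcopy

# Hypothetical stand-in: OpenCompass model configs are one-element lists of dicts.
lmdeploy_internlm2_5_7b_chat = [dict(
    abbr='internlm2_5-7b-chat-turbomind',
    path='internlm/internlm2_5-7b-chat',
    engine_config=dict(max_batch_size=512, tp=1),
    gen_config=dict(do_sample=False),
)]

# Reuse the internlm2.5 template for internlm3 until OpenCompass provides one...
turbomind_internlm3_8b_instruct = deepcopy(*lmdeploy_internlm2_5_7b_chat)
pytorch_internlm3_8b_instruct = deepcopy(*lmdeploy_internlm2_5_7b_chat)

# ...then point every internlm3 copy at the internlm3 weights.
for model in [v for k, v in locals().items() if 'internlm3_8b_instruct' in k]:
    model['abbr'] = 'turbomind-internlm3-8b-instruct'
    model['path'] = 'internlm/internlm3-8b-instruct'
```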

# ===== Configs for internlm/internlm2_5_20b_chat =====
turbomind_internlm2_5_20b_chat = deepcopy(*lmdeploy_internlm2_5_20b_chat)
@@ -223,9 +229,9 @@
turbomind_llama2_7b_chat_kvint8 = deepcopy(*lmdeploy_llama2_7b_chat)

for model in [v for k, v in locals().items() if k.startswith('turbomind_')]:
model['engine_config']['max_batch_size'] = 1
model['engine_config']['max_batch_size'] = 512
model['gen_config']['do_sample'] = False
model['batch_size'] = 100
model['batch_size'] = 1000

for model in [v for k, v in locals().items() if k.endswith('_4bits')]:
model['engine_config']['model_format'] = 'awq'
@@ -247,19 +253,24 @@
for model in [v for k, v in locals().items() if k.startswith('pytorch_')]:
model['abbr'] = model['abbr'].replace('turbomind', 'pytorch')
model['backend'] = 'pytorch'
model['engine_config']['max_batch_size'] = 1
model['engine_config']['max_batch_size'] = 512
model['gen_config']['do_sample'] = False
model['batch_size'] = 100

for model in [v for k, v in locals().items() if '_batch1' in k]:
model['abbr'] = model['abbr'] + '_batch1'
model['engine_config']['max_batch_size'] = 1
model['batch_size'] = 100

basic_pytorch_chat_tp1 = dict(type=TurboMindModelwithChatTemplate,
engine_config=dict(session_len=MAX_SESSION_LEN,
max_batch_size=1,
max_batch_size=512,
tp=1),
gen_config=dict(do_sample=False,
max_new_tokens=MAX_NEW_TOKENS),
max_out_len=MAX_NEW_TOKENS,
max_seq_len=MAX_SESSION_LEN,
batch_size=100,
batch_size=1000,
run_cfg=dict(num_gpus=1))

# ===== Configs for Qwen/Qwen1.5-MoE-A2.7B-Chat =====
14 changes: 5 additions & 9 deletions .github/workflows/daily_ete_test.yml
@@ -133,16 +133,14 @@ jobs:
timeout-minutes: 150
env:
PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA
MODELSCOPE_CACHE: /root/modelscope_hub
MODELSCOPE_MODULES_CACHE: /root/modelscope_modules
MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub
MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules
container:
image: openmmlab/lmdeploy:latest-cu11
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
volumes:
- /nvme/github-actions/pip-cache:/root/.cache/pip
- /nvme/github-actions/packages:/root/packages
- /nvme/github-actions/modelscope_hub:/root/modelscope_hub
- /nvme/github-actions/modelscope_modules:/root/modelscope_modules
- /nvme/qa_test_models:/nvme/qa_test_models
- /mnt/shared:/mnt/shared
- /nvme/qa_test_models/lmdeploy/autotest:/local_case
@@ -225,16 +223,14 @@ jobs:
function: local_case
env:
PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA
MODELSCOPE_CACHE: /root/modelscope_hub
MODELSCOPE_MODULES_CACHE: /root/modelscope_modules
MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub
MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules
container:
image: openmmlab/lmdeploy:latest-cu11
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
volumes:
- /nvme/github-actions/pip-cache:/root/.cache/pip
- /nvme/github-actions/packages:/root/packages
- /nvme/github-actions/modelscope_hub:/root/modelscope_hub
- /nvme/github-actions/modelscope_modules:/root/modelscope_modules
- /nvme/github-actions/resources/lora:/root/lora
- /nvme/qa_test_models:/nvme/qa_test_models
- /mnt/shared:/mnt/shared
@@ -602,7 +598,7 @@ jobs:
run: |
export LMDEPLOY_DIR=$(pwd)

python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_chat_7b, pytorch_internlm2_chat_7b, turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat_w8a8, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama2_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct_w8a8, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct_w8a8, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true
python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_chat_7b, pytorch_internlm2_chat_7b, turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_internlm2_chat_20b, pytorch_internlm2_chat_20b, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true
- name: Evaluate base models
if: matrix.evaluate_type == 'base'
run: |
2 changes: 1 addition & 1 deletion .github/workflows/evaluate.yml
@@ -17,7 +17,7 @@ on:
required: true
description: 'Tested TurboMind models list. eg. [internlm_chat_7b,internlm_chat_7b_w8a16]'
type: string
default: '[turbomind_internlm2_chat_7b, pytorch_internlm2_chat_7b, turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama2_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it, turbomind_internlm2_chat_7b_kvint4, turbomind_internlm2_chat_7b_kvint8, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, turbomind_internlm2_5_7b_chat_kvint8, pytorch_internlm2_5_7b_chat_w8a8, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, turbomind_internlm2_5_20b_chat_kvint8, turbomind_qwen1_5_7b_chat_4bits, turbomind_qwen1_5_7b_chat_kvint4, turbomind_qwen1_5_7b_chat_kvint8, turbomind_llama2_7b_chat_4bits, turbomind_llama2_7b_chat_kvint4, turbomind_llama2_7b_chat_kvint8, turbomind_llama3_8b_instruct_4bits, turbomind_llama3_8b_instruct_kvint4, turbomind_llama3_8b_instruct_kvint8, turbomind_llama3_1_8b_instruct_4bits, turbomind_llama3_1_8b_instruct_kvint4, turbomind_llama3_1_8b_instruct_kvint8, pytorch_llama3_1_8b_instruct_w8a8, turbomind_qwen2_7b_instruct_4bits, turbomind_qwen2_7b_instruct_kvint8, turbomind_qwen2_5_7b_instruct_4bits, turbomind_qwen2_5_7b_instruct_kvint8, pytorch_qwen2_5_7b_instruct_w8a8]'
default: '[turbomind_internlm2_chat_7b, pytorch_internlm2_chat_7b, turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_internlm2_chat_20b, pytorch_internlm2_chat_20b, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it, turbomind_internlm2_chat_7b_4bits, turbomind_internlm2_chat_7b_kvint4, turbomind_internlm2_chat_7b_kvint8, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, turbomind_internlm2_5_7b_chat_kvint8, pytorch_internlm2_5_7b_chat_w8a8, turbomind_internlm3_8b_instruct_4bits, turbomind_internlm3_8b_instruct_kvint4, turbomind_internlm3_8b_instruct_kvint8, pytorch_internlm3_8b_instruct_w8a8, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, turbomind_internlm2_5_20b_chat_kvint8, turbomind_llama3_8b_instruct_4bits, turbomind_llama3_8b_instruct_kvint4, turbomind_llama3_1_8b_instruct_4bits, turbomind_llama3_1_8b_instruct_kvint4, turbomind_llama3_1_8b_instruct_kvint8,turbomind_llama3_8b_instruct_kvint8, pytorch_llama3_1_8b_instruct_w8a8, turbomind_qwen2_7b_instruct_4bits, turbomind_qwen2_7b_instruct_kvint4, turbomind_qwen2_7b_instruct_kvint8, pytorch_qwen2_7b_instruct_w8a8, turbomind_qwen2_5_7b_instruct_4bits, turbomind_qwen2_5_7b_instruct_kvint4, turbomind_qwen2_5_7b_instruct_kvint8, pytorch_qwen2_5_7b_instruct_w8a8, turbomind_llama2_7b_chat_4bits, turbomind_llama2_7b_chat_kvint4, turbomind_llama2_7b_chat_kvint8]'
chat_datasets:
required: true
description: 'Tested datasets list. eg. [*bbh_datasets,*ceval_datasets,*cmmlu_datasets,*GaokaoBench_datasets,*gpqa_datasets,*gsm8k_datasets,*hellaswag_datasets,*humaneval_datasets,*ifeval_datasets,*math_datasets,*sanitized_mbpp_datasets,*mmlu_datasets,*nq_datasets,*race_datasets,*TheoremQA_datasets,*triviaqa_datasets,*winogrande_datasets,*crowspairs_datasets]'
26 changes: 26 additions & 0 deletions autotest/benchmark/test_throughput_performance.py
@@ -71,3 +71,29 @@ def test_throughput_func_tp2(config, run_id, run_config, worker_id):
is_smoke=True)

assert result, msg


@pytest.mark.function
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_1
@pytest.mark.pr_test
@pytest.mark.parametrize('run_config', [{
'model': 'meta-llama/Meta-Llama-3-1-8B-Instruct',
'backend': 'pytorch',
'tp_num': 1
}, {
'model': 'meta-llama/Meta-Llama-3-1-8B-Instruct',
'backend': 'turbomind',
'quant_policy': 0,
'tp_num': 1
}])
def test_throughput_prtest_tp1(config, run_id, run_config, worker_id):
result, msg = throughput_test(config,
run_id,
run_config,
cuda_prefix=get_cuda_prefix_by_workerid(
worker_id, tp_num=1),
worker_id=worker_id,
is_smoke=True)

assert result, msg
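
The new `test_throughput_prtest_tp1` case relies on `get_cuda_prefix_by_workerid` to pin each pytest-xdist worker to its own GPU. The real helper lives in the autotest utilities and is not part of this diff; the following is only an illustrative sketch of the idea, assuming worker ids of the form `gw0`, `gw1`, and so on:

```python
def get_cuda_prefix_by_workerid(worker_id: str, tp_num: int = 1) -> str:
    """Illustrative sketch only: map a pytest-xdist worker id to a CUDA prefix.

    Worker 'gw0' with tp_num=1 gets GPU 0, 'gw1' gets GPU 1, and tp_num=2
    would reserve two consecutive devices per worker.
    """
    if not worker_id.startswith('gw'):
        return ''  # single-process run: leave GPU visibility untouched
    index = int(worker_id[2:])
    first = index * tp_num
    devices = ','.join(str(first + i) for i in range(tp_num))
    return f'CUDA_VISIBLE_DEVICES={devices}'


print(get_cuda_prefix_by_workerid('gw1'))            # CUDA_VISIBLE_DEVICES=1
print(get_cuda_prefix_by_workerid('gw1', tp_num=2))  # CUDA_VISIBLE_DEVICES=2,3
```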
4 changes: 3 additions & 1 deletion autotest/config-v100.yaml
@@ -6,7 +6,6 @@ benchmark_path: /nvme/qa_test_models/benchmark-reports
dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json

tp_config:
internlm-chat-20b: 2
internlm2-chat-20b: 2
Baichuan2-13B-Chat: 2
Mixtral-8x7B-Instruct-v0.1: 2
@@ -28,6 +27,7 @@ turbomind_chat_model:
- meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ
- meta-llama/Meta-Llama-3-8B-Instruct
- meta-llama/Meta-Llama-3-8B-Instruct-inner-4bits
- internlm/internlm3-8b-instruct
- internlm/internlm2_5-7b-chat
- internlm/internlm2_5-20b-chat
- internlm/internlm-xcomposer2d5-7b
@@ -48,6 +48,7 @@
pytorch_chat_model:
- meta-llama/Meta-Llama-3-8B-Instruct
- meta-llama/Meta-Llama-3-1-8B-Instruct
- internlm/internlm3-8b-instruct
- internlm/internlm2_5-7b-chat
- internlm/internlm2_5-20b-chat
- OpenGVLab/InternVL2-1B
@@ -122,6 +123,7 @@ turbomind_quatization:

pytorch_quatization:
awq:
- internlm/internlm3-8b-instruct
- internlm/internlm2_5-7b-chat
- internlm/internlm2_5-20b-chat
- Qwen/Qwen2-1.5B-Instruct
8 changes: 5 additions & 3 deletions autotest/config.yaml
@@ -6,7 +6,6 @@ benchmark_path: /nvme/qa_test_models/benchmark-reports
dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json

tp_config:
internlm-chat-20b: 2
internlm2-chat-20b: 2
Baichuan2-13B-Chat: 2
Mixtral-8x7B-Instruct-v0.1: 2
@@ -34,11 +33,11 @@ turbomind_chat_model:
- meta-llama/Meta-Llama-3-1-70B-Instruct
- meta-llama/Meta-Llama-3-8B-Instruct
- meta-llama/Llama-2-7b-chat-hf
- internlm/internlm3-8b-instruct
- internlm/internlm2_5-7b-chat
- internlm/internlm2_5-20b-chat
- internlm/internlm2-chat-20b
- internlm/internlm2-chat-20b-4bits
- internlm/internlm-chat-20b
- internlm/internlm-xcomposer2-4khd-7b
- internlm/internlm-xcomposer2d5-7b
- OpenGVLab/InternVL2_5-1B
@@ -91,10 +90,10 @@ pytorch_chat_model:
- meta-llama/Llama-3.2-3B-Instruct
- meta-llama/Llama-3.2-11B-Vision-Instruct
- meta-llama/Llama-2-7b-chat-hf
- internlm/internlm3-8b-instruct
- internlm/internlm2_5-7b-chat
- internlm/internlm2_5-20b-chat
- internlm/internlm2-chat-20b
- internlm/internlm-chat-20b
- OpenGVLab/InternVL2_5-1B
- OpenGVLab/InternVL2_5-8B
- OpenGVLab/InternVL2_5-26B
@@ -235,6 +234,7 @@ pytorch_quatization:
- meta-llama/Meta-Llama-3-8B-Instruct
- meta-llama/Meta-Llama-3-1-8B-Instruct
- meta-llama/Llama-2-7b-chat-hf
- internlm/internlm3-8b-instruct
- internlm/internlm2_5-7b-chat
- internlm/internlm2_5-20b-chat
- internlm/internlm2-chat-20b
@@ -251,6 +251,7 @@ pytorch_quatization:
- meta-llama/Meta-Llama-3-8B-Instruct
- meta-llama/Llama-3.2-1B-Instruct
- meta-llama/Llama-2-7b-chat-hf
- internlm/internlm3-8b-instruct
- internlm/internlm2-chat-20b
- internlm/internlm2_5-7b-chat
- internlm/internlm2_5-20b-chat
@@ -298,6 +299,7 @@ benchmark_model:
- meta-llama/Meta-Llama-3-1-8B-Instruct
- meta-llama/Meta-Llama-3-8B-Instruct
- meta-llama/Meta-Llama-3-1-70B-Instruct
- internlm/internlm3-8b-instruct
- internlm/internlm2_5-7b-chat
- internlm/internlm2_5-20b-chat
- THUDM/glm-4-9b-chat
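
Both `config.yaml` and `config-v100.yaml` keep a `tp_config` mapping from model name to tensor-parallel degree, with unlisted models defaulting to a single GPU, which is why dropping the retired `internlm-chat-20b: 2` entry is safe. A small sketch of how such a lookup might be consumed; the helper name and default value are assumptions, not code from this PR:

```python
import yaml


def get_tp_num(config_path: str, model: str) -> int:
    """Assumed helper: resolve a model's tensor-parallel degree from tp_config."""
    with open(config_path) as f:
        config = yaml.safe_load(f)
    tp_config = config.get('tp_config') or {}
    # tp_config keys are bare model names (e.g. 'internlm2-chat-20b'), while
    # test parameters usually carry the org prefix (e.g. 'internlm/...').
    return tp_config.get(model.split('/')[-1], 1)


# get_tp_num('autotest/config.yaml', 'internlm/internlm2-chat-20b')  # -> 2
```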
6 changes: 4 additions & 2 deletions autotest/tools/chat/test_command_chat_hf_pytorch.py
@@ -125,7 +125,9 @@ def test_hf_pytorch_base_tp2(config, model, cli_case_config, worker_id):
@pytest.mark.hf_pytorch_chat
@pytest.mark.gpu_num_2
@pytest.mark.pr_test
@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat'])
@pytest.mark.parametrize(
'model',
['internlm/internlm2_5-20b-chat', 'mistralai/Mixtral-8x7B-Instruct-v0.1'])
def test_hf_pytorch_chat_pr(config, model, cli_case_config):
usercase = 'chat_testcase'
result, chat_log, msg = hf_command_line_test(
@@ -146,7 +148,7 @@ def test_hf_pytorch_chat_pr(config, model, cli_case_config):
@pytest.mark.usefixtures('cli_case_config')
@pytest.mark.hf_pytorch_chat
@pytest.mark.gpu_num_1
@pytest.mark.parametrize('model', ['Qwen/Qwen-7B-Chat'])
@pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct'])
def test_modelscope_pytorch_chat_tp1(config, model, cli_case_config,
worker_id):
os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True'
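
The ModelScope cases above switch the download source by exporting `LMDEPLOY_USE_MODELSCOPE` before the model id is resolved. A minimal usage sketch with the model id the updated tests parametrize; treat the exact pipeline call as illustrative rather than a verbatim excerpt from the test suite:

```python
import os

# Must be set before lmdeploy resolves the model id, so weights are pulled
# from ModelScope instead of the Hugging Face Hub.
os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True'

from lmdeploy import pipeline  # noqa: E402

pipe = pipeline('Qwen/Qwen2.5-7B-Instruct')
print(pipe(['Hello, who are you?']))
```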
5 changes: 3 additions & 2 deletions autotest/tools/chat/test_command_chat_hf_turbomind.py
@@ -130,7 +130,8 @@ def test_hf_turbomind_base_tp2(config, model, cli_case_config, worker_id):
@pytest.mark.pr_test
@pytest.mark.parametrize('model', [
'internlm/internlm2_5-20b-chat',
'internlm/internlm2_5-20b-chat-inner-4bits'
'internlm/internlm2_5-20b-chat-inner-4bits',
'mistralai/Mixtral-8x7B-Instruct-v0.1'
])
def test_hf_turbomind_chat_pr(config, model, cli_case_config):
usercase = 'chat_testcase'
@@ -154,7 +155,7 @@ def test_hf_turbomind_chat_pr(config, model, cli_case_config):
@pytest.mark.usefixtures('cli_case_config')
@pytest.mark.hf_turbomind_chat
@pytest.mark.gpu_num_1
@pytest.mark.parametrize('model', ['Qwen/Qwen-7B-Chat'])
@pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct'])
def test_modelscope_turbomind_chat_tp1(config, model, cli_case_config,
worker_id):
os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True'
15 changes: 10 additions & 5 deletions autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
@@ -250,25 +250,30 @@ def test_pipeline_chat_kvint8_tp4(config, common_case_config, model,
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_2
@pytest.mark.pr_test
@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat'])
@pytest.mark.parametrize(
'model',
['internlm/internlm2_5-20b-chat', 'mistralai/Mixtral-8x7B-Instruct-v0.1'])
def test_pipeline_chat_pytorch_pr(config, common_case_config, model):
spawn_context = get_context('spawn')
case_config = {
k: v
for k, v in common_case_config.items() if k == 'memory_test'
}
p = spawn_context.Process(target=run_pipeline_chat_test,
args=(config, common_case_config, model,
'pytorch'))
args=(config, case_config, model, 'pytorch'))
p.start()
p.join()

# assert script
assert_pipeline_chat_log(config, common_case_config, model, 'pytorch')
assert_pipeline_chat_log(config, case_config, model, 'pytorch')


@pytest.mark.order(6)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.pipeline_chat_pytorch
@pytest.mark.gpu_num_1
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('model', ['Qwen/Qwen-7B-Chat'])
@pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct'])
def test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config,
model, worker_id):
if 'gw' in worker_id: