5 changes: 0 additions & 5 deletions tests/lora/test_chatglm3_tp.py
@@ -58,7 +58,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
         max_loras=4,
         max_lora_rank=64,
         trust_remote_code=True,
-        enable_chunked_prefill=True,
     )
 
     output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
@@ -70,7 +69,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
 
 
 @multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
 def test_chatglm3_lora_tp4(chatglm3_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
@@ -81,7 +79,6 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
         tensor_parallel_size=4,
         trust_remote_code=True,
         fully_sharded_loras=False,
-        enable_chunked_prefill=True,
     )
 
     output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
@@ -93,7 +90,6 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
 
 
 @multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
 def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
     # https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
     # gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
@@ -107,7 +103,6 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
         tensor_parallel_size=4,
         trust_remote_code=True,
         fully_sharded_loras=True,
-        enable_chunked_prefill=True,
         gpu_memory_utilization=0.85,
     )
     output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
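After the hunks above, the TP4 ChatGLM3 test presumably reduces to something like the sketch below. Values not visible in the diff context (`enable_lora`, `max_loras`, `max_lora_rank`) are assumptions carried over from the single-GPU test earlier in the same file, and `enable_chunked_prefill` is simply no longer passed, so the engine default applies.

```python
# Rough post-change shape of test_chatglm3_lora_tp4 (a sketch, not copied
# from the file): kwargs outside the visible diff context are assumptions.
import vllm

from ..utils import multi_gpu_test  # assumed to match the other test files


@multi_gpu_test(num_gpus=4)
def test_chatglm3_lora_tp4(chatglm3_lora_files):
    llm = vllm.LLM(
        MODEL_PATH,                 # module-level constant in the test file
        enable_lora=True,           # assumed; not shown in the diff context
        max_loras=4,                # assumed; mirrors test_chatglm3_lora above
        max_lora_rank=64,           # assumed; mirrors test_chatglm3_lora above
        tensor_parallel_size=4,
        trust_remote_code=True,
        fully_sharded_loras=False,
        # enable_chunked_prefill is no longer forced; the engine default applies
    )

    output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
```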
3 changes: 0 additions & 3 deletions tests/lora/test_llama_tp.py
@@ -113,7 +113,6 @@ def test_llama_lora(sql_lora_files):
 
 
 @multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
 def test_llama_lora_tp4(sql_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
@@ -127,7 +126,6 @@ def test_llama_lora_tp4(sql_lora_files):
 
 
 @multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
 def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
@@ -142,7 +140,6 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
 
 
 @multi_gpu_test(num_gpus=2)
-@create_new_process_for_each_test()
 def test_tp2_serialize_and_deserialize_lora(
     tmp_path, sql_lora_files, sql_lora_huggingface_id
 ):
6 changes: 3 additions & 3 deletions tests/lora/test_minicpmv_tp.py
@@ -8,7 +8,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform
 
-from ..utils import create_new_process_for_each_test
+from ..utils import multi_gpu_test
 
 MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
 
@@ -88,7 +88,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
     current_platform.is_rocm(),
     reason="MiniCPM-V dependency xformers incompatible with ROCm",
 )
-@create_new_process_for_each_test()
+@multi_gpu_test(num_gpus=4)
 def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
@@ -112,7 +112,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
     current_platform.is_rocm(),
     reason="MiniCPM-V dependency xformers incompatible with ROCm",
 )
-@create_new_process_for_each_test()
+@multi_gpu_test(num_gpus=4)
 def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
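The decorator swap in this file, and the standalone decorator removals elsewhere in the PR, only make sense if `multi_gpu_test` already takes care of per-test process isolation. The helper below is a hypothetical sketch of such a decorator, not code from this PR or from `tests/utils.py`; the GPU-count check and the reuse of `create_new_process_for_each_test` are assumptions.

```python
# Hypothetical sketch: a multi_gpu_test decorator that bundles a GPU-count
# skip with per-test process isolation, which would make a separate
# @create_new_process_for_each_test() on each test redundant.
import pytest
import torch

from ..utils import create_new_process_for_each_test  # assumed to still exist


def multi_gpu_test(*, num_gpus: int = 2):
    skip_marker = pytest.mark.skipif(
        torch.cuda.device_count() < num_gpus,
        reason=f"Need at least {num_gpus} GPUs",
    )

    def wrapper(func):
        # Run each test in a fresh process so GPU/distributed state cannot leak.
        return skip_marker(create_new_process_for_each_test()(func))

    return wrapper
```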
7 changes: 1 addition & 6 deletions tests/lora/test_transformers_model.py
@@ -7,7 +7,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform
 
-from ..utils import create_new_process_for_each_test, multi_gpu_test
+from ..utils import multi_gpu_test
 
 MODEL_PATH = "hmellor/Ilama-3.2-1B"
 
@@ -54,7 +54,6 @@ def test_ilama_lora(ilama_lora_files):
         max_loras=4,
         max_lora_rank=16,
         trust_remote_code=True,
-        enable_chunked_prefill=True,
     )
 
     output1 = do_sample(llm, ilama_lora_files, lora_id=1)
@@ -69,7 +68,6 @@ def test_ilama_lora(ilama_lora_files):
     current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
 )
 @multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
 def test_ilama_lora_tp4(ilama_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
@@ -80,7 +78,6 @@ def test_ilama_lora_tp4(ilama_lora_files):
         tensor_parallel_size=4,
         trust_remote_code=True,
         fully_sharded_loras=False,
-        enable_chunked_prefill=True,
     )
 
     output1 = do_sample(llm, ilama_lora_files, lora_id=1)
@@ -95,7 +92,6 @@ def test_ilama_lora_tp4(ilama_lora_files):
     current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
 )
 @multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
 def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
@@ -106,7 +102,6 @@ def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
         tensor_parallel_size=4,
         trust_remote_code=True,
         fully_sharded_loras=True,
-        enable_chunked_prefill=True,
     )
     output1 = do_sample(llm, ilama_lora_files, lora_id=1)
     for i in range(len(EXPECTED_LORA_OUTPUT)):
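The `do_sample` helper called throughout these tests is not part of the diff. For context, a minimal sketch of what such a helper typically does with vLLM's LoRA API is shown below; the prompt text, token budget, and return shape are assumptions, while `SamplingParams`, `LoRARequest`, and `llm.generate(..., lora_request=...)` are the real vLLM API.

```python
# Minimal sketch (assumed, not from the diff): greedy generation with a
# specific LoRA adapter attached to each request.
from vllm import SamplingParams
from vllm.lora.request import LoRARequest


def do_sample(llm, lora_path: str, lora_id: int) -> list[str]:
    prompts = ["Write a SQL query that lists all users."]  # assumed prompt
    sampling_params = SamplingParams(temperature=0, max_tokens=64)
    outputs = llm.generate(
        prompts,
        sampling_params,
        # LoRARequest(name, id, path) attaches the adapter for these prompts.
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path),
    )
    return [output.outputs[0].text.strip() for output in outputs]
```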