Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions tests/integration/defs/perf/test_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,8 +283,8 @@ def __init__(
backend: str = "",
mode: str = "plugin",
data_type: str = "float16",
max_batch_size: int = 2048,
max_num_tokens: int = 8192,
max_batch_size: int = 512,
max_num_tokens: int = 2048,
gpu_weights_percent: float = -1,
batch_sizes: List[int] = [0],
input_lens: List[int] = [8],
Expand Down Expand Up @@ -601,7 +601,7 @@ def validate(self):
if self.model_name in MODEL_PATH_DICT.keys():
VALID_QUANTS = [
"", "nvfp4", "fp8", "int8_sq", "int4_awq", "w4a8_awq",
"int8_wo", "int4_wo", "full_prec"
"w4a16_awq", "int8_wo", "int4_wo", "full_prec"
]
else:
VALID_QUANTS = [
Expand Down
14 changes: 14 additions & 0 deletions tests/integration/defs/perf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,20 @@ def run_cmd(self, cmd_idx: int, venv) -> str:
benchmark_cmd = mpi_cmd + command
output += subprocess.check_output(benchmark_cmd,
env=envs).decode()
# write config.json to output log
match = re.search(r'--engine_dir=([^\s]+)', current_cmd_str)
if match:
engine_dir = match.group(1)
print_info(
f'writing config.json in {engine_dir} to output log')
with open(os.path.join(engine_dir, "config.json"),
"r") as f:
config_content = f.read()
output += "\n" + "=" * 50 + "\n"
output += "ENGINE CONFIG:\n"
output += "=" * 50 + "\n"
output += config_content
output += "\n" + "=" * 50 + "\n"
return output

def get_cmd_str(self, cmd_idx) -> List[str]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,15 @@ trt_llm_release_perf_cluster_test:

# GB chip specific tests with high memory
- condition:
ranges:
system_gpu_count:
gte: 8
gpu_memory:
gt: 100000
wildcards:
chip: 'gb*'
gpu:
- '*b100*'
linux_distribution_name: '*'
tests:
- perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:512-input_output_len:128,128-quant:fp8-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:1-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-tp:8-gpus:8] #min latency test
- perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:fp8-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:512-input_output_len:128,128-quant:nvfp4-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float16-maxbs:512-input_output_len:1000,1000-quant:nvfp4-con:4096-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8] #min latency test
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
31 changes: 6 additions & 25 deletions tests/integration/test_lists/qa/trt_llm_release_perf_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ trt_llm_release_perf_test:
tests:
- perf/test_perf.py::test_perf[llama_70b_sq_per_tensor-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-gpus:2]
- perf/test_perf.py::test_perf[mixtral_8x7b-cppmanager-exe-plugin_ifb-float16-mp-input_output_len:128,128+512,32-gpus:2]
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:256-input_output_len:128,128+512,32-gpus:2]
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128+512,32-gpus:2]
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-streaming-float16-input_output_len:128,128-gpus:2]

Expand Down Expand Up @@ -123,8 +122,8 @@ trt_llm_release_perf_test:
tests:
- perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:32-input_output_len:128,128-quant:fp8-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-streaming-float16-maxbs:32-input_output_len:128,128-quant:fp8-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:1-input_output_len:1000,2000-reqs:10-quant:fp8-con:1-ep:4-tp:8-gpus:8] #min latency test
- perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:128-maxnt:1127-input_output_len:1000,2000-reqs:5120-quant:fp8-con:1024-ep:8-tp:8-gpus:8] #max throughput test
- perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:1-input_output_len:1000,2000-quant:fp8-reqs:10-con:1-ep:4-tp:8-gpus:8] #min latency test
- perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:128-maxnt:1127-input_output_len:1000,2000-quant:fp8-reqs:5120-con:1024-ep:8-tp:8-gpus:8] #max throughput test

# FP8 specific tests
- condition:
Expand All @@ -134,7 +133,7 @@ trt_llm_release_perf_test:
- perf/test_perf.py::test_perf[llama_v3.1_8b-cppmanager-exe-plugin_ifb-float16-maxbs:256-input_output_len:128,128-beams:4-quant:fp8]

- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-input_output_len:128,128-quant:fp8]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-input_output_len:128,128-quant:int4_awq]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-input_output_len:128,128-quant:w4a16_awq]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-input_output_len:128,128-quant:w4a8_awq]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-maxbs:256-input_output_len:128,128-quant:fp8]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-maxbs:256-input_output_len:512,32-quant:fp8]
Expand Down Expand Up @@ -176,7 +175,6 @@ trt_llm_release_perf_test:
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-quant:fp8-gpus:2]
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:512,32-quant:fp8-gpus:2]
- perf/test_perf.py::test_perf[llama_v3.2_11b-bench-float16-input_output_len:512,200-quant:fp8-gpus:2]
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-input_output_len:128,128-quant:fp8-tp:2]

- condition:
terms:
Expand All @@ -203,28 +201,11 @@ trt_llm_release_perf_test:
# GB chip specific tests
- condition:
wildcards:
chip: 'gb*'
gpu:
- '*b100*'
- '*b40*'
linux_distribution_name: '*'
tests:
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-input_output_len:128,128-quant:nvfp4]
- perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-float16-input_output_len:128,128-quant:nvfp4]
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8]

# GB200 chip specific tests
- condition:
ranges:
system_gpu_count:
gte: 8
gpu_memory:
gt: 100000
wildcards:
chip: 'gb*'
linux_distribution_name: '*'
tests:
- perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:512-input_output_len:128,128-quant:fp8-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:1-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-tp:8-gpus:8] #min latency test
- perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:fp8-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:512-input_output_len:128,128-quant:nvfp4-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float16-maxbs:512-input_output_len:1000,1000-quant:nvfp4-con:4096-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8] #min latency test
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
17 changes: 17 additions & 0 deletions tests/integration/test_lists/waives.txt
Original file line number Diff line number Diff line change
Expand Up @@ -453,5 +453,22 @@ examples/test_medusa.py::test_llama_medusa_1gpu[llama-v2-7b-hf] SKIP (https://nv
examples/test_medusa.py::test_llama_medusa_1gpu[llama-3.2-1b] SKIP (https://nvbugs/5219534)
examples/test_medusa.py::test_llama_medusa_1gpu[llama-3.1-8b] SKIP (https://nvbugs/5219535)
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16] SKIP (https://nvbugspro.nvidia.com/bug/5226339)
examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5227342)
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5201168)
perf/test_perf.py::test_perf[gpt_350m-cppmanager-plugin-float16-bs:32-input_output_len:60,20] SKIP (https://nvbugs/5228840)
perf/test_perf.py::test_perf[gpt_350m-cppmanager-static_batching-plugin-float16-bs:32-input_output_len:60,20] SKIP (https://nvbugs/5228840)
perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20] SKIP # https://nvbugspro.nvidia.com/bug/5207477
perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20] SKIP
perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20] SKIP
perf/test_perf.py::test_perf[flan_t5_large-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] SKIP
perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20] SKIP
perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:128,128] SKIP
perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:512,32] SKIP
perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:128,128] SKIP
perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:512,32] SKIP
perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2] SKIP
perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] SKIP
perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8] SKIP
perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] SKIP
full:B40/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8] SKIP #https://nvbugspro.nvidia.com/bug/5150255
full:B200/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8] SKIP #https://nvbugspro.nvidia.com/bug/5150255