Merged
tests/integration/defs/perf/test_perf.py (15 changes: 14 additions & 1 deletion)

@@ -375,6 +375,7 @@ def __init__(
         tp_size: int = 1,
         pp_size: int = 1,
         num_gpus: int = 1,
+        kv_cache_free_gpu_mem_fraction: float = 0.9,
     ):
         # The model name.
         self.model_name = model_name
@@ -428,6 +429,8 @@ def __init__(
         self.num_gpus = num_gpus
         # Just build engines
         self.build_only = False
+        # kv cache free gpu mem fraction
+        self.kv_cache_free_gpu_mem_fraction = kv_cache_free_gpu_mem_fraction

     def to_string(self,
                   custom_bs: int = None,
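Because the new keyword defaults to 0.9 (the same sentinel value used throughout this diff), configs that never pass it behave exactly as before; only tests that opt in get a different fraction. A hypothetical construction, where the import path and argument values are illustrative only:

    from defs.perf.test_perf import PerfTestConfig  # illustrative import path

    # Only tests that opt in pass a non-default fraction.
    config = PerfTestConfig(model_name="llama_v4_maverick_17b_128e_instruct_fp8",
                            tp_size=8,
                            num_gpus=8,
                            kv_cache_free_gpu_mem_fraction=0.6)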
@@ -541,6 +544,10 @@ def to_string(
         if self.num_gpus > 1:
             entries.append(f"gpus:{self.num_gpus}")

+        # Add kv cache free gpu mem fraction.
+        if self.kv_cache_free_gpu_mem_fraction != 0.9:
+            entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}")
+
         # Concatenate labels with "-".
         return "-".join(entries)
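to_string() keeps the file's existing convention: a label is appended only when the value differs from its default, so the names of existing 0.9-fraction tests do not change. A standalone sketch of that pattern (the preceding labels are stubbed out; this is not the full method):

    def encode_tail_labels(num_gpus: int = 1, kv_frac: float = 0.9) -> str:
        # Append a label only when the value differs from its default,
        # mirroring the to_string() logic in the hunk above.
        entries = ["example_case"]  # stub for the labels built earlier
        if num_gpus > 1:
            entries.append(f"gpus:{num_gpus}")
        if kv_frac != 0.9:
            entries.append(f"kv_frac:{kv_frac}")
        return "-".join(entries)

    print(encode_tail_labels(num_gpus=8, kv_frac=0.6))  # example_case-gpus:8-kv_frac:0.6
    print(encode_tail_labels())                         # example_case (defaults add no labels)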
@@ -648,6 +655,11 @@ def load_from_str(self, test_param_labels) -> None:
         self.num_gpus = 1 if not labels[0].startswith("gpus:") else int(
             labels.pop(0).replace("gpus:", ""))

+        if len(labels) > 0:
+            self.kv_cache_free_gpu_mem_fraction = 0.9 if not labels[
+                0].startswith("kv_frac:") else float(
+                    labels.pop(0).replace("kv_frac:", ""))
+
         assert len(
             labels
         ) == 0, f"Invalid test name! Some labels cannot be parsed: {labels}"
@@ -998,7 +1010,8 @@ def get_trtllm_bench_build_command(self, engine_dir) -> list:
             f"--workspace={engine_dir}", f"--model={hf_model_name}",
             f"--model_path={model_dir}", "build", f"--dataset={dataset_path}",
             f"--tp_size={self._config.tp_size}",
-            f"--pp_size={self._config.pp_size}"
+            f"--pp_size={self._config.pp_size}",
+            f"--kv_cache_free_gpu_mem_fraction={self._config.kv_cache_free_gpu_mem_fraction}"
         ]
         max_seq_len = max(self._config.input_lens) + max(
             self._config.output_lens)
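With the flag appended, the fraction carried by the config is forwarded to the trtllm-bench build step. A sketch of how the argument list renders for a 0.6-fraction run (paths and model names are placeholders; the harness prepends the trtllm-bench executable elsewhere):

    engine_dir = "/tmp/engines"                                # placeholder
    hf_model_name = "llama_v4_maverick_17b_128e_instruct_fp8"  # placeholder
    model_dir = "/models/maverick"                             # placeholder
    dataset_path = "/tmp/synthetic_dataset.json"               # placeholder
    tp_size, pp_size, kv_frac = 8, 1, 0.6

    build_cmd = [
        f"--workspace={engine_dir}", f"--model={hf_model_name}",
        f"--model_path={model_dir}", "build", f"--dataset={dataset_path}",
        f"--tp_size={tp_size}", f"--pp_size={pp_size}",
        f"--kv_cache_free_gpu_mem_fraction={kv_frac}",
    ]
    print(" ".join(build_cmd))
    # --workspace=/tmp/engines --model=llama_v4_maverick_17b_128e_instruct_fp8
    # --model_path=/models/maverick build --dataset=/tmp/synthetic_dataset.json
    # --tp_size=8 --pp_size=1 --kv_cache_free_gpu_mem_fraction=0.6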
tests/integration/test_lists/qa/trt_llm_release_perf_test.yml (22 changes: 12 additions & 10 deletions)

@@ -473,19 +473,21 @@ trt_llm_release_perf_test:

   #llama_v4_maverick_17b_128e_instruct_fp8
   #pytorch backend
-  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:2000,500-reqs:3000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:2000,500-reqs:3000-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  #rcca case
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8-kv_frac:0.6]

   #llama_v4_scout_17b_16e_instruct_fp8
   #pytorch backend
-  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:2000,500-reqs:3000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:2000,500-reqs:3000-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-ep:8-tp:8-gpus:8-kv_frac:0.6]

   #deepseek_r1_fp8
   #pytorch backend
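Read right to left, the trailing labels of each updated entry map onto load_from_str()'s positional parsing; kv_frac:0.6 limits the KV-cache pool to 60% of free GPU memory for these 8-GPU Llama v4 runs. Decomposing one maverick entry (plain string splitting only, no harness code):

    name = ("llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-"
            "maxbs:1024-maxnt:4096-input_output_len:2000,500-reqs:3000-"
            "ep:8-tp:8-gpus:8-kv_frac:0.6")
    labels = name.split("-")
    print(labels[-1])  # kv_frac:0.6 -> kv_cache_free_gpu_mem_fraction = 0.6
    print(labels[-2])  # gpus:8      -> num_gpus = 8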