diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index c06857247eee..4d2ea126b24a 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -123,7 +123,7 @@ def run_to_completion(profile_dir: Optional[str] = None): save_to_pytorch_benchmark_format(args, results) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the latency of processing a single batch of " "requests till completion." @@ -171,6 +171,12 @@ def run_to_completion(profile_dir: Optional[str] = None): # V1 enables prefix caching by default which skews the latency # numbers. We need to disable prefix caching by default. parser.set_defaults(enable_prefix_caching=False) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: raise OSError( diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 00869fa94e71..6e0f3b51c9d2 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -142,7 +142,7 @@ def main(args): ) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the performance with or " "without automatic prefix caching." 
@@ -192,5 +192,11 @@ def main(args): ) parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 3e4704f0b820..b5e2613de1cd 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -218,7 +218,7 @@ def main(args): ) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the performance with or without " "automatic prefix caching." @@ -268,5 +268,11 @@ def main(args): ) parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 5496703f23cc..bb453791c186 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -161,7 +161,7 @@ def main(args: argparse.Namespace): json.dump(results, f, indent=4) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument( "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm" @@ -204,6 +204,12 @@ def main(args: argparse.Namespace): ) parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 81428fb7dae1..f38e45b26113 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -875,7 +875,7 @@ def main(args: argparse.Namespace): save_to_pytorch_benchmark_format(args, result_json, file_name) -if __name__ == "__main__": +def 
create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the online serving throughput." ) @@ -1225,6 +1225,11 @@ def main(args: argparse.Namespace): "script chooses a LoRA module at random.", ) - args = parser.parse_args() + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() + args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index c1501ad52c25..e23a5a9e2233 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -850,7 +850,7 @@ def main(args: argparse.Namespace): json.dump(results, outfile, indent=4) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the online serving throughput." ) @@ -1034,5 +1034,10 @@ def main(args: argparse.Namespace): help="Ratio of Structured Outputs requests", ) + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index d19753d40e49..401ebe0bdb26 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -595,7 +595,7 @@ def validate_args(args): ) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument( "--backend", @@ -717,6 +717,12 @@ def validate_args(args): ) parser = AsyncEngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model