From 6ffc9b33bfead5d2903019f73499074c669d6568 Mon Sep 17 00:00:00 2001
From: Onur Yilmaz
Date: Tue, 4 Nov 2025 13:11:30 -0500
Subject: [PATCH 1/2] Add max-model-len param for vLLM

Signed-off-by: Onur Yilmaz
---
 nemo_export/vllm_exporter.py             | 4 ++++
 scripts/deploy/nlp/deploy_vllm_triton.py | 8 ++++++++
 2 files changed, 12 insertions(+)

diff --git a/nemo_export/vllm_exporter.py b/nemo_export/vllm_exporter.py
index a07883e126..b787825f59 100644
--- a/nemo_export/vllm_exporter.py
+++ b/nemo_export/vllm_exporter.py
@@ -116,6 +116,7 @@ def export(
         cpu_offload_gb: float = 0,
         enforce_eager: bool = False,
         max_seq_len_to_capture: int = 8192,
+        max_model_len: int = 8192,
         task: Literal["auto", "generate", "embedding"] = "auto",
     ):
         """
@@ -135,6 +136,7 @@
             cpu_offload_gb (float, optional): Amount of CPU offload memory (in GB). Defaults to 0.
             enforce_eager (bool, optional): Whether to enforce eager execution. Defaults to False.
             max_seq_len_to_capture (int, optional): Maximum sequence length to capture. Defaults to 8192.
+            max_model_len (int, optional): Maximum model length. Defaults to 8192.
             task (Literal["auto", "generate", "embedding"], optional): Task type for vLLM. Defaults to "auto".
 
         Raises:
@@ -171,6 +173,7 @@
                 cpu_offload_gb=cpu_offload_gb,
                 enforce_eager=enforce_eager,
                 max_seq_len_to_capture=max_seq_len_to_capture,
+                max_model_len=max_model_len,
                 task=task,
             )
         else:
@@ -188,6 +191,7 @@
                 cpu_offload_gb=cpu_offload_gb,
                 enforce_eager=enforce_eager,
                 max_seq_len_to_capture=max_seq_len_to_capture,
+                max_model_len=max_model_len,
                 task=task,
             )
 
diff --git a/scripts/deploy/nlp/deploy_vllm_triton.py b/scripts/deploy/nlp/deploy_vllm_triton.py
index 8e924ea56e..69e6febfec 100755
--- a/scripts/deploy/nlp/deploy_vllm_triton.py
+++ b/scripts/deploy/nlp/deploy_vllm_triton.py
@@ -95,6 +95,13 @@ def get_args(argv):
         type=float,
         help="GPU memory utilization percentage for vLLM.",
     )
+    parser.add_argument(
+        "-mml",
+        "--max_model_len",
+        default=8192,
+        type=int,
+        help="Max model length for vLLM.",
+    )
     parser.add_argument(
         "-sp",
         "--swap_space",
@@ -198,6 +205,7 @@ def nemo_deploy(argv):
         cpu_offload_gb=args.cpu_offload_gb,
         enforce_eager=args.enforce_eager,
         max_seq_len_to_capture=args.max_seq_len_to_capture,
+        max_model_len=args.max_model_len,
         task="generate",
     )
 

From 4d16afaef0c8b79f8c8f7987093c4dcb0cbaa8d3 Mon Sep 17 00:00:00 2001
From: Onur Yilmaz
Date: Wed, 5 Nov 2025 14:48:29 -0500
Subject: [PATCH 2/2] Fix minor test

Signed-off-by: Onur Yilmaz
---
 tests/unit_tests/export/test_vllm_exporter.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/unit_tests/export/test_vllm_exporter.py b/tests/unit_tests/export/test_vllm_exporter.py
index bf22635fff..f34d56df92 100644
--- a/tests/unit_tests/export/test_vllm_exporter.py
+++ b/tests/unit_tests/export/test_vllm_exporter.py
@@ -71,6 +71,7 @@ def test_export(exporter, mock_llm):
         cpu_offload_gb=0,
         enforce_eager=False,
         max_seq_len_to_capture=8192,
+        max_model_len=8192,
         task="auto",
     )
 
@@ -97,6 +98,7 @@ def test_export_with_lora(exporter, mock_llm):
         cpu_offload_gb=0,
         enforce_eager=False,
         max_seq_len_to_capture=8192,
+        max_model_len=8192,
         task="auto",
     )
 
@@ -123,6 +125,7 @@ def test_export_with_custom_params(exporter, mock_llm):
         cpu_offload_gb=0,
         enforce_eager=False,
         max_seq_len_to_capture=8192,
+        max_model_len=8192,
         task="auto",
     )
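
A minimal usage sketch of the new parameter follows. The exporter class name (vLLMExporter) and the
checkpoint argument are assumptions for illustration and are not part of the patch; only
cpu_offload_gb, enforce_eager, max_seq_len_to_capture, max_model_len, and task appear in the changed
export() signature. In vLLM, max_model_len caps the total tokens per sequence (prompt plus generated
output), and this patch defaults it to 8192.

# Sketch only: the class name and the checkpoint argument below are assumptions, not taken from the patch.
from nemo_export.vllm_exporter import vLLMExporter

exporter = vLLMExporter()
exporter.export(
    nemo_checkpoint="/path/to/model.nemo",  # hypothetical argument name and path
    cpu_offload_gb=0,
    enforce_eager=False,
    max_seq_len_to_capture=8192,
    max_model_len=4096,  # new parameter added by this patch; defaults to 8192
    task="generate",
)

From the command line, the patched deploy script exposes the same setting as -mml / --max_model_len,
e.g. python scripts/deploy/nlp/deploy_vllm_triton.py --max_model_len 4096 (other required arguments
omitted here).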