4 changes: 4 additions & 0 deletions nemo_export/vllm_exporter.py
@@ -116,6 +116,7 @@ def export(
cpu_offload_gb: float = 0,
enforce_eager: bool = False,
max_seq_len_to_capture: int = 8192,
+max_model_len: int = 8192,
Contributor:

Thank you @oyilmaz-nvidia. For my understanding, why is this param needed now? Is it newly introduced by vLLM?

Contributor (Author):

We didn't really need to set this parameter until now, but some of the large models, like Llama 70B, might need tuning to fit the model onto the GPUs. CI is now erroring out when trying to fit this model (it worked before, but with newer versions of vLLM we may need to tune it).
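
For anyone tuning this later, here is a minimal sketch of what the knob controls at the vLLM level. The model name and all values are illustrative, not taken from this PR:

```python
# Illustrative sketch: max_model_len caps the context length vLLM plans for,
# which shrinks the KV-cache blocks it pre-allocates. Lowering it is often
# what lets a 70B-class checkpoint fit on the available GPUs.
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-2-70b-hf",  # illustrative checkpoint
    tensor_parallel_size=4,             # shard the model across 4 GPUs
    gpu_memory_utilization=0.9,         # fraction of GPU memory vLLM may claim
    max_model_len=4096,                 # below the 8192 default to save KV-cache memory
)
```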

task: Literal["auto", "generate", "embedding"] = "auto",
):
"""
@@ -135,6 +136,7 @@ def export(
cpu_offload_gb (float, optional): Amount of CPU offload memory (in GB). Defaults to 0.
enforce_eager (bool, optional): Whether to enforce eager execution. Defaults to False.
max_seq_len_to_capture (int, optional): Maximum sequence length to capture. Defaults to 8192.
+max_model_len (int, optional): Maximum model context length. Defaults to 8192.
task (Literal["auto", "generate", "embedding"], optional): Task type for vLLM. Defaults to "auto".

Raises:
@@ -171,6 +173,7 @@ def export(
cpu_offload_gb=cpu_offload_gb,
enforce_eager=enforce_eager,
max_seq_len_to_capture=max_seq_len_to_capture,
+max_model_len=max_model_len,
task=task,
)
else:
@@ -188,6 +191,7 @@ def export(
cpu_offload_gb=cpu_offload_gb,
enforce_eager=enforce_eager,
max_seq_len_to_capture=max_seq_len_to_capture,
+max_model_len=max_model_len,
task=task,
)

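A hedged usage sketch of the updated export() call. The keyword arguments shown match the signature in the diff above; the vLLMExporter class name, its no-argument constructor, and the nemo_checkpoint argument are assumptions for illustration:

```python
# Usage sketch (assumed class name, constructor, and checkpoint argument;
# the keyword arguments match the export() signature in the diff above).
from nemo_export.vllm_exporter import vLLMExporter  # import path assumed from the file path

exporter = vLLMExporter()
exporter.export(
    nemo_checkpoint="/models/llama-70b.nemo",  # assumed argument name and path
    enforce_eager=False,
    max_seq_len_to_capture=8192,
    max_model_len=4096,  # tune below the 8192 default when GPU memory is tight
    task="generate",
)
```
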
8 changes: 8 additions & 0 deletions scripts/deploy/nlp/deploy_vllm_triton.py
@@ -95,6 +95,13 @@ def get_args(argv):
type=float,
help="GPU memory utilization percentage for vLLM.",
)
+parser.add_argument(
+"-mml",
+"--max_model_len",
+default=8192,
+type=int,
+help="Max model length for vLLM.",
+)
parser.add_argument(
"-sp",
"--swap_space",
@@ -198,6 +205,7 @@ def nemo_deploy(argv):
cpu_offload_gb=args.cpu_offload_gb,
enforce_eager=args.enforce_eager,
max_seq_len_to_capture=args.max_seq_len_to_capture,
+max_model_len=args.max_model_len,
task="generate",
)

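To close the loop, a runnable mini-sketch of the new flag's plumbing, reduced from the diff above (the flag definition comes from the diff; the parse_args input and the surrounding script structure are simplified for illustration). On the command line this corresponds to invoking the script as, e.g., `python scripts/deploy/nlp/deploy_vllm_triton.py ... --max_model_len 4096` with the other flags elided:

```python
# Mini-sketch of the flag's path from argparse to the exporter call.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-mml",
    "--max_model_len",
    default=8192,
    type=int,
    help="Max model length for vLLM.",
)
args = parser.parse_args(["--max_model_len", "4096"])  # illustrative CLI input
assert args.max_model_len == 4096  # later forwarded as max_model_len=args.max_model_len
```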