Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open source export and deploy modules #8743

Merged
merged 25 commits into from
Apr 6, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9926619
export and deploy modules
oyilmaz-nvidia Mar 25, 2024
e86e60e
Add export tests
oyilmaz-nvidia Mar 25, 2024
62c6cdd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 25, 2024
ef938b2
Address PR reviews
oyilmaz-nvidia Apr 2, 2024
2747fbe
Merge branch 'main' into oss-export-deploy
oyilmaz-nvidia Apr 2, 2024
300c780
Add try except
oyilmaz-nvidia Apr 2, 2024
cc8f98f
Moved query_llm to nlp folder
oyilmaz-nvidia Apr 2, 2024
6da56a6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 2, 2024
0e11a3d
removed lambada.json
oyilmaz-nvidia Apr 2, 2024
083f901
Merge branch 'oss-export-deploy' of https://github.com/oyilmaz-nvidia…
oyilmaz-nvidia Apr 2, 2024
680ebee
Reverting the Jenkinsfile
oyilmaz-nvidia Apr 3, 2024
f22e9e0
Exclude deploy and export from the pip
oyilmaz-nvidia Apr 3, 2024
648acb6
Merge branch 'main' into oss-export-deploy
oyilmaz-nvidia Apr 3, 2024
fb27b4e
Address the CodeQL issues
oyilmaz-nvidia Apr 4, 2024
15effc8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 4, 2024
75d57b8
Merge branch 'main' into oss-export-deploy
oyilmaz-nvidia Apr 4, 2024
c495af5
Addressing reviews
oyilmaz-nvidia Apr 4, 2024
07e34c8
remove deploy test for now
oyilmaz-nvidia Apr 5, 2024
4217047
Merge branch 'main' into oss-export-deploy
oyilmaz-nvidia Apr 5, 2024
541838d
Addressing CodeQL comments
oyilmaz-nvidia Apr 5, 2024
fd7a99c
Merge branch 'main' into oss-export-deploy
ericharper Apr 5, 2024
310dc49
wrap imports with try except
oyilmaz-nvidia Apr 5, 2024
9f8ef15
Merge branch 'oss-export-deploy' of https://github.com/oyilmaz-nvidia…
oyilmaz-nvidia Apr 5, 2024
b201bf5
Merge branch 'main' into oss-export-deploy
oyilmaz-nvidia Apr 5, 2024
56fd156
Add test data param and fix codeql issue
oyilmaz-nvidia Apr 5, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -2201,7 +2201,7 @@ pipeline {
}
}
}

stage('Punctuation & Capitalization tarred dataset') {
when {
anyOf {
Expand Down Expand Up @@ -2261,7 +2261,7 @@ pipeline {
}
}
}

stage('Punctuation & Capitalization, Different ways of passing labels to model') {
when {
anyOf {
Expand Down Expand Up @@ -5585,7 +5585,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results"
}
}

stage('L2: Megatron Mock Data Generation') {
when {
anyOf {
Expand Down Expand Up @@ -5815,4 +5815,4 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
cleanWs()
}
}
}
}
18 changes: 18 additions & 0 deletions nemo/deploy/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from nemo.deploy.deploy_base import DeployBase
from nemo.deploy.deploy_pytriton import DeployPyTriton
from nemo.deploy.triton_deployable import ITritonDeployable
109 changes: 109 additions & 0 deletions nemo/deploy/deploy_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
from abc import ABC, abstractmethod

try:
from pytorch_lightning import Trainer
except Exception:

Check notice

Code scanning / CodeQL

Empty except Note

'except' clause does nothing but pass and there is no explanatory comment.
pass

from nemo.deploy.triton_deployable import ITritonDeployable

# Flag recording whether ModelPT could be imported from NeMo.
use_nemo = True
try:
    from nemo.core.classes.modelPT import ModelPT
except Exception:
    # NeMo is treated as optional here: remember that ModelPT is unavailable
    # instead of failing at import time. Only Exception is handled so that
    # KeyboardInterrupt and SystemExit are not swallowed.
    use_nemo = False


class DeployBase(ABC):
    """
    Abstract base class for deploying a model to Triton Inference Server.

    Stores the deployment settings and provides helpers to restore a NeMo model
    from a checkpoint and to verify that the model implements ITritonDeployable.
    Subclasses have to implement deploy, serve, run and stop.
    """

    def __init__(
        self,
        triton_model_name: str,
        triton_model_version: int = 1,
        checkpoint_path: str = None,
        model=None,
        max_batch_size: int = 128,
        port: int = 8000,
        address: str = "0.0.0.0",
        allow_grpc: bool = True,
        allow_http: bool = True,
        streaming: bool = False,
        pytriton_log_verbose: int = 0,
    ):
        """
        Stores the deployment settings.

        Args:
            triton_model_name (str): Name for the service
            triton_model_version (int): Version for the service
            checkpoint_path (str): path of the nemo file
            model: in-memory model to deploy; it is expected to implement ITritonDeployable
            max_batch_size (int): max batch size
            port (int): port for the Triton server
            address (str): address for the Triton server to bind
            allow_grpc (bool): stored as-is for the subclass to use
            allow_http (bool): stored as-is for the subclass to use
            streaming (bool): stored as-is for the subclass to use
            pytriton_log_verbose (int): stored as-is for the subclass to use

        Raises:
            Exception: if neither checkpoint_path nor model is provided.
        """
        if checkpoint_path is None and model is None:
            raise Exception("Either checkpoint_path or model should be provided.")

        self.checkpoint_path = checkpoint_path
        self.triton_model_name = triton_model_name
        self.triton_model_version = triton_model_version
        self.max_batch_size = max_batch_size
        self.model = model
        self.port = port
        self.address = address
        # Set by the subclass once the Triton instance has been created.
        self.triton = None
        self.allow_grpc = allow_grpc
        self.allow_http = allow_http
        self.streaming = streaming
        self.pytriton_log_verbose = pytriton_log_verbose

    @abstractmethod
    def deploy(self):
        """Prepares the model for serving; implemented by subclasses."""

    @abstractmethod
    def serve(self):
        """Starts serving the model; implemented by subclasses."""

    @abstractmethod
    def run(self):
        """Starts serving the model asynchronously; implemented by subclasses."""

    @abstractmethod
    def stop(self):
        """Stops serving the model; implemented by subclasses."""

    def _init_nemo_model(self):
        """
        Restores the model from checkpoint_path (when given) and validates it.

        Raises:
            Exception: if NeMo could not be imported although a checkpoint has to
                be restored, if there is no model at all, or if the model does
                not implement ITritonDeployable.
        """
        if self.checkpoint_path is not None:
            if not use_nemo:
                # Without this check the failure would be a NameError on ModelPT.
                raise Exception("NeMo could not be imported, so the model cannot be restored from checkpoint_path.")

            # The checkpoint config names the concrete model class to restore with.
            model_config = ModelPT.restore_from(self.checkpoint_path, return_config=True)
            module_path, class_name = DeployBase.get_module_and_class(model_config.target)
            cls = getattr(importlib.import_module(module_path), class_name)
            self.model = cls.restore_from(restore_path=self.checkpoint_path, trainer=Trainer())
            self.model.freeze()

            # has to turn off activations_checkpoint_method for inference
            try:
                self.model.model.language_model.encoder.activations_checkpoint_method = None
            except AttributeError:
                # The restored model does not expose this attribute path, so
                # there is nothing to turn off.
                pass

        if self.model is None:
            raise Exception("There is no model to deploy.")

        self._is_model_deployable()

    def _is_model_deployable(self):
        """
        Checks that self.model implements ITritonDeployable.

        Returns:
            bool: True when the model is deployable.

        Raises:
            Exception: if the model does not inherit from ITritonDeployable.
        """
        if not isinstance(self.model, ITritonDeployable):
            raise Exception(
                "This model is not deployable to Triton. nemo.deploy.ITritonDeployable class should be inherited"
            )
        return True

    @staticmethod
    def get_module_and_class(target: str):
        """
        Splits a dotted path such as "package.module.ClassName" at its last dot.

        Args:
            target (str): fully qualified class path

        Returns:
            tuple: (module path, class name)

        Raises:
            ValueError: if target contains no dot.
        """
        module_path, separator, class_name = target.rpartition(".")
        if not separator:
            raise ValueError(f"'{target}' is not a dotted 'module.ClassName' path.")
        return module_path, class_name
182 changes: 182 additions & 0 deletions nemo/deploy/deploy_pytriton.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


try:
    from pytriton.model_config import ModelConfig
    from pytriton.triton import Triton, TritonConfig
except Exception:
    # pytriton is treated as optional at import time, so this module can still
    # be imported without it. Only Exception is handled so that
    # KeyboardInterrupt and SystemExit are not swallowed.
    pass
from nemo.deploy.deploy_base import DeployBase


class DeployPyTriton(DeployBase):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The classname has insufficient context. Deploy Pytriton with which model? We are planning to deploy pytriton for streaming asr and tts too as new tools, so please call this class DeployPytritonLLM or something that denotes what it is deploying.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no need for a specific model. The goal is to support the deployment of any model. You can pass either a NeMo checkpoint via the checkpoint_path param or an in-memory model via the model param. Full support for in-memory models will be added later, though.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, so you're saying the same code for streaming will apply to both LLM and ASR? Then there is no need to act on this comment, but let's keep it unresolved so the ASR team can look into it later.


"""
Deploys any models to Triton Inference Server that implements ITritonDeployable interface in nemo.deploy.

Example:
from nemo.deploy import DeployPyTriton
from nemo.deploy.nlp import NemoQueryLLM
from nemo.export import TensorRTLLM

trt_llm_exporter = TensorRTLLM(model_dir="/path/for/model/files")
trt_llm_exporter.export(
nemo_checkpoint_path="/path/for/nemo/checkpoint",
model_type="llama",
n_gpus=1,
)

nm = DeployPyTriton(model=trt_llm_exporter, triton_model_name="model_name", port=8000)
nm.deploy()
nm.run()
nq = NemoQueryLLM(url="localhost", model_name="model_name")

prompts = ["hello, testing GPT inference", "another GPT inference test?"]
output = nq.query_llm(prompts=prompts, max_output_len=100)
print("prompts: ", prompts)
print("")
print("output: ", output)
print("")

prompts = ["Give me some info about Paris", "Do you think London is a good city to visit?", "What do you think about Rome?"]
output = nq.query_llm(prompts=prompts, max_output_len=250)
print("prompts: ", prompts)
print("")
print("output: ", output)
print("")

"""

def __init__(
    self,
    triton_model_name: str,
    triton_model_version: int = 1,
    checkpoint_path: str = None,
    model=None,
    max_batch_size: int = 128,
    port: int = 8000,
    address="0.0.0.0",
    allow_grpc=True,
    allow_http=True,
    streaming=False,
    pytriton_log_verbose=0,
):
    """
    Collects the settings for serving a nemo checkpoint or an in-memory model
    on Triton Inference Server and hands them to the base class.

    Args:
        triton_model_name (str): Name for the service
        triton_model_version (int): Version for the service
        checkpoint_path (str): path of the nemo file
        model (ITritonDeployable): A model that implements the ITritonDeployable from nemo.deploy import ITritonDeployable
        max_batch_size (int): max batch size
        port (int): port for the Triton server
        address (str): address for Triton server to bind
        allow_grpc (bool): value used for the allow_grpc option of the Triton config
        allow_http (bool): value used for the allow_http option of the Triton config
        streaming (bool): selects the streaming inference function of the model when True
        pytriton_log_verbose (int): pytriton log verbosity level
    """

    # Every argument is forwarded unchanged, by keyword, to DeployBase.
    base_settings = dict(
        triton_model_name=triton_model_name,
        triton_model_version=triton_model_version,
        checkpoint_path=checkpoint_path,
        model=model,
        max_batch_size=max_batch_size,
        port=port,
        address=address,
        allow_grpc=allow_grpc,
        allow_http=allow_http,
        streaming=streaming,
        pytriton_log_verbose=pytriton_log_verbose,
    )
    super().__init__(**base_settings)

def deploy(self):
    """
    Deploys any models to Triton Inference Server.

    Initializes the model, creates the Triton instance and binds the model to
    it. With streaming enabled the streaming inference function is bound with a
    decoupled model config; otherwise the regular inference function is bound
    with the configured max batch size.

    If creating or binding the Triton instance fails, the error is printed and
    self.triton is reset to None. Errors raised while initializing the model
    are not caught here.
    """

    self._init_nemo_model()

    try:
        # Resolve everything that depends on the mode first, so the Triton
        # instance is only created once all of its inputs are known to exist.
        if self.streaming:
            # TODO: can't set allow_http=True due to a bug in pytriton, will fix in latest pytriton
            triton_config = TritonConfig(
                log_verbose=self.pytriton_log_verbose,
                allow_grpc=self.allow_grpc,
                allow_http=self.allow_http,
                grpc_address=self.address,
            )
            infer_func = self.model.triton_infer_fn_streaming
            model_config = ModelConfig(decoupled=True)
        else:
            triton_config = TritonConfig(
                # Honour the requested verbosity in this mode as well.
                log_verbose=self.pytriton_log_verbose,
                http_address=self.address,
                http_port=self.port,
                allow_grpc=self.allow_grpc,
                allow_http=self.allow_http,
            )
            infer_func = self.model.triton_infer_fn
            model_config = ModelConfig(max_batch_size=self.max_batch_size)

        self.triton = Triton(config=triton_config)
        self.triton.bind(
            model_name=self.triton_model_name,
            model_version=self.triton_model_version,
            infer_func=infer_func,
            inputs=self.model.get_triton_input,
            outputs=self.model.get_triton_output,
            config=model_config,
        )
    except Exception as e:
        # Best effort: leave the object in the "not deployed" state so that
        # serve/run/stop refuse to proceed, and report the reason.
        self.triton = None
        print(e)

def serve(self):
    """
    Serves the deployed model through the Triton instance created by deploy.

    If serving raises, the error is printed and self.triton is reset to None.

    Raises:
        Exception: if there is no Triton instance, i.e. deploy was not called
            or did not succeed.
    """

    server = self.triton
    if server is None:
        raise Exception("deploy should be called first.")

    try:
        server.serve()
    except Exception as err:
        # Drop the instance so later calls ask for a new deploy.
        self.triton = None
        print(err)

def run(self):
    """
    Runs the deployed model asynchronously through the Triton instance.

    Raises:
        Exception: if there is no Triton instance, i.e. deploy was not called
            or did not succeed.
    """

    server = self.triton
    if server is None:
        raise Exception("deploy should be called first.")

    server.run()

def stop(self):
    """
    Shuts down serving by stopping the Triton instance.

    Raises:
        Exception: if there is no Triton instance, i.e. deploy was not called
            or did not succeed.
    """

    server = self.triton
    if server is None:
        raise Exception("deploy should be called first.")

    server.stop()
19 changes: 19 additions & 0 deletions nemo/deploy/nlp/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


try:
    from nemo.deploy.nlp.query_llm import NemoQueryLLM
except Exception:
    # Keep this package importable when query_llm cannot be imported
    # (presumably because one of its dependencies is missing - TODO confirm).
    # Only Exception is handled so that KeyboardInterrupt and SystemExit are
    # not swallowed.
    pass
Loading
Loading