Merge branch 'NVIDIA:main' into main
Eddie-Wang1120 authored Jan 15, 2024
2 parents 91c3935 + 12e82e3 commit ca6b703
Showing 253 changed files with 158,334 additions and 5,198 deletions.
19 changes: 10 additions & 9 deletions README.md
@@ -45,6 +45,8 @@ H200 is now 2.4x faster on Llama-70B with recent improvements to TensorRT-LLM GQ

- [TensorRT-LLM Overview](#tensorrt-llm-overview)
- [Installation](#installation)
- [Linux](./docs/source/installation.md)
- [Windows](windows/README.md)
- [Quick Start](#quick-start)
- [Support Matrix](#support-matrix)
- [Devices](#devices)
@@ -108,23 +110,20 @@ concepts used in TensorRT-LLM, we recommend you to read the following

## Installation

The documentation for installing TensorRT-LLM can be found
[here](./docs/source/installation.md). An image of a Docker container with
TensorRT-LLM and its Triton Inference Server Backend will be made available
soon.

The remaining commands in that document must be executed from the TensorRT-LLM
container.

*For Linux installation, see [`Linux`](./docs/source/installation.md).*
*For Windows installation, see [`Windows`](windows/README.md).*

Once installed, commands to build and run LLMs must be executed from the TensorRT-LLM container.
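Whether you follow the Linux or Windows flow, a typical session starts by entering that container. Below is a minimal sketch, assuming a local TensorRT-LLM development image; the image tag is a placeholder for whatever you built or pulled during installation, not an official name.

```bash
# Enter the TensorRT-LLM container; the image tag below is a placeholder.
docker run --rm -it --gpus all \
    -v "$(pwd)":/workspace/TensorRT-LLM \
    -w /workspace/TensorRT-LLM \
    tensorrt_llm/devel:latest bash

# All build and run commands in the rest of this README are executed
# inside this shell.
```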

## Quick Start

Please be sure to complete the [installation steps](#installation) before proceeding.

To create a TensorRT engine for an existing model, there are 3 steps:

1. Download pre-trained weights,
2. Build a fully-optimized engine of the model,
3. Deploy the engine.
3. Deploy the engine, in other words, run the fully-optimized model.

The following sections show how to use TensorRT-LLM to run the
[BLOOM-560m](https://huggingface.co/bigscience/bloom-560m) model.
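As a preview, the three steps look roughly like the sketch below for BLOOM-560m. The script names and flags are illustrative and may differ between releases; treat `examples/bloom` as the authoritative reference.

```bash
# 1. Download pre-trained weights from the Hugging Face Hub.
git lfs install
git clone https://huggingface.co/bigscience/bloom-560m ./bloom/560M

# 2. Build a fully-optimized engine (flags are illustrative; see
#    examples/bloom for the full, current list).
python examples/bloom/build.py --model_dir ./bloom/560M \
    --dtype float16 \
    --output_dir ./bloom/560M/trt_engines/fp16/1-gpu/

# 3. Deploy the engine, i.e. run the fully-optimized model, here via
#    the BLOOM summarization example (script name is illustrative).
python examples/bloom/summarize.py --test_trt_llm \
    --engine_dir ./bloom/560M/trt_engines/fp16/1-gpu/
```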
@@ -181,6 +180,8 @@ the example [folder](examples/bloom). Many more [models](#models) than BLOOM
are implemented in TensorRT-LLM. They can be found in the
[examples](./examples/) directory.

Beyond local execution, you can also use the NVIDIA Triton Inference Server to create a production-ready deployment of your LLM as described in this [blog](https://developer.nvidia.com/blog/optimizing-inference-on-llms-with-tensorrt-llm-now-publicly-available/).

## Support Matrix

TensorRT-LLM optimizes the performance of a range of well-known models on
4 changes: 3 additions & 1 deletion benchmarks/cpp/README.md
@@ -59,7 +59,9 @@ mpirun -n 8 ./benchmarks/gptSessionBenchmark \
# [BENCHMARK] batch_size 1 input_length 60 output_length 20 latency(ms) 792.14
```

If you want to obtain context and generation logits, you could build an engine with `--gather_all_token_logits` and run gptSessionBenchmark with `--print_all_logits`. This will print a large number of logit values and has a certain impact on performance.
If you want to obtain context and generation logits, you could build an engine with `--gather_context_logits` and `--gather_generation_logits`, respectively. Enabling `--gather_all_token_logits` enables both of them.

If you want to print the logits, you could run gptSessionBenchmark with `--print_all_logits`. Note that this prints a large number of logit values and has some impact on performance.
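Putting these flags together, a logits-enabled benchmark run could look like the sketch below. Only `--gather_context_logits`, `--gather_generation_logits`, `--gather_all_token_logits`, and `--print_all_logits` come from this README; the remaining script names and options are illustrative placeholders.

```bash
# Build an engine that gathers logits (build script and other flags
# are illustrative; use your model's example build script).
python examples/gpt/build.py --model_dir ./gpt2 \
    --gather_context_logits \
    --gather_generation_logits \
    --output_dir ./engines/gpt2

# Run the session benchmark and print the gathered logits; expect a
# large amount of output and some performance overhead.
./benchmarks/gptSessionBenchmark \
    --engine_dir ./engines/gpt2 \
    --print_all_logits
```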

*Please note that the expected outputs in that document are only for reference, specific performance numbers depend on the GPU you're using.*

5 changes: 3 additions & 2 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -406,8 +406,9 @@ class GptServer
bool final_response, [[maybe_unused]] const std::string& errMsg)
{
// `response_tensors` contains `outputIds, sequenceLength, [contextLogits, generationLogits], logProbs,
// cumLogProbs`. `contextLogits, generationLogits` are optional, only contained when 'gather_all_token_logits'
// are set
// cumLogProbs`. `contextLogits, generationLogits` are optional, only present when `gather_context_logits` and
// `gather_generation_logits` are enabled, respectively; enabling `gather_all_token_logits` enables both of
// them.
try
{
if (final_response)
64 changes: 52 additions & 12 deletions benchmarks/python/allowed_configs.py
@@ -69,13 +69,15 @@ class EncDecBuildConfig:
hidden_size: int
vocab_size: int
hidden_act: Optional[str]
n_positions: int
max_batch_size: int
n_positions: int = 0
num_decoder_layers: Optional[int] = None
head_size: Optional[int] = None
ffn_hidden_size: Optional[int] = None
num_buckets: Optional[int] = None
max_distance: Optional[int] = None
num_buckets: int = 0
max_distance: int = 0
has_embedding_scale: bool = False
normalize_before: Optional[bool] = None
max_encoder_input_len: Optional[int] = None
max_decoder_input_len: Optional[int] = None
max_output_len: Optional[int] = None
@@ -765,7 +767,7 @@ class ModelConfig:
)),
"flan_t5_small":
ModelConfig(name="flan_t5_small",
family="t5",
family="flan_t5",
benchmark_type="enc_dec",
build_config=EncDecBuildConfig(
num_layers=8,
@@ -775,7 +777,7 @@ class ModelConfig:
ffn_hidden_size=1024,
hidden_size=512,
vocab_size=32128,
hidden_act="gated-gelu",
hidden_act="gelu_new",
n_positions=512,
num_buckets=32,
max_distance=128,
@@ -787,7 +789,7 @@ class ModelConfig:
)),
"flan_t5_base":
ModelConfig(name="flan_t5_base",
family="t5",
family="flan_t5",
benchmark_type="enc_dec",
build_config=EncDecBuildConfig(
num_layers=12,
@@ -797,7 +799,7 @@ class ModelConfig:
ffn_hidden_size=2048,
hidden_size=768,
vocab_size=32128,
hidden_act="gated-gelu",
hidden_act="gelu_new",
n_positions=512,
num_buckets=32,
max_distance=128,
@@ -809,7 +811,7 @@ class ModelConfig:
)),
"flan_t5_large":
ModelConfig(name="flan_t5_large",
family="t5",
family="flan_t5",
benchmark_type="enc_dec",
build_config=EncDecBuildConfig(
num_layers=24,
@@ -819,7 +821,7 @@ class ModelConfig:
ffn_hidden_size=2816,
hidden_size=1024,
vocab_size=32128,
hidden_act="gated-gelu",
hidden_act="gelu_new",
n_positions=512,
num_buckets=32,
max_distance=128,
@@ -831,7 +833,7 @@ class ModelConfig:
)),
"flan_t5_xl":
ModelConfig(name="flan_t5_xl",
family="t5",
family="flan_t5",
benchmark_type="enc_dec",
build_config=EncDecBuildConfig(
num_layers=24,
@@ -841,7 +843,7 @@ class ModelConfig:
ffn_hidden_size=5120,
hidden_size=2048,
vocab_size=32128,
hidden_act="gated-gelu",
hidden_act="gelu_new",
n_positions=512,
num_buckets=32,
max_distance=128,
@@ -853,7 +855,7 @@ class ModelConfig:
)),
"flan_t5_xxl":
ModelConfig(name="flan_t5_xxl",
family="t5",
family="flan_t5",
benchmark_type="enc_dec",
build_config=EncDecBuildConfig(
num_layers=24,
@@ -888,6 +890,8 @@ class ModelConfig:
hidden_act="gelu",
n_positions=1024,
num_buckets=32,
has_embedding_scale=False,
normalize_before=False,
max_batch_size=8,
max_encoder_input_len=1024,
max_decoder_input_len=1,
@@ -908,6 +912,8 @@ class ModelConfig:
vocab_size=250054,
hidden_act="relu",
n_positions=1024,
has_embedding_scale=True,
normalize_before=True,
max_batch_size=8,
max_encoder_input_len=1024,
max_decoder_input_len=1,
@@ -1020,6 +1026,40 @@ class ModelConfig:
builder_opt=None,
bias=False,
)),
"qwen_7b_chat":
ModelConfig(name="qwen_7b_chat",
family="qwen",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=32,
num_heads=32,
hidden_size=4096,
vocab_size=151936,
hidden_act='silu',
n_positions=8192,
inter_size=22016,
max_batch_size=128,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
"qwen_14b_chat":
ModelConfig(name="qwen_14b_chat",
family="qwen",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=40,
num_heads=40,
hidden_size=5120,
vocab_size=152064,
hidden_act='silu',
n_positions=8192,
inter_size=27392,
max_batch_size=64,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
}

