Merge branch 'NVIDIA:main' into main
Eddie-Wang1120 authored Jan 15, 2024
2 parents 91c3935 + 12e82e3 commit ca6b703
Showing 253 changed files with 158,334 additions and 5,198 deletions.
19 changes: 10 additions & 9 deletions README.md
@@ -45,6 +45,8 @@ H200 is now 2.4x faster on Llama-70B with recent improvements to TensorRT-LLM GQ

- [TensorRT-LLM Overview](#tensorrt-llm-overview)
- [Installation](#installation)
- [Linux](./docs/source/installation.md)
- [Windows](windows/README.md)
- [Quick Start](#quick-start)
- [Support Matrix](#support-matrix)
- [Devices](#devices)
@@ -108,23 +110,20 @@ concepts used in TensorRT-LLM, we recommend you to read the following

## Installation

The documentation for installing TensorRT-LLM can be found
[here](./docs/source/installation.md). An image of a Docker container with
TensorRT-LLM and its Triton Inference Server Backend will be made available
soon.

The remaining commands in that document must be executed from the TensorRT-LLM
container.

*For Linux installation, see [`Linux`](./docs/source/installation.md).*
*For Windows installation, see [`Windows`](windows/README.md).*

Once installed, commands to build and run LLMs must be executed from the TensorRT-LLM container.
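Whether you follow the Linux or Windows flow, a typical session starts by entering that container. Below is a minimal sketch, assuming a local TensorRT-LLM development image; the image tag is a placeholder for whatever you built or pulled during installation, not an official name.

```bash
# Enter the TensorRT-LLM container; the image tag below is a placeholder.
docker run --rm -it --gpus all \
    -v "$(pwd)":/workspace/TensorRT-LLM \
    -w /workspace/TensorRT-LLM \
    tensorrt_llm/devel:latest bash

# All build and run commands in the rest of this README are executed
# inside this shell.
```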

## Quick Start

Please be sure to complete the [installation steps](#installation) before proceeding.

To create a TensorRT engine for an existing model, there are 3 steps:

1. Download pre-trained weights,
2. Build a fully-optimized engine of the model,
3. Deploy the engine.
3. Deploy the engine, in other words, run the fully-optimized model.

The following sections show how to use TensorRT-LLM to run the
[BLOOM-560m](https://huggingface.co/bigscience/bloom-560m) model.
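As a preview, the three steps look roughly like the sketch below for BLOOM-560m. The script names and flags are illustrative and may differ between releases; treat `examples/bloom` as the authoritative reference.

```bash
# 1. Download pre-trained weights from the Hugging Face Hub.
git lfs install
git clone https://huggingface.co/bigscience/bloom-560m ./bloom/560M

# 2. Build a fully-optimized engine (flags are illustrative; see
#    examples/bloom for the full, current list).
python examples/bloom/build.py --model_dir ./bloom/560M \
    --dtype float16 \
    --output_dir ./bloom/560M/trt_engines/fp16/1-gpu/

# 3. Deploy the engine, i.e. run the fully-optimized model, here via
#    the BLOOM summarization example (script name is illustrative).
python examples/bloom/summarize.py --test_trt_llm \
    --engine_dir ./bloom/560M/trt_engines/fp16/1-gpu/
```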
@@ -181,6 +180,8 @@ the example [folder](examples/bloom). Many more [models](#models) than BLOOM
are implemented in TensorRT-LLM. They can be found in the
[examples](./examples/) directory.

Beyond local execution, you can also use the NVIDIA Triton Inference Server to create a production-ready deployment of your LLM as described in this [blog](https://developer.nvidia.com/blog/optimizing-inference-on-llms-with-tensorrt-llm-now-publicly-available/).

## Support Matrix

TensorRT-LLM optimizes the performance of a range of well-known models on
4 changes: 3 additions & 1 deletion benchmarks/cpp/README.md
@@ -59,7 +59,9 @@ mpirun -n 8 ./benchmarks/gptSessionBenchmark \
# [BENCHMARK] batch_size 1 input_length 60 output_length 20 latency(ms) 792.14
```

If you want to obtain context and generation logits, you could build an engine with `--gather_all_token_logits` and run gptSessionBenchmark with `--print_all_logits`. This will print a large number of logit values and has a certain impact on performance.
If you want to obtain context and generation logits, you could build an engine with `--gather_context_logits` and `--gather_generation_logits`, respectively. Enabling `--gather_all_token_logits` enables both of them.

If you want to print the logits, you could run gptSessionBenchmark with `--print_all_logits`. Note that this prints a large number of logit values and has some impact on performance.
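Putting these flags together, a logits-enabled benchmark run could look like the sketch below. Only `--gather_context_logits`, `--gather_generation_logits`, `--gather_all_token_logits`, and `--print_all_logits` come from this README; the remaining script names and options are illustrative placeholders.

```bash
# Build an engine that gathers logits (build script and other flags
# are illustrative; use your model's example build script).
python examples/gpt/build.py --model_dir ./gpt2 \
    --gather_context_logits \
    --gather_generation_logits \
    --output_dir ./engines/gpt2

# Run the session benchmark and print the gathered logits; expect a
# large amount of output and some performance overhead.
./benchmarks/gptSessionBenchmark \
    --engine_dir ./engines/gpt2 \
    --print_all_logits
```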

*Please note that the expected outputs in that document are only for reference, specific performance numbers depend on the GPU you're using.*

5 changes: 3 additions & 2 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -406,8 +406,9 @@ class GptServer
bool final_response, [[maybe_unused]] const std::string& errMsg)
{
// `response_tensors` contains `outputIds, sequenceLength, [contextLogits, generationLogits], logProbs,
// cumLogProbs`. `contextLogits, generationLogits` are optional, only contained when 'gather_all_token_logits'
// are set
// cumLogProbs`. `contextLogits, generationLogits` are optional, only present when `gather_context_logits` and
// `gather_generation_logits` are enabled, respectively; enabling `gather_all_token_logits` enables both of
// them.
try
{
if (final_response)
64 changes: 52 additions & 12 deletions benchmarks/python/allowed_configs.py
@@ -69,13 +69,15 @@ class EncDecBuildConfig:
hidden_size: int
vocab_size: int
hidden_act: Optional[str]
n_positions: int
max_batch_size: int
n_positions: int = 0
num_decoder_layers: Optional[int] = None
head_size: Optional[int] = None
ffn_hidden_size: Optional[int] = None
num_buckets: Optional[int] = None
max_distance: Optional[int] = None
num_buckets: int = 0
max_distance: int = 0
has_embedding_scale: bool = False
normalize_before: Optional[bool] = None
max_encoder_input_len: Optional[int] = None
max_decoder_input_len: Optional[int] = None
max_output_len: Optional[int] = None
@@ -765,7 +767,7 @@ class ModelConfig:
)),
"flan_t5_small":
ModelConfig(name="flan_t5_small",
family="t5",
family="flan_t5",
benchmark_type="enc_dec",
build_config=EncDecBuildConfig(
num_layers=8,
@@ -775,7 +777,7 @@ class ModelConfig:
ffn_hidden_size=1024,
hidden_size=512,
vocab_size=32128,
hidden_act="gated-gelu",
hidden_act="gelu_new",
n_positions=512,
num_buckets=32,
max_distance=128,
@@ -787,7 +789,7 @@ class ModelConfig:
)),
"flan_t5_base":
ModelConfig(name="flan_t5_base",
family="t5",
family="flan_t5",
benchmark_type="enc_dec",
build_config=EncDecBuildConfig(
num_layers=12,
@@ -797,7 +799,7 @@ class ModelConfig:
ffn_hidden_size=2048,
hidden_size=768,
vocab_size=32128,
hidden_act="gated-gelu",
hidden_act="gelu_new",
n_positions=512,
num_buckets=32,
max_distance=128,
@@ -809,7 +811,7 @@ class ModelConfig:
)),
"flan_t5_large":
ModelConfig(name="flan_t5_large",
family="t5",
family="flan_t5",
benchmark_type="enc_dec",
build_config=EncDecBuildConfig(
num_layers=24,
@@ -819,7 +821,7 @@ class ModelConfig:
ffn_hidden_size=2816,
hidden_size=1024,
vocab_size=32128,
hidden_act="gated-gelu",
hidden_act="gelu_new",
n_positions=512,
num_buckets=32,
max_distance=128,
@@ -831,7 +833,7 @@ class ModelConfig:
)),
"flan_t5_xl":
ModelConfig(name="flan_t5_xl",
family="t5",
family="flan_t5",
benchmark_type="enc_dec",
build_config=EncDecBuildConfig(
num_layers=24,
@@ -841,7 +843,7 @@ class ModelConfig:
ffn_hidden_size=5120,
hidden_size=2048,
vocab_size=32128,
hidden_act="gated-gelu",
hidden_act="gelu_new",
n_positions=512,
num_buckets=32,
max_distance=128,
@@ -853,7 +855,7 @@ class ModelConfig:
)),
"flan_t5_xxl":
ModelConfig(name="flan_t5_xxl",
family="t5",
family="flan_t5",
benchmark_type="enc_dec",
build_config=EncDecBuildConfig(
num_layers=24,
@@ -888,6 +890,8 @@ class ModelConfig:
hidden_act="gelu",
n_positions=1024,
num_buckets=32,
has_embedding_scale=False,
normalize_before=False,
max_batch_size=8,
max_encoder_input_len=1024,
max_decoder_input_len=1,
@@ -908,6 +912,8 @@ class ModelConfig:
vocab_size=250054,
hidden_act="relu",
n_positions=1024,
has_embedding_scale=True,
normalize_before=True,
max_batch_size=8,
max_encoder_input_len=1024,
max_decoder_input_len=1,
@@ -1020,6 +1026,40 @@ class ModelConfig:
builder_opt=None,
bias=False,
)),
"qwen_7b_chat":
ModelConfig(name="qwen_7b_chat",
family="qwen",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=32,
num_heads=32,
hidden_size=4096,
vocab_size=151936,
hidden_act='silu',
n_positions=8192,
inter_size=22016,
max_batch_size=128,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
"qwen_14b_chat":
ModelConfig(name="qwen_14b_chat",
family="qwen",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=40,
num_heads=40,
hidden_size=5120,
vocab_size=152064,
hidden_act='silu',
n_positions=8192,
inter_size=27392,
max_batch_size=64,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
}

