Update TensorRT-LLM (20240116) (#891)
* Update TensorRT-LLM

---------

Co-authored-by: Eddie-Wang1120 <[email protected]>
Co-authored-by: Shixiaowei02 <[email protected]>
3 people authored Jan 16, 2024
1 parent 12e82e3 commit c896530
Showing 255 changed files with 285,507 additions and 25,308 deletions.
116 changes: 116 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,116 @@
name: "Bug Report"
description: Submit a bug report to help us improve TensorRT-LLM
labels: [ "bug" ]
body:
- type: textarea
id: system-info
attributes:
label: System Info
description: Please share your system info with us.
placeholder: |
- CPU architecture (e.g., x86_64, aarch64)
- CPU/Host memory size (if known)
- GPU properties
- GPU name (e.g., NVIDIA H100, NVIDIA A100, NVIDIA L40S)
- GPU memory size (if known)
- Clock frequencies used (if applicable)
- Libraries
- TensorRT-LLM branch or tag (e.g., main, v0.7.1)
- TensorRT-LLM commit (if known)
- Versions of TensorRT, AMMO, CUDA, cuBLAS, etc. used
- Container used (if running TensorRT-LLM in a container)
- NVIDIA driver version
- OS (Ubuntu 22.04, CentOS 7, Windows 10)
- Any other information that may be useful in reproducing the bug
validations:
required: true

- type: textarea
id: who-can-help
attributes:
label: Who can help?
description: |
To expedite the response to your issue, it would be helpful if you could identify the appropriate person
to tag using the **@** symbol. Here is a general guideline on **whom to tag**.
Rest assured that all issues are reviewed by the core maintainers. If you are unsure about whom to tag,
you can leave it blank, and a core maintainer will make sure to involve the appropriate person.
Please tag fewer than 3 people.
Quantization: @Tracin
Documentation: @juney-nvidia
Feature request: @ncomly-nvidia
Performance: @kaiyux
Others: @byshiue
placeholder: "@Username ..."

- type: checkboxes
id: information-scripts-examples
attributes:
label: Information
description: 'The problem arises when using:'
options:
- label: "The official example scripts"
- label: "My own modified scripts"

- type: checkboxes
id: information-tasks
attributes:
label: Tasks
description: "The tasks I am working on are:"
options:
- label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)"
- label: "My own task or dataset (give details below)"

- type: textarea
id: reproduction
validations:
required: true
attributes:
label: Reproduction
description: |
Kindly share a code example that demonstrates the issue you encountered. It is recommended to provide a code snippet directly.
Additionally, if you have any error messages, or stack traces related to the problem, please include them here.
Remember to use code tags to properly format your code. You can refer to the
link https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting for guidance on code formatting.
Please refrain from using screenshots, as they can be difficult to read and prevent others from copying and pasting your code.
It would be most helpful if we could reproduce your issue by simply copying and pasting your scripts and codes.
placeholder: |
Steps to reproduce the behavior:
1.
2.
3.
- type: textarea
id: expected-behavior
validations:
required: true
attributes:
label: Expected behavior
description: "Provide a brief summary of the expected behavior of the software. Provide output files or examples if possible."

- type: textarea
id: actual-behavior
validations:
required: true
attributes:
label: Actual behavior
description: "Describe the actual behavior of the software and how it deviates from the expected behavior. Provide output files or examples if possible."

- type: textarea
id: additional-notes
validations:
required: true
attributes:
label: Additional notes
description: "Provide any additional context here you think might be useful for the TensorRT-LLM team to help debug this issue (such as experiments done, potential things to investigate)."
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
@@ -15,7 +15,8 @@ repos:
rev: v4.1.0
hooks:
- id: check-added-large-files
exclude: 'cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/'
exclude: |
(?x)^(.*cubin.cpp)$
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
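For reference, the new `exclude` value is a Python regular expression that pre-commit matches against each file path, and `(?x)` enables verbose mode. A minimal sketch of its effect, using hypothetical file paths purely for illustration:

```python
import re

# The pattern added above: a verbose-mode regex matching any path that ends in
# "cubin.cpp", replacing the old directory-prefix exclude.
pattern = re.compile(r"(?x)^(.*cubin.cpp)$")

# Hypothetical example paths, for illustration only.
paths = [
    "cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_cubin.cpp",
    "benchmarks/cpp/gptSessionBenchmark.cpp",
]
for p in paths:
    status = "excluded" if pattern.match(p) else "checked"
    print(f"{p}: {status}")
# -> the first path is skipped by the large-file check, the second is still checked.
```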
24 changes: 19 additions & 5 deletions README.md
@@ -45,8 +45,6 @@ H200 is now 2.4x faster on Llama-70B with recent improvements to TensorRT-LLM GQ

- [TensorRT-LLM Overview](#tensorrt-llm-overview)
- [Installation](#installation)
- [Linux](./docs/source/installation.md)
- [Windows](windows/README.md)
- [Quick Start](#quick-start)
- [Support Matrix](#support-matrix)
- [Devices](#devices)
@@ -110,10 +108,26 @@ concepts used in TensorRT-LLM, we recommend you to read the following

## Installation

*For Linux installation, see [`Linux`](./docs/source/installation.md).*
*For Windows installation, see [`Windows`](windows/README.md).*
After installing the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit),
please run the following commands to install TensorRT-LLM.

Once installed, commands to build and run LLMs must be executed from the TensorRT-LLM container.
```bash
# Obtain and start the basic docker image environment
nvidia-docker run --entrypoint /bin/bash -it nvidia/cuda:12.1.0-devel-ubuntu22.04
# Install dependencies; TensorRT-LLM requires Python 3.10
apt-get update && apt-get -y install python3.10 python3-pip openmpi-bin libopenmpi-dev
# Install the latest preview version (corresponding to the main branch) of TensorRT-LLM.
# If you want to install the stable version (corresponding to the release branch), please
# remove the `--pre` option.
pip3 install tensorrt_llm -U --pre --extra-index-url https://pypi.nvidia.com
# Check installation
python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"
```

For users who require the best performance or debugging capabilities, please refer to the instructions for
[building from source code](docs/source/build_from_source.md).

For Windows installation, see [`Windows`](windows/README.md).

## Quick Start

2 changes: 1 addition & 1 deletion benchmarks/cpp/README.md
@@ -7,7 +7,7 @@ multiple GPUs or multiple nodes with multiple GPUs.

### 1. Build TensorRT-LLM and benchmarking source code

Please follow the [`installation document`](../../docs/source/installation.md) to build TensorRT-LLM.
Please follow the [`installation document`](../../README.md#installation) to build TensorRT-LLM.

Note that the benchmarking source code for the C++ runtime is not built by default; you can use the argument `--benchmarks` in [`build_wheel.py`](source:scripts/build_wheel.py) to build the corresponding executable.

22 changes: 16 additions & 6 deletions benchmarks/cpp/gptSessionBenchmark.cpp
@@ -123,6 +123,17 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32),
bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)};

if (session.getModelConfig().computeContextLogits())
{
generationOutput.contextLogits
= bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
}
if (session.getModelConfig().computeGenerationLogits())
{
generationOutput.generationLogits
= bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
}

TLLM_LOG_INFO(memoryCounter.toString());

for (auto r = 0; r < warmUp; ++r)
@@ -175,21 +186,20 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
if (session.getModelConfig().computeContextLogits() && printAllLogits)
{
std::cout << "generationOutput.contextLogits.shape: "
<< generationOutput.contextLogitsHost->getShape()
<< generationOutput.contextLogits->getShape()
<< std::endl; // (batchsize, prompt_len, vocabsize)
std::cout << "generationOutput.contextLogits: " << *generationOutput.contextLogitsHost
<< std::endl;
std::cout << "generationOutput.contextLogits: " << *generationOutput.contextLogits << std::endl;
}

if (session.getModelConfig().computeGenerationLogits() && printAllLogits)
{
std::cout << "generationOutput.generationLogits.shape: "
<< generationOutput.generationLogitsHost->getShape()
<< generationOutput.generationLogits->getShape()
<< std::endl; // (batchsize, beamwidth, maxNewTokens, vocabsize)
generationOutput.generationLogitsHost->reshape(ITensor::makeShape({batchSize * beamWidth,
generationOutput.generationLogits->reshape(ITensor::makeShape({batchSize * beamWidth,
maxNewTokens, modelConfig.getVocabSizePadded(worldConfig.getSize())}));

std::cout << "generationOutput.generationLogits: " << *generationOutput.generationLogitsHost
std::cout << "generationOutput.generationLogits: " << *generationOutput.generationLogits
<< std::endl;
}
}
38 changes: 14 additions & 24 deletions benchmarks/python/all_reduce.py
@@ -17,16 +17,17 @@

# isort: off
import torch
import tensorrt as trt
# isort: on
from cuda import cuda, cudart
from mpi4py import MPI
from polygraphy.backend.trt import CreateConfig, EngineFromNetwork

import tensorrt_llm as tllm
from tensorrt_llm import Mapping, Tensor
from tensorrt_llm._ipc_utils import IpcMemory, peer_access
from tensorrt_llm._ipc_utils import peer_access
from tensorrt_llm.functional import AllReduceStrategy, allreduce
from tensorrt_llm.plugin.plugin import (current_all_reduce_helper,
init_all_reduce_helper)


def allreduce_benchmark(dtype: str, test_range: str = "10,10000000,10"):
@@ -42,25 +43,17 @@ def allreduce_benchmark(dtype: str, test_range: str = "10,10000000,10"):
if world_size == 1:
raise RuntimeError("Benchmark must run with mpi_world_size > 1")

ipc_barriers_in = IpcMemory(
mapping, IpcMemory.IPC_BARRIERS_SIZE_PER_GPU * mapping.tp_size)
ipc_barriers_out = IpcMemory(
mapping, IpcMemory.IPC_BARRIERS_SIZE_PER_GPU * mapping.tp_size)
torch_dtype = tllm._utils.str_dtype_to_torch(dtype)

min_size, max_size, ratio = [int(i) for i in test_range.split(",")]
inner_loop = 1000

size = min_size
dtype_size = torch.finfo(torch_dtype).bits // 8
init_all_reduce_helper()
while size < max_size:
ipc_buffers = IpcMemory(mapping, size * 4)
workspace = torch.tensor(ipc_buffers.serialize() +
ipc_barriers_in.serialize() +
ipc_barriers_out.serialize(),
dtype=torch.int64,
device="cpu")

input = torch.zeros(size, dtype=torch_dtype, device="cuda")
_buffers, workspace = current_all_reduce_helper().allocate_workspace(
mapping, size * dtype_size)
input = torch.ones(size, dtype=torch_dtype, device="cuda")

for strategy in [
AllReduceStrategy.RING, AllReduceStrategy.ONESHOT,
@@ -77,16 +70,11 @@ def allreduce_benchmark(dtype: str, test_range: str = "10,10000000,10"):
shape=input.shape,
dtype=tllm.str_dtype_to_trt(dtype))

w = Tensor(name='workspace',
shape=workspace.shape,
dtype=trt.int64)
current_all_reduce_helper().set_workspace_tensor(mapping)

current = x
for i in range(inner_loop):
current = allreduce(
current, mapping.tp_group,
w if strategy != AllReduceStrategy.RING else None, i,
strategy)
for _ in range(inner_loop):
current = allreduce(current, mapping.tp_group, strategy)
output = current.trt_tensor

output.name = 'output'
@@ -104,7 +92,7 @@ def allreduce_benchmark(dtype: str, test_range: str = "10,10000000,10"):
output = torch.zeros_like(input)

stream = torch.cuda.current_stream()
feed_dict = {'x': input, 'workspace': workspace}
feed_dict = {'x': input, 'all_reduce_workspace': workspace}

session = tllm.runtime.Session.from_engine(build_engine())
_, start = cuda.cuEventCreate(0)
@@ -119,9 +107,11 @@ def allreduce_benchmark(dtype: str, test_range: str = "10,10000000,10"):
cuda.cuEventRecord(stop, stream.cuda_stream)
torch.cuda.synchronize()
_, ms = cuda.cuEventElapsedTime(start, stop)
assert torch.allclose(output, (input * world_size)**inner_loop)

if mapping.rank == 0:
print(f"{size=}, {strategy=}, {ms=}")

size *= ratio
if mapping.rank == 0:
print("")
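To summarize the API change in this benchmark: the manually managed `IpcMemory` buffers are replaced by the all-reduce helper, and `allreduce` no longer takes an explicit workspace tensor. Below is a condensed sketch of the new flow that restates only the calls appearing in the diff; the network-building scope, engine build, and MPI setup are elided, so it is illustrative rather than a drop-in script:

```python
import torch
import tensorrt_llm as tllm
from tensorrt_llm import Mapping, Tensor
from tensorrt_llm.functional import AllReduceStrategy, allreduce
from tensorrt_llm.plugin.plugin import (current_all_reduce_helper,
                                        init_all_reduce_helper)


def allreduce_workspace_flow(mapping: Mapping, size: int, dtype: str = "float16"):
    # One-time helper setup (replaces the hand-built IpcMemory barriers/buffers
    # that the old code serialized into a host-side 'workspace' tensor).
    init_all_reduce_helper()

    # The workspace is now sized in bytes from the element count and dtype width.
    torch_dtype = tllm._utils.str_dtype_to_torch(dtype)
    dtype_size = torch.finfo(torch_dtype).bits // 8
    _buffers, workspace = current_all_reduce_helper().allocate_workspace(
        mapping, size * dtype_size)

    # Inside the (elided) network-building scope: declare the input, let the
    # helper register its own workspace tensor, then call allreduce without
    # passing a workspace argument.
    x = Tensor(name='x', shape=(size, ), dtype=tllm.str_dtype_to_trt(dtype))
    current_all_reduce_helper().set_workspace_tensor(mapping)
    out = allreduce(x, mapping.tp_group, AllReduceStrategy.RING)

    # At runtime the workspace is fed under the new input name.
    feed_dict = {
        'x': torch.ones(size, dtype=torch_dtype, device='cuda'),
        'all_reduce_workspace': workspace,
    }
    return out, feed_dict
```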
42 changes: 21 additions & 21 deletions benchmarks/python/allowed_configs.py
@@ -197,8 +197,8 @@ class ModelConfig:
builder_opt=None,
quantization="int8_sq_per_token_channel",
)),
"gpt-next_2b":
ModelConfig(name="gpt-next_2b",
"gpt_next_2b":
ModelConfig(name="gpt_next_2b",
family="gpt",
benchmark_type="gpt",
build_config=BuildConfig(
@@ -305,25 +305,6 @@ class ModelConfig:
max_output_len=200,
builder_opt=None,
)),
"llama_7b_moe":
ModelConfig(name="llama_7b_moe",
family="llama",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=32,
num_heads=32,
hidden_size=4096,
vocab_size=32000,
hidden_act='silu',
n_positions=2048,
inter_size=11008,
max_batch_size=128,
max_input_len=512,
max_output_len=200,
builder_opt=None,
moe_num_experts=4,
moe_top_k=1,
)),
"llama_13b":
ModelConfig(name="llama_13b",
family="llama",
@@ -427,6 +408,25 @@ class ModelConfig:
max_output_len=200,
builder_opt=None,
quantization="int8_sq_per_tensor")),
"mixtral_8x7b":
ModelConfig(name="mixtral_8x7b",
family="llama",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=32,
num_heads=32,
hidden_size=4096,
vocab_size=32000,
hidden_act='swiglu',
n_positions=2048,
inter_size=14336,
max_batch_size=128,
max_input_len=512,
max_output_len=200,
builder_opt=None,
moe_num_experts=8,
moe_top_k=2,
)),
"gptj_6b":
ModelConfig(name="gptj_6b",
family="gptj",
