Update TensorRT-LLM (#1492)
* Update TensorRT-LLM

---------

Co-authored-by: Loki <[email protected]>
kaiyux and Lokiiiiii authored Apr 24, 2024
1 parent 71d8d4d · commit 66ef1df
Showing 319 changed files with 21,366 additions and 37,304 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -32,6 +32,8 @@ cpp/.ccache/
tensorrt_llm/libs
tensorrt_llm/bindings.pyi
tensorrt_llm/bindings/*.pyi
*docs/cpp_docs*
*docs/source/_cpp_gen*

# Testing
.coverage.*
2 changes: 1 addition & 1 deletion 3rdparty/cutlass
Submodule cutlass updated 548 files
460 changes: 15 additions & 445 deletions README.md

Large diffs are not rendered by default.

10 changes: 3 additions & 7 deletions benchmarks/cpp/README.md
@@ -225,23 +225,19 @@ python examples/llama/convert_checkpoint.py --model_dir ${MODEL_CHECKPOINT} \
--output_dir ${CONVERTED_CHECKPOINT} \
--dtype ${DTYPE} \
--tp_size ${TP} \
--pp_size 1 \
--lora_target_modules attn_qkv \
--max_lora_rank ${MAX_LORA_RANK}
--pp_size 1
${HOME}/.local/bin/trtllm-build \
--checkpoint_dir ${CONVERTED_CHECKPOINT} \
--output_dir ${LORA_ENGINE} \
--max_batch_size ${MAX_BATCH} \
--max_input_len $MAX_LEN \
--max_output_len $MAX_LEN \
--gpt_attention_plugin float16 \
--paged_kv_cache enable \
--remove_input_padding enable \
--gemm_plugin float16 \
--lora_plugin float16 \
--use_paged_context_fmha enable \
--use_custom_all_reduce disable
--lora_target_modules attn_qkv \
--max_lora_rank ${MAX_LORA_RANK}
NUM_LORAS=(8 16 24 32 64 128 256)
NUM_REQUESTS=1024
8 changes: 8 additions & 0 deletions benchmarks/cpp/gptSessionBenchmark.cpp
@@ -14,6 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*****************************************************************************
*
* GptSession is going to be deprecated soon.
* Please do not add new functionality in this file!
*
*****************************************************************************/

#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/mpiUtils.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
33 changes: 33 additions & 0 deletions benchmarks/python/allowed_configs.py
@@ -1127,6 +1127,39 @@ class ModelConfig:
max_output_len=200,
builder_opt=None,
)),
"qwen1.5_7b_chat":
ModelConfig(name="qwen1.5_7b_chat",
family="qwen2",
benchmark_type="gpt",
build_config=BuildConfig(num_layers=32,
num_heads=32,
hidden_size=4096,
vocab_size=151936,
hidden_act='silu',
n_positions=8192,
inter_size=11008,
max_batch_size=128,
max_input_len=512,
max_output_len=200,
builder_opt=None,
bias=False)),
"qwen1.5_14b_chat":
ModelConfig(name="qwen1.5_14b_chat",
family="qwen2",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=40,
num_heads=40,
hidden_size=5120,
vocab_size=152064,
hidden_act='silu',
n_positions=8192,
inter_size=13696,
max_batch_size=64,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
"mamba_2.8b":
ModelConfig(name="mamba_2.8b",
family="mamba",
46 changes: 46 additions & 0 deletions benchmarks/python/build.py
@@ -232,6 +232,7 @@ def build_gpt(args):
builder_config_extra_kwargs['mamba_expand'] = build_config[
'mamba_expand']
builder_config_extra_kwargs['max_beam_width'] = max_beam_width
builder_config_extra_kwargs['layer_types'] = ['recurrent']
builder_config = builder.create_builder_config(
name=args.model,
precision=args.dtype,
@@ -715,6 +716,51 @@ def build_gpt(args):
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'qwen_type':
'qwen',
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.QWenForCausalLM(config)
elif family == "qwen2":
config = {
'architecture':
'QWenForCausalLM',
'dtype':
args.dtype,
'num_hidden_layers':
build_config['num_layers'],
'num_attention_heads':
build_config['num_heads'],
'num_key_value_heads':
build_config['num_heads'] if build_config['num_kv_heads'] is None
else build_config['num_kv_heads'],
'hidden_size':
build_config['hidden_size'],
'intermediate_size':
build_config['inter_size'],
'vocab_size':
build_config['vocab_size'],
'position_embedding_type':
'rope_gpt_neox',
'max_position_embeddings':
build_config['n_positions'],
'hidden_act':
build_config['hidden_act'],
'quantization': {
'group_size': 128,
'quant_algo': quant_algo,
'kv_cache_quant_algo': kv_cache_quant_algo
},
'mapping': {
'world_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'qwen_type':
'qwen2',
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.QWenForCausalLM(config)
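
Read next to the tail of the existing qwen branch shown above, the new qwen2 branch assembles the same kind of PretrainedConfig dict, with 'qwen2' as the qwen_type. The helper below is a hypothetical, self-contained sketch of that shape for illustration only, not code from build.py; the keys and the num_key_value_heads fallback follow the diff, while the function and parameter names are assumptions:

```python
# Hypothetical sketch: one helper covering both the 'qwen' and 'qwen2' branches
# of build_gpt(), which (in the visible context) differ in the qwen_type value.
def make_qwen_config(build_config, dtype, quant_algo, kv_cache_quant_algo,
                     world_size, qwen_type):
    num_kv_heads = build_config["num_kv_heads"]
    return {
        "architecture": "QWenForCausalLM",
        "dtype": dtype,
        "num_hidden_layers": build_config["num_layers"],
        "num_attention_heads": build_config["num_heads"],
        # Fall back to plain multi-head attention when no KV-head count is set.
        "num_key_value_heads": build_config["num_heads"]
        if num_kv_heads is None else num_kv_heads,
        "hidden_size": build_config["hidden_size"],
        "intermediate_size": build_config["inter_size"],
        "vocab_size": build_config["vocab_size"],
        "position_embedding_type": "rope_gpt_neox",
        "max_position_embeddings": build_config["n_positions"],
        "hidden_act": build_config["hidden_act"],
        "quantization": {"group_size": 128, "quant_algo": quant_algo,
                         "kv_cache_quant_algo": kv_cache_quant_algo},
        "mapping": {"world_size": world_size, "tp_size": world_size},
        "moe_num_experts": build_config["moe_num_experts"],
        "moe_top_k": build_config["moe_top_k"],
        "qwen_type": qwen_type,  # "qwen" or "qwen2"
    }
```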
12 changes: 8 additions & 4 deletions cpp/include/tensorrt_llm/batch_manager/GptManager.h
@@ -21,7 +21,7 @@
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/batch_manager/schedulerPolicy.h"
#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"
#include "tensorrt_llm/runtime/gptModelConfig.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <atomic>
@@ -79,17 +79,21 @@ class GptManager
virtual ~GptManager();

protected:
/* Synchronizes the decoder */
virtual BatchManagerErrorCode_t forwardSync();

/* Invokes one step of backend
Updates state of all requests */
virtual BatchManagerErrorCode_t step(RequestList& activeRequests, std::set<uint64_t>& activeRequestsIds);
virtual BatchManagerErrorCode_t forwardAsync(
RequestList& activeRequests, std::unordered_set<uint64_t>& activeRequestsIds);

private:
[[nodiscard]] SizeType getMaxInputLen() const;
[[nodiscard]] SizeType getMaxSequenceLen() const;
[[nodiscard]] SizeType getMaxNumSequences() const;

void validateLlmRequest(
LlmRequest& newReq, runtime::GptModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;
LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;
static std::shared_ptr<LlmRequest> fillLlmRequest(std::shared_ptr<InferenceRequest> newReq);
static std::shared_ptr<std::vector<TokenIdType>> getReqInputTokens(std::shared_ptr<InferenceRequest> newReq);
static SizeType getMaxNewTokens(std::shared_ptr<InferenceRequest> newReq);
@@ -108,7 +112,7 @@ class GptManager
// List of live requests
RequestList mActiveRequests;
// IDs of live requests
std::set<uint64_t> mActiveRequestsIds;
std::unordered_set<uint64_t> mActiveRequestsIds;
// Boolean that controls if prompt should be included in output tokens for non-streaming
bool mExcludeInputInOutput;

2 changes: 2 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
@@ -63,6 +63,8 @@ class KvCacheConfig
&& hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks;
}

friend std::ostream& operator<<(std::ostream& os, KvCacheConfig const& self);

std::optional<SizeType> maxTokens;
std::optional<SizeType> maxAttentionWindow;
std::optional<SizeType> sinkTokenLength;