Update TensorRT-LLM #1763

Merged
merged 1 commit on Jun 11, 2024
2 changes: 1 addition & 1 deletion benchmarks/cpp/README.md
@@ -159,7 +159,7 @@ mpirun -n 2 ./benchmarks/gptManagerBenchmark \
--max_num_samples 500
```

`gptManagerBenchmark` can also be used with the high-level C++ API defined by the `executor::Executor` class (see `cpp/include/tensorrt_llm/executor/executor.h`). This can be done by passing the argument `--api executor`. Note that the Executor class is still under development and currently does not support models with tp or pp > 1.
`gptManagerBenchmark` by default uses the high-level C++ API defined by the `executor::Executor` class (see `cpp/include/tensorrt_llm/executor/executor.h`).
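
Below is a rough sketch of driving that `executor::Executor` API directly (illustrative only: the engine path and token IDs are placeholders, and constructor/method shapes may differ slightly between TensorRT-LLM versions):

```cpp
#include "tensorrt_llm/executor/executor.h"

#include <iostream>

namespace tle = tensorrt_llm::executor;

int main()
{
    // Engine directory is a placeholder; gptManagerBenchmark receives it via its engine-directory option.
    tle::ExecutorConfig executorConfig;
    tle::Executor executor("/path/to/engine_dir", tle::ModelType::kDECODER_ONLY, executorConfig);

    // One request: prompt token IDs (dummy values here) plus the number of new tokens to generate.
    tle::Request request({1, 2, 3, 4}, /*maxNewTokens=*/16);
    auto const requestId = executor.enqueueRequest(request);

    // Wait for responses to this request and print the generated token IDs of beam 0.
    for (auto const& response : executor.awaitResponses(requestId))
    {
        if (response.hasError())
        {
            std::cerr << response.getErrorMsg() << std::endl;
            continue;
        }
        for (auto tokenId : response.getResult().outputTokenIds.at(0))
        {
            std::cout << tokenId << " ";
        }
    }
    return 0;
}
```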

#### Emulated static batching

163 changes: 86 additions & 77 deletions benchmarks/cpp/gptManagerBenchmark.cpp

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions benchmarks/python/allowed_configs.py
@@ -93,6 +93,7 @@ class EncDecBuildConfig:
builder_opt: Optional[int] = None
n_mels: Optional[int] = None
skip_cross_qkv: bool = False
use_implicit_relative_attention: Optional[bool] = False

def __post_init__(self) -> None:
assert self.head_size is not None
@@ -584,6 +585,25 @@ class ModelConfig:
builder_opt=None,
remove_input_padding=False,
)),
"glm_10b":
ModelConfig(name="glm_10b",
family="glm",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=48,
num_heads=64,
num_kv_heads=64,
hidden_size=4096,
inter_size=16384,
vocab_size=50304,
hidden_act='gelu',
n_positions=1024,
max_batch_size=128,
max_input_len=1024,
max_output_len=256,
builder_opt=None,
remove_input_padding=False,
)),
"bloom_560m":
ModelConfig(name="bloom_560m",
family="bloom",
52 changes: 47 additions & 5 deletions benchmarks/python/build.py
@@ -273,7 +273,7 @@ def build_gpt(args):
raise Exception(
f'--opt_num_tokens does not support ootb mode. Please use --opt_batch_size instead.'
)

max_num_tokens = max_batch_size * max(max_input_len, max_beam_width)
quant_config = get_quant_config(args.quantization)
quant_algo = quant_config.quant_algo
kv_cache_quant_algo = quant_config.kv_cache_quant_algo
@@ -309,6 +309,7 @@ def build_gpt(args):
max_beam_width=max_beam_width,
max_input_len=max_input_len,
max_output_len=max_output_len,
max_num_tokens=max_num_tokens,
int8=(quant_mode.has_act_and_weight_quant()
or quant_mode.is_int8_weight_only()),
quant_mode=quant_mode,
@@ -572,6 +573,39 @@ def build_gpt(args):
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.ChatGLMForCausalLM(config)

elif family == "glm":
config = {
'architecture': 'ChatGLMForCausalLM',
'dtype': args.dtype,
'num_hidden_layers': build_config['num_layers'],
'num_attention_heads': build_config['num_heads'],
'num_key_value_heads': build_config['num_kv_heads'],
'hidden_size': build_config['hidden_size'],
'intermediate_size': build_config['inter_size'],
'norm_epsilon': 1e-5,
'vocab_size': build_config['vocab_size'],
'position_embedding_type': 'learned_absolute',
'max_position_embeddings': build_config['n_positions'],
'hidden_act': build_config['hidden_act'],
'quantization': {
'quant_algo': quant_algo,
'kv_cache_quant_algo': kv_cache_quant_algo
},
'mapping': {
'world_size': world_size,
'tp_size': world_size
},
'chatglm_version': 'glm',
'add_bias_linear': True,
'add_qkv_bias': True,
'apply_query_key_layer_scaling': False,
'apply_residual_connection_post_layernorm': False,
'rmsnorm': False,
'rope_ratio': 1.0,
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.ChatGLMForCausalLM(config)

elif family == "bloom":
config = {
'architecture': 'BloomForCausalLM',
@@ -871,6 +905,7 @@ def build_gpt(args):
'layer_types': build_config['layer_types'],
'rnn_hidden_size': build_config['rnn_hidden_size'],
'logits_soft_cap': build_config['logits_soft_cap'],
'rotary_pct': build_config['rotary_pct'],
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.RecurrentGemmaForCausalLM(
@@ -935,10 +970,13 @@ def build_gpt(args):
print(
f"max_batch_size: {max_batch_size}, max_input_len: {max_input_len}, max_output_len: {max_output_len}, max_beam_width: {max_beam_width}"
)
# NOTE: all other models use PretrainedModel.prepare_inputs(...)
# except RecurrentGemmaForCausalLM and MambaForCausalLM
inputs = tensorrt_llm_model.prepare_inputs(
max_batch_size=max_batch_size,
max_input_len=max_input_len,
max_seq_len=max_input_len + max_output_len,
max_num_tokens=max_num_tokens,
use_cache=True,
max_beam_width=max_beam_width,
opt_batch_size=opt_batch_size,
@@ -1293,7 +1331,7 @@ def enc_dec_build_helper(component, config, args):
has_embedding_layernorm,
'has_embedding_scale':
config.get('has_embedding_scale', False),
'ffn_hidden_size':
'intermediate_size':
config['ffn_hidden_size'],
'q_scaling':
q_scaling,
@@ -1358,7 +1396,7 @@ def enc_dec_build_helper(component, config, args):
has_embedding_layernorm,
'has_embedding_scale':
config.get('has_embedding_scale', False),
'ffn_hidden_size':
'intermediate_size':
config['ffn_hidden_size'],
'q_scaling':
q_scaling,
@@ -1381,12 +1419,16 @@ def enc_dec_build_helper(component, config, args):
'encoder_head_size':
config['head_size'],
'skip_cross_qkv':
config['skip_cross_qkv']
config['skip_cross_qkv'],
'use_implicit_relative_attention':
config['use_implicit_relative_attention']
})
tllm_model = tensorrt_llm.models.DecoderModel(pretrained_config)
if use_weight_only and family == 'whisper':
tllm_model = quantize(tllm_model, quant_config)

tllm_model.precompute_relative_attention_bias(builder_config)

# Module -> Network
engine_name = get_engine_name(args.model, args.dtype, world_size,
runtime_rank)
@@ -1418,7 +1460,7 @@ def enc_dec_build_helper(component, config, args):
if family == 'whisper':
inputs = tllm_model.prepare_inputs(
max_batch_size=config['max_batch_size'], )
tllm_model(*inputs)
tllm_model(**inputs)
else:
inputs = tllm_model.prepare_inputs(
max_batch_size=config['max_batch_size'],
9 changes: 9 additions & 0 deletions benchmarks/python/gpt_benchmark.py
@@ -174,6 +174,15 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.GenerationSession(
model_config, engine_buffer, self.runtime_mapping)
if args.model == 'glm_10b':
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
end_id=50258,
pad_id=50256,
num_beams=self.num_beams,
top_k=args.top_k,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.ChatGLMGenerationSession(
model_config, engine_buffer, self.runtime_mapping)
else:
end_id = 50256
pad_id = 50256
28 changes: 25 additions & 3 deletions cpp/CMakeLists.txt
@@ -180,9 +180,31 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
endif()
endif()

message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES}")
# Store CMAKE_CUDA_ARCHITECTURES for later use since torch sets this to "OFF"
set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
# Detect highest available compute capability
set(OUTPUTFILE ${PROJECT_BINARY_DIR}/detect_cuda_arch)
set(CUDAFILE ${CMAKE_SOURCE_DIR}/cmake/utils/detect_cuda_arch.cu)
execute_process(COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${CUDAFILE} -o
${OUTPUTFILE})
message(VERBOSE "Detecting native CUDA compute capability")
execute_process(
COMMAND ${OUTPUTFILE}
RESULT_VARIABLE CUDA_RETURN_CODE
OUTPUT_VARIABLE CUDA_ARCH_OUTPUT)
if(NOT ${CUDA_RETURN_CODE} EQUAL 0)
message(WARNING "Detecting native CUDA compute capability - fail")
message(
WARNING "CUDA compute capability detection failed, compiling for 'all'")
set(CMAKE_CUDA_ARCHITECTURES_ORIG "all")
else()
message(STATUS "Detecting native CUDA compute capability - done")
set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CUDA_ARCH_OUTPUT}")
endif()
else()
# Store CMAKE_CUDA_ARCHITECTURES for later use since torch sets this to "OFF"
set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
endif()
message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")

enable_language(C CXX CUDA)

39 changes: 39 additions & 0 deletions cpp/cmake/utils/detect_cuda_arch.cu
@@ -0,0 +1,39 @@
#include <algorithm>
#include <cuda_runtime.h>
#include <iomanip>
#include <iostream>
#include <vector>

int main(int argc, char* argv[])
{
int n_devices = 0;
int rc = cudaGetDeviceCount(&n_devices);
if (rc != cudaSuccess)
{
cudaError_t error = cudaGetLastError();
std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
return rc;
}

std::vector<std::pair<int, int>> arch(n_devices);
for (int cd = 0; cd < n_devices; ++cd)
{
cudaDeviceProp dev;
int rc = cudaGetDeviceProperties(&dev, cd);
if (rc != cudaSuccess)
{
cudaError_t error = cudaGetLastError();
std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
return rc;
}
else
{
arch[cd] = {dev.major, dev.minor};
}
}

std::pair<int, int> best_cc = *std::max_element(begin(arch), end(arch));
std::cout << best_cc.first << best_cc.second;

return 0;
}
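
For example, on a GPU with compute capability 8.6 this helper prints `86` to stdout, which the CMake logic above then stores in `CMAKE_CUDA_ARCHITECTURES_ORIG`.
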
6 changes: 3 additions & 3 deletions cpp/include/tensorrt_llm/batch_manager/GptManager.h
@@ -48,9 +48,9 @@ class GptManager
using RequestList = std::list<std::shared_ptr<LlmRequest>>;
using TensorPtr = runtime::ITensor::SharedPtr;

GptManager(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType, SizeType32 maxBeamWidth,
executor::SchedulerConfig const& schedulerConfig, GetInferenceRequestsCallback getInferenceRequestsCb,
SendResponseCallback sendResponseCb, PollStopSignalCallback pollStopSignalCb = nullptr,
GptManager(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType,
GetInferenceRequestsCallback getInferenceRequestsCb, SendResponseCallback sendResponseCb,
PollStopSignalCallback pollStopSignalCb = nullptr,
ReturnBatchManagerStatsCallback returnBatchManagerStatsCb = nullptr,
TrtGptModelOptionalParams const& optionalParams = TrtGptModelOptionalParams(),
std::optional<uint64_t> terminateReqId = std::nullopt, bool excludeInputInOutput = false);
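
A minimal construction sketch under the new signature (illustrative: the engine path is a placeholder, the callback types are assumed to be the aliases declared alongside `GptManager`, and the `TrtGptModelOptionalParams` members are assumed to remain publicly assignable); beam width and scheduling policy now travel through the optional params instead of dedicated constructor arguments:

```cpp
#include "tensorrt_llm/batch_manager/GptManager.h"
#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"

#include <memory>
#include <utility>

using namespace tensorrt_llm::batch_manager;

std::unique_ptr<GptManager> makeManager(
    GetInferenceRequestsCallback getRequestsCb, SendResponseCallback sendResponseCb)
{
    TrtGptModelOptionalParams optionalParams;
    optionalParams.maxBeamWidth = 4;  // formerly the maxBeamWidth constructor argument
    optionalParams.schedulerConfig = tensorrt_llm::executor::SchedulerConfig{};  // formerly passed directly

    return std::make_unique<GptManager>("/path/to/engine_dir", TrtGptModelType::InflightFusedBatching,
        std::move(getRequestsCb), std::move(sendResponseCb),
        /*pollStopSignalCb=*/nullptr, /*returnBatchManagerStatsCb=*/nullptr, optionalParams);
}
```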
19 changes: 19 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -699,6 +699,12 @@ class GenericLlmRequest
runtime::ITensor::makeShape({mSamplingConfig.beamWidth, mMaxNewTokens, vocabSizePadded}), logitsDataType);
}

void allocTargetModelAcceptedTokenLogitsHost(SizeType32 vocabSizePadded, nvinfer1::DataType logitsDataType)
{
mGenerationLogitsHost = runtime::BufferManager::pinned(
runtime::ITensor::makeShape({getNumDraftTokens() + 1, vocabSizePadded}), logitsDataType);
}

[[nodiscard]] std::vector<TensorPtr> const& getGenerationLogitsFragments() const
{
return mGenerationLogitsFragments;
@@ -901,6 +907,18 @@ class GenericLlmRequest
result.generationLogits = executor::detail::ofITensor(getGenerationLogitsHost());
}

if (getReturnTargetModelAcceptedLogits())
{
auto targetModelAcceptedTokenLogitsShape = getGenerationLogitsHost()->getShape();
TLLM_CHECK(targetModelAcceptedTokenLogitsShape.nbDims == 2);
auto numAcceptedToken = targetModelAcceptedTokenLogitsShape.d[0];
auto vocabSizePadded = targetModelAcceptedTokenLogitsShape.d[1];
// Align the shape of accepted token logits and generation logits
TensorPtr targetModelAcceptedTokenLogitsHostView = runtime::ITensor::view(
getGenerationLogitsHost(), runtime::ITensor::makeShape({1, numAcceptedToken, vocabSizePadded}));
result.generationLogits = executor::detail::ofITensor(targetModelAcceptedTokenLogitsHostView);
}

if (getReturnEncoderOutput())
{
result.encoderOutput = executor::detail::ofITensor(getEncoderOutputHost());
@@ -1023,6 +1041,7 @@ class GenericLlmRequest
auto data = runtime::bufferCast<int32_t>(*tensor);
std::memcpy(data, words.data(), numWords * sizeof(int32_t));
std::memcpy(data + numWords, offsets.data(), numWords * sizeof(int32_t));

// Add leading dim of 1
tensor->unsqueeze(0);

14 changes: 11 additions & 3 deletions cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h
@@ -23,6 +23,7 @@
#include "tensorrt_llm/runtime/common.h"

#include <optional>
#include <utility>
#include <vector>

namespace tensorrt_llm::batch_manager
@@ -39,15 +40,19 @@ class TrtGptModelOptionalParams
bool enableTrtOverlap = false, std::optional<std::vector<SizeType32>> const& deviceIds = std::nullopt,
bool normalizeLogProbs = true, bool enableChunkedContext = false,
PeftCacheManagerConfig const& peftCacheManagerConfig = PeftCacheManagerConfig{},
executor::DecodingConfig const& decodingConfig = executor::DecodingConfig{}, float gpuWeightsPercent = 1)
executor::DecodingConfig decodingConfig = executor::DecodingConfig{}, float gpuWeightsPercent = 1,
std::optional<SizeType32> maxBeamWidth = std::nullopt,
executor::SchedulerConfig const& schedulerConfig = executor::SchedulerConfig{})
: kvCacheConfig{kvCacheConfig}
, enableTrtOverlap{enableTrtOverlap}
, deviceIds(deviceIds)
, normalizeLogProbs{normalizeLogProbs}
, enableChunkedContext{enableChunkedContext}
, peftCacheManagerConfig(peftCacheManagerConfig)
, decodingConfig(decodingConfig)
, decodingConfig(std::move(decodingConfig))
, gpuWeightsPercent(gpuWeightsPercent)
, maxBeamWidth(maxBeamWidth)
, schedulerConfig{schedulerConfig}
{
}

@@ -57,7 +62,8 @@ class TrtGptModelOptionalParams
executorConfig.getNormalizeLogProbs(), executorConfig.getEnableChunkedContext(),
PeftCacheManagerConfig(executorConfig.getPeftCacheConfig().value_or(executor::PeftCacheConfig())),
executorConfig.getDecodingConfig().value_or(executor::DecodingConfig{}),
executorConfig.getGpuWeightsPercent())
executorConfig.getGpuWeightsPercent(), executorConfig.getMaxBeamWidth(),
executorConfig.getSchedulerConfig())
{
}

@@ -80,6 +86,8 @@ class TrtGptModelOptionalParams
executor::DecodingConfig decodingConfig;
// Percentage of weights on the gpu at runtime
float gpuWeightsPercent;
std::optional<SizeType32> maxBeamWidth;
executor::SchedulerConfig schedulerConfig;
};

} // namespace tensorrt_llm::batch_manager
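
A short sketch of the `ExecutorConfig` construction path updated above (the setter names come from the public executor API and are assumptions, not part of this diff): the beam width and scheduler policy set on an `executor::ExecutorConfig` are now forwarded into the optional params alongside the fields that were already forwarded.

```cpp
#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"
#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;
using tensorrt_llm::batch_manager::TrtGptModelOptionalParams;

TrtGptModelOptionalParams makeOptionalParams()
{
    tle::ExecutorConfig executorConfig;
    executorConfig.setMaxBeamWidth(4);
    executorConfig.setSchedulerConfig(
        tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));

    // With this change, maxBeamWidth and schedulerConfig are carried over together with the
    // KV-cache, decoding, and GPU-weights settings that were already forwarded.
    return TrtGptModelOptionalParams(executorConfig);
}
```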