Update TensorRT-LLM #2363

Merged: 1 commit, Oct 22, 2024
2 changes: 2 additions & 0 deletions .gitignore
@@ -55,3 +55,5 @@ cpp/include/tensorrt_llm/executor/version.h

# User config files
CMakeUserPresets.json
compile_commands.json
*.bin
15 changes: 9 additions & 6 deletions benchmarks/python/enc_dec_benchmark.py
@@ -225,7 +225,8 @@ def set_weight_streaming(self, config):
self.decoder_session.runtime._set_weight_streaming(gpu_weights_percent)

def prepare_inputs(self, config):
batch_size, encoder_input_len = config[0], config[1]
batch_size, encoder_input_len, output_len = config[0], config[
1], config[2]
attention_mask = None
whisper_decoder_encoder_input_lengths = None
outputs = {}
@@ -271,7 +272,8 @@ def prepare_inputs(self, config):
dtype=torch.int32,
device='cuda')
cross_attention_mask = torch.ones([
outputs['encoder_output'].shape[0], 1,
outputs['encoder_output'].shape[0],
decoder_input_lengths.max() + output_len,
outputs['encoder_output'].shape[1]
]).int().cuda()
else:
@@ -297,8 +299,11 @@ def prepare_inputs(self, config):
(batch_size, encoder_input_len)).int().cuda()
# cross attention mask, always set 1 as if all are valid tokens
# [batch_size, query_len, encoder_input_len] currently, use query_len=1
cross_attention_mask = torch.ones(
(batch_size, 1, encoder_input_len)).int().cuda()
cross_attention_mask = [
torch.ones(decoder_input_lengths.max() + output_len,
encoder_input_len).int().cuda()
for _ in range(batch_size)
]

hidden_size = (self.encoder_model_config.hidden_size *
self.world_size) # tp_size
@@ -396,8 +401,6 @@ def run(self, inputs, config, benchmark_profiler=None):
encoder_max_input_length=encoder_max_input_length,
)

cross_attention_mask = None if self.decoder_model_config.gpt_attention_plugin else cross_attention_mask

self.decoder_session.decode(
decoder_input_ids,
decoder_input_lengths,
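The benchmark change above replaces the single [batch_size, 1, encoder_input_len] all-ones mask with one mask per sample, each shaped [decoder_input_lengths.max() + output_len, encoder_input_len]. A minimal C++ sketch of that shape bookkeeping, with plain nested vectors standing in for the benchmark's CUDA tensors and purely illustrative sizes:

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    // Illustrative sizes only; the benchmark reads these from its config.
    std::size_t const batchSize = 2;
    std::size_t const encoderInputLen = 16;
    std::size_t const maxDecoderInputLen = 4;
    std::size_t const outputLen = 8;

    // One all-ones mask per batch entry, covering every decoder step
    // (prefill + generation) against every encoder position.
    std::vector<std::vector<std::vector<int>>> crossAttentionMask(
        batchSize,
        std::vector<std::vector<int>>(maxDecoderInputLen + outputLen,
            std::vector<int>(encoderInputLen, 1)));

    std::printf("%zu masks of shape [%zu, %zu]\n", crossAttentionMask.size(),
        crossAttentionMask[0].size(), crossAttentionMask[0][0].size());
    return 0;
}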
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
@@ -543,7 +543,7 @@ endif()

# Defer UCX/UCXX setup until after USE_CXX11_ABI is well defined, as UCXX will
# need to be built to have aligned symbols
set_ifndef(ENABLE_UCX 0)
set_ifndef(ENABLE_UCX 1)
if(ENABLE_UCX)
# Only enable UCX related features if the system has UCX library
find_package(ucx)
102 changes: 78 additions & 24 deletions cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -106,6 +106,7 @@ class GenericLlmRequest
executor::PriorityType priority = executor::Request::kDefaultPriority,
std::optional<TensorPtr> encoderInputFeatures = std::nullopt,
std::optional<SizeType32> encoderOutputLength = std::nullopt,
std::optional<TensorPtr> crossAttentionMask = std::nullopt,
LlmRequestType llmRequestType = LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt,
SizeType32 numReturnSequences = 1)
@@ -150,6 +151,7 @@ class GenericLlmRequest
, mFinishReasons(samplingConfig.beamWidth)
, mEncoderInputFeatures(std::move(encoderInputFeatures))
, mEncoderOutputLength(encoderOutputLength)
, mCrossAttentionMask(std::move(crossAttentionMask))
, mLlmRequestType(llmRequestType)
, mInputTokenExtraIds(std::move(inputTokenExtraIds))
, mNumReturnSequences(numReturnSequences)
@@ -205,7 +207,7 @@ class GenericLlmRequest
, mEncoderOutputLength(req.getEncoderOutputLength())
, mContextPhaseParams(req.getContextPhaseParams())
, mInputTokenExtraIds(std::nullopt)
, mNumReturnSequences(req.getNumReturnSequences())
, mNumReturnSequences(1)
, mSequenceIndex(0)
{
if (req.getRequestType() == executor::RequestType::REQUEST_TYPE_GENERATION_ONLY)
@@ -243,7 +245,8 @@ class GenericLlmRequest

if (req.getEmbeddingBias())
{
mEmbeddingBias = executor::detail::toITensor(req.getEmbeddingBias().value());
mEmbeddingBias
= tensorrt_llm::runtime::ITensor::view(executor::detail::toITensor(req.getEmbeddingBias().value()));
// Add leading 1 dimension since that's what IFB code expects
mEmbeddingBias.value()->unsqueeze(0);
}
@@ -324,6 +327,16 @@ class GenericLlmRequest
mEncoderInputFeatures = std::nullopt;
}

auto const& crossAttentionMask = req.getCrossAttentionMask();
if (crossAttentionMask.has_value())
{
mCrossAttentionMask = executor::detail::toITensor(crossAttentionMask.value());
}
else
{
mCrossAttentionMask = std::nullopt;
}

switch (req.getRequestType())
{
case executor::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION:
@@ -393,15 +406,6 @@ class GenericLlmRequest
mMaxNewTokens = maxNewTokens;
}

if (mNumReturnSequences > 1 && mSamplingConfig.beamWidth > 1)
{
TLLM_THROW(
"Using mNumReturnSequences (%d) > 1 with beam search is currently disabled, since TensorRT-LLM returns "
"a total of mNumReturnSequences x beamWidth beams, rather than limiting the number of returned beams "
"to mNumReturnSequences. This restriction will be removed once the issue is resolved.",
mNumReturnSequences);
}

TLLM_CHECK_WITH_INFO(mSamplingConfig.validate(), "Incorrect sampling config");

// validate extra ids when enabling kv cache reuse with prompt table
@@ -452,9 +456,20 @@ class GenericLlmRequest
/// @return The number of sequences to return.
[[nodiscard]] SizeType32 getNumReturnSequences() const
{
TLLM_LOG_WARNING(
"mNumReturnSequences in the LlmRequest class is deprecated. Please use numReturnSequences in "
"SamplingConfig directly.");
return mNumReturnSequences;
}

/// @brief Get the number of subrequests, the expected number of responses under non-streaming mode. In sampling
/// mode, it will be equal to mSamplingConfig.numReturnSequences, while it will be equal to 1 in beam search.
/// @return The number of subrequests in total request size.
[[nodiscard]] SizeType32 getNumSubRequests() const
{
return mSamplingConfig.beamWidth == 1 ? mSamplingConfig.numReturnSequences.value_or(1) : 1;
}

/// @brief Get child requests spawned by this req.
/// @return A vector of child requests.
[[nodiscard]] std::vector<RequestPtr> const& getChildRequests() const
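A standalone sketch of the subrequest accounting introduced in the hunk above (getNumSubRequests), using a stand-in config type rather than TensorRT-LLM's real SamplingConfig: with beamWidth == 1 a request fans out into numReturnSequences sampling subrequests, while beam search keeps a single subrequest that carries all beams.

#include <cstdio>
#include <optional>

// Stand-in for the real SamplingConfig; illustration only.
struct SamplingConfigSketch
{
    int beamWidth = 1;
    std::optional<int> numReturnSequences;
};

// Mirrors GenericLlmRequest::getNumSubRequests() from the hunk above.
int getNumSubRequests(SamplingConfigSketch const& cfg)
{
    return cfg.beamWidth == 1 ? cfg.numReturnSequences.value_or(1) : 1;
}

int main()
{
    SamplingConfigSketch sampling{/*beamWidth=*/1, /*numReturnSequences=*/3};
    SamplingConfigSketch beamSearch{/*beamWidth=*/4, /*numReturnSequences=*/3};
    std::printf("sampling: %d subrequests\n", getNumSubRequests(sampling));      // 3
    std::printf("beam search: %d subrequest\n", getNumSubRequests(beamSearch));  // 1
    return 0;
}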
@@ -661,8 +676,8 @@ class GenericLlmRequest
TLLM_CHECK_WITH_INFO(mChildRequests.size() <= static_cast<size_t>(numReturnSequences),
"Cannot set numReturnSequences %d smaller than the number %ld of child requests that have already created.",
numReturnSequences, mChildRequests.size());
mNumReturnSequences = numReturnSequences;
mSequenceFinalVec->resize(mNumReturnSequences);
mSamplingConfig.numReturnSequences = numReturnSequences;
mSequenceFinalVec->resize(numReturnSequences);
}

[[nodiscard]] bool constexpr isChild() const noexcept
@@ -1021,6 +1036,11 @@ class GenericLlmRequest
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

[[nodiscard]] TensorPtr const getCrossAttentionMask() const
{
return mCrossAttentionMask.value_or(nullptr);
}

[[nodiscard]] bool constexpr isStreaming() const noexcept
{
return mIsStreaming;
@@ -1267,6 +1287,12 @@ class GenericLlmRequest
return mPriority;
}

/// Get the counter of decoding iterations.
SizeType32 getDecodingIter()
{
return mDecodingIter;
}

/// Increment the counter of decoding iterations.
void advanceDecodingIter()
{
@@ -1307,7 +1333,6 @@ class GenericLlmRequest
result.isFinal = std::all_of(mSequenceFinalVec->begin(), mSequenceFinalVec->end(),
[](bool isSequenceFinal) { return isSequenceFinal; });

auto const nbBeams = mSamplingConfig.beamWidth;
auto const maxNbTokens = getMaxBeamNumTokens();

if (isDisaggContextTransmissionState() && isContextOnlyRequest())
@@ -1335,6 +1360,8 @@ class GenericLlmRequest

auto const maxNbTokensOut = calculateNbTokensOut(maxNbTokens);

auto const nbBeams = mSamplingConfig.getNumReturnBeams();

result.outputTokenIds.resize(nbBeams);

auto const startTokenPos = maxNbTokens - maxNbTokensOut;
@@ -1359,10 +1386,13 @@ class GenericLlmRequest
}
}

auto sliceBeams = [&nbBeams](auto beams)
{ return std::vector<typename decltype(beams)::value_type>(beams.begin(), beams.begin() + nbBeams); };

if (returnLogProbs())
{
result.cumLogProbs = getCumLogProbs();
result.logProbs = getLogProbs();
result.cumLogProbs = sliceBeams(getCumLogProbs());
result.logProbs = sliceBeams(getLogProbs());
}

if (getReturnContextLogits())
@@ -1372,7 +1402,8 @@ class GenericLlmRequest

if (getReturnGenerationLogits())
{
if (isStreaming())
bool hasDraftTokens = (mDraftTokens && mDraftTokens->size() > 0) ? true : false;
if (isStreaming() && !hasDraftTokens)
{
auto startGenTokenPos = startTokenPos - getOrigPromptLen();
TensorPtr generationLogitsHostCurrentStep
@@ -1386,7 +1417,8 @@ class GenericLlmRequest
}
else
{
result.generationLogits = executor::detail::ofITensor(getGenerationLogitsHost());
result.generationLogits = executor::detail::ofITensor(
runtime::ITensor::slice(getGenerationLogitsHost(), 0, nbBeams));
}
}

@@ -1395,7 +1427,7 @@ class GenericLlmRequest
result.encoderOutput = executor::detail::ofITensor(getEncoderOutputHost());
}

result.finishReasons = mFinishReasons;
result.finishReasons = sliceBeams(mFinishReasons);
result.decodingIter = mDecodingIter;

// Update position of last sent response
@@ -1560,6 +1592,7 @@ class GenericLlmRequest
std::optional<SizeType32>
mEncoderOutputLength; // For some models like Whisper, encoder output shape cannot be inferred from encoder
// input shape due to downsampling. Thus this is needed for setting buffer sizes correctly
std::optional<TensorPtr> mCrossAttentionMask; // Input cross attention mask
LlmRequestType mLlmRequestType;
std::optional<executor::ContextPhaseParams> mContextPhaseParams;

@@ -1644,10 +1677,30 @@ class GenericLlmRequest

setReturnLogProbs(outputLogProbs);

// Handling the backward compatibility of numReturnSequences.
if (mNumReturnSequences > 1)
{
if (!mSamplingConfig.numReturnSequences)
{
TLLM_LOG_WARNING(
"In the Executor class, mNumReturnSequences is deprecated. Please set numReturnSequences in "
"SamplingConfig directly.");
}
else if (mSamplingConfig.numReturnSequences
&& mSamplingConfig.numReturnSequences.value() != mNumReturnSequences)
{
TLLM_THROW(
"In the Executor class, both mSamplingConfig.numReturnSequences (%d) and mNumReturnSequences (%d) "
"are provided but unmatched. Please use numReturnSequences in SamplingConfig directly.",
mSamplingConfig.numReturnSequences.value(), mNumReturnSequences);
}
mSamplingConfig.numReturnSequences = mNumReturnSequences;
}

if (!isChild())
{
// Initialize result states unless it is a child and a child request should share parent's one.
mSequenceFinalVec = std::make_shared<std::vector<bool>>(getNumReturnSequences(), false);
mSequenceFinalVec = std::make_shared<std::vector<bool>>(getNumSubRequests(), false);
}
}
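A standalone sketch of the backward-compatibility rule added in the hunk above, with plain types in place of the real request and config classes: the deprecated request-level numReturnSequences is folded into the sampling config, warning when only the deprecated field is set and failing when the two values disagree.

#include <cstdio>
#include <optional>
#include <stdexcept>

// Illustration only: fold a deprecated request-level numReturnSequences into
// the value carried by the sampling config, mirroring the logic above.
void reconcileNumReturnSequences(int deprecatedValue, std::optional<int>& configValue)
{
    if (deprecatedValue > 1)
    {
        if (!configValue)
        {
            std::fprintf(stderr,
                "numReturnSequences on the request is deprecated; set it in SamplingConfig instead.\n");
        }
        else if (configValue.value() != deprecatedValue)
        {
            throw std::runtime_error("numReturnSequences set in both places but unmatched");
        }
        configValue = deprecatedValue;
    }
}

int main()
{
    std::optional<int> configValue;              // not set in the sampling config
    reconcileNumReturnSequences(4, configValue); // warns, then copies 4 over
    std::printf("numReturnSequences = %d\n", configValue.value_or(1));
    return 0;
}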

@@ -1715,6 +1768,7 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
executor::PriorityType priority = executor::Request::kDefaultPriority,
std::optional<TensorPtr> encoderInputFeatures = std::nullopt,
std::optional<SizeType32> encoderOutputLength = std::nullopt,
std::optional<TensorPtr> crossAttentionMask = std::nullopt,
LlmRequestType llmRequestType = LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt,
SizeType32 numReturnSequences = 1)
@@ -1724,8 +1778,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
std::move(lookaheadConfig), returnLogProbs, returnContextLogits, returnGenerationLogits,
std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
std::move(encoderInputFeatures), std::move(encoderOutputLength), llmRequestType,
std::move(inputTokenExtraIds), numReturnSequences)
std::move(encoderInputFeatures), std::move(encoderOutputLength), std::move(crossAttentionMask),
llmRequestType, std::move(inputTokenExtraIds), numReturnSequences)
{
}

@@ -1742,8 +1796,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
std::shared_ptr<LlmRequest> createChildRequest(RequestIdType requestId)
{
TLLM_CHECK_WITH_INFO(!isChild(), "A child request cannot create its own child.");
TLLM_CHECK_WITH_INFO(mChildRequests.size() + 1 < static_cast<size_t>(getNumReturnSequences()),
"Cannot create child requests more than the number of return sequences (%d)", getNumReturnSequences());
TLLM_CHECK_WITH_INFO(mChildRequests.size() + 1 < static_cast<size_t>(getNumSubRequests()),
"Cannot create child requests more than the number of return sequences (%d)", getNumSubRequests());
auto childReq = std::make_shared<LlmRequest>(*this);
childReq->mRequestId = requestId;
childReq->mSequenceIndex = mChildRequests.size() + 1;
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/common/cudaUtils.h
@@ -177,8 +177,8 @@ inline void syncAndCheck(char const* const file, int const line)
{
if (doCheckError())
{
cudaDeviceSynchronize();
check(cudaGetLastError(), "cudaGetLastError", file, line);
check(cudaDeviceSynchronize(), "cudaDeviceSynchronize", file, line);
}
}

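The cudaUtils.h change above checks the return value of cudaDeviceSynchronize() instead of discarding it, and checks for pending launch errors before synchronizing. A minimal CUDA/C++ sketch of the same pattern; the helper names here are illustrative, not the TensorRT-LLM macros:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Illustrative helper; TensorRT-LLM uses its own check() machinery with file/line reporting.
static void checkCuda(cudaError_t status, char const* what, char const* file, int line)
{
    if (status != cudaSuccess)
    {
        std::fprintf(stderr, "%s failed at %s:%d: %s\n", what, file, line, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

// Same pattern as the updated syncAndCheck(): surface launch errors first,
// then check the status returned by the synchronization itself.
static void syncAndCheckSketch(char const* file, int line)
{
    checkCuda(cudaGetLastError(), "cudaGetLastError", file, line);
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize", file, line);
}

int main()
{
    syncAndCheckSketch(__FILE__, __LINE__);
    return 0;
}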
1 change: 1 addition & 0 deletions cpp/include/tensorrt_llm/common/mpiUtils.h
@@ -27,6 +27,7 @@
#include <cuda_bf16.h>
#endif

#include <cstdint>
#include <cstdlib>
#include <memory>
