32 changes: 0 additions & 32 deletions cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h
@@ -24,7 +24,6 @@
 #include "tensorrt_llm/runtime/common.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
-#include "tensorrt_llm/runtime/request.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 namespace tensorrt_llm::runtime
@@ -88,37 +87,6 @@ class CreateNewDecoderRequests : Algorithm
         SizeType32 maxSequenceLength, OptionalRef<MedusaBuffers const> medusaBuffers) const;
 
 private:
-    //! @brief Setups decoder internal tensors for new speculative decoding request
-    static void newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream,
-        CudaStream const& decoderStream, SpeculativeDecodingMode const& speculativeDecodingMode,
-        SizeType32 maxDecodingEngineTokens);
-
-    //! @brief Setups decoder internal tensors for new request in Draft model Sps mode
-    static void newRequestDraftTokensExternal(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, DecodingInput& jointDecodingInput, CudaStream const& decoderStream);
-
-    //! @brief Setups decoder internal tensors for new Medusa request
-    static void newRequestMedusa(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, CudaStream const& decoderStream, SizeType32 maxDecodingEngineTokens);
-
-    //! @brief Setups decoder internal tensors for new Lookahead request
-    static void newRequestLookahead(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    //! @brief Setups decoder internal tensors for new Explicit draft tokens request
-    static void newRequestExplicitDraftTokens(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    //! @brief Setups decoder internal tensors for new Eagle request
-    static void newRequestEagle(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        runtime::ModelConfig const& modelConfig, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    [[nodiscard]] std::shared_ptr<runtime::ITensor> retrieveDraftLogits(runtime::ModelConfig const& modelConfig,
-        runtime::WorldConfig const& worldConfig, std::shared_ptr<runtime::ITensor> const& tensor,
-        runtime::BufferManager const& bufferManager) const;
-
     bool mSpeculativeDecodingFastLogits;
     bool mIsLeaderInOrchMode;
     bool mIsNormalizeLogProbs;
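The deleted helpers map one-to-one onto speculative decoding modes. As a rough illustration only, here is a minimal sketch of how such an entry point can dispatch to a mode-specific routine; the signatures are hypothetical stand-ins inferred from the declarations above, not the actual implementation (the real methods take batch index, request, configs, and CUDA streams):

#include <iostream>

// Hypothetical stand-ins for the removed per-mode helpers.
void newRequestDraftTokensExternal() { std::cout << "draft-tokens-external setup\n"; }
void newRequestMedusa() { std::cout << "medusa setup\n"; }
void newRequestLookahead() { std::cout << "lookahead setup\n"; }
void newRequestExplicitDraftTokens() { std::cout << "explicit-draft-tokens setup\n"; }
void newRequestEagle() { std::cout << "eagle setup\n"; }

enum class Mode { DraftTokensExternal, Medusa, Lookahead, ExplicitDraftTokens, Eagle };

// Assumed shape of the removed entry point: one dispatcher picks the
// mode-specific setup routine for a new request.
void newRequestSpeculativeDecoding(Mode mode)
{
    switch (mode)
    {
    case Mode::DraftTokensExternal: newRequestDraftTokensExternal(); break;
    case Mode::Medusa: newRequestMedusa(); break;
    case Mode::Lookahead: newRequestLookahead(); break;
    case Mode::ExplicitDraftTokens: newRequestExplicitDraftTokens(); break;
    case Mode::Eagle: newRequestEagle(); break;
    }
}

int main()
{
    newRequestSpeculativeDecoding(Mode::Medusa);
    return 0;
}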
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -1110,7 +1110,7 @@ class GenericLlmRequest
 
     [[nodiscard]] SizeType32 getNumDraftTokens() const
     {
-        return mDraftTokens->size();
+        return hasDraftTokens() ? mDraftTokens->size() : 0;
     }
 
     void discardDraftTokens(SizeType32 numTokensToDiscard)
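The guard matters because a request that never had draft tokens attached can leave the underlying pointer empty, so the unconditional size() call dereferenced null. A minimal self-contained sketch of the pattern, with hypothetical types standing in for the actual GenericLlmRequest:

#include <cassert>
#include <memory>
#include <vector>

// Hypothetical stand-in for the request type; only the draft-token handling
// is sketched here.
struct RequestSketch
{
    std::shared_ptr<std::vector<int>> mDraftTokens;

    [[nodiscard]] bool hasDraftTokens() const
    {
        return mDraftTokens && !mDraftTokens->empty();
    }

    [[nodiscard]] int getNumDraftTokens() const
    {
        // Before the fix: `return mDraftTokens->size();` dereferences a null
        // shared_ptr whenever no draft tokens were ever attached.
        return hasDraftTokens() ? static_cast<int>(mDraftTokens->size()) : 0;
    }
};

int main()
{
    RequestSketch request{}; // no draft tokens attached
    assert(request.getNumDraftTokens() == 0);
    return 0;
}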
2 changes: 2 additions & 0 deletions cpp/include/tensorrt_llm/runtime/decodingInput.h
@@ -102,11 +102,13 @@ class DecodingInput
     {
     public:
         TensorPtr draftLogits;
+        TensorPtr draftLogitsHost;
         TensorPtr draftProbs;
         TensorPtr targetProbs;
         TensorPtr numDraftTokens;
         TensorPtr numDraftTokensHost;
         TensorPtr draftTokenIds;
+        TensorPtr draftTokenIdsHost;
         TensorPtr useDraftLogits;
         TensorPtr useDraftLogitsHost;
 
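The new Host members follow the existing numDraftTokensHost/useDraftLogitsHost pattern: each device tensor gets a host-side mirror, so setup code can fill values on the CPU and push them to the device with one async copy instead of synchronizing the stream. A minimal sketch of that pattern using the raw CUDA runtime, with hypothetical names rather than the TensorRT-LLM buffer API:

#include <cuda_runtime.h>
#include <cstdint>

int main()
{
    int constexpr maxDraftTokens = 8;

    std::int32_t* draftTokenIdsHost = nullptr; // pinned host mirror
    std::int32_t* draftTokenIds = nullptr;     // device tensor

    cudaMallocHost(&draftTokenIdsHost, maxDraftTokens * sizeof(std::int32_t));
    cudaMalloc(&draftTokenIds, maxDraftTokens * sizeof(std::int32_t));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // CPU-side request setup writes into the host mirror...
    for (int i = 0; i < maxDraftTokens; ++i)
    {
        draftTokenIdsHost[i] = 100 + i;
    }

    // ...and the values reach the device with a single async H2D copy
    // enqueued on the decoder stream.
    cudaMemcpyAsync(draftTokenIds, draftTokenIdsHost, maxDraftTokens * sizeof(std::int32_t),
        cudaMemcpyHostToDevice, stream);

    cudaStreamSynchronize(stream);
    cudaFree(draftTokenIds);
    cudaFreeHost(draftTokenIdsHost);
    cudaStreamDestroy(stream);
    return 0;
}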
54 changes: 0 additions & 54 deletions cpp/include/tensorrt_llm/runtime/request.h

This file was deleted.
