Merged
1 change: 0 additions & 1 deletion cpp/include/tensorrt_llm/runtime/gptDecoder.h
@@ -20,7 +20,6 @@
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/decodingInput.h"
#include "tensorrt_llm/runtime/decodingOutput.h"
#include "tensorrt_llm/runtime/request.h"
#include "tensorrt_llm/runtime/samplingConfig.h"

#include <NvInferRuntime.h>
59 changes: 3 additions & 56 deletions cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp
@@ -20,18 +20,13 @@
#include "tensorrt_llm/batch_manager/assignReqSeqSlots.h"
#include "tensorrt_llm/batch_manager/capacityScheduler.h"
#include "tensorrt_llm/batch_manager/createNewDecoderRequests.h"
#include "tensorrt_llm/batch_manager/handleContextLogits.h"
#include "tensorrt_llm/batch_manager/handleGenerationLogits.h"
#include "tensorrt_llm/batch_manager/kvCacheManager.h"
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/batch_manager/logitsPostProcessor.h"
#include "tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h"
#include "tensorrt_llm/batch_manager/medusaBuffers.h"
#include "tensorrt_llm/batch_manager/microBatchScheduler.h"
#include "tensorrt_llm/batch_manager/pauseRequests.h"
#include "tensorrt_llm/batch_manager/peftCacheManager.h"
#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
#include "tensorrt_llm/batch_manager/updateDecoderBuffers.h"
#include "tensorrt_llm/nanobind/common/customCasters.h"
#include "tensorrt_llm/runtime/decoderState.h"
#include "tensorrt_llm/runtime/torch.h"
@@ -94,48 +89,6 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
nb::arg("generation_requests"), nb::arg("model_config"), nb::arg("cross_kv_cache_manager") = std::nullopt)
.def("name", [](AllocateKvCache const&) { return AllocateKvCache::name; });

nb::class_<HandleContextLogits>(m, HandleContextLogits::name)
.def(nb::init<>())
.def(
"__call__",
[](HandleContextLogits const& self, DecoderInputBuffers& inputBuffers, RequestVector const& contextRequests,
at::Tensor const& logits, std::vector<tr::SizeType32> const& numContextLogitsVec,
tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
{
return self(inputBuffers, contextRequests, tr::TorchView::of(logits), numContextLogitsVec, modelConfig,
manager, medusaBuffers);
},
nb::arg("decoder_input_buffers"), nb::arg("context_requests"), nb::arg("logits"),
nb::arg("num_context_logits"), nb::arg("model_config"), nb::arg("buffer_manager"),
nb::arg("medusa_buffers") = std::nullopt)
.def("name", [](HandleContextLogits const&) { return HandleContextLogits::name; });

nb::class_<HandleGenerationLogits>(m, HandleGenerationLogits::name)
.def(nb::init<>())
.def(
"__call__",
[](HandleGenerationLogits const& self, DecoderInputBuffers& inputBuffers,
RequestVector const& generationRequests, at::Tensor const& logits, tr::SizeType32 logitsIndex,
tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
OptionalRef<RuntimeBuffers> genRuntimeBuffers = std::nullopt,
OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
{
self(inputBuffers, generationRequests, tr::TorchView::of(logits), logitsIndex, modelConfig, manager,
genRuntimeBuffers, medusaBuffers);
},
nb::arg("decoder_input_buffers"), nb::arg("generation_requests"), nb::arg("logits"),
nb::arg("logits_index"), nb::arg("model_config"), nb::arg("buffer_manager"),
nb::arg("gen_runtime_buffers") = std::nullopt, nb::arg("medusa_buffers") = std::nullopt)
.def("name", [](HandleGenerationLogits const&) { return HandleGenerationLogits::name; });

nb::class_<MakeDecodingBatchInputOutput>(m, MakeDecodingBatchInputOutput::name)
.def(nb::init<>())
.def("__call__", &MakeDecodingBatchInputOutput::operator(), nb::arg("decoder_input_buffers"),
nb::arg("decoder_state"), nb::arg("model_config"), nb::arg("max_num_sequences"),
nb::arg("fused_runtime_buffers") = std::nullopt)
.def("name", [](MakeDecodingBatchInputOutput const&) { return MakeDecodingBatchInputOutput::name; });

nb::class_<LogitsPostProcessor>(m, LogitsPostProcessor::name)
.def(nb::init<>())
.def("__call__", &LogitsPostProcessor::operator(), nb::arg("decoder_input_buffers"),
@@ -154,8 +107,9 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
tensorrt_llm::runtime::CudaStream const& runtimeStream,
tensorrt_llm::runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength,
-SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt)
+SizeType32 beamWidth)
{
+OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt;
auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs] = self(modelConfig,
worldConfig, decodingConfig, contextRequests, bufferManager, logitsType, inputBuffers, decoderState,
runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
@@ -166,13 +120,6 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
nb::arg("model_config"), nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("context_requests"),
nb::arg("buffer_manager"), nb::arg("logits_type"), nb::arg("decoder_input_buffers"),
nb::arg("decoder_state"), nb::arg("runtime_stream"), nb::arg("decoder_stream"),
nb::arg("max_sequence_length"), nb::arg("beam_width"), nb::arg("medusa_buffers") = std::nullopt)
nb::arg("max_sequence_length"), nb::arg("beam_width"))
.def("name", [](CreateNewDecoderRequests const&) { return CreateNewDecoderRequests::name; });

nb::class_<UpdateDecoderBuffers>(m, UpdateDecoderBuffers::name)
.def(nb::init<>())
.def("__call__", &UpdateDecoderBuffers::operator(), nb::arg("model_config"), nb::arg("decoder_output_buffers"),
nb::arg("copy_buffer_manager"), nb::arg("decoder_state"), nb::arg("return_log_probs"),
nb::arg("decoder_finish_event"))
.def("name", [](UpdateDecoderBuffers const&) { return UpdateDecoderBuffers::name; });
}
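
For context, the Python-visible effect of this hunk is that CreateNewDecoderRequests.__call__ no longer accepts a medusa_buffers keyword; the buffers are now defaulted inside the lambda. Below is a minimal sketch of the updated call. Only the keyword names and the removed argument come from the nb::arg list above; the import path, the pre-built config/buffer/stream objects, and the four-way return unpacking are assumptions, not part of this diff.

    # Hypothetical usage sketch of the updated binding surface.
    from tensorrt_llm.bindings.internal import algorithms  # assumed path

    def make_decoder_requests(ctx):
        creator = algorithms.CreateNewDecoderRequests()
        # The four-way unpacking mirrors the C++ structured binding above.
        batch_slots, sampling_configs, lookahead_prompt, lookahead_algo_configs = creator(
            model_config=ctx.model_config,
            world_config=ctx.world_config,
            decoding_config=ctx.decoding_config,
            context_requests=ctx.context_requests,
            buffer_manager=ctx.buffer_manager,
            logits_type=ctx.logits_type,
            decoder_input_buffers=ctx.decoder_input_buffers,
            decoder_state=ctx.decoder_state,
            runtime_stream=ctx.runtime_stream,
            decoder_stream=ctx.decoder_stream,
            max_sequence_length=ctx.max_sequence_length,
            beam_width=ctx.beam_width,
        )
        # Passing medusa_buffers=... here would now raise a TypeError.
        return batch_slots, sampling_configs, lookahead_prompt, lookahead_algo_configs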
9 changes: 0 additions & 9 deletions cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
@@ -20,11 +20,9 @@

#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/batch_manager/decoderBuffers.h"
#include "tensorrt_llm/batch_manager/medusaBuffers.h"
#include "tensorrt_llm/batch_manager/microBatchScheduler.h"
#include "tensorrt_llm/batch_manager/peftCacheManager.h"
#include "tensorrt_llm/batch_manager/rnnStateManager.h"
#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
#include "tensorrt_llm/batch_manager/sequenceSlotManager.h"
#include "tensorrt_llm/nanobind/common/bindTypes.h"
#include "tensorrt_llm/runtime/gptDecoderBatched.h"
@@ -419,13 +417,6 @@ void initBindings(nb::module_& m)
.def_rw("log_probs_host", &tb::SlotDecoderBuffers::logProbsHost)
.def_rw("finish_reasons_host", &tb::SlotDecoderBuffers::finishReasonsHost);

nb::class_<tb::MedusaBuffers>(m, "MedusaBuffers")
.def(nb::init<runtime::SizeType32, runtime::SizeType32, runtime::BufferManager const&,
runtime::ModelConfig const&, runtime::WorldConfig const&, executor::DecodingConfig const&,
runtime::TllmRuntime const&>(),
nb::arg("max_beam_width"), nb::arg("max_seq_len"), nb::arg("buffer_manager"), nb::arg("model_config"),
nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("runtime"));

m.def(
"add_new_tokens_to_requests",
[](std::vector<std::shared_ptr<tb::LlmRequest>>& requests,
1 change: 0 additions & 1 deletion cpp/tensorrt_llm/nanobind/bindings.cpp
@@ -50,7 +50,6 @@

namespace nb = nanobind;
namespace tb = tensorrt_llm::batch_manager;
namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager;
namespace tpb = tensorrt_llm::nanobind::batch_manager;
namespace tc = tensorrt_llm::common;
namespace tr = tensorrt_llm::runtime;
7 changes: 2 additions & 5 deletions cpp/tensorrt_llm/nanobind/common/customCasters.h
@@ -21,14 +21,11 @@
#include "tensorrt_llm/batch_manager/decoderBuffers.h"
#include "tensorrt_llm/common/optionalRef.h"
#include "tensorrt_llm/runtime/cudaStream.h"
#include "tensorrt_llm/runtime/request.h"
#include "tensorrt_llm/runtime/samplingConfig.h"
#include "tensorrt_llm/runtime/torch.h"
#include "tensorrt_llm/runtime/torchView.h"

#include <ATen/DLConvertor.h>
#include <deque>
#include <filesystem>
#include <nanobind/nanobind.h>
#include <nanobind/stl/filesystem.h>
#include <nanobind/stl/optional.h>
@@ -38,7 +35,8 @@
#include <torch/csrc/autograd/variable.h>
#include <torch/extension.h>
#include <torch/torch.h>
-#include <vector>
+
+#include <deque>

// Pybind requires to have a central include in order for type casters to work.
// Opaque bindings add a type caster, so they have the same requirement.
@@ -47,7 +45,6 @@
// Opaque bindings
NB_MAKE_OPAQUE(tensorrt_llm::batch_manager::ReqIdsSet)
NB_MAKE_OPAQUE(std::vector<tensorrt_llm::batch_manager::SlotDecoderBuffers>)
NB_MAKE_OPAQUE(std::vector<tensorrt_llm::runtime::decoder_batch::Request>)
NB_MAKE_OPAQUE(std::vector<tensorrt_llm::runtime::SamplingConfig>)

namespace nb = nanobind;
20 changes: 0 additions & 20 deletions cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
@@ -36,7 +36,6 @@
#include "tensorrt_llm/runtime/lookaheadBuffers.h"
#include "tensorrt_llm/runtime/loraCache.h"
#include "tensorrt_llm/runtime/mcastGPUBuffer.h"
#include "tensorrt_llm/runtime/request.h"
#include "tensorrt_llm/runtime/speculativeDecodingMode.h"
#include "tensorrt_llm/runtime/tllmRuntime.h"
#include "tensorrt_llm/runtime/torchView.h"
@@ -158,25 +157,6 @@ void initBindings(nb::module_& m)
.def_prop_ro("logits_dtype_from_engine",
[](tr::TllmRuntime& self) { return self.getEngine().getTensorDataType("logits"); });

nb::class_<tr::decoder_batch::Request>(m, "Request")
.def(nb::init<tr::decoder_batch::Request::TensorConstPtr, tr::SizeType32, std::optional<tr::SizeType32>,
std::optional<tr::SizeType32>>(),
nb::arg("ids"), nb::arg("input_len"), nb::arg("max_new_tokens") = std::nullopt,
nb::arg("end_id") = std::nullopt)
.def_rw("ids", &tr::decoder_batch::Request::ids)
.def_rw("input_len", &tr::decoder_batch::Request::inputLen)
.def_rw("max_new_tokens", &tr::decoder_batch::Request::maxNewTokens)
.def_rw("end_id", &tr::decoder_batch::Request::endId)
.def_rw("draft_logits", &tr::decoder_batch::Request::draftLogits)
.def_rw("embedding_bias", &tr::decoder_batch::Request::embeddingBias)
.def_rw("bad_words_list", &tr::decoder_batch::Request::badWordsList)
.def_rw("stop_words_list", &tr::decoder_batch::Request::stopWordsList)
.def_rw("generated_tokens_per_engine_step", &tr::decoder_batch::Request::generatedTokensPerEngineStep)
.def_rw("medusa_paths", &tr::decoder_batch::Request::medusaPaths)
.def_rw("medusa_tree_ids", &tr::decoder_batch::Request::medusaTreeIds)
.def_rw("lookahead_runtime_config", &tr::decoder_batch::Request::lookaheadRuntimeConfig);
nb::bind_vector<std::vector<tr::decoder_batch::Request>>(m, "RequestVector");

nb::class_<tr::decoder_batch::Input>(m, "DecoderBatchInput")
.def(nb::init<std::vector<std::vector<tr::ITensor::SharedConstPtr>>, tr::SizeType32>(), nb::arg("logits"),
nb::arg("max_decoding_engine_tokens"))
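
With the decoder_batch::Request class binding and its RequestVector opaque vector dropped here, any Python code still constructing runtime Request objects has to migrate, while DecoderBatchInput keeps the (logits, max_decoding_engine_tokens) constructor shown above. A hedged migration check, assuming the module path below (an assumption, not shown in this diff):

    # Hypothetical smoke test: these attributes should be gone after this change.
    from tensorrt_llm.bindings.internal import runtime  # assumed path

    assert not hasattr(runtime, "Request")
    assert not hasattr(runtime, "RequestVector")
    # runtime.DecoderBatchInput remains bound and is unaffected.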
1 change: 0 additions & 1 deletion cpp/tensorrt_llm/pybind/CMakeLists.txt
@@ -6,7 +6,6 @@ set(TRTLLM_PYBIND_MODULE
set(SRCS
batch_manager/algorithms.cpp
batch_manager/bindings.cpp
batch_manager/buffers.cpp
batch_manager/cacheTransceiver.cpp
batch_manager/kvCacheManager.cpp
batch_manager/llmRequest.cpp
61 changes: 4 additions & 57 deletions cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
@@ -1,5 +1,5 @@
/*
-* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+* SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,18 +20,13 @@
#include "tensorrt_llm/batch_manager/assignReqSeqSlots.h"
#include "tensorrt_llm/batch_manager/capacityScheduler.h"
#include "tensorrt_llm/batch_manager/createNewDecoderRequests.h"
#include "tensorrt_llm/batch_manager/handleContextLogits.h"
#include "tensorrt_llm/batch_manager/handleGenerationLogits.h"
#include "tensorrt_llm/batch_manager/kvCacheManager.h"
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/batch_manager/logitsPostProcessor.h"
#include "tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h"
#include "tensorrt_llm/batch_manager/medusaBuffers.h"
#include "tensorrt_llm/batch_manager/microBatchScheduler.h"
#include "tensorrt_llm/batch_manager/pauseRequests.h"
#include "tensorrt_llm/batch_manager/peftCacheManager.h"
#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
#include "tensorrt_llm/batch_manager/updateDecoderBuffers.h"
#include "tensorrt_llm/runtime/decoderState.h"
#include "tensorrt_llm/runtime/torch.h"
#include "tensorrt_llm/runtime/torchView.h"
@@ -96,48 +91,6 @@ void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::mod
py::arg("generation_requests"), py::arg("model_config"), py::arg("cross_kv_cache_manager") = std::nullopt)
.def("name", [](AllocateKvCache const&) { return AllocateKvCache::name; });

py::class_<HandleContextLogits>(m, HandleContextLogits::name)
.def(py::init())
.def(
"__call__",
[](HandleContextLogits const& self, DecoderInputBuffers& inputBuffers, RequestVector const& contextRequests,
at::Tensor const& logits, std::vector<tr::SizeType32> const& numContextLogitsVec,
tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
{
return self(inputBuffers, contextRequests, tr::TorchView::of(logits), numContextLogitsVec, modelConfig,
manager, medusaBuffers);
},
py::arg("decoder_input_buffers"), py::arg("context_requests"), py::arg("logits"),
py::arg("num_context_logits"), py::arg("model_config"), py::arg("buffer_manager"),
py::arg("medusa_buffers") = std::nullopt)
.def("name", [](HandleContextLogits const&) { return HandleContextLogits::name; });

py::class_<HandleGenerationLogits>(m, HandleGenerationLogits::name)
.def(py::init())
.def(
"__call__",
[](HandleGenerationLogits const& self, DecoderInputBuffers& inputBuffers,
RequestVector const& generationRequests, at::Tensor const& logits, tr::SizeType32 logitsIndex,
tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
OptionalRef<RuntimeBuffers> genRuntimeBuffers = std::nullopt,
OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
{
self(inputBuffers, generationRequests, tr::TorchView::of(logits), logitsIndex, modelConfig, manager,
genRuntimeBuffers, medusaBuffers);
},
py::arg("decoder_input_buffers"), py::arg("generation_requests"), py::arg("logits"),
py::arg("logits_index"), py::arg("model_config"), py::arg("buffer_manager"),
py::arg("gen_runtime_buffers") = std::nullopt, py::arg("medusa_buffers") = std::nullopt)
.def("name", [](HandleGenerationLogits const&) { return HandleGenerationLogits::name; });

py::class_<MakeDecodingBatchInputOutput>(m, MakeDecodingBatchInputOutput::name)
.def(py::init())
.def("__call__", &MakeDecodingBatchInputOutput::operator(), py::arg("decoder_input_buffers"),
py::arg("decoder_state"), py::arg("model_config"), py::arg("max_num_sequences"),
py::arg("fused_runtime_buffers") = std::nullopt)
.def("name", [](MakeDecodingBatchInputOutput const&) { return MakeDecodingBatchInputOutput::name; });

py::class_<LogitsPostProcessor>(m, LogitsPostProcessor::name)
.def(py::init())
.def("__call__", &LogitsPostProcessor::operator(), py::arg("decoder_input_buffers"),
@@ -156,8 +109,9 @@ void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::mod
DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
tensorrt_llm::runtime::CudaStream const& runtimeStream,
tensorrt_llm::runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength,
-SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt)
+SizeType32 beamWidth)
{
+OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt;
auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs] = self(modelConfig,
worldConfig, decodingConfig, contextRequests, bufferManager, logitsType, inputBuffers, decoderState,
runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
@@ -168,13 +122,6 @@ void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::mod
py::arg("model_config"), py::arg("world_config"), py::arg("decoding_config"), py::arg("context_requests"),
py::arg("buffer_manager"), py::arg("logits_type"), py::arg("decoder_input_buffers"),
py::arg("decoder_state"), py::arg("runtime_stream"), py::arg("decoder_stream"),
py::arg("max_sequence_length"), py::arg("beam_width"), py::arg("medusa_buffers") = std::nullopt)
py::arg("max_sequence_length"), py::arg("beam_width"))
.def("name", [](CreateNewDecoderRequests const&) { return CreateNewDecoderRequests::name; });

py::class_<UpdateDecoderBuffers>(m, UpdateDecoderBuffers::name)
.def(py::init())
.def("__call__", &UpdateDecoderBuffers::operator(), py::arg("model_config"), py::arg("decoder_output_buffers"),
py::arg("copy_buffer_manager"), py::arg("decoder_state"), py::arg("return_log_probs"),
py::arg("decoder_finish_event"))
.def("name", [](UpdateDecoderBuffers const&) { return UpdateDecoderBuffers::name; });
}
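
The pybind file mirrors the nanobind change, so both binding backends now expose the same reduced algorithm set: HandleContextLogits, HandleGenerationLogits, MakeDecodingBatchInputOutput, and UpdateDecoderBuffers are gone from both. A hedged parity check, again assuming the import path below (not shown in this diff):

    # Hypothetical parity check for the removed algorithm bindings.
    from tensorrt_llm.bindings.internal import algorithms  # assumed path

    for name in ("HandleContextLogits", "HandleGenerationLogits",
                 "MakeDecodingBatchInputOutput", "UpdateDecoderBuffers"):
        assert not hasattr(algorithms, name), f"{name} should no longer be bound"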