Merged
1 change: 0 additions & 1 deletion cpp/include/tensorrt_llm/runtime/gptDecoder.h
@@ -20,7 +20,6 @@
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/decodingInput.h"
#include "tensorrt_llm/runtime/decodingOutput.h"
#include "tensorrt_llm/runtime/request.h"
#include "tensorrt_llm/runtime/samplingConfig.h"

#include <NvInferRuntime.h>
59 changes: 3 additions & 56 deletions cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp
@@ -20,18 +20,13 @@
#include "tensorrt_llm/batch_manager/assignReqSeqSlots.h"
#include "tensorrt_llm/batch_manager/capacityScheduler.h"
#include "tensorrt_llm/batch_manager/createNewDecoderRequests.h"
#include "tensorrt_llm/batch_manager/handleContextLogits.h"
#include "tensorrt_llm/batch_manager/handleGenerationLogits.h"
#include "tensorrt_llm/batch_manager/kvCacheManager.h"
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/batch_manager/logitsPostProcessor.h"
#include "tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h"
#include "tensorrt_llm/batch_manager/medusaBuffers.h"
#include "tensorrt_llm/batch_manager/microBatchScheduler.h"
#include "tensorrt_llm/batch_manager/pauseRequests.h"
#include "tensorrt_llm/batch_manager/peftCacheManager.h"
#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
#include "tensorrt_llm/batch_manager/updateDecoderBuffers.h"
#include "tensorrt_llm/nanobind/common/customCasters.h"
#include "tensorrt_llm/runtime/decoderState.h"
#include "tensorrt_llm/runtime/torch.h"
@@ -94,48 +89,6 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
nb::arg("generation_requests"), nb::arg("model_config"), nb::arg("cross_kv_cache_manager") = std::nullopt)
.def("name", [](AllocateKvCache const&) { return AllocateKvCache::name; });

nb::class_<HandleContextLogits>(m, HandleContextLogits::name)
.def(nb::init<>())
.def(
"__call__",
[](HandleContextLogits const& self, DecoderInputBuffers& inputBuffers, RequestVector const& contextRequests,
at::Tensor const& logits, std::vector<tr::SizeType32> const& numContextLogitsVec,
tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
{
return self(inputBuffers, contextRequests, tr::TorchView::of(logits), numContextLogitsVec, modelConfig,
manager, medusaBuffers);
},
nb::arg("decoder_input_buffers"), nb::arg("context_requests"), nb::arg("logits"),
nb::arg("num_context_logits"), nb::arg("model_config"), nb::arg("buffer_manager"),
nb::arg("medusa_buffers") = std::nullopt)
.def("name", [](HandleContextLogits const&) { return HandleContextLogits::name; });

nb::class_<HandleGenerationLogits>(m, HandleGenerationLogits::name)
.def(nb::init<>())
.def(
"__call__",
[](HandleGenerationLogits const& self, DecoderInputBuffers& inputBuffers,
RequestVector const& generationRequests, at::Tensor const& logits, tr::SizeType32 logitsIndex,
tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
OptionalRef<RuntimeBuffers> genRuntimeBuffers = std::nullopt,
OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
{
self(inputBuffers, generationRequests, tr::TorchView::of(logits), logitsIndex, modelConfig, manager,
genRuntimeBuffers, medusaBuffers);
},
nb::arg("decoder_input_buffers"), nb::arg("generation_requests"), nb::arg("logits"),
nb::arg("logits_index"), nb::arg("model_config"), nb::arg("buffer_manager"),
nb::arg("gen_runtime_buffers") = std::nullopt, nb::arg("medusa_buffers") = std::nullopt)
.def("name", [](HandleGenerationLogits const&) { return HandleGenerationLogits::name; });

nb::class_<MakeDecodingBatchInputOutput>(m, MakeDecodingBatchInputOutput::name)
.def(nb::init<>())
.def("__call__", &MakeDecodingBatchInputOutput::operator(), nb::arg("decoder_input_buffers"),
nb::arg("decoder_state"), nb::arg("model_config"), nb::arg("max_num_sequences"),
nb::arg("fused_runtime_buffers") = std::nullopt)
.def("name", [](MakeDecodingBatchInputOutput const&) { return MakeDecodingBatchInputOutput::name; });

nb::class_<LogitsPostProcessor>(m, LogitsPostProcessor::name)
.def(nb::init<>())
.def("__call__", &LogitsPostProcessor::operator(), nb::arg("decoder_input_buffers"),
@@ -154,8 +107,9 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
tensorrt_llm::runtime::CudaStream const& runtimeStream,
tensorrt_llm::runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength,
-SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt)
+SizeType32 beamWidth)
{
+OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt;
auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs] = self(modelConfig,
worldConfig, decodingConfig, contextRequests, bufferManager, logitsType, inputBuffers, decoderState,
runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
@@ -166,13 +120,6 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
nb::arg("model_config"), nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("context_requests"),
nb::arg("buffer_manager"), nb::arg("logits_type"), nb::arg("decoder_input_buffers"),
nb::arg("decoder_state"), nb::arg("runtime_stream"), nb::arg("decoder_stream"),
nb::arg("max_sequence_length"), nb::arg("beam_width"), nb::arg("medusa_buffers") = std::nullopt)
nb::arg("max_sequence_length"), nb::arg("beam_width"))
.def("name", [](CreateNewDecoderRequests const&) { return CreateNewDecoderRequests::name; });

nb::class_<UpdateDecoderBuffers>(m, UpdateDecoderBuffers::name)
.def(nb::init<>())
.def("__call__", &UpdateDecoderBuffers::operator(), nb::arg("model_config"), nb::arg("decoder_output_buffers"),
nb::arg("copy_buffer_manager"), nb::arg("decoder_state"), nb::arg("return_log_probs"),
nb::arg("decoder_finish_event"))
.def("name", [](UpdateDecoderBuffers const&) { return UpdateDecoderBuffers::name; });
}
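
For context, the Python-visible effect of this hunk is that CreateNewDecoderRequests.__call__ no longer accepts a medusa_buffers keyword; the buffers are now defaulted inside the lambda. Below is a minimal sketch of the updated call. Only the keyword names and the removed argument come from the nb::arg list above; the import path, the pre-built config/buffer/stream objects, and the four-way return unpacking are assumptions, not part of this diff.

    # Hypothetical usage sketch of the updated binding surface.
    from tensorrt_llm.bindings.internal import algorithms  # assumed path

    def make_decoder_requests(ctx):
        creator = algorithms.CreateNewDecoderRequests()
        # The four-way unpacking mirrors the C++ structured binding above.
        batch_slots, sampling_configs, lookahead_prompt, lookahead_algo_configs = creator(
            model_config=ctx.model_config,
            world_config=ctx.world_config,
            decoding_config=ctx.decoding_config,
            context_requests=ctx.context_requests,
            buffer_manager=ctx.buffer_manager,
            logits_type=ctx.logits_type,
            decoder_input_buffers=ctx.decoder_input_buffers,
            decoder_state=ctx.decoder_state,
            runtime_stream=ctx.runtime_stream,
            decoder_stream=ctx.decoder_stream,
            max_sequence_length=ctx.max_sequence_length,
            beam_width=ctx.beam_width,
        )
        # Passing medusa_buffers=... here would now raise a TypeError.
        return batch_slots, sampling_configs, lookahead_prompt, lookahead_algo_configs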
9 changes: 0 additions & 9 deletions cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
@@ -20,11 +20,9 @@

#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/batch_manager/decoderBuffers.h"
#include "tensorrt_llm/batch_manager/medusaBuffers.h"
#include "tensorrt_llm/batch_manager/microBatchScheduler.h"
#include "tensorrt_llm/batch_manager/peftCacheManager.h"
#include "tensorrt_llm/batch_manager/rnnStateManager.h"
#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
#include "tensorrt_llm/batch_manager/sequenceSlotManager.h"
#include "tensorrt_llm/nanobind/common/bindTypes.h"
#include "tensorrt_llm/runtime/gptDecoderBatched.h"
@@ -419,13 +417,6 @@ void initBindings(nb::module_& m)
.def_rw("log_probs_host", &tb::SlotDecoderBuffers::logProbsHost)
.def_rw("finish_reasons_host", &tb::SlotDecoderBuffers::finishReasonsHost);

nb::class_<tb::MedusaBuffers>(m, "MedusaBuffers")
.def(nb::init<runtime::SizeType32, runtime::SizeType32, runtime::BufferManager const&,
runtime::ModelConfig const&, runtime::WorldConfig const&, executor::DecodingConfig const&,
runtime::TllmRuntime const&>(),
nb::arg("max_beam_width"), nb::arg("max_seq_len"), nb::arg("buffer_manager"), nb::arg("model_config"),
nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("runtime"));

m.def(
"add_new_tokens_to_requests",
[](std::vector<std::shared_ptr<tb::LlmRequest>>& requests,
1 change: 0 additions & 1 deletion cpp/tensorrt_llm/nanobind/bindings.cpp
@@ -50,7 +50,6 @@

namespace nb = nanobind;
namespace tb = tensorrt_llm::batch_manager;
namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager;
namespace tpb = tensorrt_llm::nanobind::batch_manager;
namespace tc = tensorrt_llm::common;
namespace tr = tensorrt_llm::runtime;
7 changes: 2 additions & 5 deletions cpp/tensorrt_llm/nanobind/common/customCasters.h
@@ -21,14 +21,11 @@
#include "tensorrt_llm/batch_manager/decoderBuffers.h"
#include "tensorrt_llm/common/optionalRef.h"
#include "tensorrt_llm/runtime/cudaStream.h"
#include "tensorrt_llm/runtime/request.h"
#include "tensorrt_llm/runtime/samplingConfig.h"
#include "tensorrt_llm/runtime/torch.h"
#include "tensorrt_llm/runtime/torchView.h"

#include <ATen/DLConvertor.h>
#include <deque>
#include <filesystem>
#include <nanobind/nanobind.h>
#include <nanobind/stl/filesystem.h>
#include <nanobind/stl/optional.h>
@@ -38,7 +35,8 @@
#include <torch/csrc/autograd/variable.h>
#include <torch/extension.h>
#include <torch/torch.h>
-#include <vector>
+
+#include <deque>

// Pybind requires to have a central include in order for type casters to work.
// Opaque bindings add a type caster, so they have the same requirement.
@@ -47,7 +45,6 @@
// Opaque bindings
NB_MAKE_OPAQUE(tensorrt_llm::batch_manager::ReqIdsSet)
NB_MAKE_OPAQUE(std::vector<tensorrt_llm::batch_manager::SlotDecoderBuffers>)
NB_MAKE_OPAQUE(std::vector<tensorrt_llm::runtime::decoder_batch::Request>)
NB_MAKE_OPAQUE(std::vector<tensorrt_llm::runtime::SamplingConfig>)

namespace nb = nanobind;
20 changes: 0 additions & 20 deletions cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
@@ -36,7 +36,6 @@
#include "tensorrt_llm/runtime/lookaheadBuffers.h"
#include "tensorrt_llm/runtime/loraCache.h"
#include "tensorrt_llm/runtime/mcastGPUBuffer.h"
#include "tensorrt_llm/runtime/request.h"
#include "tensorrt_llm/runtime/speculativeDecodingMode.h"
#include "tensorrt_llm/runtime/tllmRuntime.h"
#include "tensorrt_llm/runtime/torchView.h"
@@ -158,25 +157,6 @@ void initBindings(nb::module_& m)
.def_prop_ro("logits_dtype_from_engine",
[](tr::TllmRuntime& self) { return self.getEngine().getTensorDataType("logits"); });

nb::class_<tr::decoder_batch::Request>(m, "Request")
.def(nb::init<tr::decoder_batch::Request::TensorConstPtr, tr::SizeType32, std::optional<tr::SizeType32>,
std::optional<tr::SizeType32>>(),
nb::arg("ids"), nb::arg("input_len"), nb::arg("max_new_tokens") = std::nullopt,
nb::arg("end_id") = std::nullopt)
.def_rw("ids", &tr::decoder_batch::Request::ids)
.def_rw("input_len", &tr::decoder_batch::Request::inputLen)
.def_rw("max_new_tokens", &tr::decoder_batch::Request::maxNewTokens)
.def_rw("end_id", &tr::decoder_batch::Request::endId)
.def_rw("draft_logits", &tr::decoder_batch::Request::draftLogits)
.def_rw("embedding_bias", &tr::decoder_batch::Request::embeddingBias)
.def_rw("bad_words_list", &tr::decoder_batch::Request::badWordsList)
.def_rw("stop_words_list", &tr::decoder_batch::Request::stopWordsList)
.def_rw("generated_tokens_per_engine_step", &tr::decoder_batch::Request::generatedTokensPerEngineStep)
.def_rw("medusa_paths", &tr::decoder_batch::Request::medusaPaths)
.def_rw("medusa_tree_ids", &tr::decoder_batch::Request::medusaTreeIds)
.def_rw("lookahead_runtime_config", &tr::decoder_batch::Request::lookaheadRuntimeConfig);
nb::bind_vector<std::vector<tr::decoder_batch::Request>>(m, "RequestVector");

nb::class_<tr::decoder_batch::Input>(m, "DecoderBatchInput")
.def(nb::init<std::vector<std::vector<tr::ITensor::SharedConstPtr>>, tr::SizeType32>(), nb::arg("logits"),
nb::arg("max_decoding_engine_tokens"))
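
With the decoder_batch::Request class binding and its RequestVector opaque vector dropped here, any Python code still constructing runtime Request objects has to migrate, while DecoderBatchInput keeps the (logits, max_decoding_engine_tokens) constructor shown above. A hedged migration check, assuming the module path below (an assumption, not shown in this diff):

    # Hypothetical smoke test: these attributes should be gone after this change.
    from tensorrt_llm.bindings.internal import runtime  # assumed path

    assert not hasattr(runtime, "Request")
    assert not hasattr(runtime, "RequestVector")
    # runtime.DecoderBatchInput remains bound and is unaffected.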
1 change: 0 additions & 1 deletion cpp/tensorrt_llm/pybind/CMakeLists.txt
@@ -6,7 +6,6 @@ set(TRTLLM_PYBIND_MODULE
set(SRCS
batch_manager/algorithms.cpp
batch_manager/bindings.cpp
batch_manager/buffers.cpp
batch_manager/cacheTransceiver.cpp
batch_manager/kvCacheManager.cpp
batch_manager/llmRequest.cpp
61 changes: 4 additions & 57 deletions cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
@@ -1,5 +1,5 @@
/*
-* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+* SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,18 +20,13 @@
#include "tensorrt_llm/batch_manager/assignReqSeqSlots.h"
#include "tensorrt_llm/batch_manager/capacityScheduler.h"
#include "tensorrt_llm/batch_manager/createNewDecoderRequests.h"
#include "tensorrt_llm/batch_manager/handleContextLogits.h"
#include "tensorrt_llm/batch_manager/handleGenerationLogits.h"
#include "tensorrt_llm/batch_manager/kvCacheManager.h"
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/batch_manager/logitsPostProcessor.h"
#include "tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h"
#include "tensorrt_llm/batch_manager/medusaBuffers.h"
#include "tensorrt_llm/batch_manager/microBatchScheduler.h"
#include "tensorrt_llm/batch_manager/pauseRequests.h"
#include "tensorrt_llm/batch_manager/peftCacheManager.h"
#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
#include "tensorrt_llm/batch_manager/updateDecoderBuffers.h"
#include "tensorrt_llm/runtime/decoderState.h"
#include "tensorrt_llm/runtime/torch.h"
#include "tensorrt_llm/runtime/torchView.h"
@@ -96,48 +91,6 @@ void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::mod
py::arg("generation_requests"), py::arg("model_config"), py::arg("cross_kv_cache_manager") = std::nullopt)
.def("name", [](AllocateKvCache const&) { return AllocateKvCache::name; });

py::class_<HandleContextLogits>(m, HandleContextLogits::name)
.def(py::init())
.def(
"__call__",
[](HandleContextLogits const& self, DecoderInputBuffers& inputBuffers, RequestVector const& contextRequests,
at::Tensor const& logits, std::vector<tr::SizeType32> const& numContextLogitsVec,
tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
{
return self(inputBuffers, contextRequests, tr::TorchView::of(logits), numContextLogitsVec, modelConfig,
manager, medusaBuffers);
},
py::arg("decoder_input_buffers"), py::arg("context_requests"), py::arg("logits"),
py::arg("num_context_logits"), py::arg("model_config"), py::arg("buffer_manager"),
py::arg("medusa_buffers") = std::nullopt)
.def("name", [](HandleContextLogits const&) { return HandleContextLogits::name; });

py::class_<HandleGenerationLogits>(m, HandleGenerationLogits::name)
.def(py::init())
.def(
"__call__",
[](HandleGenerationLogits const& self, DecoderInputBuffers& inputBuffers,
RequestVector const& generationRequests, at::Tensor const& logits, tr::SizeType32 logitsIndex,
tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
OptionalRef<RuntimeBuffers> genRuntimeBuffers = std::nullopt,
OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
{
self(inputBuffers, generationRequests, tr::TorchView::of(logits), logitsIndex, modelConfig, manager,
genRuntimeBuffers, medusaBuffers);
},
py::arg("decoder_input_buffers"), py::arg("generation_requests"), py::arg("logits"),
py::arg("logits_index"), py::arg("model_config"), py::arg("buffer_manager"),
py::arg("gen_runtime_buffers") = std::nullopt, py::arg("medusa_buffers") = std::nullopt)
.def("name", [](HandleGenerationLogits const&) { return HandleGenerationLogits::name; });

py::class_<MakeDecodingBatchInputOutput>(m, MakeDecodingBatchInputOutput::name)
.def(py::init())
.def("__call__", &MakeDecodingBatchInputOutput::operator(), py::arg("decoder_input_buffers"),
py::arg("decoder_state"), py::arg("model_config"), py::arg("max_num_sequences"),
py::arg("fused_runtime_buffers") = std::nullopt)
.def("name", [](MakeDecodingBatchInputOutput const&) { return MakeDecodingBatchInputOutput::name; });

py::class_<LogitsPostProcessor>(m, LogitsPostProcessor::name)
.def(py::init())
.def("__call__", &LogitsPostProcessor::operator(), py::arg("decoder_input_buffers"),
@@ -156,8 +109,9 @@ void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::mod
DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
tensorrt_llm::runtime::CudaStream const& runtimeStream,
tensorrt_llm::runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength,
-SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt)
+SizeType32 beamWidth)
{
+OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt;
auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs] = self(modelConfig,
worldConfig, decodingConfig, contextRequests, bufferManager, logitsType, inputBuffers, decoderState,
runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
@@ -168,13 +122,6 @@ void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::mod
py::arg("model_config"), py::arg("world_config"), py::arg("decoding_config"), py::arg("context_requests"),
py::arg("buffer_manager"), py::arg("logits_type"), py::arg("decoder_input_buffers"),
py::arg("decoder_state"), py::arg("runtime_stream"), py::arg("decoder_stream"),
py::arg("max_sequence_length"), py::arg("beam_width"), py::arg("medusa_buffers") = std::nullopt)
py::arg("max_sequence_length"), py::arg("beam_width"))
.def("name", [](CreateNewDecoderRequests const&) { return CreateNewDecoderRequests::name; });

py::class_<UpdateDecoderBuffers>(m, UpdateDecoderBuffers::name)
.def(py::init())
.def("__call__", &UpdateDecoderBuffers::operator(), py::arg("model_config"), py::arg("decoder_output_buffers"),
py::arg("copy_buffer_manager"), py::arg("decoder_state"), py::arg("return_log_probs"),
py::arg("decoder_finish_event"))
.def("name", [](UpdateDecoderBuffers const&) { return UpdateDecoderBuffers::name; });
}
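
The pybind file mirrors the nanobind change, so both binding backends now expose the same reduced algorithm set: HandleContextLogits, HandleGenerationLogits, MakeDecodingBatchInputOutput, and UpdateDecoderBuffers are gone from both. A hedged parity check, again assuming the import path below (not shown in this diff):

    # Hypothetical parity check for the removed algorithm bindings.
    from tensorrt_llm.bindings.internal import algorithms  # assumed path

    for name in ("HandleContextLogits", "HandleGenerationLogits",
                 "MakeDecodingBatchInputOutput", "UpdateDecoderBuffers"):
        assert not hasattr(algorithms, name), f"{name} should no longer be bound"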