
Commit 7faae02

A whole bunch of unit tests
1 parent 63fa1ba commit 7faae02

File tree: 7 files changed, +159 / -61 lines


cpp/include/tensorrt_llm/batch_manager/kvCacheConnector.h

Lines changed: 0 additions & 2 deletions
@@ -26,8 +26,6 @@
 using SizeType32 = tensorrt_llm::runtime::SizeType32;
 using RequestIdType = tensorrt_llm::batch_manager::LlmRequest::RequestIdType;
 
-using namespace tensorrt_llm::batch_manager;
-
 namespace tensorrt_llm::batch_manager::kv_connector
 {

cpp/tensorrt_llm/pybind/batch_manager/kvCacheConnector.cpp

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ class PyKvCacheConnectorManager : public KvCacheConnectorManager, py::trampoline
 public:
     using KvCacheConnectorManager::KvCacheConnectorManager;
 
-    SizeType32 getNumNewMatchedTokens(LlmRequest const& request, SizeType32 numComputedTokens) override
+    SizeType32 getNumNewMatchedTokens(tb::LlmRequest const& request, SizeType32 numComputedTokens) override
     {
         PYBIND11_OVERRIDE_PURE_NAME(SizeType32, KvCacheConnectorManager, "get_num_new_matched_tokens",
             getNumNewMatchedTokens, request, numComputedTokens);

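The trampoline change above is what lets get_num_new_matched_tokens be supplied from Python: PYBIND11_OVERRIDE_PURE_NAME forwards the C++ virtual call to a Python method named "get_num_new_matched_tokens". A minimal sketch of a Python-side override; the import path is an assumption for illustration only, and the real binding module may live elsewhere:

# Hypothetical import path, shown only to illustrate what the trampoline enables.
from tensorrt_llm.bindings.internal.batch_manager import KvCacheConnectorManager


class NoopConnectorManager(KvCacheConnectorManager):
    """Toy Python subclass of the bound C++ KvCacheConnectorManager."""

    def get_num_new_matched_tokens(self, request, num_computed_tokens):
        # The C++ side routes its pure-virtual call here via PYBIND11_OVERRIDE_PURE_NAME.
        # Illustrative behavior only: claim no externally matched tokens.
        return 0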
cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp

Lines changed: 3 additions & 3 deletions
@@ -98,7 +98,7 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager
 
     void addSequence(tb::LlmRequest::RequestIdType requestId, SizeType32 inputLength, SizeType32 beamWidth,
         tensorrt_llm::common::OptionalRef<tb::LlmRequest> llmRequest = std::nullopt,
-        tensorrt_llm::common::OptionalRef<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager
+        tensorrt_llm::common::OptionalRef<tb::kv_connector::KvCacheConnectorManager> kvCacheConnectorManager
         = std::nullopt) override
     {
         PYBIND11_OVERLOAD_PURE(void, tbk::BaseKVCacheManager, addSequence, requestId, inputLength, beamWidth,
@@ -238,10 +238,10 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager
         PYBIND11_OVERLOAD_PURE(void, tbk::BaseKVCacheManager, flushIterationEvents);
     }
 
-    kv_connector::KvCacheConnectorPoolsData getKvCacheConnectorPoolsData() const override
+    [[nodiscard]] tb::kv_connector::KvCacheConnectorPoolsData getKvCacheConnectorPoolsData() const override
     {
         PYBIND11_OVERLOAD_PURE(
-            kv_connector::KvCacheConnectorPoolsData, tbk::BaseKVCacheManager, getKvCacheConnectorPoolsData);
+            tb::kv_connector::KvCacheConnectorPoolsData, tbk::BaseKVCacheManager, getKvCacheConnectorPoolsData);
     }
 };

tensorrt_llm/_torch/pyexecutor/connector.py

Lines changed: 1 addition & 3 deletions
@@ -275,7 +275,7 @@ def get_num_new_matched_tokens(self, request: LlmRequest,
 
         # TODO(jthomson04): This part is a bit ugly.
         # When the connector indicates that a request will be loaded asynchronously, we need to suspend it's execution.
-        # This is problematic, since at this point when this function is called, the request has already been scheduled!
+        # This is problematic, since at the point when this function is called, the request has already been scheduled!
         # Because of this, we need to remove it from our list of scheduled requests (see `take_scheduled_requests_pending_load`).
         if load_kv_async:
             self.new_async_requests.loading[request.request_id] = request
@@ -308,8 +308,6 @@ def take_scheduled_requests_pending_load(
         # Update the list of scheduled requests.
         scheduled_requests.context_requests = allowed_context_requests
 
-        return scheduled_requests
-
     def build_connector_meta(self) -> object:
         metadata = self._run_on_leader(
             lambda: self.scheduler.build_connector_meta(self._scheduler_output))

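With the return statement removed, take_scheduled_requests_pending_load now communicates purely by mutating the ScheduledRequests object it receives. A minimal sketch of that in-place filtering pattern, written against stand-in names; only context_requests and new_async_requests.loading appear in the hunks above, everything else here is an assumption rather than the actual implementation:

def take_scheduled_requests_pending_load_sketch(manager, scheduled_requests):
    # Requests still waiting on an async KV load (populated in
    # get_num_new_matched_tokens, per the first hunk above).
    pending_ids = set(manager.new_async_requests.loading)

    # Keep only the context requests that are not pending a load.
    allowed_context_requests = [
        req for req in scheduled_requests.context_requests
        if req.request_id not in pending_ids
    ]

    # Mutate the caller's object in place; nothing is returned.
    scheduled_requests.context_requests = allowed_context_requests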
tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 1 addition & 1 deletion
@@ -924,7 +924,7 @@ def _executor_loop(self):
                 # We have to run this after we've run the KV cache manager (via the resource manager).
                 # This takes requests that are pending an async load, and removes them from the scheduled context batch.
                 if self.kv_connector_manager:
-                    scheduled_batch = self.kv_connector_manager.take_scheduled_requests_pending_load(
+                    self.kv_connector_manager.take_scheduled_requests_pending_load(
                         scheduled_batch)
 
                 if scheduled_batch.batch_size > 0 or (
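The call-site edit above follows directly from the removed return: keeping the old assignment would rebind scheduled_batch to None. A toy illustration of that pitfall, using stand-in classes rather than the real executor types:

class _Batch:
    def __init__(self, context_requests):
        self.context_requests = context_requests


def _take_pending_load(batch):
    # In-place filter that, like the updated connector method, returns None.
    batch.context_requests = [r for r in batch.context_requests if r != "pending"]


good = _Batch(["ready", "pending"])
_take_pending_load(good)        # rely on the in-place mutation
assert good.context_requests == ["ready"]

bad = _Batch(["ready", "pending"])
bad = _take_pending_load(bad)   # the old pattern would now bind None
assert bad is None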
Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
import pickle
import sys
from unittest.mock import MagicMock

import cloudpickle
import mpi4py
import pytest

from tensorrt_llm import mpi_rank
from tensorrt_llm._torch.pyexecutor.connector import KvCacheConnectorManager
from tensorrt_llm._torch.pyexecutor.scheduler import ScheduledRequests

cloudpickle.register_pickle_by_value(sys.modules[__name__])
mpi4py.MPI.pickle.__init__(
    cloudpickle.dumps,
    cloudpickle.loads,
    pickle.HIGHEST_PROTOCOL,
)


def run_across_mpi(executor, fun, num_ranks):
    return list(executor.starmap(fun, [() for i in range(num_ranks)]))


@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
def test_connector_manager_get_finished_allgather(mpi_pool_executor):

    def test():
        worker = MagicMock()

        if mpi_rank() == 0:
            scheduler = MagicMock()
            scheduler.request_finished.return_value = True
        else:
            scheduler = None

        manager = KvCacheConnectorManager(worker, scheduler=scheduler)

        req = MagicMock()
        req.request_id = 42

        manager.request_finished(req)

        # To start, make both workers return nothing.
        worker.get_finished.return_value = ([], [])

        assert manager.get_finished() == []

        assert worker.get_finished.call_count == 1
        assert worker.get_finished.call_args[0] == ([42], [])

        worker.get_finished.reset_mock()

        # Now, only return the request id on one worker.
        if mpi_rank() == 0:
            worker.get_finished.return_value = ([42], [])
        else:
            worker.get_finished.return_value = ([], [])

        # It should still return nothing, since rank 1 is still saving.
        assert manager.get_finished() == []

        assert worker.get_finished.call_count == 1
        assert worker.get_finished.call_args[0] == ([], [])

        # Now, also return it on worker 1.
        if mpi_rank() == 0:
            worker.get_finished.return_value = ([], [])
        else:
            worker.get_finished.return_value = ([42], [])

        assert manager.get_finished() == [req]

    run_across_mpi(mpi_pool_executor, test, 2)


@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
def test_connector_manager_num_matched_tokens(mpi_pool_executor):

    def test():
        worker = MagicMock()

        if mpi_rank() == 0:
            scheduler = MagicMock()
            scheduler.get_num_new_matched_tokens.return_value = (16, True)
        else:
            scheduler = None

        manager = KvCacheConnectorManager(worker, scheduler=scheduler)

        req = MagicMock()
        req.request_id = 42

        assert manager.get_num_new_matched_tokens(req, 32) == 16
        assert req.is_kv_cache_connector_async_onboard

        if mpi_rank() == 0:
            assert scheduler.get_num_new_matched_tokens.call_count == 1
            assert scheduler.get_num_new_matched_tokens.call_args[0] == (req, 32)

    run_across_mpi(mpi_pool_executor, test, 2)


@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
def test_connector_manager_take_scheduled_requests(mpi_pool_executor):

    def test():
        worker = MagicMock()

        if mpi_rank() == 0:
            scheduler = MagicMock()
        else:
            scheduler = None

        manager = KvCacheConnectorManager(worker, scheduler=scheduler)

        scheduled_requests = ScheduledRequests()

        req0 = MagicMock()
        req0.request_id = 0

        req1 = MagicMock()
        req1.request_id = 1

        if mpi_rank() == 0:
            scheduler.get_num_new_matched_tokens.return_value = (16, True)

        assert manager.get_num_new_matched_tokens(req0, 0) == 16
        if mpi_rank() == 0:
            assert scheduler.get_num_new_matched_tokens.call_count == 1
            assert scheduler.get_num_new_matched_tokens.call_args[0] == (req0, 0)

            scheduler.get_num_new_matched_tokens.reset_mock()
            scheduler.get_num_new_matched_tokens.return_value = (32, False)

        assert manager.get_num_new_matched_tokens(req1, 0) == 32
        if mpi_rank() == 0:
            assert scheduler.get_num_new_matched_tokens.call_count == 1
            assert scheduler.get_num_new_matched_tokens.call_args[0] == (req1, 0)

        scheduled_requests.context_requests = [req0, req1]

        manager.take_scheduled_requests_pending_load(scheduled_requests)

        assert scheduled_requests.context_requests == [req1]

    run_across_mpi(mpi_pool_executor, test, 2)

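The new tests ship the same locally defined closure to every rank of an MPI pool, which only works because the cloudpickle/mpi4py setup at the top of the file makes such closures picklable. A stripped-down sketch of that pattern on its own, assuming mpi4py.futures is available and the script is launched under MPI (the test suite obtains its pool from the mpi_pool_executor fixture instead):

import pickle
import sys

import cloudpickle
import mpi4py
from mpi4py.futures import MPIPoolExecutor

# Serialize objects from this module by value so locally defined functions
# and lambdas survive the trip to the worker ranks.
cloudpickle.register_pickle_by_value(sys.modules[__name__])
mpi4py.MPI.pickle.__init__(cloudpickle.dumps, cloudpickle.loads,
                           pickle.HIGHEST_PROTOCOL)


def run_across_mpi(executor, fun, num_ranks):
    # starmap with empty argument tuples runs `fun` once per rank.
    return list(executor.starmap(fun, [() for _ in range(num_ranks)]))


if __name__ == "__main__":
    with MPIPoolExecutor(max_workers=2) as executor:
        print(run_across_mpi(executor, lambda: "hello from a worker rank", 2))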
tests/unittest/bindings/test_connector_bindings.py

Lines changed: 0 additions & 51 deletions
This file was deleted.
