vllm-project · aoshen02 · May 15, 2026 · gemini-code-assist · May 15, 2026
@@ -649,8 +649,29 @@ def reset_encoder_cache(self) -> None:
         # Reset the GPU model runner's encoder cache (physical storage)
         self.model_executor.reset_encoder_cache()
 
-    def _reset_caches(self, reset_running_requests=True) -> None:
-        self.reset_prefix_cache(reset_running_requests=reset_running_requests)
+    def _reset_caches(
+        self,
+        reset_running_requests: bool = True,
+        reset_connector: bool = True,
+    ) -> None:
+        # Cascade a reset through the prefix, mm, and encoder caches,
+        # forwarding both flags to ``reset_prefix_cache``.
+        #
+        # ``reset_connector`` defaults to True so external KV-store
+        # connectors (e.g. MooncakeStoreConnector) drop their state
+        # together with the local caches. Callers of
+        # ``pause_generation(clear_cache=True)`` expect every cache to
+        # be cleared, not just the on-engine ones. NOTE(review): this
+        # relies on ``Scheduler.reset_connector_cache`` logging and
+        # short-circuiting when no KV connector is configured, making
+        # the flag a no-op there -- not visible from here, confirm.
+        #
+        # Internal callers wanting a local-only invalidation can pass
+        # ``reset_connector=False``; outside users should instead call
+        # ``scheduler.reset_prefix_cache(reset_connector=False)``.
+        self.reset_prefix_cache(
+            reset_running_requests=reset_running_requests,
+            reset_connector=reset_connector,
+        )
         self.reset_mm_cache()
         self.reset_encoder_cache()