Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions litellm/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,20 @@
os.getenv("DEFAULT_REASONING_EFFORT_DISABLE_THINKING_BUDGET", 0)
)

# MCP Semantic Tool Filter Defaults
# Embedding model used to vectorize tool descriptions and user queries.
# os.getenv returns a str (or the str default) here, so no str() cast is needed.
DEFAULT_MCP_SEMANTIC_FILTER_EMBEDDING_MODEL = os.getenv(
    "DEFAULT_MCP_SEMANTIC_FILTER_EMBEDDING_MODEL", "text-embedding-3-small"
)
# Maximum number of tools kept after semantic filtering.
DEFAULT_MCP_SEMANTIC_FILTER_TOP_K = int(
    os.getenv("DEFAULT_MCP_SEMANTIC_FILTER_TOP_K", 10)
)
# Minimum similarity score for a tool to count as a match.
DEFAULT_MCP_SEMANTIC_FILTER_SIMILARITY_THRESHOLD = float(
    os.getenv("DEFAULT_MCP_SEMANTIC_FILTER_SIMILARITY_THRESHOLD", 0.3)
)
# NOTE(review): presumably caps the length of a tools-related request header
# value (in characters) — confirm at the usage site.
MAX_MCP_SEMANTIC_FILTER_TOOLS_HEADER_LENGTH = int(
    os.getenv("MAX_MCP_SEMANTIC_FILTER_TOOLS_HEADER_LENGTH", 150)
)

# Gemini model-specific minimal thinking budget constants
DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET_GEMINI_2_5_FLASH = int(
os.getenv("DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET_GEMINI_2_5_FLASH", 1)
Expand Down
248 changes: 248 additions & 0 deletions litellm/proxy/_experimental/mcp_server/semantic_tool_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
"""
Semantic MCP Tool Filtering using semantic-router

Filters MCP tools semantically for /chat/completions and /responses endpoints.
"""
from typing import TYPE_CHECKING, Any, Dict, List, Optional

from litellm._logging import verbose_logger

if TYPE_CHECKING:
from mcp.types import Tool as MCPTool
from semantic_router.routers import SemanticRouter

from litellm.router import Router


class SemanticMCPToolFilter:
    """Filters MCP tools using semantic similarity to reduce context window size.

    A semantic-router ``SemanticRouter`` is built (on startup) with one
    ``Route`` per MCP tool, using the tool description as the route utterance.
    At request time the user query is matched against those routes and only
    the top-K most similar tools are kept.  Every failure path degrades
    gracefully by returning the unfiltered tool list.
    """

    def __init__(
        self,
        embedding_model: str,
        litellm_router_instance: "Router",
        top_k: int = 10,
        similarity_threshold: float = 0.3,
        enabled: bool = True,
    ):
        """
        Initialize the semantic tool filter.

        Args:
            embedding_model: Model to use for embeddings (e.g., "text-embedding-3-small")
            litellm_router_instance: Router instance for embedding generation
            top_k: Maximum number of tools to return
            similarity_threshold: Minimum similarity score for filtering
            enabled: Whether filtering is enabled
        """
        self.enabled = enabled
        self.top_k = top_k
        self.similarity_threshold = similarity_threshold
        self.embedding_model = embedding_model
        self.router_instance = litellm_router_instance
        # Built via build_router_from_mcp_registry(); None means "not ready".
        self.tool_router: Optional["SemanticRouter"] = None
        # Tool name -> original tool object (MCPTool or OpenAI function dict).
        self._tool_map: Dict[str, Any] = {}

    async def build_router_from_mcp_registry(self) -> None:
        """Build semantic router from all MCP tools in the registry (no auth checks).

        Raises:
            Exception: re-raises any router-construction error after logging
                it and resetting ``self.tool_router`` to ``None``.
        """
        from litellm.proxy._experimental.mcp_server.mcp_server_manager import (
            global_mcp_server_manager,
        )

        try:
            # Get all servers from registry without auth checks
            registry = global_mcp_server_manager.get_registry()
            if not registry:
                verbose_logger.warning("MCP registry is empty")
                self.tool_router = None
                return

            # Fetch tools from each server sequentially; a failing server is
            # skipped so one bad server cannot disable filtering entirely.
            all_tools = []
            for server_id in registry:
                try:
                    tools = await global_mcp_server_manager.get_tools_for_server(server_id)
                    all_tools.extend(tools)
                except Exception as e:
                    verbose_logger.warning(f"Failed to fetch tools from server {server_id}: {e}")
                    continue

            if not all_tools:
                verbose_logger.warning("No MCP tools found in registry")
                self.tool_router = None
                return

            verbose_logger.info(f"Fetched {len(all_tools)} tools from {len(registry)} MCP servers")
            self._build_router(all_tools)

        except Exception as e:
            verbose_logger.error(f"Failed to build router from MCP registry: {e}")
            self.tool_router = None
            raise

    def _extract_tool_info(self, tool: Any) -> tuple[str, str]:
        """Extract ``(name, description)`` from an MCP tool or OpenAI function dict.

        Falls back to the tool name when the description is missing or falsy,
        so every route gets at least one non-empty utterance.
        """
        if isinstance(tool, dict):
            # OpenAI function format.  Use `or` (not a .get default) so an
            # explicit `"description": None` also falls back to the name.
            name = tool.get("name", "")
            description = tool.get("description") or name
        else:
            # MCPTool object
            name = tool.name
            description = tool.description or tool.name

        return name, description

    def _build_router(self, tools: List) -> None:
        """Build the semantic router, one ``Route`` per tool.

        Raises:
            Exception: re-raises router/encoder construction errors after
                logging and resetting ``self.tool_router``.
        """
        from semantic_router.routers import SemanticRouter
        from semantic_router.routers.base import Route

        from litellm.router_strategy.auto_router.litellm_encoder import (
            LiteLLMRouterEncoder,
        )

        if not tools:
            self.tool_router = None
            return

        try:
            # Convert tools to routes
            routes = []
            self._tool_map = {}

            for tool in tools:
                name, description = self._extract_tool_info(tool)
                if name in self._tool_map:
                    # Tools from different MCP servers may share a name; the
                    # last one wins, so surface the collision for debugging.
                    verbose_logger.warning(
                        f"Duplicate MCP tool name '{name}' - overwriting previous entry"
                    )
                self._tool_map[name] = tool

                routes.append(
                    Route(
                        name=name,
                        description=description,
                        # The description doubles as the utterance that gets
                        # embedded and matched against the user query.
                        utterances=[description],
                        score_threshold=self.similarity_threshold,
                    )
                )

            self.tool_router = SemanticRouter(
                routes=routes,
                encoder=LiteLLMRouterEncoder(
                    litellm_router_instance=self.router_instance,
                    model_name=self.embedding_model,
                    score_threshold=self.similarity_threshold,
                ),
                # NOTE(review): assumes auto_sync="local" syncs the route
                # embeddings into the local index at construction time —
                # confirm against semantic-router docs.
                auto_sync="local",
            )

            verbose_logger.info(
                f"Built semantic router with {len(routes)} tools"
            )

        except Exception as e:
            verbose_logger.error(f"Failed to build semantic router: {e}")
            self.tool_router = None
            raise

    async def filter_tools(
        self,
        query: str,
        available_tools: List[Any],
        top_k: Optional[int] = None,
    ) -> List[Any]:
        """
        Filter tools semantically based on query.

        Args:
            query: User query to match against tools
            available_tools: Full list of available MCP tools
            top_k: Override default top_k (optional)

        Returns:
            Filtered and ordered list of tools (up to top_k).  The full,
            unfiltered list is returned whenever filtering is disabled,
            impossible (no router / empty query / no tools), or fails.
        """
        # Early returns for cases where we can't/shouldn't filter
        if not self.enabled:
            return available_tools

        if not available_tools:
            return available_tools

        if not query or not query.strip():
            return available_tools

        # Router should be built on startup - if not, something went wrong
        if self.tool_router is None:
            verbose_logger.warning("Router not initialized - was build_router_from_mcp_registry() called on startup?")
            return available_tools

        # Run semantic filtering
        try:
            # Explicit None-check so a caller-supplied top_k=0 is honored
            # instead of silently falling back to the default (`or` would
            # treat 0 as "unset").
            limit = self.top_k if top_k is None else top_k
            matches = self.tool_router(text=query, limit=limit)
            matched_tool_names = self._extract_tool_names_from_matches(matches)

            if not matched_tool_names:
                return available_tools

            return self._get_tools_by_names(matched_tool_names, available_tools)

        except Exception as e:
            verbose_logger.error(f"Semantic tool filter failed: {e}", exc_info=True)
            return available_tools

    def _extract_tool_names_from_matches(self, matches: Any) -> List[str]:
        """Extract tool names from semantic router match results.

        semantic-router may return a single match object or a list of them;
        normalize both shapes to a list of non-empty names.
        """
        if not matches:
            return []

        # Handle single match
        if hasattr(matches, "name") and matches.name:
            return [matches.name]

        # Handle list of matches
        if isinstance(matches, list):
            return [m.name for m in matches if hasattr(m, "name") and m.name]

        return []

    def _get_tools_by_names(
        self, tool_names: List[str], available_tools: List[Any]
    ) -> List[Any]:
        """Select tools from ``available_tools`` by name, ordered as in ``tool_names``.

        Matching against ``available_tools`` (rather than ``self._tool_map``)
        preserves the caller's tool format (dict or MCPTool).
        """
        # Set membership is O(1) vs. O(k) list scans per tool.
        wanted = set(tool_names)
        tool_map: Dict[str, Any] = {}
        for tool in available_tools:
            name, _ = self._extract_tool_info(tool)
            if name in wanted:
                tool_map[name] = tool

        # Reorder to match semantic router's ranking.
        return [tool_map[name] for name in tool_names if name in tool_map]

    def extract_user_query(self, messages: List[Dict[str, Any]]) -> str:
        """
        Extract user query from messages for /chat/completions or /responses.

        Scans messages from newest to oldest and returns the content of the
        most recent ``role == "user"`` message.

        Args:
            messages: List of message dictionaries (from 'messages' or 'input' field)

        Returns:
            Extracted query string ("" when no usable user message is found)
        """
        for msg in reversed(messages):
            if msg.get("role") != "user":
                continue

            content = msg.get("content", "")

            if isinstance(content, str):
                return content

            if isinstance(content, list):
                # Multimodal content: concatenate the text of each block.
                texts = [
                    block.get("text", "") if isinstance(block, dict) else str(block)
                    for block in content
                    if isinstance(block, (dict, str))
                ]
                return " ".join(texts)

        return ""
96 changes: 96 additions & 0 deletions litellm/proxy/hooks/mcp_semantic_filter/ARCHITECTURE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# MCP Semantic Tool Filter Architecture

## Why Filter MCP Tools

When multiple MCP servers are connected, the proxy may expose hundreds of tools. Sending all tools in every request wastes context window tokens and increases cost. The semantic filter keeps only the top-K most relevant tools based on embedding similarity.

```mermaid
sequenceDiagram
participant Client
participant Hook as SemanticToolFilterHook
participant Filter as SemanticMCPToolFilter
participant Router as semantic-router
participant LLM

Client->>Hook: POST /chat/completions
Note over Client,Hook: tools: [100+ MCP tools]
Note over Client,Hook: messages: [{"role": "user", "content": "Get my Jira issues"}]

rect rgb(240, 240, 240)
Note over Hook: 1. Extract User Query
Hook->>Filter: filter_tools("Get my Jira issues", tools)
end

rect rgb(240, 240, 240)
Note over Filter: 2. Convert Tools → Routes
Note over Filter: Tool name + description → Route
end

rect rgb(240, 240, 240)
Note over Filter: 3. Semantic Matching
Filter->>Router: router(query)
Router->>Router: Embeddings + similarity
Router-->>Filter: [top 10 matches]
end

rect rgb(240, 240, 240)
Note over Filter: 4. Return Filtered Tools
Filter-->>Hook: [10 relevant tools]
end

Hook->>LLM: POST /chat/completions
Note over Hook,LLM: tools: [10 Jira-related tools] ← FILTERED
Note over Hook,LLM: messages: [...] ← UNCHANGED

LLM-->>Client: Response (unchanged)
```

## Filter Operations

The hook intercepts requests before they reach the LLM:

| Operation | Description |
|-----------|-------------|
| **Extract query** | Get the most recent `role: "user"` message from `messages` |
| **Convert to Routes** | Transform MCP tools into semantic-router Routes |
| **Semantic match** | Use `semantic-router` to find top-K similar tools |
| **Filter tools** | Replace request `tools` with filtered subset |

## Trigger Conditions

The filter only runs when:
- Call type is `completion` or `acompletion`
- Request contains `tools` field
- Request contains `messages` field
- Filter is enabled in config

## What Does NOT Change

- Request messages
- Response body
- Non-tool parameters

## Integration with semantic-router

Reuses existing LiteLLM infrastructure:
- `semantic-router` - Already an optional dependency
- `LiteLLMRouterEncoder` - Wraps `Router.aembedding()` for embeddings
- `SemanticRouter` - Handles similarity calculation and top-K selection

## Configuration

```yaml
litellm_settings:
mcp_semantic_tool_filter:
enabled: true
embedding_model: "openai/text-embedding-3-small"
top_k: 10
similarity_threshold: 0.3
```

## Error Handling

The filter fails gracefully:
- If filtering fails → Return all tools (no impact on functionality)
- If query extraction fails → Skip filtering
- If no matches found → Return all tools
9 changes: 9 additions & 0 deletions litellm/proxy/hooks/mcp_semantic_filter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
MCP Semantic Tool Filter Hook

Semantic filtering for MCP tools to reduce context window size
and improve tool selection accuracy.
"""
from litellm.proxy.hooks.mcp_semantic_filter.hook import SemanticToolFilterHook

__all__ = ["SemanticToolFilterHook"]
Loading
Loading