Merged

Changes from all commits · 31 commits
e820162
Implement fix for thinking_blocks and converse API calls
lcfyi Oct 5, 2025
01d672f
Add thinking literal
lcfyi Oct 5, 2025
f4557be
Fix mypy issues
lcfyi Oct 5, 2025
af6b425
Type fix for redacted thinking
lcfyi Oct 5, 2025
c4d2c0a
Add voyage model integration in sagemaker
Sameerlite Oct 9, 2025
aeceb19
Add config file logic
Sameerlite Oct 9, 2025
aed48af
Use already existing voyage transformation
Sameerlite Oct 10, 2025
6cff936
refactor code as per comments
Sameerlite Oct 13, 2025
8d6877b
fix merge error
Sameerlite Oct 13, 2025
04fdc48
refactor code as per comments
Sameerlite Oct 13, 2025
349635f
refactor code as per comments
Sameerlite Oct 13, 2025
a5ae176
UI new build
ishaan-jaff Oct 11, 2025
8517e87
[Fix] router - regression when adding/removing models (#15451)
AlexsanderHamir Oct 12, 2025
02389ff
fix(prometheus): Fix Prometheus metric collection in a multi-workers …
LoadingZhang Oct 12, 2025
3d4b956
Add tiered pricing and cost calculation for xai
Sameerlite Oct 9, 2025
71fc43d
Use generic cost calculator
Sameerlite Oct 10, 2025
d526b41
Resolve conflicts in generated HTML files
Sameerlite Oct 13, 2025
1b3d998
Remove penalty params as supported params for gemini preview model (#…
Sameerlite Oct 13, 2025
b64b540
fix conversion of thinking block
Sameerlite Oct 13, 2025
8f36e80
add application level encryption in SQS (#15512)
deepanshululla Oct 14, 2025
a5ca5ff
[Feat] Bedrock Knowledgebase - return search_response when using /cha…
ishaan-jaff Oct 14, 2025
825e63d
[Feat] Add dynamic rate limits on LiteLLM Gateway (#15518)
ishaan-jaff Oct 14, 2025
6543cb2
Add google rerank endpoint
Sameerlite Oct 14, 2025
5da242e
Add docs
Sameerlite Oct 14, 2025
acb2d2d
fix mypy error
Sameerlite Oct 14, 2025
39e8e40
fix mypy and lint errors
Sameerlite Oct 14, 2025
edd05d9
Add haiku 4.5 integration
Sameerlite Oct 17, 2025
a7a4bee
Add haiku 4.5 integration for other regions as well
Sameerlite Oct 17, 2025
deed425
Handle citation field correctly
Sameerlite Oct 16, 2025
1b173d0
Fix filtering headers for signature calcs
Sameerlite Oct 16, 2025
5a637cc
Add haiku 4.5 integration (#15650)
Sameerlite Oct 17, 2025
213 changes: 213 additions & 0 deletions docs/my-website/docs/completion/knowledgebase.md
@@ -412,6 +412,219 @@ This is sent to: `https://bedrock-agent-runtime.{aws_region}.amazonaws.com/knowl

This process happens automatically whenever you include the `vector_store_ids` parameter in your request.
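
For reference, here's a minimal sketch of such a request through the Python SDK; the model alias and vector store ID are placeholders, and `vector_store_ids` is the same parameter described above:

```python
import litellm

# Minimal sketch: "claude-3-5-sonnet" and "T37J8R4WTM" are placeholders for
# a model and vector store registered on your gateway.
response = litellm.completion(
    model="claude-3-5-sonnet",
    messages=[{"role": "user", "content": "What is litellm?"}],
    vector_store_ids=["T37J8R4WTM"],  # triggers the automatic retrieval step
)
```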

## Accessing Search Results (Citations)

When using vector stores, LiteLLM automatically returns search results in `provider_specific_fields`, so you can show users the citations behind the AI's response.

### Key Concept

Search results are always available at `response.choices[0].message.provider_specific_fields["search_results"]`.

For streaming responses, the results arrive in the **final chunk**, when `finish_reason == "stop"`.

### Non-Streaming Example


**Non-Streaming Response with search results:**

```json
{
  "id": "chatcmpl-abc123",
  "choices": [{
    "index": 0,
    "message": {
      "role": "assistant",
      "content": "LiteLLM is a platform...",
      "provider_specific_fields": {
        "search_results": [{
          "search_query": "What is litellm?",
          "data": [{
            "score": 0.95,
            "content": [{"text": "...", "type": "text"}],
            "filename": "litellm-docs.md",
            "file_id": "doc-123"
          }]
        }]
      }
    },
    "finish_reason": "stop"
  }]
}
```

<Tabs>
<TabItem value="python-sdk" label="Python SDK">

```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:4000",
    api_key="your-litellm-api-key"
)

response = client.chat.completions.create(
    model="claude-3-5-sonnet",
    messages=[{"role": "user", "content": "What is litellm?"}],
    tools=[{"type": "file_search", "vector_store_ids": ["T37J8R4WTM"]}]
)

# Get AI response
print(response.choices[0].message.content)

# Get search results (citations)
search_results = response.choices[0].message.provider_specific_fields.get("search_results", [])

for result_page in search_results:
    for idx, item in enumerate(result_page['data'], 1):
        print(f"[{idx}] {item.get('filename', 'Unknown')} (score: {item['score']:.2f})")
```

</TabItem>

<TabItem value="typescript" label="TypeScript SDK">

```typescript
import OpenAI from 'openai';

const client = new OpenAI({
  baseURL: 'http://localhost:4000',
  apiKey: process.env.LITELLM_API_KEY
});

const response = await client.chat.completions.create({
  model: 'claude-3-5-sonnet',
  messages: [{ role: 'user', content: 'What is litellm?' }],
  tools: [{ type: 'file_search', vector_store_ids: ['T37J8R4WTM'] }]
});

// Get AI response
console.log(response.choices[0].message.content);

// Get search results (citations)
const message = response.choices[0].message as any;
const searchResults = message.provider_specific_fields?.search_results || [];

searchResults.forEach((page: any) => {
  page.data.forEach((item: any, idx: number) => {
    console.log(`[${idx + 1}] ${item.filename || 'Unknown'} (${item.score.toFixed(2)})`);
  });
});
```

</TabItem>
</Tabs>

### Streaming Example

**Streaming Response with search results (final chunk):**

```json
{
  "id": "chatcmpl-abc123",
  "choices": [{
    "index": 0,
    "delta": {
      "provider_specific_fields": {
        "search_results": [{
          "search_query": "What is litellm?",
          "data": [{
            "score": 0.95,
            "content": [{"text": "...", "type": "text"}],
            "filename": "litellm-docs.md",
            "file_id": "doc-123"
          }]
        }]
      }
    },
    "finish_reason": "stop"
  }]
}
```

<Tabs>
<TabItem value="python-sdk" label="Python SDK">

```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:4000",
    api_key="your-litellm-api-key"
)

stream = client.chat.completions.create(
    model="claude-3-5-sonnet",
    messages=[{"role": "user", "content": "What is litellm?"}],
    tools=[{"type": "file_search", "vector_store_ids": ["T37J8R4WTM"]}],
    stream=True
)

for chunk in stream:
    # Stream content
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)

    # Get citations in final chunk
    if chunk.choices[0].finish_reason == "stop":
        # provider_specific_fields can be None on some chunks, so guard it
        fields = getattr(chunk.choices[0].delta, "provider_specific_fields", None) or {}
        search_results = fields.get("search_results", [])
        if search_results:
            print("\n\nSources:")
            for page in search_results:
                for idx, item in enumerate(page['data'], 1):
                    print(f"  [{idx}] {item.get('filename', 'Unknown')} ({item['score']:.2f})")
```

</TabItem>

<TabItem value="typescript" label="TypeScript SDK">

```typescript
import OpenAI from 'openai';

const client = new OpenAI({
  baseURL: 'http://localhost:4000',
  apiKey: process.env.LITELLM_API_KEY
});

const stream = await client.chat.completions.create({
  model: 'claude-3-5-sonnet',
  messages: [{ role: 'user', content: 'What is litellm?' }],
  tools: [{ type: 'file_search', vector_store_ids: ['T37J8R4WTM'] }],
  stream: true
});

for await (const chunk of stream) {
  // Stream content
  if (chunk.choices[0]?.delta?.content) {
    process.stdout.write(chunk.choices[0].delta.content);
  }

  // Get citations in final chunk
  if (chunk.choices[0]?.finish_reason === 'stop') {
    const searchResults = (chunk.choices[0].delta as any).provider_specific_fields?.search_results || [];
    if (searchResults.length > 0) {
      console.log('\n\nSources:');
      searchResults.forEach((page: any) => {
        page.data.forEach((item: any, idx: number) => {
          console.log(`  [${idx + 1}] ${item.filename || 'Unknown'} (${item.score.toFixed(2)})`);
        });
      });
    }
  }
}
```

</TabItem>
</Tabs>

### Search Result Fields

| Field | Type | Description |
|-------|------|-------------|
| `search_query` | string | The query used to search the vector store |
| `data` | array | Array of search results |
| `data[].score` | float | Relevance score (0-1, higher is more relevant) |
| `data[].content` | array | Content chunks with `text` and `type` |
| `data[].filename` | string | Name of the source file (optional) |
| `data[].file_id` | string | Identifier for the source file (optional) |
| `data[].attributes` | object | Provider-specific metadata (optional) |
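
For convenience, here's an illustrative helper (not part of LiteLLM) that flattens the structure above into printable citation lines, relying only on the fields documented in this table:

```python
from typing import Any, Dict, List

def format_citations(search_results: List[Dict[str, Any]]) -> List[str]:
    """Flatten `search_results` into printable citation lines.

    Illustrative only: uses just the documented fields
    (score, filename, file_id, content).
    """
    citations: List[str] = []
    for page in search_results:
        for item in page.get("data", []):
            label = item.get("filename") or item.get("file_id") or "Unknown source"
            # Take the first text chunk as a short preview, if present
            snippet = next(
                (block.get("text", "") for block in item.get("content", [])
                 if block.get("type") == "text"),
                "",
            )
            citations.append(f"{label} (score: {item.get('score', 0.0):.2f}) {snippet[:80]}")
    return citations
```

Pass it the `search_results` list from either the non-streaming or the streaming example above.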

## API Reference

### LiteLLM Completion Knowledge Base Parameters
9 changes: 8 additions & 1 deletion docs/my-website/docs/providers/bedrock_vector_store.md
@@ -138,7 +138,14 @@ print(response.choices[0].message.content)
</Tabs>


Further Reading Vector Stores:
## Accessing Search Results

See how to access vector store search results in your response:
- [Accessing Search Results (Non-Streaming & Streaming)](../completion/knowledgebase#accessing-search-results-citations)

## Further Reading

Vector Stores:
- [Always on Vector Stores](https://docs.litellm.ai/docs/completion/knowledgebase#always-on-for-a-model)
- [Listing available vector stores on litellm proxy](https://docs.litellm.ai/docs/completion/knowledgebase#listing-available-vector-stores)
- [How LiteLLM Vector Stores Work](https://docs.litellm.ai/docs/completion/knowledgebase#how-it-works)
100 changes: 99 additions & 1 deletion docs/my-website/docs/providers/vertex.md
@@ -12,7 +12,7 @@ import TabItem from '@theme/TabItem';
| Provider Route on LiteLLM | `vertex_ai/` |
| Link to Provider Doc | [Vertex AI ↗](https://cloud.google.com/vertex-ai) |
| Base URL | 1. Regional endpoints<br/>`https://{vertex_location}-aiplatform.googleapis.com/`<br/>2. Global endpoints (limited availability)<br/>`https://aiplatform.googleapis.com/`|
| Supported Operations | [`/chat/completions`](#sample-usage), `/completions`, [`/embeddings`](#embedding-models), [`/audio/speech`](#text-to-speech-apis), [`/fine_tuning`](#fine-tuning-apis), [`/batches`](#batch-apis), [`/files`](#batch-apis), [`/images`](#image-generation-models) |
| Supported Operations | [`/chat/completions`](#sample-usage), `/completions`, [`/embeddings`](#embedding-models), [`/audio/speech`](#text-to-speech-apis), [`/fine_tuning`](#fine-tuning-apis), [`/batches`](#batch-apis), [`/files`](#batch-apis), [`/images`](#image-generation-models), [`/rerank`](#rerank-api) |


<br />
@@ -3114,3 +3114,101 @@ Once that's done, when you deploy the new container in the Google Cloud Run serv


s/o @[Darien Kindlund](https://www.linkedin.com/in/kindlund/) for this tutorial

## **Rerank API**

Vertex AI supports reranking through the Discovery Engine API, providing semantic ranking capabilities for document retrieval.

### Setup

Set your Google Cloud project ID:

```bash
export VERTEXAI_PROJECT="your-project-id"
```

### Usage

```python
from litellm import rerank

documents = [
    "Gemini is a cutting edge large language model created by Google.",
    "The Gemini zodiac symbol often depicts two figures standing side-by-side.",
    "Gemini is a constellation that can be seen in the night sky.",
]

# Using the latest model (recommended)
response = rerank(
    model="vertex_ai/semantic-ranker-default@latest",
    query="What is Google Gemini?",
    documents=documents,
    top_n=2,
    return_documents=True,  # Set to False for ID-only responses
)

# Using a specific model version
response_v003 = rerank(
    model="vertex_ai/semantic-ranker-default-003",
    query="What is Google Gemini?",
    documents=documents,
    top_n=2,
)

print(response.results)
```

### Parameters

| Parameter | Type | Description |
|-----------|------|-------------|
| `model` | string | Model name (e.g., `vertex_ai/semantic-ranker-default@latest`) |
| `query` | string | Search query |
| `documents` | list | Documents to rank |
| `top_n` | int | Number of top results to return |
| `return_documents` | bool | Return full content (True) or IDs only (False) |
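
To make the output shape concrete, here's a hedged sketch of reading the ranked results; it assumes the Cohere-style result entries (`index` into the input list plus `relevance_score`) that LiteLLM normalizes rerank responses to:

```python
# Hedged sketch: assumes each entry in response.results carries "index"
# (position in the input `documents` list) and "relevance_score".
for result in response.results:
    original_doc = documents[result["index"]]
    print(f"{result['relevance_score']:.3f}  {original_doc}")
```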

### Supported Models

- `semantic-ranker-default@latest`
- `semantic-ranker-fast@latest`
- `semantic-ranker-default-003`
- `semantic-ranker-default-002`

For detailed model specifications, see the [Google Cloud ranking API documentation](https://cloud.google.com/generative-ai-app-builder/docs/ranking#rank_or_rerank_a_set_of_records_according_to_a_query).

### Proxy Usage

Add to your `config.yaml`:

```yaml
model_list:
  - model_name: semantic-ranker-default@latest
    litellm_params:
      model: vertex_ai/semantic-ranker-default@latest
      vertex_ai_project: "your-project-id"
      vertex_ai_location: "us-central1"
      vertex_ai_credentials: "path/to/service-account.json"
```

Start the proxy:

```bash
litellm --config /path/to/config.yaml
```

Test with curl:

```bash
curl http://0.0.0.0:4000/rerank \
  -H "Authorization: Bearer sk-1234" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "semantic-ranker-default@latest",
    "query": "What is Google Gemini?",
    "documents": [
      "Gemini is a cutting edge large language model created by Google.",
      "The Gemini zodiac symbol often depicts two figures standing side-by-side.",
      "Gemini is a constellation that can be seen in the night sky."
    ],
    "top_n": 2
  }'
```
3 changes: 2 additions & 1 deletion docs/my-website/docs/rerank.md
@@ -121,4 +121,5 @@ curl http://0.0.0.0:4000/rerank \
| HuggingFace| [Usage](../docs/providers/huggingface_rerank) |
| Infinity| [Usage](../docs/providers/infinity) |
| vLLM| [Usage](../docs/providers/vllm#rerank-endpoint) |
| DeepInfra| [Usage](../docs/providers/deepinfra#rerank-endpoint) |
| Vertex AI| [Usage](../docs/providers/vertex#rerank-api) |
11 changes: 8 additions & 3 deletions enterprise/litellm_enterprise/integrations/prometheus.py
@@ -1,6 +1,7 @@
# used for /metrics endpoint on LiteLLM Proxy
#### What this does ####
# On success, log events to Prometheus
import os
import sys
from datetime import datetime, timedelta
from typing import (
@@ -2211,7 +2212,13 @@ def _mount_metrics_endpoint(premium_user: bool):
    )

    # Create metrics ASGI app
    metrics_app = make_asgi_app()
    if 'PROMETHEUS_MULTIPROC_DIR' in os.environ:
        from prometheus_client import CollectorRegistry, multiprocess
        registry = CollectorRegistry()
        multiprocess.MultiProcessCollector(registry)
        metrics_app = make_asgi_app(registry)
    else:
        metrics_app = make_asgi_app()

    # Mount the metrics app to the app
    app.mount("/metrics", metrics_app)
@@ -2354,15 +2361,13 @@ def get_custom_labels_from_tags(tags: List[str]) -> Dict[str, str]:
    }
    """

    from litellm.router_utils.pattern_match_deployments import PatternMatchRouter
    from litellm.types.integrations.prometheus import _sanitize_prometheus_label_name

    configured_tags = litellm.custom_prometheus_tags
    if configured_tags is None or len(configured_tags) == 0:
        return {}

    result: Dict[str, str] = {}
    pattern_router = PatternMatchRouter()

    for configured_tag in configured_tags:
        label_name = _sanitize_prometheus_label_name(f"tag_{configured_tag}")