Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create a MongoDB retrieval Agent #711

Closed
wants to merge 36 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
80ef069
Add MongoDB retrieval agent
cozypet Nov 18, 2023
ac5ed7e
Merge branch 'microsoft:main' into main
cozypet Nov 18, 2023
1a9d8cf
Reformat correctly as requested
cozypet Nov 18, 2023
565c75a
Reformate
cozypet Nov 18, 2023
afdb163
remove project stage to show whole document
cozypet Nov 18, 2023
42405ba
formatting
cozypet Nov 18, 2023
32525b9
format again!!
cozypet Nov 18, 2023
37160ba
format 4th time
cozypet Nov 18, 2023
6f9d126
Add test file, update class definition
cozypet Nov 19, 2023
c08a5d9
Merge branch 'main' into main
cozypet Nov 19, 2023
8dfaa2a
test and class updated
cozypet Nov 19, 2023
48a8a93
get new version
cozypet Nov 19, 2023
f33cb7d
test file formatting
cozypet Nov 19, 2023
01ddb1e
format again
cozypet Nov 19, 2023
f6d7b43
format omg
cozypet Nov 19, 2023
3b908b2
format
cozypet Nov 19, 2023
8f6e2cb
format
cozypet Nov 19, 2023
0052bc4
format
cozypet Nov 19, 2023
f3879f4
format
cozypet Nov 19, 2023
ba2d85b
format
cozypet Nov 19, 2023
88ebf62
format
cozypet Nov 19, 2023
c5d6209
test update
cozypet Nov 19, 2023
8f77ae5
remove wrong file
cozypet Nov 19, 2023
fef0193
test update
cozypet Nov 19, 2023
8faf825
test update
cozypet Nov 19, 2023
5126168
test update
cozypet Nov 19, 2023
a06dea6
add notebook
cozypet Nov 19, 2023
402053e
clean pwd
cozypet Nov 19, 2023
1cf0ee6
Merge branch 'main' into main
cozypet Nov 21, 2023
25b88fb
note book
cozypet Nov 21, 2023
f14a095
add test
cozypet Nov 21, 2023
0e0ea77
Delete notebook/agentchat_mongoDB_retrieve.ipynb
cozypet Nov 21, 2023
272734e
cleaned version for class definition
cozypet Nov 21, 2023
09ce446
get main
cozypet Nov 21, 2023
3ef62c0
Add pymongo dependency
kbeaugrand Nov 24, 2023
85cdc5a
Merge branch 'main' into main
cozypet Nov 24, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions autogen/agentchat/contrib/mongoDB_retrieve_user_proxy_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
from typing import Union, Optional, Callable, Dict, List
from pymongo import MongoClient
import pymongo
import openai

from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent


class MongoDBConfig:
def __init__(self, mongo_url, database, vector_collection, vector_index, embedding_field):
self.mongo_url = mongo_url
self.database = database
self.vector_collection = vector_collection
self.vector_index = vector_index
self.embedding_field = embedding_field


class MongoDBRetrieveUserProxyAgent(RetrieveUserProxyAgent):
def __init__(
self,
mongo_config,
name="RetrieveChatAgent", # default set to RetrieveChatAgent
human_input_mode: Optional[str] = "ALWAYS",
is_termination_msg: Optional[Callable[[Dict], bool]] = None,
retrieve_config: Optional[Dict] = None, # config for the retrieve agent
**kwargs,
):
super().__init__()
self.mongo_config = mongo_config
self._doc_ids = set()

def query_vector_db(
self,
query_texts: List[str],
n_results: int = 10,
search_string: str = "",
model="text-embedding-ada-002",
**kwargs,
) -> Dict[str, Union[List[str], List[List[str]]]]:
concatenated_text = " ".join(query_texts)
embedding = self.get_embedding(concatenated_text, model=model)
documents = self.find_similar_documents(embedding, n_results)
ids = [str(idx) for idx in range(1, len(documents) + 1)]
document_contents = [document.get("text_chunks", "") for document in documents]
return {
"ids": [ids],
"documents": [document_contents],
}

def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str = "", **kwargs):
results = self.query_vector_db(
query_texts=[problem],
n_results=n_results,
search_string=search_string,
**kwargs,
)
self._results = results
if isinstance(results["ids"], list) and len(results["ids"]) > 0 and isinstance(results["ids"][0], str):
doc_id = results["ids"][0]
if doc_id in self._doc_ids:
for idx, document in enumerate(results["documents"]):
if isinstance(document, dict):
print(f"Document {idx + 1}: {document.get('text_chunks', '')}")
else:
print(f"Document {idx + 1}: {document}")
else:
print("No documents found.")

def get_embedding(self, text, model="text-embedding-ada-002"):
text = text.replace("\n", " ")
return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]

def connect_mongodb(self):
client = pymongo.MongoClient(self.mongo_config.mongo_url)
db = client[self.mongo_config.database]
collection = db[self.mongo_config.vector_collection]
return collection

def find_similar_documents(self, embedding, limit):
collection = self.connect_mongodb()
documents = list(
collection.aggregate(
[
{
"$vectorSearch": {
"index": self.mongo_config.vector_index,
"path": self.mongo_config.embedding_field,
"queryVector": embedding,
"numCandidates": 50,
"limit": limit,
}
}
]
)
)
return documents
288 changes: 288 additions & 0 deletions notebook/agentchat_mongoDB_retrieve_agent.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"toc\"></a>\n",
"# Using RetrieveChat with MongoDB for Retrieve Augmented Code Generation and Question Answering\n",
"\n",
"MongoDB has been ranked as the best vector database(https://www.mongodb.com/blog/post/atlas-vector-search-commands-highest-developer-nps-retool-state-ai-2023-survey) in the Retool AI report, so it is quite important to add MongoDB vector search as an option for Autogen RAG.\n",
"\n",
"You can easily start the MongoDB vector search on a free tier M0 MongoDB Atlas cluster. Free tier cluster provides the full functionality of the MongoDB vector search. https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-overview/\n",
"\n",
"### But why is MongoDB such a standout? Well, there are a few key reasons.\n",
"\n",
" 1. MongoDB Atlas integrates smoothly with existing databases. For organizations already using MongoDB, this means a seamless expansion into the vector storage—no major system overhauls required!\n",
" 2. MongoDB Atlas is built to handle operational heavy-lifting. It excels when serving large-scale, mission-critical applications, offering robustness and reliability where it counts.\n",
" 3. MongoDB's flexibility in handling a variety of data types and structures makes it perfectly suited to the complexity of vector embeddings.\n",
"\n",
"As such, implementing MongoDB as a Retrieval Agent can unlock new potential in your AI applications, bringing the full power of vector storage to bear."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Requirements\n",
"Install pymongo for MongoDB connection.\n",
"```\n",
"pip install pymongo\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# %pip install pymongo\n",
"# %pip install \"pyautogen[retrievechat]~=0.2.0b5\" \n",
"# %pip install \"flaml[autogen]\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set your API Endpoint\n",
"The config_list_from_json function loads a list of configurations from an environment variable or a json file."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"models to use: ['gpt-3.5']\n"
]
}
],
"source": [
"import autogen\n",
"\n",
"config_list = autogen.config_list_from_json(\n",
" env_or_file=\"OAI_CONFIG_LIST\",\n",
" file_location=\".\",\n",
" filter_dict={\n",
" \"model\": {\n",
" \"gpt-3.5\",\n",
" # \"gpt4\",\n",
" # \"gpt-4\",\n",
" # \"gpt-4-32k-0314\",\n",
" # \"gpt-3.5-turbo\",\n",
" }\n",
" },\n",
")\n",
"\n",
"assert len(config_list) > 0\n",
"print(\"models to use: \", [config_list[i][\"model\"] for i in range(len(config_list))])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Insert necessary libraries.\n",
"\n",
"Import MongoDB retrieval agent and MongoDB config for vector search."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Import necessary libraries\n",
"import os\n",
"import sys\n",
"import pytest\n",
"\n",
"from autogen import config_list_from_json\n",
"from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent\n",
"from autogen.agentchat.contrib.mongoDB_retrieve_user_proxy_agent import MongoDBRetrieveUserProxyAgent, MongoDBConfig"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Loading MongoDB parameters. \n",
"\n",
"In order to setup your mongoDB vector store, you can refer to MongoDB official documentation. https://www.mongodb.com/docs/atlas/atlas-search/field-types/knn-vector/\n",
"\n",
"In this example, we have a prepared vector store which stores mifid regulation information from wikipedia. For this specific vector database and collection, you can refer to this article to understand how data was prepared into vector store. https://artificialcorner.com/mongodb-and-langchain-magic-your-beginners-guide-to-setting-up-a-generative-ai-app-with-your-own-d1f90027d116"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Define MongoDB configuration (replace with your own configuration)\n",
"mongo_config = MongoDBConfig(\n",
" mongo_url=\"<mongodb+srv://your_login:your_password@your_cluster?retryWrites=true&w=majority>\",\n",
" database=\"<your_database>\",\n",
" vector_collection=\"<your_vector_collection>\",\n",
" vector_index=\"<your_vector_index>\",\n",
" embedding_field=\"<your_embedding_field-to-search>\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Assistant Agent and MongoDB retrieval Agent\n",
"\n",
"Once the agents are created, ask questions based on the Mifid information."
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32mAdding doc_id 1 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 2 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 3 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 4 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 5 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 6 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 7 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 8 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 9 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 10 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 11 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 12 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 13 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 14 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 15 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 16 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 17 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 18 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 19 to context.\u001b[0m\n",
"\u001b[32mAdding doc_id 20 to context.\u001b[0m\n",
"\u001b[33mRetrieveChatAgent\u001b[0m (to assistant):\n",
"\n",
"You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the\n",
"context provided by the user. You should follow the following steps to answer a question:\n",
"Step 1, you estimate the user's intent based on the question and context. The intent can be a code generation task or\n",
"a question answering task.\n",
"Step 2, you reply based on the intent.\n",
"If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n",
"If user's intent is code generation, you must obey the following rules:\n",
"Rule 1. You MUST NOT install any packages because all the packages needed are already installed.\n",
"Rule 2. You must follow the formats below to write your code:\n",
"```language\n",
"# your code\n",
"```\n",
"\n",
"If user's intent is question answering, you must give as short an answer as possible.\n",
"\n",
"User's question is: when mifid was created?\n",
"\n",
"Context is: MiFID 2 (Markets in financial instruments directive 2), is a legal act of the European Union (EU).\n",
"MiFID 2 (Markets in financial instruments directive 2), is a legal act of the European Union (EU).\n",
"MiFID 2 (Markets in financial instruments directive 2), is a legal act of the European Union (EU).\n",
"MiFID 2 (Markets in financial instruments directive 2), is a legal act of the European Union (EU).\n",
"MiFID 2 (Markets in financial instruments directive 2), is a legal act of the European Union (EU).\n",
"However, MiFID 2 came into force on the revised date of 3 January 2018.\n",
"However, MiFID 2 came into force on the revised date of 3 January 2018.\n",
"However, MiFID 2 came into force on the revised date of 3 January 2018.\n",
"However, MiFID 2 came into force on the revised date of 3 January 2018.\n",
"However, MiFID 2 came into force on the revised date of 3 January 2018.\n",
"Both MiFID 2 and Regulation (EU) No 600/2014 have been effective from 3 January 2018.\n",
"Both MiFID 2 and Regulation (EU) No 600/2014 have been effective from 3 January 2018.\n",
"Both MiFID 2 and Regulation (EU) No 600/2014 have been effective from 3 January 2018.\n",
"Both MiFID 2 and Regulation (EU) No 600/2014 have been effective from 3 January 2018.\n",
"Both MiFID 2 and Regulation (EU) No 600/2014 have been effective from 3 January 2018.\n",
"the United Kingdom during the period of implementation. It incorporated MiFID 2 into its Handbook\n",
"the United Kingdom during the period of implementation. It incorporated MiFID 2 into its Handbook\n",
"the United Kingdom during the period of implementation. It incorporated MiFID 2 into its Handbook\n",
"the United Kingdom during the period of implementation. It incorporated MiFID 2 into its Handbook\n",
"the United Kingdom during the period of implementation. It incorporated MiFID 2 into its Handbook\n",
"\n",
"\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[33massistant\u001b[0m (to RetrieveChatAgent):\n",
"\n",
"The first MiFID was created in 2004 and implemented in 2007. However, the current context only provides information about MiFID 2 which came into force on 3rd January 2018.\n",
"\n",
"--------------------------------------------------------------------------------\n"
]
}
],
"source": [
"#Create Assistant agent\n",
"assistant = RetrieveAssistantAgent(\n",
" name=\"assistant\",\n",
" system_message=\"You are a helpful assistant.\",\n",
" llm_config=config_list,\n",
")\n",
"\n",
"# Instantiate the User Proxy Agent with MongoDB functionality\n",
"ragproxyagent = MongoDBRetrieveUserProxyAgent(\n",
" name=\"MongoDB_RAG_Agent\",\n",
" human_input_mode=\"NEVER\",\n",
" max_consecutive_auto_reply=2,\n",
" retrieve_config={\n",
" \"task\": \"qa\",\n",
" },\n",
" mongo_config=mongo_config,\n",
" llm_config=config_list,\n",
")\n",
"\n",
"# Reset the assistant and retrieve documents for a specific problem\n",
"assistant.reset()\n",
"ragproxyagent.initiate_chat(\n",
" assistant,\n",
" problem=\"when mifid was created?\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "8f1d03dc03a6ec5d9eba90b824060e8b2d143014ce888a41a4072654a3762431"
},
"kernelspec": {
"display_name": "Python 3.10.0 64-bit ('3.10.0')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"flaml",
"python-dotenv",
"tiktoken",
"pymongo"
]

setuptools.setup(
Expand Down
Loading
Loading